Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              |    2
-rw-r--r--  kernel/audit.c               |    8
-rw-r--r--  kernel/cpuset.c              |   45
-rw-r--r--  kernel/die_notifier.c        |   38
-rw-r--r--  kernel/exit.c                |    1
-rw-r--r--  kernel/fork.c                |   86
-rw-r--r--  kernel/futex.c               |  106
-rw-r--r--  kernel/hrtimer.c             |    1
-rw-r--r--  kernel/irq/handle.c          |    4
-rw-r--r--  kernel/irq/manage.c          |   10
-rw-r--r--  kernel/irq/proc.c            |   15
-rw-r--r--  kernel/irq/spurious.c        |    4
-rw-r--r--  kernel/itimer.c              |   60
-rw-r--r--  kernel/kallsyms.c            |   81
-rw-r--r--  kernel/kexec.c               |    4
-rw-r--r--  kernel/kmod.c                |    7
-rw-r--r--  kernel/kprobes.c             |  293
-rw-r--r--  kernel/lockdep.c             |   51
-rw-r--r--  kernel/module.c              |   79
-rw-r--r--  kernel/nsproxy.c             |  139
-rw-r--r--  kernel/params.c              |    2
-rw-r--r--  kernel/pid.c                 |   11
-rw-r--r--  kernel/posix-cpu-timers.c    |   14
-rw-r--r--  kernel/posix-timers.c        |    1
-rw-r--r--  kernel/power/process.c       |    6
-rw-r--r--  kernel/power/snapshot.c      |    1
-rw-r--r--  kernel/power/swap.c          |    1
-rw-r--r--  kernel/printk.c              |   27
-rw-r--r--  kernel/rcutorture.c          |   45
-rw-r--r--  kernel/rwsem.c               |    2
-rw-r--r--  kernel/sched.c               |  365
-rw-r--r--  kernel/signal.c              |    1
-rw-r--r--  kernel/softlockup.c          |   48
-rw-r--r--  kernel/stop_machine.c        |    8
-rw-r--r--  kernel/sys.c                 |   19
-rw-r--r--  kernel/sysctl.c              |   11
-rw-r--r--  kernel/time.c                |   61
-rw-r--r--  kernel/time/Makefile         |    2
-rw-r--r--  kernel/time/tick-common.c    |    8
-rw-r--r--  kernel/time/tick-internal.h  |    1
-rw-r--r--  kernel/time/tick-sched.c     |   51
-rw-r--r--  kernel/time/timekeeping.c    |  476
-rw-r--r--  kernel/time/timer_list.c     |   15
-rw-r--r--  kernel/time/timer_stats.c    |   14
-rw-r--r--  kernel/timer.c               |  528
-rw-r--r--  kernel/uid16.c               |    1
-rw-r--r--  kernel/utsname.c             |   41
47 files changed, 1670 insertions, 1124 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ac6b27abb1ad..642d4277c2ea 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o latency.o nsproxy.o srcu.o 11 hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
12 12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o 13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-y += time/ 14obj-y += time/
diff --git a/kernel/audit.c b/kernel/audit.c
index 4e9d20829681..d13276d41410 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -515,8 +515,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
515 err = -EPERM; 515 err = -EPERM;
516 break; 516 break;
517 case AUDIT_USER: 517 case AUDIT_USER:
518 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 518 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
519 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: 519 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
520 if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) 520 if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
521 err = -EPERM; 521 err = -EPERM;
522 break; 522 break;
@@ -614,8 +614,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
614 loginuid, sid); 614 loginuid, sid);
615 break; 615 break;
616 case AUDIT_USER: 616 case AUDIT_USER:
617 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 617 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
618 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: 618 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
619 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 619 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
620 return 0; 620 return 0;
621 621
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d240349cbf0f..88b416dfbc72 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -42,7 +42,6 @@
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/security.h> 43#include <linux/security.h>
44#include <linux/slab.h> 44#include <linux/slab.h>
45#include <linux/smp_lock.h>
46#include <linux/spinlock.h> 45#include <linux/spinlock.h>
47#include <linux/stat.h> 46#include <linux/stat.h>
48#include <linux/string.h> 47#include <linux/string.h>
@@ -822,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf)
822 return -EACCES; 821 return -EACCES;
823 822
824 trialcs = *cs; 823 trialcs = *cs;
825 retval = cpulist_parse(buf, trialcs.cpus_allowed); 824
826 if (retval < 0) 825 /*
827 return retval; 826 * We allow a cpuset's cpus_allowed to be empty; if it has attached
827 * tasks, we'll catch it later when we validate the change and return
828 * -ENOSPC.
829 */
830 if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
831 cpus_clear(trialcs.cpus_allowed);
832 } else {
833 retval = cpulist_parse(buf, trialcs.cpus_allowed);
834 if (retval < 0)
835 return retval;
836 }
828 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); 837 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
829 if (cpus_empty(trialcs.cpus_allowed)) 838 /* cpus_allowed cannot be empty for a cpuset with attached tasks. */
839 if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed))
830 return -ENOSPC; 840 return -ENOSPC;
831 retval = validate_change(cs, &trialcs); 841 retval = validate_change(cs, &trialcs);
832 if (retval < 0) 842 if (retval < 0)
@@ -919,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf)
919 return -EACCES; 929 return -EACCES;
920 930
921 trialcs = *cs; 931 trialcs = *cs;
922 retval = nodelist_parse(buf, trialcs.mems_allowed); 932
923 if (retval < 0) 933 /*
924 goto done; 934 * We allow a cpuset's mems_allowed to be empty; if it has attached
935 * tasks, we'll catch it later when we validate the change and return
936 * -ENOSPC.
937 */
938 if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
939 nodes_clear(trialcs.mems_allowed);
940 } else {
941 retval = nodelist_parse(buf, trialcs.mems_allowed);
942 if (retval < 0)
943 goto done;
944 }
925 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); 945 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
926 oldmem = cs->mems_allowed; 946 oldmem = cs->mems_allowed;
927 if (nodes_equal(oldmem, trialcs.mems_allowed)) { 947 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
928 retval = 0; /* Too easy - nothing to do */ 948 retval = 0; /* Too easy - nothing to do */
929 goto done; 949 goto done;
930 } 950 }
931 if (nodes_empty(trialcs.mems_allowed)) { 951 /* mems_allowed cannot be empty for a cpuset with attached tasks. */
952 if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
932 retval = -ENOSPC; 953 retval = -ENOSPC;
933 goto done; 954 goto done;
934 } 955 }
@@ -2200,10 +2221,6 @@ void cpuset_fork(struct task_struct *child)
2200 * it is holding that mutex while calling check_for_release(), 2221 * it is holding that mutex while calling check_for_release(),
2201 * which calls kmalloc(), so can't be called holding callback_mutex(). 2222 * which calls kmalloc(), so can't be called holding callback_mutex().
2202 * 2223 *
2203 * We don't need to task_lock() this reference to tsk->cpuset,
2204 * because tsk is already marked PF_EXITING, so attach_task() won't
2205 * mess with it, or task is a failed fork, never visible to attach_task.
2206 *
2207 * the_top_cpuset_hack: 2224 * the_top_cpuset_hack:
2208 * 2225 *
2209 * Set the exiting tasks cpuset to the root cpuset (top_cpuset). 2226 * Set the exiting tasks cpuset to the root cpuset (top_cpuset).
@@ -2242,8 +2259,10 @@ void cpuset_exit(struct task_struct *tsk)
2242{ 2259{
2243 struct cpuset *cs; 2260 struct cpuset *cs;
2244 2261
2262 task_lock(current);
2245 cs = tsk->cpuset; 2263 cs = tsk->cpuset;
2246 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ 2264 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
2265 task_unlock(current);
2247 2266
2248 if (notify_on_release(cs)) { 2267 if (notify_on_release(cs)) {
2249 char *pathbuf = NULL; 2268 char *pathbuf = NULL;
diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c
new file mode 100644
index 000000000000..0d98827887a7
--- /dev/null
+++ b/kernel/die_notifier.c
@@ -0,0 +1,38 @@
1
2#include <linux/module.h>
3#include <linux/notifier.h>
4#include <linux/vmalloc.h>
5#include <linux/kdebug.h>
6
7
8static ATOMIC_NOTIFIER_HEAD(die_chain);
9
10int notify_die(enum die_val val, const char *str,
11 struct pt_regs *regs, long err, int trap, int sig)
12{
13 struct die_args args = {
14 .regs = regs,
15 .str = str,
16 .err = err,
17 .trapnr = trap,
18 .signr = sig,
19
20 };
21
22 return atomic_notifier_call_chain(&die_chain, val, &args);
23}
24
25int register_die_notifier(struct notifier_block *nb)
26{
27 vmalloc_sync_all();
28 return atomic_notifier_chain_register(&die_chain, nb);
29}
30EXPORT_SYMBOL_GPL(register_die_notifier);
31
32int unregister_die_notifier(struct notifier_block *nb)
33{
34 return atomic_notifier_chain_unregister(&die_chain, nb);
35}
36EXPORT_SYMBOL_GPL(unregister_die_notifier);
37
38
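The new kernel/die_notifier.c moves the die notifier chain into common code. For illustration only (not part of the patch), a minimal sketch of a module hooking this chain, assuming the standard notifier callback signature from <linux/notifier.h> and struct die_args from <linux/kdebug.h>; the handler and module names are made up:

	#include <linux/module.h>
	#include <linux/notifier.h>
	#include <linux/kdebug.h>

	/* Runs on every notify_die() event; the chain is atomic, so no sleeping. */
	static int sample_die_handler(struct notifier_block *nb, unsigned long val,
				      void *data)
	{
		struct die_args *args = data;

		printk(KERN_INFO "die event %lu: %s (trap %d, signal %d)\n",
		       val, args->str, args->trapnr, args->signr);
		return NOTIFY_DONE;
	}

	static struct notifier_block sample_die_nb = {
		.notifier_call = sample_die_handler,
	};

	static int __init sample_init(void)
	{
		return register_die_notifier(&sample_die_nb);
	}

	static void __exit sample_exit(void)
	{
		unregister_die_notifier(&sample_die_nb);
	}

	module_init(sample_init);
	module_exit(sample_exit);
	MODULE_LICENSE("GPL");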
diff --git a/kernel/exit.c b/kernel/exit.c
index 92369240d91d..f5a7abb621f3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -7,7 +7,6 @@
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/smp_lock.h>
11#include <linux/module.h> 10#include <linux/module.h>
12#include <linux/capability.h> 11#include <linux/capability.h>
13#include <linux/completion.h> 12#include <linux/completion.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index b7d169def942..a8dd75d4992b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -14,7 +14,6 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/unistd.h> 16#include <linux/unistd.h>
17#include <linux/smp_lock.h>
18#include <linux/module.h> 17#include <linux/module.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/completion.h> 19#include <linux/completion.h>
@@ -1516,26 +1515,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1516} 1515}
1517 1516
1518/* 1517/*
1519 * Unshare the mnt_namespace structure if it is being shared
1520 */
1521static int unshare_mnt_namespace(unsigned long unshare_flags,
1522 struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
1523{
1524 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
1525
1526 if ((unshare_flags & CLONE_NEWNS) && ns) {
1527 if (!capable(CAP_SYS_ADMIN))
1528 return -EPERM;
1529
1530 *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
1531 if (!*new_nsp)
1532 return -ENOMEM;
1533 }
1534
1535 return 0;
1536}
1537
1538/*
1539 * Unsharing of sighand is not supported yet 1518 * Unsharing of sighand is not supported yet
1540 */ 1519 */
1541static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) 1520static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
@@ -1593,16 +1572,6 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n
1593 return 0; 1572 return 0;
1594} 1573}
1595 1574
1596#ifndef CONFIG_IPC_NS
1597static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns)
1598{
1599 if (flags & CLONE_NEWIPC)
1600 return -EINVAL;
1601
1602 return 0;
1603}
1604#endif
1605
1606/* 1575/*
1607 * unshare allows a process to 'unshare' part of the process 1576 * unshare allows a process to 'unshare' part of the process
1608 * context which was originally shared using clone. copy_* 1577 * context which was originally shared using clone. copy_*
@@ -1615,14 +1584,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1615{ 1584{
1616 int err = 0; 1585 int err = 0;
1617 struct fs_struct *fs, *new_fs = NULL; 1586 struct fs_struct *fs, *new_fs = NULL;
1618 struct mnt_namespace *ns, *new_ns = NULL;
1619 struct sighand_struct *new_sigh = NULL; 1587 struct sighand_struct *new_sigh = NULL;
1620 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1588 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1621 struct files_struct *fd, *new_fd = NULL; 1589 struct files_struct *fd, *new_fd = NULL;
1622 struct sem_undo_list *new_ulist = NULL; 1590 struct sem_undo_list *new_ulist = NULL;
1623 struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; 1591 struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
1624 struct uts_namespace *uts, *new_uts = NULL;
1625 struct ipc_namespace *ipc, *new_ipc = NULL;
1626 1592
1627 check_unshare_flags(&unshare_flags); 1593 check_unshare_flags(&unshare_flags);
1628 1594
@@ -1637,36 +1603,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1637 goto bad_unshare_out; 1603 goto bad_unshare_out;
1638 if ((err = unshare_fs(unshare_flags, &new_fs))) 1604 if ((err = unshare_fs(unshare_flags, &new_fs)))
1639 goto bad_unshare_cleanup_thread; 1605 goto bad_unshare_cleanup_thread;
1640 if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
1641 goto bad_unshare_cleanup_fs;
1642 if ((err = unshare_sighand(unshare_flags, &new_sigh))) 1606 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1643 goto bad_unshare_cleanup_ns; 1607 goto bad_unshare_cleanup_fs;
1644 if ((err = unshare_vm(unshare_flags, &new_mm))) 1608 if ((err = unshare_vm(unshare_flags, &new_mm)))
1645 goto bad_unshare_cleanup_sigh; 1609 goto bad_unshare_cleanup_sigh;
1646 if ((err = unshare_fd(unshare_flags, &new_fd))) 1610 if ((err = unshare_fd(unshare_flags, &new_fd)))
1647 goto bad_unshare_cleanup_vm; 1611 goto bad_unshare_cleanup_vm;
1648 if ((err = unshare_semundo(unshare_flags, &new_ulist))) 1612 if ((err = unshare_semundo(unshare_flags, &new_ulist)))
1649 goto bad_unshare_cleanup_fd; 1613 goto bad_unshare_cleanup_fd;
1650 if ((err = unshare_utsname(unshare_flags, &new_uts))) 1614 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1615 new_fs)))
1651 goto bad_unshare_cleanup_semundo; 1616 goto bad_unshare_cleanup_semundo;
1652 if ((err = unshare_ipcs(unshare_flags, &new_ipc)))
1653 goto bad_unshare_cleanup_uts;
1654
1655 if (new_ns || new_uts || new_ipc) {
1656 old_nsproxy = current->nsproxy;
1657 new_nsproxy = dup_namespaces(old_nsproxy);
1658 if (!new_nsproxy) {
1659 err = -ENOMEM;
1660 goto bad_unshare_cleanup_ipc;
1661 }
1662 }
1663 1617
1664 if (new_fs || new_ns || new_mm || new_fd || new_ulist || 1618 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
1665 new_uts || new_ipc) {
1666 1619
1667 task_lock(current); 1620 task_lock(current);
1668 1621
1669 if (new_nsproxy) { 1622 if (new_nsproxy) {
1623 old_nsproxy = current->nsproxy;
1670 current->nsproxy = new_nsproxy; 1624 current->nsproxy = new_nsproxy;
1671 new_nsproxy = old_nsproxy; 1625 new_nsproxy = old_nsproxy;
1672 } 1626 }
@@ -1677,12 +1631,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1677 new_fs = fs; 1631 new_fs = fs;
1678 } 1632 }
1679 1633
1680 if (new_ns) {
1681 ns = current->nsproxy->mnt_ns;
1682 current->nsproxy->mnt_ns = new_ns;
1683 new_ns = ns;
1684 }
1685
1686 if (new_mm) { 1634 if (new_mm) {
1687 mm = current->mm; 1635 mm = current->mm;
1688 active_mm = current->active_mm; 1636 active_mm = current->active_mm;
@@ -1698,32 +1646,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1698 new_fd = fd; 1646 new_fd = fd;
1699 } 1647 }
1700 1648
1701 if (new_uts) {
1702 uts = current->nsproxy->uts_ns;
1703 current->nsproxy->uts_ns = new_uts;
1704 new_uts = uts;
1705 }
1706
1707 if (new_ipc) {
1708 ipc = current->nsproxy->ipc_ns;
1709 current->nsproxy->ipc_ns = new_ipc;
1710 new_ipc = ipc;
1711 }
1712
1713 task_unlock(current); 1649 task_unlock(current);
1714 } 1650 }
1715 1651
1716 if (new_nsproxy) 1652 if (new_nsproxy)
1717 put_nsproxy(new_nsproxy); 1653 put_nsproxy(new_nsproxy);
1718 1654
1719bad_unshare_cleanup_ipc:
1720 if (new_ipc)
1721 put_ipc_ns(new_ipc);
1722
1723bad_unshare_cleanup_uts:
1724 if (new_uts)
1725 put_uts_ns(new_uts);
1726
1727bad_unshare_cleanup_semundo: 1655bad_unshare_cleanup_semundo:
1728bad_unshare_cleanup_fd: 1656bad_unshare_cleanup_fd:
1729 if (new_fd) 1657 if (new_fd)
@@ -1738,10 +1666,6 @@ bad_unshare_cleanup_sigh:
1738 if (atomic_dec_and_test(&new_sigh->count)) 1666 if (atomic_dec_and_test(&new_sigh->count))
1739 kmem_cache_free(sighand_cachep, new_sigh); 1667 kmem_cache_free(sighand_cachep, new_sigh);
1740 1668
1741bad_unshare_cleanup_ns:
1742 if (new_ns)
1743 put_mnt_ns(new_ns);
1744
1745bad_unshare_cleanup_fs: 1669bad_unshare_cleanup_fs:
1746 if (new_fs) 1670 if (new_fs)
1747 put_fs_struct(new_fs); 1671 put_fs_struct(new_fs);
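The fork.c hunks above drop the per-namespace unshare helpers (mnt, uts, ipc) and route everything through unshare_nsproxy_namespaces(), so sys_unshare() swaps a single nsproxy instead of individual namespace pointers. For context, a minimal userspace sketch (illustration only, not part of the patch) of the unshare() call that exercises this path:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* CLONE_NEWUTS is now handled by unshare_nsproxy_namespaces(),
		 * which builds a fresh nsproxy rather than swapping uts_ns alone. */
		if (unshare(CLONE_NEWUTS) < 0) {
			perror("unshare");
			return 1;
		}
		/* Hostname changes are now private to this namespace. */
		if (sethostname("sandbox", 7) < 0)
			perror("sethostname");
		return 0;
	}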
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a270b5e3f95..600bc9d801f2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -48,6 +48,7 @@
48#include <linux/pagemap.h> 48#include <linux/pagemap.h>
49#include <linux/syscalls.h> 49#include <linux/syscalls.h>
50#include <linux/signal.h> 50#include <linux/signal.h>
51#include <linux/module.h>
51#include <asm/futex.h> 52#include <asm/futex.h>
52 53
53#include "rtmutex_common.h" 54#include "rtmutex_common.h"
@@ -55,32 +56,6 @@
55#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 56#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
56 57
57/* 58/*
58 * Futexes are matched on equal values of this key.
59 * The key type depends on whether it's a shared or private mapping.
60 * Don't rearrange members without looking at hash_futex().
61 *
62 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
63 * We set bit 0 to indicate if it's an inode-based key.
64 */
65union futex_key {
66 struct {
67 unsigned long pgoff;
68 struct inode *inode;
69 int offset;
70 } shared;
71 struct {
72 unsigned long address;
73 struct mm_struct *mm;
74 int offset;
75 } private;
76 struct {
77 unsigned long word;
78 void *ptr;
79 int offset;
80 } both;
81};
82
83/*
84 * Priority Inheritance state: 59 * Priority Inheritance state:
85 */ 60 */
86struct futex_pi_state { 61struct futex_pi_state {
@@ -175,7 +150,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
175 * 150 *
176 * Should be called with &current->mm->mmap_sem but NOT any spinlocks. 151 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
177 */ 152 */
178static int get_futex_key(u32 __user *uaddr, union futex_key *key) 153int get_futex_key(u32 __user *uaddr, union futex_key *key)
179{ 154{
180 unsigned long address = (unsigned long)uaddr; 155 unsigned long address = (unsigned long)uaddr;
181 struct mm_struct *mm = current->mm; 156 struct mm_struct *mm = current->mm;
@@ -246,6 +221,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
246 } 221 }
247 return err; 222 return err;
248} 223}
224EXPORT_SYMBOL_GPL(get_futex_key);
249 225
250/* 226/*
251 * Take a reference to the resource addressed by a key. 227 * Take a reference to the resource addressed by a key.
@@ -254,7 +230,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
254 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this 230 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
255 * function, if it is called at all. mmap_sem keeps key->shared.inode valid. 231 * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
256 */ 232 */
257static inline void get_key_refs(union futex_key *key) 233inline void get_futex_key_refs(union futex_key *key)
258{ 234{
259 if (key->both.ptr != 0) { 235 if (key->both.ptr != 0) {
260 if (key->both.offset & 1) 236 if (key->both.offset & 1)
@@ -263,12 +239,13 @@ static inline void get_key_refs(union futex_key *key)
263 atomic_inc(&key->private.mm->mm_count); 239 atomic_inc(&key->private.mm->mm_count);
264 } 240 }
265} 241}
242EXPORT_SYMBOL_GPL(get_futex_key_refs);
266 243
267/* 244/*
268 * Drop a reference to the resource addressed by a key. 245 * Drop a reference to the resource addressed by a key.
269 * The hash bucket spinlock must not be held. 246 * The hash bucket spinlock must not be held.
270 */ 247 */
271static void drop_key_refs(union futex_key *key) 248void drop_futex_key_refs(union futex_key *key)
272{ 249{
273 if (key->both.ptr != 0) { 250 if (key->both.ptr != 0) {
274 if (key->both.offset & 1) 251 if (key->both.offset & 1)
@@ -277,6 +254,7 @@ static void drop_key_refs(union futex_key *key)
277 mmdrop(key->private.mm); 254 mmdrop(key->private.mm);
278 } 255 }
279} 256}
257EXPORT_SYMBOL_GPL(drop_futex_key_refs);
280 258
281static inline int get_futex_value_locked(u32 *dest, u32 __user *from) 259static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
282{ 260{
@@ -873,7 +851,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
873 this->lock_ptr = &hb2->lock; 851 this->lock_ptr = &hb2->lock;
874 } 852 }
875 this->key = key2; 853 this->key = key2;
876 get_key_refs(&key2); 854 get_futex_key_refs(&key2);
877 drop_count++; 855 drop_count++;
878 856
879 if (ret - nr_wake >= nr_requeue) 857 if (ret - nr_wake >= nr_requeue)
@@ -886,9 +864,9 @@ out_unlock:
886 if (hb1 != hb2) 864 if (hb1 != hb2)
887 spin_unlock(&hb2->lock); 865 spin_unlock(&hb2->lock);
888 866
889 /* drop_key_refs() must be called outside the spinlocks. */ 867 /* drop_futex_key_refs() must be called outside the spinlocks. */
890 while (--drop_count >= 0) 868 while (--drop_count >= 0)
891 drop_key_refs(&key1); 869 drop_futex_key_refs(&key1);
892 870
893out: 871out:
894 up_read(&current->mm->mmap_sem); 872 up_read(&current->mm->mmap_sem);
@@ -906,7 +884,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
906 884
907 init_waitqueue_head(&q->waiters); 885 init_waitqueue_head(&q->waiters);
908 886
909 get_key_refs(&q->key); 887 get_futex_key_refs(&q->key);
910 hb = hash_futex(&q->key); 888 hb = hash_futex(&q->key);
911 q->lock_ptr = &hb->lock; 889 q->lock_ptr = &hb->lock;
912 890
@@ -925,7 +903,7 @@ static inline void
925queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 903queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
926{ 904{
927 spin_unlock(&hb->lock); 905 spin_unlock(&hb->lock);
928 drop_key_refs(&q->key); 906 drop_futex_key_refs(&q->key);
929} 907}
930 908
931/* 909/*
@@ -980,7 +958,7 @@ static int unqueue_me(struct futex_q *q)
980 ret = 1; 958 ret = 1;
981 } 959 }
982 960
983 drop_key_refs(&q->key); 961 drop_futex_key_refs(&q->key);
984 return ret; 962 return ret;
985} 963}
986 964
@@ -999,15 +977,18 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
999 977
1000 spin_unlock(&hb->lock); 978 spin_unlock(&hb->lock);
1001 979
1002 drop_key_refs(&q->key); 980 drop_futex_key_refs(&q->key);
1003} 981}
1004 982
1005static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) 983static long futex_wait_restart(struct restart_block *restart);
984static int futex_wait_abstime(u32 __user *uaddr, u32 val,
985 int timed, unsigned long abs_time)
1006{ 986{
1007 struct task_struct *curr = current; 987 struct task_struct *curr = current;
1008 DECLARE_WAITQUEUE(wait, curr); 988 DECLARE_WAITQUEUE(wait, curr);
1009 struct futex_hash_bucket *hb; 989 struct futex_hash_bucket *hb;
1010 struct futex_q q; 990 struct futex_q q;
991 unsigned long time_left = 0;
1011 u32 uval; 992 u32 uval;
1012 int ret; 993 int ret;
1013 994
@@ -1087,8 +1068,21 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
1087 * !list_empty() is safe here without any lock. 1068 * !list_empty() is safe here without any lock.
1088 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1069 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1089 */ 1070 */
1090 if (likely(!list_empty(&q.list))) 1071 time_left = 0;
1091 time = schedule_timeout(time); 1072 if (likely(!list_empty(&q.list))) {
1073 unsigned long rel_time;
1074
1075 if (timed) {
1076 unsigned long now = jiffies;
1077 if (time_after(now, abs_time))
1078 rel_time = 0;
1079 else
1080 rel_time = abs_time - now;
1081 } else
1082 rel_time = MAX_SCHEDULE_TIMEOUT;
1083
1084 time_left = schedule_timeout(rel_time);
1085 }
1092 __set_current_state(TASK_RUNNING); 1086 __set_current_state(TASK_RUNNING);
1093 1087
1094 /* 1088 /*
@@ -1099,13 +1093,25 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
1099 /* If we were woken (and unqueued), we succeeded, whatever. */ 1093 /* If we were woken (and unqueued), we succeeded, whatever. */
1100 if (!unqueue_me(&q)) 1094 if (!unqueue_me(&q))
1101 return 0; 1095 return 0;
1102 if (time == 0) 1096 if (time_left == 0)
1103 return -ETIMEDOUT; 1097 return -ETIMEDOUT;
1098
1104 /* 1099 /*
1105 * We expect signal_pending(current), but another thread may 1100 * We expect signal_pending(current), but another thread may
1106 * have handled it for us already. 1101 * have handled it for us already.
1107 */ 1102 */
1108 return -EINTR; 1103 if (time_left == MAX_SCHEDULE_TIMEOUT)
1104 return -ERESTARTSYS;
1105 else {
1106 struct restart_block *restart;
1107 restart = &current_thread_info()->restart_block;
1108 restart->fn = futex_wait_restart;
1109 restart->arg0 = (unsigned long)uaddr;
1110 restart->arg1 = (unsigned long)val;
1111 restart->arg2 = (unsigned long)timed;
1112 restart->arg3 = abs_time;
1113 return -ERESTART_RESTARTBLOCK;
1114 }
1109 1115
1110 out_unlock_release_sem: 1116 out_unlock_release_sem:
1111 queue_unlock(&q, hb); 1117 queue_unlock(&q, hb);
@@ -1115,6 +1121,24 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
1115 return ret; 1121 return ret;
1116} 1122}
1117 1123
1124static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
1125{
1126 int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
1127 return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
1128}
1129
1130static long futex_wait_restart(struct restart_block *restart)
1131{
1132 u32 __user *uaddr = (u32 __user *)restart->arg0;
1133 u32 val = (u32)restart->arg1;
1134 int timed = (int)restart->arg2;
1135 unsigned long abs_time = restart->arg3;
1136
1137 restart->fn = do_no_restart_syscall;
1138 return (long)futex_wait_abstime(uaddr, val, timed, abs_time);
1139}
1140
1141
1118/* 1142/*
1119 * Userspace tried a 0 -> TID atomic transition of the futex value 1143 * Userspace tried a 0 -> TID atomic transition of the futex value
1120 * and failed. The kernel side here does the whole locking operation: 1144 * and failed. The kernel side here does the whole locking operation:
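The futex_wait() rework above converts the caller's relative timeout into an absolute jiffies deadline (futex_wait_abstime) and, when a timed wait is interrupted by a signal, returns -ERESTART_RESTARTBLOCK with futex_wait_restart() saved in the thread's restart_block, so the restarted wait resumes against the original deadline instead of re-arming the full interval. A minimal userspace sketch (illustration only; the wrapper name is made up) of the FUTEX_WAIT call whose timeout semantics this preserves:

	#include <linux/futex.h>
	#include <sys/syscall.h>
	#include <time.h>
	#include <unistd.h>

	/* Block while *uaddr still holds val, for at most rel_timeout.
	 * If a signal interrupts the wait, the kernel-side restart block
	 * re-enters the wait with the remaining time, not the full timeout. */
	static long futex_wait(int *uaddr, int val, const struct timespec *rel_timeout)
	{
		return syscall(SYS_futex, uaddr, FUTEX_WAIT, val, rel_timeout, NULL, 0);
	}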
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1b3033105b40..c9f4f044a8a8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -669,6 +669,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
669 669
670 return orun; 670 return orun;
671} 671}
672EXPORT_SYMBOL_GPL(hrtimer_forward);
672 673
673/* 674/*
674 * enqueue_hrtimer - internal function to (re)start a timer 675 * enqueue_hrtimer - internal function to (re)start a timer
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index aff1f0fabb0d..32e1ab1477d1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -48,7 +48,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
48 * 48 *
49 * Controller mappings for all interrupt sources: 49 * Controller mappings for all interrupt sources:
50 */ 50 */
51struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { 51struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
52 [0 ... NR_IRQS-1] = { 52 [0 ... NR_IRQS-1] = {
53 .status = IRQ_DISABLED, 53 .status = IRQ_DISABLED,
54 .chip = &no_irq_chip, 54 .chip = &no_irq_chip,
@@ -180,6 +180,8 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
180 if (desc->chip->ack) 180 if (desc->chip->ack)
181 desc->chip->ack(irq); 181 desc->chip->ack(irq);
182 action_ret = handle_IRQ_event(irq, desc->action); 182 action_ret = handle_IRQ_event(irq, desc->action);
183 if (!noirqdebug)
184 note_interrupt(irq, desc, action_ret);
183 desc->chip->end(irq); 185 desc->chip->end(irq);
184 return 1; 186 return 1;
185 } 187 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5597c157442a..203a518b6f14 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -317,10 +317,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
317 } 317 }
318 318
319 *p = new; 319 *p = new;
320#if defined(CONFIG_IRQ_PER_CPU) 320
321 if (new->flags & IRQF_PERCPU)
322 desc->status |= IRQ_PER_CPU;
323#endif
324 /* Exclude IRQ from balancing */ 321 /* Exclude IRQ from balancing */
325 if (new->flags & IRQF_NOBALANCING) 322 if (new->flags & IRQF_NOBALANCING)
326 desc->status |= IRQ_NO_BALANCING; 323 desc->status |= IRQ_NO_BALANCING;
@@ -328,6 +325,11 @@ int setup_irq(unsigned int irq, struct irqaction *new)
328 if (!shared) { 325 if (!shared) {
329 irq_chip_set_defaults(desc->chip); 326 irq_chip_set_defaults(desc->chip);
330 327
328#if defined(CONFIG_IRQ_PER_CPU)
329 if (new->flags & IRQF_PERCPU)
330 desc->status |= IRQ_PER_CPU;
331#endif
332
331 /* Setup the type (level, edge polarity) if configured: */ 333 /* Setup the type (level, edge polarity) if configured: */
332 if (new->flags & IRQF_TRIGGER_MASK) { 334 if (new->flags & IRQF_TRIGGER_MASK) {
333 if (desc->chip && desc->chip->set_type) 335 if (desc->chip && desc->chip->set_type)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 2db91eb54ad8..ddde0ef9ccdc 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -66,12 +66,19 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
66{ 66{
67 struct irq_desc *desc = irq_desc + irq; 67 struct irq_desc *desc = irq_desc + irq;
68 struct irqaction *action; 68 struct irqaction *action;
69 unsigned long flags;
70 int ret = 1;
69 71
70 for (action = desc->action ; action; action = action->next) 72 spin_lock_irqsave(&desc->lock, flags);
73 for (action = desc->action ; action; action = action->next) {
71 if ((action != new_action) && action->name && 74 if ((action != new_action) && action->name &&
72 !strcmp(new_action->name, action->name)) 75 !strcmp(new_action->name, action->name)) {
73 return 0; 76 ret = 0;
74 return 1; 77 break;
78 }
79 }
80 spin_unlock_irqrestore(&desc->lock, flags);
81 return ret;
75} 82}
76 83
77void register_handler_proc(unsigned int irq, struct irqaction *action) 84void register_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 9d8c79b48823..b0d81aae472f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -146,7 +146,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
146 146
147 if (unlikely(irqfixup)) { 147 if (unlikely(irqfixup)) {
148 /* Don't punish working computers */ 148 /* Don't punish working computers */
149 if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { 149 if ((irqfixup == 2 && ((irq == 0) ||
150 (desc->action->flags & IRQF_IRQPOLL))) ||
151 action_ret == IRQ_NONE) {
150 int ok = misrouted_irq(irq); 152 int ok = misrouted_irq(irq);
151 if (action_ret == IRQ_NONE) 153 if (action_ret == IRQ_NONE)
152 desc->irqs_unhandled -= ok; 154 desc->irqs_unhandled -= ok;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 307c6a632ef6..3205e8e114fa 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -7,7 +7,6 @@
7/* These are all the functions necessary to implement itimers */ 7/* These are all the functions necessary to implement itimers */
8 8
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/smp_lock.h>
11#include <linux/interrupt.h> 10#include <linux/interrupt.h>
12#include <linux/syscalls.h> 11#include <linux/syscalls.h>
13#include <linux/time.h> 12#include <linux/time.h>
@@ -139,59 +138,11 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
139} 138}
140 139
141/* 140/*
142 * We do not care about correctness. We just sanitize the values so
143 * the ktime_t operations which expect normalized values do not
144 * break. This converts negative values to long timeouts similar to
145 * the code in kernel versions < 2.6.16
146 *
147 * Print a limited number of warning messages when an invalid timeval
148 * is detected.
149 */
150static void fixup_timeval(struct timeval *tv, int interval)
151{
152 static int warnlimit = 10;
153 unsigned long tmp;
154
155 if (warnlimit > 0) {
156 warnlimit--;
157 printk(KERN_WARNING
158 "setitimer: %s (pid = %d) provided "
159 "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
160 current->comm, current->pid,
161 interval ? "it_interval" : "it_value",
162 tv->tv_sec, (long) tv->tv_usec);
163 }
164
165 tmp = tv->tv_usec;
166 if (tmp >= USEC_PER_SEC) {
167 tv->tv_usec = tmp % USEC_PER_SEC;
168 tv->tv_sec += tmp / USEC_PER_SEC;
169 }
170
171 tmp = tv->tv_sec;
172 if (tmp > LONG_MAX)
173 tv->tv_sec = LONG_MAX;
174}
175
176/*
177 * Returns true if the timeval is in canonical form 141 * Returns true if the timeval is in canonical form
178 */ 142 */
179#define timeval_valid(t) \ 143#define timeval_valid(t) \
180 (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) 144 (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
181 145
182/*
183 * Check for invalid timevals, sanitize them and print a limited
184 * number of warnings.
185 */
186static void check_itimerval(struct itimerval *value) {
187
188 if (unlikely(!timeval_valid(&value->it_value)))
189 fixup_timeval(&value->it_value, 0);
190
191 if (unlikely(!timeval_valid(&value->it_interval)))
192 fixup_timeval(&value->it_interval, 1);
193}
194
195int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) 146int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
196{ 147{
197 struct task_struct *tsk = current; 148 struct task_struct *tsk = current;
@@ -201,15 +152,10 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
201 152
202 /* 153 /*
203 * Validate the timevals in value. 154 * Validate the timevals in value.
204 *
205 * Note: Although the spec requires that invalid values shall
206 * return -EINVAL, we just fixup the value and print a limited
207 * number of warnings in order not to break users of this
208 * historical misfeature.
209 *
210 * Scheduled for replacement in March 2007
211 */ 155 */
212 check_itimerval(value); 156 if (!timeval_valid(&value->it_value) ||
157 !timeval_valid(&value->it_interval))
158 return -EINVAL;
213 159
214 switch (which) { 160 switch (which) {
215 case ITIMER_REAL: 161 case ITIMER_REAL:
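With the hunks above, do_setitimer() stops sanitizing bad timevals and instead returns -EINVAL when it_value or it_interval is not in canonical form. A minimal userspace sketch (illustration only, not part of the patch) of a call that the old code warned about and fixed up, but that now fails:

	#include <sys/time.h>
	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		struct itimerval it = {
			/* tv_usec >= 1000000 is not canonical; previously it was
			 * silently normalized, now setitimer() returns EINVAL. */
			.it_value    = { .tv_sec = 0, .tv_usec = 1500000 },
			.it_interval = { .tv_sec = 0, .tv_usec = 0       },
		};

		if (setitimer(ITIMER_REAL, &it, NULL) < 0)
			printf("setitimer: %s\n", strerror(errno));
		return 0;
	}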
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5a0de8409739..f1bda23140b2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -214,8 +214,10 @@ static unsigned long get_symbol_pos(unsigned long addr,
214 symbol_end = (unsigned long)_etext; 214 symbol_end = (unsigned long)_etext;
215 } 215 }
216 216
217 *symbolsize = symbol_end - symbol_start; 217 if (symbolsize)
218 *offset = addr - symbol_start; 218 *symbolsize = symbol_end - symbol_start;
219 if (offset)
220 *offset = addr - symbol_start;
219 221
220 return low; 222 return low;
221} 223}
@@ -267,6 +269,42 @@ const char *kallsyms_lookup(unsigned long addr,
267 return NULL; 269 return NULL;
268} 270}
269 271
272int lookup_symbol_name(unsigned long addr, char *symname)
273{
274 symname[0] = '\0';
275 symname[KSYM_NAME_LEN] = '\0';
276
277 if (is_ksym_addr(addr)) {
278 unsigned long pos;
279
280 pos = get_symbol_pos(addr, NULL, NULL);
281 /* Grab name */
282 kallsyms_expand_symbol(get_symbol_offset(pos), symname);
283 return 0;
284 }
285 /* see if it's in a module */
286 return lookup_module_symbol_name(addr, symname);
287}
288
289int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
290 unsigned long *offset, char *modname, char *name)
291{
292 name[0] = '\0';
293 name[KSYM_NAME_LEN] = '\0';
294
295 if (is_ksym_addr(addr)) {
296 unsigned long pos;
297
298 pos = get_symbol_pos(addr, size, offset);
299 /* Grab name */
300 kallsyms_expand_symbol(get_symbol_offset(pos), name);
301 modname[0] = '\0';
302 return 0;
303 }
304 /* see if it's in a module */
305 return lookup_module_symbol_attrs(addr, size, offset, modname, name);
306}
307
270/* Look up a kernel symbol and return it in a text buffer. */ 308/* Look up a kernel symbol and return it in a text buffer. */
271int sprint_symbol(char *buffer, unsigned long address) 309int sprint_symbol(char *buffer, unsigned long address)
272{ 310{
@@ -301,25 +339,20 @@ void __print_symbol(const char *fmt, unsigned long address)
301struct kallsym_iter 339struct kallsym_iter
302{ 340{
303 loff_t pos; 341 loff_t pos;
304 struct module *owner;
305 unsigned long value; 342 unsigned long value;
306 unsigned int nameoff; /* If iterating in core kernel symbols */ 343 unsigned int nameoff; /* If iterating in core kernel symbols */
307 char type; 344 char type;
308 char name[KSYM_NAME_LEN+1]; 345 char name[KSYM_NAME_LEN+1];
346 char module_name[MODULE_NAME_LEN + 1];
347 int exported;
309}; 348};
310 349
311static int get_ksymbol_mod(struct kallsym_iter *iter) 350static int get_ksymbol_mod(struct kallsym_iter *iter)
312{ 351{
313 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, 352 if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
314 &iter->value, &iter->type, 353 &iter->type, iter->name, iter->module_name,
315 iter->name, sizeof(iter->name)); 354 &iter->exported) < 0)
316 if (iter->owner == NULL)
317 return 0; 355 return 0;
318
319 /* Label it "global" if it is exported, "local" if not exported. */
320 iter->type = is_exported(iter->name, iter->owner)
321 ? toupper(iter->type) : tolower(iter->type);
322
323 return 1; 356 return 1;
324} 357}
325 358
@@ -328,7 +361,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
328{ 361{
329 unsigned off = iter->nameoff; 362 unsigned off = iter->nameoff;
330 363
331 iter->owner = NULL; 364 iter->module_name[0] = '\0';
332 iter->value = kallsyms_addresses[iter->pos]; 365 iter->value = kallsyms_addresses[iter->pos];
333 366
334 iter->type = kallsyms_get_symbol_type(off); 367 iter->type = kallsyms_get_symbol_type(off);
@@ -392,12 +425,17 @@ static int s_show(struct seq_file *m, void *p)
392 if (!iter->name[0]) 425 if (!iter->name[0])
393 return 0; 426 return 0;
394 427
395 if (iter->owner) 428 if (iter->module_name[0]) {
429 char type;
430
431 /* Label it "global" if it is exported,
432 * "local" if not exported. */
433 type = iter->exported ? toupper(iter->type) :
434 tolower(iter->type);
396 seq_printf(m, "%0*lx %c %s\t[%s]\n", 435 seq_printf(m, "%0*lx %c %s\t[%s]\n",
397 (int)(2*sizeof(void*)), 436 (int)(2*sizeof(void*)),
398 iter->value, iter->type, iter->name, 437 iter->value, type, iter->name, iter->module_name);
399 module_name(iter->owner)); 438 } else
400 else
401 seq_printf(m, "%0*lx %c %s\n", 439 seq_printf(m, "%0*lx %c %s\n",
402 (int)(2*sizeof(void*)), 440 (int)(2*sizeof(void*)),
403 iter->value, iter->type, iter->name); 441 iter->value, iter->type, iter->name);
@@ -432,18 +470,11 @@ static int kallsyms_open(struct inode *inode, struct file *file)
432 return ret; 470 return ret;
433} 471}
434 472
435static int kallsyms_release(struct inode *inode, struct file *file)
436{
437 struct seq_file *m = (struct seq_file *)file->private_data;
438 kfree(m->private);
439 return seq_release(inode, file);
440}
441
442static const struct file_operations kallsyms_operations = { 473static const struct file_operations kallsyms_operations = {
443 .open = kallsyms_open, 474 .open = kallsyms_open,
444 .read = seq_read, 475 .read = seq_read,
445 .llseek = seq_lseek, 476 .llseek = seq_lseek,
446 .release = kallsyms_release, 477 .release = seq_release_private,
447}; 478};
448 479
449static int __init kallsyms_init(void) 480static int __init kallsyms_init(void)
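The kallsyms changes add lookup_symbol_name() and lookup_symbol_attrs(), which resolve only the name (and optionally size/offset/module) for an address, and switch the /proc/kallsyms iterator from a struct module pointer to a copied module name. A minimal in-kernel sketch (illustration only; the helper name is made up) built on the signature added above:

	#include <linux/kallsyms.h>
	#include <linux/kernel.h>

	/* Print the symbol backing a text address; the buffer must hold
	 * KSYM_NAME_LEN + 1 bytes, matching lookup_symbol_name()'s contract. */
	static void show_symbol(unsigned long addr)
	{
		char name[KSYM_NAME_LEN + 1];

		if (!lookup_symbol_name(addr, name))
			printk(KERN_DEBUG "%lx is %s\n", addr, name);
		else
			printk(KERN_DEBUG "%lx: symbol not found\n", addr);
	}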
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2a59c8a01ae0..25db14b89e82 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1118,8 +1118,8 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1118 memset(&prstatus, 0, sizeof(prstatus)); 1118 memset(&prstatus, 0, sizeof(prstatus));
1119 prstatus.pr_pid = current->pid; 1119 prstatus.pr_pid = current->pid;
1120 elf_core_copy_regs(&prstatus.pr_reg, regs); 1120 elf_core_copy_regs(&prstatus.pr_reg, regs);
1121 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, 1121 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1122 sizeof(prstatus)); 1122 &prstatus, sizeof(prstatus));
1123 final_note(buf); 1123 final_note(buf);
1124} 1124}
1125 1125
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 796276141e51..49cc4b9c1a8d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -23,7 +23,6 @@
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/smp_lock.h>
27#include <linux/slab.h> 26#include <linux/slab.h>
28#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
29#include <linux/completion.h> 28#include <linux/completion.h>
@@ -166,6 +165,12 @@ static int ____call_usermodehelper(void *data)
166 /* We can run anywhere, unlike our parent keventd(). */ 165 /* We can run anywhere, unlike our parent keventd(). */
167 set_cpus_allowed(current, CPU_MASK_ALL); 166 set_cpus_allowed(current, CPU_MASK_ALL);
168 167
168 /*
169 * Our parent is keventd, which runs with elevated scheduling priority.
170 * Avoid propagating that into the userspace child.
171 */
172 set_user_nice(current, 0);
173
169 retval = -EPERM; 174 retval = -EPERM;
170 if (current->fs->root) 175 if (current->fs->root)
171 retval = kernel_execve(sub_info->path, 176 retval = kernel_execve(sub_info->path,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d25a9ada3f8e..9e47d8c493f3 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,16 +35,19 @@
35#include <linux/hash.h> 35#include <linux/hash.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/stddef.h>
38#include <linux/module.h> 39#include <linux/module.h>
39#include <linux/moduleloader.h> 40#include <linux/moduleloader.h>
40#include <linux/kallsyms.h> 41#include <linux/kallsyms.h>
41#include <linux/freezer.h> 42#include <linux/freezer.h>
42#include <linux/seq_file.h> 43#include <linux/seq_file.h>
43#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/kdebug.h>
46
44#include <asm-generic/sections.h> 47#include <asm-generic/sections.h>
45#include <asm/cacheflush.h> 48#include <asm/cacheflush.h>
46#include <asm/errno.h> 49#include <asm/errno.h>
47#include <asm/kdebug.h> 50#include <asm/uaccess.h>
48 51
49#define KPROBE_HASH_BITS 6 52#define KPROBE_HASH_BITS 6
50#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 53#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
@@ -63,6 +66,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
63static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 66static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
64static atomic_t kprobe_count; 67static atomic_t kprobe_count;
65 68
69/* NOTE: change this value only with kprobe_mutex held */
70static bool kprobe_enabled;
71
66DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
67DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 73DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
68static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 74static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -132,9 +138,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
132 struct kprobe_insn_page *kip; 138 struct kprobe_insn_page *kip;
133 struct hlist_node *pos; 139 struct hlist_node *pos;
134 140
135 retry: 141 retry:
136 hlist_for_each(pos, &kprobe_insn_pages) { 142 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
137 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
138 if (kip->nused < INSNS_PER_PAGE) { 143 if (kip->nused < INSNS_PER_PAGE) {
139 int i; 144 int i;
140 for (i = 0; i < INSNS_PER_PAGE; i++) { 145 for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -155,9 +160,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
155 } 160 }
156 /* All out of space. Need to allocate a new page. Use slot 0. */ 161 /* All out of space. Need to allocate a new page. Use slot 0. */
157 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 162 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
158 if (!kip) { 163 if (!kip)
159 return NULL; 164 return NULL;
160 }
161 165
162 /* 166 /*
163 * Use module_alloc so this page is within +/- 2GB of where the 167 * Use module_alloc so this page is within +/- 2GB of where the
@@ -213,9 +217,8 @@ static int __kprobes collect_garbage_slots(void)
213 if (check_safety() != 0) 217 if (check_safety() != 0)
214 return -EAGAIN; 218 return -EAGAIN;
215 219
216 hlist_for_each_safe(pos, next, &kprobe_insn_pages) { 220 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
217 int i; 221 int i;
218 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
219 if (kip->ngarbage == 0) 222 if (kip->ngarbage == 0)
220 continue; 223 continue;
221 kip->ngarbage = 0; /* we will collect all garbages */ 224 kip->ngarbage = 0; /* we will collect all garbages */
@@ -234,8 +237,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
234 struct kprobe_insn_page *kip; 237 struct kprobe_insn_page *kip;
235 struct hlist_node *pos; 238 struct hlist_node *pos;
236 239
237 hlist_for_each(pos, &kprobe_insn_pages) { 240 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
238 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
239 if (kip->insns <= slot && 241 if (kip->insns <= slot &&
240 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 242 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
241 int i = (slot - kip->insns) / MAX_INSN_SIZE; 243 int i = (slot - kip->insns) / MAX_INSN_SIZE;
@@ -248,9 +250,9 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
248 break; 250 break;
249 } 251 }
250 } 252 }
251 if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { 253
254 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
252 collect_garbage_slots(); 255 collect_garbage_slots();
253 }
254} 256}
255#endif 257#endif
256 258
@@ -316,7 +318,6 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
316 reset_kprobe_instance(); 318 reset_kprobe_instance();
317 } 319 }
318 } 320 }
319 return;
320} 321}
321 322
322static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 323static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
@@ -362,46 +363,6 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
362} 363}
363 364
364/* Called with kretprobe_lock held */ 365/* Called with kretprobe_lock held */
365struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
366{
367 struct hlist_node *node;
368 struct kretprobe_instance *ri;
369 hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
370 return ri;
371 return NULL;
372}
373
374/* Called with kretprobe_lock held */
375static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
376 *rp)
377{
378 struct hlist_node *node;
379 struct kretprobe_instance *ri;
380 hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
381 return ri;
382 return NULL;
383}
384
385/* Called with kretprobe_lock held */
386void __kprobes add_rp_inst(struct kretprobe_instance *ri)
387{
388 /*
389 * Remove rp inst off the free list -
390 * Add it back when probed function returns
391 */
392 hlist_del(&ri->uflist);
393
394 /* Add rp inst onto table */
395 INIT_HLIST_NODE(&ri->hlist);
396 hlist_add_head(&ri->hlist,
397 &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
398
399 /* Also add this rp inst to the used list. */
400 INIT_HLIST_NODE(&ri->uflist);
401 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
402}
403
404/* Called with kretprobe_lock held */
405void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, 366void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
406 struct hlist_head *head) 367 struct hlist_head *head)
407{ 368{
@@ -454,7 +415,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
454static inline void free_rp_inst(struct kretprobe *rp) 415static inline void free_rp_inst(struct kretprobe *rp)
455{ 416{
456 struct kretprobe_instance *ri; 417 struct kretprobe_instance *ri;
457 while ((ri = get_free_rp_inst(rp)) != NULL) { 418 struct hlist_node *pos, *next;
419
420 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) {
458 hlist_del(&ri->uflist); 421 hlist_del(&ri->uflist);
459 kfree(ri); 422 kfree(ri);
460 } 423 }
@@ -535,8 +498,8 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
535 498
536static int __kprobes in_kprobes_functions(unsigned long addr) 499static int __kprobes in_kprobes_functions(unsigned long addr)
537{ 500{
538 if (addr >= (unsigned long)__kprobes_text_start 501 if (addr >= (unsigned long)__kprobes_text_start &&
539 && addr < (unsigned long)__kprobes_text_end) 502 addr < (unsigned long)__kprobes_text_end)
540 return -EINVAL; 503 return -EINVAL;
541 return 0; 504 return 0;
542} 505}
@@ -563,19 +526,24 @@ static int __kprobes __register_kprobe(struct kprobe *p,
563 return -EINVAL; 526 return -EINVAL;
564 p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); 527 p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
565 528
566 if ((!kernel_text_address((unsigned long) p->addr)) || 529 if (!kernel_text_address((unsigned long) p->addr) ||
567 in_kprobes_functions((unsigned long) p->addr)) 530 in_kprobes_functions((unsigned long) p->addr))
568 return -EINVAL; 531 return -EINVAL;
569 532
570 p->mod_refcounted = 0; 533 p->mod_refcounted = 0;
571 /* Check are we probing a module */ 534
572 if ((probed_mod = module_text_address((unsigned long) p->addr))) { 535 /*
536 * Check if are we probing a module.
537 */
538 probed_mod = module_text_address((unsigned long) p->addr);
539 if (probed_mod) {
573 struct module *calling_mod = module_text_address(called_from); 540 struct module *calling_mod = module_text_address(called_from);
574 /* We must allow modules to probe themself and 541 /*
575 * in this case avoid incrementing the module refcount, 542 * We must allow modules to probe themself and in this case
576 * so as to allow unloading of self probing modules. 543 * avoid incrementing the module refcount, so as to allow
544 * unloading of self probing modules.
577 */ 545 */
578 if (calling_mod && (calling_mod != probed_mod)) { 546 if (calling_mod && calling_mod != probed_mod) {
579 if (unlikely(!try_module_get(probed_mod))) 547 if (unlikely(!try_module_get(probed_mod)))
580 return -EINVAL; 548 return -EINVAL;
581 p->mod_refcounted = 1; 549 p->mod_refcounted = 1;
@@ -593,19 +561,21 @@ static int __kprobes __register_kprobe(struct kprobe *p,
593 goto out; 561 goto out;
594 } 562 }
595 563
596 if ((ret = arch_prepare_kprobe(p)) != 0) 564 ret = arch_prepare_kprobe(p);
565 if (ret)
597 goto out; 566 goto out;
598 567
599 INIT_HLIST_NODE(&p->hlist); 568 INIT_HLIST_NODE(&p->hlist);
600 hlist_add_head_rcu(&p->hlist, 569 hlist_add_head_rcu(&p->hlist,
601 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 570 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
602 571
603 if (atomic_add_return(1, &kprobe_count) == \ 572 if (kprobe_enabled) {
573 if (atomic_add_return(1, &kprobe_count) == \
604 (ARCH_INACTIVE_KPROBE_COUNT + 1)) 574 (ARCH_INACTIVE_KPROBE_COUNT + 1))
605 register_page_fault_notifier(&kprobe_page_fault_nb); 575 register_page_fault_notifier(&kprobe_page_fault_nb);
606
607 arch_arm_kprobe(p);
608 576
577 arch_arm_kprobe(p);
578 }
609out: 579out:
610 mutex_unlock(&kprobe_mutex); 580 mutex_unlock(&kprobe_mutex);
611 581
@@ -616,8 +586,7 @@ out:
616 586
617int __kprobes register_kprobe(struct kprobe *p) 587int __kprobes register_kprobe(struct kprobe *p)
618{ 588{
619 return __register_kprobe(p, 589 return __register_kprobe(p, (unsigned long)__builtin_return_address(0));
620 (unsigned long)__builtin_return_address(0));
621} 590}
622 591
623void __kprobes unregister_kprobe(struct kprobe *p) 592void __kprobes unregister_kprobe(struct kprobe *p)
@@ -641,11 +610,16 @@ void __kprobes unregister_kprobe(struct kprobe *p)
641 return; 610 return;
642 } 611 }
643valid_p: 612valid_p:
644 if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && 613 if (old_p == p ||
645 (p->list.next == &old_p->list) && 614 (old_p->pre_handler == aggr_pre_handler &&
646 (p->list.prev == &old_p->list))) { 615 p->list.next == &old_p->list && p->list.prev == &old_p->list)) {
647 /* Only probe on the hash list */ 616 /*
648 arch_disarm_kprobe(p); 617 * Only probe on the hash list. Disarm only if kprobes are
618 * enabled - otherwise, the breakpoint would already have
619 * been removed. We save on flushing icache.
620 */
621 if (kprobe_enabled)
622 arch_disarm_kprobe(p);
649 hlist_del_rcu(&old_p->hlist); 623 hlist_del_rcu(&old_p->hlist);
650 cleanup_p = 1; 624 cleanup_p = 1;
651 } else { 625 } else {
@@ -656,9 +630,11 @@ valid_p:
656 mutex_unlock(&kprobe_mutex); 630 mutex_unlock(&kprobe_mutex);
657 631
658 synchronize_sched(); 632 synchronize_sched();
659 if (p->mod_refcounted && 633 if (p->mod_refcounted) {
660 (mod = module_text_address((unsigned long)p->addr))) 634 mod = module_text_address((unsigned long)p->addr);
661 module_put(mod); 635 if (mod)
636 module_put(mod);
637 }
662 638
663 if (cleanup_p) { 639 if (cleanup_p) {
664 if (p != old_p) { 640 if (p != old_p) {
@@ -729,7 +705,21 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
729 705
730 /*TODO: consider to only swap the RA after the last pre_handler fired */ 706 /*TODO: consider to only swap the RA after the last pre_handler fired */
731 spin_lock_irqsave(&kretprobe_lock, flags); 707 spin_lock_irqsave(&kretprobe_lock, flags);
732 arch_prepare_kretprobe(rp, regs); 708 if (!hlist_empty(&rp->free_instances)) {
709 struct kretprobe_instance *ri;
710
711 ri = hlist_entry(rp->free_instances.first,
712 struct kretprobe_instance, uflist);
713 ri->rp = rp;
714 ri->task = current;
715 arch_prepare_kretprobe(ri, regs);
716
717 /* XXX(hch): why is there no hlist_move_head? */
718 hlist_del(&ri->uflist);
719 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
720 hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task));
721 } else
722 rp->nmissed++;
733 spin_unlock_irqrestore(&kretprobe_lock, flags); 723 spin_unlock_irqrestore(&kretprobe_lock, flags);
734 return 0; 724 return 0;
735} 725}
@@ -792,11 +782,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
792{ 782{
793 unsigned long flags; 783 unsigned long flags;
794 struct kretprobe_instance *ri; 784 struct kretprobe_instance *ri;
785 struct hlist_node *pos, *next;
795 786
796 unregister_kprobe(&rp->kp); 787 unregister_kprobe(&rp->kp);
788
797 /* No race here */ 789 /* No race here */
798 spin_lock_irqsave(&kretprobe_lock, flags); 790 spin_lock_irqsave(&kretprobe_lock, flags);
799 while ((ri = get_used_rp_inst(rp)) != NULL) { 791 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
800 ri->rp = NULL; 792 ri->rp = NULL;
801 hlist_del(&ri->uflist); 793 hlist_del(&ri->uflist);
802 } 794 }
@@ -816,6 +808,9 @@ static int __init init_kprobes(void)
816 } 808 }
817 atomic_set(&kprobe_count, 0); 809 atomic_set(&kprobe_count, 0);
818 810
811 /* By default, kprobes are enabled */
812 kprobe_enabled = true;
813
819 err = arch_init_kprobes(); 814 err = arch_init_kprobes();
820 if (!err) 815 if (!err)
821 err = register_die_notifier(&kprobe_exceptions_nb); 816 err = register_die_notifier(&kprobe_exceptions_nb);
@@ -825,7 +820,7 @@ static int __init init_kprobes(void)
825 820
826#ifdef CONFIG_DEBUG_FS 821#ifdef CONFIG_DEBUG_FS
827static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 822static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
828 const char *sym, int offset,char *modname) 823 const char *sym, int offset,char *modname)
829{ 824{
830 char *kprobe_type; 825 char *kprobe_type;
831 826
@@ -867,13 +862,13 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
867 struct kprobe *p, *kp; 862 struct kprobe *p, *kp;
868 const char *sym = NULL; 863 const char *sym = NULL;
869 unsigned int i = *(loff_t *) v; 864 unsigned int i = *(loff_t *) v;
870 unsigned long size, offset = 0; 865 unsigned long offset = 0;
871 char *modname, namebuf[128]; 866 char *modname, namebuf[128];
872 867
873 head = &kprobe_table[i]; 868 head = &kprobe_table[i];
874 preempt_disable(); 869 preempt_disable();
875 hlist_for_each_entry_rcu(p, node, head, hlist) { 870 hlist_for_each_entry_rcu(p, node, head, hlist) {
876 sym = kallsyms_lookup((unsigned long)p->addr, &size, 871 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
877 &offset, &modname, namebuf); 872 &offset, &modname, namebuf);
878 if (p->pre_handler == aggr_pre_handler) { 873 if (p->pre_handler == aggr_pre_handler) {
879 list_for_each_entry_rcu(kp, &p->list, list) 874 list_for_each_entry_rcu(kp, &p->list, list)
@@ -904,21 +899,149 @@ static struct file_operations debugfs_kprobes_operations = {
904 .release = seq_release, 899 .release = seq_release,
905}; 900};
906 901
902static void __kprobes enable_all_kprobes(void)
903{
904 struct hlist_head *head;
905 struct hlist_node *node;
906 struct kprobe *p;
907 unsigned int i;
908
909 mutex_lock(&kprobe_mutex);
910
911 /* If kprobes are already enabled, just return */
912 if (kprobe_enabled)
913 goto already_enabled;
914
915 /*
916 * Re-register the page fault notifier only if there are any
917 * active probes at the time of enabling kprobes globally
918 */
919 if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT)
920 register_page_fault_notifier(&kprobe_page_fault_nb);
921
922 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
923 head = &kprobe_table[i];
924 hlist_for_each_entry_rcu(p, node, head, hlist)
925 arch_arm_kprobe(p);
926 }
927
928 kprobe_enabled = true;
929 printk(KERN_INFO "Kprobes globally enabled\n");
930
931already_enabled:
932 mutex_unlock(&kprobe_mutex);
933 return;
934}
935
936static void __kprobes disable_all_kprobes(void)
937{
938 struct hlist_head *head;
939 struct hlist_node *node;
940 struct kprobe *p;
941 unsigned int i;
942
943 mutex_lock(&kprobe_mutex);
944
945 /* If kprobes are already disabled, just return */
946 if (!kprobe_enabled)
947 goto already_disabled;
948
949 kprobe_enabled = false;
950 printk(KERN_INFO "Kprobes globally disabled\n");
951 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
952 head = &kprobe_table[i];
953 hlist_for_each_entry_rcu(p, node, head, hlist) {
954 if (!arch_trampoline_kprobe(p))
955 arch_disarm_kprobe(p);
956 }
957 }
958
959 mutex_unlock(&kprobe_mutex);
960 /* Allow all currently running kprobes to complete */
961 synchronize_sched();
962
963 mutex_lock(&kprobe_mutex);
964 /* Unconditionally unregister the page_fault notifier */
965 unregister_page_fault_notifier(&kprobe_page_fault_nb);
966
967already_disabled:
968 mutex_unlock(&kprobe_mutex);
969 return;
970}
971
972/*
973 * XXX: The debugfs bool file interface doesn't allow for callbacks
974 * when the bool state is switched. We can reuse that facility when
975 * available
976 */
977static ssize_t read_enabled_file_bool(struct file *file,
978 char __user *user_buf, size_t count, loff_t *ppos)
979{
980 char buf[3];
981
982 if (kprobe_enabled)
983 buf[0] = '1';
984 else
985 buf[0] = '0';
986 buf[1] = '\n';
987 buf[2] = 0x00;
988 return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
989}
990
991static ssize_t write_enabled_file_bool(struct file *file,
992 const char __user *user_buf, size_t count, loff_t *ppos)
993{
994 char buf[32];
995 int buf_size;
996
997 buf_size = min(count, (sizeof(buf)-1));
998 if (copy_from_user(buf, user_buf, buf_size))
999 return -EFAULT;
1000
1001 switch (buf[0]) {
1002 case 'y':
1003 case 'Y':
1004 case '1':
1005 enable_all_kprobes();
1006 break;
1007 case 'n':
1008 case 'N':
1009 case '0':
1010 disable_all_kprobes();
1011 break;
1012 }
1013
1014 return count;
1015}
1016
1017static struct file_operations fops_kp = {
1018 .read = read_enabled_file_bool,
1019 .write = write_enabled_file_bool,
1020};
1021
907static int __kprobes debugfs_kprobe_init(void) 1022static int __kprobes debugfs_kprobe_init(void)
908{ 1023{
909 struct dentry *dir, *file; 1024 struct dentry *dir, *file;
1025 unsigned int value = 1;
910 1026
911 dir = debugfs_create_dir("kprobes", NULL); 1027 dir = debugfs_create_dir("kprobes", NULL);
912 if (!dir) 1028 if (!dir)
913 return -ENOMEM; 1029 return -ENOMEM;
914 1030
915 file = debugfs_create_file("list", 0444, dir , 0 , 1031 file = debugfs_create_file("list", 0444, dir, NULL,
916 &debugfs_kprobes_operations); 1032 &debugfs_kprobes_operations);
917 if (!file) { 1033 if (!file) {
918 debugfs_remove(dir); 1034 debugfs_remove(dir);
919 return -ENOMEM; 1035 return -ENOMEM;
920 } 1036 }
921 1037
1038 file = debugfs_create_file("enabled", 0600, dir,
1039 &value, &fops_kp);
1040 if (!file) {
1041 debugfs_remove(dir);
1042 return -ENOMEM;
1043 }
1044
922 return 0; 1045 return 0;
923} 1046}
924 1047
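For context, the "enabled" file created above is a plain debugfs boolean: writing '1', 'y' or 'Y' re-arms every registered probe via enable_all_kprobes(), writing '0', 'n' or 'N' disarms them via disable_all_kprobes(), and reading it returns the current state. A small userspace sketch of driving that control, assuming debugfs is mounted at the usual /sys/kernel/debug:

/* Userspace sketch: toggle the kprobes "enabled" debugfs file.
 * Assumes debugfs is mounted at /sys/kernel/debug. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int write_enabled(const char *val)
{
	int fd = open("/sys/kernel/debug/kprobes/enabled", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	if (write_enabled("0"))		/* globally disarm all kprobes */
		perror("disable");
	if (write_enabled("1"))		/* re-arm them */
		perror("enable");
	return 0;
}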
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7065a687ac54..1a5ff2211d88 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -257,9 +257,8 @@ static int save_trace(struct stack_trace *trace)
257 trace->entries = stack_trace + nr_stack_trace_entries; 257 trace->entries = stack_trace + nr_stack_trace_entries;
258 258
259 trace->skip = 3; 259 trace->skip = 3;
260 trace->all_contexts = 0;
261 260
262 save_stack_trace(trace, NULL); 261 save_stack_trace(trace);
263 262
264 trace->max_entries = trace->nr_entries; 263 trace->max_entries = trace->nr_entries;
265 264
@@ -341,10 +340,7 @@ static const char *usage_str[] =
341 340
342const char * __get_key_name(struct lockdep_subclass_key *key, char *str) 341const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
343{ 342{
344 unsigned long offs, size; 343 return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
345 char *modname;
346
347 return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str);
348} 344}
349 345
350void 346void
@@ -1313,8 +1309,9 @@ out_unlock_set:
1313 1309
1314/* 1310/*
1315 * Look up a dependency chain. If the key is not present yet then 1311 * Look up a dependency chain. If the key is not present yet then
1316 * add it and return 0 - in this case the new dependency chain is 1312 * add it and return 1 - in this case the new dependency chain is
1317 * validated. If the key is already hashed, return 1. 1313 * validated. If the key is already hashed, return 0.
1314 * (On return with 1 graph_lock is held.)
1318 */ 1315 */
1319static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) 1316static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
1320{ 1317{
@@ -1577,7 +1574,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
1577 * Mark a lock with a usage bit, and validate the state transition: 1574 * Mark a lock with a usage bit, and validate the state transition:
1578 */ 1575 */
1579static int mark_lock(struct task_struct *curr, struct held_lock *this, 1576static int mark_lock(struct task_struct *curr, struct held_lock *this,
1580 enum lock_usage_bit new_bit, unsigned long ip) 1577 enum lock_usage_bit new_bit)
1581{ 1578{
1582 unsigned int new_mask = 1 << new_bit, ret = 1; 1579 unsigned int new_mask = 1 << new_bit, ret = 1;
1583 1580
@@ -1600,14 +1597,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1600 1597
1601 this->class->usage_mask |= new_mask; 1598 this->class->usage_mask |= new_mask;
1602 1599
1603#ifdef CONFIG_TRACE_IRQFLAGS
1604 if (new_bit == LOCK_ENABLED_HARDIRQS ||
1605 new_bit == LOCK_ENABLED_HARDIRQS_READ)
1606 ip = curr->hardirq_enable_ip;
1607 else if (new_bit == LOCK_ENABLED_SOFTIRQS ||
1608 new_bit == LOCK_ENABLED_SOFTIRQS_READ)
1609 ip = curr->softirq_enable_ip;
1610#endif
1611 if (!save_trace(this->class->usage_traces + new_bit)) 1600 if (!save_trace(this->class->usage_traces + new_bit))
1612 return 0; 1601 return 0;
1613 1602
@@ -1806,7 +1795,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1806 * Mark all held locks with a usage bit: 1795 * Mark all held locks with a usage bit:
1807 */ 1796 */
1808static int 1797static int
1809mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) 1798mark_held_locks(struct task_struct *curr, int hardirq)
1810{ 1799{
1811 enum lock_usage_bit usage_bit; 1800 enum lock_usage_bit usage_bit;
1812 struct held_lock *hlock; 1801 struct held_lock *hlock;
@@ -1826,7 +1815,7 @@ mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
1826 else 1815 else
1827 usage_bit = LOCK_ENABLED_SOFTIRQS; 1816 usage_bit = LOCK_ENABLED_SOFTIRQS;
1828 } 1817 }
1829 if (!mark_lock(curr, hlock, usage_bit, ip)) 1818 if (!mark_lock(curr, hlock, usage_bit))
1830 return 0; 1819 return 0;
1831 } 1820 }
1832 1821
@@ -1879,7 +1868,7 @@ void trace_hardirqs_on(void)
1879 * We are going to turn hardirqs on, so set the 1868 * We are going to turn hardirqs on, so set the
1880 * usage bit for all held locks: 1869 * usage bit for all held locks:
1881 */ 1870 */
1882 if (!mark_held_locks(curr, 1, ip)) 1871 if (!mark_held_locks(curr, 1))
1883 return; 1872 return;
1884 /* 1873 /*
1885 * If we have softirqs enabled, then set the usage 1874 * If we have softirqs enabled, then set the usage
@@ -1887,7 +1876,7 @@ void trace_hardirqs_on(void)
1887 * this bit from being set before) 1876 * this bit from being set before)
1888 */ 1877 */
1889 if (curr->softirqs_enabled) 1878 if (curr->softirqs_enabled)
1890 if (!mark_held_locks(curr, 0, ip)) 1879 if (!mark_held_locks(curr, 0))
1891 return; 1880 return;
1892 1881
1893 curr->hardirq_enable_ip = ip; 1882 curr->hardirq_enable_ip = ip;
@@ -1955,7 +1944,7 @@ void trace_softirqs_on(unsigned long ip)
1955 * enabled too: 1944 * enabled too:
1956 */ 1945 */
1957 if (curr->hardirqs_enabled) 1946 if (curr->hardirqs_enabled)
1958 mark_held_locks(curr, 0, ip); 1947 mark_held_locks(curr, 0);
1959} 1948}
1960 1949
1961/* 1950/*
@@ -2093,43 +2082,43 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2093 if (read) { 2082 if (read) {
2094 if (curr->hardirq_context) 2083 if (curr->hardirq_context)
2095 if (!mark_lock(curr, hlock, 2084 if (!mark_lock(curr, hlock,
2096 LOCK_USED_IN_HARDIRQ_READ, ip)) 2085 LOCK_USED_IN_HARDIRQ_READ))
2097 return 0; 2086 return 0;
2098 if (curr->softirq_context) 2087 if (curr->softirq_context)
2099 if (!mark_lock(curr, hlock, 2088 if (!mark_lock(curr, hlock,
2100 LOCK_USED_IN_SOFTIRQ_READ, ip)) 2089 LOCK_USED_IN_SOFTIRQ_READ))
2101 return 0; 2090 return 0;
2102 } else { 2091 } else {
2103 if (curr->hardirq_context) 2092 if (curr->hardirq_context)
2104 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip)) 2093 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
2105 return 0; 2094 return 0;
2106 if (curr->softirq_context) 2095 if (curr->softirq_context)
2107 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip)) 2096 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
2108 return 0; 2097 return 0;
2109 } 2098 }
2110 } 2099 }
2111 if (!hardirqs_off) { 2100 if (!hardirqs_off) {
2112 if (read) { 2101 if (read) {
2113 if (!mark_lock(curr, hlock, 2102 if (!mark_lock(curr, hlock,
2114 LOCK_ENABLED_HARDIRQS_READ, ip)) 2103 LOCK_ENABLED_HARDIRQS_READ))
2115 return 0; 2104 return 0;
2116 if (curr->softirqs_enabled) 2105 if (curr->softirqs_enabled)
2117 if (!mark_lock(curr, hlock, 2106 if (!mark_lock(curr, hlock,
2118 LOCK_ENABLED_SOFTIRQS_READ, ip)) 2107 LOCK_ENABLED_SOFTIRQS_READ))
2119 return 0; 2108 return 0;
2120 } else { 2109 } else {
2121 if (!mark_lock(curr, hlock, 2110 if (!mark_lock(curr, hlock,
2122 LOCK_ENABLED_HARDIRQS, ip)) 2111 LOCK_ENABLED_HARDIRQS))
2123 return 0; 2112 return 0;
2124 if (curr->softirqs_enabled) 2113 if (curr->softirqs_enabled)
2125 if (!mark_lock(curr, hlock, 2114 if (!mark_lock(curr, hlock,
2126 LOCK_ENABLED_SOFTIRQS, ip)) 2115 LOCK_ENABLED_SOFTIRQS))
2127 return 0; 2116 return 0;
2128 } 2117 }
2129 } 2118 }
2130#endif 2119#endif
2131 /* mark it as used: */ 2120 /* mark it as used: */
2132 if (!mark_lock(curr, hlock, LOCK_USED, ip)) 2121 if (!mark_lock(curr, hlock, LOCK_USED))
2133 return 0; 2122 return 0;
2134out_calc_hash: 2123out_calc_hash:
2135 /* 2124 /*
diff --git a/kernel/module.c b/kernel/module.c
index 1eb8ca565ba0..d36e45477fac 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -19,6 +19,7 @@
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/kallsyms.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
24#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
@@ -310,14 +311,14 @@ static int split_block(unsigned int i, unsigned short size)
310{ 311{
311 /* Reallocation required? */ 312 /* Reallocation required? */
312 if (pcpu_num_used + 1 > pcpu_num_allocated) { 313 if (pcpu_num_used + 1 > pcpu_num_allocated) {
313 int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, 314 int *new;
314 GFP_KERNEL); 315
316 new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
317 GFP_KERNEL);
315 if (!new) 318 if (!new)
316 return 0; 319 return 0;
317 320
318 memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
319 pcpu_num_allocated *= 2; 321 pcpu_num_allocated *= 2;
320 kfree(pcpu_size);
321 pcpu_size = new; 322 pcpu_size = new;
322 } 323 }
323 324
@@ -1471,7 +1472,7 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1471} 1472}
1472 1473
1473#ifdef CONFIG_KALLSYMS 1474#ifdef CONFIG_KALLSYMS
1474int is_exported(const char *name, const struct module *mod) 1475static int is_exported(const char *name, const struct module *mod)
1475{ 1476{
1476 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1477 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
1477 return 1; 1478 return 1;
@@ -2097,8 +2098,10 @@ static const char *get_ksymbol(struct module *mod,
2097 if (!best) 2098 if (!best)
2098 return NULL; 2099 return NULL;
2099 2100
2100 *size = nextval - mod->symtab[best].st_value; 2101 if (size)
2101 *offset = addr - mod->symtab[best].st_value; 2102 *size = nextval - mod->symtab[best].st_value;
2103 if (offset)
2104 *offset = addr - mod->symtab[best].st_value;
2102 return mod->strtab + mod->symtab[best].st_name; 2105 return mod->strtab + mod->symtab[best].st_name;
2103} 2106}
2104 2107
@@ -2123,8 +2126,58 @@ const char *module_address_lookup(unsigned long addr,
2123 return NULL; 2126 return NULL;
2124} 2127}
2125 2128
2126struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, 2129int lookup_module_symbol_name(unsigned long addr, char *symname)
2127 char *type, char *name, size_t namelen) 2130{
2131 struct module *mod;
2132
2133 mutex_lock(&module_mutex);
2134 list_for_each_entry(mod, &modules, list) {
2135 if (within(addr, mod->module_init, mod->init_size) ||
2136 within(addr, mod->module_core, mod->core_size)) {
2137 const char *sym;
2138
2139 sym = get_ksymbol(mod, addr, NULL, NULL);
2140 if (!sym)
2141 goto out;
2142 strlcpy(symname, sym, KSYM_NAME_LEN + 1);
2143 mutex_unlock(&module_mutex);
2144 return 0;
2145 }
2146 }
2147out:
2148 mutex_unlock(&module_mutex);
2149 return -ERANGE;
2150}
2151
2152int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2153 unsigned long *offset, char *modname, char *name)
2154{
2155 struct module *mod;
2156
2157 mutex_lock(&module_mutex);
2158 list_for_each_entry(mod, &modules, list) {
2159 if (within(addr, mod->module_init, mod->init_size) ||
2160 within(addr, mod->module_core, mod->core_size)) {
2161 const char *sym;
2162
2163 sym = get_ksymbol(mod, addr, size, offset);
2164 if (!sym)
2165 goto out;
2166 if (modname)
2167 strlcpy(modname, mod->name, MODULE_NAME_LEN + 1);
2168 if (name)
2169 strlcpy(name, sym, KSYM_NAME_LEN + 1);
2170 mutex_unlock(&module_mutex);
2171 return 0;
2172 }
2173 }
2174out:
2175 mutex_unlock(&module_mutex);
2176 return -ERANGE;
2177}
2178
2179int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2180 char *name, char *module_name, int *exported)
2128{ 2181{
2129 struct module *mod; 2182 struct module *mod;
2130 2183
@@ -2134,14 +2187,16 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
2134 *value = mod->symtab[symnum].st_value; 2187 *value = mod->symtab[symnum].st_value;
2135 *type = mod->symtab[symnum].st_info; 2188 *type = mod->symtab[symnum].st_info;
2136 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, 2189 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
2137 namelen); 2190 KSYM_NAME_LEN + 1);
2191 strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1);
2192 *exported = is_exported(name, mod);
2138 mutex_unlock(&module_mutex); 2193 mutex_unlock(&module_mutex);
2139 return mod; 2194 return 0;
2140 } 2195 }
2141 symnum -= mod->num_symtab; 2196 symnum -= mod->num_symtab;
2142 } 2197 }
2143 mutex_unlock(&module_mutex); 2198 mutex_unlock(&module_mutex);
2144 return NULL; 2199 return -ERANGE;
2145} 2200}
2146 2201
2147static unsigned long mod_find_symname(struct module *mod, const char *name) 2202static unsigned long mod_find_symname(struct module *mod, const char *name)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5b9ee6f6bbb..1bc4b55241a8 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -38,10 +38,8 @@ void get_task_namespaces(struct task_struct *tsk)
38 38
39/* 39/*
40 * creates a copy of "orig" with refcount 1. 40 * creates a copy of "orig" with refcount 1.
41 * This does not grab references to the contained namespaces,
42 * so that needs to be done by dup_namespaces.
43 */ 41 */
44static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) 42static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
45{ 43{
46 struct nsproxy *ns; 44 struct nsproxy *ns;
47 45
@@ -52,26 +50,49 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
52} 50}
53 51
54/* 52/*
55 * copies the nsproxy, setting refcount to 1, and grabbing a 53 * Create new nsproxy and all of its associated namespaces.
56 * reference to all contained namespaces. Called from 54 * Return the newly created nsproxy. Do not attach this to the task,
57 * sys_unshare() 55 * leave it to the caller to do proper locking and attach it to task.
58 */ 56 */
59struct nsproxy *dup_namespaces(struct nsproxy *orig) 57static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk,
58 struct fs_struct *new_fs)
60{ 59{
61 struct nsproxy *ns = clone_namespaces(orig); 60 struct nsproxy *new_nsp;
62 61
63 if (ns) { 62 new_nsp = clone_nsproxy(tsk->nsproxy);
64 if (ns->mnt_ns) 63 if (!new_nsp)
65 get_mnt_ns(ns->mnt_ns); 64 return ERR_PTR(-ENOMEM);
66 if (ns->uts_ns)
67 get_uts_ns(ns->uts_ns);
68 if (ns->ipc_ns)
69 get_ipc_ns(ns->ipc_ns);
70 if (ns->pid_ns)
71 get_pid_ns(ns->pid_ns);
72 }
73 65
74 return ns; 66 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
67 if (IS_ERR(new_nsp->mnt_ns))
68 goto out_ns;
69
70 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
71 if (IS_ERR(new_nsp->uts_ns))
72 goto out_uts;
73
74 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
75 if (IS_ERR(new_nsp->ipc_ns))
76 goto out_ipc;
77
78 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns);
79 if (IS_ERR(new_nsp->pid_ns))
80 goto out_pid;
81
82 return new_nsp;
83
84out_pid:
85 if (new_nsp->ipc_ns)
86 put_ipc_ns(new_nsp->ipc_ns);
87out_ipc:
88 if (new_nsp->uts_ns)
89 put_uts_ns(new_nsp->uts_ns);
90out_uts:
91 if (new_nsp->mnt_ns)
92 put_mnt_ns(new_nsp->mnt_ns);
93out_ns:
94 kfree(new_nsp);
95 return ERR_PTR(-ENOMEM);
75} 96}
76 97
77/* 98/*
@@ -92,47 +113,21 @@ int copy_namespaces(int flags, struct task_struct *tsk)
92 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) 113 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
93 return 0; 114 return 0;
94 115
95 new_ns = clone_namespaces(old_ns); 116 if (!capable(CAP_SYS_ADMIN)) {
96 if (!new_ns) { 117 err = -EPERM;
97 err = -ENOMEM;
98 goto out; 118 goto out;
99 } 119 }
100 120
101 tsk->nsproxy = new_ns; 121 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
102 122 if (IS_ERR(new_ns)) {
103 err = copy_mnt_ns(flags, tsk); 123 err = PTR_ERR(new_ns);
104 if (err) 124 goto out;
105 goto out_ns; 125 }
106
107 err = copy_utsname(flags, tsk);
108 if (err)
109 goto out_uts;
110
111 err = copy_ipcs(flags, tsk);
112 if (err)
113 goto out_ipc;
114
115 err = copy_pid_ns(flags, tsk);
116 if (err)
117 goto out_pid;
118 126
127 tsk->nsproxy = new_ns;
119out: 128out:
120 put_nsproxy(old_ns); 129 put_nsproxy(old_ns);
121 return err; 130 return err;
122
123out_pid:
124 if (new_ns->ipc_ns)
125 put_ipc_ns(new_ns->ipc_ns);
126out_ipc:
127 if (new_ns->uts_ns)
128 put_uts_ns(new_ns->uts_ns);
129out_uts:
130 if (new_ns->mnt_ns)
131 put_mnt_ns(new_ns->mnt_ns);
132out_ns:
133 tsk->nsproxy = old_ns;
134 kfree(new_ns);
135 goto out;
136} 131}
137 132
138void free_nsproxy(struct nsproxy *ns) 133void free_nsproxy(struct nsproxy *ns)
@@ -147,3 +142,41 @@ void free_nsproxy(struct nsproxy *ns)
147 put_pid_ns(ns->pid_ns); 142 put_pid_ns(ns->pid_ns);
148 kfree(ns); 143 kfree(ns);
149} 144}
145
146/*
147 * Called from unshare. Unshare all the namespaces that are part of nsproxy.
148 * On success, returns the new nsproxy and a reference to the old nsproxy
149 * to make sure it stays around.
150 */
151int unshare_nsproxy_namespaces(unsigned long unshare_flags,
152 struct nsproxy **new_nsp, struct fs_struct *new_fs)
153{
154 struct nsproxy *old_ns = current->nsproxy;
155 int err = 0;
156
157 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
158 return 0;
159
160#ifndef CONFIG_IPC_NS
161 if (unshare_flags & CLONE_NEWIPC)
162 return -EINVAL;
163#endif
164
165#ifndef CONFIG_UTS_NS
166 if (unshare_flags & CLONE_NEWUTS)
167 return -EINVAL;
168#endif
169
170 if (!capable(CAP_SYS_ADMIN))
171 return -EPERM;
172
173 get_nsproxy(old_ns);
174
175 *new_nsp = create_new_namespaces(unshare_flags, current,
176 new_fs ? new_fs : current->fs);
177 if (IS_ERR(*new_nsp)) {
178 err = PTR_ERR(*new_nsp);
179 put_nsproxy(old_ns);
180 }
181 return err;
182}
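The error path of create_new_namespaces() above is the standard goto-unwind idiom: each namespace copied so far is released in reverse order before the half-built nsproxy is freed. A self-contained userspace sketch of the same control flow, with plain malloc()/free() standing in for the namespace get/put helpers:

/* Userspace sketch of the goto-unwind idiom used above: acquire
 * resources in order, release the ones already held in reverse
 * order on failure, then free the container itself. */
#include <stdio.h>
#include <stdlib.h>

struct pair {
	char *a;
	char *b;
};

static struct pair *make_pair(int fail_second)
{
	struct pair *p = malloc(sizeof(*p));

	if (!p)
		return NULL;

	p->a = malloc(16);			/* resource #1 */
	if (!p->a)
		goto out_free;

	p->b = fail_second ? NULL : malloc(16);	/* resource #2, may "fail" */
	if (!p->b)
		goto out_put_a;

	return p;

out_put_a:
	free(p->a);				/* undo in reverse order */
out_free:
	free(p);
	return NULL;
}

int main(void)
{
	printf("%s\n", make_pair(1) ? "built" : "unwound cleanly");
	return 0;
}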
diff --git a/kernel/params.c b/kernel/params.c
index 312172320b4c..e61c46c97ce7 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -269,7 +269,7 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
269 return param_get_bool(buffer, &dummy); 269 return param_get_bool(buffer, &dummy);
270} 270}
271 271
272/* We cheat here and temporarily mangle the string. */ 272/* We break the rule and mangle the string. */
273static int param_array(const char *name, 273static int param_array(const char *name,
274 const char *val, 274 const char *val,
275 unsigned int min, unsigned int max, 275 unsigned int min, unsigned int max,
diff --git a/kernel/pid.c b/kernel/pid.c
index 9c80bc23d6b8..d3ad724afa83 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -360,16 +360,11 @@ struct pid *find_ge_pid(int nr)
360} 360}
361EXPORT_SYMBOL_GPL(find_get_pid); 361EXPORT_SYMBOL_GPL(find_get_pid);
362 362
363int copy_pid_ns(int flags, struct task_struct *tsk) 363struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns)
364{ 364{
365 struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; 365 BUG_ON(!old_ns);
366 int err = 0;
367
368 if (!old_ns)
369 return 0;
370
371 get_pid_ns(old_ns); 366 get_pid_ns(old_ns);
372 return err; 367 return old_ns;
373} 368}
374 369
375void free_pid_ns(struct kref *kref) 370void free_pid_ns(struct kref *kref)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 657f77697415..1de710e18373 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -971,7 +971,7 @@ static void check_thread_timers(struct task_struct *tsk,
971 maxfire = 20; 971 maxfire = 20;
972 tsk->it_prof_expires = cputime_zero; 972 tsk->it_prof_expires = cputime_zero;
973 while (!list_empty(timers)) { 973 while (!list_empty(timers)) {
974 struct cpu_timer_list *t = list_entry(timers->next, 974 struct cpu_timer_list *t = list_first_entry(timers,
975 struct cpu_timer_list, 975 struct cpu_timer_list,
976 entry); 976 entry);
977 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 977 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
@@ -986,7 +986,7 @@ static void check_thread_timers(struct task_struct *tsk,
986 maxfire = 20; 986 maxfire = 20;
987 tsk->it_virt_expires = cputime_zero; 987 tsk->it_virt_expires = cputime_zero;
988 while (!list_empty(timers)) { 988 while (!list_empty(timers)) {
989 struct cpu_timer_list *t = list_entry(timers->next, 989 struct cpu_timer_list *t = list_first_entry(timers,
990 struct cpu_timer_list, 990 struct cpu_timer_list,
991 entry); 991 entry);
992 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 992 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
@@ -1001,7 +1001,7 @@ static void check_thread_timers(struct task_struct *tsk,
1001 maxfire = 20; 1001 maxfire = 20;
1002 tsk->it_sched_expires = 0; 1002 tsk->it_sched_expires = 0;
1003 while (!list_empty(timers)) { 1003 while (!list_empty(timers)) {
1004 struct cpu_timer_list *t = list_entry(timers->next, 1004 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 1005 struct cpu_timer_list,
1006 entry); 1006 entry);
1007 if (!--maxfire || tsk->sched_time < t->expires.sched) { 1007 if (!--maxfire || tsk->sched_time < t->expires.sched) {
@@ -1057,7 +1057,7 @@ static void check_process_timers(struct task_struct *tsk,
1057 maxfire = 20; 1057 maxfire = 20;
1058 prof_expires = cputime_zero; 1058 prof_expires = cputime_zero;
1059 while (!list_empty(timers)) { 1059 while (!list_empty(timers)) {
1060 struct cpu_timer_list *t = list_entry(timers->next, 1060 struct cpu_timer_list *t = list_first_entry(timers,
1061 struct cpu_timer_list, 1061 struct cpu_timer_list,
1062 entry); 1062 entry);
1063 if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { 1063 if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
@@ -1072,7 +1072,7 @@ static void check_process_timers(struct task_struct *tsk,
1072 maxfire = 20; 1072 maxfire = 20;
1073 virt_expires = cputime_zero; 1073 virt_expires = cputime_zero;
1074 while (!list_empty(timers)) { 1074 while (!list_empty(timers)) {
1075 struct cpu_timer_list *t = list_entry(timers->next, 1075 struct cpu_timer_list *t = list_first_entry(timers,
1076 struct cpu_timer_list, 1076 struct cpu_timer_list,
1077 entry); 1077 entry);
1078 if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { 1078 if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
@@ -1087,7 +1087,7 @@ static void check_process_timers(struct task_struct *tsk,
1087 maxfire = 20; 1087 maxfire = 20;
1088 sched_expires = 0; 1088 sched_expires = 0;
1089 while (!list_empty(timers)) { 1089 while (!list_empty(timers)) {
1090 struct cpu_timer_list *t = list_entry(timers->next, 1090 struct cpu_timer_list *t = list_first_entry(timers,
1091 struct cpu_timer_list, 1091 struct cpu_timer_list,
1092 entry); 1092 entry);
1093 if (!--maxfire || sched_time < t->expires.sched) { 1093 if (!--maxfire || sched_time < t->expires.sched) {
@@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1400 */ 1400 */
1401 head = &tsk->signal->cpu_timers[clock_idx]; 1401 head = &tsk->signal->cpu_timers[clock_idx];
1402 if (list_empty(head) || 1402 if (list_empty(head) ||
1403 cputime_ge(list_entry(head->next, 1403 cputime_ge(list_first_entry(head,
1404 struct cpu_timer_list, entry)->expires.cpu, 1404 struct cpu_timer_list, entry)->expires.cpu,
1405 *newval)) { 1405 *newval)) {
1406 /* 1406 /*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 44318ca71978..588c99da0307 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -31,7 +31,6 @@
31 * POSIX clocks & timers 31 * POSIX clocks & timers
32 */ 32 */
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/smp_lock.h>
35#include <linux/interrupt.h> 34#include <linux/interrupt.h>
36#include <linux/slab.h> 35#include <linux/slab.h>
37#include <linux/time.h> 36#include <linux/time.h>
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0eb5c420e8ed..088419387388 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -8,7 +8,6 @@
8 8
9#undef DEBUG 9#undef DEBUG
10 10
11#include <linux/smp_lock.h>
12#include <linux/interrupt.h> 11#include <linux/interrupt.h>
13#include <linux/suspend.h> 12#include <linux/suspend.h>
14#include <linux/module.h> 13#include <linux/module.h>
@@ -25,10 +24,9 @@
25 24
26static inline int freezeable(struct task_struct * p) 25static inline int freezeable(struct task_struct * p)
27{ 26{
28 if ((p == current) || 27 if ((p == current) ||
29 (p->flags & PF_NOFREEZE) || 28 (p->flags & PF_NOFREEZE) ||
30 (p->exit_state == EXIT_ZOMBIE) || 29 (p->exit_state != 0))
31 (p->exit_state == EXIT_DEAD))
32 return 0; 30 return 0;
33 return 1; 31 return 1;
34} 32}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 128da11f01c2..b7039772b05c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -14,7 +14,6 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/suspend.h> 16#include <linux/suspend.h>
17#include <linux/smp_lock.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/bitops.h> 18#include <linux/bitops.h>
20#include <linux/spinlock.h> 19#include <linux/spinlock.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index e83ed9945a80..b8b235cc19d1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,7 +12,6 @@
12 */ 12 */
13 13
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/smp_lock.h>
16#include <linux/file.h> 15#include <linux/file.h>
17#include <linux/utsname.h> 16#include <linux/utsname.h>
18#include <linux/version.h> 17#include <linux/version.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 4b47e59248df..0bbdeac2810c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -20,7 +20,6 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/tty.h> 21#include <linux/tty.h>
22#include <linux/tty_driver.h> 22#include <linux/tty_driver.h>
23#include <linux/smp_lock.h>
24#include <linux/console.h> 23#include <linux/console.h>
25#include <linux/init.h> 24#include <linux/init.h>
26#include <linux/module.h> 25#include <linux/module.h>
@@ -931,8 +930,16 @@ void register_console(struct console *console)
931{ 930{
932 int i; 931 int i;
933 unsigned long flags; 932 unsigned long flags;
933 struct console *bootconsole = NULL;
934 934
935 if (preferred_console < 0) 935 if (console_drivers) {
936 if (console->flags & CON_BOOT)
937 return;
938 if (console_drivers->flags & CON_BOOT)
939 bootconsole = console_drivers;
940 }
941
942 if (preferred_console < 0 || bootconsole || !console_drivers)
936 preferred_console = selected_console; 943 preferred_console = selected_console;
937 944
938 /* 945 /*
@@ -978,8 +985,11 @@ void register_console(struct console *console)
978 if (!(console->flags & CON_ENABLED)) 985 if (!(console->flags & CON_ENABLED))
979 return; 986 return;
980 987
981 if (console_drivers && (console_drivers->flags & CON_BOOT)) { 988 if (bootconsole) {
982 unregister_console(console_drivers); 989 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
990 bootconsole->name, bootconsole->index,
991 console->name, console->index);
992 unregister_console(bootconsole);
983 console->flags &= ~CON_PRINTBUFFER; 993 console->flags &= ~CON_PRINTBUFFER;
984 } 994 }
985 995
@@ -1030,16 +1040,11 @@ int unregister_console(struct console *console)
1030 } 1040 }
1031 } 1041 }
1032 1042
1033 /* If last console is removed, we re-enable picking the first 1043 /*
1034 * one that gets registered. Without that, pmac early boot console
1035 * would prevent fbcon from taking over.
1036 *
1037 * If this isn't the last console and it has CON_CONSDEV set, we 1044 * If this isn't the last console and it has CON_CONSDEV set, we
1038 * need to set it on the next preferred console. 1045 * need to set it on the next preferred console.
1039 */ 1046 */
1040 if (console_drivers == NULL) 1047 if (console_drivers != NULL && console->flags & CON_CONSDEV)
1041 preferred_console = selected_console;
1042 else if (console->flags & CON_CONSDEV)
1043 console_drivers->flags |= CON_CONSDEV; 1048 console_drivers->flags |= CON_CONSDEV;
1044 1049
1045 release_console_sem(); 1050 release_console_sem();
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index bcd14e83ef39..55ba82a85a66 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -502,10 +502,6 @@ static struct rcu_torture_ops sched_ops = {
502 .name = "sched" 502 .name = "sched"
503}; 503};
504 504
505static struct rcu_torture_ops *torture_ops[] =
506 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops,
507 &sched_ops, NULL };
508
509/* 505/*
510 * RCU torture writer kthread. Repeatedly substitutes a new structure 506 * RCU torture writer kthread. Repeatedly substitutes a new structure
511 * for that pointed to by rcu_torture_current, freeing the old structure 507 * for that pointed to by rcu_torture_current, freeing the old structure
@@ -534,7 +530,7 @@ rcu_torture_writer(void *arg)
534 rp->rtort_mbtest = 1; 530 rp->rtort_mbtest = 1;
535 rcu_assign_pointer(rcu_torture_current, rp); 531 rcu_assign_pointer(rcu_torture_current, rp);
536 smp_wmb(); 532 smp_wmb();
537 if (old_rp != NULL) { 533 if (old_rp) {
538 i = old_rp->rtort_pipe_count; 534 i = old_rp->rtort_pipe_count;
539 if (i > RCU_TORTURE_PIPE_LEN) 535 if (i > RCU_TORTURE_PIPE_LEN)
540 i = RCU_TORTURE_PIPE_LEN; 536 i = RCU_TORTURE_PIPE_LEN;
@@ -685,7 +681,7 @@ rcu_torture_printk(char *page)
685 atomic_read(&rcu_torture_wcount[i])); 681 atomic_read(&rcu_torture_wcount[i]));
686 } 682 }
687 cnt += sprintf(&page[cnt], "\n"); 683 cnt += sprintf(&page[cnt], "\n");
688 if (cur_ops->stats != NULL) 684 if (cur_ops->stats)
689 cnt += cur_ops->stats(&page[cnt]); 685 cnt += cur_ops->stats(&page[cnt]);
690 return cnt; 686 return cnt;
691} 687}
@@ -749,13 +745,13 @@ static void rcu_torture_shuffle_tasks(void)
749 745
750 set_cpus_allowed(current, tmp_mask); 746 set_cpus_allowed(current, tmp_mask);
751 747
752 if (reader_tasks != NULL) { 748 if (reader_tasks) {
753 for (i = 0; i < nrealreaders; i++) 749 for (i = 0; i < nrealreaders; i++)
754 if (reader_tasks[i]) 750 if (reader_tasks[i])
755 set_cpus_allowed(reader_tasks[i], tmp_mask); 751 set_cpus_allowed(reader_tasks[i], tmp_mask);
756 } 752 }
757 753
758 if (fakewriter_tasks != NULL) { 754 if (fakewriter_tasks) {
759 for (i = 0; i < nfakewriters; i++) 755 for (i = 0; i < nfakewriters; i++)
760 if (fakewriter_tasks[i]) 756 if (fakewriter_tasks[i])
761 set_cpus_allowed(fakewriter_tasks[i], tmp_mask); 757 set_cpus_allowed(fakewriter_tasks[i], tmp_mask);
@@ -808,21 +804,21 @@ rcu_torture_cleanup(void)
808 int i; 804 int i;
809 805
810 fullstop = 1; 806 fullstop = 1;
811 if (shuffler_task != NULL) { 807 if (shuffler_task) {
812 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 808 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
813 kthread_stop(shuffler_task); 809 kthread_stop(shuffler_task);
814 } 810 }
815 shuffler_task = NULL; 811 shuffler_task = NULL;
816 812
817 if (writer_task != NULL) { 813 if (writer_task) {
818 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 814 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
819 kthread_stop(writer_task); 815 kthread_stop(writer_task);
820 } 816 }
821 writer_task = NULL; 817 writer_task = NULL;
822 818
823 if (reader_tasks != NULL) { 819 if (reader_tasks) {
824 for (i = 0; i < nrealreaders; i++) { 820 for (i = 0; i < nrealreaders; i++) {
825 if (reader_tasks[i] != NULL) { 821 if (reader_tasks[i]) {
826 VERBOSE_PRINTK_STRING( 822 VERBOSE_PRINTK_STRING(
827 "Stopping rcu_torture_reader task"); 823 "Stopping rcu_torture_reader task");
828 kthread_stop(reader_tasks[i]); 824 kthread_stop(reader_tasks[i]);
@@ -834,9 +830,9 @@ rcu_torture_cleanup(void)
834 } 830 }
835 rcu_torture_current = NULL; 831 rcu_torture_current = NULL;
836 832
837 if (fakewriter_tasks != NULL) { 833 if (fakewriter_tasks) {
838 for (i = 0; i < nfakewriters; i++) { 834 for (i = 0; i < nfakewriters; i++) {
839 if (fakewriter_tasks[i] != NULL) { 835 if (fakewriter_tasks[i]) {
840 VERBOSE_PRINTK_STRING( 836 VERBOSE_PRINTK_STRING(
841 "Stopping rcu_torture_fakewriter task"); 837 "Stopping rcu_torture_fakewriter task");
842 kthread_stop(fakewriter_tasks[i]); 838 kthread_stop(fakewriter_tasks[i]);
@@ -847,7 +843,7 @@ rcu_torture_cleanup(void)
847 fakewriter_tasks = NULL; 843 fakewriter_tasks = NULL;
848 } 844 }
849 845
850 if (stats_task != NULL) { 846 if (stats_task) {
851 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); 847 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
852 kthread_stop(stats_task); 848 kthread_stop(stats_task);
853 } 849 }
@@ -858,7 +854,7 @@ rcu_torture_cleanup(void)
858 854
859 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 855 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
860 856
861 if (cur_ops->cleanup != NULL) 857 if (cur_ops->cleanup)
862 cur_ops->cleanup(); 858 cur_ops->cleanup();
863 if (atomic_read(&n_rcu_torture_error)) 859 if (atomic_read(&n_rcu_torture_error))
864 rcu_torture_print_module_parms("End of test: FAILURE"); 860 rcu_torture_print_module_parms("End of test: FAILURE");
@@ -866,27 +862,28 @@ rcu_torture_cleanup(void)
866 rcu_torture_print_module_parms("End of test: SUCCESS"); 862 rcu_torture_print_module_parms("End of test: SUCCESS");
867} 863}
868 864
869static int 865static int __init
870rcu_torture_init(void) 866rcu_torture_init(void)
871{ 867{
872 int i; 868 int i;
873 int cpu; 869 int cpu;
874 int firsterr = 0; 870 int firsterr = 0;
871 static struct rcu_torture_ops *torture_ops[] =
872 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
873 &srcu_ops, &sched_ops, };
875 874
876 /* Process args and tell the world that the torturer is on the job. */ 875 /* Process args and tell the world that the torturer is on the job. */
877 876 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
878 for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
879 cur_ops = torture_ops[i]; 877 cur_ops = torture_ops[i];
880 if (strcmp(torture_type, cur_ops->name) == 0) { 878 if (strcmp(torture_type, cur_ops->name) == 0)
881 break; 879 break;
882 }
883 } 880 }
884 if (cur_ops == NULL) { 881 if (i == ARRAY_SIZE(torture_ops)) {
885 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 882 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
886 torture_type); 883 torture_type);
887 return (-EINVAL); 884 return (-EINVAL);
888 } 885 }
889 if (cur_ops->init != NULL) 886 if (cur_ops->init)
890 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 887 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
891 888
892 if (nreaders >= 0) 889 if (nreaders >= 0)
@@ -899,7 +896,7 @@ rcu_torture_init(void)
899 /* Set up the freelist. */ 896 /* Set up the freelist. */
900 897
901 INIT_LIST_HEAD(&rcu_torture_freelist); 898 INIT_LIST_HEAD(&rcu_torture_freelist);
902 for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { 899 for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
903 rcu_tortures[i].rtort_mbtest = 0; 900 rcu_tortures[i].rtort_mbtest = 0;
904 list_add_tail(&rcu_tortures[i].rtort_free, 901 list_add_tail(&rcu_tortures[i].rtort_free,
905 &rcu_torture_freelist); 902 &rcu_torture_freelist);
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 291ded556aa0..9a87886b022e 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -60,7 +60,7 @@ int down_write_trylock(struct rw_semaphore *sem)
60 int ret = __down_write_trylock(sem); 60 int ret = __down_write_trylock(sem);
61 61
62 if (ret == 1) 62 if (ret == 1)
63 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 63 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
64 return ret; 64 return ret;
65} 65}
66 66
diff --git a/kernel/sched.c b/kernel/sched.c
index 0227f1625a75..a3a04085e794 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,8 +52,9 @@
52#include <linux/tsacct_kern.h> 52#include <linux/tsacct_kern.h>
53#include <linux/kprobes.h> 53#include <linux/kprobes.h>
54#include <linux/delayacct.h> 54#include <linux/delayacct.h>
55#include <asm/tlb.h> 55#include <linux/reciprocal_div.h>
56 56
57#include <asm/tlb.h>
57#include <asm/unistd.h> 58#include <asm/unistd.h>
58 59
59/* 60/*
@@ -168,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
168 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) 169 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
169 170
170#define TASK_PREEMPTS_CURR(p, rq) \ 171#define TASK_PREEMPTS_CURR(p, rq) \
171 ((p)->prio < (rq)->curr->prio) 172 (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active))
172 173
173#define SCALE_PRIO(x, prio) \ 174#define SCALE_PRIO(x, prio) \
174 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) 175 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio)
181 return SCALE_PRIO(DEF_TIMESLICE, static_prio); 182 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
182} 183}
183 184
185#ifdef CONFIG_SMP
186/*
187 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
188 * Since cpu_power is a 'constant', we can use a reciprocal divide.
189 */
190static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
191{
192 return reciprocal_divide(load, sg->reciprocal_cpu_power);
193}
194
195/*
196 * Each time a sched group cpu_power is changed,
197 * we must compute its reciprocal value
198 */
199static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
200{
201 sg->__cpu_power += val;
202 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
203}
204#endif
205
184/* 206/*
185 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 207 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
186 * to time slice values: [800ms ... 100ms ... 5ms] 208 * to time slice values: [800ms ... 100ms ... 5ms]
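A quick worked example of the reciprocal-divide trick introduced above, under the assumption that reciprocal_value(k) computes roughly (2^32 + k - 1) / k and reciprocal_divide(a, r) computes ((u64)a * r) >> 32, as the <linux/reciprocal_div.h> helpers of this era do. The point is that the division by a sched group's cpu_power turns into a multiply and a shift once sg_inc_cpu_power() has cached the reciprocal:

/* Worked example of the reciprocal divide used by sg_div_cpu_power():
 * dividing by a (rarely changing) value k becomes a multiply and a
 * shift once r = reciprocal_value(k) has been cached. */
#include <stdint.h>
#include <stdio.h>

static uint32_t reciprocal_value(uint32_t k)
{
	/* Assumed formula, matching the kernel helper of this era. */
	return (uint32_t)((((uint64_t)1 << 32) + k - 1) / k);
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	uint32_t cpu_power = 4096;			/* stand-in for sg->__cpu_power */
	uint32_t r = reciprocal_value(cpu_power);	/* recomputed when cpu_power changes */

	/* 524288 / 4096 == 128, with no integer division at this point */
	printf("%u\n", reciprocal_divide(524288, r));
	return 0;
}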
@@ -223,6 +245,10 @@ struct rq {
223 unsigned long raw_weighted_load; 245 unsigned long raw_weighted_load;
224#ifdef CONFIG_SMP 246#ifdef CONFIG_SMP
225 unsigned long cpu_load[3]; 247 unsigned long cpu_load[3];
248 unsigned char idle_at_tick;
249#ifdef CONFIG_NO_HZ
250 unsigned char in_nohz_recently;
251#endif
226#endif 252#endif
227 unsigned long long nr_switches; 253 unsigned long long nr_switches;
228 254
@@ -278,7 +304,7 @@ struct rq {
278 struct lock_class_key rq_lock_key; 304 struct lock_class_key rq_lock_key;
279}; 305};
280 306
281static DEFINE_PER_CPU(struct rq, runqueues); 307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
282 308
283static inline int cpu_of(struct rq *rq) 309static inline int cpu_of(struct rq *rq)
284{ 310{
@@ -1049,6 +1075,17 @@ static void resched_task(struct task_struct *p)
1049 if (!tsk_is_polling(p)) 1075 if (!tsk_is_polling(p))
1050 smp_send_reschedule(cpu); 1076 smp_send_reschedule(cpu);
1051} 1077}
1078
1079static void resched_cpu(int cpu)
1080{
1081 struct rq *rq = cpu_rq(cpu);
1082 unsigned long flags;
1083
1084 if (!spin_trylock_irqsave(&rq->lock, flags))
1085 return;
1086 resched_task(cpu_curr(cpu));
1087 spin_unlock_irqrestore(&rq->lock, flags);
1088}
1052#else 1089#else
1053static inline void resched_task(struct task_struct *p) 1090static inline void resched_task(struct task_struct *p)
1054{ 1091{
@@ -1241,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1241 } 1278 }
1242 1279
1243 /* Adjust by relative CPU power of the group */ 1280 /* Adjust by relative CPU power of the group */
1244 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1281 avg_load = sg_div_cpu_power(group,
1282 avg_load * SCHED_LOAD_SCALE);
1245 1283
1246 if (local_group) { 1284 if (local_group) {
1247 this_load = avg_load; 1285 this_load = avg_load;
@@ -1368,7 +1406,16 @@ static int wake_idle(int cpu, struct task_struct *p)
1368 struct sched_domain *sd; 1406 struct sched_domain *sd;
1369 int i; 1407 int i;
1370 1408
1371 if (idle_cpu(cpu)) 1409 /*
1410 * If it is idle, then it is the best cpu to run this task.
1411 *
1412 * This cpu is also the best, if it has more than one task already.
1413 * Siblings must be also busy(in most cases) as they didn't already
1414 * pickup the extra load from this cpu and hence we need not check
1415 * sibling runqueue info. This will avoid the checks and cache miss
1416 * penalities associated with that.
1417 */
1418 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1372 return cpu; 1419 return cpu;
1373 1420
1374 for_each_domain(cpu, sd) { 1421 for_each_domain(cpu, sd) {
@@ -2352,12 +2399,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2352 } 2399 }
2353 2400
2354 total_load += avg_load; 2401 total_load += avg_load;
2355 total_pwr += group->cpu_power; 2402 total_pwr += group->__cpu_power;
2356 2403
2357 /* Adjust by relative CPU power of the group */ 2404 /* Adjust by relative CPU power of the group */
2358 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2405 avg_load = sg_div_cpu_power(group,
2406 avg_load * SCHED_LOAD_SCALE);
2359 2407
2360 group_capacity = group->cpu_power / SCHED_LOAD_SCALE; 2408 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2361 2409
2362 if (local_group) { 2410 if (local_group) {
2363 this_load = avg_load; 2411 this_load = avg_load;
@@ -2468,8 +2516,8 @@ group_next:
2468 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 2516 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2469 2517
2470 /* How much load to actually move to equalise the imbalance */ 2518 /* How much load to actually move to equalise the imbalance */
2471 *imbalance = min(max_pull * busiest->cpu_power, 2519 *imbalance = min(max_pull * busiest->__cpu_power,
2472 (avg_load - this_load) * this->cpu_power) 2520 (avg_load - this_load) * this->__cpu_power)
2473 / SCHED_LOAD_SCALE; 2521 / SCHED_LOAD_SCALE;
2474 2522
2475 /* 2523 /*
@@ -2503,28 +2551,29 @@ small_imbalance:
2503 * moving them. 2551 * moving them.
2504 */ 2552 */
2505 2553
2506 pwr_now += busiest->cpu_power * 2554 pwr_now += busiest->__cpu_power *
2507 min(busiest_load_per_task, max_load); 2555 min(busiest_load_per_task, max_load);
2508 pwr_now += this->cpu_power * 2556 pwr_now += this->__cpu_power *
2509 min(this_load_per_task, this_load); 2557 min(this_load_per_task, this_load);
2510 pwr_now /= SCHED_LOAD_SCALE; 2558 pwr_now /= SCHED_LOAD_SCALE;
2511 2559
2512 /* Amount of load we'd subtract */ 2560 /* Amount of load we'd subtract */
2513 tmp = busiest_load_per_task * SCHED_LOAD_SCALE / 2561 tmp = sg_div_cpu_power(busiest,
2514 busiest->cpu_power; 2562 busiest_load_per_task * SCHED_LOAD_SCALE);
2515 if (max_load > tmp) 2563 if (max_load > tmp)
2516 pwr_move += busiest->cpu_power * 2564 pwr_move += busiest->__cpu_power *
2517 min(busiest_load_per_task, max_load - tmp); 2565 min(busiest_load_per_task, max_load - tmp);
2518 2566
2519 /* Amount of load we'd add */ 2567 /* Amount of load we'd add */
2520 if (max_load * busiest->cpu_power < 2568 if (max_load * busiest->__cpu_power <
2521 busiest_load_per_task * SCHED_LOAD_SCALE) 2569 busiest_load_per_task * SCHED_LOAD_SCALE)
2522 tmp = max_load * busiest->cpu_power / this->cpu_power; 2570 tmp = sg_div_cpu_power(this,
2571 max_load * busiest->__cpu_power);
2523 else 2572 else
2524 tmp = busiest_load_per_task * SCHED_LOAD_SCALE / 2573 tmp = sg_div_cpu_power(this,
2525 this->cpu_power; 2574 busiest_load_per_task * SCHED_LOAD_SCALE);
2526 pwr_move += this->cpu_power * 2575 pwr_move += this->__cpu_power *
2527 min(this_load_per_task, this_load + tmp); 2576 min(this_load_per_task, this_load + tmp);
2528 pwr_move /= SCHED_LOAD_SCALE; 2577 pwr_move /= SCHED_LOAD_SCALE;
2529 2578
2530 /* Move if we gain throughput */ 2579 /* Move if we gain throughput */
@@ -2657,6 +2706,12 @@ redo:
2657 double_rq_unlock(this_rq, busiest); 2706 double_rq_unlock(this_rq, busiest);
2658 local_irq_restore(flags); 2707 local_irq_restore(flags);
2659 2708
2709 /*
2710 * some other cpu did the load balance for us.
2711 */
2712 if (nr_moved && this_cpu != smp_processor_id())
2713 resched_cpu(this_cpu);
2714
2660 /* All tasks on this runqueue were pinned by CPU affinity */ 2715 /* All tasks on this runqueue were pinned by CPU affinity */
2661 if (unlikely(all_pinned)) { 2716 if (unlikely(all_pinned)) {
2662 cpu_clear(cpu_of(busiest), cpus); 2717 cpu_clear(cpu_of(busiest), cpus);
@@ -2927,32 +2982,98 @@ static void update_load(struct rq *this_rq)
2927 } 2982 }
2928} 2983}
2929 2984
2985#ifdef CONFIG_NO_HZ
2986static struct {
2987 atomic_t load_balancer;
2988 cpumask_t cpu_mask;
2989} nohz ____cacheline_aligned = {
2990 .load_balancer = ATOMIC_INIT(-1),
2991 .cpu_mask = CPU_MASK_NONE,
2992};
2993
2930/* 2994/*
2931 * run_rebalance_domains is triggered when needed from the scheduler tick. 2995 * This routine will try to nominate the ilb (idle load balancing)
2996 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2997 * load balancing on behalf of all those cpus. If all the cpus in the system
2998 * go into this tickless mode, then there will be no ilb owner (as there is
2999 * no need for one) and all the cpus will sleep till the next wakeup event
3000 * arrives...
2932 * 3001 *
3002 * For the ilb owner, tick is not stopped. And this tick will be used
3003 * for idle load balancing. ilb owner will still be part of
3004 * nohz.cpu_mask..
3005 *
3006 * While stopping the tick, this cpu will become the ilb owner if there
3007 * is no other owner. And will be the owner till that cpu becomes busy
3008 * or if all cpus in the system stop their ticks at which point
3009 * there is no need for ilb owner.
3010 *
3011 * When the ilb owner becomes busy, it nominates another owner, during the
3012 * next busy scheduler_tick()
3013 */
3014int select_nohz_load_balancer(int stop_tick)
3015{
3016 int cpu = smp_processor_id();
3017
3018 if (stop_tick) {
3019 cpu_set(cpu, nohz.cpu_mask);
3020 cpu_rq(cpu)->in_nohz_recently = 1;
3021
3022 /*
3023 * If we are going offline and still the leader, give up!
3024 */
3025 if (cpu_is_offline(cpu) &&
3026 atomic_read(&nohz.load_balancer) == cpu) {
3027 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3028 BUG();
3029 return 0;
3030 }
3031
3032 /* time for ilb owner also to sleep */
3033 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3034 if (atomic_read(&nohz.load_balancer) == cpu)
3035 atomic_set(&nohz.load_balancer, -1);
3036 return 0;
3037 }
3038
3039 if (atomic_read(&nohz.load_balancer) == -1) {
3040 /* make me the ilb owner */
3041 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3042 return 1;
3043 } else if (atomic_read(&nohz.load_balancer) == cpu)
3044 return 1;
3045 } else {
3046 if (!cpu_isset(cpu, nohz.cpu_mask))
3047 return 0;
3048
3049 cpu_clear(cpu, nohz.cpu_mask);
3050
3051 if (atomic_read(&nohz.load_balancer) == cpu)
3052 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3053 BUG();
3054 }
3055 return 0;
3056}
3057#endif
3058
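The cmpxchg-based election above can be illustrated outside the kernel: whichever tick-stopping cpu wins the atomic compare-and-swap of the owner field from -1 to its own id becomes the idle-load-balance owner, and it hands ownership back the same way when it turns busy. A hedged, self-contained C sketch of just that handshake, with C11 atomics standing in for the kernel's atomic_cmpxchg():

/* Userspace model of the ilb-owner election: -1 means "no owner",
 * and atomic compare-and-swap decides races between idle cpus. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = -1;	/* like nohz.load_balancer */

/* Tick is being stopped on @cpu: returns 1 if it is (now) the ilb owner. */
static int stop_tick(int cpu)
{
	int none = -1;

	if (atomic_load(&load_balancer) == cpu)
		return 1;
	return atomic_compare_exchange_strong(&load_balancer, &none, cpu);
}

/* @cpu became busy again: drop ownership if it held it. */
static void start_tick(int cpu)
{
	int me = cpu;

	atomic_compare_exchange_strong(&load_balancer, &me, -1);
}

int main(void)
{
	printf("cpu0 owner: %d\n", stop_tick(0));	/* 1, first to claim */
	printf("cpu1 owner: %d\n", stop_tick(1));	/* 0, cpu0 already owns */
	start_tick(0);					/* cpu0 goes busy */
	printf("cpu1 owner: %d\n", stop_tick(1));	/* 1, ownership is free */
	return 0;
}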
3059static DEFINE_SPINLOCK(balancing);
3060
3061/*
2933 * It checks each scheduling domain to see if it is due to be balanced, 3062 * It checks each scheduling domain to see if it is due to be balanced,
2934 * and initiates a balancing operation if so. 3063 * and initiates a balancing operation if so.
2935 * 3064 *
2936 * Balancing parameters are set up in arch_init_sched_domains. 3065 * Balancing parameters are set up in arch_init_sched_domains.
2937 */ 3066 */
2938static DEFINE_SPINLOCK(balancing); 3067static inline void rebalance_domains(int cpu, enum idle_type idle)
2939
2940static void run_rebalance_domains(struct softirq_action *h)
2941{ 3068{
2942 int this_cpu = smp_processor_id(), balance = 1; 3069 int balance = 1;
2943 struct rq *this_rq = cpu_rq(this_cpu); 3070 struct rq *rq = cpu_rq(cpu);
2944 unsigned long interval; 3071 unsigned long interval;
2945 struct sched_domain *sd; 3072 struct sched_domain *sd;
2946 /* 3073 /* Earliest time when we have to do rebalance again */
2947 * We are idle if there are no processes running. This
2948 * is valid even if we are the idle process (SMT).
2949 */
2950 enum idle_type idle = !this_rq->nr_running ?
2951 SCHED_IDLE : NOT_IDLE;
2952 /* Earliest time when we have to call run_rebalance_domains again */
2953 unsigned long next_balance = jiffies + 60*HZ; 3074 unsigned long next_balance = jiffies + 60*HZ;
2954 3075
2955 for_each_domain(this_cpu, sd) { 3076 for_each_domain(cpu, sd) {
2956 if (!(sd->flags & SD_LOAD_BALANCE)) 3077 if (!(sd->flags & SD_LOAD_BALANCE))
2957 continue; 3078 continue;
2958 3079
@@ -2971,7 +3092,7 @@ static void run_rebalance_domains(struct softirq_action *h)
2971 } 3092 }
2972 3093
2973 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3094 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2974 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { 3095 if (load_balance(cpu, rq, sd, idle, &balance)) {
2975 /* 3096 /*
2976 * We've pulled tasks over so either we're no 3097 * We've pulled tasks over so either we're no
2977 * longer idle, or one of our SMT siblings is 3098 * longer idle, or one of our SMT siblings is
@@ -2995,7 +3116,114 @@ out:
2995 if (!balance) 3116 if (!balance)
2996 break; 3117 break;
2997 } 3118 }
2998 this_rq->next_balance = next_balance; 3119 rq->next_balance = next_balance;
3120}
3121
3122/*
3123 * run_rebalance_domains is triggered when needed from the scheduler tick.
3124 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3125 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3126 */
3127static void run_rebalance_domains(struct softirq_action *h)
3128{
3129 int local_cpu = smp_processor_id();
3130 struct rq *local_rq = cpu_rq(local_cpu);
3131 enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
3132
3133 rebalance_domains(local_cpu, idle);
3134
3135#ifdef CONFIG_NO_HZ
3136 /*
3137 * If this cpu is the owner for idle load balancing, then do the
3138 * balancing on behalf of the other idle cpus whose ticks are
3139 * stopped.
3140 */
3141 if (local_rq->idle_at_tick &&
3142 atomic_read(&nohz.load_balancer) == local_cpu) {
3143 cpumask_t cpus = nohz.cpu_mask;
3144 struct rq *rq;
3145 int balance_cpu;
3146
3147 cpu_clear(local_cpu, cpus);
3148 for_each_cpu_mask(balance_cpu, cpus) {
3149 /*
3150 * If this cpu gets work to do, stop the load balancing
3151 * work being done for other cpus. Next load
3152 * balancing owner will pick it up.
3153 */
3154 if (need_resched())
3155 break;
3156
3157 rebalance_domains(balance_cpu, SCHED_IDLE);
3158
3159 rq = cpu_rq(balance_cpu);
3160 if (time_after(local_rq->next_balance, rq->next_balance))
3161 local_rq->next_balance = rq->next_balance;
3162 }
3163 }
3164#endif
3165}
3166
3167/*
3168 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3169 *
3170 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3171 * idle load balancing owner or decide to stop the periodic load balancing,
3172 * if the whole system is idle.
3173 */
3174static inline void trigger_load_balance(int cpu)
3175{
3176 struct rq *rq = cpu_rq(cpu);
3177#ifdef CONFIG_NO_HZ
3178 /*
3179 * If we were in the nohz mode recently and busy at the current
3180 * scheduler tick, then check if we need to nominate new idle
3181 * load balancer.
3182 */
3183 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3184 rq->in_nohz_recently = 0;
3185
3186 if (atomic_read(&nohz.load_balancer) == cpu) {
3187 cpu_clear(cpu, nohz.cpu_mask);
3188 atomic_set(&nohz.load_balancer, -1);
3189 }
3190
3191 if (atomic_read(&nohz.load_balancer) == -1) {
3192 /*
3193 * simple selection for now: Nominate the
3194 * first cpu in the nohz list to be the next
3195 * ilb owner.
3196 *
3197 * TBD: Traverse the sched domains and nominate
3198 * the nearest cpu in the nohz.cpu_mask.
3199 */
3200 int ilb = first_cpu(nohz.cpu_mask);
3201
3202 if (ilb != NR_CPUS)
3203 resched_cpu(ilb);
3204 }
3205 }
3206
3207 /*
3208 * If this cpu is idle and doing idle load balancing for all the
3209 * cpus with ticks stopped, is it time for that to stop?
3210 */
3211 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3212 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3213 resched_cpu(cpu);
3214 return;
3215 }
3216
3217 /*
3218 * If this cpu is idle and the idle load balancing is done by
3219 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3220 */
3221 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3222 cpu_isset(cpu, nohz.cpu_mask))
3223 return;
3224#endif
3225 if (time_after_eq(jiffies, rq->next_balance))
3226 raise_softirq(SCHED_SOFTIRQ);
2999} 3227}
3000#else 3228#else
3001/* 3229/*
@@ -3218,16 +3446,17 @@ void scheduler_tick(void)
3218 unsigned long long now = sched_clock(); 3446 unsigned long long now = sched_clock();
3219 struct task_struct *p = current; 3447 struct task_struct *p = current;
3220 int cpu = smp_processor_id(); 3448 int cpu = smp_processor_id();
3449 int idle_at_tick = idle_cpu(cpu);
3221 struct rq *rq = cpu_rq(cpu); 3450 struct rq *rq = cpu_rq(cpu);
3222 3451
3223 update_cpu_clock(p, rq, now); 3452 update_cpu_clock(p, rq, now);
3224 3453
3225 if (p != rq->idle) 3454 if (!idle_at_tick)
3226 task_running_tick(rq, p); 3455 task_running_tick(rq, p);
3227#ifdef CONFIG_SMP 3456#ifdef CONFIG_SMP
3228 update_load(rq); 3457 update_load(rq);
3229 if (time_after_eq(jiffies, rq->next_balance)) 3458 rq->idle_at_tick = idle_at_tick;
3230 raise_softirq(SCHED_SOFTIRQ); 3459 trigger_load_balance(cpu);
3231#endif 3460#endif
3232} 3461}
3233 3462
@@ -3847,13 +4076,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3847 struct prio_array *array; 4076 struct prio_array *array;
3848 unsigned long flags; 4077 unsigned long flags;
3849 struct rq *rq; 4078 struct rq *rq;
3850 int oldprio; 4079 int delta;
3851 4080
3852 BUG_ON(prio < 0 || prio > MAX_PRIO); 4081 BUG_ON(prio < 0 || prio > MAX_PRIO);
3853 4082
3854 rq = task_rq_lock(p, &flags); 4083 rq = task_rq_lock(p, &flags);
3855 4084
3856 oldprio = p->prio; 4085 delta = prio - p->prio;
3857 array = p->array; 4086 array = p->array;
3858 if (array) 4087 if (array)
3859 dequeue_task(p, array); 4088 dequeue_task(p, array);
@@ -3869,13 +4098,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3869 enqueue_task(p, array); 4098 enqueue_task(p, array);
3870 /* 4099 /*
3871 * Reschedule if we are currently running on this runqueue and 4100 * Reschedule if we are currently running on this runqueue and
3872 * our priority decreased, or if we are not currently running on 4101 * our priority decreased, or if our priority became higher
3873 * this runqueue and our priority is higher than the current's 4102 * than the current's.
3874 */ 4103 */
3875 if (task_running(rq, p)) { 4104 if (TASK_PREEMPTS_CURR(p, rq) ||
3876 if (p->prio > oldprio) 4105 (delta > 0 && task_running(rq, p)))
3877 resched_task(rq->curr);
3878 } else if (TASK_PREEMPTS_CURR(p, rq))
3879 resched_task(rq->curr); 4106 resched_task(rq->curr);
3880 } 4107 }
3881 task_rq_unlock(rq, &flags); 4108 task_rq_unlock(rq, &flags);
@@ -3923,10 +4150,12 @@ void set_user_nice(struct task_struct *p, long nice)
3923 enqueue_task(p, array); 4150 enqueue_task(p, array);
3924 inc_raw_weighted_load(rq, p); 4151 inc_raw_weighted_load(rq, p);
3925 /* 4152 /*
3926 * If the task increased its priority or is running and 4153 * Reschedule if we are currently running on this runqueue and
3927 * lowered its priority, then reschedule its CPU: 4154 * our priority decreased, or if our priority became higher
4155 * than the current's.
3928 */ 4156 */
3929 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4157 if (TASK_PREEMPTS_CURR(p, rq) ||
4158 (delta > 0 && task_running(rq, p)))
3930 resched_task(rq->curr); 4159 resched_task(rq->curr);
3931 } 4160 }
3932out_unlock: 4161out_unlock:
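
The rt_mutex_setprio() and set_user_nice() hunks above converge on one reschedule rule: kick the current task if the re-queued task would now preempt it, or if the task whose priority just got worse (delta > 0) is the one currently running. A tiny stand-alone predicate with the kernel macros reduced to booleans (need_resched_curr() is a name invented for this sketch):

#include <stdbool.h>
#include <stdio.h>

/*
 * Decide whether the current task on the runqueue should be asked to
 * reschedule after p's priority changed by `delta` (positive == worse).
 * `preempts_curr` models TASK_PREEMPTS_CURR(p, rq); `running` models
 * task_running(rq, p).
 */
static bool need_resched_curr(bool preempts_curr, int delta, bool running)
{
        return preempts_curr || (delta > 0 && running);
}

int main(void)
{
        /* p got better and would preempt the current task: resched */
        printf("%d\n", need_resched_curr(true, -5, false));    /* 1 */
        /* p is running and was just deprioritised: resched so something
           better can take over */
        printf("%d\n", need_resched_curr(false, +3, true));     /* 1 */
        /* p got worse but is not running: nothing to do */
        printf("%d\n", need_resched_curr(false, +3, false));    /* 0 */
        return 0;
}
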
@@ -4153,13 +4382,11 @@ recheck:
4153 __activate_task(p, rq); 4382 __activate_task(p, rq);
4154 /* 4383 /*
4155 * Reschedule if we are currently running on this runqueue and 4384 * Reschedule if we are currently running on this runqueue and
4156 * our priority decreased, or if we are not currently running on 4385 * our priority decreased, or our priority became higher
4157 * this runqueue and our priority is higher than the current's 4386 * than the current's.
4158 */ 4387 */
4159 if (task_running(rq, p)) { 4388 if (TASK_PREEMPTS_CURR(p, rq) ||
4160 if (p->prio > oldprio) 4389 (task_running(rq, p) && p->prio > oldprio))
4161 resched_task(rq->curr);
4162 } else if (TASK_PREEMPTS_CURR(p, rq))
4163 resched_task(rq->curr); 4390 resched_task(rq->curr);
4164 } 4391 }
4165 __task_rq_unlock(rq); 4392 __task_rq_unlock(rq);
@@ -4750,6 +4977,8 @@ void show_state_filter(unsigned long state_filter)
4750 show_task(p); 4977 show_task(p);
4751 } while_each_thread(g, p); 4978 } while_each_thread(g, p);
4752 4979
4980 touch_all_softlockup_watchdogs();
4981
4753 read_unlock(&tasklist_lock); 4982 read_unlock(&tasklist_lock);
4754 /* 4983 /*
4755 * Only show locks if all tasks are dumped: 4984 * Only show locks if all tasks are dumped:
@@ -5304,7 +5533,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5304 break; 5533 break;
5305 } 5534 }
5306 5535
5307 if (!group->cpu_power) { 5536 if (!group->__cpu_power) {
5308 printk("\n"); 5537 printk("\n");
5309 printk(KERN_ERR "ERROR: domain->cpu_power not " 5538 printk(KERN_ERR "ERROR: domain->cpu_power not "
5310 "set\n"); 5539 "set\n");
@@ -5481,7 +5710,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5481 continue; 5710 continue;
5482 5711
5483 sg->cpumask = CPU_MASK_NONE; 5712 sg->cpumask = CPU_MASK_NONE;
5484 sg->cpu_power = 0; 5713 sg->__cpu_power = 0;
5485 5714
5486 for_each_cpu_mask(j, span) { 5715 for_each_cpu_mask(j, span) {
5487 if (group_fn(j, cpu_map, NULL) != group) 5716 if (group_fn(j, cpu_map, NULL) != group)
@@ -6170,7 +6399,7 @@ next_sg:
6170 continue; 6399 continue;
6171 } 6400 }
6172 6401
6173 sg->cpu_power += sd->groups->cpu_power; 6402 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
6174 } 6403 }
6175 sg = sg->next; 6404 sg = sg->next;
6176 if (sg != group_head) 6405 if (sg != group_head)
@@ -6245,6 +6474,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6245 6474
6246 child = sd->child; 6475 child = sd->child;
6247 6476
6477 sd->groups->__cpu_power = 0;
6478
6248 /* 6479 /*
6249 * For perf policy, if the groups in child domain share resources 6480 * For perf policy, if the groups in child domain share resources
6250 * (for example cores sharing some portions of the cache hierarchy 6481 * (for example cores sharing some portions of the cache hierarchy
@@ -6255,18 +6486,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6255 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 6486 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6256 (child->flags & 6487 (child->flags &
6257 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 6488 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6258 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6489 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
6259 return; 6490 return;
6260 } 6491 }
6261 6492
6262 sd->groups->cpu_power = 0;
6263
6264 /* 6493 /*
6265 * add cpu_power of each child group to this groups cpu_power 6494 * add cpu_power of each child group to this groups cpu_power
6266 */ 6495 */
6267 group = child->groups; 6496 group = child->groups;
6268 do { 6497 do {
6269 sd->groups->cpu_power += group->cpu_power; 6498 sg_inc_cpu_power(sd->groups, group->__cpu_power);
6270 group = group->next; 6499 group = group->next;
6271 } while (group != child->groups); 6500 } while (group != child->groups);
6272} 6501}
@@ -6426,7 +6655,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6426 sd = &per_cpu(node_domains, j); 6655 sd = &per_cpu(node_domains, j);
6427 sd->groups = sg; 6656 sd->groups = sg;
6428 } 6657 }
6429 sg->cpu_power = 0; 6658 sg->__cpu_power = 0;
6430 sg->cpumask = nodemask; 6659 sg->cpumask = nodemask;
6431 sg->next = sg; 6660 sg->next = sg;
6432 cpus_or(covered, covered, nodemask); 6661 cpus_or(covered, covered, nodemask);
@@ -6454,7 +6683,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6454 "Can not alloc domain group for node %d\n", j); 6683 "Can not alloc domain group for node %d\n", j);
6455 goto error; 6684 goto error;
6456 } 6685 }
6457 sg->cpu_power = 0; 6686 sg->__cpu_power = 0;
6458 sg->cpumask = tmp; 6687 sg->cpumask = tmp;
6459 sg->next = prev->next; 6688 sg->next = prev->next;
6460 cpus_or(covered, covered, tmp); 6689 cpus_or(covered, covered, tmp);
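
The __cpu_power hunks above also change how a parent group's power is built up: init_sched_groups_power() zeroes sd->groups->__cpu_power first, then walks the child's circular ->next ring exactly once, adding each child group's power through sg_inc_cpu_power(). A sketch of that accumulation with a minimal struct; group_inc_power() here is a plain add, whereas the real helper presumably also keeps derived state (such as a cached reciprocal) in sync:

#include <stdio.h>

/* minimal stand-in for struct sched_group: power plus circular next link */
struct group {
        unsigned long power;
        struct group *next;
};

/* modelled sg_inc_cpu_power(): just accumulate into the parent group */
static void group_inc_power(struct group *sg, unsigned long val)
{
        sg->power += val;
}

int main(void)
{
        struct group child[3] = {
                { .power = 128 }, { .power = 128 }, { .power = 256 },
        };
        /* make the child groups a circular list, as the scheduler does */
        child[0].next = &child[1];
        child[1].next = &child[2];
        child[2].next = &child[0];

        struct group parent = { .power = 0, .next = &parent };

        /* walk the ring exactly once, summing child power into the parent */
        struct group *g = &child[0];
        do {
                group_inc_power(&parent, g->power);
                g = g->next;
        } while (g != &child[0]);

        printf("parent power = %lu\n", parent.power);   /* 512 */
        return 0;
}
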
diff --git a/kernel/signal.c b/kernel/signal.c
index 2b4087d545a3..1368e67c8482 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -12,7 +12,6 @@
12 12
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/smp_lock.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/sched.h> 16#include <linux/sched.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 50afeb813305..8fa7040247ad 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -34,12 +34,32 @@ static struct notifier_block panic_block = {
34 .notifier_call = softlock_panic, 34 .notifier_call = softlock_panic,
35}; 35};
36 36
37/*
38 * Returns seconds, approximately. We don't need nanosecond
39 * resolution, and we don't need to waste time with a big divide when
40 * 2^30ns == 1.074s.
41 */
42static unsigned long get_timestamp(void)
43{
44 return sched_clock() >> 30; /* 2^30 ~= 10^9 */
45}
46
37void touch_softlockup_watchdog(void) 47void touch_softlockup_watchdog(void)
38{ 48{
39 __raw_get_cpu_var(touch_timestamp) = jiffies; 49 __raw_get_cpu_var(touch_timestamp) = get_timestamp();
40} 50}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 51EXPORT_SYMBOL(touch_softlockup_watchdog);
42 52
53void touch_all_softlockup_watchdogs(void)
54{
55 int cpu;
56
57 /* Cause each CPU to re-update its timestamp rather than complain */
58 for_each_online_cpu(cpu)
59 per_cpu(touch_timestamp, cpu) = 0;
60}
61EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
62
43/* 63/*
44 * This callback runs from the timer interrupt, and checks 64 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not: 65 * whether the watchdog thread has hung or not:
@@ -48,9 +68,18 @@ void softlockup_tick(void)
48{ 68{
49 int this_cpu = smp_processor_id(); 69 int this_cpu = smp_processor_id();
50 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); 70 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
71 unsigned long print_timestamp;
72 unsigned long now;
73
74 if (touch_timestamp == 0) {
75 touch_softlockup_watchdog();
76 return;
77 }
78
79 print_timestamp = per_cpu(print_timestamp, this_cpu);
51 80
52 /* prevent double reports: */ 81 /* report at most once a second */
53 if (per_cpu(print_timestamp, this_cpu) == touch_timestamp || 82 if (print_timestamp < (touch_timestamp + 1) ||
54 did_panic || 83 did_panic ||
55 !per_cpu(watchdog_task, this_cpu)) 84 !per_cpu(watchdog_task, this_cpu))
56 return; 85 return;
@@ -61,12 +90,14 @@ void softlockup_tick(void)
61 return; 90 return;
62 } 91 }
63 92
93 now = get_timestamp();
94
64 /* Wake up the high-prio watchdog task every second: */ 95 /* Wake up the high-prio watchdog task every second: */
65 if (time_after(jiffies, touch_timestamp + HZ)) 96 if (now > (touch_timestamp + 1))
66 wake_up_process(per_cpu(watchdog_task, this_cpu)); 97 wake_up_process(per_cpu(watchdog_task, this_cpu));
67 98
68 /* Warn about unreasonable 10+ seconds delays: */ 99 /* Warn about unreasonable 10+ seconds delays: */
69 if (time_after(jiffies, touch_timestamp + 10*HZ)) { 100 if (now > (touch_timestamp + 10)) {
70 per_cpu(print_timestamp, this_cpu) = touch_timestamp; 101 per_cpu(print_timestamp, this_cpu) = touch_timestamp;
71 102
72 spin_lock(&print_lock); 103 spin_lock(&print_lock);
@@ -82,11 +113,14 @@ void softlockup_tick(void)
82 */ 113 */
83static int watchdog(void * __bind_cpu) 114static int watchdog(void * __bind_cpu)
84{ 115{
85 struct sched_param param = { .sched_priority = 99 }; 116 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
86 117
87 sched_setscheduler(current, SCHED_FIFO, &param); 118 sched_setscheduler(current, SCHED_FIFO, &param);
88 current->flags |= PF_NOFREEZE; 119 current->flags |= PF_NOFREEZE;
89 120
121 /* initialize timestamp */
122 touch_softlockup_watchdog();
123
90 /* 124 /*
91 * Run briefly once per second to reset the softlockup timestamp. 125 * Run briefly once per second to reset the softlockup timestamp.
92 * If this gets delayed for more than 10 seconds then the 126 * If this gets delayed for more than 10 seconds then the
@@ -118,7 +152,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
118 printk("watchdog for %i failed\n", hotcpu); 152 printk("watchdog for %i failed\n", hotcpu);
119 return NOTIFY_BAD; 153 return NOTIFY_BAD;
120 } 154 }
121 per_cpu(touch_timestamp, hotcpu) = jiffies; 155 per_cpu(touch_timestamp, hotcpu) = 0;
122 per_cpu(watchdog_task, hotcpu) = p; 156 per_cpu(watchdog_task, hotcpu) = p;
123 kthread_bind(p, hotcpu); 157 kthread_bind(p, hotcpu);
124 break; 158 break;
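
The softlockup hunk above switches the watchdog from jiffies to an approximate-seconds timestamp: sched_clock() nanoseconds shifted right by 30 (2^30 ns ~= 1.074 s), with a stored 0 meaning "just re-arm, don't report". A user-space sketch of the same arithmetic and the 1 s / 10 s thresholds, using clock_gettime(CLOCK_MONOTONIC) as a stand-in for sched_clock():

#include <stdio.h>
#include <time.h>

/* sched_clock() stand-in: monotonic nanoseconds since some arbitrary point */
static unsigned long long now_ns(void)
{
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* get_timestamp() equivalent: cheap ns -> ~seconds conversion */
static unsigned long get_timestamp(void)
{
        return (unsigned long)(now_ns() >> 30);         /* 2^30 ns ~= 1.074 s */
}

int main(void)
{
        unsigned long touch = get_timestamp();          /* watchdog was touched */
        unsigned long now = get_timestamp();

        if (now > touch + 1)
                printf("wake the watchdog thread (about once per second)\n");
        if (now > touch + 10)
                printf("BUG: soft lockup detected\n");
        else
                printf("all quiet: %lu s since last touch\n", now - touch);
        return 0;
}
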
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 12458040e665..daabb74ee0bc 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,11 +1,12 @@
1/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
2 * GPL v2 and any later version. 2 * GPL v2 and any later version.
3 */ 3 */
4#include <linux/stop_machine.h>
5#include <linux/kthread.h>
6#include <linux/sched.h>
7#include <linux/cpu.h> 4#include <linux/cpu.h>
8#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/kthread.h>
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/stop_machine.h>
9#include <linux/syscalls.h> 10#include <linux/syscalls.h>
10#include <asm/atomic.h> 11#include <asm/atomic.h>
11#include <asm/semaphore.h> 12#include <asm/semaphore.h>
@@ -208,3 +209,4 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
208 209
209 return ret; 210 return ret;
210} 211}
212EXPORT_SYMBOL_GPL(stop_machine_run);
diff --git a/kernel/sys.c b/kernel/sys.c
index fe1f3ab20477..926bf9d7ac45 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1923,6 +1923,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1923 if (retval) 1923 if (retval)
1924 return retval; 1924 return retval;
1925 1925
1926 if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
1927 /*
1928 * The caller is asking for an immediate RLIMIT_CPU
1929 * expiry. But we use the zero value to mean "it was
1930 * never set". So let's cheat and make it one second
1931 * instead
1932 */
1933 new_rlim.rlim_cur = 1;
1934 }
1935
1926 task_lock(current->group_leader); 1936 task_lock(current->group_leader);
1927 *old_rlim = new_rlim; 1937 *old_rlim = new_rlim;
1928 task_unlock(current->group_leader); 1938 task_unlock(current->group_leader);
@@ -1944,15 +1954,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1944 unsigned long rlim_cur = new_rlim.rlim_cur; 1954 unsigned long rlim_cur = new_rlim.rlim_cur;
1945 cputime_t cputime; 1955 cputime_t cputime;
1946 1956
1947 if (rlim_cur == 0) {
1948 /*
1949 * The caller is asking for an immediate RLIMIT_CPU
1950 * expiry. But we use the zero value to mean "it was
1951 * never set". So let's cheat and make it one second
1952 * instead
1953 */
1954 rlim_cur = 1;
1955 }
1956 cputime = secs_to_cputime(rlim_cur); 1957 cputime = secs_to_cputime(rlim_cur);
1957 read_lock(&tasklist_lock); 1958 read_lock(&tasklist_lock);
1958 spin_lock_irq(&current->sighand->siglock); 1959 spin_lock_irq(&current->sighand->siglock);
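
The sys_setrlimit() hunk moves the RLIMIT_CPU special case ahead of the store, so a requested soft limit of 0 (which the kernel reserves to mean "never set") is rounded up to one second before it ever lands in *old_rlim. A minimal sketch of that normalization using the ordinary struct rlimit from <sys/resource.h>:

#include <stdio.h>
#include <sys/resource.h>

/*
 * An RLIMIT_CPU of 0 would otherwise be indistinguishable from "never set",
 * so an immediate-expiry request is rounded up to one second before the
 * limit is stored.
 */
static void normalize_cpu_rlimit(struct rlimit *rlim)
{
        if (rlim->rlim_cur == 0)
                rlim->rlim_cur = 1;
}

int main(void)
{
        struct rlimit r = { .rlim_cur = 0, .rlim_max = RLIM_INFINITY };

        normalize_cpu_rlimit(&r);
        printf("effective RLIMIT_CPU soft limit: %llu s\n",
               (unsigned long long)r.rlim_cur);         /* 1 */
        return 0;
}
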
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c904748f2290..f0664bd5011c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max;
76extern int sysctl_drop_caches; 76extern int sysctl_drop_caches;
77extern int percpu_pagelist_fraction; 77extern int percpu_pagelist_fraction;
78extern int compat_log; 78extern int compat_log;
79extern int maps_protect;
79 80
80/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
81static int maxolduid = 65535; 82static int maxolduid = 65535;
@@ -603,6 +604,16 @@ static ctl_table kern_table[] = {
603 .proc_handler = &proc_dointvec, 604 .proc_handler = &proc_dointvec,
604 }, 605 },
605#endif 606#endif
607#ifdef CONFIG_PROC_FS
608 {
609 .ctl_name = CTL_UNNUMBERED,
610 .procname = "maps_protect",
611 .data = &maps_protect,
612 .maxlen = sizeof(int),
613 .mode = 0644,
614 .proc_handler = &proc_dointvec,
615 },
616#endif
606 617
607 { .ctl_name = 0 } 618 { .ctl_name = 0 }
608}; 619};
diff --git a/kernel/time.c b/kernel/time.c
index ba18ec4899bd..f04791f69408 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -31,7 +31,6 @@
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/errno.h> 33#include <linux/errno.h>
34#include <linux/smp_lock.h>
35#include <linux/syscalls.h> 34#include <linux/syscalls.h>
36#include <linux/security.h> 35#include <linux/security.h>
37#include <linux/fs.h> 36#include <linux/fs.h>
@@ -247,6 +246,36 @@ struct timespec current_fs_time(struct super_block *sb)
247} 246}
248EXPORT_SYMBOL(current_fs_time); 247EXPORT_SYMBOL(current_fs_time);
249 248
249/*
250 * Convert jiffies to milliseconds and back.
251 *
252 * Avoid unnecessary multiplications/divisions in the
253 * two most common HZ cases:
254 */
255unsigned int inline jiffies_to_msecs(const unsigned long j)
256{
257#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
258 return (MSEC_PER_SEC / HZ) * j;
259#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
260 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
261#else
262 return (j * MSEC_PER_SEC) / HZ;
263#endif
264}
265EXPORT_SYMBOL(jiffies_to_msecs);
266
267unsigned int inline jiffies_to_usecs(const unsigned long j)
268{
269#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
270 return (USEC_PER_SEC / HZ) * j;
271#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
272 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
273#else
274 return (j * USEC_PER_SEC) / HZ;
275#endif
276}
277EXPORT_SYMBOL(jiffies_to_usecs);
278
250/** 279/**
251 * timespec_trunc - Truncate timespec to a granularity 280 * timespec_trunc - Truncate timespec to a granularity
252 * @t: Timespec 281 * @t: Timespec
@@ -473,36 +502,6 @@ struct timeval ns_to_timeval(const s64 nsec)
473EXPORT_SYMBOL(ns_to_timeval); 502EXPORT_SYMBOL(ns_to_timeval);
474 503
475/* 504/*
476 * Convert jiffies to milliseconds and back.
477 *
478 * Avoid unnecessary multiplications/divisions in the
479 * two most common HZ cases:
480 */
481unsigned int jiffies_to_msecs(const unsigned long j)
482{
483#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
484 return (MSEC_PER_SEC / HZ) * j;
485#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
486 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
487#else
488 return (j * MSEC_PER_SEC) / HZ;
489#endif
490}
491EXPORT_SYMBOL(jiffies_to_msecs);
492
493unsigned int jiffies_to_usecs(const unsigned long j)
494{
495#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
496 return (USEC_PER_SEC / HZ) * j;
497#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
498 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
499#else
500 return (j * USEC_PER_SEC) / HZ;
501#endif
502}
503EXPORT_SYMBOL(jiffies_to_usecs);
504
505/*
506 * When we convert to jiffies then we interpret incoming values 505 * When we convert to jiffies then we interpret incoming values
507 * the following way: 506 * the following way:
508 * 507 *
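
The relocated jiffies_to_msecs()/jiffies_to_usecs() helpers choose one of three formulas at preprocessor time: an exact multiply when HZ divides the target rate, a round-up divide when the target rate divides HZ, and a multiply-then-divide fallback otherwise. The same three-way split compiles stand-alone; HZ is hard-coded to 250 here purely as an example value:

#include <stdio.h>

#define HZ           250                /* example tick rate (assumed) */
#define MSEC_PER_SEC 1000U

static unsigned int jiffies_to_msecs(unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
        /* HZ divides 1000: one multiplication, exact */
        return (MSEC_PER_SEC / HZ) * j;
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
        /* 1000 divides HZ: divide, rounding up */
        return (j + (HZ / MSEC_PER_SEC) - 1) / (HZ / MSEC_PER_SEC);
#else
        /* general case */
        return (j * MSEC_PER_SEC) / HZ;
#endif
}

int main(void)
{
        /* with HZ == 250 each jiffy is 4 ms, so 25 jiffies -> 100 ms */
        printf("%u ms\n", jiffies_to_msecs(25));
        return 0;
}
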
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 93bccba1f265..99b6034fc86b 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index bfda3f7f0716..a96ec9ab3454 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
31 */ 31 */
32ktime_t tick_next_period; 32ktime_t tick_next_period;
33ktime_t tick_period; 33ktime_t tick_period;
34static int tick_do_timer_cpu = -1; 34int tick_do_timer_cpu __read_mostly = -1;
35DEFINE_SPINLOCK(tick_device_lock); 35DEFINE_SPINLOCK(tick_device_lock);
36 36
37/* 37/*
@@ -295,6 +295,12 @@ static void tick_shutdown(unsigned int *cpup)
295 clockevents_exchange_device(dev, NULL); 295 clockevents_exchange_device(dev, NULL);
296 td->evtdev = NULL; 296 td->evtdev = NULL;
297 } 297 }
298 /* Transfer the do_timer job away from this cpu */
299 if (*cpup == tick_do_timer_cpu) {
300 int cpu = first_cpu(cpu_online_map);
301
302 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1;
303 }
298 spin_unlock_irqrestore(&tick_device_lock, flags); 304 spin_unlock_irqrestore(&tick_device_lock, flags);
299} 305}
300 306
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index c9d203bde518..bb13f2724905 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -5,6 +5,7 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock; 5extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period; 6extern ktime_t tick_next_period;
7extern ktime_t tick_period; 7extern ktime_t tick_period;
8extern int tick_do_timer_cpu __read_mostly;
8 9
9extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 10extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
10extern void tick_handle_periodic(struct clock_event_device *dev); 11extern void tick_handle_periodic(struct clock_event_device *dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 51556b95f60f..3483e6cb9549 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -217,10 +217,30 @@ void tick_nohz_stop_sched_tick(void)
217 * the scheduler tick in nohz_restart_sched_tick. 217 * the scheduler tick in nohz_restart_sched_tick.
218 */ 218 */
219 if (!ts->tick_stopped) { 219 if (!ts->tick_stopped) {
220 if (select_nohz_load_balancer(1)) {
221 /*
222 * sched tick not stopped!
223 */
224 cpu_clear(cpu, nohz_cpu_mask);
225 goto out;
226 }
227
220 ts->idle_tick = ts->sched_timer.expires; 228 ts->idle_tick = ts->sched_timer.expires;
221 ts->tick_stopped = 1; 229 ts->tick_stopped = 1;
222 ts->idle_jiffies = last_jiffies; 230 ts->idle_jiffies = last_jiffies;
223 } 231 }
232
233 /*
234 * If this cpu is the one which updates jiffies, then
235 * give up the assignment and let it be taken by the
236 * cpu which runs the tick timer next, which might be
 237	 * this cpu as well. If we don't drop it here, the
 238	 * jiffies value might be stale and do_timer() would
 239	 * never be invoked.
240 */
241 if (cpu == tick_do_timer_cpu)
242 tick_do_timer_cpu = -1;
243
224 /* 244 /*
225 * calculate the expiry time for the next timer wheel 245 * calculate the expiry time for the next timer wheel
226 * timer 246 * timer
@@ -273,6 +293,7 @@ void tick_nohz_restart_sched_tick(void)
273 now = ktime_get(); 293 now = ktime_get();
274 294
275 local_irq_disable(); 295 local_irq_disable();
296 select_nohz_load_balancer(0);
276 tick_do_update_jiffies64(now); 297 tick_do_update_jiffies64(now);
277 cpu_clear(cpu, nohz_cpu_mask); 298 cpu_clear(cpu, nohz_cpu_mask);
278 299
@@ -338,12 +359,24 @@ static void tick_nohz_handler(struct clock_event_device *dev)
338{ 359{
339 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 360 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
340 struct pt_regs *regs = get_irq_regs(); 361 struct pt_regs *regs = get_irq_regs();
362 int cpu = smp_processor_id();
341 ktime_t now = ktime_get(); 363 ktime_t now = ktime_get();
342 364
343 dev->next_event.tv64 = KTIME_MAX; 365 dev->next_event.tv64 = KTIME_MAX;
344 366
367 /*
368 * Check if the do_timer duty was dropped. We don't care about
369 * concurrency: This happens only when the cpu in charge went
 370	 * into a long sleep. If two cpus happen to assign themselves to
371 * this duty, then the jiffies update is still serialized by
372 * xtime_lock.
373 */
374 if (unlikely(tick_do_timer_cpu == -1))
375 tick_do_timer_cpu = cpu;
376
345 /* Check, if the jiffies need an update */ 377 /* Check, if the jiffies need an update */
346 tick_do_update_jiffies64(now); 378 if (tick_do_timer_cpu == cpu)
379 tick_do_update_jiffies64(now);
347 380
348 /* 381 /*
349 * When we are idle and the tick is stopped, we have to touch 382 * When we are idle and the tick is stopped, we have to touch
@@ -431,9 +464,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
431 struct hrtimer_cpu_base *base = timer->base->cpu_base; 464 struct hrtimer_cpu_base *base = timer->base->cpu_base;
432 struct pt_regs *regs = get_irq_regs(); 465 struct pt_regs *regs = get_irq_regs();
433 ktime_t now = ktime_get(); 466 ktime_t now = ktime_get();
467 int cpu = smp_processor_id();
468
469#ifdef CONFIG_NO_HZ
470 /*
471 * Check if the do_timer duty was dropped. We don't care about
472 * concurrency: This happens only when the cpu in charge went
 473	 * into a long sleep. If two cpus happen to assign themselves to
474 * this duty, then the jiffies update is still serialized by
475 * xtime_lock.
476 */
477 if (unlikely(tick_do_timer_cpu == -1))
478 tick_do_timer_cpu = cpu;
479#endif
434 480
435 /* Check, if the jiffies need an update */ 481 /* Check, if the jiffies need an update */
436 tick_do_update_jiffies64(now); 482 if (tick_do_timer_cpu == cpu)
483 tick_do_update_jiffies64(now);
437 484
438 /* 485 /*
439 * Do not call, when we are not in irq context and have 486 * Do not call, when we are not in irq context and have
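
Taken together, the tick-common.c and tick-sched.c hunks implement a simple ownership protocol for the jiffies update: tick_do_timer_cpu == -1 means the duty is unowned, a CPU that stops its tick (or is shut down) drops it, and whichever CPU's tick handler fires next claims it. A single-threaded sketch of that claim/drop dance (TICK_DO_TIMER_NONE is a name used only in this sketch; the patch uses the literal -1, and the real jiffies update is still serialized by xtime_lock):

#include <stdio.h>

#define TICK_DO_TIMER_NONE  (-1)        /* no CPU owns the do_timer() duty */

static int tick_do_timer_cpu = TICK_DO_TIMER_NONE;

/* a CPU entering nohz idle (or going offline) gives the duty up */
static void drop_do_timer_duty(int cpu)
{
        if (tick_do_timer_cpu == cpu)
                tick_do_timer_cpu = TICK_DO_TIMER_NONE;
}

/* the tick handler on any CPU claims an unowned duty; only the owner
 * then advances jiffies */
static int tick_handler(int cpu)
{
        if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
                tick_do_timer_cpu = cpu;
        return tick_do_timer_cpu == cpu;        /* 1: update jiffies */
}

int main(void)
{
        printf("cpu0 updates jiffies: %d\n", tick_handler(0));  /* claims it */
        printf("cpu1 updates jiffies: %d\n", tick_handler(1));  /* no */
        drop_do_timer_duty(0);                  /* cpu0 stops its tick */
        printf("cpu1 updates jiffies: %d\n", tick_handler(1));  /* claims it */
        return 0;
}
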
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
new file mode 100644
index 000000000000..f9217bf644f6
--- /dev/null
+++ b/kernel/time/timekeeping.c
@@ -0,0 +1,476 @@
1/*
2 * linux/kernel/time/timekeeping.c
3 *
4 * Kernel timekeeping code and accessor functions
5 *
6 * This code was moved from linux/kernel/timer.c.
7 * Please see that file for copyright and history logs.
8 *
9 */
10
11#include <linux/module.h>
12#include <linux/interrupt.h>
13#include <linux/percpu.h>
14#include <linux/init.h>
15#include <linux/mm.h>
16#include <linux/sysdev.h>
17#include <linux/clocksource.h>
18#include <linux/jiffies.h>
19#include <linux/time.h>
20#include <linux/tick.h>
21
22
23/*
24 * This read-write spinlock protects us from races in SMP while
25 * playing with xtime and avenrun.
26 */
27__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
28
29EXPORT_SYMBOL(xtime_lock);
30
31
32/*
33 * The current time
34 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
35 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
36 * at zero at system boot time, so wall_to_monotonic will be negative,
37 * however, we will ALWAYS keep the tv_nsec part positive so we can use
38 * the usual normalization.
39 */
40struct timespec xtime __attribute__ ((aligned (16)));
41struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
42
43EXPORT_SYMBOL(xtime);
44
45
46static struct clocksource *clock; /* pointer to current clocksource */
47
48
49#ifdef CONFIG_GENERIC_TIME
50/**
51 * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
52 *
53 * private function, must hold xtime_lock lock when being
54 * called. Returns the number of nanoseconds since the
55 * last call to update_wall_time() (adjusted by NTP scaling)
56 */
57static inline s64 __get_nsec_offset(void)
58{
59 cycle_t cycle_now, cycle_delta;
60 s64 ns_offset;
61
62 /* read clocksource: */
63 cycle_now = clocksource_read(clock);
64
65 /* calculate the delta since the last update_wall_time: */
66 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
67
68 /* convert to nanoseconds: */
69 ns_offset = cyc2ns(clock, cycle_delta);
70
71 return ns_offset;
72}
73
74/**
75 * __get_realtime_clock_ts - Returns the time of day in a timespec
76 * @ts: pointer to the timespec to be set
77 *
78 * Returns the time of day in a timespec. Used by
79 * do_gettimeofday() and get_realtime_clock_ts().
80 */
81static inline void __get_realtime_clock_ts(struct timespec *ts)
82{
83 unsigned long seq;
84 s64 nsecs;
85
86 do {
87 seq = read_seqbegin(&xtime_lock);
88
89 *ts = xtime;
90 nsecs = __get_nsec_offset();
91
92 } while (read_seqretry(&xtime_lock, seq));
93
94 timespec_add_ns(ts, nsecs);
95}
96
97/**
98 * getnstimeofday - Returns the time of day in a timespec
99 * @ts: pointer to the timespec to be set
100 *
101 * Returns the time of day in a timespec.
102 */
103void getnstimeofday(struct timespec *ts)
104{
105 __get_realtime_clock_ts(ts);
106}
107
108EXPORT_SYMBOL(getnstimeofday);
109
110/**
111 * do_gettimeofday - Returns the time of day in a timeval
112 * @tv: pointer to the timeval to be set
113 *
114 * NOTE: Users should be converted to using get_realtime_clock_ts()
115 */
116void do_gettimeofday(struct timeval *tv)
117{
118 struct timespec now;
119
120 __get_realtime_clock_ts(&now);
121 tv->tv_sec = now.tv_sec;
122 tv->tv_usec = now.tv_nsec/1000;
123}
124
125EXPORT_SYMBOL(do_gettimeofday);
126/**
127 * do_settimeofday - Sets the time of day
128 * @tv: pointer to the timespec variable containing the new time
129 *
 130 * Sets the time of day to the new time, updates NTP and notifies hrtimers
131 */
132int do_settimeofday(struct timespec *tv)
133{
134 unsigned long flags;
135 time_t wtm_sec, sec = tv->tv_sec;
136 long wtm_nsec, nsec = tv->tv_nsec;
137
138 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
139 return -EINVAL;
140
141 write_seqlock_irqsave(&xtime_lock, flags);
142
143 nsec -= __get_nsec_offset();
144
145 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
146 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
147
148 set_normalized_timespec(&xtime, sec, nsec);
149 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
150
151 clock->error = 0;
152 ntp_clear();
153
154 update_vsyscall(&xtime, clock);
155
156 write_sequnlock_irqrestore(&xtime_lock, flags);
157
158 /* signal hrtimers about time change */
159 clock_was_set();
160
161 return 0;
162}
163
164EXPORT_SYMBOL(do_settimeofday);
165
166/**
167 * change_clocksource - Swaps clocksources if a new one is available
168 *
169 * Accumulates current time interval and initializes new clocksource
170 */
171static void change_clocksource(void)
172{
173 struct clocksource *new;
174 cycle_t now;
175 u64 nsec;
176
177 new = clocksource_get_next();
178
179 if (clock == new)
180 return;
181
182 now = clocksource_read(new);
183 nsec = __get_nsec_offset();
184 timespec_add_ns(&xtime, nsec);
185
186 clock = new;
187 clock->cycle_last = now;
188
189 clock->error = 0;
190 clock->xtime_nsec = 0;
191 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
192
193 tick_clock_notify();
194
195 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
196 clock->name);
197}
198#else
199static inline void change_clocksource(void) { }
200#endif
201
202/**
203 * timekeeping_is_continuous - check to see if timekeeping is free running
204 */
205int timekeeping_is_continuous(void)
206{
207 unsigned long seq;
208 int ret;
209
210 do {
211 seq = read_seqbegin(&xtime_lock);
212
213 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
214
215 } while (read_seqretry(&xtime_lock, seq));
216
217 return ret;
218}
219
220/**
221 * read_persistent_clock - Return time in seconds from the persistent clock.
222 *
223 * Weak dummy function for arches that do not yet support it.
224 * Returns seconds from epoch using the battery backed persistent clock.
225 * Returns zero if unsupported.
226 *
227 * XXX - Do be sure to remove it once all arches implement it.
228 */
229unsigned long __attribute__((weak)) read_persistent_clock(void)
230{
231 return 0;
232}
233
234/*
235 * timekeeping_init - Initializes the clocksource and common timekeeping values
236 */
237void __init timekeeping_init(void)
238{
239 unsigned long flags;
240 unsigned long sec = read_persistent_clock();
241
242 write_seqlock_irqsave(&xtime_lock, flags);
243
244 ntp_clear();
245
246 clock = clocksource_get_next();
247 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
248 clock->cycle_last = clocksource_read(clock);
249
250 xtime.tv_sec = sec;
251 xtime.tv_nsec = 0;
252 set_normalized_timespec(&wall_to_monotonic,
253 -xtime.tv_sec, -xtime.tv_nsec);
254
255 write_sequnlock_irqrestore(&xtime_lock, flags);
256}
257
258/* flag for if timekeeping is suspended */
259static int timekeeping_suspended;
260/* time in seconds when suspend began */
261static unsigned long timekeeping_suspend_time;
262
263/**
264 * timekeeping_resume - Resumes the generic timekeeping subsystem.
265 * @dev: unused
266 *
267 * This is for the generic clocksource timekeeping.
268 * xtime/wall_to_monotonic/jiffies/etc are
269 * still managed by arch specific suspend/resume code.
270 */
271static int timekeeping_resume(struct sys_device *dev)
272{
273 unsigned long flags;
274 unsigned long now = read_persistent_clock();
275
276 write_seqlock_irqsave(&xtime_lock, flags);
277
278 if (now && (now > timekeeping_suspend_time)) {
279 unsigned long sleep_length = now - timekeeping_suspend_time;
280
281 xtime.tv_sec += sleep_length;
282 wall_to_monotonic.tv_sec -= sleep_length;
283 }
284 /* re-base the last cycle value */
285 clock->cycle_last = clocksource_read(clock);
286 clock->error = 0;
287 timekeeping_suspended = 0;
288 write_sequnlock_irqrestore(&xtime_lock, flags);
289
290 touch_softlockup_watchdog();
291
292 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
293
294 /* Resume hrtimers */
295 hres_timers_resume();
296
297 return 0;
298}
299
300static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
301{
302 unsigned long flags;
303
304 write_seqlock_irqsave(&xtime_lock, flags);
305 timekeeping_suspended = 1;
306 timekeeping_suspend_time = read_persistent_clock();
307 write_sequnlock_irqrestore(&xtime_lock, flags);
308
309 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
310
311 return 0;
312}
313
314/* sysfs resume/suspend bits for timekeeping */
315static struct sysdev_class timekeeping_sysclass = {
316 .resume = timekeeping_resume,
317 .suspend = timekeeping_suspend,
318 set_kset_name("timekeeping"),
319};
320
321static struct sys_device device_timer = {
322 .id = 0,
323 .cls = &timekeeping_sysclass,
324};
325
326static int __init timekeeping_init_device(void)
327{
328 int error = sysdev_class_register(&timekeeping_sysclass);
329 if (!error)
330 error = sysdev_register(&device_timer);
331 return error;
332}
333
334device_initcall(timekeeping_init_device);
335
336/*
337 * If the error is already larger, we look ahead even further
338 * to compensate for late or lost adjustments.
339 */
340static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
341 s64 *offset)
342{
343 s64 tick_error, i;
344 u32 look_ahead, adj;
345 s32 error2, mult;
346
347 /*
348 * Use the current error value to determine how much to look ahead.
349 * The larger the error the slower we adjust for it to avoid problems
350 * with losing too many ticks, otherwise we would overadjust and
351 * produce an even larger error. The smaller the adjustment the
352 * faster we try to adjust for it, as lost ticks can do less harm
 353	 * here. This is tuned so that an error of about 1 msec is adjusted
354 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
355 */
356 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
357 error2 = abs(error2);
358 for (look_ahead = 0; error2 > 0; look_ahead++)
359 error2 >>= 2;
360
361 /*
362 * Now calculate the error in (1 << look_ahead) ticks, but first
363 * remove the single look ahead already included in the error.
364 */
365 tick_error = current_tick_length() >>
366 (TICK_LENGTH_SHIFT - clock->shift + 1);
367 tick_error -= clock->xtime_interval >> 1;
368 error = ((error - tick_error) >> look_ahead) + tick_error;
369
370 /* Finally calculate the adjustment shift value. */
371 i = *interval;
372 mult = 1;
373 if (error < 0) {
374 error = -error;
375 *interval = -*interval;
376 *offset = -*offset;
377 mult = -1;
378 }
379 for (adj = 0; error > i; adj++)
380 error >>= 1;
381
382 *interval <<= adj;
383 *offset <<= adj;
384 return mult << adj;
385}
386
387/*
388 * Adjust the multiplier to reduce the error value,
389 * this is optimized for the most common adjustments of -1,0,1,
390 * for other values we can do a bit more work.
391 */
392static void clocksource_adjust(struct clocksource *clock, s64 offset)
393{
394 s64 error, interval = clock->cycle_interval;
395 int adj;
396
397 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
398 if (error > interval) {
399 error >>= 2;
400 if (likely(error <= interval))
401 adj = 1;
402 else
403 adj = clocksource_bigadjust(error, &interval, &offset);
404 } else if (error < -interval) {
405 error >>= 2;
406 if (likely(error >= -interval)) {
407 adj = -1;
408 interval = -interval;
409 offset = -offset;
410 } else
411 adj = clocksource_bigadjust(error, &interval, &offset);
412 } else
413 return;
414
415 clock->mult += adj;
416 clock->xtime_interval += interval;
417 clock->xtime_nsec -= offset;
418 clock->error -= (interval - offset) <<
419 (TICK_LENGTH_SHIFT - clock->shift);
420}
421
422/**
423 * update_wall_time - Uses the current clocksource to increment the wall time
424 *
425 * Called from the timer interrupt, must hold a write on xtime_lock.
426 */
427void update_wall_time(void)
428{
429 cycle_t offset;
430
431 /* Make sure we're fully resumed: */
432 if (unlikely(timekeeping_suspended))
433 return;
434
435#ifdef CONFIG_GENERIC_TIME
436 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
437#else
438 offset = clock->cycle_interval;
439#endif
440 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
441
442 /* normally this loop will run just once, however in the
443 * case of lost or late ticks, it will accumulate correctly.
444 */
445 while (offset >= clock->cycle_interval) {
446 /* accumulate one interval */
447 clock->xtime_nsec += clock->xtime_interval;
448 clock->cycle_last += clock->cycle_interval;
449 offset -= clock->cycle_interval;
450
451 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
452 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
453 xtime.tv_sec++;
454 second_overflow();
455 }
456
457 /* interpolator bits */
458 time_interpolator_update(clock->xtime_interval
459 >> clock->shift);
460
461 /* accumulate error between NTP and clock interval */
462 clock->error += current_tick_length();
463 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
464 }
465
466 /* correct the clock when NTP error is too big */
467 clocksource_adjust(clock, offset);
468
469 /* store full nanoseconds into xtime */
470 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
471 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
472
473 /* check to see if there is a new clocksource to use */
474 change_clocksource();
475 update_vsyscall(&xtime, clock);
476}
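
Most of the new timekeeping.c turns on one calculation: read the clocksource, mask the cycle delta against cycle_last, and scale it to nanoseconds, where cyc2ns() is essentially (cycles * mult) >> shift. A self-contained sketch of __get_nsec_offset() with a fake free-running counter; the mult/shift values are invented for the example (a 1 MHz counter, so one cycle is 1000 ns):

#include <stdint.h>
#include <stdio.h>

/* trimmed-down clocksource: just what the offset calculation needs */
struct fake_clocksource {
        uint64_t (*read)(void);
        uint64_t cycle_last;            /* counter value at last update */
        uint64_t mask;                  /* handles counter wrap-around */
        uint32_t mult;                  /* cycles -> ns scale factor ... */
        uint32_t shift;                 /* ... ns = (cycles * mult) >> shift */
};

static uint64_t counter;                /* pretend hardware counter */
static uint64_t read_counter(void) { return counter; }

/* __get_nsec_offset() equivalent: ns elapsed since cycle_last */
static int64_t nsec_offset(struct fake_clocksource *cs)
{
        uint64_t now = cs->read();
        uint64_t delta = (now - cs->cycle_last) & cs->mask;
        return (int64_t)((delta * cs->mult) >> cs->shift);
}

int main(void)
{
        /* 1 MHz counter: 1 cycle = 1000 ns, so mult = 1000 << shift */
        struct fake_clocksource cs = {
                .read = read_counter,
                .cycle_last = 0,
                .mask = UINT64_MAX,
                .mult = 1000u << 10,
                .shift = 10,
        };

        counter = 2500;                 /* 2500 cycles have elapsed */
        printf("offset = %lld ns\n", (long long)nsec_offset(&cs)); /* 2500000 */
        return 0;
}
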
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 59df5e8555a8..b734ca4bc75e 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -38,17 +38,12 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
38 38
39static void print_name_offset(struct seq_file *m, void *sym) 39static void print_name_offset(struct seq_file *m, void *sym)
40{ 40{
41 unsigned long addr = (unsigned long)sym; 41 char symname[KSYM_NAME_LEN+1];
42 char namebuf[KSYM_NAME_LEN+1]; 42
43 unsigned long size, offset; 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 const char *sym_name;
45 char *modname;
46
47 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
48 if (sym_name)
49 SEQ_printf(m, "%s", sym_name);
50 else
51 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%p>", sym);
45 else
46 SEQ_printf(m, "%s", symname);
52} 47}
53 48
54static void 49static void
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1bc4882e28e0..868f1bceb07f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -257,16 +257,12 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
257 257
258static void print_name_offset(struct seq_file *m, unsigned long addr) 258static void print_name_offset(struct seq_file *m, unsigned long addr)
259{ 259{
260 char namebuf[KSYM_NAME_LEN+1]; 260 char symname[KSYM_NAME_LEN+1];
261 unsigned long size, offset; 261
262 const char *sym_name; 262 if (lookup_symbol_name(addr, symname) < 0)
263 char *modname;
264
265 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
266 if (sym_name)
267 seq_printf(m, "%s", sym_name);
268 else
269 seq_printf(m, "<%p>", (void *)addr); 263 seq_printf(m, "<%p>", (void *)addr);
264 else
265 seq_printf(m, "%s", symname);
270} 266}
271 267
272static int tstats_show(struct seq_file *m, void *v) 268static int tstats_show(struct seq_file *m, void *v)
diff --git a/kernel/timer.c b/kernel/timer.c
index b22bd39740dd..7a6448340f90 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/kernel/timer.c 2 * linux/kernel/timer.c
3 * 3 *
4 * Kernel internal timers, kernel timekeeping, basic process system calls 4 * Kernel internal timers, basic process system calls
5 * 5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds 6 * Copyright (C) 1991, 1992 Linus Torvalds
7 * 7 *
@@ -74,7 +74,7 @@ struct tvec_t_base_s {
74 tvec_t tv3; 74 tvec_t tv3;
75 tvec_t tv4; 75 tvec_t tv4;
76 tvec_t tv5; 76 tvec_t tv5;
77} ____cacheline_aligned_in_smp; 77} ____cacheline_aligned;
78 78
79typedef struct tvec_t_base_s tvec_base_t; 79typedef struct tvec_t_base_s tvec_base_t;
80 80
@@ -82,6 +82,37 @@ tvec_base_t boot_tvec_bases;
82EXPORT_SYMBOL(boot_tvec_bases); 82EXPORT_SYMBOL(boot_tvec_bases);
83static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 83static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
84 84
85/*
 86 * Note that all tvec_bases are 2 byte aligned and the lower bit of
87 * base in timer_list is guaranteed to be zero. Use the LSB for
88 * the new flag to indicate whether the timer is deferrable
89 */
90#define TBASE_DEFERRABLE_FLAG (0x1)
91
92/* Functions below help us manage 'deferrable' flag */
93static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
94{
95 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
96}
97
98static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
99{
100 return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
101}
102
103static inline void timer_set_deferrable(struct timer_list *timer)
104{
105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
106 TBASE_DEFERRABLE_FLAG));
107}
108
109static inline void
110timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
111{
112 timer->base = (tvec_base_t *)((unsigned long)(new_base) |
113 tbase_get_deferrable(timer->base));
114}
115
85/** 116/**
86 * __round_jiffies - function to round jiffies to a full second 117 * __round_jiffies - function to round jiffies to a full second
87 * @j: the time in (absolute) jiffies that should be rounded 118 * @j: the time in (absolute) jiffies that should be rounded
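
The deferrable-timer hunks rely on pointer tagging: tvec_base_t is at least 2-byte aligned, so bit 0 of timer->base is always zero for a real base pointer and can carry the deferrable flag instead. Reading the base means masking the flag off, and re-pointing the timer must preserve it, which is what tbase_get_base()/timer_set_base() do above. A stand-alone sketch of the same trick on an ordinary aligned struct:

#include <stdint.h>
#include <stdio.h>

#define DEFERRABLE_FLAG 0x1UL           /* lives in bit 0 of the pointer */

/* any type with alignment >= 2 leaves bit 0 of its address free */
struct base { int dummy; };

struct timer { struct base *base; };    /* pointer doubles as flag carrier */

static unsigned long get_deferrable(struct timer *t)
{
        return (uintptr_t)t->base & DEFERRABLE_FLAG;
}

static struct base *get_base(struct timer *t)
{
        return (struct base *)((uintptr_t)t->base & ~DEFERRABLE_FLAG);
}

static void set_deferrable(struct timer *t)
{
        t->base = (struct base *)((uintptr_t)t->base | DEFERRABLE_FLAG);
}

/* re-point the timer at a new base while keeping the flag bit intact */
static void set_base(struct timer *t, struct base *new_base)
{
        t->base = (struct base *)((uintptr_t)new_base | get_deferrable(t));
}

int main(void)
{
        struct base a, b;
        struct timer t = { .base = &a };

        set_deferrable(&t);
        set_base(&t, &b);               /* migrate; the flag must survive */

        printf("deferrable=%lu, base ok=%d\n",
               get_deferrable(&t), get_base(&t) == &b);   /* 1, 1 */
        return 0;
}
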
@@ -295,6 +326,13 @@ void fastcall init_timer(struct timer_list *timer)
295} 326}
296EXPORT_SYMBOL(init_timer); 327EXPORT_SYMBOL(init_timer);
297 328
329void fastcall init_timer_deferrable(struct timer_list *timer)
330{
331 init_timer(timer);
332 timer_set_deferrable(timer);
333}
334EXPORT_SYMBOL(init_timer_deferrable);
335
298static inline void detach_timer(struct timer_list *timer, 336static inline void detach_timer(struct timer_list *timer,
299 int clear_pending) 337 int clear_pending)
300{ 338{
@@ -325,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
325 tvec_base_t *base; 363 tvec_base_t *base;
326 364
327 for (;;) { 365 for (;;) {
328 base = timer->base; 366 tvec_base_t *prelock_base = timer->base;
367 base = tbase_get_base(prelock_base);
329 if (likely(base != NULL)) { 368 if (likely(base != NULL)) {
330 spin_lock_irqsave(&base->lock, *flags); 369 spin_lock_irqsave(&base->lock, *flags);
331 if (likely(base == timer->base)) 370 if (likely(prelock_base == timer->base))
332 return base; 371 return base;
333 /* The timer has migrated to another CPU */ 372 /* The timer has migrated to another CPU */
334 spin_unlock_irqrestore(&base->lock, *flags); 373 spin_unlock_irqrestore(&base->lock, *flags);
@@ -365,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
365 */ 404 */
366 if (likely(base->running_timer != timer)) { 405 if (likely(base->running_timer != timer)) {
367 /* See the comment in lock_timer_base() */ 406 /* See the comment in lock_timer_base() */
368 timer->base = NULL; 407 timer_set_base(timer, NULL);
369 spin_unlock(&base->lock); 408 spin_unlock(&base->lock);
370 base = new_base; 409 base = new_base;
371 spin_lock(&base->lock); 410 spin_lock(&base->lock);
372 timer->base = base; 411 timer_set_base(timer, base);
373 } 412 }
374 } 413 }
375 414
@@ -397,7 +436,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
397 timer_stats_timer_set_start_info(timer); 436 timer_stats_timer_set_start_info(timer);
398 BUG_ON(timer_pending(timer) || !timer->function); 437 BUG_ON(timer_pending(timer) || !timer->function);
399 spin_lock_irqsave(&base->lock, flags); 438 spin_lock_irqsave(&base->lock, flags);
400 timer->base = base; 439 timer_set_base(timer, base);
401 internal_add_timer(base, timer); 440 internal_add_timer(base, timer);
402 spin_unlock_irqrestore(&base->lock, flags); 441 spin_unlock_irqrestore(&base->lock, flags);
403} 442}
@@ -550,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
550 * don't have to detach them individually. 589 * don't have to detach them individually.
551 */ 590 */
552 list_for_each_entry_safe(timer, tmp, &tv_list, entry) { 591 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
553 BUG_ON(timer->base != base); 592 BUG_ON(tbase_get_base(timer->base) != base);
554 internal_add_timer(base, timer); 593 internal_add_timer(base, timer);
555 } 594 }
556 595
@@ -590,7 +629,7 @@ static inline void __run_timers(tvec_base_t *base)
590 void (*fn)(unsigned long); 629 void (*fn)(unsigned long);
591 unsigned long data; 630 unsigned long data;
592 631
593 timer = list_entry(head->next,struct timer_list,entry); 632 timer = list_first_entry(head, struct timer_list,entry);
594 fn = timer->function; 633 fn = timer->function;
595 data = timer->data; 634 data = timer->data;
596 635
@@ -636,6 +675,9 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base)
636 index = slot = timer_jiffies & TVR_MASK; 675 index = slot = timer_jiffies & TVR_MASK;
637 do { 676 do {
638 list_for_each_entry(nte, base->tv1.vec + slot, entry) { 677 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
678 if (tbase_get_deferrable(nte->base))
679 continue;
680
639 found = 1; 681 found = 1;
640 expires = nte->expires; 682 expires = nte->expires;
641 /* Look at the cascade bucket(s)? */ 683 /* Look at the cascade bucket(s)? */
@@ -752,455 +794,6 @@ unsigned long next_timer_interrupt(void)
752 794
753#endif 795#endif
754 796
755/******************************************************************/
756
757/*
758 * The current time
759 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
760 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
761 * at zero at system boot time, so wall_to_monotonic will be negative,
762 * however, we will ALWAYS keep the tv_nsec part positive so we can use
763 * the usual normalization.
764 */
765struct timespec xtime __attribute__ ((aligned (16)));
766struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
767
768EXPORT_SYMBOL(xtime);
769
770
771/* XXX - all of this timekeeping code should be later moved to time.c */
772#include <linux/clocksource.h>
773static struct clocksource *clock; /* pointer to current clocksource */
774
775#ifdef CONFIG_GENERIC_TIME
776/**
777 * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
778 *
779 * private function, must hold xtime_lock lock when being
780 * called. Returns the number of nanoseconds since the
781 * last call to update_wall_time() (adjusted by NTP scaling)
782 */
783static inline s64 __get_nsec_offset(void)
784{
785 cycle_t cycle_now, cycle_delta;
786 s64 ns_offset;
787
788 /* read clocksource: */
789 cycle_now = clocksource_read(clock);
790
791 /* calculate the delta since the last update_wall_time: */
792 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
793
794 /* convert to nanoseconds: */
795 ns_offset = cyc2ns(clock, cycle_delta);
796
797 return ns_offset;
798}
799
800/**
801 * __get_realtime_clock_ts - Returns the time of day in a timespec
802 * @ts: pointer to the timespec to be set
803 *
804 * Returns the time of day in a timespec. Used by
805 * do_gettimeofday() and get_realtime_clock_ts().
806 */
807static inline void __get_realtime_clock_ts(struct timespec *ts)
808{
809 unsigned long seq;
810 s64 nsecs;
811
812 do {
813 seq = read_seqbegin(&xtime_lock);
814
815 *ts = xtime;
816 nsecs = __get_nsec_offset();
817
818 } while (read_seqretry(&xtime_lock, seq));
819
820 timespec_add_ns(ts, nsecs);
821}
822
823/**
824 * getnstimeofday - Returns the time of day in a timespec
825 * @ts: pointer to the timespec to be set
826 *
827 * Returns the time of day in a timespec.
828 */
829void getnstimeofday(struct timespec *ts)
830{
831 __get_realtime_clock_ts(ts);
832}
833
834EXPORT_SYMBOL(getnstimeofday);
835
836/**
837 * do_gettimeofday - Returns the time of day in a timeval
838 * @tv: pointer to the timeval to be set
839 *
840 * NOTE: Users should be converted to using get_realtime_clock_ts()
841 */
842void do_gettimeofday(struct timeval *tv)
843{
844 struct timespec now;
845
846 __get_realtime_clock_ts(&now);
847 tv->tv_sec = now.tv_sec;
848 tv->tv_usec = now.tv_nsec/1000;
849}
850
851EXPORT_SYMBOL(do_gettimeofday);
852/**
853 * do_settimeofday - Sets the time of day
854 * @tv: pointer to the timespec variable containing the new time
855 *
856 * Sets the time of day to the new time and update NTP and notify hrtimers
857 */
858int do_settimeofday(struct timespec *tv)
859{
860 unsigned long flags;
861 time_t wtm_sec, sec = tv->tv_sec;
862 long wtm_nsec, nsec = tv->tv_nsec;
863
864 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
865 return -EINVAL;
866
867 write_seqlock_irqsave(&xtime_lock, flags);
868
869 nsec -= __get_nsec_offset();
870
871 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
872 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
873
874 set_normalized_timespec(&xtime, sec, nsec);
875 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
876
877 clock->error = 0;
878 ntp_clear();
879
880 update_vsyscall(&xtime, clock);
881
882 write_sequnlock_irqrestore(&xtime_lock, flags);
883
884 /* signal hrtimers about time change */
885 clock_was_set();
886
887 return 0;
888}
889
890EXPORT_SYMBOL(do_settimeofday);
891
892/**
893 * change_clocksource - Swaps clocksources if a new one is available
894 *
895 * Accumulates current time interval and initializes new clocksource
896 */
897static void change_clocksource(void)
898{
899 struct clocksource *new;
900 cycle_t now;
901 u64 nsec;
902
903 new = clocksource_get_next();
904
905 if (clock == new)
906 return;
907
908 now = clocksource_read(new);
909 nsec = __get_nsec_offset();
910 timespec_add_ns(&xtime, nsec);
911
912 clock = new;
913 clock->cycle_last = now;
914
915 clock->error = 0;
916 clock->xtime_nsec = 0;
917 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
918
919 tick_clock_notify();
920
921 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
922 clock->name);
923}
924#else
925static inline void change_clocksource(void) { }
926#endif
927
928/**
929 * timekeeping_is_continuous - check to see if timekeeping is free running
930 */
931int timekeeping_is_continuous(void)
932{
933 unsigned long seq;
934 int ret;
935
936 do {
937 seq = read_seqbegin(&xtime_lock);
938
939 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
940
941 } while (read_seqretry(&xtime_lock, seq));
942
943 return ret;
944}
945
946/**
947 * read_persistent_clock - Return time in seconds from the persistent clock.
948 *
949 * Weak dummy function for arches that do not yet support it.
950 * Returns seconds from epoch using the battery backed persistent clock.
951 * Returns zero if unsupported.
952 *
953 * XXX - Do be sure to remove it once all arches implement it.
954 */
955unsigned long __attribute__((weak)) read_persistent_clock(void)
956{
957 return 0;
958}
959
960/*
961 * timekeeping_init - Initializes the clocksource and common timekeeping values
962 */
963void __init timekeeping_init(void)
964{
965 unsigned long flags;
966 unsigned long sec = read_persistent_clock();
967
968 write_seqlock_irqsave(&xtime_lock, flags);
969
970 ntp_clear();
971
972 clock = clocksource_get_next();
973 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
974 clock->cycle_last = clocksource_read(clock);
975
976 xtime.tv_sec = sec;
977 xtime.tv_nsec = 0;
978 set_normalized_timespec(&wall_to_monotonic,
979 -xtime.tv_sec, -xtime.tv_nsec);
980
981 write_sequnlock_irqrestore(&xtime_lock, flags);
982}
983
984/* flag for if timekeeping is suspended */
985static int timekeeping_suspended;
986/* time in seconds when suspend began */
987static unsigned long timekeeping_suspend_time;
988
989/**
990 * timekeeping_resume - Resumes the generic timekeeping subsystem.
991 * @dev: unused
992 *
993 * This is for the generic clocksource timekeeping.
994 * xtime/wall_to_monotonic/jiffies/etc are
995 * still managed by arch specific suspend/resume code.
996 */
997static int timekeeping_resume(struct sys_device *dev)
998{
999 unsigned long flags;
1000 unsigned long now = read_persistent_clock();
1001
1002 write_seqlock_irqsave(&xtime_lock, flags);
1003
1004 if (now && (now > timekeeping_suspend_time)) {
1005 unsigned long sleep_length = now - timekeeping_suspend_time;
1006
1007 xtime.tv_sec += sleep_length;
1008 wall_to_monotonic.tv_sec -= sleep_length;
1009 }
1010 /* re-base the last cycle value */
1011 clock->cycle_last = clocksource_read(clock);
1012 clock->error = 0;
1013 timekeeping_suspended = 0;
1014 write_sequnlock_irqrestore(&xtime_lock, flags);
1015
1016 touch_softlockup_watchdog();
1017
1018 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
1019
1020 /* Resume hrtimers */
1021 hres_timers_resume();
1022
1023 return 0;
1024}
1025
1026static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
1027{
1028 unsigned long flags;
1029
1030 write_seqlock_irqsave(&xtime_lock, flags);
1031 timekeeping_suspended = 1;
1032 timekeeping_suspend_time = read_persistent_clock();
1033 write_sequnlock_irqrestore(&xtime_lock, flags);
1034
1035 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
1036
1037 return 0;
1038}
1039
1040/* sysfs resume/suspend bits for timekeeping */
1041static struct sysdev_class timekeeping_sysclass = {
1042 .resume = timekeeping_resume,
1043 .suspend = timekeeping_suspend,
1044 set_kset_name("timekeeping"),
1045};
1046
1047static struct sys_device device_timer = {
1048 .id = 0,
1049 .cls = &timekeeping_sysclass,
1050};
1051
1052static int __init timekeeping_init_device(void)
1053{
1054 int error = sysdev_class_register(&timekeeping_sysclass);
1055 if (!error)
1056 error = sysdev_register(&device_timer);
1057 return error;
1058}
1059
1060device_initcall(timekeeping_init_device);
1061
1062/*
1063 * If the error is already larger, we look ahead even further
1064 * to compensate for late or lost adjustments.
1065 */
1066static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
1067 s64 *offset)
1068{
1069 s64 tick_error, i;
1070 u32 look_ahead, adj;
1071 s32 error2, mult;
1072
1073 /*
1074 * Use the current error value to determine how much to look ahead.
1075 * The larger the error the slower we adjust for it to avoid problems
1076 * with losing too many ticks, otherwise we would overadjust and
1077 * produce an even larger error. The smaller the adjustment the
1078 * faster we try to adjust for it, as lost ticks can do less harm
1079 * here. This is tuned so that an error of about 1 msec is adusted
1080 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
1081 */
1082 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
1083 error2 = abs(error2);
1084 for (look_ahead = 0; error2 > 0; look_ahead++)
1085 error2 >>= 2;
1086
1087 /*
1088 * Now calculate the error in (1 << look_ahead) ticks, but first
1089 * remove the single look ahead already included in the error.
1090 */
1091 tick_error = current_tick_length() >>
1092 (TICK_LENGTH_SHIFT - clock->shift + 1);
1093 tick_error -= clock->xtime_interval >> 1;
1094 error = ((error - tick_error) >> look_ahead) + tick_error;
1095
1096 /* Finally calculate the adjustment shift value. */
1097 i = *interval;
1098 mult = 1;
1099 if (error < 0) {
1100 error = -error;
1101 *interval = -*interval;
1102 *offset = -*offset;
1103 mult = -1;
1104 }
1105 for (adj = 0; error > i; adj++)
1106 error >>= 1;
1107
1108 *interval <<= adj;
1109 *offset <<= adj;
1110 return mult << adj;
1111}
1112
1113/*
1114 * Adjust the multiplier to reduce the error value,
1115 * this is optimized for the most common adjustments of -1,0,1,
1116 * for other values we can do a bit more work.
1117 */
1118static void clocksource_adjust(struct clocksource *clock, s64 offset)
1119{
1120 s64 error, interval = clock->cycle_interval;
1121 int adj;
1122
1123 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1124 if (error > interval) {
1125 error >>= 2;
1126 if (likely(error <= interval))
1127 adj = 1;
1128 else
1129 adj = clocksource_bigadjust(error, &interval, &offset);
1130 } else if (error < -interval) {
1131 error >>= 2;
1132 if (likely(error >= -interval)) {
1133 adj = -1;
1134 interval = -interval;
1135 offset = -offset;
1136 } else
1137 adj = clocksource_bigadjust(error, &interval, &offset);
1138 } else
1139 return;
1140
1141 clock->mult += adj;
1142 clock->xtime_interval += interval;
1143 clock->xtime_nsec -= offset;
1144 clock->error -= (interval - offset) <<
1145 (TICK_LENGTH_SHIFT - clock->shift);
1146}
1147
1148/**
1149 * update_wall_time - Uses the current clocksource to increment the wall time
1150 *
1151 * Called from the timer interrupt, must hold a write on xtime_lock.
1152 */
1153static void update_wall_time(void)
1154{
1155 cycle_t offset;
1156
1157 /* Make sure we're fully resumed: */
1158 if (unlikely(timekeeping_suspended))
1159 return;
1160
1161#ifdef CONFIG_GENERIC_TIME
1162 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1163#else
1164 offset = clock->cycle_interval;
1165#endif
1166 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1167
1168	/* Normally this loop will run just once; however, in the
1169	 * case of lost or late ticks, it will accumulate correctly.
1170 */
1171 while (offset >= clock->cycle_interval) {
1172 /* accumulate one interval */
1173 clock->xtime_nsec += clock->xtime_interval;
1174 clock->cycle_last += clock->cycle_interval;
1175 offset -= clock->cycle_interval;
1176
1177 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
1178 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
1179 xtime.tv_sec++;
1180 second_overflow();
1181 }
1182
1183 /* interpolator bits */
1184 time_interpolator_update(clock->xtime_interval
1185 >> clock->shift);
1186
1187 /* accumulate error between NTP and clock interval */
1188 clock->error += current_tick_length();
1189 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
1190 }
1191
1192 /* correct the clock when NTP error is too big */
1193 clocksource_adjust(clock, offset);
1194
1195 /* store full nanoseconds into xtime */
1196 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
1197 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1198
1199 /* check to see if there is a new clocksource to use */
1200 change_clocksource();
1201 update_vsyscall(&xtime, clock);
1202}
1203
1204/* 797/*
1205 * Called from the timer interrupt handler to charge one tick to the current 798 * Called from the timer interrupt handler to charge one tick to the current
1206 * process. user_tick is 1 if the tick is user time, 0 for system. 799 * process. user_tick is 1 if the tick is user time, 0 for system.
@@ -1264,14 +857,6 @@ static inline void calc_load(unsigned long ticks)
1264} 857}
1265 858
1266/* 859/*
1267 * This read-write spinlock protects us from races in SMP while
1268 * playing with xtime and avenrun.
1269 */
1270__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1271
1272EXPORT_SYMBOL(xtime_lock);
1273
1274/*
1275 * This function runs timers and the timer-tq in bottom half context. 860 * This function runs timers and the timer-tq in bottom half context.
1276 */ 861 */
1277static void run_timer_softirq(struct softirq_action *h) 862static void run_timer_softirq(struct softirq_action *h)
@@ -1617,6 +1202,13 @@ static int __devinit init_timers_cpu(int cpu)
1617 cpu_to_node(cpu)); 1202 cpu_to_node(cpu));
1618 if (!base) 1203 if (!base)
1619 return -ENOMEM; 1204 return -ENOMEM;
1205
1206	/* Make sure that tvec_base is 2-byte aligned */
1207 if (tbase_get_deferrable(base)) {
1208 WARN_ON(1);
1209 kfree(base);
1210 return -ENOMEM;
1211 }
1620 memset(base, 0, sizeof(*base)); 1212 memset(base, 0, sizeof(*base));
1621 per_cpu(tvec_bases, cpu) = base; 1213 per_cpu(tvec_bases, cpu) = base;
1622 } else { 1214 } else {
@@ -1656,9 +1248,9 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1656 struct timer_list *timer; 1248 struct timer_list *timer;
1657 1249
1658 while (!list_empty(head)) { 1250 while (!list_empty(head)) {
1659 timer = list_entry(head->next, struct timer_list, entry); 1251 timer = list_first_entry(head, struct timer_list, entry);
1660 detach_timer(timer, 0); 1252 detach_timer(timer, 0);
1661 timer->base = new_base; 1253 timer_set_base(timer, new_base);
1662 internal_add_timer(new_base, timer); 1254 internal_add_timer(new_base, timer);
1663 } 1255 }
1664} 1256}
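
The timekeeping code removed from kernel/timer.c above (and moved into kernel/time/timekeeping.c) centers on the accumulation loop of update_wall_time(): elapsed clocksource cycles are folded into shifted nanoseconds one tick interval at a time, carrying whole seconds into xtime.tv_sec and keeping a sub-shift remainder between calls. The following is a minimal user-space sketch of that idea, not kernel code; FAKE_SHIFT, struct fake_clock and accumulate() are invented names used only for illustration.

/*
 * Minimal sketch of the update_wall_time() accumulation loop.
 * All names here are invented; this is not the kernel implementation.
 */
#include <stdio.h>
#include <stdint.h>

#define FAKE_NSEC_PER_SEC 1000000000ULL
#define FAKE_SHIFT        10	/* fixed-point shift, like clock->shift */

struct fake_clock {
	uint64_t cycle_last;		/* cycle count at the last accumulation */
	uint64_t cycle_interval;	/* clocksource cycles per tick */
	uint64_t xtime_interval;	/* shifted nanoseconds per tick */
	uint64_t xtime_nsec;		/* shifted nanosecond remainder */
};

/* Fold "offset" elapsed cycles into tv_sec/tv_nsec, one tick interval at a time. */
static void accumulate(struct fake_clock *c, uint64_t offset,
		       uint64_t *tv_sec, uint64_t *tv_nsec)
{
	c->xtime_nsec += *tv_nsec << FAKE_SHIFT;

	while (offset >= c->cycle_interval) {
		c->xtime_nsec += c->xtime_interval;
		c->cycle_last += c->cycle_interval;
		offset -= c->cycle_interval;

		if (c->xtime_nsec >= (FAKE_NSEC_PER_SEC << FAKE_SHIFT)) {
			c->xtime_nsec -= FAKE_NSEC_PER_SEC << FAKE_SHIFT;
			(*tv_sec)++;
		}
	}

	/* Store whole nanoseconds back, keep the sub-shift remainder. */
	*tv_nsec = c->xtime_nsec >> FAKE_SHIFT;
	c->xtime_nsec -= *tv_nsec << FAKE_SHIFT;
}

int main(void)
{
	struct fake_clock c = {
		.cycle_interval = 1000,				/* 1000 cycles per tick */
		.xtime_interval = 1000000ULL << FAKE_SHIFT,	/* 1 ms per tick */
	};
	uint64_t sec = 0, nsec = 0;

	/* Pretend 3.5 tick intervals of cycles elapsed since the last update. */
	accumulate(&c, 3500, &sec, &nsec);
	printf("accumulated %llu s, %llu ns\n",
	       (unsigned long long)sec, (unsigned long long)nsec);
	return 0;
}

Keeping xtime_nsec shifted left by the clock's shift is what lets the error between the NTP tick length and the clock interval be accumulated without floating point; clocksource_adjust() then feeds that error back into clock->mult in small steps.
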
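The new tbase_get_deferrable() check added to init_timers_cpu() relies on tvec_base pointers being at least 2-byte aligned, so that bit 0 of the pointer can carry a per-base "deferrable" flag; an odd allocation would be indistinguishable from a flagged pointer, which is exactly what the WARN_ON() plus kfree() rejects. The stand-alone sketch below illustrates that pointer-tagging idea under the same alignment assumption; the names (flagged_base_t, set_deferrable(), get_base(), ...) are invented and this is not the kernel's implementation.

/*
 * Minimal sketch of storing a one-bit flag in the low bit of an
 * aligned pointer.  All names here are invented.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uintptr_t flagged_base_t;

static flagged_base_t set_deferrable(void *base)
{
	/* Only works if the allocation is at least 2-byte aligned. */
	assert(((uintptr_t)base & 1) == 0);
	return (uintptr_t)base | 1;		/* stash the flag in bit 0 */
}

static int get_deferrable(flagged_base_t b)
{
	return (int)(b & 1);
}

static void *get_base(flagged_base_t b)
{
	return (void *)(b & ~(uintptr_t)1);	/* mask the flag back out */
}

int main(void)
{
	long *base = malloc(sizeof(*base));	/* malloc() returns suitably aligned memory */
	if (!base)
		return 1;

	flagged_base_t b = set_deferrable(base);

	printf("deferrable=%d, pointer preserved=%d\n",
	       get_deferrable(b), get_base(b) == (void *)base);

	free(base);
	return 0;
}
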
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 187e2a423878..dd308ba4e03b 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -6,7 +6,6 @@
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/utsname.h> 7#include <linux/utsname.h>
8#include <linux/mman.h> 8#include <linux/mman.h>
9#include <linux/smp_lock.h>
10#include <linux/notifier.h> 9#include <linux/notifier.h>
11#include <linux/reboot.h> 10#include <linux/reboot.h>
12#include <linux/prctl.h> 11#include <linux/prctl.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index c859164a6993..160c8c5136bd 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -32,58 +32,25 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
32} 32}
33 33
34/* 34/*
35 * unshare the current process' utsname namespace.
36 * called only in sys_unshare()
37 */
38int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
39{
40 if (unshare_flags & CLONE_NEWUTS) {
41 if (!capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 *new_uts = clone_uts_ns(current->nsproxy->uts_ns);
45 if (!*new_uts)
46 return -ENOMEM;
47 }
48
49 return 0;
50}
51
52/*
53 * Copy task tsk's utsname namespace, or clone it if flags 35 * Copy task tsk's utsname namespace, or clone it if flags
54 * specifies CLONE_NEWUTS. In latter case, changes to the 36 * specifies CLONE_NEWUTS. In latter case, changes to the
55 * utsname of this process won't be seen by parent, and vice 37 * utsname of this process won't be seen by parent, and vice
56 * versa. 38 * versa.
57 */ 39 */
58int copy_utsname(int flags, struct task_struct *tsk) 40struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns)
59{ 41{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
61 struct uts_namespace *new_ns; 42 struct uts_namespace *new_ns;
62 int err = 0;
63
64 if (!old_ns)
65 return 0;
66 43
44 BUG_ON(!old_ns);
67 get_uts_ns(old_ns); 45 get_uts_ns(old_ns);
68 46
69 if (!(flags & CLONE_NEWUTS)) 47 if (!(flags & CLONE_NEWUTS))
70 return 0; 48 return old_ns;
71
72 if (!capable(CAP_SYS_ADMIN)) {
73 err = -EPERM;
74 goto out;
75 }
76 49
77 new_ns = clone_uts_ns(old_ns); 50 new_ns = clone_uts_ns(old_ns);
78 if (!new_ns) {
79 err = -ENOMEM;
80 goto out;
81 }
82 tsk->nsproxy->uts_ns = new_ns;
83 51
84out:
85 put_uts_ns(old_ns); 52 put_uts_ns(old_ns);
86 return err; 53 return new_ns;
87} 54}
88 55
89void free_uts_ns(struct kref *kref) 56void free_uts_ns(struct kref *kref)