72 files changed, 5384 insertions, 2979 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a74be0..0b5ff083fa22 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,8 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
            kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-            async.o range.o
+            async.o range.o jump_label.o
-obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
 obj-y += groups.o
 ifdef CONFIG_FUNCTION_TRACER
@@ -23,6 +22,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_perf_event.o = -pg
+CFLAGS_REMOVE_irq_work.o = -pg
 endif
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -86,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
 obj-$(CONFIG_TINY_RCU) += rcutiny.o
+obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -100,6 +101,7 @@ obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8f6140..291ba3d04bea 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,7 +138,7 @@ struct css_id {
         * is called after synchronize_rcu(). But for safe use, css_is_removed()
         * css_tryget() should be used for avoiding race.
         */
-        struct cgroup_subsys_state *css;
+        struct cgroup_subsys_state __rcu *css;
        /*
         * ID of this css.
         */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c0979bbe7..51b143e2a07a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
        if (tsk->flags & PF_THREAD_BOUND)
                return -EINVAL;
-        ret = security_task_setscheduler(tsk, 0, NULL);
+        ret = security_task_setscheduler(tsk);
        if (ret)
                return ret;
        if (threadgroup) {
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
                rcu_read_lock();
                list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                        ret = security_task_setscheduler(c, 0, NULL);
+                        ret = security_task_setscheduler(c);
                        if (ret) {
                                rcu_read_unlock();
                                return ret;
diff --git a/kernel/early_res.c b/kernel/early_res.c
deleted file mode 100644
index 7bfae887f211..000000000000
--- a/kernel/early_res.c
+++ /dev/null
@@ -1,590 +0,0 @@
-/*
- * early_res, could be used to replace bootmem
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/early_res.h>
-#include <linux/slab.h>
-#include <linux/kmemleak.h>
-/*
- * Early reserved memory areas.
- */
-/*
- * need to make sure this one is bigger enough before
- * find_fw_memmap_area could be used
- */
-#define MAX_EARLY_RES_X 32
-struct early_res {
-        u64 start, end;
-        char name[15];
-        char overlap_ok;
-};
-static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
-static int max_early_res __initdata = MAX_EARLY_RES_X;
-static struct early_res *early_res __initdata = &early_res_x[0];
-static int early_res_count __initdata;
-static int __init find_overlapped_early(u64 start, u64 end)
-{
-        int i;
-        struct early_res *r;
-        for (i = 0; i < max_early_res && early_res[i].end; i++) {
-                r = &early_res[i];
-                if (end > r->start && start < r->end)
-                        break;
-        }
-        return i;
-}
-/*
- * Drop the i-th range from the early reservation map,
- * by copying any higher ranges down one over it, and
- * clearing what had been the last slot.
- */
-static void __init drop_range(int i)
-{
-        int j;
-        for (j = i + 1; j < max_early_res && early_res[j].end; j++)
-                ;
-        memmove(&early_res[i], &early_res[i + 1],
-               (j - 1 - i) * sizeof(struct early_res));
-        early_res[j - 1].end = 0;
-        early_res_count--;
-}
-static void __init drop_range_partial(int i, u64 start, u64 end)
-{
-        u64 common_start, common_end;
-        u64 old_start, old_end;
-        old_start = early_res[i].start;
-        old_end = early_res[i].end;
-        common_start = max(old_start, start);
-        common_end = min(old_end, end);
-        /* no overlap ? */
-        if (common_start >= common_end)
-                return;
-        if (old_start < common_start) {
-                /* make head segment */
-                early_res[i].end = common_start;
-                if (old_end > common_end) {
-                        char name[15];
-                        /*
-                         * Save a local copy of the name, since the
-                         * early_res array could get resized inside
-                         * reserve_early_without_check() ->
-                         * __check_and_double_early_res(), which would
-                         * make the current name pointer invalid.
-                         */
-                        strncpy(name, early_res[i].name,
-                                         sizeof(early_res[i].name) - 1);
-                        /* add another for left over on tail */
-                        reserve_early_without_check(common_end, old_end, name);
-                }
-                return;
-        } else {
-                if (old_end > common_end) {
-                        /* reuse the entry for tail left */
-                        early_res[i].start = common_end;
-                        return;
-                }
-                /* all covered */
-                drop_range(i);
-        }
-}
-/*
- * Split any existing ranges that:
- *  1) are marked 'overlap_ok', and
- *  2) overlap with the stated range [start, end)
- * into whatever portion (if any) of the existing range is entirely
- * below or entirely above the stated range.  Drop the portion
- * of the existing range that overlaps with the stated range,
- * which will allow the caller of this routine to then add that
- * stated range without conflicting with any existing range.
- */
-static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
-{
-        int i;
-        struct early_res *r;
-        u64 lower_start, lower_end;
-        u64 upper_start, upper_end;
-        char name[15];
-        for (i = 0; i < max_early_res && early_res[i].end; i++) {
-                r = &early_res[i];
-                /* Continue past non-overlapping ranges */
-                if (end <= r->start || start >= r->end)
-                        continue;
-                /*
-                 * Leave non-ok overlaps as is; let caller
-                 * panic "Overlapping early reservations"
-                 * when it hits this overlap.
-                 */
-                if (!r->overlap_ok)
-                        return;
-                /*
-                 * We have an ok overlap.  We will drop it from the early
-                 * reservation map, and add back in any non-overlapping
-                 * portions (lower or upper) as separate, overlap_ok,
-                 * non-overlapping ranges.
-                 */
-                /* 1. Note any non-overlapping (lower or upper) ranges. */
-                strncpy(name, r->name, sizeof(name) - 1);
-                lower_start = lower_end = 0;
-                upper_start = upper_end = 0;
-                if (r->start < start) {
-                        lower_start = r->start;
-                        lower_end = start;
-                }
-                if (r->end > end) {
-                        upper_start = end;
-                        upper_end = r->end;
-                }
-                /* 2. Drop the original ok overlapping range */
-                drop_range(i);
-                i--;            /* resume for-loop on copied down entry */
-                /* 3. Add back in any non-overlapping ranges. */
-                if (lower_end)
-                        reserve_early_overlap_ok(lower_start, lower_end, name);
-                if (upper_end)
-                        reserve_early_overlap_ok(upper_start, upper_end, name);
-        }
-}
-static void __init __reserve_early(u64 start, u64 end, char *name,
-                                                int overlap_ok)
-{
-        int i;
-        struct early_res *r;
-        i = find_overlapped_early(start, end);
-        if (i >= max_early_res)
-                panic("Too many early reservations");
-        r = &early_res[i];
-        if (r->end)
-                panic("Overlapping early reservations "
-                      "%llx-%llx %s to %llx-%llx %s\n",
-                      start, end - 1, name ? name : "", r->start,
-                      r->end - 1, r->name);
-        r->start = start;
-        r->end = end;
-        r->overlap_ok = overlap_ok;
-        if (name)
-                strncpy(r->name, name, sizeof(r->name) - 1);
-        early_res_count++;
-}
-/*
- * A few early reservtations come here.
- *
- * The 'overlap_ok' in the name of this routine does -not- mean it
- * is ok for these reservations to overlap an earlier reservation.
- * Rather it means that it is ok for subsequent reservations to
- * overlap this one.
- *
- * Use this entry point to reserve early ranges when you are doing
- * so out of "Paranoia", reserving perhaps more memory than you need,
- * just in case, and don't mind a subsequent overlapping reservation
- * that is known to be needed.
- *
- * The drop_overlaps_that_are_ok() call here isn't really needed.
- * It would be needed if we had two colliding 'overlap_ok'
- * reservations, so that the second such would not panic on the
- * overlap with the first.  We don't have any such as of this
- * writing, but might as well tolerate such if it happens in
- * the future.
- */
-void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
-{
-        drop_overlaps_that_are_ok(start, end);
-        __reserve_early(start, end, name, 1);
-}
-static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
-{
-        u64 start, end, size, mem;
-        struct early_res *new;
-        /* do we have enough slots left ? */
-        if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
-                return;
-        /* double it */
-        mem = -1ULL;
-        size = sizeof(struct early_res) * max_early_res * 2;
-        if (early_res == early_res_x)
-                start = 0;
-        else
-                start = early_res[0].end;
-        end = ex_start;
-        if (start + size < end)
-                mem = find_fw_memmap_area(start, end, size,
-                                         sizeof(struct early_res));
-        if (mem == -1ULL) {
-                start = ex_end;
-                end = get_max_mapped();
-                if (start + size < end)
-                        mem = find_fw_memmap_area(start, end, size,
-                                                 sizeof(struct early_res));
-        }
-        if (mem == -1ULL)
-                panic("can not find more space for early_res array");
-        new = __va(mem);
-        /* save the first one for own */
-        new[0].start = mem;
-        new[0].end = mem + size;
-        new[0].overlap_ok = 0;
-        /* copy old to new */
-        if (early_res == early_res_x) {
-                memcpy(&new[1], &early_res[0],
-                         sizeof(struct early_res) * max_early_res);
-                memset(&new[max_early_res+1], 0,
-                         sizeof(struct early_res) * (max_early_res - 1));
-                early_res_count++;
-        } else {
-                memcpy(&new[1], &early_res[1],
-                         sizeof(struct early_res) * (max_early_res - 1));
-                memset(&new[max_early_res], 0,
-                         sizeof(struct early_res) * max_early_res);
-        }
-        memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
-        early_res = new;
-        max_early_res *= 2;
-        printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
-                max_early_res, mem, mem + size - 1);
-}
-/*
- * Most early reservations come here.
- *
- * We first have drop_overlaps_that_are_ok() drop any pre-existing
- * 'overlap_ok' ranges, so that we can then reserve this memory
- * range without risk of panic'ing on an overlapping overlap_ok
- * early reservation.
- */
-void __init reserve_early(u64 start, u64 end, char *name)
-{
-        if (start >= end)
-                return;
-        __check_and_double_early_res(start, end);
-        drop_overlaps_that_are_ok(start, end);
-        __reserve_early(start, end, name, 0);
-}
-void __init reserve_early_without_check(u64 start, u64 end, char *name)
-{
-        struct early_res *r;
-        if (start >= end)
-                return;
-        __check_and_double_early_res(start, end);
-        r = &early_res[early_res_count];
-        r->start = start;
-        r->end = end;
-        r->overlap_ok = 0;
-        if (name)
-                strncpy(r->name, name, sizeof(r->name) - 1);
-        early_res_count++;
-}
-void __init free_early(u64 start, u64 end)
-{
-        struct early_res *r;
-        int i;
-        kmemleak_free_part(__va(start), end - start);
-        i = find_overlapped_early(start, end);
-        r = &early_res[i];
-        if (i >= max_early_res || r->end != end || r->start != start)
-                panic("free_early on not reserved area: %llx-%llx!",
-                         start, end - 1);
-        drop_range(i);
-}
-void __init free_early_partial(u64 start, u64 end)
-{
-        struct early_res *r;
-        int i;
-        kmemleak_free_part(__va(start), end - start);
-        if (start == end)
-                return;
-        if (WARN_ONCE(start > end, "  wrong range [%#llx, %#llx]\n", start, end))
-                return;
-try_next:
-        i = find_overlapped_early(start, end);
-        if (i >= max_early_res)
-                return;
-        r = &early_res[i];
-        /* hole ? */
-        if (r->end >= end && r->start <= start) {
-                drop_range_partial(i, start, end);
-                return;
-        }
-        drop_range_partial(i, start, end);
-        goto try_next;
-}
-#ifdef CONFIG_NO_BOOTMEM
-static void __init subtract_early_res(struct range *range, int az)
-{
-        int i, count;
-        u64 final_start, final_end;
-        int idx = 0;
-        count  = 0;
-        for (i = 0; i < max_early_res && early_res[i].end; i++)
-                count++;
-        /* need to skip first one ?*/
-        if (early_res != early_res_x)
-                idx = 1;
-#define DEBUG_PRINT_EARLY_RES 1
-#if DEBUG_PRINT_EARLY_RES
-        printk(KERN_INFO "Subtract (%d early reservations)\n", count);
-#endif
-        for (i = idx; i < count; i++) {
-                struct early_res *r = &early_res[i];
-#if DEBUG_PRINT_EARLY_RES
-                printk(KERN_INFO "  #%d [%010llx - %010llx] %15s\n", i,
-                        r->start, r->end, r->name);
-#endif
-                final_start = PFN_DOWN(r->start);
-                final_end = PFN_UP(r->end);
-                if (final_start >= final_end)
-                        continue;
-                subtract_range(range, az, final_start, final_end);
-        }
-}
-int __init get_free_all_memory_range(struct range **rangep, int nodeid)
-{
-        int i, count;
-        u64 start = 0, end;
-        u64 size;
-        u64 mem;
-        struct range *range;
-        int nr_range;
-        count  = 0;
-        for (i = 0; i < max_early_res && early_res[i].end; i++)
-                count++;
-        count *= 2;
-        size = sizeof(struct range) * count;
-        end = get_max_mapped();
-#ifdef MAX_DMA32_PFN
-        if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
-                start = MAX_DMA32_PFN << PAGE_SHIFT;
-#endif
-        mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
-        if (mem == -1ULL)
-                panic("can not find more space for range free");
-        range = __va(mem);
-        /* use early_node_map[] and early_res to get range array at first */
-        memset(range, 0, size);
-        nr_range = 0;
-        /* need to go over early_node_map to find out good range for node */
-        nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
-#ifdef CONFIG_X86_32
-        subtract_range(range, count, max_low_pfn, -1ULL);
-#endif
-        subtract_early_res(range, count);
-        nr_range = clean_sort_range(range, count);
-        /* need to clear it ? */
-        if (nodeid == MAX_NUMNODES) {
-                memset(&early_res[0], 0,
-                         sizeof(struct early_res) * max_early_res);
-                early_res = NULL;
-                max_early_res = 0;
-        }
-        *rangep = range;
-        return nr_range;
-}
-#else
-void __init early_res_to_bootmem(u64 start, u64 end)
-{
-        int i, count;
-        u64 final_start, final_end;
-        int idx = 0;
-        count  = 0;
-        for (i = 0; i < max_early_res && early_res[i].end; i++)
-                count++;
-        /* need to skip first one ?*/
-        if (early_res != early_res_x)
-                idx = 1;
-        printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
-                         count - idx, max_early_res, start, end);
-        for (i = idx; i < count; i++) {
-                struct early_res *r = &early_res[i];
-                printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
-                        r->start, r->end, r->name);
-                final_start = max(start, r->start);
-                final_end = min(end, r->end);
-                if (final_start >= final_end) {
-                        printk(KERN_CONT "\n");
-                        continue;
-                }
-                printk(KERN_CONT " ==> [%010llx - %010llx]\n",
-                        final_start, final_end);
-                reserve_bootmem_generic(final_start, final_end - final_start,
-                                BOOTMEM_DEFAULT);
-        }
-        /* clear them */
-        memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
-        early_res = NULL;
-        max_early_res = 0;
-        early_res_count = 0;
-}
-#endif
-/* Check for already reserved areas */
-static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
-{
-        int i;
-        u64 addr = *addrp;
-        int changed = 0;
-        struct early_res *r;
-again:
-        i = find_overlapped_early(addr, addr + size);
-        r = &early_res[i];
-        if (i < max_early_res && r->end) {
-                *addrp = addr = round_up(r->end, align);
-                changed = 1;
-                goto again;
-        }
-        return changed;
-}
-/* Check for already reserved areas */
-static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
-{
-        int i;
-        u64 addr = *addrp, last;
-        u64 size = *sizep;
-        int changed = 0;
-again:
-        last = addr + size;
-        for (i = 0; i < max_early_res && early_res[i].end; i++) {
-                struct early_res *r = &early_res[i];
-                if (last > r->start && addr < r->start) {
-                        size = r->start - addr;
-                        changed = 1;
-                        goto again;
-                }
-                if (last > r->end && addr < r->end) {
-                        addr = round_up(r->end, align);
-                        size = last - addr;
-                        changed = 1;
-                        goto again;
-                }
-                if (last <= r->end && addr >= r->start) {
-                        (*sizep)++;
-                        return 0;
-                }
-        }
-        if (changed) {
-                *addrp = addr;
-                *sizep = size;
-        }
-        return changed;
-}
-/*
- * Find a free area with specified alignment in a specific range.
- * only with the area.between start to end is active range from early_node_map
- * so they are good as RAM
- */
-u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
-                         u64 size, u64 align)
-{
-        u64 addr, last;
-        addr = round_up(ei_start, align);
-        if (addr < start)
-                addr = round_up(start, align);
-        if (addr >= ei_last)
-                goto out;
-        while (bad_addr(&addr, size, align) && addr+size <= ei_last)
-                ;
-        last = addr + size;
-        if (last > ei_last)
-                goto out;
-        if (last > end)
-                goto out;
-        return addr;
-out:
-        return -1ULL;
-}
-u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
-                         u64 *sizep, u64 align)
-{
-        u64 addr, last;
-        addr = round_up(ei_start, align);
-        if (addr < start)
-                addr = round_up(start, align);
-        if (addr >= ei_last)
-                goto out;
-        *sizep = ei_last - addr;
-        while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
-                ;
-        last = addr + *sizep;
-        if (last > ei_last)
-                goto out;
-        return addr;
-out:
-        return -1ULL;
-}
diff --git a/kernel/exit.c b/kernel/exit.c
index 03120229db28..e2bdf37f9fde 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
-#ifdef CONFIG_PERF_EVENTS
+        perf_event_delayed_put(tsk);
-        WARN_ON_ONCE(tsk->perf_event_ctxp);
-#endif
        trace_sched_process_free(tsk);
        put_task_struct(tsk);
 }
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..a118bf160e0b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -91,6 +91,7 @@ struct futex_pi_state {
 /**
 * struct futex_q - The hashed futex queue entry, one per waiting task
+ * @list:               priority-sorted list of tasks waiting on this futex
 * @task:               the task waiting on the futex
 * @lock_ptr:           the hash bucket lock
 * @key:                the key the futex is hashed on
@@ -104,7 +105,7 @@ struct futex_pi_state {
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
- * The order of wakup is always to make the first condition true, then
+ * The order of wakeup is always to make the first condition true, then
 * the second.
 *
 * PI futexes are typically woken before they are removed from the hash list via
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key)
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
- * We have no generic implementation of a non destructive write to the
+ * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                         */
                        pi_state = this->pi_state;
                        /*
-                         * Userspace might have messed up non PI and PI futexes
+                         * Userspace might have messed up non-PI and PI futexes
                         */
                        if (unlikely(!pi_state))
                                return -EINVAL;
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q)
        /*
         * We set q->lock_ptr = NULL _before_ we wake up the task. If
-         * a non futex wake up happens on another CPU then the task
+         * a non-futex wake up happens on another CPU then the task
-         * might exit and p would dereference a non existing task
+         * might exit and p would dereference a non-existing task
         * struct. Prevent this by holding a reference on p across the
         * wake up.
         */
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 /**
 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
- * uaddr1:      source futex user address
+ * @uaddr1:     source futex user address
- * uaddr2:      target futex user address
+ * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
- * nr_wake:     number of waiters to wake (must be 1 for requeue_pi)
+ * @uaddr2:     target futex user address
- * nr_requeue:  number of waiters to requeue (0-INT_MAX)
+ * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
- * requeue_pi:  if we are attempting to requeue from a non-pi futex to a
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @cmpval:     @uaddr1 expected value (or %NULL)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
 *              pi futex (pi to pi requeue is not supported)
 *
 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
@@ -1360,10 +1363,10 @@ out:
 /* The key must be already stored in q->key. */
 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+        __acquires(&hb->lock)
 {
        struct futex_hash_bucket *hb;
-        get_futex_key_refs(&q->key);
        hb = hash_futex(&q->key);
        q->lock_ptr = &hb->lock;
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 static inline void
 queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+        __releases(&hb->lock)
 {
        spin_unlock(&hb->lock);
-        drop_futex_key_refs(&q->key);
 }
 /**
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
 * an example).
 */
 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+        __releases(&hb->lock)
 {
        int prio;
@@ -1471,6 +1475,7 @@ retry:
 * and dropped here.
 */
 static void unqueue_me_pi(struct futex_q *q)
+        __releases(q->lock_ptr)
 {
        WARN_ON(plist_node_empty(&q->list));
        plist_del(&q->list, &q->list.plist);
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q)
        q->pi_state = NULL;
        spin_unlock(q->lock_ptr);
-        drop_futex_key_refs(&q->key);
 }
 /*
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
        }
 retry:
-        /* Prepare to wait on uaddr. */
+        /*
+         * Prepare to wait on uaddr. On success, holds hb lock and increments
+         * q.key refs.
+         */
        ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
        if (ret)
                goto out;
@@ -1822,28 +1828,27 @@ retry:
        /* If we were woken (and unqueued), we succeeded, whatever. */
        ret = 0;
+        /* unqueue_me() drops q.key ref */
        if (!unqueue_me(&q))
-                goto out_put_key;
+                goto out;
        ret = -ETIMEDOUT;
        if (to && !to->task)
-                goto out_put_key;
+                goto out;
        /*
         * We expect signal_pending(current), but we might be the
         * victim of a spurious wakeup as well.
         */
-        if (!signal_pending(current)) {
+        if (!signal_pending(current))
-                put_futex_key(fshared, &q.key);
                goto retry;
-        }
        ret = -ERESTARTSYS;
        if (!abs_time)
-                goto out_put_key;
+                goto out;
        restart = &current_thread_info()->restart_block;
        restart->fn = futex_wait_restart;
-        restart->futex.uaddr = (u32 *)uaddr;
+        restart->futex.uaddr = uaddr;
        restart->futex.val = val;
        restart->futex.time = abs_time->tv64;
        restart->futex.bitset = bitset;
@@ -1856,8 +1861,6 @@ retry:
        ret = -ERESTART_RESTARTBLOCK;
-out_put_key:
-        put_futex_key(fshared, &q.key);
 out:
        if (to) {
                hrtimer_cancel(&to->timer);
@@ -1869,7 +1872,7 @@ out:
 static long futex_wait_restart(struct restart_block *restart)
 {
-        u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
+        u32 __user *uaddr = restart->futex.uaddr;
        int fshared = 0;
        ktime_t t, *tp = NULL;
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
        q.rt_waiter = &rt_waiter;
        q.requeue_pi_key = &key2;
-        /* Prepare to wait on uaddr. */
+        /*
+         * Prepare to wait on uaddr. On success, increments q.key (key1) ref
+         * count.
+         */
        ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
        if (ret)
                goto out_key2;
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
         * In order for us to be here, we know our q.key == key2, and since
         * we took the hb->lock above, we also know that futex_requeue() has
         * completed and we no longer have to concern ourselves with a wakeup
-         * race with the atomic proxy lock acquition by the requeue code.
+         * race with the atomic proxy lock acquisition by the requeue code. The
+         * futex_requeue() call dropped our key1 reference and incremented our
+         * key2 reference count.
         */
        /* Check if the requeue code acquired the second futex for us. */
@@ -2458,7 +2466,7 @@ retry:
 */
 static inline int fetch_robust_entry(struct robust_list __user **entry,
                                     struct robust_list __user * __user *head,
-                                     int *pi)
+                                     unsigned int *pi)
 {
        unsigned long uentry;
@@ -2647,7 +2655,7 @@ static int __init futex_init(void)
         * of the complex code paths. Also we want to prevent
         * registration of robust lists in that case. NULL is
         * guaranteed to fault and we get -EFAULT on functional
-         * implementation, the non functional ones will return
+         * implementation, the non-functional ones will return
         * -ENOSYS.
         */
        curval = cmpxchg_futex_value_locked(NULL, 0, 0);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d49afb2395e5..06da4dfc339b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
 */
 static inline int
 fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
-                   compat_uptr_t __user *head, int *pi)
+                   compat_uptr_t __user *head, unsigned int *pi)
 {
        if (get_user(*uentry, head))
                return -EFAULT;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac2..53ead174da2f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
        printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
                        " disables this message.\n");
        sched_show_task(t);
-        __debug_show_held_locks(t);
+        debug_show_held_locks(t);
        touch_nmi_watchdog();
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 * periodically exit the critical section and enter a new one.
 *
 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
- * exit the grace period. For classic RCU, a reschedule is required.
+ * to exit the grace period. For classic RCU, a reschedule is required.
 */
 static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
 {
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c7c2aed9e2dc..2c9120f0afca 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 */
 static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
 {
-        struct perf_event_context *ctx = bp->ctx;
+        struct task_struct *tsk = bp->hw.bp_target;
        struct perf_event *iter;
        int count = 0;
        list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-                if (iter->ctx == ctx && find_slot_idx(iter) == type)
+                if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
                        count += hw_breakpoint_weight(iter);
        }
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
                    enum bp_type_idx type)
 {
        int cpu = bp->cpu;
-        struct task_struct *tsk = bp->ctx->task;
+        struct task_struct *tsk = bp->hw.bp_target;
        if (cpu >= 0) {
                slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
               int weight)
 {
        int cpu = bp->cpu;
-        struct task_struct *tsk = bp->ctx->task;
+        struct task_struct *tsk = bp->hw.bp_target;
        /* Pinned counter cpu profiling */
        if (!tsk) {
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
                            perf_overflow_handler_t triggered,
                            struct task_struct *tsk)
 {
-        return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk),
+        return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
-                                                triggered);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
        get_online_cpus();
        for_each_online_cpu(cpu) {
                pevent = per_cpu_ptr(cpu_events, cpu);
-                bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+                bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
                *pevent = bp;
@@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
        .priority = 0x7fffffff
 };
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+        release_bp_slot(event);
+}
+static int hw_breakpoint_event_init(struct perf_event *bp)
+{
+        int err;
+        if (bp->attr.type != PERF_TYPE_BREAKPOINT)
+                return -ENOENT;
+        err = register_perf_hw_breakpoint(bp);
+        if (err)
+                return err;
+        bp->destroy = bp_perf_event_destroy;
+        return 0;
+}
+static int hw_breakpoint_add(struct perf_event *bp, int flags)
+{
+        if (!(flags & PERF_EF_START))
+                bp->hw.state = PERF_HES_STOPPED;
+        return arch_install_hw_breakpoint(bp);
+}
+static void hw_breakpoint_del(struct perf_event *bp, int flags)
+{
+        arch_uninstall_hw_breakpoint(bp);
+}
+static void hw_breakpoint_start(struct perf_event *bp, int flags)
+{
+        bp->hw.state = 0;
+}
+static void hw_breakpoint_stop(struct perf_event *bp, int flags)
+{
+        bp->hw.state = PERF_HES_STOPPED;
+}
+static struct pmu perf_breakpoint = {
+        .task_ctx_nr    = perf_sw_context, /* could eventually get its own */
+        .event_init     = hw_breakpoint_event_init,
+        .add            = hw_breakpoint_add,
+        .del            = hw_breakpoint_del,
+        .start          = hw_breakpoint_start,
+        .stop           = hw_breakpoint_stop,
+        .read           = hw_breakpoint_pmu_read,
+};
 static int __init init_hw_breakpoint(void)
 {
        unsigned int **task_bp_pinned;
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void)
        constraints_initialized = 1;
+        perf_pmu_register(&perf_breakpoint);
        return register_die_notifier(&hw_breakpoint_exceptions_nb);
 err_alloc:
@@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void)
 core_initcall(init_hw_breakpoint);
-struct pmu perf_ops_bp = {
-        .enable         = arch_install_hw_breakpoint,
-        .disable        = arch_uninstall_hw_breakpoint,
-        .read           = hw_breakpoint_pmu_read,
-};
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
new file mode 100644
index 000000000000..31d766bf5d2e
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,53 @@
+config HAVE_GENERIC_HARDIRQS
+        def_bool n
+if HAVE_GENERIC_HARDIRQS
+menu "IRQ subsystem"
+#
+# Interrupt subsystem related configuration options
+#
+config GENERIC_HARDIRQS
+       def_bool y
+config GENERIC_HARDIRQS_NO__DO_IRQ
+       def_bool y
+# Select this to disable the deprecated stuff
+config GENERIC_HARDIRQS_NO_DEPRECATED
+       def_bool n
+# Options selectable by the architecture code
+config HAVE_SPARSE_IRQ
+       def_bool n
+config GENERIC_IRQ_PROBE
+        def_bool n
+config GENERIC_PENDING_IRQ
+        def_bool n
+config AUTO_IRQ_AFFINITY
+       def_bool n
+config IRQ_PER_CPU
+       def_bool n
+config HARDIRQS_SW_RESEND
+       def_bool n
+config SPARSE_IRQ
+        bool "Support sparse irq numbering"
+        depends on HAVE_SPARSE_IRQ
+        ---help---
+          Sparse irq numbering is useful for distro kernels that want
+          to define a high CONFIG_NR_CPUS value but still want to have
+          a low kernel memory footprint on smaller machines.
+          ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
+            out the interrupt descriptors in a more NUMA-friendly way. )
+          If you don't know what to do here, say N.
+endmenu
+endif
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 7d047808419d..54329cd7b3ee 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,7 +1,6 @@
-obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
+obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 2295a31ef110..505798f86c36 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -57,9 +57,10 @@ unsigned long probe_irq_on(void)
                         * Some chips need to know about probing in
                         * progress:
                         */
-                        if (desc->chip->set_type)
+                        if (desc->irq_data.chip->irq_set_type)
-                                desc->chip->set_type(i, IRQ_TYPE_PROBE);
+                                desc->irq_data.chip->irq_set_type(&desc->irq_data,
-                        desc->chip->startup(i);
+                                                         IRQ_TYPE_PROBE);
+                        desc->irq_data.chip->irq_startup(&desc->irq_data);
                }
                raw_spin_unlock_irq(&desc->lock);
        }
@@ -76,7 +77,7 @@ unsigned long probe_irq_on(void)
                raw_spin_lock_irq(&desc->lock);
                if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
                        desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
-                        if (desc->chip->startup(i))
+                        if (desc->irq_data.chip->irq_startup(&desc->irq_data))
                                desc->status |= IRQ_PENDING;
                }
                raw_spin_unlock_irq(&desc->lock);
@@ -98,7 +99,7 @@ unsigned long probe_irq_on(void)
                        /* It triggered already - consider it spurious. */
                        if (!(status & IRQ_WAITING)) {
                                desc->status = status & ~IRQ_AUTODETECT;
-                                desc->chip->shutdown(i);
+                                desc->irq_data.chip->irq_shutdown(&desc->irq_data);
                        } else
                                if (i < 32)
                                        mask |= 1 << i;
@@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val)
                                mask |= 1 << i;
                        desc->status = status & ~IRQ_AUTODETECT;
-                        desc->chip->shutdown(i);
+                        desc->irq_data.chip->irq_shutdown(&desc->irq_data);
                }
                raw_spin_unlock_irq(&desc->lock);
        }
@@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val)
                                nr_of_irqs++;
                        }
                        desc->status = status & ~IRQ_AUTODETECT;
-                        desc->chip->shutdown(i);
+                        desc->irq_data.chip->irq_shutdown(&desc->irq_data);
                }
                raw_spin_unlock_irq(&desc->lock);
        }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b7091d5ca2f8..baa5c4acad83 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,108 +18,6 @@
 #include "internals.h"
-static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
-{
-        struct irq_desc *desc;
-        unsigned long flags;
-        desc = irq_to_desc(irq);
-        if (!desc) {
-                WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
-                return;
-        }
-        /* Ensure we don't have left over values from a previous use of this irq */
-        raw_spin_lock_irqsave(&desc->lock, flags);
-        desc->status = IRQ_DISABLED;
-        desc->chip = &no_irq_chip;
-        desc->handle_irq = handle_bad_irq;
-        desc->depth = 1;
-        desc->msi_desc = NULL;
-        desc->handler_data = NULL;
-        if (!keep_chip_data)
-                desc->chip_data = NULL;
-        desc->action = NULL;
-        desc->irq_count = 0;
-        desc->irqs_unhandled = 0;
-#ifdef CONFIG_SMP
-        cpumask_setall(desc->affinity);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-        cpumask_clear(desc->pending_mask);
-#endif
-#endif
-        raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-/**
- *      dynamic_irq_init - initialize a dynamically allocated irq
- *      @irq:   irq number to initialize
- */
-void dynamic_irq_init(unsigned int irq)
-{
-        dynamic_irq_init_x(irq, false);
-}
-/**
- *      dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
- *      @irq:   irq number to initialize
- *
- *      does not set irq_to_desc(irq)->chip_data to NULL
- */
-void dynamic_irq_init_keep_chip_data(unsigned int irq)
-{
-        dynamic_irq_init_x(irq, true);
-}
-static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
-{
-        struct irq_desc *desc = irq_to_desc(irq);
-        unsigned long flags;
-        if (!desc) {
-                WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
-                return;
-        }
-        raw_spin_lock_irqsave(&desc->lock, flags);
-        if (desc->action) {
-                raw_spin_unlock_irqrestore(&desc->lock, flags);
-                WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
-                        irq);
-                return;
-        }
-        desc->msi_desc = NULL;
-        desc->handler_data = NULL;
-        if (!keep_chip_data)
-                desc->chip_data = NULL;
-        desc->handle_irq = handle_bad_irq;
-        desc->chip = &no_irq_chip;
-        desc->name = NULL;
-        clear_kstat_irqs(desc);
-        raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-/**
- *      dynamic_irq_cleanup - cleanup a dynamically allocated irq
- *      @irq:   irq number to initialize
- */
-void dynamic_irq_cleanup(unsigned int irq)
-{
-        dynamic_irq_cleanup_x(irq, false);
-}
-/**
- *      dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
- *      @irq:   irq number to initialize
- *
- *      does not set irq_to_desc(irq)->chip_data to NULL
- */
-void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
-{
-        dynamic_irq_cleanup_x(irq, true);
-}
 /**
 *      set_irq_chip - set the irq chip for an irq
 *      @irq:   irq number
@@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
        raw_spin_lock_irqsave(&desc->lock, flags);
        irq_chip_set_defaults(chip);
-        desc->chip = chip;
+        desc->irq_data.chip = chip;
        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return 0;
@@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data)
        }
        raw_spin_lock_irqsave(&desc->lock, flags);
-        desc->handler_data = data;
+        desc->irq_data.handler_data = data;
        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return 0;
 }
@@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
        }
        raw_spin_lock_irqsave(&desc->lock, flags);
-        desc->msi_desc = entry;
+        desc->irq_data.msi_desc = entry;
        if (entry)
                entry->irq = irq;
        raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data)
                return -EINVAL;
        }
-        if (!desc->chip) {
+        if (!desc->irq_data.chip) {
                printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
                return -EINVAL;
        }
        raw_spin_lock_irqsave(&desc->lock, flags);
-        desc->chip_data = data;
+        desc->irq_data.chip_data = data;
        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return 0;
 }
 EXPORT_SYMBOL(set_irq_chip_data);
+struct irq_data *irq_get_irq_data(unsigned int irq)
+{
+        struct irq_desc *desc = irq_to_desc(irq);
+        return desc ? &desc->irq_data : NULL;
+}
+EXPORT_SYMBOL_GPL(irq_get_irq_data);
 /**
 *      set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
 *
@@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread);
 /*
 * default enable function
 */
-static void default_enable(unsigned int irq)
+static void default_enable(struct irq_data *data)
 {
-        struct irq_desc *desc = irq_to_desc(irq);
+        struct irq_desc *desc = irq_data_to_desc(data);
-        desc->chip->unmask(irq);
+        desc->irq_data.chip->irq_unmask(&desc->irq_data);
        desc->status &= ~IRQ_MASKED;
 }
 /*
 * default disable function
 */
-static void default_disable(unsigned int irq)
+static void default_disable(struct irq_data *data)
 {
 }
 /*
 * default startup function
 */
-static unsigned int default_startup(unsigned int irq)
+static unsigned int default_startup(struct irq_data *data)
 {
-        struct irq_desc *desc = irq_to_desc(irq);
+        struct irq_desc *desc = irq_data_to_desc(data);
-        desc->chip->enable(irq);
+        desc->irq_data.chip->irq_enable(data);
        return 0;
 }
 /*
 * default shutdown function
 */
-static void default_shutdown(unsigned int irq)
+static void default_shutdown(struct irq_data *data)
 {
-        struct irq_desc *desc = irq_to_desc(irq);
+        struct irq_desc *desc = irq_data_to_desc(data);
-        desc->chip->mask(irq);
+        desc->irq_data.chip->irq_mask(&desc->irq_data);
        desc->status |= IRQ_MASKED;
 }
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
+/* Temporary migration helpers */
+static void compat_irq_mask(struct irq_data *data)
+{
+        data->chip->mask(data->irq);
+}
+static void compat_irq_unmask(struct irq_data *data)
+{
+        data->chip->unmask(data->irq);
+}
+static void compat_irq_ack(struct irq_data *data)
+{
+        data->chip->ack(data->irq);
+}
+static void compat_irq_mask_ack(struct irq_data *data)
+{
+        data->chip->mask_ack(data->irq);
+}
+static void compat_irq_eoi(struct irq_data *data)
+{
+        data->chip->eoi(data->irq);
+}
+static void compat_irq_enable(struct irq_data *data)
+{
+        data->chip->enable(data->irq);
+}
+static void compat_irq_disable(struct irq_data *data)
+{
+        data->chip->disable(data->irq);
+}
+static void compat_irq_shutdown(struct irq_data *data)
+{
+        data->chip->shutdown(data->irq);
+}
+static unsigned int compat_irq_startup(struct irq_data *data)
+{
+        return data->chip->startup(data->irq);
+}
+static int compat_irq_set_affinity(struct irq_data *data,
+                                   const struct cpumask *dest, bool force)
+{
+        return data->chip->set_affinity(data->irq, dest);
+}
+static int compat_irq_set_type(struct irq_data *data, unsigned int type)
+{
+        return data->chip->set_type(data->irq, type);
+}
+static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
+{
+        return data->chip->set_wake(data->irq, on);
+}
+static int compat_irq_retrigger(struct irq_data *data)
+{
+        return data->chip->retrigger(data->irq);
+}
+static void compat_bus_lock(struct irq_data *data)
+{
+        data->chip->bus_lock(data->irq);
+}
+static void compat_bus_sync_unlock(struct irq_data *data)
+{
+        data->chip->bus_sync_unlock(data->irq);
+}
+#endif
 /*
 * Fixup enable/disable function pointers
 */
 void irq_chip_set_defaults(struct irq_chip *chip)
 {
-        if (!chip->enable)
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
-                chip->enable = default_enable;
-        if (!chip->disable)
-                chip->disable = default_disable;
-        if (!chip->startup)
-                chip->startup = default_startup;
        /*
-         * We use chip->disable, when the user provided its own. When
+         * Compat fixup functions need to run before we set the
-         * we have default_disable set for chip->disable, then we need
+         * defaults for enable/disable/startup/shutdown
+         */
+        if (chip->enable)
+                chip->irq_enable = compat_irq_enable;
+        if (chip->disable)
+                chip->irq_disable = compat_irq_disable;
+        if (chip->shutdown)
+                chip->irq_shutdown = compat_irq_shutdown;
+        if (chip->startup)
+                chip->irq_startup = compat_irq_startup;
+#endif
+        /*
+         * The real defaults
+         */
+        if (!chip->irq_enable)
+                chip->irq_enable = default_enable;
+        if (!chip->irq_disable)
+                chip->irq_disable = default_disable;
+        if (!chip->irq_startup)
+                chip->irq_startup = default_startup;
+        /*
+         * We use chip->irq_disable, when the user provided its own. When
+         * we have default_disable set for chip->irq_disable, then we need
         * to use default_shutdown, otherwise the irq line is not
         * disabled on free_irq():
         */
-        if (!chip->shutdown)
+        if (!chip->irq_shutdown)
-                chip->shutdown = chip->disable != default_disable ?
+                chip->irq_shutdown = chip->irq_disable != default_disable ?
-                        chip->disable : default_shutdown;
+                        chip->irq_disable : default_shutdown;
-        if (!chip->name)
-                chip->name = chip->typename;
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
        if (!chip->end)
                chip->end = dummy_irq_chip.end;
+        /*
+         * Now fix up the remaining compat handlers
+         */
+        if (chip->bus_lock)
+                chip->irq_bus_lock = compat_bus_lock;
+        if (chip->bus_sync_unlock)
+                chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
+        if (chip->mask)
+                chip->irq_mask = compat_irq_mask;
+        if (chip->unmask)
+                chip->irq_unmask = compat_irq_unmask;
+        if (chip->ack)
+                chip->irq_ack = compat_irq_ack;
+        if (chip->mask_ack)
+                chip->irq_mask_ack = compat_irq_mask_ack;
+        if (chip->eoi)
+                chip->irq_eoi = compat_irq_eoi;
+        if (chip->set_affinity)
+                chip->irq_set_affinity = compat_irq_set_affinity;
+        if (chip->set_type)
+                chip->irq_set_type = compat_irq_set_type;
+        if (chip->set_wake)
+                chip->irq_set_wake = compat_irq_set_wake;
+        if (chip->retrigger)
+                chip->irq_retrigger = compat_irq_retrigger;
+#endif
 }
-static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+static inline void mask_ack_irq(struct irq_desc *desc)
 {
-        if (desc->chip->mask_ack)
+        if (desc->irq_data.chip->irq_mask_ack)
-                desc->chip->mask_ack(irq);
+                desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
        else {
-                desc->chip->mask(irq);
+                desc->irq_data.chip->irq_mask(&desc->irq_data);
-                if (desc->chip->ack)
+                if (desc->irq_data.chip->irq_ack)
-                        desc->chip->ack(irq);
+                        desc->irq_data.chip->irq_ack(&desc->irq_data);
        }
        desc->status |= IRQ_MASKED;
 }
-static inline void mask_irq(struct irq_desc *desc, int irq)
+static inline void mask_irq(struct irq_desc *desc)
 {
-        if (desc->chip->mask) {
+        if (desc->irq_data.chip->irq_mask) {
-                desc->chip->mask(irq);
+                desc->irq_data.chip->irq_mask(&desc->irq_data);
                desc->status |= IRQ_MASKED;
        }
 }
-static inline void unmask_irq(struct irq_desc *desc, int irq)
+static inline void unmask_irq(struct irq_desc *desc)
 {
-        if (desc->chip->unmask) {
+        if (desc->irq_data.chip->irq_unmask) {
-                desc->chip->unmask(irq);
+                desc->irq_data.chip->irq_unmask(&desc->irq_data);
                desc->status &= ~IRQ_MASKED;
        }
 }
@@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
        irqreturn_t action_ret;
        raw_spin_lock(&desc->lock);
-        mask_ack_irq(desc, irq);
+        mask_ack_irq(desc);
        if (unlikely(desc->status & IRQ_INPROGRESS))
                goto out_unlock;
@@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
        desc->status &= ~IRQ_INPROGRESS;
        if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
-                unmask_irq(desc, irq);
+                unmask_irq(desc);
 out_unlock:
        raw_spin_unlock(&desc->lock);
 }
@@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
        action = desc->action;
        if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
                desc->status |= IRQ_PENDING;
-                mask_irq(desc, irq);
+                mask_irq(desc);
                goto out;
        }
@@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
        raw_spin_lock(&desc->lock);
        desc->status &= ~IRQ_INPROGRESS;
 out:
-        desc->chip->eoi(irq);
+        desc->irq_data.chip->irq_eoi(&desc->irq_data);
        raw_spin_unlock(&desc->lock);
 }
@@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
        if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
                    !desc->action)) {
                desc->status |= (IRQ_PENDING | IRQ_MASKED);
-                mask_ack_irq(desc, irq);
+                mask_ack_irq(desc);
                goto out_unlock;
        }
        kstat_incr_irqs_this_cpu(irq, desc);
        /* Start handling the irq */
-        if (desc->chip->ack)
+        desc->irq_data.chip->irq_ack(&desc->irq_data);
-                desc->chip->ack(irq);
        /* Mark the IRQ currently in progress.*/
        desc->status |= IRQ_INPROGRESS;
@@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                irqreturn_t action_ret;
                if (unlikely(!action)) {
-                        mask_irq(desc, irq);
+                        mask_irq(desc);
                        goto out_unlock;
                }
@@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                if (unlikely((desc->status &
                               (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
                              (IRQ_PENDING | IRQ_MASKED))) {
-                        unmask_irq(desc, irq);
+                        unmask_irq(desc);
                }
                desc->status &= ~IRQ_PENDING;
@@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
        kstat_incr_irqs_this_cpu(irq, desc);
-        if (desc->chip->ack)
+        if (desc->irq_data.chip->irq_ack)
-                desc->chip->ack(irq);
+                desc->irq_data.chip->irq_ack(&desc->irq_data);
        action_ret = handle_IRQ_event(irq, desc->action);
        if (!noirqdebug)
                note_interrupt(irq, desc, action_ret);
-        if (desc->chip->eoi)
+        if (desc->irq_data.chip->irq_eoi)
-                desc->chip->eoi(irq);
+                desc->irq_data.chip->irq_eoi(&desc->irq_data);
 }
 void
@@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
        if (!handle)
                handle = handle_bad_irq;
-        else if (desc->chip == &no_irq_chip) {
+        else if (desc->irq_data.chip == &no_irq_chip) {
                printk(KERN_WARNING "Trying to install %sinterrupt handler "
                       "for IRQ%d\n", is_chained ? "chained " : "", irq);
                /*
@@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
                 * prevent us to setup the interrupt at all. Switch it to
                 * dummy_irq_chip for easy transition.
                 */
-                desc->chip = &dummy_irq_chip;
+                desc->irq_data.chip = &dummy_irq_chip;
        }
-        chip_bus_lock(irq, desc);
+        chip_bus_lock(desc);
        raw_spin_lock_irqsave(&desc->lock, flags);
        /* Uninstall? */
        if (handle == handle_bad_irq) {
-                if (desc->chip != &no_irq_chip)
+                if (desc->irq_data.chip != &no_irq_chip)
-                        mask_ack_irq(desc, irq);
+                        mask_ack_irq(desc);
                desc->status |= IRQ_DISABLED;
                desc->depth = 1;
        }
@@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
                desc->status &= ~IRQ_DISABLED;
                desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
                desc->depth = 0;
-                desc->chip->startup(irq);
+                desc->irq_data.chip->irq_startup(&desc->irq_data);
        }
        raw_spin_unlock_irqrestore(&desc->lock, flags);
-        chip_bus_sync_unlock(irq, desc);
+        chip_bus_sync_unlock(desc);
 }
 EXPORT_SYMBOL_GPL(__set_irq_handler);
@@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
        __set_irq_handler(irq, handle, 0, name);
 }
-void set_irq_noprobe(unsigned int irq)
+void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
-        if (!desc) {
+        if (!desc)
-                printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
                return;
-        }
-        raw_spin_lock_irqsave(&desc->lock, flags);
-        desc->status |= IRQ_NOPROBE;
-        raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-void set_irq_probe(unsigned int irq)
-{
-        struct irq_desc *desc = irq_to_desc(irq);
-        unsigned long flags;
-        if (!desc) {
+        /* Sanitize flags */
-                printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
+        set &= IRQF_MODIFY_MASK;
-                return;
+        clr &= IRQF_MODIFY_MASK;
-        }
        raw_spin_lock_irqsave(&desc->lock, flags);
-        desc->status &= ~IRQ_NOPROBE;
+        desc->status &= ~clr;
+        desc->status |= set;
        raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
new file mode 100644
index 000000000000..20dc5474947e
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the dummy interrupt chip implementation
+ */
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include "internals.h"
+/*
+ * What should we do if we get a hw irq event on an illegal vector?
+ * Each architecture has to answer this for itself.
+ */
+static void ack_bad(struct irq_data *data)
+{
+        struct irq_desc *desc = irq_data_to_desc(data);
+        print_irq_desc(data->irq, desc);
+        ack_bad_irq(data->irq);
+}
+/*
+ * NOP functions
+ */
+static void noop(struct irq_data *data) { }
+static unsigned int noop_ret(struct irq_data *data)
+{
+        return 0;
+}
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
+static void compat_noop(unsigned int irq) { }
+#define END_INIT .end = compat_noop
+#else
+#define END_INIT
+#endif
+/*
+ * Generic no controller implementation
+ */
+struct irq_chip no_irq_chip = {
+        .name           = "none",
+        .irq_startup    = noop_ret,
+        .irq_shutdown   = noop,
+        .irq_enable     = noop,
+        .irq_disable    = noop,
+        .irq_ack        = ack_bad,
+        END_INIT
+};
+/*
+ * Generic dummy implementation which can be used for
+ * real dumb interrupt sources
+ */
+struct irq_chip dummy_irq_chip = {
+        .name           = "dummy",
+        .irq_startup    = noop_ret,
+        .irq_shutdown   = noop,
+        .irq_enable     = noop,
+        .irq_disable    = noop,
+        .irq_ack        = noop,
+        .irq_mask       = noop,
+        .irq_unmask     = noop,
+        END_INIT
+};
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27e5c6911223..e2347eb63306 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,24 +11,15 @@
 */
 #include <linux/irq.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/module.h>
 #include <linux/random.h>
+#include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
-#include <linux/rculist.h>
-#include <linux/hash.h>
-#include <linux/radix-tree.h>
 #include <trace/events/irq.h>
 #include "internals.h"
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-struct lock_class_key irq_desc_lock_class;
 /**
 * handle_bad_irq - handle spurious and unhandled irqs
 * @irq:       the interrupt number
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
        ack_bad_irq(irq);
 }
-#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
-static void __init init_irq_default_affinity(void)
-{
-        alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
-        cpumask_setall(irq_default_affinity);
-}
-#else
-static void __init init_irq_default_affinity(void)
-{
-}
-#endif
-/*
- * Linux has a controller-independent interrupt architecture.
- * Every controller has a 'controller-template', that is used
- * by the main code to do the right thing. Each driver-visible
- * interrupt source is transparently wired to the appropriate
- * controller. Thus drivers need not be aware of the
- * interrupt-controller.
- *
- * The code is designed to be easily extended with new/different
- * interrupt controllers, without having to do assembly magic or
- * having to touch the generic code.
- *
- * Controller mappings for all interrupt sources:
- */
-int nr_irqs = NR_IRQS;
-EXPORT_SYMBOL_GPL(nr_irqs);
-#ifdef CONFIG_SPARSE_IRQ
-static struct irq_desc irq_desc_init = {
-        .irq        = -1,
-        .status     = IRQ_DISABLED,
-        .chip       = &no_irq_chip,
-        .handle_irq = handle_bad_irq,
-        .depth      = 1,
-        .lock       = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-};
-void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
-{
-        void *ptr;
-        ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
-                           GFP_ATOMIC, node);
-        /*
-         * don't overwite if can not get new one
-         * init_copy_kstat_irqs() could still use old one
-         */
-        if (ptr) {
-                printk(KERN_DEBUG "  alloc kstat_irqs on node %d\n", node);
-                desc->kstat_irqs = ptr;
-        }
-}
-static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
-{
-        memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
-        raw_spin_lock_init(&desc->lock);
-        desc->irq = irq;
-#ifdef CONFIG_SMP
-        desc->node = node;
-#endif
-        lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-        init_kstat_irqs(desc, node, nr_cpu_ids);
-        if (!desc->kstat_irqs) {
-                printk(KERN_ERR "can not alloc kstat_irqs\n");
-                BUG_ON(1);
-        }
-        if (!alloc_desc_masks(desc, node, false)) {
-                printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
-                BUG_ON(1);
-        }
-        init_desc_masks(desc);
-        arch_init_chip_data(desc, node);
-}
-/*
- * Protect the sparse_irqs:
- */
-DEFINE_RAW_SPINLOCK(sparse_irq_lock);
-static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
-static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
-{
-        radix_tree_insert(&irq_desc_tree, irq, desc);
-}
-struct irq_desc *irq_to_desc(unsigned int irq)
-{
-        return radix_tree_lookup(&irq_desc_tree, irq);
-}
-void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
-{
-        void **ptr;
-        ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
-        if (ptr)
-                radix_tree_replace_slot(ptr, desc);
-}
-static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
-        [0 ... NR_IRQS_LEGACY-1] = {
-                .irq        = -1,
-                .status     = IRQ_DISABLED,
-                .chip       = &no_irq_chip,
-                .handle_irq = handle_bad_irq,
-                .depth      = 1,
-                .lock       = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-        }
-};
-static unsigned int *kstat_irqs_legacy;
-int __init early_irq_init(void)
-{
-        struct irq_desc *desc;
-        int legacy_count;
-        int node;
-        int i;
-        init_irq_default_affinity();
-         /* initialize nr_irqs based on nr_cpu_ids */
-        arch_probe_nr_irqs();
-        printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
-        desc = irq_desc_legacy;
-        legacy_count = ARRAY_SIZE(irq_desc_legacy);
-        node = first_online_node;
-        /* allocate based on nr_cpu_ids */
-        kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
-                                          sizeof(int), GFP_NOWAIT, node);
-        for (i = 0; i < legacy_count; i++) {
-                desc[i].irq = i;
-#ifdef CONFIG_SMP
-                desc[i].node = node;
-#endif
-                desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
-                lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-                alloc_desc_masks(&desc[i], node, true);
-                init_desc_masks(&desc[i]);
-                set_irq_desc(i, &desc[i]);
-        }
-        return arch_early_irq_init();
-}
-struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
-{
-        struct irq_desc *desc;
-        unsigned long flags;
-        if (irq >= nr_irqs) {
-                WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
-                        irq, nr_irqs);
-                return NULL;
-        }
-        desc = irq_to_desc(irq);
-        if (desc)
-                return desc;
-        raw_spin_lock_irqsave(&sparse_irq_lock, flags);
-        /* We have to check it to avoid races with another CPU */
-        desc = irq_to_desc(irq);
-        if (desc)
-                goto out_unlock;
-        desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-        printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
-        if (!desc) {
-                printk(KERN_ERR "can not alloc irq_desc\n");
-                BUG_ON(1);
-        }
-        init_one_irq_desc(irq, desc, node);
-        set_irq_desc(irq, desc);
-out_unlock:
-        raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
-        return desc;
-}
-#else /* !CONFIG_SPARSE_IRQ */
-struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
-        [0 ... NR_IRQS-1] = {
-                .status = IRQ_DISABLED,
-                .chip = &no_irq_chip,
-                .handle_irq = handle_bad_irq,
-                .depth = 1,
-                .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
-        }
-};
-static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
-int __init early_irq_init(void)
-{
-        struct irq_desc *desc;
-        int count;
-        int i;
-        init_irq_default_affinity();
-        printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
-        desc = irq_desc;
-        count = ARRAY_SIZE(irq_desc);
-        for (i = 0; i < count; i++) {
-                desc[i].irq = i;
-                alloc_desc_masks(&desc[i], 0, true);
-                init_desc_masks(&desc[i]);
-                desc[i].kstat_irqs = kstat_irqs_all[i];
-        }
-        return arch_early_irq_init();
-}
-struct irq_desc *irq_to_desc(unsigned int irq)
-{
-        return (irq < NR_IRQS) ? irq_desc + irq : NULL;
-}
-struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
-{
-        return irq_to_desc(irq);
-}
-#endif /* !CONFIG_SPARSE_IRQ */
-void clear_kstat_irqs(struct irq_desc *desc)
-{
-        memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
-}
-/*
- * What should we do if we get a hw irq event on an illegal vector?
- * Each architecture has to answer this themself.
- */
-static void ack_bad(unsigned int irq)
-{
-        struct irq_desc *desc = irq_to_desc(irq);
-        print_irq_desc(irq, desc);
-        ack_bad_irq(irq);
-}
-/*
- * NOP functions
- */
-static void noop(unsigned int irq)
-{
-}
-static unsigned int noop_ret(unsigned int irq)
-{
-        return 0;
-}
-/*
- * Generic no controller implementation
- */
-struct irq_chip no_irq_chip = {
-        .name           = "none",
-        .startup        = noop_ret,
-        .shutdown       = noop,
-        .enable         = noop,
-        .disable        = noop,
-        .ack            = ack_bad,
-        .end            = noop,
-};
-/*
- * Generic dummy implementation which can be used for
- * real dumb interrupt sources
- */
-struct irq_chip dummy_irq_chip = {
-        .name           = "dummy",
-        .startup        = noop_ret,
-        .shutdown       = noop,
-        .enable         = noop,
-        .disable        = noop,
-        .ack            = noop,
-        .mask           = noop,
-        .unmask         = noop,
-        .end            = noop,
-};
 /*
 * Special, empty irq handler:
 */
@@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq)
                /*
                 * No locking required for CPU-local interrupts:
                 */
-                if (desc->chip->ack)
+                if (desc->irq_data.chip->ack)
-                        desc->chip->ack(irq);
+                        desc->irq_data.chip->ack(irq);
                if (likely(!(desc->status & IRQ_DISABLED))) {
                        action_ret = handle_IRQ_event(irq, desc->action);
                        if (!noirqdebug)
                                note_interrupt(irq, desc, action_ret);
                }
-                desc->chip->end(irq);
+                desc->irq_data.chip->end(irq);
                return 1;
        }
        raw_spin_lock(&desc->lock);
-        if (desc->chip->ack)
+        if (desc->irq_data.chip->ack)
-                desc->chip->ack(irq);
+                desc->irq_data.chip->ack(irq);
        /*
         * REPLAY is when Linux resends an IRQ that was dropped earlier
         * WAITING is used by probe to mark irqs that are being tested
@@ -530,27 +223,9 @@ out:
         * The ->end() handler has to deal with interrupts which got
         * disabled while the handler was running.
         */
-        desc->chip->end(irq);
+        desc->irq_data.chip->end(irq);
        raw_spin_unlock(&desc->lock);
        return 1;
 }
 #endif
-void early_init_irq_lock_class(void)
-{
-        struct irq_desc *desc;
-        int i;
-        for_each_irq_desc(i, desc) {
-                lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-        }
-}
-unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
-{
-        struct irq_desc *desc = irq_to_desc(irq);
-        return desc ? desc->kstat_irqs[cpu] : 0;
-}
-EXPORT_SYMBOL(kstat_irqs_cpu);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c63f3bc88f0b..4571ae7e085a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,9 +1,12 @@
 /*
 * IRQ subsystem internal functions and variables:
 */
+#include <linux/irqdesc.h>
 extern int noirqdebug;
+#define irq_data_to_desc(data)  container_of(data, struct irq_desc, irq_data)
 /* Set default functions for irq_chip structures: */
 extern void irq_chip_set_defaults(struct irq_chip *chip);
@@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
 extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
-extern struct lock_class_key irq_desc_lock_class;
 extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-extern void clear_kstat_irqs(struct irq_desc *desc);
-extern raw_spinlock_t sparse_irq_lock;
-#ifdef CONFIG_SPARSE_IRQ
+/* Resending of interrupts :*/
-void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
+void check_irq_resend(struct irq_desc *desc, unsigned int irq);
-#endif
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
+extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
 extern void register_handler_proc(unsigned int irq, struct irqaction *action);
 extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
 #else
 static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
+static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
 static inline void register_handler_proc(unsigned int irq,
                                         struct irqaction *action) { }
 static inline void unregister_handler_proc(unsigned int irq,
@@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq);
 extern void irq_set_thread_affinity(struct irq_desc *desc);
+/*
+ * Call the chip's old-style ->end() callback when the descriptor has a
+ * chip and that chip implements it. With
+ * CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED set this is an empty stub.
+ */
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
+static inline void irq_end(unsigned int irq, struct irq_desc *desc)
+{
+        struct irq_chip *chip = desc->irq_data.chip;
+        if (!chip || !chip->end)
+                return;
+        chip->end(irq);
+}
+#else
+static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
+#endif
 /* Inline functions for support of irq chips on slow busses */
-static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
+static inline void chip_bus_lock(struct irq_desc *desc)
 {
-        if (unlikely(desc->chip->bus_lock))
+        if (unlikely(desc->irq_data.chip->irq_bus_lock))
-                desc->chip->bus_lock(irq);
+                desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
 }
-static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
+static inline void chip_bus_sync_unlock(struct irq_desc *desc)
 {
-        if (unlikely(desc->chip->bus_sync_unlock))
+        if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
-                desc->chip->bus_sync_unlock(irq);
+                desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
 }
 /*
@@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
                irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
        printk("->handle_irq():  %p, ", desc->handle_irq);
        print_symbol("%s\n", (unsigned long)desc->handle_irq);
-        printk("->chip(): %p, ", desc->chip);
+        printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
-        print_symbol("%s\n", (unsigned long)desc->chip);
+        print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
        printk("->action(): %p\n", desc->action);
        if (desc->action) {
                printk("->action->handler(): %p, ", desc->action->handler);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
new file mode 100644
index 000000000000..9d917ff72675
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the interrupt descriptor management code
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
+ */
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/radix-tree.h>
+#include <linux/bitmap.h>
+#include "internals.h"
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+/*
+ * Boot-time setup of irq_default_affinity. With CONFIG_SMP and
+ * CONFIG_GENERIC_HARDIRQS the mask is allocated and all bits are set;
+ * in every other configuration this is an empty stub.
+ * NOTE(review): the result of alloc_cpumask_var() is not checked before
+ * the mask is written - confirm this cannot fail this early in boot.
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+static void __init init_irq_default_affinity(void)
+{
+        alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+        cpumask_setall(irq_default_affinity);
+}
+#else
+static void __init init_irq_default_affinity(void)
+{
+}
+#endif
+#ifdef CONFIG_SMP
+/*
+ * Allocate the zeroed cpumasks of @desc on @node: the affinity mask and,
+ * with CONFIG_GENERIC_PENDING_IRQ, the pending mask.
+ * Returns 0 on success or -ENOMEM, in which case nothing stays allocated.
+ */
+static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
+{
+        struct irq_data *data = &desc->irq_data;
+        if (!zalloc_cpumask_var_node(&data->affinity, gfp, node))
+                return -ENOMEM;
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+        if (zalloc_cpumask_var_node(&desc->pending_mask, gfp, node))
+                return 0;
+        /* Second allocation failed: do not leak the first one. */
+        free_cpumask_var(data->affinity);
+        return -ENOMEM;
+#else
+        return 0;
+#endif
+}
+/*
+ * Reset the SMP-specific parts of @desc: remember @node, copy
+ * irq_default_affinity into the affinity mask and, with
+ * CONFIG_GENERIC_PENDING_IRQ, clear the pending mask.
+ */
+static void desc_smp_init(struct irq_desc *desc, int node)
+{
+        struct irq_data *data = &desc->irq_data;
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+        cpumask_clear(desc->pending_mask);
+#endif
+        cpumask_copy(data->affinity, irq_default_affinity);
+        data->node = node;
+}
+/* Return the node stored in @desc by desc_smp_init(). */
+static inline int desc_node(struct irq_desc *desc)
+{
+        return desc->irq_data.node;
+}
+#else
+/*
+ * !CONFIG_SMP stubs: no masks to allocate or initialize, and the node
+ * is always reported as 0.
+ */
+static inline int
+alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
+static inline void desc_smp_init(struct irq_desc *desc, int node) { }
+static inline int desc_node(struct irq_desc *desc) { return 0; }
+#endif
+/*
+ * Bring @desc back to its default state for interrupt number @irq:
+ * no_irq_chip as chip, handle_bad_irq as flow handler, depth 1, cleared
+ * counters and per-cpu statistics, plus the SMP defaults for @node.
+ * desc->kstat_irqs must already point to valid storage.
+ */
+static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
+{
+        struct irq_data *data = &desc->irq_data;
+        /* Chip-facing data */
+        data->irq = irq;
+        data->chip = &no_irq_chip;
+        data->chip_data = NULL;
+        data->handler_data = NULL;
+        data->msi_desc = NULL;
+        /* Descriptor state and accounting */
+        desc->status = IRQ_DEFAULT_INIT_FLAGS;
+        desc->handle_irq = handle_bad_irq;
+        desc->depth = 1;
+        desc->irq_count = 0;
+        desc->irqs_unhandled = 0;
+        desc->name = NULL;
+        memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(desc->kstat_irqs[0]));
+        desc_smp_init(desc, node);
+}
+/* Upper bound for irq numbers used in this file; defaults to NR_IRQS. */
+int nr_irqs = NR_IRQS;
+EXPORT_SYMBOL_GPL(nr_irqs);
+/*
+ * sparse_irq_lock is taken around updates of allocated_irqs and, with
+ * CONFIG_SPARSE_IRQ, around insertion into and removal from the
+ * descriptor tree.
+ */
+static DEFINE_MUTEX(sparse_irq_lock);
+/* One bit per irq number; a set bit marks the number as allocated/reserved. */
+static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
+#ifdef CONFIG_SPARSE_IRQ
+/* Sparse irqs: descriptors live in a radix tree indexed by irq number. */
+static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
+/*
+ * Publish @desc under @irq in the tree.
+ * NOTE(review): the return value of radix_tree_insert() is discarded,
+ * so a failed insertion goes unnoticed here.
+ */
+static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
+{
+        radix_tree_insert(&irq_desc_tree, irq, desc);
+}
+/* Look up the descriptor of @irq; yields whatever the tree lookup returns. */
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+        return radix_tree_lookup(&irq_desc_tree, irq);
+}
+/* Remove the tree entry of @irq; the descriptor itself is not freed here. */
+static void delete_irq_desc(unsigned int irq)
+{
+        radix_tree_delete(&irq_desc_tree, irq);
+}
+#ifdef CONFIG_SMP
+/* Release the cpumasks that alloc_masks() set up for @desc. */
+static void free_masks(struct irq_desc *desc)
+{
+        free_cpumask_var(desc->irq_data.affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+        free_cpumask_var(desc->pending_mask);
+#endif
+}
+#else
+static inline void free_masks(struct irq_desc *desc) { }
+#endif
+/*
+ * Allocate a descriptor for @irq on @node together with its per-cpu
+ * statistics array and cpumasks, and initialize it to the defaults.
+ * Returns the new descriptor, or NULL if any allocation fails (in which
+ * case everything allocated so far is released again).
+ */
+static struct irq_desc *alloc_desc(int irq, int node)
+{
+        const gfp_t gfp = GFP_KERNEL;
+        struct irq_desc *desc = kzalloc_node(sizeof(*desc), gfp, node);
+        if (!desc)
+                return NULL;
+        /* allocate based on nr_cpu_ids */
+        desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
+                                         gfp, node);
+        if (desc->kstat_irqs && !alloc_masks(desc, gfp, node)) {
+                raw_spin_lock_init(&desc->lock);
+                lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+                desc_set_defaults(irq, desc, node);
+                return desc;
+        }
+        /* kfree(NULL) is a no-op, so one unwind path serves both failures. */
+        kfree(desc->kstat_irqs);
+        kfree(desc);
+        return NULL;
+}
+/*
+ * Tear down the descriptor of @irq: drop its proc entry, remove it from
+ * the descriptor tree under sparse_irq_lock, then free its cpumasks,
+ * statistics array and the descriptor itself.
+ *
+ * An irq number can be marked in allocated_irqs without a descriptor
+ * ever being inserted (irq_reserve_irqs() only sets bits), and
+ * irq_free_descs() passes on whatever range it is given, so the lookup
+ * may come back empty. There is nothing to free in that case.
+ */
+static void free_desc(unsigned int irq)
+{
+        struct irq_desc *desc = irq_to_desc(irq);
+        if (!desc)
+                return;
+        unregister_irq_proc(irq, desc);
+        mutex_lock(&sparse_irq_lock);
+        delete_irq_desc(irq);
+        mutex_unlock(&sparse_irq_lock);
+        free_masks(desc);
+        kfree(desc->kstat_irqs);
+        kfree(desc);
+}
+/*
+ * Allocate and publish descriptors for the @cnt irqs starting at @start.
+ * Returns @start on success. If an allocation fails, the descriptors
+ * created so far are freed again, the range is cleared in allocated_irqs
+ * and -ENOMEM is returned.
+ */
+static int alloc_descs(unsigned int start, unsigned int cnt, int node)
+{
+        struct irq_desc *desc;
+        int done;
+        for (done = 0; done < cnt; done++) {
+                desc = alloc_desc(start + done, node);
+                if (!desc)
+                        break;
+                mutex_lock(&sparse_irq_lock);
+                irq_insert_desc(start + done, desc);
+                mutex_unlock(&sparse_irq_lock);
+        }
+        if (done == cnt)
+                return start;
+        /* Roll back what was set up, most recent descriptor first. */
+        while (done-- > 0)
+                free_desc(start + done);
+        mutex_lock(&sparse_irq_lock);
+        bitmap_clear(allocated_irqs, start, cnt);
+        mutex_unlock(&sparse_irq_lock);
+        return -ENOMEM;
+}
+/*
+ * Return the descriptor of @irq, allocating it on @node first if needed.
+ * Both a fresh allocation and -EEXIST from irq_alloc_descs() end in a
+ * lookup; any other result yields NULL.
+ */
+struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
+{
+        int res = irq_alloc_descs(irq, irq, 1, node);
+        if (res != -EEXIST && res != irq)
+                return NULL;
+        return irq_to_desc(irq);
+}
+/*
+ * Boot-time setup for sparse irqs: initialize the default affinity, ask
+ * the architecture how many descriptors to preallocate, create those
+ * descriptors on the first online node and hand over to
+ * arch_early_irq_init(), whose result is returned.
+ *
+ * NOTE(review): the result of alloc_desc() is not checked, so a failed
+ * allocation would mark the irq allocated and insert a NULL descriptor.
+ * NOTE(review): the bitmap and tree are updated without sparse_irq_lock
+ * here - presumably safe because nothing else runs yet; confirm.
+ */
+int __init early_irq_init(void)
+{
+        int i, initcnt, node = first_online_node;
+        struct irq_desc *desc;
+        init_irq_default_affinity();
+        /* Let arch update nr_irqs and return the nr of preallocated irqs */
+        initcnt = arch_probe_nr_irqs();
+        printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
+        for (i = 0; i < initcnt; i++) {
+                desc = alloc_desc(i, node);
+                set_bit(i, allocated_irqs);
+                irq_insert_desc(i, desc);
+        }
+        return arch_early_irq_init();
+}
+#else /* !CONFIG_SPARSE_IRQ */
+/*
+ * !CONFIG_SPARSE_IRQ: all NR_IRQS descriptors are statically allocated.
+ * The static initializer covers status, flow handler, depth and lock;
+ * irq number, chip, statistics and masks are filled in by
+ * early_irq_init().
+ */
+struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
+        [0 ... NR_IRQS-1] = {
+                .status         = IRQ_DEFAULT_INIT_FLAGS,
+                .handle_irq     = handle_bad_irq,
+                .depth          = 1,
+                .lock           = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
+        }
+};
+/* Static backing store for desc->kstat_irqs: one row per irq. */
+static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
+/*
+ * Boot-time setup for the static descriptor array: initialize the
+ * default affinity, then give every descriptor its irq number,
+ * no_irq_chip, its row of kstat_irqs_all, its cpumasks and the common
+ * lock class. Returns the result of arch_early_irq_init().
+ *
+ * NOTE(review): the return value of alloc_masks() is ignored here.
+ */
+int __init early_irq_init(void)
+{
+        int count, i, node = first_online_node;
+        struct irq_desc *desc;
+        init_irq_default_affinity();
+        printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+        desc = irq_desc;
+        count = ARRAY_SIZE(irq_desc);
+        for (i = 0; i < count; i++) {
+                desc[i].irq_data.irq = i;
+                desc[i].irq_data.chip = &no_irq_chip;
+                desc[i].kstat_irqs = kstat_irqs_all[i];
+                alloc_masks(desc + i, GFP_KERNEL, node);
+                desc_smp_init(desc + i, node);
+                lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+        }
+        return arch_early_irq_init();
+}
+/* Map @irq to its slot in the static array; NULL if @irq >= NR_IRQS. */
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+        return (irq < NR_IRQS) ? irq_desc + irq : NULL;
+}
+/* Nothing to allocate with a static array: plain lookup, @node is unused. */
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
+{
+        return irq_to_desc(irq);
+}
+/* "Freeing" a static descriptor just resets it to its defaults. */
+static void free_desc(unsigned int irq)
+{
+        dynamic_irq_cleanup(irq);
+}
+/* Descriptors already exist, so this only hands @start back to the caller. */
+static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
+{
+        return start;
+}
+#endif /* !CONFIG_SPARSE_IRQ */
+/* Dynamic interrupt handling */
+/**
+ * irq_free_descs - free irq descriptors
+ * @from:       Start of descriptor range
+ * @cnt:        Number of consecutive irqs to free
+ *
+ * Frees each descriptor in the range and then clears the range in the
+ * allocation bitmap. A range that does not lie entirely below nr_irqs
+ * is ignored.
+ */
+void irq_free_descs(unsigned int from, unsigned int cnt)
+{
+        int i;
+        /*
+         * Check @cnt against the room left above @from instead of
+         * computing from + cnt, which can wrap around and pass the test.
+         */
+        if (from >= nr_irqs || cnt > nr_irqs - from)
+                return;
+        for (i = 0; i < cnt; i++)
+                free_desc(from + i);
+        mutex_lock(&sparse_irq_lock);
+        bitmap_clear(allocated_irqs, from, cnt);
+        mutex_unlock(&sparse_irq_lock);
+}
+/**
+ * irq_alloc_descs - allocate and initialize a range of irq descriptors
+ * @irq:        Allocate for specific irq number if irq >= 0
+ * @from:       Start the search from this irq number
+ * @cnt:        Number of consecutive irqs to allocate.
+ * @node:       Preferred node on which the irq descriptor should be allocated
+ *
+ * Returns the first irq number or error code: -EINVAL if @cnt is 0,
+ * -EEXIST if a specific @irq was requested but the free range found does
+ * not start there, -ENOMEM if no free range fits below nr_irqs or the
+ * descriptors cannot be allocated.
+ */
+int __ref
+irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
+{
+        int start, ret;
+        if (!cnt)
+                return -EINVAL;
+        /*
+         * When a specific irq is requested, start looking at that number.
+         * Searching from a lower @from would stop at the first free area
+         * below @irq and report -EEXIST although @irq itself is free.
+         */
+        if (irq >= 0 && from < irq)
+                from = irq;
+        mutex_lock(&sparse_irq_lock);
+        start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
+        ret = -EEXIST;
+        if (irq >= 0 && start != irq)
+                goto err;
+        ret = -ENOMEM;
+        if (start >= nr_irqs)
+                goto err;
+        /* Claim the range before dropping the lock; descriptors follow. */
+        bitmap_set(allocated_irqs, start, cnt);
+        mutex_unlock(&sparse_irq_lock);
+        return alloc_descs(start, cnt, node);
+err:
+        mutex_unlock(&sparse_irq_lock);
+        return ret;
+}
+/**
+ * irq_reserve_irqs - mark irqs allocated
+ * @from:       mark from irq number
+ * @cnt:        number of irqs to mark
+ *
+ * Only the allocation bitmap is updated; no descriptors are created.
+ *
+ * Returns 0 on success or an appropriate error code: -EINVAL if @cnt is
+ * 0 or the range does not fit below nr_irqs, -EEXIST if part of the
+ * range is already marked.
+ */
+int irq_reserve_irqs(unsigned int from, unsigned int cnt)
+{
+        unsigned int start;
+        int ret = 0;
+        /*
+         * Check @cnt against the room left above @from instead of
+         * computing from + cnt, which can wrap around and pass the test.
+         */
+        if (!cnt || from >= nr_irqs || cnt > nr_irqs - from)
+                return -EINVAL;
+        mutex_lock(&sparse_irq_lock);
+        start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
+        /* The range is free only if the first fitting area starts at @from. */
+        if (start == from)
+                bitmap_set(allocated_irqs, start, cnt);
+        else
+                ret = -EEXIST;
+        mutex_unlock(&sparse_irq_lock);
+        return ret;
+}
+/**
+ * irq_get_next_irq - get next allocated irq number
+ * @offset:     where to start the search
+ *
+ * Returns the first allocated irq number at or after @offset (the scan
+ * done by find_next_bit() includes @offset itself), or nr_irqs if none
+ * is found. The bitmap is read without taking sparse_irq_lock.
+ */
+unsigned int irq_get_next_irq(unsigned int offset)
+{
+        return find_next_bit(allocated_irqs, nr_irqs, offset);
+}
+/**
+ * dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ * @irq:        irq number to clean up
+ *
+ * Resets the descriptor of @irq to its default state while holding
+ * desc->lock, keeping the node the descriptor currently reports.
+ * Does nothing if @irq has no descriptor.
+ */
+void dynamic_irq_cleanup(unsigned int irq)
+{
+        struct irq_desc *desc = irq_to_desc(irq);
+        unsigned long flags;
+        /* irq_to_desc() can come back empty; do not touch a NULL lock. */
+        if (!desc)
+                return;
+        raw_spin_lock_irqsave(&desc->lock, flags);
+        desc_set_defaults(irq, desc, desc_node(desc));
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+/* Return the kstat_irqs entry of @irq for @cpu, or 0 without a descriptor. */
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+        struct irq_desc *desc = irq_to_desc(irq);
+        if (!desc)
+                return 0;
+        return desc->kstat_irqs[cpu];
+}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c3003e9d91a3..644e8d5fa367 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
 {
        struct irq_desc *desc = irq_to_desc(irq);
-        if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
+        if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
-            !desc->chip->set_affinity)
+            !desc->irq_data.chip->irq_set_affinity)
                return 0;
        return 1;
@@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc)
 int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
+        struct irq_chip *chip = desc->irq_data.chip;
        unsigned long flags;
-        if (!desc->chip->set_affinity)
+        if (!chip->irq_set_affinity)
                return -EINVAL;
        raw_spin_lock_irqsave(&desc->lock, flags);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        if (desc->status & IRQ_MOVE_PCNTXT) {
-                if (!desc->chip->set_affinity(irq, cpumask)) {
+                if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
-                        cpumask_copy(desc->affinity, cpumask);
+                        cpumask_copy(desc->irq_data.affinity, cpumask);
                        irq_set_thread_affinity(desc);
                }
        }
@@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
                cpumask_copy(desc->pending_mask, cpumask);
        }
 #else
-        if (!desc->chip->set_affinity(irq, cpumask)) {
+        if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
-                cpumask_copy(desc->affinity, cpumask);
+                cpumask_copy(desc->irq_data.affinity, cpumask);
                irq_set_thread_affinity(desc);
        }
 #endif
@@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
         * one of the targets is online.
         */
        if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-                if (cpumask_any_and(desc->affinity, cpu_online_mask)
+                if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
                    < nr_cpu_ids)
                        goto set_affinity;
                else
                        desc->status &= ~IRQ_AFFINITY_SET;
        }
-        cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
+        cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
 set_affinity:
-        desc->chip->set_affinity(irq, desc->affinity);
+        desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
        return 0;
 }
@@ -223,7 +224,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
        if (!desc->depth++) {
                desc->status |= IRQ_DISABLED;
-                desc->chip->disable(irq);
+                desc->irq_data.chip->irq_disable(&desc->irq_data);
        }
 }
@@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq)
        if (!desc)
                return;
-        chip_bus_lock(irq, desc);
+        chip_bus_lock(desc);
        raw_spin_lock_irqsave(&desc->lock, flags);
        __disable_irq(desc, irq, false);
        raw_spin_unlock_irqrestore(&desc->lock, flags);
-        chip_bus_sync_unlock(irq, desc);
+        chip_bus_sync_unlock(desc);
 }
 EXPORT_SYMBOL(disable_irq_nosync);
@@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 *      IRQ line is re-enabled.
 *
 *      This function may be called from IRQ context only when
- *      desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
+ *      desc->irq_data.chip->irq_bus_lock and desc->irq_data.chip->irq_bus_sync_unlock are NULL !
 */
 void enable_irq(unsigned int irq)
 {
@@ -323,11 +324,11 @@ void enable_irq(unsigned int irq)
        if (!desc)
                return;
-        chip_bus_lock(irq, desc);
+        chip_bus_lock(desc);
        raw_spin_lock_irqsave(&desc->lock, flags);
        __enable_irq(desc, irq, false);
        raw_spin_unlock_irqrestore(&desc->lock, flags);
-        chip_bus_sync_unlock(irq, desc);
+        chip_bus_sync_unlock(desc);
 }
 EXPORT_SYMBOL(enable_irq);
@@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
        struct irq_desc *desc = irq_to_desc(irq);
        int ret = -ENXIO;
-        if (desc->chip->set_wake)
+        if (desc->irq_data.chip->irq_set_wake)
-                ret = desc->chip->set_wake(irq, on);
+                ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
        return ret;
 }
@@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
 }
 int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
-                unsigned long flags)
+                      unsigned long flags)
 {
        int ret;
-        struct irq_chip *chip = desc->chip;
+        struct irq_chip *chip = desc->irq_data.chip;
-        if (!chip || !chip->set_type) {
+        if (!chip || !chip->irq_set_type) {
                /*
                 * IRQF_TRIGGER_* but the PIC does not support multiple
                 * flow-types?
@@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
        }
        /* caller masked out all except trigger mode flags */
-        ret = chip->set_type(irq, flags);
+        ret = chip->irq_set_type(&desc->irq_data, flags);
        if (ret)
-                pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
+                pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
-                                (int)flags, irq, chip->set_type);
+                       flags, irq, chip->irq_set_type);
        else {
                if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
                        flags |= IRQ_LEVEL;
@@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
                desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
                desc->status |= flags;
-                if (chip != desc->chip)
+                if (chip != desc->irq_data.chip)
-                        irq_chip_set_defaults(desc->chip);
+                        irq_chip_set_defaults(desc->irq_data.chip);
        }
        return ret;
@@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
 static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
 {
 again:
-        chip_bus_lock(irq, desc);
+        chip_bus_lock(desc);
        raw_spin_lock_irq(&desc->lock);
        /*
@@ -521,17 +522,17 @@ again:
         */
        if (unlikely(desc->status & IRQ_INPROGRESS)) {
                raw_spin_unlock_irq(&desc->lock);
-                chip_bus_sync_unlock(irq, desc);
+                chip_bus_sync_unlock(desc);
                cpu_relax();
                goto again;
        }
        if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
                desc->status &= ~IRQ_MASKED;
-                desc->chip->unmask(irq);
+                desc->irq_data.chip->irq_unmask(&desc->irq_data);
        }
        raw_spin_unlock_irq(&desc->lock);
-        chip_bus_sync_unlock(irq, desc);
+        chip_bus_sync_unlock(desc);
 }
 #ifdef CONFIG_SMP
@@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
        }
        raw_spin_lock_irq(&desc->lock);
-        cpumask_copy(mask, desc->affinity);
+        cpumask_copy(mask, desc->irq_data.affinity);
        raw_spin_unlock_irq(&desc->lock);
        set_cpus_allowed_ptr(current, mask);
@@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
        if (!desc)
                return -EINVAL;
-        if (desc->chip == &no_irq_chip)
+        if (desc->irq_data.chip == &no_irq_chip)
                return -ENOSYS;
        /*
         * Some drivers like serial.c use request_irq() heavily,
@@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
        }
        if (!shared) {
-                irq_chip_set_defaults(desc->chip);
+                irq_chip_set_defaults(desc->irq_data.chip);
                init_waitqueue_head(&desc->wait_for_threads);
@@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
                if (!(desc->status & IRQ_NOAUTOEN)) {
                        desc->depth = 0;
                        desc->status &= ~IRQ_DISABLED;
-                        desc->chip->startup(irq);
+                        desc->irq_data.chip->irq_startup(&desc->irq_data);
                } else
                        /* Undo nested disables: */
                        desc->depth = 1;
@@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
        /* Currently used only by UML, might disappear one day: */
 #ifdef CONFIG_IRQ_RELEASE_METHOD
-        if (desc->chip->release)
+        if (desc->irq_data.chip->release)
-                desc->chip->release(irq, dev_id);
+                desc->irq_data.chip->release(irq, dev_id);
 #endif
        /* If this was the last handler, shut down the IRQ line: */
        if (!desc->action) {
                desc->status |= IRQ_DISABLED;
-                if (desc->chip->shutdown)
+                if (desc->irq_data.chip->irq_shutdown)
-                        desc->chip->shutdown(irq);
+                        desc->irq_data.chip->irq_shutdown(&desc->irq_data);
                else
-                        desc->chip->disable(irq);
+                        desc->irq_data.chip->irq_disable(&desc->irq_data);
        }
 #ifdef CONFIG_SMP
@@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id)
        if (!desc)
                return;
-        chip_bus_lock(irq, desc);
+        chip_bus_lock(desc);
        kfree(__free_irq(irq, dev_id));
-        chip_bus_sync_unlock(irq, desc);
+        chip_bus_sync_unlock(desc);
 }
 EXPORT_SYMBOL(free_irq);
@@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
        action->name = devname;
        action->dev_id = dev_id;
-        chip_bus_lock(irq, desc);
+        chip_bus_lock(desc);
        retval = __setup_irq(irq, desc, action);
-        chip_bus_sync_unlock(irq, desc);
+        chip_bus_sync_unlock(desc);
        if (retval)
                kfree(action);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 241962280836..1d2541940480 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,6 +7,7 @@
 void move_masked_irq(int irq)
 {
        struct irq_desc *desc = irq_to_desc(irq);
+        struct irq_chip *chip = desc->irq_data.chip;
        if (likely(!(desc->status & IRQ_MOVE_PENDING)))
                return;
@@ -24,7 +25,7 @@ void move_masked_irq(int irq)
        if (unlikely(cpumask_empty(desc->pending_mask)))
                return;
-        if (!desc->chip->set_affinity)
+        if (!chip->irq_set_affinity)
                return;
        assert_raw_spin_locked(&desc->lock);
@@ -43,8 +44,9 @@ void move_masked_irq(int irq)
         */
        if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
                   < nr_cpu_ids))
-                if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
+                if (!chip->irq_set_affinity(&desc->irq_data,
-                        cpumask_copy(desc->affinity, desc->pending_mask);
+                                            desc->pending_mask, false)) {
+                        cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
                        irq_set_thread_affinity(desc);
                }
@@ -61,8 +63,8 @@ void move_native_irq(int irq)
        if (unlikely(desc->status & IRQ_DISABLED))
                return;
-        desc->chip->mask(irq);
+        desc->irq_data.chip->irq_mask(&desc->irq_data);
        move_masked_irq(irq);
-        desc->chip->unmask(irq);
+        desc->irq_data.chip->irq_unmask(&desc->irq_data);
 }
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
deleted file mode 100644
index 65d3845665ac..000000000000
--- a/kernel/irq/numa_migrate.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * NUMA irq-desc migration code
- *
- * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
- * the new "home node" of the IRQ.
- */
-#include <linux/irq.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/interrupt.h>
-#include <linux/kernel_stat.h>
-#include "internals.h"
-static void init_copy_kstat_irqs(struct irq_desc *old_desc,
-                                 struct irq_desc *desc,
-                                 int node, int nr)
-{
-        init_kstat_irqs(desc, node, nr);
-        if (desc->kstat_irqs != old_desc->kstat_irqs)
-                memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
-                         nr * sizeof(*desc->kstat_irqs));
-}
-static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
-{
-        if (old_desc->kstat_irqs == desc->kstat_irqs)
-                return;
-        kfree(old_desc->kstat_irqs);
-        old_desc->kstat_irqs = NULL;
-}
-static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
-                 struct irq_desc *desc, int node)
-{
-        memcpy(desc, old_desc, sizeof(struct irq_desc));
-        if (!alloc_desc_masks(desc, node, false)) {
-                printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
-                                "for migration.\n", irq);
-                return false;
-        }
-        raw_spin_lock_init(&desc->lock);
-        desc->node = node;
-        lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-        init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
-        init_copy_desc_masks(old_desc, desc);
-        arch_init_copy_chip_data(old_desc, desc, node);
-        return true;
-}
-static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
-{
-        free_kstat_irqs(old_desc, desc);
-        free_desc_masks(old_desc, desc);
-        arch_free_chip_data(old_desc, desc);
-}
-static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
-                                                int node)
-{
-        struct irq_desc *desc;
-        unsigned int irq;
-        unsigned long flags;
-        irq = old_desc->irq;
-        raw_spin_lock_irqsave(&sparse_irq_lock, flags);
-        /* We have to check it to avoid races with another CPU */
-        desc = irq_to_desc(irq);
-        if (desc && old_desc != desc)
-                goto out_unlock;
-        desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-        if (!desc) {
-                printk(KERN_ERR "irq %d: can not get new irq_desc "
-                                "for migration.\n", irq);
-                /* still use old one */
-                desc = old_desc;
-                goto out_unlock;
-        }
-        if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
-                /* still use old one */
-                kfree(desc);
-                desc = old_desc;
-                goto out_unlock;
-        }
-        replace_irq_desc(irq, desc);
-        raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
-        /* free the old one */
-        free_one_irq_desc(old_desc, desc);
-        kfree(old_desc);
-        return desc;
-out_unlock:
-        raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
-        return desc;
-}
-struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
-{
-        /* those static or target node is -1, do not move them */
-        if (desc->irq < NR_IRQS_LEGACY || node == -1)
-                return desc;
-        if (desc->node != node)
-                desc = __real_move_irq_desc(desc, node);
-        return desc;
-}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd2..01b1d3a88983 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir;
 static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
        struct irq_desc *desc = irq_to_desc((long)m->private);
-        const struct cpumask *mask = desc->affinity;
+        const struct cpumask *mask = desc->irq_data.affinity;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        if (desc->status & IRQ_MOVE_PENDING)
@@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
        cpumask_var_t new_value;
        int err;
-        if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
+        if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
            irq_balancing_disabled(irq))
                return -EIO;
@@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
 {
        struct irq_desc *desc = irq_to_desc((long) m->private);
-        seq_printf(m, "%d\n", desc->node);
+        seq_printf(m, "%d\n", desc->irq_data.node);
        return 0;
 }
@@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 {
        char name [MAX_NAMELEN];
-        if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
+        if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
                return;
        memset(name, 0, MAX_NAMELEN);
@@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
                         &irq_spurious_proc_fops, (void *)(long)irq);
 }
+/*
+ * Remove the proc entries belonging to @irq: the per-irq files below
+ * desc->dir first, then the directory named after the irq number from
+ * root_irq_dir. Returns without doing anything when root_irq_dir or
+ * desc->dir is not set.
+ *
+ * NOTE(review): desc->dir is not cleared here - confirm that the caller
+ * does not reuse @desc afterwards.
+ */
+void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
+{
+        char name [MAX_NAMELEN];
+        if (!root_irq_dir || !desc->dir)
+                return;
+#ifdef CONFIG_SMP
+        remove_proc_entry("smp_affinity", desc->dir);
+        remove_proc_entry("affinity_hint", desc->dir);
+        remove_proc_entry("node", desc->dir);
+#endif
+        remove_proc_entry("spurious", desc->dir);
+        memset(name, 0, MAX_NAMELEN);
+        sprintf(name, "%u", irq);
+        remove_proc_entry(name, root_irq_dir);
+}
 #undef MAX_NAMELEN
 void unregister_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 090c3763f3a2..891115a929aa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
        /*
         * Make sure the interrupt is enabled, before resending it:
         */
-        desc->chip->enable(irq);
+        desc->irq_data.chip->irq_enable(&desc->irq_data);
        /*
         * We do not resend level type interrupts. Level type
@@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
        if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
                desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
-                if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
+                if (!desc->irq_data.chip->irq_retrigger ||
+                    !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
 #ifdef CONFIG_HARDIRQS_SW_RESEND
                        /* Set it pending and activate the softirq: */
                        set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 89fb90ae534f..3089d3b9d5f3 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -14,6 +14,8 @@
 #include <linux/moduleparam.h>
 #include <linux/timer.h>
+#include "internals.h"
 static int irqfixup __read_mostly;
 #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
@@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc)
         * If we did actual work for the real IRQ line we must let the
         * IRQ controller clean up too
         */
-        if (work && desc->chip && desc->chip->end)
+        if (work)
-                desc->chip->end(irq);
+                irq_end(irq, desc);
        raw_spin_unlock(&desc->lock);
        return ok;
@@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
                printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
                desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
                desc->depth++;
-                desc->chip->disable(irq);
+                desc->irq_data.chip->irq_disable(&desc->irq_data);
                mod_timer(&poll_spurious_irq_timer,
                          jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 000000000000..f16763ff8481
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Provides a framework for enqueueing and running callbacks from hardirq
+ * context. The enqueueing is NMI-safe.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+/*
+ * An entry can be in one of four states:
+ *
+ * free      NULL, 0 -> {claimed}       : free to be used
+ * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
+ * pending   next, 3 -> {busy}          : queued, pending callback
+ * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
+ *
+ * We use the lower two bits of the next pointer to keep PENDING and BUSY
+ * flags.
+ */
+#define IRQ_WORK_PENDING        1UL
+#define IRQ_WORK_BUSY           2UL
+#define IRQ_WORK_FLAGS          3UL
+/* Test whether any bit of @flags is set in the low bits of @entry->next. */
+static inline bool irq_work_is_set(struct irq_work *entry, int flags)
+{
+        unsigned long word = (unsigned long)entry->next;
+        return word & flags;
+}
+/* Return @entry->next with the IRQ_WORK_FLAGS bits masked off. */
+static inline struct irq_work *irq_work_next(struct irq_work *entry)
+{
+        unsigned long word = (unsigned long)entry->next;
+        return (struct irq_work *)(word & ~IRQ_WORK_FLAGS);
+}
+/* Return the pointer value of @entry with @flags or-ed into it. */
+static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
+{
+        unsigned long word = (unsigned long)entry;
+        return (struct irq_work *)(word | flags);
+}
+static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+/*
+ * Claim the entry so that no one else will poke at it.
+ *
+ * Returns false when the PENDING bit is already set in @entry->next.
+ * Otherwise both flag bits are set with cmpxchg(), retrying until the
+ * exchange succeeds, and true is returned.
+ */
+static bool irq_work_claim(struct irq_work *entry)
+{
+        struct irq_work *next, *nflags;
+        do {
+                next = entry->next;
+                if ((unsigned long)next & IRQ_WORK_PENDING)
+                        return false;
+                nflags = next_flags(next, IRQ_WORK_FLAGS);
+        } while (cmpxchg(&entry->next, next, nflags) != next);
+        return true;
+}
+void __weak arch_irq_work_raise(void)
+{
+        /*
+         * Lame architectures will get the timer tick callback
+         *
+         * This weak default does nothing. It is called by
+         * __irq_work_queue() when an entry is queued onto an empty list.
+         */
+}
+/*
+ * Queue the entry and raise the IPI if needed.
+ *
+ * Pushes @entry onto the head of this CPU's irq_work_list with a
+ * cmpxchg() loop. arch_irq_work_raise() is called only when the entry
+ * ended up without a successor, i.e. the list was empty before the push.
+ */
+static void __irq_work_queue(struct irq_work *entry)
+{
+        struct irq_work **head, *next;
+        head = &get_cpu_var(irq_work_list);
+        do {
+                next = *head;
+                /* Can assign non-atomic because we keep the flags set. */
+                entry->next = next_flags(next, IRQ_WORK_FLAGS);
+        } while (cmpxchg(head, next, entry) != next);
+        /* The list was empty, raise self-interrupt to start processing. */
+        if (!irq_work_next(entry))
+                arch_irq_work_raise();
+        put_cpu_var(irq_work_list);
+}
+/*
+ * Try to enqueue the irq_work @entry on the current CPU's list.
+ *
+ * Returns true when this call claimed and queued the entry, false when
+ * the entry was already enqueued by someone else.
+ *
+ * An entry may be enqueued again while its callback is still in progress.
+ */
+bool irq_work_queue(struct irq_work *entry)
+{
+        bool claimed = irq_work_claim(entry);
+        /* Only the caller that won the claim may put the entry on the list. */
+        if (claimed)
+                __irq_work_queue(entry);
+        return claimed;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue);
+/*
+ * Run the irq_work entries on this cpu. Requires to be ran from hardirq
+ * context with local IRQs disabled.
+ *
+ * The whole per-cpu list is detached with xchg() and then walked. The
+ * context checks are BUG_ON()s and are only reached when the list is
+ * non-empty.
+ */
+void irq_work_run(void)
+{
+        struct irq_work *list, **head;
+        head = &__get_cpu_var(irq_work_list);
+        if (*head == NULL)
+                return;
+        BUG_ON(!in_irq());
+        BUG_ON(!irqs_disabled());
+        list = xchg(head, NULL);
+        while (list != NULL) {
+                struct irq_work *entry = list;
+                list = irq_work_next(list);
+                /*
+                 * Clear the PENDING bit, after this point the @entry
+                 * can be re-used.
+                 */
+                entry->next = next_flags(NULL, IRQ_WORK_BUSY);
+                entry->func(entry);
+                /*
+                 * Clear the BUSY bit and return to the free state if
+                 * no-one else claimed it meanwhile.
+                 */
+                cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
+        }
+}
+EXPORT_SYMBOL_GPL(irq_work_run);
+/*
+ * Synchronize against the irq_work @entry, ensures the entry is not
+ * currently in use.
+ *
+ * Busy-waits with cpu_relax() until the BUSY bit of @entry is clear.
+ * Warns once when called with interrupts disabled.
+ */
+void irq_work_sync(struct irq_work *entry)
+{
+        WARN_ON_ONCE(irqs_disabled());
+        while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+                cpu_relax();
+}
+EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000000..7be868bf25c6
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,429 @@
+/*
+ * jump label support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/err.h>
+#ifdef HAVE_JUMP_LABEL
+#define JUMP_LABEL_HASH_BITS 6
+#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
+static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
+/* mutex to protect coming/going of the jump_label table */
+static DEFINE_MUTEX(jump_label_mutex);
+/* Hash-table node describing the jump entries that share one key. */
+struct jump_label_entry {
+        struct hlist_node hlist;        /* link in a jump_label_table bucket */
+        struct jump_entry *table;       /* first jump_entry recorded for @key */
+        int nr_entries;                 /* number of entries at @table */
+        /* hang modules off here */
+        struct hlist_head modules;
+        unsigned long key;              /* key used for lookup */
+};
+/* A module's jump entries for one key, linked on jump_label_entry.modules. */
+struct jump_label_module_entry {
+        struct hlist_node hlist;        /* link in jump_label_entry.modules */
+        struct jump_entry *table;       /* first jump_entry of the module's run */
+        int nr_entries;                 /* number of entries at @table */
+        struct module *mod;             /* module the entries belong to */
+};
+/* sort() comparator: orders jump entries by ascending key. */
+static int jump_label_cmp(const void *a, const void *b)
+{
+        const struct jump_entry *lhs = a;
+        const struct jump_entry *rhs = b;
+        if (lhs->key == rhs->key)
+                return 0;
+        return lhs->key < rhs->key ? -1 : 1;
+}
+/* Sort the jump entries in [@start, @stop) by key, in place. */
+static void
+sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
+{
+        unsigned long bytes = (unsigned long)stop - (unsigned long)start;
+        unsigned long nr = bytes / sizeof(struct jump_entry);
+        sort(start, nr, sizeof(struct jump_entry), jump_label_cmp, NULL);
+}
+/* Look up the hash-table node for @key; returns NULL when there is none. */
+static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
+{
+        u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
+        struct hlist_head *bucket;
+        struct hlist_node *pos;
+        struct jump_label_entry *cur;
+        bucket = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
+        hlist_for_each_entry(cur, pos, bucket, hlist)
+                if (cur->key == key)
+                        return cur;
+        return NULL;
+}
+/*
+ * Allocate a hash-table node for @key and insert it into its bucket.
+ *
+ * Returns the new node, ERR_PTR(-EEXIST) when @key already has a node,
+ * or ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+static struct jump_label_entry *
+add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
+{
+        struct jump_label_entry *entry;
+        struct hlist_head *bucket;
+        u32 hash;
+        if (get_jump_label_entry(key))
+                return ERR_PTR(-EEXIST);
+        entry = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
+        if (!entry)
+                return ERR_PTR(-ENOMEM);
+        entry->key = key;
+        entry->table = table;
+        entry->nr_entries = nr_entries;
+        INIT_HLIST_HEAD(&(entry->modules));
+        hash = jhash((void *)&key, sizeof(jump_label_t), 0);
+        bucket = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
+        hlist_add_head(&entry->hlist, bucket);
+        return entry;
+}
+/*
+ * Sort the jump entries in [@start, @stop) and add one hash-table node per
+ * run of entries sharing a key.
+ *
+ * Returns 0 on success, the error from add_jump_label_entry(), or -1
+ * (with a one-time warning) when a key already has a node.
+ */
+static int
+build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
+{
+        struct jump_entry *iter, *first;
+        struct jump_label_entry *entry;
+        int count;
+        sort_jump_label_entries(start, stop);
+        for (iter = start; iter < stop; ) {
+                if (get_jump_label_entry(iter->key)) {
+                        WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
+                        return -1;
+                }
+                /* advance @iter past the run of entries sharing this key */
+                first = iter;
+                count = 0;
+                while ((iter < stop) && (iter->key == first->key)) {
+                        iter++;
+                        count++;
+                }
+                entry = add_jump_label_entry(first->key, count, first);
+                if (IS_ERR(entry))
+                        return PTR_ERR(entry);
+        }
+        return 0;
+}
+/***
+ * jump_label_update - update jump label text
+ * @key -  key value associated with a jump label
+ * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
+ *
+ * Will enable/disable the jump for jump label @key, depending on the
+ * value of @type.
+ *
+ * Holds jump_label_mutex for the duration of the update. Entries whose
+ * code address fails kernel_text_address() are skipped.
+ */
+void jump_label_update(unsigned long key, enum jump_label_type type)
+{
+        struct jump_entry *iter;
+        struct jump_label_entry *entry;
+        struct hlist_node *module_node;
+        struct jump_label_module_entry *e_module;
+        int count;
+        mutex_lock(&jump_label_mutex);
+        entry = get_jump_label_entry((jump_label_t)key);
+        if (entry) {
+                count = entry->nr_entries;
+                iter = entry->table;
+                while (count--) {
+                        if (kernel_text_address(iter->code))
+                                arch_jump_label_transform(iter, type);
+                        iter++;
+                }
+                /* enable/disable jump labels in modules */
+                hlist_for_each_entry(e_module, module_node, &(entry->modules),
+                                                        hlist) {
+                        count = e_module->nr_entries;
+                        iter = e_module->table;
+                        while (count--) {
+                                if (kernel_text_address(iter->code))
+                                        arch_jump_label_transform(iter, type);
+                                iter++;
+                        }
+                }
+        }
+        mutex_unlock(&jump_label_mutex);
+}
+/*
+ * Return 1 when the JUMP_LABEL_NOP_SIZE bytes starting at @entry->code
+ * overlap the address range from @start to @end, 0 otherwise.
+ */
+static int addr_conflict(struct jump_entry *entry, void *start, void *end)
+{
+        if (entry->code > (unsigned long)end)
+                return 0;
+        if (entry->code + JUMP_LABEL_NOP_SIZE <= (unsigned long)start)
+                return 0;
+        return 1;
+}
+#ifdef CONFIG_MODULES
+/*
+ * Walk every module jump entry hanging off the hash table and return 1 as
+ * soon as one of them overlaps the range from @start to @end; return 0
+ * when none does.
+ */
+static int module_conflict(void *start, void *end)
+{
+        struct hlist_node *node, *node_next, *mod_node, *mod_node_next;
+        struct jump_label_module_entry *e_module;
+        struct jump_label_entry *e;
+        struct jump_entry *iter;
+        int bucket, left;
+        for (bucket = 0; bucket < JUMP_LABEL_TABLE_SIZE; bucket++) {
+                hlist_for_each_entry_safe(e, node, node_next,
+                                          &jump_label_table[bucket], hlist) {
+                        hlist_for_each_entry_safe(e_module, mod_node,
+                                                  mod_node_next,
+                                                  &(e->modules), hlist) {
+                                iter = e_module->table;
+                                for (left = e_module->nr_entries; left;
+                                     left--, iter++)
+                                        if (addr_conflict(iter, start, end))
+                                                return 1;
+                        }
+                }
+        }
+        return 0;
+}
+#endif
+/***
+ * jump_label_text_reserved - check if addr range is reserved
+ * @start: start text addr
+ * @end: end text addr
+ *
+ * checks if the text addr located between @start and @end
+ * overlaps with any of the jump label patch addresses. Code
+ * that wants to modify kernel text should first verify that
+ * it does not overlap with any of the jump label addresses.
+ *
+ * The core kernel table (__start___jump_table to __stop___jump_table)
+ * is scanned first; module tables are only checked when that scan finds
+ * no overlap. jump_label_mutex is held for the whole check.
+ *
+ * returns 1 if there is an overlap, 0 otherwise
+ */
+int jump_label_text_reserved(void *start, void *end)
+{
+        struct jump_entry *iter;
+        struct jump_entry *iter_start = __start___jump_table;
+        /* Must be the table end: stopping at the start skips every entry. */
+        struct jump_entry *iter_stop = __stop___jump_table;
+        int conflict = 0;
+        mutex_lock(&jump_label_mutex);
+        iter = iter_start;
+        while (iter < iter_stop) {
+                if (addr_conflict(iter, start, end)) {
+                        conflict = 1;
+                        goto out;
+                }
+                iter++;
+        }
+        /* now check modules */
+#ifdef CONFIG_MODULES
+        conflict = module_conflict(start, end);
+#endif
+out:
+        mutex_unlock(&jump_label_mutex);
+        return conflict;
+}
+/*
+ * Early init: build the hash table from the core kernel jump table and
+ * run arch_jump_label_text_poke_early() on every entry's code address.
+ * Returns the result of build_jump_label_hashtable().
+ */
+static __init int init_jump_label(void)
+{
+        struct jump_entry *first = __start___jump_table;
+        struct jump_entry *last = __stop___jump_table;
+        struct jump_entry *cur;
+        int ret;
+        mutex_lock(&jump_label_mutex);
+        ret = build_jump_label_hashtable(__start___jump_table,
+                                         __stop___jump_table);
+        /* The entries are poked regardless of @ret. */
+        for (cur = first; cur < last; cur++)
+                arch_jump_label_text_poke_early(cur->code);
+        mutex_unlock(&jump_label_mutex);
+        return ret;
+}
+early_initcall(init_jump_label);
+#ifdef CONFIG_MODULES
+/*
+ * Allocate a module entry describing @count jump entries starting at
+ * @iter_begin for @mod and link it onto @entry->modules.
+ *
+ * Returns the new module entry or ERR_PTR(-ENOMEM).
+ */
+static struct jump_label_module_entry *
+add_jump_label_module_entry(struct jump_label_entry *entry,
+                            struct jump_entry *iter_begin,
+                            int count, struct module *mod)
+{
+        struct jump_label_module_entry *mod_entry;
+        mod_entry = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
+        if (!mod_entry)
+                return ERR_PTR(-ENOMEM);
+        mod_entry->table = iter_begin;
+        mod_entry->nr_entries = count;
+        mod_entry->mod = mod;
+        hlist_add_head(&mod_entry->hlist, &entry->modules);
+        return mod_entry;
+}
+/*
+ * Register the jump entries of @mod: sort them by key and, for each run
+ * of entries sharing a key, hang a module entry off that key's hash-table
+ * node (creating the node with no entries of its own when it is missing).
+ *
+ * Returns 0 on success or the error of the failing allocation helper.
+ */
+static int add_jump_label_module(struct module *mod)
+{
+        struct jump_entry *iter, *first, *end;
+        struct jump_label_entry *entry;
+        struct jump_label_module_entry *module_entry;
+        int count;
+        /* nothing to register for a module without jump label entries */
+        if (!mod->num_jump_entries)
+                return 0;
+        end = mod->jump_entries + mod->num_jump_entries;
+        sort_jump_label_entries(mod->jump_entries, end);
+        for (iter = mod->jump_entries; iter < end; ) {
+                entry = get_jump_label_entry(iter->key);
+                /* advance @iter past the run of entries sharing this key */
+                first = iter;
+                count = 0;
+                while ((iter < end) && (iter->key == first->key)) {
+                        iter++;
+                        count++;
+                }
+                if (!entry) {
+                        entry = add_jump_label_entry(first->key, 0, NULL);
+                        if (IS_ERR(entry))
+                                return PTR_ERR(entry);
+                }
+                module_entry = add_jump_label_module_entry(entry, first,
+                                                           count, mod);
+                if (IS_ERR(module_entry))
+                        return PTR_ERR(module_entry);
+        }
+        return 0;
+}
+/*
+ * Drop every module entry that belongs to @mod and free hash-table nodes
+ * that are left with no module entries and no entries of their own.
+ */
+static void remove_jump_label_module(struct module *mod)
+{
+        struct hlist_node *node, *node_next, *mod_node, *mod_node_next;
+        struct jump_label_module_entry *e_module;
+        struct jump_label_entry *e;
+        struct hlist_head *bucket;
+        int i;
+        /* a module without jump label entries never registered anything */
+        if (!mod->num_jump_entries)
+                return;
+        for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
+                bucket = &jump_label_table[i];
+                hlist_for_each_entry_safe(e, node, node_next, bucket, hlist) {
+                        hlist_for_each_entry_safe(e_module, mod_node,
+                                                  mod_node_next,
+                                                  &(e->modules), hlist) {
+                                if (e_module->mod != mod)
+                                        continue;
+                                hlist_del(&e_module->hlist);
+                                kfree(e_module);
+                        }
+                        /* keep nodes that still describe something */
+                        if (!hlist_empty(&e->modules) || (e->nr_entries != 0))
+                                continue;
+                        hlist_del(&e->hlist);
+                        kfree(e);
+                }
+        }
+}
+/*
+ * Module notifier callback. On MODULE_STATE_COMING the module's jump
+ * entries are added (and removed again when adding fails); on
+ * MODULE_STATE_GOING they are removed. Other states are ignored.
+ *
+ * Returns the result of add_jump_label_module() for a coming module and
+ * 0 otherwise.
+ */
+static int
+jump_label_module_notify(struct notifier_block *self, unsigned long val,
+                         void *data)
+{
+        struct module *mod = data;
+        int ret = 0;
+        if (val != MODULE_STATE_COMING && val != MODULE_STATE_GOING)
+                return ret;
+        mutex_lock(&jump_label_mutex);
+        if (val == MODULE_STATE_COMING)
+                ret = add_jump_label_module(mod);
+        /* going module, or a failed add: drop the module's entries */
+        if (val == MODULE_STATE_GOING || ret)
+                remove_jump_label_module(mod);
+        mutex_unlock(&jump_label_mutex);
+        return ret;
+}
+/***
+ * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop()
+ * @mod: module to patch
+ *
+ * Allow for run-time selection of the optimal nops. Before the module
+ * loads patch these with arch_get_jump_label_nop(), which is specified by
+ * the arch specific jump label code.
+ *
+ * Calls arch_jump_label_text_poke_early() on the code address of every
+ * jump entry of @mod.
+ */
+void jump_label_apply_nops(struct module *mod)
+{
+        struct jump_entry *iter;
+        /* if the module doesn't have jump label entries, just return */
+        if (!mod->num_jump_entries)
+                return;
+        iter = mod->jump_entries;
+        while (iter < mod->jump_entries + mod->num_jump_entries) {
+                arch_jump_label_text_poke_early(iter->code);
+                iter++;
+        }
+}
+/* Notifier block that routes module state changes to jump_label_module_notify(). */
+struct notifier_block jump_label_module_nb = {
+        .notifier_call = jump_label_module_notify,
+        .priority = 0,
+};
+/* Register jump_label_module_nb as a module notifier; returns the registration result. */
+static __init int init_jump_label_module(void)
+{
+        return register_module_notifier(&jump_label_module_nb);
+}
+early_initcall(init_jump_label_module);
+#endif /* CONFIG_MODULES */
+#endif
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..7c44133f51ec 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
 #include <linux/memory.h>
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
+#include <linux/jump_label.h>
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
@@ -73,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 /* NOTE: change this value only with kprobe_mutex held */
 static bool kprobes_all_disarmed;
-static DEFINE_MUTEX(kprobe_mutex);      /* Protects kprobe_table */
+/* This protects kprobe_table and optimizing_list */
+static DEFINE_MUTEX(kprobe_mutex);
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 static struct {
        spinlock_t lock ____cacheline_aligned_in_smp;
@@ -399,7 +401,7 @@ static inline int kprobe_optready(struct kprobe *p)
 * Return an optimized kprobe whose optimizing code replaces
 * instructions including addr (exclude breakpoint).
 */
-struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
 {
        int i;
        struct kprobe *p = NULL;
@@ -594,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
 }
 #ifdef CONFIG_SYSCTL
+/* This should be called with kprobe_mutex locked */
 static void __kprobes optimize_all_kprobes(void)
 {
        struct hlist_head *head;
@@ -606,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
                return;
        kprobes_allow_optimization = true;
-        mutex_lock(&text_mutex);
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry_rcu(p, node, head, hlist)
                        if (!kprobe_disabled(p))
                                optimize_kprobe(p);
        }
-        mutex_unlock(&text_mutex);
        printk(KERN_INFO "Kprobes globally optimized\n");
 }
+/* This should be called with kprobe_mutex locked */
 static void __kprobes unoptimize_all_kprobes(void)
 {
        struct hlist_head *head;
@@ -831,6 +833,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
 void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
                         struct hlist_head **head, unsigned long *flags)
+__acquires(hlist_lock)
 {
        unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
        spinlock_t *hlist_lock;
@@ -842,6 +845,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
 static void __kprobes kretprobe_table_lock(unsigned long hash,
        unsigned long *flags)
+__acquires(hlist_lock)
 {
        spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
        spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +853,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
 void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
        unsigned long *flags)
+__releases(hlist_lock)
 {
        unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
        spinlock_t *hlist_lock;
@@ -857,7 +862,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
        spin_unlock_irqrestore(hlist_lock, *flags);
 }
-void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+static void __kprobes kretprobe_table_unlock(unsigned long hash,
+       unsigned long *flags)
+__releases(hlist_lock)
 {
        spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
        spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1141,7 +1148,8 @@ int __kprobes register_kprobe(struct kprobe *p)
        preempt_disable();
        if (!kernel_text_address((unsigned long) p->addr) ||
            in_kprobes_functions((unsigned long) p->addr) ||
-            ftrace_text_reserved(p->addr, p->addr)) {
+            ftrace_text_reserved(p->addr, p->addr) ||
+            jump_label_text_reserved(p->addr, p->addr)) {
                preempt_enable();
                return -EINVAL;
        }
@@ -1339,18 +1347,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
        if (num <= 0)
                return -EINVAL;
        for (i = 0; i < num; i++) {
-                unsigned long addr;
+                unsigned long addr, offset;
                jp = jps[i];
                addr = arch_deref_entry_point(jp->entry);
-                if (!kernel_text_address(addr))
+                /* Verify probepoint is a function entry point */
-                        ret = -EINVAL;
+                if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
-                else {
+                    offset == 0) {
-                        /* Todo: Verify probepoint is a function entry point */
                        jp->kp.pre_handler = setjmp_pre_handler;
                        jp->kp.break_handler = longjmp_break_handler;
                        ret = register_kprobe(&jp->kp);
-                }
+                } else
+                        ret = -EINVAL;
                if (ret < 0) {
                        if (i > 0)
                                unregister_jprobes(jps, i);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f2852a510232..42ba65dff7d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
        }
 #endif
+        if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+                debug_locks_off();
+                printk(KERN_ERR
+                        "BUG: looking up invalid subclass: %u\n", subclass);
+                printk(KERN_ERR
+                        "turning off the locking correctness validator.\n");
+                dump_stack();
+                return NULL;
+        }
        /*
         * Static locks do not have their class-keys yet - for them the key
         * is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
        raw_local_irq_restore(flags);
        if (!subclass || force)
-                lock->class_cache = class;
+                lock->class_cache[0] = class;
+        else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+                lock->class_cache[subclass] = class;
        if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
                return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 void lockdep_init_map(struct lockdep_map *lock, const char *name,
                      struct lock_class_key *key, int subclass)
 {
-        lock->class_cache = NULL;
+        int i;
+        for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
+                lock->class_cache[i] = NULL;
 #ifdef CONFIG_LOCK_STAT
        lock->cpu = raw_smp_processor_id();
 #endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
                return 0;
-        if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
-                debug_locks_off();
-                printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
-                printk("turning off the locking correctness validator.\n");
-                dump_stack();
-                return 0;
-        }
        if (lock->key == &__lockdep_no_validate__)
                check = 1;
-        if (!subclass)
+        if (subclass < NR_LOCKDEP_CACHING_CLASSES)
-                class = lock->class_cache;
+                class = lock->class_cache[subclass];
        /*
-         * Not cached yet or subclass?
+         * Not cached?
         */
        if (unlikely(!class)) {
                class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
                return 1;
        if (hlock->references) {
-                struct lock_class *class = lock->class_cache;
+                struct lock_class *class = lock->class_cache[0];
                if (!class)
                        class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
                if (list_empty(head))
                        continue;
                list_for_each_entry_safe(class, next, head, hash_entry) {
-                        if (unlikely(class == lock->class_cache)) {
+                        int match = 0;
+                        for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+                                match |= class == lock->class_cache[j];
+                        if (unlikely(match)) {
                                if (debug_locks_off_graph_unlock())
                                        WARN_ON(1);
                                goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
 * Careful: only use this function if you are sure that
 * the task cannot run in parallel!
 */
-void __debug_show_held_locks(struct task_struct *task)
+void debug_show_held_locks(struct task_struct *task)
 {
        if (unlikely(!debug_locks)) {
                printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
        }
        lockdep_print_held_locks(task);
 }
-EXPORT_SYMBOL_GPL(__debug_show_held_locks);
-void debug_show_held_locks(struct task_struct *task)
-{
-                __debug_show_held_locks(task);
-}
 EXPORT_SYMBOL_GPL(debug_show_held_locks);
 void lockdep_sys_exit(void)
diff --git a/kernel/module.c b/kernel/module.c
index ccd641991842..2df46301a7a4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,7 @@
 #include <linux/async.h>
 #include <linux/percpu.h>
 #include <linux/kmemleak.h>
+#include <linux/jump_label.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
@@ -2309,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
                                        sizeof(*mod->tracepoints),
                                        &mod->num_tracepoints);
 #endif
+#ifdef HAVE_JUMP_LABEL
+        mod->jump_entries = section_objs(info, "__jump_table",
+                                        sizeof(*mod->jump_entries),
+                                        &mod->num_jump_entries);
+#endif
 #ifdef CONFIG_EVENT_TRACING
        mod->trace_events = section_objs(info, "_ftrace_events",
                                         sizeof(*mod->trace_events),
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b98bed3d8182..517d827f4982 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
-#include <linux/hw_breakpoint.h>
 #include <asm/irq_regs.h>
-/*
+atomic_t perf_task_events __read_mostly;
- * Each CPU has a list of per CPU events:
- */
-static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
-int perf_max_events __read_mostly = 1;
-static int perf_reserved_percpu __read_mostly;
-static int perf_overcommit __read_mostly = 1;
-static atomic_t nr_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
 /*
 * perf event paranoia level:
 *  -1 - not paranoid at all
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
 static atomic64_t perf_event_id;
-/*
+void __weak perf_event_print_debug(void)        { }
- * Lock for (sysadmin-configurable) event reservations:
- */
-static DEFINE_SPINLOCK(perf_resource_lock);
-/*
+extern __weak const char *perf_pmu_name(void)
- * Architecture provided APIs - weak aliases:
- */
-extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
 {
-        return NULL;
+        return "pmu";
 }
-void __weak hw_perf_disable(void)               { barrier(); }
+/*
+ * Bump this CPU's disable count for @pmu. Only the transition from zero
+ * (the outermost disable) actually calls pmu->pmu_disable(); nested calls
+ * just increment the counter.
+ */
+void perf_pmu_disable(struct pmu *pmu)
-void __weak hw_perf_enable(void)                { barrier(); }
+{
+        int *count = this_cpu_ptr(pmu->pmu_disable_count);
-void __weak perf_event_print_debug(void)        { }
+        if (!(*count)++)
+                pmu->pmu_disable(pmu);
-static DEFINE_PER_CPU(int, perf_disable_count);
+}
-void perf_disable(void)
+/*
+ * Drop this CPU's disable count for @pmu. pmu->pmu_enable() is called
+ * only when the count falls back to zero, i.e. when the outermost
+ * perf_pmu_disable() is being undone.
+ */
+void perf_pmu_enable(struct pmu *pmu)
 {
-        if (!__get_cpu_var(perf_disable_count)++)
+        int *count = this_cpu_ptr(pmu->pmu_disable_count);
-                hw_perf_disable();
+        if (!--(*count))
+                pmu->pmu_enable(pmu);
 }
-void perf_enable(void)
+/* Per-cpu list head onto which cpu contexts are linked via their rotation_list node. */
+static DEFINE_PER_CPU(struct list_head, rotation_list);
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_pmu_rotate_start(struct pmu *pmu)
 {
-        if (!--__get_cpu_var(perf_disable_count))
+        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-                hw_perf_enable();
+        struct list_head *head = &__get_cpu_var(rotation_list);
+        /* callers are required to have IRQs disabled, see above */
+        WARN_ON(!irqs_disabled());
+        /* link this cpu context in only if its node is not already on a list */
+        if (list_empty(&cpuctx->rotation_list))
+                list_add(&cpuctx->rotation_list, head);
 }
 static void get_ctx(struct perf_event_context *ctx)
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event)
 * the context could get moved to another task.
 */
 static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 {
        struct perf_event_context *ctx;
        rcu_read_lock();
- retry:
+retry:
-        ctx = rcu_dereference(task->perf_event_ctxp);
+        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
                 * can't get swapped on us any more.
                 */
                raw_spin_lock_irqsave(&ctx->lock, *flags);
-                if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
                        goto retry;
                }
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
-static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+static struct perf_event_context *
+perf_pin_task_context(struct task_struct *task, int ctxn)
 {
        struct perf_event_context *ctx;
        unsigned long flags;
-        ctx = perf_lock_task_context(task, &flags);
+        ctx = perf_lock_task_context(task, ctxn, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
        }
        list_add_rcu(&event->event_entry, &ctx->event_list);
+        if (!ctx->nr_events)
+                perf_pmu_rotate_start(ctx->pmu);
        ctx->nr_events++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
 {
        struct perf_event *group_leader = event->group_leader;
-        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
+        /*
+         * We can have double attach due to group movement in perf_event_open.
+         */
+        if (event->attach_state & PERF_ATTACH_GROUP)
+                return;
        event->attach_state |= PERF_ATTACH_GROUP;
        if (group_leader == event)
@@ -436,7 +445,7 @@ event_sched_out(struct perf_event *event,
                event->state = PERF_EVENT_STATE_OFF;
        }
        event->tstamp_stopped = ctx->time;
-        event->pmu->disable(event);
+        event->pmu->del(event, 0);
        event->oncpu = -1;
        if (!is_software_event(event))
@@ -466,6 +475,12 @@ group_sched_out(struct perf_event *group_event,
                cpuctx->exclusive = 0;
 }
+/* Return the current CPU's perf_cpu_context of the pmu that @ctx is attached to. */
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+        struct pmu *pmu = ctx->pmu;
+        return this_cpu_ptr(pmu->pmu_cpu_context);
+}
 /*
 * Cross CPU call to remove a performance event
 *
@@ -474,9 +489,9 @@ group_sched_out(struct perf_event *group_event,
 */
 static void __perf_event_remove_from_context(void *info)
 {
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
+        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        /*
         * If this is a task context, we need to check whether it is
@@ -487,27 +502,11 @@ static void __perf_event_remove_from_context(void *info)
                return;
        raw_spin_lock(&ctx->lock);
-        /*
-         * Protect the list operation against NMI by disabling the
-         * events on a global level.
-         */
-        perf_disable();
        event_sched_out(event, cpuctx, ctx);
        list_del_event(event, ctx);
-        if (!ctx->task) {
-                /*
-                 * Allow more per task events with respect to the
-                 * reservation:
-                 */
-                cpuctx->max_pertask =
-                        min(perf_max_events - ctx->nr_events,
-                            perf_max_events - perf_reserved_percpu);
-        }
-        perf_enable();
        raw_spin_unlock(&ctx->lock);
 }
@@ -572,8 +571,8 @@ retry:
 static void __perf_event_disable(void *info)
 {
        struct perf_event *event = info;
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event_context *ctx = event->ctx;
+        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        /*
         * If this is a per-task event, need to check whether this
@@ -628,7 +627,7 @@ void perf_event_disable(struct perf_event *event)
                return;
        }
- retry:
+retry:
        task_oncpu_function_call(task, __perf_event_disable, event);
        raw_spin_lock_irq(&ctx->lock);
@@ -667,7 +666,7 @@ event_sched_in(struct perf_event *event,
         */
        smp_wmb();
-        if (event->pmu->enable(event)) {
+        if (event->pmu->add(event, PERF_EF_START)) {
                event->state = PERF_EVENT_STATE_INACTIVE;
                event->oncpu = -1;
                return -EAGAIN;
@@ -691,22 +690,17 @@ group_sched_in(struct perf_event *group_event,
               struct perf_event_context *ctx)
 {
        struct perf_event *event, *partial_group = NULL;
-        const struct pmu *pmu = group_event->pmu;
+        struct pmu *pmu = group_event->pmu;
-        bool txn = false;
+        u64 now = ctx->time;
+        bool simulate = false;
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
-        /* Check if group transaction availabe */
+        pmu->start_txn(pmu);
-        if (pmu->start_txn)
-                txn = true;
-        if (txn)
-                pmu->start_txn(pmu);
        if (event_sched_in(group_event, cpuctx, ctx)) {
-                if (txn)
+                pmu->cancel_txn(pmu);
-                        pmu->cancel_txn(pmu);
                return -EAGAIN;
        }
@@ -720,23 +714,38 @@ group_sched_in(struct perf_event *group_event,
                }
        }
-        if (!txn || !pmu->commit_txn(pmu))
+        if (!pmu->commit_txn(pmu))
                return 0;
 group_error:
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
+         * The events up to the failed event are scheduled out normally,
+         * tstamp_stopped will be updated.
+         *
+         * The failed events and the remaining siblings need to have
+         * their timings updated as if they had gone thru event_sched_in()
+         * and event_sched_out(). This is required to get consistent timings
+         * across the group. This also takes care of the case where the group
+         * could never be scheduled by ensuring tstamp_stopped is set to mark
+         * the time the event was actually stopped, such that time delta
+         * calculation in update_event_times() is correct.
         */
        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
                if (event == partial_group)
-                        break;
+                        simulate = true;
-                event_sched_out(event, cpuctx, ctx);
+                if (simulate) {
+                        event->tstamp_running += now - event->tstamp_stopped;
+                        event->tstamp_stopped = now;
+                } else {
+                        event_sched_out(event, cpuctx, ctx);
+                }
        }
        event_sched_out(group_event, cpuctx, ctx);
-        if (txn)
+        pmu->cancel_txn(pmu);
-                pmu->cancel_txn(pmu);
        return -EAGAIN;
 }
@@ -789,10 +798,10 @@ static void add_event_to_ctx(struct perf_event *event,
 */
 static void __perf_install_in_context(void *info)
 {
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *leader = event->group_leader;
+        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        int err;
        /*
@@ -812,12 +821,6 @@ static void __perf_install_in_context(void *info)
        ctx->is_active = 1;
        update_context_time(ctx);
-        /*
-         * Protect the list operation against NMI by disabling the
-         * events on a global level. NOP for non NMI based events.
-         */
-        perf_disable();
        add_event_to_ctx(event, ctx);
        if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -855,12 +858,7 @@ static void __perf_install_in_context(void *info)
                }
        }
-        if (!err && !ctx->task && cpuctx->max_pertask)
+unlock:
-                cpuctx->max_pertask--;
- unlock:
-        perf_enable();
        raw_spin_unlock(&ctx->lock);
 }
@@ -883,6 +881,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 {
        struct task_struct *task = ctx->task;
+        event->ctx = ctx;
        if (!task) {
                /*
                 * Per cpu events are installed via an smp call and
@@ -931,10 +931,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
        event->state = PERF_EVENT_STATE_INACTIVE;
        event->tstamp_enabled = ctx->time - event->total_time_enabled;
-        list_for_each_entry(sub, &event->sibling_list, group_entry)
+        list_for_each_entry(sub, &event->sibling_list, group_entry) {
-                if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+                if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
                        sub->tstamp_enabled =
                                ctx->time - sub->total_time_enabled;
+                }
+        }
 }
 /*
@@ -943,9 +945,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 static void __perf_event_enable(void *info)
 {
        struct perf_event *event = info;
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *leader = event->group_leader;
+        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        int err;
        /*
@@ -979,12 +981,10 @@ static void __perf_event_enable(void *info)
        if (!group_can_go_on(event, cpuctx, 1)) {
                err = -EEXIST;
        } else {
-                perf_disable();
                if (event == leader)
                        err = group_sched_in(event, cpuctx, ctx);
                else
                        err = event_sched_in(event, cpuctx, ctx);
-                perf_enable();
        }
        if (err) {
@@ -1000,7 +1000,7 @@ static void __perf_event_enable(void *info)
                }
        }
- unlock:
+unlock:
        raw_spin_unlock(&ctx->lock);
 }
@@ -1041,7 +1041,7 @@ void perf_event_enable(struct perf_event *event)
        if (event->state == PERF_EVENT_STATE_ERROR)
                event->state = PERF_EVENT_STATE_OFF;
- retry:
+retry:
        raw_spin_unlock_irq(&ctx->lock);
        task_oncpu_function_call(task, __perf_event_enable, event);
@@ -1061,7 +1061,7 @@ void perf_event_enable(struct perf_event *event)
        if (event->state == PERF_EVENT_STATE_OFF)
                __perf_event_mark_enabled(event, ctx);
- out:
+out:
        raw_spin_unlock_irq(&ctx->lock);
 }
@@ -1092,26 +1092,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
        struct perf_event *event;
        raw_spin_lock(&ctx->lock);
+        perf_pmu_disable(ctx->pmu);
        ctx->is_active = 0;
        if (likely(!ctx->nr_events))
                goto out;
        update_context_time(ctx);
-        perf_disable();
        if (!ctx->nr_active)
-                goto out_enable;
+                goto out;
-        if (event_type & EVENT_PINNED)
+        if (event_type & EVENT_PINNED) {
                list_for_each_entry(event, &ctx->pinned_groups, group_entry)
                        group_sched_out(event, cpuctx, ctx);
+        }
-        if (event_type & EVENT_FLEXIBLE)
+        if (event_type & EVENT_FLEXIBLE) {
                list_for_each_entry(event, &ctx->flexible_groups, group_entry)
                        group_sched_out(event, cpuctx, ctx);
+        }
- out_enable:
+out:
-        perf_enable();
+        perf_pmu_enable(ctx->pmu);
- out:
        raw_spin_unlock(&ctx->lock);
 }
@@ -1209,34 +1209,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
        }
 }
-/*
+void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- * Called from scheduler to remove the events of the current task,
+                                  struct task_struct *next)
- * with interrupts disabled.
- *
- * We stop each event and update the event value in event->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * not restart the event.
- */
-void perf_event_task_sched_out(struct task_struct *task,
-                                 struct task_struct *next)
 {
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+        struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
-        struct perf_event_context *ctx = task->perf_event_ctxp;
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent;
+        struct perf_cpu_context *cpuctx;
        int do_switch = 1;
-        perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+        if (likely(!ctx))
+                return;
-        if (likely(!ctx || !cpuctx->task_ctx))
+        cpuctx = __get_cpu_context(ctx);
+        if (!cpuctx->task_ctx)
                return;
        rcu_read_lock();
        parent = rcu_dereference(ctx->parent_ctx);
-        next_ctx = next->perf_event_ctxp;
+        next_ctx = next->perf_event_ctxp[ctxn];
        if (parent && next_ctx &&
            rcu_dereference(next_ctx->parent_ctx) == parent) {
                /*
@@ -1255,8 +1246,8 @@ void perf_event_task_sched_out(struct task_struct *task,
                         * XXX do we need a memory barrier of sorts
                         * wrt to rcu_dereference() of perf_event_ctxp
                         */
-                        task->perf_event_ctxp = next_ctx;
+                        task->perf_event_ctxp[ctxn] = next_ctx;
-                        next->perf_event_ctxp = ctx;
+                        next->perf_event_ctxp[ctxn] = ctx;
                        ctx->task = next;
                        next_ctx->task = task;
                        do_switch = 0;
@@ -1274,10 +1265,35 @@ void perf_event_task_sched_out(struct task_struct *task,
        }
 }
+/*
+ * Loop header that steps @ctxn over every task context index,
+ * from 0 up to (but not including) perf_nr_task_contexts.
+ */
+#define for_each_task_context_nr(ctxn)                                  \
+        for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+/*
+ * Scheduler hook used when @task is switched out in favour of @next;
+ * called with interrupts disabled.
+ *
+ * A PERF_COUNT_SW_CONTEXT_SWITCHES software event is counted first, then
+ * each of the task's context slots is passed to
+ * perf_event_context_sched_out(). We stop each event and update the
+ * event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void __perf_event_task_sched_out(struct task_struct *task,
+                                 struct task_struct *next)
+{
+        int nr;
+        perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+        for_each_task_context_nr(nr) {
+                perf_event_context_sched_out(task, nr, next);
+        }
+}
 static void task_ctx_sched_out(struct perf_event_context *ctx,
                               enum event_type_t event_type)
 {
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        if (!cpuctx->task_ctx)
                return;
@@ -1292,14 +1308,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
 /*
 * Called with IRQs disabled
 */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
-{
-        task_ctx_sched_out(ctx, EVENT_ALL);
-}
-/*
- * Called with IRQs disabled
- */
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
                              enum event_type_t event_type)
 {
@@ -1350,9 +1358,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
                if (event->cpu != -1 && event->cpu != smp_processor_id())
                        continue;
-                if (group_can_go_on(event, cpuctx, can_add_hw))
+                if (group_can_go_on(event, cpuctx, can_add_hw)) {
                        if (group_sched_in(event, cpuctx, ctx))
                                can_add_hw = 0;
+                }
        }
 }
@@ -1368,8 +1377,6 @@ ctx_sched_in(struct perf_event_context *ctx,
        ctx->timestamp = perf_clock();
-        perf_disable();
        /*
         * First go through the list and put on any pinned groups
         * in order to give them the best chance of going on.
@@ -1381,8 +1388,7 @@ ctx_sched_in(struct perf_event_context *ctx,
        if (event_type & EVENT_FLEXIBLE)
                ctx_flexible_sched_in(ctx, cpuctx);
-        perf_enable();
+out:
- out:
        raw_spin_unlock(&ctx->lock);
 }
@@ -1394,43 +1400,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
        ctx_sched_in(ctx, cpuctx, event_type);
 }
-static void task_ctx_sched_in(struct task_struct *task,
+static void task_ctx_sched_in(struct perf_event_context *ctx,
                              enum event_type_t event_type)
 {
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+        struct perf_cpu_context *cpuctx;
-        struct perf_event_context *ctx = task->perf_event_ctxp;
-        if (likely(!ctx))
+        cpuctx = __get_cpu_context(ctx);
-                return;
        if (cpuctx->task_ctx == ctx)
                return;
        ctx_sched_in(ctx, cpuctx, event_type);
        cpuctx->task_ctx = ctx;
 }
-/*
- * Called from scheduler to add the events of the current task
- * with interrupts disabled.
- *
- * We restore the event value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * keep the event running.
- */
-void perf_event_task_sched_in(struct task_struct *task)
-{
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-        struct perf_event_context *ctx = task->perf_event_ctxp;
-        if (likely(!ctx))
+void perf_event_context_sched_in(struct perf_event_context *ctx)
-                return;
+{
+        struct perf_cpu_context *cpuctx;
+        cpuctx = __get_cpu_context(ctx);
        if (cpuctx->task_ctx == ctx)
                return;
-        perf_disable();
+        perf_pmu_disable(ctx->pmu);
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
@@ -1444,7 +1435,37 @@ void perf_event_task_sched_in(struct task_struct *task)
        cpuctx->task_ctx = ctx;
-        perf_enable();
+        /*
+         * Since these rotations are per-cpu, we need to ensure the
+         * cpu-context we got scheduled on is actually rotating.
+         */
+        perf_pmu_rotate_start(ctx->pmu);
+        perf_pmu_enable(ctx->pmu);
+}
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void __perf_event_task_sched_in(struct task_struct *task)
+{
+        int nr;
+
+        /* Visit every per-task context slot and schedule in the used ones. */
+        for_each_task_context_nr(nr) {
+                struct perf_event_context *task_ctx;
+
+                task_ctx = task->perf_event_ctxp[nr];
+                if (unlikely(task_ctx))
+                        perf_event_context_sched_in(task_ctx);
+        }
 }
 #define MAX_INTERRUPTS (~0ULL)
@@ -1524,22 +1545,6 @@ do {					\
        return div64_u64(dividend, divisor);
 }
-static void perf_event_stop(struct perf_event *event)
-{
-        if (!event->pmu->stop)
-                return event->pmu->disable(event);
-        return event->pmu->stop(event);
-}
-static int perf_event_start(struct perf_event *event)
-{
-        if (!event->pmu->start)
-                return event->pmu->enable(event);
-        return event->pmu->start(event);
-}
 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
        struct hw_perf_event *hwc = &event->hw;
@@ -1559,15 +1564,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
        hwc->sample_period = sample_period;
        if (local64_read(&hwc->period_left) > 8*sample_period) {
-                perf_disable();
+                event->pmu->stop(event, PERF_EF_UPDATE);
-                perf_event_stop(event);
                local64_set(&hwc->period_left, 0);
-                perf_event_start(event);
+                event->pmu->start(event, PERF_EF_RELOAD);
-                perf_enable();
        }
 }
-static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
+static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 {
        struct perf_event *event;
        struct hw_perf_event *hwc;
@@ -1592,23 +1595,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
                 */
                if (interrupts == MAX_INTERRUPTS) {
                        perf_log_throttle(event, 1);
-                        perf_disable();
+                        event->pmu->start(event, 0);
-                        event->pmu->unthrottle(event);
-                        perf_enable();
                }
                if (!event->attr.freq || !event->attr.sample_freq)
                        continue;
-                perf_disable();
                event->pmu->read(event);
                now = local64_read(&event->count);
                delta = now - hwc->freq_count_stamp;
                hwc->freq_count_stamp = now;
                if (delta > 0)
-                        perf_adjust_period(event, TICK_NSEC, delta);
+                        perf_adjust_period(event, period, delta);
-                perf_enable();
        }
        raw_spin_unlock(&ctx->lock);
 }
@@ -1626,32 +1625,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
        raw_spin_unlock(&ctx->lock);
 }
-void perf_event_task_tick(struct task_struct *curr)
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
-        struct perf_cpu_context *cpuctx;
+        u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
-        struct perf_event_context *ctx;
+        struct perf_event_context *ctx = NULL;
-        int rotate = 0;
+        int rotate = 0, remove = 1;
-        if (!atomic_read(&nr_events))
-                return;
-        cpuctx = &__get_cpu_var(perf_cpu_context);
+        if (cpuctx->ctx.nr_events) {
-        if (cpuctx->ctx.nr_events &&
+                remove = 0;
-            cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-                rotate = 1;
+                        rotate = 1;
+        }
-        ctx = curr->perf_event_ctxp;
+        ctx = cpuctx->task_ctx;
-        if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
+        if (ctx && ctx->nr_events) {
-                rotate = 1;
+                remove = 0;
+                if (ctx->nr_events != ctx->nr_active)
+                        rotate = 1;
+        }
-        perf_ctx_adjust_freq(&cpuctx->ctx);
+        perf_pmu_disable(cpuctx->ctx.pmu);
+        perf_ctx_adjust_freq(&cpuctx->ctx, interval);
        if (ctx)
-                perf_ctx_adjust_freq(ctx);
+                perf_ctx_adjust_freq(ctx, interval);
        if (!rotate)
-                return;
+                goto done;
-        perf_disable();
        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        if (ctx)
                task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1662,8 +1667,27 @@ void perf_event_task_tick(struct task_struct *curr)
        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
        if (ctx)
-                task_ctx_sched_in(curr, EVENT_FLEXIBLE);
+                task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
-        perf_enable();
+done:
+        if (remove)
+                list_del_init(&cpuctx->rotation_list);
+        perf_pmu_enable(cpuctx->ctx.pmu);
+}
+/*
+ * Tick hook: walk this cpu's rotation list and rotate every cpu context
+ * whose jiffies interval has elapsed.  Expects interrupts to be disabled.
+ */
+void perf_event_task_tick(void)
+{
+        struct perf_cpu_context *pos, *next;
+        struct list_head *rotating = &__get_cpu_var(rotation_list);
+
+        WARN_ON(!irqs_disabled());
+
+        /* _safe variant: perf_rotate_context() may unlink pos from the list. */
+        list_for_each_entry_safe(pos, next, rotating, rotation_list) {
+                if (pos->jiffies_interval != 1 &&
+                                (jiffies % pos->jiffies_interval))
+                        continue;
+                perf_rotate_context(pos);
+        }
 }
 static int event_enable_on_exec(struct perf_event *event,
@@ -1685,20 +1709,18 @@ static int event_enable_on_exec(struct perf_event *event,
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 */
-static void perf_event_enable_on_exec(struct task_struct *task)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 {
-        struct perf_event_context *ctx;
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;
        int ret;
        local_irq_save(flags);
-        ctx = task->perf_event_ctxp;
        if (!ctx || !ctx->nr_events)
                goto out;
-        __perf_event_task_sched_out(ctx);
+        task_ctx_sched_out(ctx, EVENT_ALL);
        raw_spin_lock(&ctx->lock);
@@ -1722,8 +1744,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
        raw_spin_unlock(&ctx->lock);
-        perf_event_task_sched_in(task);
+        perf_event_context_sched_in(ctx);
- out:
+out:
        local_irq_restore(flags);
 }
@@ -1732,9 +1754,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 */
 static void __perf_event_read(void *info)
 {
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
+        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        /*
         * If this is a task context, we need to check whether it is
@@ -1773,7 +1795,13 @@ static u64 perf_event_read(struct perf_event *event)
                unsigned long flags;
                raw_spin_lock_irqsave(&ctx->lock, flags);
-                update_context_time(ctx);
+                /*
+                 * may read while context is not active
+                 * (e.g., thread is blocked), in that case
+                 * we cannot update context time
+                 */
+                if (ctx->is_active)
+                        update_context_time(ctx);
                update_event_times(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
@@ -1782,11 +1810,219 @@ static u64 perf_event_read(struct perf_event *event)
 }
 /*
- * Initialize the perf_event context in a task_struct:
+ * Callchain support
 */
+/*
+ * Table of per-cpu callchain buffers.
+ *
+ * @rcu_head:    used to defer freeing of the table through call_rcu().
+ * @cpu_entries: indexed by cpu id; each slot points to an array of
+ *               PERF_NR_CONTEXTS entries, one per recursion context.
+ */
+struct callchain_cpus_entries {
+        struct rcu_head                 rcu_head;
+        struct perf_callchain_entry     *cpu_entries[0];
+};
+/* Per-cpu recursion counters, one per context level (see get_recursion_context()). */
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+/* Number of users of the callchain buffers; changed under callchain_mutex. */
+static atomic_t nr_callchain_events;
+/* Serializes allocation and release of the callchain buffers. */
+static DEFINE_MUTEX(callchain_mutex);
+/* Published with rcu_assign_pointer(), read with rcu_dereference(). */
+struct callchain_cpus_entries *callchain_cpus_entries;
+/*
+ * Weak default for recording the kernel part of a callchain: stores
+ * nothing.  A non-weak definition elsewhere replaces this one at link time.
+ */
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+                                  struct pt_regs *regs)
+{
+}
+/*
+ * Weak default for recording the user part of a callchain: stores
+ * nothing.  A non-weak definition elsewhere replaces this one at link time.
+ */
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+                                struct pt_regs *regs)
+{
+}
+/*
+ * RCU callback: free every per-cpu buffer of the table embedding @head,
+ * then the table itself.
+ */
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+        struct callchain_cpus_entries *table =
+                container_of(head, struct callchain_cpus_entries, rcu_head);
+        int i;
+
+        for_each_possible_cpu(i) {
+                kfree(table->cpu_entries[i]);
+        }
+
+        kfree(table);
+}
+/*
+ * Unpublish the callchain buffer table and hand it to RCU so that it is
+ * freed by release_callchain_buffers_rcu() after a grace period.
+ */
+static void release_callchain_buffers(void)
+{
+        struct callchain_cpus_entries *entries;
+
+        entries = callchain_cpus_entries;
+        /*
+         * No table is published when alloc_callchain_buffers() failed, yet
+         * get_callchain_buffers() still calls us on that path.  Bail out
+         * instead of passing a NULL-derived rcu_head to call_rcu().
+         */
+        if (!entries)
+                return;
+
+        rcu_assign_pointer(callchain_cpus_entries, NULL);
+        call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+/*
+ * Allocate the table of per-cpu callchain buffers and publish it through
+ * callchain_cpus_entries.
+ *
+ * Returns 0 on success, -ENOMEM if any allocation fails (in which case
+ * everything allocated so far is freed and nothing is published).
+ */
+static int alloc_callchain_buffers(void)
+{
+        int cpu;
+        int size;
+        struct callchain_cpus_entries *entries;
+        /*
+         * We can't use the percpu allocation API for data that can be
+         * accessed from NMI. Use a temporary manual per cpu allocation
+         * until that gets sorted out.
+         */
+        /*
+         * cpu_entries[] is indexed by cpu id, so it must hold nr_cpu_ids
+         * slots: when the possible mask has holes the highest possible
+         * cpu id is larger than num_possible_cpus() - 1.
+         */
+        size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
+                nr_cpu_ids;
+        entries = kzalloc(size, GFP_KERNEL);
+        if (!entries)
+                return -ENOMEM;
+        /* One entry per recursion context, allocated on the cpu's node. */
+        size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+        for_each_possible_cpu(cpu) {
+                entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+                                                         cpu_to_node(cpu));
+                if (!entries->cpu_entries[cpu])
+                        goto fail;
+        }
+        rcu_assign_pointer(callchain_cpus_entries, entries);
+        return 0;
+fail:
+        /* The table was zeroed, so unallocated slots are NULL and safe to kfree(). */
+        for_each_possible_cpu(cpu)
+                kfree(entries->cpu_entries[cpu]);
+        kfree(entries);
+        return -ENOMEM;
+}
+/*
+ * Take a reference on the callchain buffers; the first user allocates
+ * them.  Returns 0 on success, -EINVAL if the user count is inconsistent,
+ * or -ENOMEM if the buffers are not available.
+ */
+static int get_callchain_buffers(void)
+{
+        int users;
+        int ret = 0;
+
+        mutex_lock(&callchain_mutex);
+
+        users = atomic_inc_return(&nr_callchain_events);
+        if (WARN_ON_ONCE(users < 1)) {
+                ret = -EINVAL;
+        } else if (users > 1) {
+                /* If the allocation failed, give up */
+                if (!callchain_cpus_entries)
+                        ret = -ENOMEM;
+        } else {
+                /* First user: set the buffers up now. */
+                ret = alloc_callchain_buffers();
+                if (ret)
+                        release_callchain_buffers();
+        }
+
+        mutex_unlock(&callchain_mutex);
+        return ret;
+}
+/*
+ * Drop a reference on the callchain buffers; the last user releases them
+ * with callchain_mutex held.
+ */
+static void put_callchain_buffers(void)
+{
+        /* Only the final decrement returns with the mutex taken. */
+        if (!atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex))
+                return;
+
+        release_callchain_buffers();
+        mutex_unlock(&callchain_mutex);
+}
+/*
+ * Claim the recursion slot for the current execution context
+ * (3: NMI, 2: hard irq, 1: softirq, 0: anything else).
+ *
+ * Returns the slot index, or -1 if that slot is already claimed.
+ */
+static int get_recursion_context(int *recursion)
+{
+        int level = 0;
+
+        if (in_nmi())
+                level = 3;
+        else if (in_irq())
+                level = 2;
+        else if (in_softirq())
+                level = 1;
+
+        if (recursion[level] != 0)
+                return -1;
+
+        recursion[level] += 1;
+        barrier();
+
+        return level;
+}
+/* Give back the recursion slot @rctx obtained from get_recursion_context(). */
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+        barrier();
+        --recursion[rctx];
+}
+/*
+ * Claim this cpu's callchain entry for the current context level.
+ *
+ * *rctx receives the recursion slot, or -1 on recursion.  Returns NULL
+ * on recursion or when no buffer table is published; in the latter case
+ * the slot stored in *rctx is still held.
+ */
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+        struct callchain_cpus_entries *table;
+        int level;
+
+        level = get_recursion_context(__get_cpu_var(callchain_recursion));
+        *rctx = level;
+        if (level == -1)
+                return NULL;
+
+        table = rcu_dereference(callchain_cpus_entries);
+        if (!table)
+                return NULL;
+
+        return table->cpu_entries[smp_processor_id()] + level;
+}
 static void
-__perf_event_init_context(struct perf_event_context *ctx,
+put_callchain_entry(int rctx)
-                            struct task_struct *task)
+{
+        put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+/*
+ * Record the callchain for @regs into this cpu's entry for the current
+ * context level.  Returns the filled entry, or NULL on recursion or when
+ * no callchain buffers are available.
+ */
+static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+        struct perf_callchain_entry *chain;
+        int slot;
+
+        chain = get_callchain_entry(&slot);
+        if (slot == -1)
+                return NULL;
+
+        if (chain) {
+                chain->nr = 0;
+
+                if (!user_mode(regs)) {
+                        perf_callchain_store(chain, PERF_CONTEXT_KERNEL);
+                        perf_callchain_kernel(chain, regs);
+                        /* Continue with the user regs only if the task has an mm. */
+                        regs = current->mm ? task_pt_regs(current) : NULL;
+                }
+
+                if (regs) {
+                        perf_callchain_store(chain, PERF_CONTEXT_USER);
+                        perf_callchain_user(chain, regs);
+                }
+        }
+
+        /* The recursion slot is released even when no entry was available. */
+        put_callchain_entry(slot);
+        return chain;
+}
+/*
+ * Initialize the perf_event context in a task_struct:
+ */
+static void __perf_event_init_context(struct perf_event_context *ctx)
 {
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
@@ -1794,45 +2030,38 @@ __perf_event_init_context(struct perf_event_context *ctx,
        INIT_LIST_HEAD(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
        atomic_set(&ctx->refcount, 1);
-        ctx->task = task;
 }
-static struct perf_event_context *find_get_context(pid_t pid, int cpu)
+static struct perf_event_context *
+alloc_perf_context(struct pmu *pmu, struct task_struct *task)
 {
        struct perf_event_context *ctx;
-        struct perf_cpu_context *cpuctx;
-        struct task_struct *task;
-        unsigned long flags;
-        int err;
-        if (pid == -1 && cpu != -1) {
-                /* Must be root to operate on a CPU event: */
-                if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-                        return ERR_PTR(-EACCES);
-                if (cpu < 0 || cpu >= nr_cpumask_bits)
+        ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
-                        return ERR_PTR(-EINVAL);
+        if (!ctx)
+                return NULL;
-                /*
+        __perf_event_init_context(ctx);
-                 * We could be clever and allow to attach a event to an
+        if (task) {
-                 * offline CPU and activate it when the CPU comes up, but
+                ctx->task = task;
-                 * that's for later.
+                get_task_struct(task);
-                 */
+        }
-                if (!cpu_online(cpu))
+        ctx->pmu = pmu;
-                        return ERR_PTR(-ENODEV);
-                cpuctx = &per_cpu(perf_cpu_context, cpu);
+        return ctx;
-                ctx = &cpuctx->ctx;
+}
-                get_ctx(ctx);
-                return ctx;
+static struct task_struct *
-        }
+find_lively_task_by_vpid(pid_t vpid)
+{
+        struct task_struct *task;
+        int err;
        rcu_read_lock();
-        if (!pid)
+        if (!vpid)
                task = current;
        else
-                task = find_task_by_vpid(pid);
+                task = find_task_by_vpid(vpid);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();
@@ -1852,36 +2081,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
        if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto errout;
- retry:
+        return task;
-        ctx = perf_lock_task_context(task, &flags);
+errout:
+        put_task_struct(task);
+        return ERR_PTR(err);
+}
+static struct perf_event_context *
+find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+{
+        struct perf_event_context *ctx;
+        struct perf_cpu_context *cpuctx;
+        unsigned long flags;
+        int ctxn, err;
+        if (!task && cpu != -1) {
+                /* Must be root to operate on a CPU event: */
+                if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+                        return ERR_PTR(-EACCES);
+                if (cpu < 0 || cpu >= nr_cpumask_bits)
+                        return ERR_PTR(-EINVAL);
+                /*
+                 * We could be clever and allow to attach a event to an
+                 * offline CPU and activate it when the CPU comes up, but
+                 * that's for later.
+                 */
+                if (!cpu_online(cpu))
+                        return ERR_PTR(-ENODEV);
+                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+                ctx = &cpuctx->ctx;
+                get_ctx(ctx);
+                return ctx;
+        }
+        err = -EINVAL;
+        ctxn = pmu->task_ctx_nr;
+        if (ctxn < 0)
+                goto errout;
+retry:
+        ctx = perf_lock_task_context(task, ctxn, &flags);
        if (ctx) {
                unclone_ctx(ctx);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        if (!ctx) {
-                ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+                ctx = alloc_perf_context(pmu, task);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;
-                __perf_event_init_context(ctx, task);
                get_ctx(ctx);
-                if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
+                if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
                        /*
                         * We raced with some other task; use
                         * the context they set.
                         */
+                        put_task_struct(task);
                        kfree(ctx);
                        goto retry;
                }
-                get_task_struct(task);
        }
-        put_task_struct(task);
        return ctx;
- errout:
+errout:
-        put_task_struct(task);
        return ERR_PTR(err);
 }
@@ -1898,21 +2169,23 @@ static void free_event_rcu(struct rcu_head *head)
        kfree(event);
 }
-static void perf_pending_sync(struct perf_event *event);
 static void perf_buffer_put(struct perf_buffer *buffer);
 static void free_event(struct perf_event *event)
 {
-        perf_pending_sync(event);
+        irq_work_sync(&event->pending);
        if (!event->parent) {
-                atomic_dec(&nr_events);
+                if (event->attach_state & PERF_ATTACH_TASK)
+                        jump_label_dec(&perf_task_events);
                if (event->attr.mmap || event->attr.mmap_data)
                        atomic_dec(&nr_mmap_events);
                if (event->attr.comm)
                        atomic_dec(&nr_comm_events);
                if (event->attr.task)
                        atomic_dec(&nr_task_events);
+                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+                        put_callchain_buffers();
        }
        if (event->buffer) {
@@ -1923,7 +2196,9 @@ static void free_event(struct perf_event *event)
        if (event->destroy)
                event->destroy(event);
-        put_ctx(event->ctx);
+        if (event->ctx)
+                put_ctx(event->ctx);
        call_rcu(&event->rcu_head, free_event_rcu);
 }
@@ -2342,6 +2617,9 @@ int perf_event_task_disable(void)
 static int perf_event_index(struct perf_event *event)
 {
+        if (event->hw.state & PERF_HES_STOPPED)
+                return 0;
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;
@@ -2845,16 +3123,7 @@ void perf_event_wakeup(struct perf_event *event)
        }
 }
-/*
+static void perf_pending_event(struct irq_work *entry)
- * Pending wakeups
- *
- * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
- *
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
- */
-static void perf_pending_event(struct perf_pending_entry *entry)
 {
        struct perf_event *event = container_of(entry,
                        struct perf_event, pending);
@@ -2870,99 +3139,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
        }
 }
-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
-        PENDING_TAIL,
-};
-static void perf_pending_queue(struct perf_pending_entry *entry,
-                               void (*func)(struct perf_pending_entry *))
-{
-        struct perf_pending_entry **head;
-        if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
-                return;
-        entry->func = func;
-        head = &get_cpu_var(perf_pending_head);
-        do {
-                entry->next = *head;
-        } while (cmpxchg(head, entry->next, entry) != entry->next);
-        set_perf_event_pending();
-        put_cpu_var(perf_pending_head);
-}
-static int __perf_pending_run(void)
-{
-        struct perf_pending_entry *list;
-        int nr = 0;
-        list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
-        while (list != PENDING_TAIL) {
-                void (*func)(struct perf_pending_entry *);
-                struct perf_pending_entry *entry = list;
-                list = list->next;
-                func = entry->func;
-                entry->next = NULL;
-                /*
-                 * Ensure we observe the unqueue before we issue the wakeup,
-                 * so that we won't be waiting forever.
-                 * -- see perf_not_pending().
-                 */
-                smp_wmb();
-                func(entry);
-                nr++;
-        }
-        return nr;
-}
-static inline int perf_not_pending(struct perf_event *event)
-{
-        /*
-         * If we flush on whatever cpu we run, there is a chance we don't
-         * need to wait.
-         */
-        get_cpu();
-        __perf_pending_run();
-        put_cpu();
-        /*
-         * Ensure we see the proper queue state before going to sleep
-         * so that we do not miss the wakeup. -- see perf_pending_handle()
-         */
-        smp_rmb();
-        return event->pending.next == NULL;
-}
-static void perf_pending_sync(struct perf_event *event)
-{
-        wait_event(event->waitq, perf_not_pending(event));
-}
-void perf_event_do_pending(void)
-{
-        __perf_pending_run();
-}
-/*
- * Callchain support -- arch specific
- */
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-        return NULL;
-}
 /*
 * We assume there is only KVM supporting the callbacks.
 * Later on, we might change it to a list if there is
@@ -3012,8 +3188,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
        if (handle->nmi) {
                handle->event->pending_wakeup = 1;
-                perf_pending_queue(&handle->event->pending,
+                irq_work_queue(&handle->event->pending);
-                                   perf_pending_event);
        } else
                perf_event_wakeup(handle->event);
 }
@@ -3069,7 +3244,7 @@ again:
        if (handle->wakeup != local_read(&buffer->wakeup))
                perf_output_wakeup(handle);
- out:
+out:
        preempt_enable();
 }
@@ -3457,14 +3632,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
        struct perf_output_handle handle;
        struct perf_event_header header;
+        /* protect the callchain buffers */
+        rcu_read_lock();
        perf_prepare_sample(&header, data, event, regs);
        if (perf_output_begin(&handle, event, header.size, nmi, 1))
-                return;
+                goto exit;
        perf_output_sample(&handle, &header, data, event);
        perf_output_end(&handle);
+exit:
+        rcu_read_unlock();
 }
 /*
@@ -3578,16 +3759,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
 static void perf_event_task_event(struct perf_task_event *task_event)
 {
        struct perf_cpu_context *cpuctx;
-        struct perf_event_context *ctx = task_event->task_ctx;
+        struct perf_event_context *ctx;
+        struct pmu *pmu;
+        int ctxn;
        rcu_read_lock();
-        cpuctx = &get_cpu_var(perf_cpu_context);
+        list_for_each_entry_rcu(pmu, &pmus, entry) {
-        perf_event_task_ctx(&cpuctx->ctx, task_event);
+                cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-        if (!ctx)
+                perf_event_task_ctx(&cpuctx->ctx, task_event);
-                ctx = rcu_dereference(current->perf_event_ctxp);
-        if (ctx)
+                ctx = task_event->task_ctx;
-                perf_event_task_ctx(ctx, task_event);
+                if (!ctx) {
-        put_cpu_var(perf_cpu_context);
+                        ctxn = pmu->task_ctx_nr;
+                        if (ctxn < 0)
+                                goto next;
+                        ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+                }
+                if (ctx)
+                        perf_event_task_ctx(ctx, task_event);
+next:
+                put_cpu_ptr(pmu->pmu_cpu_context);
+        }
        rcu_read_unlock();
 }
@@ -3692,8 +3884,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 {
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
-        unsigned int size;
        char comm[TASK_COMM_LEN];
+        unsigned int size;
+        struct pmu *pmu;
+        int ctxn;
        memset(comm, 0, sizeof(comm));
        strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3705,21 +3899,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
        rcu_read_lock();
-        cpuctx = &get_cpu_var(perf_cpu_context);
+        list_for_each_entry_rcu(pmu, &pmus, entry) {
-        perf_event_comm_ctx(&cpuctx->ctx, comm_event);
+                cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-        ctx = rcu_dereference(current->perf_event_ctxp);
+                perf_event_comm_ctx(&cpuctx->ctx, comm_event);
-        if (ctx)
-                perf_event_comm_ctx(ctx, comm_event);
+                ctxn = pmu->task_ctx_nr;
-        put_cpu_var(perf_cpu_context);
+                if (ctxn < 0)
+                        goto next;
+                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+                if (ctx)
+                        perf_event_comm_ctx(ctx, comm_event);
+next:
+                put_cpu_ptr(pmu->pmu_cpu_context);
+        }
        rcu_read_unlock();
 }
 void perf_event_comm(struct task_struct *task)
 {
        struct perf_comm_event comm_event;
+        struct perf_event_context *ctx;
+        int ctxn;
-        if (task->perf_event_ctxp)
+        for_each_task_context_nr(ctxn) {
-                perf_event_enable_on_exec(task);
+                ctx = task->perf_event_ctxp[ctxn];
+                if (!ctx)
+                        continue;
+                perf_event_enable_on_exec(ctx);
+        }
        if (!atomic_read(&nr_comm_events))
                return;
@@ -3821,6 +4030,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
        char tmp[16];
        char *buf = NULL;
        const char *name;
+        struct pmu *pmu;
+        int ctxn;
        memset(tmp, 0, sizeof(tmp));
@@ -3873,12 +4084,23 @@ got_name:
        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
        rcu_read_lock();
-        cpuctx = &get_cpu_var(perf_cpu_context);
+        list_for_each_entry_rcu(pmu, &pmus, entry) {
-        perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
+                cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-        ctx = rcu_dereference(current->perf_event_ctxp);
+                perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
-        if (ctx)
+                                        vma->vm_flags & VM_EXEC);
-                perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
-        put_cpu_var(perf_cpu_context);
+                ctxn = pmu->task_ctx_nr;
+                if (ctxn < 0)
+                        goto next;
+                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+                if (ctx) {
+                        perf_event_mmap_ctx(ctx, mmap_event,
+                                        vma->vm_flags & VM_EXEC);
+                }
+next:
+                put_cpu_ptr(pmu->pmu_cpu_context);
+        }
        rcu_read_unlock();
        kfree(buf);
@@ -3960,8 +4182,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
        struct hw_perf_event *hwc = &event->hw;
        int ret = 0;
-        throttle = (throttle && event->pmu->unthrottle != NULL);
        if (!throttle) {
                hwc->interrupts++;
        } else {
@@ -4004,8 +4224,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
                event->pending_kill = POLL_HUP;
                if (nmi) {
                        event->pending_disable = 1;
-                        perf_pending_queue(&event->pending,
+                        irq_work_queue(&event->pending);
-                                           perf_pending_event);
                } else
                        perf_event_disable(event);
        }
@@ -4029,6 +4248,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
 * Generic software event infrastructure
 */
+/*
+ * Software event hash table state; one instance exists per cpu.
+ * swevent_hlist is read through rcu_dereference() by the lookup helpers.
+ */
+struct swevent_htable {
+        struct swevent_hlist            *swevent_hlist;
+        struct mutex                    hlist_mutex;
+        int                             hlist_refcount;
+        /* Recursion avoidance in each context */
+        int                             recursion[PERF_NR_CONTEXTS];
+};
+static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
 /*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
@@ -4086,7 +4316,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
        }
 }
-static void perf_swevent_add(struct perf_event *event, u64 nr,
+static void perf_swevent_event(struct perf_event *event, u64 nr,
                               int nmi, struct perf_sample_data *data,
                               struct pt_regs *regs)
 {
@@ -4112,6 +4342,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 static int perf_exclude_event(struct perf_event *event,
                              struct pt_regs *regs)
 {
+        if (event->hw.state & PERF_HES_STOPPED)
+                return 0;
        if (regs) {
                if (event->attr.exclude_user && user_mode(regs))
                        return 1;
@@ -4158,11 +4391,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
 /* For the read side: events when they trigger */
 static inline struct hlist_head *
-find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
 {
        struct swevent_hlist *hlist;
-        hlist = rcu_dereference(ctx->swevent_hlist);
+        hlist = rcu_dereference(swhash->swevent_hlist);
        if (!hlist)
                return NULL;
@@ -4171,7 +4404,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
 /* For the event head insertion and removal in the hlist */
 static inline struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
 {
        struct swevent_hlist *hlist;
        u32 event_id = event->attr.config;
@@ -4182,7 +4415,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
         * and release. Which makes the protected version suitable here.
         * The context lock guarantees that.
         */
-        hlist = rcu_dereference_protected(ctx->swevent_hlist,
+        hlist = rcu_dereference_protected(swhash->swevent_hlist,
                                          lockdep_is_held(&event->ctx->lock));
        if (!hlist)
                return NULL;
@@ -4195,23 +4428,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
 {
-        struct perf_cpu_context *cpuctx;
+        struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
        struct perf_event *event;
        struct hlist_node *node;
        struct hlist_head *head;
-        cpuctx = &__get_cpu_var(perf_cpu_context);
        rcu_read_lock();
+        head = find_swevent_head_rcu(swhash, type, event_id);
-        head = find_swevent_head_rcu(cpuctx, type, event_id);
        if (!head)
                goto end;
        hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
                if (perf_swevent_match(event, type, event_id, data, regs))
-                        perf_swevent_add(event, nr, nmi, data, regs);
+                        perf_swevent_event(event, nr, nmi, data, regs);
        }
 end:
        rcu_read_unlock();
@@ -4219,33 +4448,17 @@ end:
 int perf_swevent_get_recursion_context(void)
 {
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-        int rctx;
-        if (in_nmi())
-                rctx = 3;
-        else if (in_irq())
-                rctx = 2;
-        else if (in_softirq())
-                rctx = 1;
-        else
-                rctx = 0;
-        if (cpuctx->recursion[rctx])
-                return -1;
-        cpuctx->recursion[rctx]++;
-        barrier();
-        return rctx;
+        /* Hand this CPU's recursion counters to the shared helper. */
+        return get_recursion_context(__get_cpu_var(swevent_htable).recursion);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 void inline perf_swevent_put_recursion_context(int rctx)
 {
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-        barrier();
-        cpuctx->recursion[rctx]--;
+        /* Give back slot @rctx in this CPU's recursion counters. */
+        put_recursion_context(__get_cpu_var(swevent_htable).recursion, rctx);
 }
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4271,20 +4484,20 @@ static void perf_swevent_read(struct perf_event *event)
 {
 }
-static int perf_swevent_enable(struct perf_event *event)
+static int perf_swevent_add(struct perf_event *event, int flags)
 {
+        struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
        struct hw_perf_event *hwc = &event->hw;
-        struct perf_cpu_context *cpuctx;
        struct hlist_head *head;
-        cpuctx = &__get_cpu_var(perf_cpu_context);
        if (hwc->sample_period) {
                hwc->last_period = hwc->sample_period;
                perf_swevent_set_period(event);
        }
-        head = find_swevent_head(cpuctx, event);
+        hwc->state = !(flags & PERF_EF_START);
+        head = find_swevent_head(swhash, event);
        if (WARN_ON_ONCE(!head))
                return -EINVAL;
@@ -4293,202 +4506,27 @@ static int perf_swevent_enable(struct perf_event *event)
        return 0;
 }
-static void perf_swevent_disable(struct perf_event *event)
+static void perf_swevent_del(struct perf_event *event, int flags)  /* unhashes the event; @flags is not used */
 {
         hlist_del_rcu(&event->hlist_entry);
 }
-static void perf_swevent_void(struct perf_event *event)
+/* Clear hw.state so the event is no longer marked stopped; @flags is unused. */
+static void perf_swevent_start(struct perf_event *event, int flags)
 {
+        struct hw_perf_event *hwc = &event->hw;
+        hwc->state = 0;
 }
-static int perf_swevent_int(struct perf_event *event)
+/* Mark the event stopped by overwriting hw.state; @flags is unused. */
+static void perf_swevent_stop(struct perf_event *event, int flags)
 {
-        return 0;
+        struct hw_perf_event *hwc = &event->hw;
+        hwc->state = PERF_HES_STOPPED;
 }
-static const struct pmu perf_ops_generic = {
-        .enable         = perf_swevent_enable,
-        .disable        = perf_swevent_disable,
-        .start          = perf_swevent_int,
-        .stop           = perf_swevent_void,
-        .read           = perf_swevent_read,
-        .unthrottle     = perf_swevent_void, /* hwc->interrupts already reset */
-};
-/*
- * hrtimer based swevent callback
- */
-static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
-{
-        enum hrtimer_restart ret = HRTIMER_RESTART;
-        struct perf_sample_data data;
-        struct pt_regs *regs;
-        struct perf_event *event;
-        u64 period;
-        event = container_of(hrtimer, struct perf_event, hw.hrtimer);
-        event->pmu->read(event);
-        perf_sample_data_init(&data, 0);
-        data.period = event->hw.last_period;
-        regs = get_irq_regs();
-        if (regs && !perf_exclude_event(event, regs)) {
-                if (!(event->attr.exclude_idle && current->pid == 0))
-                        if (perf_event_overflow(event, 0, &data, regs))
-                                ret = HRTIMER_NORESTART;
-        }
-        period = max_t(u64, 10000, event->hw.sample_period);
-        hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-        return ret;
-}
-static void perf_swevent_start_hrtimer(struct perf_event *event)
-{
-        struct hw_perf_event *hwc = &event->hw;
-        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-        hwc->hrtimer.function = perf_swevent_hrtimer;
-        if (hwc->sample_period) {
-                u64 period;
-                if (hwc->remaining) {
-                        if (hwc->remaining < 0)
-                                period = 10000;
-                        else
-                                period = hwc->remaining;
-                        hwc->remaining = 0;
-                } else {
-                        period = max_t(u64, 10000, hwc->sample_period);
-                }
-                __hrtimer_start_range_ns(&hwc->hrtimer,
-                                ns_to_ktime(period), 0,
-                                HRTIMER_MODE_REL, 0);
-        }
-}
-static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-{
-        struct hw_perf_event *hwc = &event->hw;
-        if (hwc->sample_period) {
-                ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
-                hwc->remaining = ktime_to_ns(remaining);
-                hrtimer_cancel(&hwc->hrtimer);
-        }
-}
-/*
- * Software event: cpu wall time clock
- */
-static void cpu_clock_perf_event_update(struct perf_event *event)
-{
-        int cpu = raw_smp_processor_id();
-        s64 prev;
-        u64 now;
-        now = cpu_clock(cpu);
-        prev = local64_xchg(&event->hw.prev_count, now);
-        local64_add(now - prev, &event->count);
-}
-static int cpu_clock_perf_event_enable(struct perf_event *event)
-{
-        struct hw_perf_event *hwc = &event->hw;
-        int cpu = raw_smp_processor_id();
-        local64_set(&hwc->prev_count, cpu_clock(cpu));
-        perf_swevent_start_hrtimer(event);
-        return 0;
-}
-static void cpu_clock_perf_event_disable(struct perf_event *event)
-{
-        perf_swevent_cancel_hrtimer(event);
-        cpu_clock_perf_event_update(event);
-}
-static void cpu_clock_perf_event_read(struct perf_event *event)
-{
-        cpu_clock_perf_event_update(event);
-}
-static const struct pmu perf_ops_cpu_clock = {
-        .enable         = cpu_clock_perf_event_enable,
-        .disable        = cpu_clock_perf_event_disable,
-        .read           = cpu_clock_perf_event_read,
-};
-/*
- * Software event: task time clock
- */
-static void task_clock_perf_event_update(struct perf_event *event, u64 now)
-{
-        u64 prev;
-        s64 delta;
-        prev = local64_xchg(&event->hw.prev_count, now);
-        delta = now - prev;
-        local64_add(delta, &event->count);
-}
-static int task_clock_perf_event_enable(struct perf_event *event)
-{
-        struct hw_perf_event *hwc = &event->hw;
-        u64 now;
-        now = event->ctx->time;
-        local64_set(&hwc->prev_count, now);
-        perf_swevent_start_hrtimer(event);
-        return 0;
-}
-static void task_clock_perf_event_disable(struct perf_event *event)
-{
-        perf_swevent_cancel_hrtimer(event);
-        task_clock_perf_event_update(event, event->ctx->time);
-}
-static void task_clock_perf_event_read(struct perf_event *event)
-{
-        u64 time;
-        if (!in_nmi()) {
-                update_context_time(event->ctx);
-                time = event->ctx->time;
-        } else {
-                u64 now = perf_clock();
-                u64 delta = now - event->ctx->timestamp;
-                time = event->ctx->time + delta;
-        }
-        task_clock_perf_event_update(event, time);
-}
-static const struct pmu perf_ops_task_clock = {
-        .enable         = task_clock_perf_event_enable,
-        .disable        = task_clock_perf_event_disable,
-        .read           = task_clock_perf_event_read,
-};
 /* Deref the hlist from the update side */
 static inline struct swevent_hlist *
-swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+swevent_hlist_deref(struct swevent_htable *swhash)  /* caller is expected to hold swhash->hlist_mutex */
 {
-        return rcu_dereference_protected(cpuctx->swevent_hlist,
+        return rcu_dereference_protected(swhash->swevent_hlist,
-                                         lockdep_is_held(&cpuctx->hlist_mutex));
+                                         lockdep_is_held(&swhash->hlist_mutex));
 }
 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4499,27 +4537,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
        kfree(hlist);
 }
-static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+static void swevent_hlist_release(struct swevent_htable *swhash)  /* detach the hlist; it is freed from an RCU callback */
 {
-        struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
+        struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
         if (!hlist)
                 return;
-        rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+        rcu_assign_pointer(swhash->swevent_hlist, NULL);  /* unpublish before queueing the deferred kfree */
         call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
 }
 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
 {
-        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-        mutex_lock(&cpuctx->hlist_mutex);
+        mutex_lock(&swhash->hlist_mutex);
-        if (!--cpuctx->hlist_refcount)
+        if (!--swhash->hlist_refcount)  /* last reference on this CPU's table */
-                swevent_hlist_release(cpuctx);
+                swevent_hlist_release(swhash);
-        mutex_unlock(&cpuctx->hlist_mutex);
+        mutex_unlock(&swhash->hlist_mutex);
 }
 static void swevent_hlist_put(struct perf_event *event)
@@ -4537,12 +4575,12 @@ static void swevent_hlist_put(struct perf_event *event)
 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 {
-        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        int err = 0;
-        mutex_lock(&cpuctx->hlist_mutex);
+        mutex_lock(&swhash->hlist_mutex);
-        if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
+        if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
                struct swevent_hlist *hlist;
                hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4550,11 +4588,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
                        err = -ENOMEM;
                        goto exit;
                }
-                rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+                rcu_assign_pointer(swhash->swevent_hlist, hlist);
        }
-        cpuctx->hlist_refcount++;
+        swhash->hlist_refcount++;
- exit:
+exit:
-        mutex_unlock(&cpuctx->hlist_mutex);
+        mutex_unlock(&swhash->hlist_mutex);
        return err;
 }
@@ -4578,7 +4616,7 @@ static int swevent_hlist_get(struct perf_event *event)
        put_online_cpus();
        return 0;
- fail:
+fail:
        for_each_possible_cpu(cpu) {
                if (cpu == failed_cpu)
                        break;
@@ -4589,17 +4627,64 @@ static int swevent_hlist_get(struct perf_event *event)
        return err;
 }
-#ifdef CONFIG_EVENT_TRACING
+atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+/*
+ * Destructor installed by perf_swevent_init() on events without a parent:
+ * drops the per-id enable count and the swevent hlist reference.
+ */
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+        atomic_t *enabled = &perf_swevent_enabled[event->attr.config];
+        WARN_ON(event->parent);
+        jump_label_dec(enabled);
+        swevent_hlist_put(event);
+}
+/*
+ * event_init for the generic software PMU.
+ *
+ * Returns -ENOENT for events this PMU does not handle (non-software types,
+ * the cpu/task clock ids, out-of-range ids) so that perf_init_event() moves
+ * on to the next PMU; otherwise 0, or the error from swevent_hlist_get().
+ */
+static int perf_swevent_init(struct perf_event *event)
+{
+        /*
+         * attr.config is a user supplied u64: keep the full width so a large
+         * value cannot be truncated into a small or negative array index.
+         */
+        u64 event_id = event->attr.config;
+        if (event->attr.type != PERF_TYPE_SOFTWARE)
+                return -ENOENT;
+        switch (event_id) {
+        case PERF_COUNT_SW_CPU_CLOCK:
+        case PERF_COUNT_SW_TASK_CLOCK:
+                return -ENOENT;
+        default:
+                break;
+        }
+        /* perf_swevent_enabled[] has PERF_COUNT_SW_MAX entries; MAX itself is out of range. */
+        if (event_id >= PERF_COUNT_SW_MAX)
+                return -ENOENT;
+        if (!event->parent) {
+                int err;
-static const struct pmu perf_ops_tracepoint = {
+                err = swevent_hlist_get(event);
-        .enable         = perf_trace_enable,
+                if (err)
-        .disable        = perf_trace_disable,
+                        return err;
-        .start          = perf_swevent_int,
-        .stop           = perf_swevent_void,
+                jump_label_inc(&perf_swevent_enabled[event_id]);
+                event->destroy = sw_perf_event_destroy;
+        }
+        return 0;
+}
+static struct pmu perf_swevent = {
+        .task_ctx_nr    = perf_sw_context,      /* pmus with equal task_ctx_nr share one pmu_cpu_context */
+        .event_init     = perf_swevent_init,    /* PERF_TYPE_SOFTWARE, minus the cpu/task clock ids */
+        .add            = perf_swevent_add,
+        .del            = perf_swevent_del,     /* hlist_del_rcu() of the event */
+        .start          = perf_swevent_start,   /* clears hw.state */
+        .stop           = perf_swevent_stop,    /* sets hw.state to PERF_HES_STOPPED */
         .read           = perf_swevent_read,
-        .unthrottle     = perf_swevent_void,
 };
+#ifdef CONFIG_EVENT_TRACING
 static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
 {
@@ -4643,7 +4728,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
        hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
                if (perf_tp_event_match(event, &data, regs))
-                        perf_swevent_add(event, count, 1, &data, regs);
+                        perf_swevent_event(event, count, 1, &data, regs);
        }
        perf_swevent_put_recursion_context(rctx);
@@ -4655,10 +4740,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
        perf_trace_destroy(event);
 }
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static int perf_tp_event_init(struct perf_event *event)
 {
        int err;
+        if (event->attr.type != PERF_TYPE_TRACEPOINT)
+                return -ENOENT;
        /*
         * Raw tracepoint data is a severe data leak, only allow root to
         * have these.
@@ -4666,15 +4754,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
        if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
                        perf_paranoid_tracepoint_raw() &&
                        !capable(CAP_SYS_ADMIN))
-                return ERR_PTR(-EPERM);
+                return -EPERM;
        err = perf_trace_init(event);
        if (err)
-                return NULL;
+                return err;
        event->destroy = tp_perf_event_destroy;
-        return &perf_ops_tracepoint;
+        return 0;
+}
+static struct pmu perf_tracepoint = {
+        .task_ctx_nr    = perf_sw_context,      /* same task_ctx_nr as the software pmus */
+        .event_init     = perf_tp_event_init,   /* accepts PERF_TYPE_TRACEPOINT only */
+        .add            = perf_trace_add,
+        .del            = perf_trace_del,
+        .start          = perf_swevent_start,   /* start/stop/read are shared with perf_swevent */
+        .stop           = perf_swevent_stop,
+        .read           = perf_swevent_read,
+};
+static inline void perf_tp_register(void)
+{
+        perf_pmu_register(&perf_tracepoint);  /* NOTE(review): the int result (0 or -ENOMEM) is ignored */
 }
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4702,9 +4806,8 @@ static void perf_event_free_filter(struct perf_event *event)
 #else
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static inline void perf_tp_register(void)  /* stub for !CONFIG_EVENT_TRACING: nothing to register */
 {
-        return NULL;
 }
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4719,105 +4822,389 @@ static void perf_event_free_filter(struct perf_event *event)
 #endif /* CONFIG_EVENT_TRACING */
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-static void bp_perf_event_destroy(struct perf_event *event)
+/*
+ * Report one hit on breakpoint event @bp; @data is used as the
+ * struct pt_regs describing where the hit happened.
+ */
+void perf_bp_event(struct perf_event *bp, void *data)
 {
-        release_bp_slot(event);
+        struct pt_regs *regs = data;
+        struct perf_sample_data sample;
+        perf_sample_data_init(&sample, bp->attr.bp_addr);
+        /* Nothing to do when hw.state is non-zero or the event is excluded. */
+        if (bp->hw.state || perf_exclude_event(bp, regs))
+                return;
+        perf_swevent_event(bp, 1, 1, &sample, regs);
 }
+#endif
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+/*
+ * hrtimer based swevent callback: refresh the count, raise an overflow
+ * when interrupted register state is available and the event is not
+ * excluded, then forward the timer by the sampling period (at least
+ * 10000ns).
+ */
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 {
-        int err;
-        err = register_perf_hw_breakpoint(bp);
-        if (err)
-                return ERR_PTR(err);
-        bp->destroy = bp_perf_event_destroy;
-        return &perf_ops_bp;
+        struct perf_event *event =
+                container_of(hrtimer, struct perf_event, hw.hrtimer);
+        enum hrtimer_restart ret = HRTIMER_RESTART;
+        struct perf_sample_data data;
+        struct pt_regs *regs;
+        u64 period;
+        event->pmu->read(event);
+        perf_sample_data_init(&data, 0);
+        data.period = event->hw.last_period;
+        regs = get_irq_regs();
+        /* The tests short-circuit left to right; overflow runs last. */
+        if (regs && !perf_exclude_event(event, regs) &&
+            !(event->attr.exclude_idle && current->pid == 0) &&
+            perf_event_overflow(event, 0, &data, regs))
+                ret = HRTIMER_NORESTART;
+        period = max_t(u64, 10000, event->hw.sample_period);
+        hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+        return ret;
 }
-void perf_bp_event(struct perf_event *bp, void *data)
+/*
+ * (Re)initialise the event's hrtimer and, for sampling events, start it:
+ * resume with the time saved in hw.period_left if there is any, otherwise
+ * use a full period of at least 10000ns.
+ */
+static void perf_swevent_start_hrtimer(struct perf_event *event)
 {
-        struct perf_sample_data sample;
-        struct pt_regs *regs = data;
-        perf_sample_data_init(&sample, bp->attr.bp_addr);
+        struct hw_perf_event *hwc = &event->hw;
+        s64 period;
+        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+        hwc->hrtimer.function = perf_swevent_hrtimer;
+        if (!hwc->sample_period)
+                return;
+        period = local64_read(&hwc->period_left);
+        if (!period) {
+                period = max_t(u64, 10000, hwc->sample_period);
+        } else {
+                /* A negative remainder is replaced by the 10000ns minimum. */
+                if (period < 0)
+                        period = 10000;
+                local64_set(&hwc->period_left, 0);
+        }
+        __hrtimer_start_range_ns(&hwc->hrtimer,
+                        ns_to_ktime(period), 0,
+                        HRTIMER_MODE_REL_PINNED, 0);
+}
+/*
+ * Cancel a sampling event's hrtimer, first saving the time that was left
+ * on it in hw.period_left. Non-sampling events are left untouched.
+ */
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+        struct hw_perf_event *hwc = &event->hw;
+        ktime_t remaining;
-        if (!perf_exclude_event(bp, regs))
-                perf_swevent_add(bp, 1, 1, &sample, regs);
+        if (!hwc->sample_period)
+                return;
+        remaining = hrtimer_get_remaining(&hwc->hrtimer);
+        local64_set(&hwc->period_left, ktime_to_ns(remaining));
+        hrtimer_cancel(&hwc->hrtimer);
 }
-#else
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+/*
+ * Software event: cpu wall time clock
+ *
+ * The update helper adds the local_clock() time elapsed since the
+ * previous update to the event count.
+ */
+static void cpu_clock_event_update(struct perf_event *event)
 {
-        return NULL;
+        u64 now = local_clock();
+        s64 prev = local64_xchg(&event->hw.prev_count, now);
+        local64_add(now - prev, &event->count);
 }
-void perf_bp_event(struct perf_event *bp, void *regs)
+/* Take a fresh local_clock() baseline, then start the sampling hrtimer. */
+static void cpu_clock_event_start(struct perf_event *event, int flags)
 {
+        struct hw_perf_event *hwc = &event->hw;
+        local64_set(&hwc->prev_count, local_clock());
+        perf_swevent_start_hrtimer(event);
 }
-#endif
-atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+static void cpu_clock_event_stop(struct perf_event *event, int flags)  /* @flags is not used */
+{
+        perf_swevent_cancel_hrtimer(event);     /* saves the remaining time for sampling events */
+        cpu_clock_event_update(event);          /* fold the elapsed local_clock() time into the count */
+}
-static void sw_perf_event_destroy(struct perf_event *event)
+/* Add the event; it only starts counting when PERF_EF_START is set. Always 0. */
+static int cpu_clock_event_add(struct perf_event *event, int flags)
 {
-        u64 event_id = event->attr.config;
-        WARN_ON(event->parent);
+        if (!(flags & PERF_EF_START))
+                return 0;
+        cpu_clock_event_start(event, flags);
+        return 0;
+}
-        atomic_dec(&perf_swevent_enabled[event_id]);
+static void cpu_clock_event_del(struct perf_event *event, int flags)
-        swevent_hlist_put(event);
+{
+        cpu_clock_event_stop(event, flags);  /* removal is just a stop: cancel the timer, update the count */
 }
-static const struct pmu *sw_perf_event_init(struct perf_event *event)
+static void cpu_clock_event_read(struct perf_event *event)
 {
-        const struct pmu *pmu = NULL;
+        cpu_clock_event_update(event);  /* brings event->count up to the current local_clock() */
-        u64 event_id = event->attr.config;
+}
+/*
+ * event_init for the cpu clock PMU: claims only PERF_TYPE_SOFTWARE events
+ * whose config is PERF_COUNT_SW_CPU_CLOCK, -ENOENT for everything else.
+ */
+static int cpu_clock_event_init(struct perf_event *event)
+{
+        const struct perf_event_attr *attr = &event->attr;
+        if (attr->type == PERF_TYPE_SOFTWARE &&
+            attr->config == PERF_COUNT_SW_CPU_CLOCK)
+                return 0;
+        return -ENOENT;
+}
+static struct pmu perf_cpu_clock = {
+        .task_ctx_nr    = perf_sw_context,        /* same task_ctx_nr as the other software pmus */
+        .event_init     = cpu_clock_event_init,   /* PERF_TYPE_SOFTWARE + PERF_COUNT_SW_CPU_CLOCK only */
+        .add            = cpu_clock_event_add,    /* starts the event only with PERF_EF_START */
+        .del            = cpu_clock_event_del,    /* forwards to cpu_clock_event_stop() */
+        .start          = cpu_clock_event_start,
+        .stop           = cpu_clock_event_stop,
+        .read           = cpu_clock_event_read,
+};
+/*
+ * Software event: task time clock
+ *
+ * The update helper records @now as the new baseline and adds the time
+ * since the previous baseline to the event count.
+ */
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+        u64 prev = local64_xchg(&event->hw.prev_count, now);
+        s64 delta = now - prev;
+        local64_add(delta, &event->count);
+}
+/* Use the context time as the new baseline, then start the sampling hrtimer. */
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+        u64 now = event->ctx->time;
+        local64_set(&event->hw.prev_count, now);
+        perf_swevent_start_hrtimer(event);
+}
+static void task_clock_event_stop(struct perf_event *event, int flags)  /* @flags is not used */
+{
+        perf_swevent_cancel_hrtimer(event);                     /* saves the remaining time for sampling events */
+        task_clock_event_update(event, event->ctx->time);       /* account up to the current context time */
+}
+/* Add the event; it only starts counting when PERF_EF_START is set. Always 0. */
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+        if (!(flags & PERF_EF_START))
+                return 0;
+        task_clock_event_start(event, flags);
+        return 0;
+}
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+        task_clock_event_stop(event, PERF_EF_UPDATE);  /* passes PERF_EF_UPDATE rather than the caller's @flags */
+}
+/*
+ * Bring the count up to date. Outside NMI the context time is refreshed
+ * first; in NMI it is only estimated from ctx->time, ctx->timestamp and
+ * perf_clock(), without calling update_context_time().
+ */
+static void task_clock_event_read(struct perf_event *event)
+{
+        struct perf_event_context *ctx = event->ctx;
+        u64 time;
+        if (in_nmi()) {
+                u64 now = perf_clock();
+                time = ctx->time + (now - ctx->timestamp);
+        } else {
+                update_context_time(ctx);
+                time = ctx->time;
+        }
+        task_clock_event_update(event, time);
+}
+/*
+ * event_init for the task clock PMU: claims only PERF_TYPE_SOFTWARE events
+ * whose config is PERF_COUNT_SW_TASK_CLOCK, -ENOENT for everything else.
+ */
+static int task_clock_event_init(struct perf_event *event)
+{
+        const struct perf_event_attr *attr = &event->attr;
+        if (attr->type == PERF_TYPE_SOFTWARE &&
+            attr->config == PERF_COUNT_SW_TASK_CLOCK)
+                return 0;
+        return -ENOENT;
+}
+static struct pmu perf_task_clock = {
+        .task_ctx_nr    = perf_sw_context,        /* same task_ctx_nr as the other software pmus */
+        .event_init     = task_clock_event_init,  /* PERF_TYPE_SOFTWARE + PERF_COUNT_SW_TASK_CLOCK only */
+        .add            = task_clock_event_add,   /* starts the event only with PERF_EF_START */
+        .del            = task_clock_event_del,   /* forwards to task_clock_event_stop() */
+        .start          = task_clock_event_start,
+        .stop           = task_clock_event_stop,
+        .read           = task_clock_event_read,
+};
+static void perf_pmu_nop_void(struct pmu *pmu)  /* no-op default that perf_pmu_register() installs for missing callbacks */
+{
+}
+static int perf_pmu_nop_int(struct pmu *pmu)  /* no-op commit_txn default; always reports success */
+{
+        return 0;
+}
+static void perf_pmu_start_txn(struct pmu *pmu)  /* generic start_txn stub for pmus that have pmu_enable/pmu_disable */
+{
+        perf_pmu_disable(pmu);  /* paired with perf_pmu_enable() in the commit/cancel stubs */
+}
+static int perf_pmu_commit_txn(struct pmu *pmu)  /* generic commit_txn stub; always returns 0 */
+{
+        perf_pmu_enable(pmu);  /* undoes the disable done by perf_pmu_start_txn() */
+        return 0;
+}
+static void perf_pmu_cancel_txn(struct pmu *pmu)  /* generic cancel_txn stub */
+{
+        perf_pmu_enable(pmu);  /* undoes the disable done by perf_pmu_start_txn() */
+}
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ *
+ * Returns the pmu_cpu_context of the first pmu on the pmus list whose
+ * task_ctx_nr equals @ctxn, or NULL when @ctxn is negative or no pmu
+ * matches.
+ */
+static void *find_pmu_context(int ctxn)
+{
+        void *cpu_context = NULL;
+        struct pmu *pmu;
+        if (ctxn < 0)
+                return NULL;
+        list_for_each_entry(pmu, &pmus, entry) {
+                if (pmu->task_ctx_nr != ctxn)
+                        continue;
+                cpu_context = pmu->pmu_cpu_context;
+                break;
+        }
+        return cpu_context;
+}
+static void free_pmu_context(void * __percpu cpu_context)
+{
+        struct pmu *pmu;
+        mutex_lock(&pmus_lock);
         /*
-         * Software events (currently) can't in general distinguish
+         * Like a real lame refcount: keep the context while any pmu still on
+         * the pmus list points at it, free it otherwise.
-         * between user, kernel and hypervisor events.
-         * However, context switches and cpu migrations are considered
-         * to be kernel events, and page faults are never hypervisor
-         * events.
          */
-        switch (event_id) {
+        list_for_each_entry(pmu, &pmus, entry) {
-        case PERF_COUNT_SW_CPU_CLOCK:
+                if (pmu->pmu_cpu_context == cpu_context)
-                pmu = &perf_ops_cpu_clock;
+                        goto out;
+        }
-                break;
+        free_percpu(cpu_context);
-        case PERF_COUNT_SW_TASK_CLOCK:
+out:
-                /*
+        mutex_unlock(&pmus_lock);
-                 * If the user instantiates this as a per-cpu event,
+}
-                 * use the cpu_clock event instead.
-                 */
-                if (event->ctx->task)
-                        pmu = &perf_ops_task_clock;
-                else
-                        pmu = &perf_ops_cpu_clock;
-                break;
+int perf_pmu_register(struct pmu *pmu)  /* returns 0, or -ENOMEM when a per-cpu allocation fails */
-        case PERF_COUNT_SW_PAGE_FAULTS:
+{
-        case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+        int cpu, ret;
-        case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
-        case PERF_COUNT_SW_CONTEXT_SWITCHES:
-        case PERF_COUNT_SW_CPU_MIGRATIONS:
-        case PERF_COUNT_SW_ALIGNMENT_FAULTS:
-        case PERF_COUNT_SW_EMULATION_FAULTS:
-                if (!event->parent) {
-                        int err;
-                        err = swevent_hlist_get(event);
-                        if (err)
-                                return ERR_PTR(err);
-                        atomic_inc(&perf_swevent_enabled[event_id]);
+        mutex_lock(&pmus_lock);
-                        event->destroy = sw_perf_event_destroy;
+        ret = -ENOMEM;  /* result for both allocation-failure exits below */
+        pmu->pmu_disable_count = alloc_percpu(int);
+        if (!pmu->pmu_disable_count)
+                goto unlock;
+        pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);  /* reuse the context of a registered pmu with the same task_ctx_nr */
+        if (pmu->pmu_cpu_context)
+                goto got_cpu_context;  /* a shared context was already initialised by its first user */
+        pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
+        if (!pmu->pmu_cpu_context)
+                goto free_pdc;
+        for_each_possible_cpu(cpu) {
+                struct perf_cpu_context *cpuctx;
+                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+                __perf_event_init_context(&cpuctx->ctx);
+                cpuctx->ctx.type = cpu_context;
+                cpuctx->ctx.pmu = pmu;
+                cpuctx->jiffies_interval = 1;
+                INIT_LIST_HEAD(&cpuctx->rotation_list);
+        }
+got_cpu_context:
+        if (!pmu->start_txn) {  /* only fill in transaction callbacks the pmu did not provide */
+                if (pmu->pmu_enable) {
+                        /*
+                         * If we have pmu_enable/pmu_disable calls, install
+                         * transaction stubs that use that to try and batch
+                         * hardware accesses.
+                         */
+                        pmu->start_txn  = perf_pmu_start_txn;
+                        pmu->commit_txn = perf_pmu_commit_txn;
+                        pmu->cancel_txn = perf_pmu_cancel_txn;
+                } else {
+                        pmu->start_txn  = perf_pmu_nop_void;
+                        pmu->commit_txn = perf_pmu_nop_int;
+                        pmu->cancel_txn = perf_pmu_nop_void;
+                }
+        }
+        if (!pmu->pmu_enable) {  /* done after the txn choice above, which tests the pmu's own pmu_enable */
+                pmu->pmu_enable  = perf_pmu_nop_void;
+                pmu->pmu_disable = perf_pmu_nop_void;
+        }
+        list_add_rcu(&pmu->entry, &pmus);  /* publish only once the pmu is fully set up */
+        ret = 0;
+unlock:
+        mutex_unlock(&pmus_lock);
+        return ret;
+free_pdc:  /* context allocation failed: undo the pmu_disable_count allocation */
+        free_percpu(pmu->pmu_disable_count);
+        goto unlock;
+}
+void perf_pmu_unregister(struct pmu *pmu)
+{
+        mutex_lock(&pmus_lock);
+        list_del_rcu(&pmu->entry);
+        mutex_unlock(&pmus_lock);
+        /*
+         * We dereference the pmu list under both SRCU and regular RCU, so
+         * synchronize against both of those.
+         * The per-cpu data is only freed after both have completed.
+         */
+        synchronize_srcu(&pmus_srcu);
+        synchronize_rcu();
+        free_percpu(pmu->pmu_disable_count);
+        free_pmu_context(pmu->pmu_cpu_context);  /* frees only if no pmu left on the list uses this context */
+}
+struct pmu *perf_init_event(struct perf_event *event)  /* returns the pmu that accepted @event, or an ERR_PTR() */
+{
+        struct pmu *pmu = NULL;
+        int idx;
+        idx = srcu_read_lock(&pmus_srcu);
+        list_for_each_entry_rcu(pmu, &pmus, entry) {
+                int ret = pmu->event_init(event);
+                if (!ret)  /* this pmu accepted the event */
+                        goto unlock;
+                if (ret != -ENOENT) {  /* -ENOENT means "not mine": try the next pmu; anything else is final */
+                        pmu = ERR_PTR(ret);
+                        goto unlock;
                 }
-                pmu = &perf_ops_generic;
-                break;
         }
+        pmu = ERR_PTR(-ENOENT);  /* no registered pmu accepted the event */
+unlock:
+        srcu_read_unlock(&pmus_srcu, idx);
         return pmu;
 }
@@ -4826,20 +5213,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
 * Allocate and initialize a event structure
 */
 static struct perf_event *
-perf_event_alloc(struct perf_event_attr *attr,
+perf_event_alloc(struct perf_event_attr *attr, int cpu,
-                   int cpu,
+                 struct task_struct *task,
-                   struct perf_event_context *ctx,
+                 struct perf_event *group_leader,
-                   struct perf_event *group_leader,
+                 struct perf_event *parent_event,
-                   struct perf_event *parent_event,
+                 perf_overflow_handler_t overflow_handler)
-                   perf_overflow_handler_t overflow_handler,
+{
-                   gfp_t gfpflags)
+        struct pmu *pmu;
-{
-        const struct pmu *pmu;
        struct perf_event *event;
        struct hw_perf_event *hwc;
        long err;
-        event = kzalloc(sizeof(*event), gfpflags);
+        event = kzalloc(sizeof(*event), GFP_KERNEL);
        if (!event)
                return ERR_PTR(-ENOMEM);
@@ -4857,6 +5242,7 @@ perf_event_alloc(struct perf_event_attr *attr,
        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        init_waitqueue_head(&event->waitq);
+        init_irq_work(&event->pending, perf_pending_event);
        mutex_init(&event->mmap_mutex);
@@ -4864,7 +5250,6 @@ perf_event_alloc(struct perf_event_attr *attr,
        event->attr             = *attr;
        event->group_leader     = group_leader;
        event->pmu              = NULL;
-        event->ctx              = ctx;
        event->oncpu            = -1;
        event->parent           = parent_event;
@@ -4874,6 +5259,17 @@ perf_event_alloc(struct perf_event_attr *attr,
        event->state            = PERF_EVENT_STATE_INACTIVE;
+        if (task) {
+                event->attach_state = PERF_ATTACH_TASK;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+                /*
+                 * hw_breakpoint is a bit difficult here: record the target task.
+                 */
+                if (attr->type == PERF_TYPE_BREAKPOINT)
+                        event->hw.bp_target = task;
+#endif
+        }
        if (!overflow_handler && parent_event)
                overflow_handler = parent_event->overflow_handler;
        
@@ -4898,29 +5294,8 @@ perf_event_alloc(struct perf_event_attr *attr,
        if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
                goto done;
-        switch (attr->type) {
+        pmu = perf_init_event(event);
-        case PERF_TYPE_RAW:
-        case PERF_TYPE_HARDWARE:
-        case PERF_TYPE_HW_CACHE:
-                pmu = hw_perf_event_init(event);
-                break;
-        case PERF_TYPE_SOFTWARE:
-                pmu = sw_perf_event_init(event);
-                break;
-        case PERF_TYPE_TRACEPOINT:
-                pmu = tp_perf_event_init(event);
-                break;
-        case PERF_TYPE_BREAKPOINT:
-                pmu = bp_perf_event_init(event);
-                break;
-        default:
-                break;
-        }
 done:
        err = 0;
        if (!pmu)
@@ -4938,13 +5313,21 @@ done:
        event->pmu = pmu;
        if (!event->parent) {
-                atomic_inc(&nr_events);
+                if (event->attach_state & PERF_ATTACH_TASK)
+                        jump_label_inc(&perf_task_events);
                if (event->attr.mmap || event->attr.mmap_data)
                        atomic_inc(&nr_mmap_events);
                if (event->attr.comm)
                        atomic_inc(&nr_comm_events);
                if (event->attr.task)
                        atomic_inc(&nr_task_events);
+                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+                        err = get_callchain_buffers();
+                        if (err) {
+                                free_event(event);
+                                return ERR_PTR(err);
+                        }
+                }
        }
        return event;
@@ -5092,12 +5475,16 @@ SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_attr __user *, attr_uptr,
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
-        struct perf_event *event, *group_leader = NULL, *output_event = NULL;
+        struct perf_event *group_leader = NULL, *output_event = NULL;
+        struct perf_event *event, *sibling;
        struct perf_event_attr attr;
        struct perf_event_context *ctx;
        struct file *event_file = NULL;
        struct file *group_file = NULL;
+        struct task_struct *task = NULL;
+        struct pmu *pmu;
        int event_fd;
+        int move_group = 0;
        int fput_needed = 0;
        int err;
@@ -5123,20 +5510,11 @@ SYSCALL_DEFINE5(perf_event_open,
        if (event_fd < 0)
                return event_fd;
-        /*
-         * Get the target context (task or percpu):
-         */
-        ctx = find_get_context(pid, cpu);
-        if (IS_ERR(ctx)) {
-                err = PTR_ERR(ctx);
-                goto err_fd;
-        }
        if (group_fd != -1) {
                group_leader = perf_fget_light(group_fd, &fput_needed);
                if (IS_ERR(group_leader)) {
                        err = PTR_ERR(group_leader);
-                        goto err_put_context;
+                        goto err_fd;
                }
                group_file = group_leader->filp;
                if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5145,6 +5523,58 @@ SYSCALL_DEFINE5(perf_event_open,
                        group_leader = NULL;
        }
+        if (pid != -1) {
+                task = find_lively_task_by_vpid(pid);
+                if (IS_ERR(task)) {
+                        err = PTR_ERR(task);
+                        goto err_group_fd;
+                }
+        }
+        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
+        if (IS_ERR(event)) {
+                err = PTR_ERR(event);
+                goto err_task;
+        }
+        /*
+         * Special case software events and allow them to be part of
+         * any hardware group.
+         */
+        pmu = event->pmu;
+        if (group_leader &&
+            (is_software_event(event) != is_software_event(group_leader))) {
+                if (is_software_event(event)) {
+                        /*
+                         * The event is a software event and the group
+                         * leader is not.
+                         *
+                         * Allow the addition of software events to !software
+                         * groups; this is safe because software events never
+                         * fail to schedule.
+                         */
+                        pmu = group_leader->pmu;
+                } else if (is_software_event(group_leader) &&
+                           (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+                        /*
+                         * In case the group is a pure software group, and we
+                         * try to add a hardware event, move the whole group to
+                         * the hardware context.
+                         */
+                        move_group = 1;
+                }
+        }
+        /*
+         * Get the target context (task or percpu):
+         */
+        ctx = find_get_context(pmu, task, cpu);
+        if (IS_ERR(ctx)) {
+                err = PTR_ERR(ctx);
+                goto err_alloc;
+        }
        /*
         * Look up the group leader (we will attach this event to it):
         */
@@ -5156,42 +5586,66 @@ SYSCALL_DEFINE5(perf_event_open,
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
-                        goto err_put_context;
+                        goto err_context;
                /*
                 * Do not allow to attach to a group in a different
                 * task or CPU context:
                 */
-                if (group_leader->ctx != ctx)
+                if (move_group) {
-                        goto err_put_context;
+                        if (group_leader->ctx->type != ctx->type)
+                                goto err_context;
+                } else {
+                        if (group_leader->ctx != ctx)
+                                goto err_context;
+                }
                /*
                 * Only a group leader can be exclusive or pinned
                 */
                if (attr.exclusive || attr.pinned)
-                        goto err_put_context;
+                        goto err_context;
-        }
-        event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-                                     NULL, NULL, GFP_KERNEL);
-        if (IS_ERR(event)) {
-                err = PTR_ERR(event);
-                goto err_put_context;
        }
        if (output_event) {
                err = perf_event_set_output(event, output_event);
                if (err)
-                        goto err_free_put_context;
+                        goto err_context;
        }
        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
        if (IS_ERR(event_file)) {
                err = PTR_ERR(event_file);
-                goto err_free_put_context;
+                goto err_context;
+        }
+        if (move_group) {
+                struct perf_event_context *gctx = group_leader->ctx;
+                mutex_lock(&gctx->mutex);
+                perf_event_remove_from_context(group_leader);
+                list_for_each_entry(sibling, &group_leader->sibling_list,
+                                    group_entry) {
+                        perf_event_remove_from_context(sibling);
+                        put_ctx(gctx);
+                }
+                mutex_unlock(&gctx->mutex);
+                put_ctx(gctx);
        }
        event->filp = event_file;
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
+        if (move_group) {
+                perf_install_in_context(ctx, group_leader, cpu);
+                get_ctx(ctx);
+                list_for_each_entry(sibling, &group_leader->sibling_list,
+                                    group_entry) {
+                        perf_install_in_context(ctx, sibling, cpu);
+                        get_ctx(ctx);
+                }
+        }
        perf_install_in_context(ctx, event, cpu);
        ++ctx->generation;
        mutex_unlock(&ctx->mutex);
@@ -5212,11 +5666,15 @@ SYSCALL_DEFINE5(perf_event_open,
        fd_install(event_fd, event_file);
        return event_fd;
-err_free_put_context:
+err_context:
+        put_ctx(ctx);
+err_alloc:
        free_event(event);
-err_put_context:
+err_task:
+        if (task)
+                put_task_struct(task);
+err_group_fd:
        fput_light(group_file, fput_needed);
-        put_ctx(ctx);
 err_fd:
        put_unused_fd(event_fd);
        return err;
@@ -5227,32 +5685,31 @@ err_fd:
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu in which the counter is bound
- * @pid: task to profile
+ * @task: task to profile (NULL for percpu)
 */
 struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
-                                 pid_t pid,
+                                 struct task_struct *task,
                                 perf_overflow_handler_t overflow_handler)
 {
-        struct perf_event *event;
        struct perf_event_context *ctx;
+        struct perf_event *event;
        int err;
        /*
         * Get the target context (task or percpu):
         */
-        ctx = find_get_context(pid, cpu);
+        event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
-        if (IS_ERR(ctx)) {
-                err = PTR_ERR(ctx);
-                goto err_exit;
-        }
-        event = perf_event_alloc(attr, cpu, ctx, NULL,
-                                 NULL, overflow_handler, GFP_KERNEL);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
-                goto err_put_context;
+                goto err;
+        }
+        ctx = find_get_context(event->pmu, task, cpu);
+        if (IS_ERR(ctx)) {
+                err = PTR_ERR(ctx);
+                goto err_free;
        }
        event->filp = NULL;
@@ -5270,112 +5727,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
        return event;
- err_put_context:
+err_free:
-        put_ctx(ctx);
+        free_event(event);
- err_exit:
+err:
        return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-/*
- * inherit a event from parent task to child task:
- */
-static struct perf_event *
-inherit_event(struct perf_event *parent_event,
-              struct task_struct *parent,
-              struct perf_event_context *parent_ctx,
-              struct task_struct *child,
-              struct perf_event *group_leader,
-              struct perf_event_context *child_ctx)
-{
-        struct perf_event *child_event;
-        /*
-         * Instead of creating recursive hierarchies of events,
-         * we link inherited events back to the original parent,
-         * which has a filp for sure, which we use as the reference
-         * count:
-         */
-        if (parent_event->parent)
-                parent_event = parent_event->parent;
-        child_event = perf_event_alloc(&parent_event->attr,
-                                           parent_event->cpu, child_ctx,
-                                           group_leader, parent_event,
-                                           NULL, GFP_KERNEL);
-        if (IS_ERR(child_event))
-                return child_event;
-        get_ctx(child_ctx);
-        /*
-         * Make the child state follow the state of the parent event,
-         * not its attr.disabled bit.  We hold the parent's mutex,
-         * so we won't race with perf_event_{en, dis}able_family.
-         */
-        if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
-                child_event->state = PERF_EVENT_STATE_INACTIVE;
-        else
-                child_event->state = PERF_EVENT_STATE_OFF;
-        if (parent_event->attr.freq) {
-                u64 sample_period = parent_event->hw.sample_period;
-                struct hw_perf_event *hwc = &child_event->hw;
-                hwc->sample_period = sample_period;
-                hwc->last_period   = sample_period;
-                local64_set(&hwc->period_left, sample_period);
-        }
-        child_event->overflow_handler = parent_event->overflow_handler;
-        /*
-         * Link it up in the child's context:
-         */
-        add_event_to_ctx(child_event, child_ctx);
-        /*
-         * Get a reference to the parent filp - we will fput it
-         * when the child event exits. This is safe to do because
-         * we are in the parent and we know that the filp still
-         * exists and has a nonzero count:
-         */
-        atomic_long_inc(&parent_event->filp->f_count);
-        /*
-         * Link this into the parent event's child list
-         */
-        WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-        mutex_lock(&parent_event->child_mutex);
-        list_add_tail(&child_event->child_list, &parent_event->child_list);
-        mutex_unlock(&parent_event->child_mutex);
-        return child_event;
-}
-static int inherit_group(struct perf_event *parent_event,
-              struct task_struct *parent,
-              struct perf_event_context *parent_ctx,
-              struct task_struct *child,
-              struct perf_event_context *child_ctx)
-{
-        struct perf_event *leader;
-        struct perf_event *sub;
-        struct perf_event *child_ctr;
-        leader = inherit_event(parent_event, parent, parent_ctx,
-                                 child, NULL, child_ctx);
-        if (IS_ERR(leader))
-                return PTR_ERR(leader);
-        list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
-                child_ctr = inherit_event(sub, parent, parent_ctx,
-                                            child, leader, child_ctx);
-                if (IS_ERR(child_ctr))
-                        return PTR_ERR(child_ctr);
-        }
-        return 0;
-}
 static void sync_child_event(struct perf_event *child_event,
                               struct task_struct *child)
 {
@@ -5432,16 +5790,13 @@ __perf_event_exit_task(struct perf_event *child_event,
        }
 }
-/*
+static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
- * When a child task exits, feed back event values to parent events.
- */
-void perf_event_exit_task(struct task_struct *child)
 {
        struct perf_event *child_event, *tmp;
        struct perf_event_context *child_ctx;
        unsigned long flags;
-        if (likely(!child->perf_event_ctxp)) {
+        if (likely(!child->perf_event_ctxp[ctxn])) {
                perf_event_task(child, NULL, 0);
                return;
        }
@@ -5453,8 +5808,8 @@ void perf_event_exit_task(struct task_struct *child)
         * scheduled, so we are now safe from rescheduling changing
         * our context.
         */
-        child_ctx = child->perf_event_ctxp;
+        child_ctx = child->perf_event_ctxp[ctxn];
-        __perf_event_task_sched_out(child_ctx);
+        task_ctx_sched_out(child_ctx, EVENT_ALL);
        /*
         * Take the context lock here so that if find_get_context is
@@ -5462,7 +5817,7 @@ void perf_event_exit_task(struct task_struct *child)
         * incremented the context's refcount before we do put_ctx below.
         */
        raw_spin_lock(&child_ctx->lock);
-        child->perf_event_ctxp = NULL;
+        child->perf_event_ctxp[ctxn] = NULL;
        /*
         * If this context is a clone; unclone it so it can't get
         * swapped to another process while we're removing all
@@ -5515,6 +5870,17 @@ again:
        put_ctx(child_ctx);
 }
+/*
+ * When a child task exits, feed back event values to parent events.
+ *
+ * Runs perf_event_exit_task_context() once for each task context number.
+ */
+void perf_event_exit_task(struct task_struct *child)
+{
+        int ctxn;
+        for_each_task_context_nr(ctxn)
+                perf_event_exit_task_context(child, ctxn);
+}
 static void perf_free_event(struct perf_event *event,
                            struct perf_event_context *ctx)
 {
@@ -5536,48 +5902,166 @@ static void perf_free_event(struct perf_event *event,
 /*
 * free an unexposed, unused context as created by inheritance by
- * init_task below, used by fork() in case of fail.
+ * perf_event_init_task below, used by fork() in case of failure.
+ *
+ * Every non-NULL task context is emptied under its mutex and then has a
+ * reference dropped with put_ctx().
 */
 void perf_event_free_task(struct task_struct *task)
 {
-        struct perf_event_context *ctx = task->perf_event_ctxp;
+        struct perf_event_context *ctx;
         struct perf_event *event, *tmp;
+        int ctxn;
-        if (!ctx)
+        for_each_task_context_nr(ctxn) {
-                return;
+                ctx = task->perf_event_ctxp[ctxn];
+                if (!ctx)
+                        continue;
-        mutex_lock(&ctx->mutex);
+                mutex_lock(&ctx->mutex);
+                /*
+                 * Keep sweeping until both group lists are empty; a single
+                 * pass may not be enough.
+                 * NOTE(review): presumably freeing a group leader can put
+                 * its siblings back on these lists - confirm against
+                 * perf_free_event().
+                 */
 again:
-        list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
+                list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
-                perf_free_event(event, ctx);
+                                group_entry)
+                        perf_free_event(event, ctx);
-        list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+                list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
-                                 group_entry)
+                                group_entry)
-                perf_free_event(event, ctx);
+                        perf_free_event(event, ctx);
-        if (!list_empty(&ctx->pinned_groups) ||
+                if (!list_empty(&ctx->pinned_groups) ||
-            !list_empty(&ctx->flexible_groups))
+                                !list_empty(&ctx->flexible_groups))
-                goto again;
+                        goto again;
-        mutex_unlock(&ctx->mutex);
+                mutex_unlock(&ctx->mutex);
-        put_ctx(ctx);
+                put_ctx(ctx);
+        }
+}
+/*
+ * Sanity check: warn if @task still has a perf_event context pointer set
+ * for any task context number. Nothing is released here.
+ */
+void perf_event_delayed_put(struct task_struct *task)
+{
+        int ctxn;
+        for_each_task_context_nr(ctxn)
+                WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+}
+/*
+ * Inherit an event from parent task to child task.
+ *
+ * Allocates a new event for @child that mirrors @parent_event (or, when
+ * @parent_event is itself inherited, its original parent), links it into
+ * @child_ctx and adds it to the parent event's child_list.
+ *
+ * @parent and @parent_ctx are not used by this function.
+ *
+ * Returns the new child event, or the ERR_PTR() value handed back by
+ * perf_event_alloc().
+ */
+static struct perf_event *
+inherit_event(struct perf_event *parent_event,
+              struct task_struct *parent,
+              struct perf_event_context *parent_ctx,
+              struct task_struct *child,
+              struct perf_event *group_leader,
+              struct perf_event_context *child_ctx)
+{
+        struct perf_event *child_event;
+        unsigned long flags;
+        /*
+         * Instead of creating recursive hierarchies of events,
+         * we link inherited events back to the original parent,
+         * which has a filp for sure, which we use as the reference
+         * count:
+         */
+        if (parent_event->parent)
+                parent_event = parent_event->parent;
+        child_event = perf_event_alloc(&parent_event->attr,
+                                           parent_event->cpu,
+                                           child,
+                                           group_leader, parent_event,
+                                           NULL);
+        if (IS_ERR(child_event))
+                return child_event;
+        /*
+         * NOTE(review): presumably the reference the new event holds on
+         * child_ctx - confirm against the matching put_ctx().
+         */
+        get_ctx(child_ctx);
+        /*
+         * Make the child state follow the state of the parent event,
+         * not its attr.disabled bit.  We hold the parent's mutex,
+         * so we won't race with perf_event_{en, dis}able_family.
+         */
+        if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
+                child_event->state = PERF_EVENT_STATE_INACTIVE;
+        else
+                child_event->state = PERF_EVENT_STATE_OFF;
+        if (parent_event->attr.freq) {
+                u64 sample_period = parent_event->hw.sample_period;
+                struct hw_perf_event *hwc = &child_event->hw;
+                hwc->sample_period = sample_period;
+                hwc->last_period   = sample_period;
+                local64_set(&hwc->period_left, sample_period);
+        }
+        /*
+         * perf_event_alloc() is not passed a context, so attach the
+         * child context here.
+         */
+        child_event->ctx = child_ctx;
+        child_event->overflow_handler = parent_event->overflow_handler;
+        /*
+         * Link it up in the child's context:
+         */
+        raw_spin_lock_irqsave(&child_ctx->lock, flags);
+        add_event_to_ctx(child_event, child_ctx);
+        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+        /*
+         * Get a reference to the parent filp - we will fput it
+         * when the child event exits. This is safe to do because
+         * we are in the parent and we know that the filp still
+         * exists and has a nonzero count:
+         */
+        atomic_long_inc(&parent_event->filp->f_count);
+        /*
+         * Link this into the parent event's child list
+         */
+        WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+        mutex_lock(&parent_event->child_mutex);
+        list_add_tail(&child_event->child_list, &parent_event->child_list);
+        mutex_unlock(&parent_event->child_mutex);
+        return child_event;
+}
+/*
+ * Inherit a whole group: the leader @parent_event first, then every
+ * event on its sibling_list, each attached to the new child leader.
+ *
+ * Returns 0 on success, or the error from the first inherit_event() call
+ * that fails. Events inherited before the failure are not undone here.
+ */
+static int inherit_group(struct perf_event *parent_event,
+              struct task_struct *parent,
+              struct perf_event_context *parent_ctx,
+              struct task_struct *child,
+              struct perf_event_context *child_ctx)
+{
+        struct perf_event *leader;
+        struct perf_event *sub;
+        struct perf_event *child_ctr;
+        leader = inherit_event(parent_event, parent, parent_ctx,
+                                 child, NULL, child_ctx);
+        if (IS_ERR(leader))
+                return PTR_ERR(leader);
+        list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+                child_ctr = inherit_event(sub, parent, parent_ctx,
+                                            child, leader, child_ctx);
+                if (IS_ERR(child_ctr))
+                        return PTR_ERR(child_ctr);
+        }
+        return 0;
 }
 static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
                   struct perf_event_context *parent_ctx,
-                   struct task_struct *child,
+                   struct task_struct *child, int ctxn,
                   int *inherited_all)
 {
        int ret;
-        struct perf_event_context *child_ctx = child->perf_event_ctxp;
+        struct perf_event_context *child_ctx;
        if (!event->attr.inherit) {
                *inherited_all = 0;
                return 0;
        }
+        child_ctx = child->perf_event_ctxp[ctxn];
        if (!child_ctx) {
                /*
                 * This is executed from the parent task context, so
@@ -5586,14 +6070,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
                 * child.
                 */
-                child_ctx = kzalloc(sizeof(struct perf_event_context),
+                child_ctx = alloc_perf_context(event->pmu, child);
-                                    GFP_KERNEL);
                if (!child_ctx)
                        return -ENOMEM;
-                __perf_event_init_context(child_ctx, child);
+                child->perf_event_ctxp[ctxn] = child_ctx;
-                child->perf_event_ctxp = child_ctx;
-                get_task_struct(child);
        }
        ret = inherit_group(event, parent, parent_ctx,
@@ -5605,11 +6086,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
        return ret;
 }
 /*
 * Initialize the perf_event context in task_struct
 */
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_context(struct task_struct *child, int ctxn)
 {
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
@@ -5618,19 +6098,19 @@ int perf_event_init_task(struct task_struct *child)
        int inherited_all = 1;
        int ret = 0;
-        child->perf_event_ctxp = NULL;
+        child->perf_event_ctxp[ctxn] = NULL;
        mutex_init(&child->perf_event_mutex);
        INIT_LIST_HEAD(&child->perf_event_list);
-        if (likely(!parent->perf_event_ctxp))
+        if (likely(!parent->perf_event_ctxp[ctxn]))
                return 0;
        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
-        parent_ctx = perf_pin_task_context(parent);
+        parent_ctx = perf_pin_task_context(parent, ctxn);
        /*
         * No need to check if parent_ctx != NULL here; since we saw
@@ -5650,20 +6130,20 @@ int perf_event_init_task(struct task_struct *child)
         * the list, not manipulating it:
         */
        list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
-                ret = inherit_task_group(event, parent, parent_ctx, child,
+                ret = inherit_task_group(event, parent, parent_ctx,
-                                         &inherited_all);
+                                         child, ctxn, &inherited_all);
                if (ret)
                        break;
        }
        list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-                ret = inherit_task_group(event, parent, parent_ctx, child,
+                ret = inherit_task_group(event, parent, parent_ctx,
-                                         &inherited_all);
+                                         child, ctxn, &inherited_all);
                if (ret)
                        break;
        }
-        child_ctx = child->perf_event_ctxp;
+        child_ctx = child->perf_event_ctxp[ctxn];
        if (child_ctx && inherited_all) {
                /*
@@ -5692,63 +6172,98 @@ int perf_event_init_task(struct task_struct *child)
        return ret;
 }
+/*
+ * Initialize every perf_event context in task_struct.
+ *
+ * Calls perf_event_init_context() for each task context number and stops
+ * at the first failure, returning its error code; returns 0 otherwise.
+ */
+int perf_event_init_task(struct task_struct *child)
+{
+        int ctxn, ret;
+        for_each_task_context_nr(ctxn) {
+                ret = perf_event_init_context(child, ctxn);
+                if (ret)
+                        return ret;
+        }
+        return 0;
+}
+/*
+ * Boot-time setup for every possible CPU: initialise the hlist_mutex of
+ * the per-cpu swevent_htable and the per-cpu rotation_list head.
+ */
 static void __init perf_event_init_all_cpus(void)
 {
+        struct swevent_htable *swhash;
         int cpu;
-        struct perf_cpu_context *cpuctx;
         for_each_possible_cpu(cpu) {
-                cpuctx = &per_cpu(perf_cpu_context, cpu);
+                swhash = &per_cpu(swevent_htable, cpu);
-                mutex_init(&cpuctx->hlist_mutex);
+                mutex_init(&swhash->hlist_mutex);
-                __perf_event_init_context(&cpuctx->ctx, NULL);
+                INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
         }
 }
+/*
+ * Prepare the software-event hash of @cpu.
+ *
+ * If the hash already has users (hlist_refcount > 0), allocate a zeroed
+ * hlist on @cpu's memory node and publish it, all under hlist_mutex.
+ * An allocation failure only triggers WARN_ON(); the NULL pointer is
+ * still published.
+ */
 static void __cpuinit perf_event_init_cpu(int cpu)
 {
-        struct perf_cpu_context *cpuctx;
+        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-        cpuctx = &per_cpu(perf_cpu_context, cpu);
-        spin_lock(&perf_resource_lock);
+        mutex_lock(&swhash->hlist_mutex);
-        cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
+        if (swhash->hlist_refcount > 0) {
-        spin_unlock(&perf_resource_lock);
-        mutex_lock(&cpuctx->hlist_mutex);
-        if (cpuctx->hlist_refcount > 0) {
                 struct swevent_hlist *hlist;
-                hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+                hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
-                WARN_ON_ONCE(!hlist);
+                WARN_ON(!hlist);
-                rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+                rcu_assign_pointer(swhash->swevent_hlist, hlist);
         }
-        mutex_unlock(&cpuctx->hlist_mutex);
+        mutex_unlock(&swhash->hlist_mutex);
 }
 #ifdef CONFIG_HOTPLUG_CPU
-static void __perf_event_exit_cpu(void *info)
+/*
+ * Unlink the current CPU's cpu context for @pmu from its rotation list.
+ * Warns if interrupts are not disabled.
+ */
+static void perf_pmu_rotate_stop(struct pmu *pmu)
 {
-        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-        struct perf_event_context *ctx = &cpuctx->ctx;
+        WARN_ON(!irqs_disabled());
+        list_del_init(&cpuctx->rotation_list);
+}
+/*
+ * Cross-call handler (see perf_event_exit_cpu_context()); @__info is the
+ * perf_event_context to empty. Stops rotation for the context's pmu, then
+ * removes every pinned and flexible group from the context.
+ */
+static void __perf_event_exit_context(void *__info)
+{
+        struct perf_event_context *ctx = __info;
         struct perf_event *event, *tmp;
+        perf_pmu_rotate_stop(ctx->pmu);
         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
                 __perf_event_remove_from_context(event);
         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
                 __perf_event_remove_from_context(event);
 }
+/*
+ * Empty @cpu's context of every registered pmu.
+ *
+ * The pmu list is walked under pmus_srcu. For each pmu the cpu context's
+ * mutex is held while __perf_event_exit_context() is run on @cpu through
+ * smp_call_function_single() with wait set.
+ */
+static void perf_event_exit_cpu_context(int cpu)
+{
+        struct perf_event_context *ctx;
+        struct pmu *pmu;
+        int idx;
+        idx = srcu_read_lock(&pmus_srcu);
+        list_for_each_entry_rcu(pmu, &pmus, entry) {
+                ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+                mutex_lock(&ctx->mutex);
+                smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+                mutex_unlock(&ctx->mutex);
+        }
+        srcu_read_unlock(&pmus_srcu, idx);
+}
+/*
+ * Tear down perf state for @cpu: release its software-event hlist under
+ * hlist_mutex, then empty its per-pmu cpu contexts.
+ */
 static void perf_event_exit_cpu(int cpu)
 {
-        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-        struct perf_event_context *ctx = &cpuctx->ctx;
-        mutex_lock(&cpuctx->hlist_mutex);
+        mutex_lock(&swhash->hlist_mutex);
-        swevent_hlist_release(cpuctx);
+        swevent_hlist_release(swhash);
-        mutex_unlock(&cpuctx->hlist_mutex);
+        mutex_unlock(&swhash->hlist_mutex);
-        mutex_lock(&ctx->mutex);
+        perf_event_exit_cpu_context(cpu);
-        smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
-        mutex_unlock(&ctx->mutex);
 }
 #else
 static inline void perf_event_exit_cpu(int cpu) { }
@@ -5778,118 +6293,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
        return NOTIFY_OK;
 }
-/*
- * This has to have a higher priority than migration_notifier in sched.c.
- */
-static struct notifier_block __cpuinitdata perf_cpu_nb = {
-        .notifier_call          = perf_cpu_notify,
-        .priority               = 20,
-};
 void __init perf_event_init(void)
 {
        perf_event_init_all_cpus();
-        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
+        init_srcu_struct(&pmus_srcu);
-                        (void *)(long)smp_processor_id());
+        perf_pmu_register(&perf_swevent);
-        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+        perf_pmu_register(&perf_cpu_clock);
-                        (void *)(long)smp_processor_id());
+        perf_pmu_register(&perf_task_clock);
-        register_cpu_notifier(&perf_cpu_nb);
+        perf_tp_register();
-}
+        perf_cpu_notifier(perf_cpu_notify);
-static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
-                                        struct sysdev_class_attribute *attr,
-                                        char *buf)
-{
-        return sprintf(buf, "%d\n", perf_reserved_percpu);
-}
-static ssize_t
-perf_set_reserve_percpu(struct sysdev_class *class,
-                        struct sysdev_class_attribute *attr,
-                        const char *buf,
-                        size_t count)
-{
-        struct perf_cpu_context *cpuctx;
-        unsigned long val;
-        int err, cpu, mpt;
-        err = strict_strtoul(buf, 10, &val);
-        if (err)
-                return err;
-        if (val > perf_max_events)
-                return -EINVAL;
-        spin_lock(&perf_resource_lock);
-        perf_reserved_percpu = val;
-        for_each_online_cpu(cpu) {
-                cpuctx = &per_cpu(perf_cpu_context, cpu);
-                raw_spin_lock_irq(&cpuctx->ctx.lock);
-                mpt = min(perf_max_events - cpuctx->ctx.nr_events,
-                          perf_max_events - perf_reserved_percpu);
-                cpuctx->max_pertask = mpt;
-                raw_spin_unlock_irq(&cpuctx->ctx.lock);
-        }
-        spin_unlock(&perf_resource_lock);
-        return count;
-}
-static ssize_t perf_show_overcommit(struct sysdev_class *class,
-                                    struct sysdev_class_attribute *attr,
-                                    char *buf)
-{
-        return sprintf(buf, "%d\n", perf_overcommit);
-}
-static ssize_t
-perf_set_overcommit(struct sysdev_class *class,
-                    struct sysdev_class_attribute *attr,
-                    const char *buf, size_t count)
-{
-        unsigned long val;
-        int err;
-        err = strict_strtoul(buf, 10, &val);
-        if (err)
-                return err;
-        if (val > 1)
-                return -EINVAL;
-        spin_lock(&perf_resource_lock);
-        perf_overcommit = val;
-        spin_unlock(&perf_resource_lock);
-        return count;
-}
-static SYSDEV_CLASS_ATTR(
-                                reserve_percpu,
-                                0644,
-                                perf_show_reserve_percpu,
-                                perf_set_reserve_percpu
-                        );
-static SYSDEV_CLASS_ATTR(
-                                overcommit,
-                                0644,
-                                perf_show_overcommit,
-                                perf_set_overcommit
-                        );
-static struct attribute *perfclass_attrs[] = {
-        &attr_reserve_percpu.attr,
-        &attr_overcommit.attr,
-        NULL
-};
-static struct attribute_group perfclass_attr_group = {
-        .attrs                  = perfclass_attrs,
-        .name                   = "perf_events",
-};
-static int __init perf_event_sysfs_init(void)
-{
-        return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
-                                  &perfclass_attr_group);
 }
-device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index d55c6fb8d087..39b65b69584f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
        struct task_struct *result = NULL;
        if (pid) {
                struct hlist_node *first;
-                first = rcu_dereference_check(pid->tasks[type].first,
+                first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                                              rcu_read_lock_held() ||
                                              lockdep_tasklist_lock_is_held());
                if (first)
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
 */
 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 {
+        rcu_lockdep_assert(rcu_read_lock_held());
        return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 }
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ca6066a6952e..29bff6117abc 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -86,6 +86,7 @@ config PM_SLEEP_SMP
        depends on SMP
        depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
        depends on PM_SLEEP
+        select HOTPLUG
        select HOTPLUG_CPU
        default y
@@ -137,6 +138,8 @@ config SUSPEND_FREEZER
 config HIBERNATION
        bool "Hibernation (aka 'suspend to disk')"
        depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
+        select LZO_COMPRESS
+        select LZO_DECOMPRESS
        select SUSPEND_NVS if HAS_IOMEM
        ---help---
          Enable the suspend to disk (STD) functionality, which is usually
@@ -242,3 +245,17 @@ config PM_OPS
        bool
        depends on PM_SLEEP || PM_RUNTIME
        default y
+config PM_OPP
+        bool "Operating Performance Point (OPP) Layer library"
+        depends on PM
+        ---help---
+          SOCs have a standard set of tuples consisting of frequency and
+          voltage pairs that the device will support per voltage domain. This
+          is called Operating Performance Point or OPP. The actual definitions
+          of OPP vary over silicon within the same family of devices.
+          OPP layer organizes the data internally using device pointers
+          representing individual voltage domains and provides SOC
+          implementations a ready to use framework to manage OPPs.
+          For more information, read <file:Documentation/power/opp.txt>
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8dc31e02ae12..657272e91d0a 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -29,6 +29,7 @@
 #include "power.h"
+static int nocompress = 0;
 static int noresume = 0;
 static char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
@@ -638,6 +639,8 @@ int hibernate(void)
                if (hibernation_mode == HIBERNATION_PLATFORM)
                        flags |= SF_PLATFORM_MODE;
+                if (nocompress)
+                        flags |= SF_NOCOMPRESS_MODE;
                pr_debug("PM: writing image.\n");
                error = swsusp_write(flags);
                swsusp_free();
@@ -705,7 +708,7 @@ static int software_resume(void)
                goto Unlock;
        }
-        pr_debug("PM: Checking image partition %s\n", resume_file);
+        pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
        /* Check if the device is there */
        swsusp_resume_device = name_to_dev_t(resume_file);
@@ -730,10 +733,10 @@ static int software_resume(void)
        }
 Check_image:
-        pr_debug("PM: Resume from partition %d:%d\n",
+        pr_debug("PM: Hibernation image partition %d:%d present\n",
                MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
-        pr_debug("PM: Checking hibernation image.\n");
+        pr_debug("PM: Looking for hibernation image.\n");
        error = swsusp_check();
        if (error)
                goto Unlock;
@@ -765,14 +768,14 @@ static int software_resume(void)
                goto Done;
        }
-        pr_debug("PM: Reading hibernation image.\n");
+        pr_debug("PM: Loading hibernation image.\n");
        error = swsusp_read(&flags);
        swsusp_close(FMODE_READ);
        if (!error)
                hibernation_restore(flags & SF_PLATFORM_MODE);
-        printk(KERN_ERR "PM: Restore failed, recovering.\n");
+        printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
        swsusp_free();
        thaw_processes();
 Done:
@@ -785,7 +788,7 @@ static int software_resume(void)
        /* For success case, the suspend path will release the lock */
 Unlock:
        mutex_unlock(&pm_mutex);
-        pr_debug("PM: Resume from disk failed.\n");
+        pr_debug("PM: Hibernation image not present or could not be loaded.\n");
        return error;
 close_finish:
        swsusp_close(FMODE_READ);
@@ -1004,6 +1007,15 @@ static int __init resume_offset_setup(char *str)
        return 1;
 }
+static int __init hibernate_setup(char *str)
+{
+        if (!strncmp(str, "noresume", 8))
+                noresume = 1;
+        else if (!strncmp(str, "nocompress", 10))
+                nocompress = 1;
+        return 1;
+}
 static int __init noresume_setup(char *str)
 {
        noresume = 1;
@@ -1013,3 +1025,4 @@ static int __init noresume_setup(char *str)
 __setup("noresume", noresume_setup);
 __setup("resume_offset=", resume_offset_setup);
 __setup("resume=", resume_setup);
+__setup("hibernate=", hibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 62b0bc6e4983..7b5db6a8561e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -237,18 +237,18 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
                                struct kobj_attribute *attr,
                                char *buf)
 {
-        unsigned long val;
+        unsigned int val;
-        return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR;
+        return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
 }
 static ssize_t wakeup_count_store(struct kobject *kobj,
                                struct kobj_attribute *attr,
                                const char *buf, size_t n)
 {
-        unsigned long val;
+        unsigned int val;
-        if (sscanf(buf, "%lu", &val) == 1) {
+        if (sscanf(buf, "%u", &val) == 1) {
                if (pm_save_wakeup_count(val))
                        return n;
        }
@@ -281,12 +281,30 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
 }
 power_attr(pm_trace);
+static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
+                                       struct kobj_attribute *attr,
+                                       char *buf)
+{
+        return show_trace_dev_match(buf, PAGE_SIZE);
+}
+static ssize_t
+pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
+                         const char *buf, size_t n)
+{
+        return -EINVAL;
+}
+power_attr(pm_trace_dev_match);
 #endif /* CONFIG_PM_TRACE */
 static struct attribute * g[] = {
        &state_attr.attr,
 #ifdef CONFIG_PM_TRACE
        &pm_trace_attr.attr,
+        &pm_trace_dev_match_attr.attr,
 #endif
 #ifdef CONFIG_PM_SLEEP
        &pm_async_attr.attr,
@@ -308,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
 static int __init pm_start_workqueue(void)
 {
-        pm_wq = create_freezeable_workqueue("pm");
+        pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0);
        return pm_wq ? 0 : -ENOMEM;
 }
@@ -321,6 +339,7 @@ static int __init pm_init(void)
        int error = pm_start_workqueue();
        if (error)
                return error;
+        hibernate_image_size_init();
        power_kobj = kobject_create_and_add("power", NULL);
        if (!power_kobj)
                return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 006270fe382d..03634be55f62 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,6 +14,9 @@ struct swsusp_info {
 } __attribute__((aligned(PAGE_SIZE)));
 #ifdef CONFIG_HIBERNATION
+/* kernel/power/snapshot.c */
+extern void __init hibernate_image_size_init(void);
 #ifdef CONFIG_ARCH_HIBERNATION_HEADER
 /* Maximum size of architecture specific data in a hibernation header */
 #define MAX_ARCH_HEADER_SIZE    (sizeof(struct new_utsname) + 4)
@@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info)
 extern int hibernation_snapshot(int platform_mode);
 extern int hibernation_restore(int platform_mode);
 extern int hibernation_platform_enter(void);
-#endif
+#else /* !CONFIG_HIBERNATION */
+static inline void hibernate_image_size_init(void) {}
+#endif /* !CONFIG_HIBERNATION */
 extern int pfn_is_nosave(unsigned long);
@@ -134,6 +141,7 @@ extern int swsusp_swap_in_use(void);
 * the image header.
 */
 #define SF_PLATFORM_MODE        1
+#define SF_NOCOMPRESS_MODE      2
 /* kernel/power/hibernate.c */
 extern int swsusp_check(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 028a99598f49..e50b4c1b2a0f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -40,6 +40,7 @@ static int try_to_freeze_tasks(bool sig_only)
        struct timeval start, end;
        u64 elapsed_csecs64;
        unsigned int elapsed_csecs;
+        bool wakeup = false;
        do_gettimeofday(&start);
@@ -78,6 +79,11 @@ static int try_to_freeze_tasks(bool sig_only)
                if (!todo || time_after(jiffies, end_time))
                        break;
+                if (!pm_check_wakeup_events()) {
+                        wakeup = true;
+                        break;
+                }
                /*
                 * We need to retry, but first give the freezing tasks some
                 * time to enter the regrigerator.
@@ -97,8 +103,9 @@ static int try_to_freeze_tasks(bool sig_only)
                 * but it cleans up leftover PF_FREEZE requests.
                 */
                printk("\n");
-                printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
+                printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
                       "(%d tasks refusing to freeze, wq_busy=%d):\n",
+                       wakeup ? "aborted" : "failed",
                       elapsed_csecs / 100, elapsed_csecs % 100,
                       todo - wq_busy, wq_busy);
@@ -107,7 +114,7 @@ static int try_to_freeze_tasks(bool sig_only)
                read_lock(&tasklist_lock);
                do_each_thread(g, p) {
                        task_lock(p);
-                        if (freezing(p) && !freezer_should_skip(p))
+                        if (!wakeup && freezing(p) && !freezer_should_skip(p))
                                sched_show_task(p);
                        cancel_freezing(p);
                        task_unlock(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d3f795f01bbc..ac7eb109f196 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *);
 * size will not exceed N bytes, but if that is impossible, it will
 * try to create the smallest image possible.
 */
-unsigned long image_size = 500 * 1024 * 1024;
+unsigned long image_size;
+void __init hibernate_image_size_init(void)
+{
+        image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
+}
 /* List of PBEs needed for restoring the pages that were allocated before
 * the suspend and included in the suspend image, but have also been
@@ -1318,12 +1323,14 @@ int hibernate_preallocate_memory(void)
        /* Compute the maximum number of saveable pages to leave in memory. */
        max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
+        /* Compute the desired number of image pages specified by image_size. */
        size = DIV_ROUND_UP(image_size, PAGE_SIZE);
        if (size > max_size)
                size = max_size;
        /*
-         * If the maximum is not less than the current number of saveable pages
+         * If the desired number of image pages is at least as large as the
-         * in memory, allocate page frames for the image and we're done.
+         * current number of saveable pages in memory, allocate page frames for
+         * the image and we're done.
         */
        if (size >= saveable) {
                pages = preallocate_image_highmem(save_highmem);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index e6a5bdf61a37..916eaa790399 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -24,10 +24,12 @@
 #include <linux/swapops.h>
 #include <linux/pm.h>
 #include <linux/slab.h>
+#include <linux/lzo.h>
+#include <linux/vmalloc.h>
 #include "power.h"
-#define SWSUSP_SIG      "S1SUSPEND"
+#define HIBERNATE_SIG   "LINHIB0001"
 /*
 *      The swap map is a data structure used for keeping track of each page
@@ -193,7 +195,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
        if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
            !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
                memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
-                memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
+                memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
                swsusp_header->image = handle->first_sector;
                swsusp_header->flags = flags;
                error = hib_bio_write_page(swsusp_resume_block,
@@ -357,6 +359,18 @@ static int swap_writer_finish(struct swap_map_handle *handle,
        return error;
 }
+/* We need to remember how much compressed data we need to read. */
+#define LZO_HEADER      sizeof(size_t)
+/* Number of pages/bytes we'll compress at one time. */
+#define LZO_UNC_PAGES   32
+#define LZO_UNC_SIZE    (LZO_UNC_PAGES * PAGE_SIZE)
+/* Number of pages/bytes we need for compressed data (worst case). */
+#define LZO_CMP_PAGES   DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
+                                     LZO_HEADER, PAGE_SIZE)
+#define LZO_CMP_SIZE    (LZO_CMP_PAGES * PAGE_SIZE)
 /**
 *      save_image - save the suspend image data
 */
@@ -404,6 +418,137 @@ static int save_image(struct swap_map_handle *handle,
        return ret;
 }
+/**
+ * save_image_lzo - Save the suspend image data compressed with LZO.
+ * @handle: Swap map handle to use for saving the image.
+ * @snapshot: Image to read data from.
+ * @nr_to_write: Number of pages to save.
+ */
+static int save_image_lzo(struct swap_map_handle *handle,
+                          struct snapshot_handle *snapshot,
+                          unsigned int nr_to_write)
+{
+        unsigned int m;
+        int ret = 0;
+        int nr_pages;
+        int err2;
+        struct bio *bio;
+        struct timeval start;
+        struct timeval stop;
+        size_t off, unc_len, cmp_len;
+        unsigned char *unc, *cmp, *wrk, *page;
+        page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+        if (!page) {
+                printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+                return -ENOMEM;
+        }
+        wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
+        if (!wrk) {
+                printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
+                free_page((unsigned long)page);
+                return -ENOMEM;
+        }
+        unc = vmalloc(LZO_UNC_SIZE);
+        if (!unc) {
+                printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
+                vfree(wrk);
+                free_page((unsigned long)page);
+                return -ENOMEM;
+        }
+        cmp = vmalloc(LZO_CMP_SIZE);
+        if (!cmp) {
+                printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
+                vfree(unc);
+                vfree(wrk);
+                free_page((unsigned long)page);
+                return -ENOMEM;
+        }
+        printk(KERN_INFO
+                "PM: Compressing and saving image data (%u pages) ...     ",
+                nr_to_write);
+        m = nr_to_write / 100;
+        if (!m)
+                m = 1;
+        nr_pages = 0;
+        bio = NULL;
+        do_gettimeofday(&start);
+        for (;;) {
+                for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
+                        ret = snapshot_read_next(snapshot);
+                        if (ret < 0)
+                                goto out_finish;
+                        if (!ret)
+                                break;
+                        memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
+                        if (!(nr_pages % m))
+                                printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
+                        nr_pages++;
+                }
+                if (!off)
+                        break;
+                unc_len = off;
+                ret = lzo1x_1_compress(unc, unc_len,
+                                       cmp + LZO_HEADER, &cmp_len, wrk);
+                if (ret < 0) {
+                        printk(KERN_ERR "PM: LZO compression failed\n");
+                        break;
+                }
+                if (unlikely(!cmp_len ||
+                             cmp_len > lzo1x_worst_compress(unc_len))) {
+                        printk(KERN_ERR "PM: Invalid LZO compressed length\n");
+                        ret = -1;
+                        break;
+                }
+                *(size_t *)cmp = cmp_len;
+                /*
+                 * Given we are writing one page at a time to disk, we copy
+                 * that much from the buffer, although the last bit will likely
+                 * be smaller than full page. This is OK - we saved the length
+                 * of the compressed data, so any garbage at the end will be
+                 * discarded when we read it.
+                 */
+                for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
+                        memcpy(page, cmp + off, PAGE_SIZE);
+                        ret = swap_write_page(handle, page, &bio);
+                        if (ret)
+                                goto out_finish;
+                }
+        }
+out_finish:
+        err2 = hib_wait_on_bio_chain(&bio);
+        do_gettimeofday(&stop);
+        if (!ret)
+                ret = err2;
+        if (!ret)
+                printk(KERN_CONT "\b\b\b\bdone\n");
+        else
+                printk(KERN_CONT "\n");
+        swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
+        vfree(cmp);
+        vfree(unc);
+        vfree(wrk);
+        free_page((unsigned long)page);
+        return ret;
+}
 /**
 *      enough_swap - Make sure we have enough swap to save the image.
 *
@@ -411,12 +556,16 @@ static int save_image(struct swap_map_handle *handle,
 *      space avaiable from the resume partition.
 */
-static int enough_swap(unsigned int nr_pages)
+static int enough_swap(unsigned int nr_pages, unsigned int flags)
 {
        unsigned int free_swap = count_swap_pages(root_swap, 1);
+        unsigned int required;
        pr_debug("PM: Free swap pages: %u\n", free_swap);
-        return free_swap > nr_pages + PAGES_FOR_IO;
+        required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
+                nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
+        return free_swap > required;
 }
 /**
@@ -443,7 +592,7 @@ int swsusp_write(unsigned int flags)
                printk(KERN_ERR "PM: Cannot get swap writer\n");
                return error;
        }
-        if (!enough_swap(pages)) {
+        if (!enough_swap(pages, flags)) {
                printk(KERN_ERR "PM: Not enough free swap\n");
                error = -ENOSPC;
                goto out_finish;
@@ -458,8 +607,11 @@ int swsusp_write(unsigned int flags)
        }
        header = (struct swsusp_info *)data_of(snapshot);
        error = swap_write_page(&handle, header, NULL);
-        if (!error)
+        if (!error) {
-                error = save_image(&handle, &snapshot, pages - 1);
+                error = (flags & SF_NOCOMPRESS_MODE) ?
+                        save_image(&handle, &snapshot, pages - 1) :
+                        save_image_lzo(&handle, &snapshot, pages - 1);
+        }
 out_finish:
        error = swap_writer_finish(&handle, flags, error);
        return error;
@@ -590,6 +742,127 @@ static int load_image(struct swap_map_handle *handle,
 }
 /**
+ * load_image_lzo - Load compressed image data and decompress them with LZO.
+ * @handle: Swap map handle to use for loading data.
+ * @snapshot: Image to copy uncompressed data into.
+ * @nr_to_read: Number of pages to load.
+ */
+static int load_image_lzo(struct swap_map_handle *handle,
+                          struct snapshot_handle *snapshot,
+                          unsigned int nr_to_read)
+{
+        unsigned int m;
+        int error = 0;
+        struct timeval start;
+        struct timeval stop;
+        unsigned nr_pages;
+        size_t off, unc_len, cmp_len;
+        unsigned char *unc, *cmp, *page;
+        page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+        if (!page) {
+                printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+                return -ENOMEM;
+        }
+        unc = vmalloc(LZO_UNC_SIZE);
+        if (!unc) {
+                printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
+                free_page((unsigned long)page);
+                return -ENOMEM;
+        }
+        cmp = vmalloc(LZO_CMP_SIZE);
+        if (!cmp) {
+                printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
+                vfree(unc);
+                free_page((unsigned long)page);
+                return -ENOMEM;
+        }
+        printk(KERN_INFO
+                "PM: Loading and decompressing image data (%u pages) ...     ",
+                nr_to_read);
+        m = nr_to_read / 100;
+        if (!m)
+                m = 1;
+        nr_pages = 0;
+        do_gettimeofday(&start);
+        error = snapshot_write_next(snapshot);
+        if (error <= 0)
+                goto out_finish;
+        for (;;) {
+                error = swap_read_page(handle, page, NULL); /* sync */
+                if (error)
+                        break;
+                cmp_len = *(size_t *)page;
+                if (unlikely(!cmp_len ||
+                             cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
+                        printk(KERN_ERR "PM: Invalid LZO compressed length\n");
+                        error = -1;
+                        break;
+                }
+                memcpy(cmp, page, PAGE_SIZE);
+                for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
+                        error = swap_read_page(handle, page, NULL); /* sync */
+                        if (error)
+                                goto out_finish;
+                        memcpy(cmp + off, page, PAGE_SIZE);
+                }
+                unc_len = LZO_UNC_SIZE;
+                error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
+                                              unc, &unc_len);
+                if (error < 0) {
+                        printk(KERN_ERR "PM: LZO decompression failed\n");
+                        break;
+                }
+                if (unlikely(!unc_len ||
+                             unc_len > LZO_UNC_SIZE ||
+                             unc_len & (PAGE_SIZE - 1))) {
+                        printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
+                        error = -1;
+                        break;
+                }
+                for (off = 0; off < unc_len; off += PAGE_SIZE) {
+                        memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
+                        if (!(nr_pages % m))
+                                printk("\b\b\b\b%3d%%", nr_pages / m);
+                        nr_pages++;
+                        error = snapshot_write_next(snapshot);
+                        if (error <= 0)
+                                goto out_finish;
+                }
+        }
+out_finish:
+        do_gettimeofday(&stop);
+        if (!error) {
+                printk("\b\b\b\bdone\n");
+                snapshot_write_finalize(snapshot);
+                if (!snapshot_image_loaded(snapshot))
+                        error = -ENODATA;
+        } else
+                printk("\n");
+        swsusp_show_speed(&start, &stop, nr_to_read, "Read");
+        vfree(cmp);
+        vfree(unc);
+        free_page((unsigned long)page);
+        return error;
+}
+/**
 *      swsusp_read - read the hibernation image.
 *      @flags_p: flags passed by the "frozen" kernel in the image header should
 *                be written into this memeory location
@@ -612,8 +885,11 @@ int swsusp_read(unsigned int *flags_p)
                goto end;
        if (!error)
                error = swap_read_page(&handle, header, NULL);
-        if (!error)
+        if (!error) {
-                error = load_image(&handle, &snapshot, header->pages - 1);
+                error = (*flags_p & SF_NOCOMPRESS_MODE) ?
+                        load_image(&handle, &snapshot, header->pages - 1) :
+                        load_image_lzo(&handle, &snapshot, header->pages - 1);
+        }
        swap_reader_finish(&handle);
 end:
        if (!error)
@@ -640,7 +916,7 @@ int swsusp_check(void)
                if (error)
                        goto put;
-                if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
+                if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
                        memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
                        /* Reset swap signature now */
                        error = hib_bio_write_page(swsusp_resume_block,
@@ -653,13 +929,13 @@ put:
                if (error)
                        blkdev_put(hib_resume_bdev, FMODE_READ);
                else
-                        pr_debug("PM: Signature found, resuming\n");
+                        pr_debug("PM: Image signature found, resuming\n");
        } else {
                error = PTR_ERR(hib_resume_bdev);
        }
        if (error)
-                pr_debug("PM: Error %d checking image file\n", error);
+                pr_debug("PM: Image not found (code %d)\n", error);
        return error;
 }
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fe465ac008a..2531017795f6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
 * provides serialisation for access to the entire console
 * driver system.
 */
-static DECLARE_MUTEX(console_sem);
+static DEFINE_SEMAPHORE(console_sem);
 struct console *console_drivers;
 EXPORT_SYMBOL_GPL(console_drivers);
@@ -556,7 +556,7 @@ static void zap_locks(void)
        /* If a crash is occurring, make sure we can't deadlock */
        spin_lock_init(&logbuf_lock);
        /* And make sure that we print immediately */
-        init_MUTEX(&console_sem);
+        sema_init(&console_sem, 1);
 }
 #if defined(CONFIG_PRINTK_TIME)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4d169835fb36..a23a57a976d1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
 EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
 /**
- * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
+ * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
 *
 * Check for bottom half being disabled, which covers both the
 * CONFIG_PROVE_RCU and not cases.  Note that if someone uses
 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
- * will show the situation.
+ * will show the situation.  This is useful for debug checks in functions
+ * that require that they be called within an RCU read-side critical
+ * section.
 *
 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
 */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
 {
        if (!debug_lockdep_rcu_enabled())
                return 1;
-        return in_softirq();
+        return in_softirq() || irqs_disabled();
 }
 EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 196ec02f8be0..d806735342ac 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+/* Forward declarations for rcutiny_plugin.h. */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static void __call_rcu(struct rcu_head *head,
+                       void (*func)(struct rcu_head *rcu),
+                       struct rcu_ctrlblk *rcp);
+#include "rcutiny_plugin.h"
 #ifdef CONFIG_NO_HZ
 static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
                rcu_sched_qs(cpu);
        else if (!in_softirq())
                rcu_bh_qs(cpu);
+        rcu_preempt_check_callbacks();
 }
 /*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
        *rcp->donetail = NULL;
        if (rcp->curtail == rcp->donetail)
                rcp->curtail = &rcp->rcucblist;
+        rcu_preempt_remove_callbacks(rcp);
        rcp->donetail = &rcp->rcucblist;
        local_irq_restore(flags);
@@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 {
        __rcu_process_callbacks(&rcu_sched_ctrlblk);
        __rcu_process_callbacks(&rcu_bh_ctrlblk);
+        rcu_preempt_process_callbacks();
 }
 /*
@@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
 }
 /*
- * Post an RCU callback to be invoked after the end of an RCU grace
+ * Post an RCU callback to be invoked after the end of an RCU-sched grace
 * period.  But since we have but one CPU, that would be after any
 * quiescent state.
 */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
        __call_rcu(head, func, &rcu_sched_ctrlblk);
 }
-EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(call_rcu_sched);
 /*
 * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
-void rcu_barrier(void)
-{
-        struct rcu_synchronize rcu;
-        init_rcu_head_on_stack(&rcu.head);
-        init_completion(&rcu.completion);
-        /* Will wake me after RCU finished. */
-        call_rcu(&rcu.head, wakeme_after_rcu);
-        /* Wait for it. */
-        wait_for_completion(&rcu.completion);
-        destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
 void rcu_barrier_bh(void)
 {
        struct rcu_synchronize rcu;
@@ -289,5 +286,3 @@ void __init rcu_init(void)
 {
        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
-#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc742..6ceca4f745ff 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
 /*
- * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
 * Internal non-public definitions that provide either classic
- * or preemptable semantics.
+ * or preemptible semantics.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
- * Copyright IBM Corporation, 2009
+ * Copyright (c) 2010 Linaro
 *
 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
 */
+#ifdef CONFIG_TINY_PREEMPT_RCU
+#include <linux/delay.h>
+/* Global control variables for preemptible RCU. */
+struct rcu_preempt_ctrlblk {
+        struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
+        struct rcu_head **nexttail;
+                                /* Callbacks queued by call_rcu() while a */
+                                /*  preemptible-RCU grace period is in */
+                                /*  progress must wait for a later grace */
+                                /*  period.  This pointer points to the */
+                                /*  ->next pointer of the last callback */
+                                /*  queued, or to &->rcb.rcucblist if */
+                                /*  the callback list is empty.  New */
+                                /*  callbacks are appended through this */
+                                /*  pointer. */
+        struct list_head blkd_tasks;
+                                /* Tasks blocked in RCU read-side critical */
+                                /*  section.  Tasks are placed at the head */
+                                /*  of this list and age towards the tail. */
+        struct list_head *gp_tasks;
+                                /* Pointer to the first task blocking the */
+                                /*  current grace period, or NULL if there */
+                                /*  is no such task. */
+        struct list_head *exp_tasks;
+                                /* Pointer to first task blocking the */
+                                /*  current expedited grace period, or NULL */
+                                /*  if there is no such task.  If there */
+                                /*  is no current expedited grace period, */
+                                /*  then there cannot be any such task. */
+        u8 gpnum;               /* Current grace period. */
+        u8 gpcpu;               /* Last grace period blocked by the CPU. */
+        u8 completed;           /* Last grace period completed. */
+                                /*  If all three are equal, RCU is idle. */
+};
+static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
+        .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+        .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+        .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+        .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
+};
+static int rcu_preempted_readers_exp(void);
+static void rcu_report_exp_done(void);
+/*
+ * Return true if the CPU has not yet responded to the current grace period.
+ */
+static int rcu_cpu_blocking_cur_gp(void)
+{
+        return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
+}
+/*
+ * Check for a running RCU reader.  Because there is only one CPU,
+ * there can be but one running RCU reader at a time.  ;-)
+ */
+static int rcu_preempt_running_reader(void)
+{
+        return current->rcu_read_lock_nesting;
+}
+/*
+ * Check for preempted RCU readers blocking any grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_any(void)
+{
+        return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
+}
+/*
+ * Check for preempted RCU readers blocking the current grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_cgp(void)
+{
+        return rcu_preempt_ctrlblk.gp_tasks != NULL;
+}
+/*
+ * Return true if another preemptible-RCU grace period is needed.
+ */
+static int rcu_preempt_needs_another_gp(void)
+{
+        return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
+}
+/*
+ * Return true if a preemptible-RCU grace period is in progress.
+ * The caller must disable hardirqs.
+ */
+static int rcu_preempt_gp_in_progress(void)
+{
+        return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
+}
+/*
+ * Record a preemptible-RCU quiescent state for the specified CPU.  Note
+ * that this just means that the task currently running on the CPU is
+ * in a quiescent state.  There might be any number of tasks blocked
+ * while in an RCU read-side critical section.
+ *
+ * Unlike the other rcu_*_qs() functions, callers to this function
+ * must disable irqs in order to protect the assignment to
+ * ->rcu_read_unlock_special.
+ *
+ * Because this is a single-CPU implementation, the only way a grace
+ * period can end is if the CPU is in a quiescent state.  The reason is
+ * that a blocked preemptible-RCU reader can exit its critical section
+ * only if the CPU is running it at the time.  Therefore, when the
+ * last task blocking the current grace period exits its RCU read-side
+ * critical section, neither the CPU nor blocked tasks will be stopping
+ * the current grace period.  (In contrast, SMP implementations
+ * might have CPUs running in RCU read-side critical sections that
+ * block later grace periods -- but this is not possible given only
+ * one CPU.)
+ */
+static void rcu_preempt_cpu_qs(void)
+{
+        /* Record both CPU and task as having responded to current GP. */
+        rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
+        current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+        /*
+         * If there is no GP, or if blocked readers are still blocking GP,
+         * then there is nothing more to do.
+         */
+        if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+                return;
+        /* Advance callbacks. */
+        rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
+        rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
+        rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
+        /* If there are no blocked readers, next GP is done instantly. */
+        if (!rcu_preempt_blocked_readers_any())
+                rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
+        /* If there are done callbacks, make RCU_SOFTIRQ process them. */
+        if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
+                raise_softirq(RCU_SOFTIRQ);
+}
+/*
+ * Start a new RCU grace period if warranted.  Hard irqs must be disabled.
+ */
+static void rcu_preempt_start_gp(void)
+{
+        if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
+                /* Official start of GP. */
+                rcu_preempt_ctrlblk.gpnum++;
+                /* Any blocked RCU readers block new GP. */
+                if (rcu_preempt_blocked_readers_any())
+                        rcu_preempt_ctrlblk.gp_tasks =
+                                rcu_preempt_ctrlblk.blkd_tasks.next;
+                /* If there is no running reader, CPU is done with GP. */
+                if (!rcu_preempt_running_reader())
+                        rcu_preempt_cpu_qs();
+        }
+}
+/*
+ * We have entered the scheduler, and the current task might soon be
+ * context-switched away from.  If this task is in an RCU read-side
+ * critical section, we will no longer be able to rely on the CPU to
+ * record that fact, so we enqueue the task on the blkd_tasks list.
+ * The task is always enqueued at the head of the list.  If this CPU
+ * has not yet passed through a quiescent state for the current grace
+ * period (see rcu_cpu_blocking_cur_gp()), ->gp_tasks is also pointed
+ * at the newly added element, so that the task blocks that grace period.
+ * The task will dequeue itself when it exits the outermost enclosing
+ * RCU read-side critical section.  Therefore, the current grace period
+ * cannot be permitted to complete until the ->gp_tasks pointer becomes
+ * NULL.
+ *
+ * Caller must disable preemption.
+ */
+void rcu_preempt_note_context_switch(void)
+{
+        struct task_struct *t = current;
+        unsigned long flags;
+        local_irq_save(flags); /* must exclude scheduler_tick(). */
+        if (rcu_preempt_running_reader() &&
+            (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+                /* Possibly blocking in an RCU read-side critical section. */
+                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+                /*
+                 * If this CPU has already checked in, then this task
+                 * will hold up the next grace period rather than the
+                 * current grace period.  Queue the task accordingly.
+                 * If the task is queued for the current grace period
+                 * (i.e., this CPU has not yet passed through a quiescent
+                 * state for the current grace period), then as long
+                 * as that task remains queued, the current grace period
+                 * cannot end.
+                 */
+                list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
+                if (rcu_cpu_blocking_cur_gp())
+                        rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
+        }
+        /*
+         * Either we were not in an RCU read-side critical section to
+         * begin with, or we have now recorded that critical section
+         * globally.  Either way, we can now note a quiescent state
+         * for this CPU.  Again, if we were in an RCU read-side critical
+         * section, and if that critical section was blocking the current
+         * grace period, then the fact that the task has been enqueued
+         * means that current grace period continues to be blocked.
+         */
+        rcu_preempt_cpu_qs();
+        local_irq_restore(flags);
+}
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+        current->rcu_read_lock_nesting++;
+        barrier();  /* needed if we ever invoke rcu_read_lock in rcutiny.c */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
+static void rcu_read_unlock_special(struct task_struct *t)
+{
+        int empty;
+        int empty_exp;
+        unsigned long flags;
+        struct list_head *np;
+        int special;
+        /*
+         * NMI handlers cannot block and cannot safely manipulate state.
+         * They therefore cannot possibly be special, so just leave.
+         */
+        if (in_nmi())
+                return;
+        local_irq_save(flags);
+        /*
+         * If RCU core is waiting for this CPU to exit critical section,
+         * let it know that we have done so.
+         */
+        special = t->rcu_read_unlock_special;
+        if (special & RCU_READ_UNLOCK_NEED_QS)
+                rcu_preempt_cpu_qs();
+        /* Hardware IRQ handlers cannot block. */
+        if (in_irq()) {
+                local_irq_restore(flags);
+                return;
+        }
+        /* Clean up if blocked during RCU read-side critical section. */
+        if (special & RCU_READ_UNLOCK_BLOCKED) {
+                t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+                /*
+                 * Remove this task from the ->blkd_tasks list and adjust
+                 * any pointers that might have been referencing it.
+                 */
+                empty = !rcu_preempt_blocked_readers_cgp();
+                empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
+                np = t->rcu_node_entry.next;
+                if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+                        np = NULL;
+                list_del(&t->rcu_node_entry);
+                if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
+                        rcu_preempt_ctrlblk.gp_tasks = np;
+                if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
+                        rcu_preempt_ctrlblk.exp_tasks = np;
+                INIT_LIST_HEAD(&t->rcu_node_entry);
+                /*
+                 * If this was the last task on the current list, and if
+                 * we aren't waiting on the CPU, report the quiescent state
+                 * and start a new grace period if needed.
+                 */
+                if (!empty && !rcu_preempt_blocked_readers_cgp()) {
+                        rcu_preempt_cpu_qs();
+                        rcu_preempt_start_gp();
+                }
+                /*
+                 * If this was the last task on the expedited lists,
+                 * then we need to wake up the waiting task.
+                 */
+                if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
+                        rcu_report_exp_done();
+        }
+        local_irq_restore(flags);
+}
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+        struct task_struct *t = current;
+        barrier();  /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
+        --t->rcu_read_lock_nesting;
+        barrier();  /* decrement before load of ->rcu_read_unlock_special */
+        if (t->rcu_read_lock_nesting == 0 &&
+            unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+                rcu_read_unlock_special(t);
+#ifdef CONFIG_PROVE_LOCKING
+        WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+/*
+ * Check for a quiescent state from the current CPU.  When a task blocks,
+ * the task is recorded in the rcu_preempt_ctrlblk structure, which is
+ * checked elsewhere.  This is called from the scheduling-clock interrupt.
+ *
+ * Caller must disable hard irqs.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+        struct task_struct *t = current;
+        if (rcu_preempt_gp_in_progress() &&
+            (!rcu_preempt_running_reader() ||
+             !rcu_cpu_blocking_cur_gp()))
+                rcu_preempt_cpu_qs();
+        if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
+            rcu_preempt_ctrlblk.rcb.donetail)
+                raise_softirq(RCU_SOFTIRQ);
+        if (rcu_preempt_gp_in_progress() &&
+            rcu_cpu_blocking_cur_gp() &&
+            rcu_preempt_running_reader())
+                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+}
+/*
+ * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
+ * update, so this is invoked from __rcu_process_callbacks() to
+ * handle that case.  Of course, it is invoked for all flavors of
+ * RCU, but RCU callbacks can appear only on one of the lists, and
+ * neither ->nexttail nor ->donetail can possibly be NULL, so there
+ * is no need for an explicit check.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+        if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
+                rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
+}
+/*
+ * Process callbacks for preemptible RCU.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+        __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+}
+/*
+ * Queue a preemptible-RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+        unsigned long flags;
+        debug_rcu_head_queue(head);
+        head->func = func;
+        head->next = NULL;
+        local_irq_save(flags);
+        *rcu_preempt_ctrlblk.nexttail = head;
+        rcu_preempt_ctrlblk.nexttail = &head->next;
+        rcu_preempt_start_gp();  /* checks to see if GP needed. */
+        local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+void rcu_barrier(void)
+{
+        struct rcu_synchronize rcu;
+        init_rcu_head_on_stack(&rcu.head);
+        init_completion(&rcu.completion);
+        /* Will wake me after RCU finished. */
+        call_rcu(&rcu.head, wakeme_after_rcu);
+        /* Wait for it. */
+        wait_for_completion(&rcu.completion);
+        destroy_rcu_head_on_stack(&rcu.head);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+/*
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_rcu(void)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        if (!rcu_scheduler_active)
+                return;
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+        WARN_ON_ONCE(rcu_preempt_running_reader());
+        if (!rcu_preempt_blocked_readers_any())
+                return;
+        /* Once we get past the fastpath checks, same code as rcu_barrier(). */
+        rcu_barrier();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
+static unsigned long sync_rcu_preempt_exp_count;
+static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(void)
+{
+        return rcu_preempt_ctrlblk.exp_tasks != NULL;
+}
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period.
+ */
+static void rcu_report_exp_done(void)
+{
+        wake_up(&sync_rcu_preempt_exp_wq);
+}
+/*
+ * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
+ * is to rely on the fact that there is but one CPU, and that it is
+ * illegal for a task to invoke synchronize_rcu_expedited() while in a
+ * preemptible-RCU read-side critical section.  Therefore, any such
+ * critical sections must correspond to blocked tasks, which must therefore
+ * be on the ->blkd_tasks list.  So just record the current head of the
+ * list in the ->exp_tasks pointer, and wait for all tasks including and
+ * after the task pointed to by ->exp_tasks to drain.
+ */
+void synchronize_rcu_expedited(void)
+{
+        unsigned long flags;
+        struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
+        unsigned long snap;
+        barrier(); /* ensure prior action seen before grace period. */
+        WARN_ON_ONCE(rcu_preempt_running_reader());
+        /*
+         * Acquire lock so that there is only one preemptible RCU grace
+         * period in flight.  Of course, if someone does the expedited
+         * grace period for us while we are acquiring the lock, just leave.
+         */
+        snap = sync_rcu_preempt_exp_count + 1;
+        mutex_lock(&sync_rcu_preempt_exp_mutex);
+        if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
+                goto unlock_mb_ret; /* Others did our work for us. */
+        local_irq_save(flags);
+        /*
+         * All RCU readers have to already be on blkd_tasks because
+         * we cannot legally be executing in an RCU read-side critical
+         * section.
+         */
+        /* Snapshot current head of ->blkd_tasks list. */
+        rpcp->exp_tasks = rpcp->blkd_tasks.next;
+        if (rpcp->exp_tasks == &rpcp->blkd_tasks)
+                rpcp->exp_tasks = NULL;
+        local_irq_restore(flags);
+        /* Wait for tail of ->blkd_tasks list to drain. */
+        if (rcu_preempted_readers_exp())
+                wait_event(sync_rcu_preempt_exp_wq,
+                           !rcu_preempted_readers_exp());
+        /* Clean up and exit. */
+        barrier(); /* ensure expedited GP seen before counter increment. */
+        sync_rcu_preempt_exp_count++;
+unlock_mb_ret:
+        mutex_unlock(&sync_rcu_preempt_exp_mutex);
+        barrier(); /* ensure subsequent action seen after grace period. */
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+/*
+ * Does preemptible RCU need the CPU to stay out of dynticks mode?
+ */
+int rcu_preempt_needs_cpu(void)
+{
+        if (!rcu_preempt_running_reader())
+                rcu_preempt_cpu_qs();
+        return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
+}
+/*
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so.  No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+        struct task_struct *t = current;
+        if (t->rcu_read_lock_nesting == 0)
+                return;
+        t->rcu_read_lock_nesting = 1;
+        rcu_read_unlock();
+}
+#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to check.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+}
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to remove.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+}
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to process.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+}
+#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #include <linux/kernel_stat.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e2726d790b9..9d8e8fb2515f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
 };
 static LIST_HEAD(rcu_torture_freelist);
-static struct rcu_torture *rcu_torture_current;
+static struct rcu_torture __rcu *rcu_torture_current;
 static long rcu_torture_current_version;
 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
 static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
 #define FULLSTOP_SHUTDOWN 1     /* System shutdown with rcutorture running. */
 #define FULLSTOP_RMMOD    2     /* Normal rmmod of rcutorture. */
 static int fullstop = FULLSTOP_RMMOD;
-DEFINE_MUTEX(fullstop_mutex);   /* Protect fullstop transitions and spawning */
+/*
-                                /*  of kthreads. */
+ * Protect fullstop transitions and spawning of kthreads.
+ */
+static DEFINE_MUTEX(fullstop_mutex);
 /*
 * Detect and respond to a system shutdown.
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
                mdelay(longdelay_ms);
        if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
                udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+        if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
+                preempt_schedule();  /* No QS if preempt_disable() in effect */
+#endif
 }
 static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
        delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
        if (!delay)
                schedule_timeout_interruptible(longdelay);
+        else
+                rcu_read_delay(rrsp);
 }
 static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -731,7 +739,8 @@ rcu_torture_writer(void *arg)
                        continue;
                rp->rtort_pipe_count = 0;
                udelay(rcu_random(&rand) & 0x3ff);
-                old_rp = rcu_torture_current;
+                old_rp = rcu_dereference_check(rcu_torture_current,
+                                               current == writer_task);
                rp->rtort_mbtest = 1;
                rcu_assign_pointer(rcu_torture_current, rp);
                smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5bc43976c5a..ccdc04c47981 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
 module_param(qhimark, int, 0);
 module_param(qlowmark, int, 0);
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
+module_param(rcu_cpu_stall_suppress, int, 0644);
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
 static int rcu_pending(int cpu);
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-int rcu_cpu_stall_panicking __read_mostly;
+int rcu_cpu_stall_suppress __read_mostly;
 static void record_gp_stall_check_time(struct rcu_state *rsp)
 {
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
        rcu_print_task_stall(rnp);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
-        /* OK, time to rat on our buddy... */
+        /*
+         * OK, time to rat on our buddy...
+         * See Documentation/RCU/stallwarn.txt for info on how to debug
+         * RCU CPU stall warnings.
+         */
        printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
               rsp->name);
        rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
        unsigned long flags;
        struct rcu_node *rnp = rcu_get_root(rsp);
+        /*
+         * OK, time to rat on ourselves...
+         * See Documentation/RCU/stallwarn.txt for info on how to debug
+         * RCU CPU stall warnings.
+         */
        printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
               rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
        trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
        long delta;
        struct rcu_node *rnp;
-        if (rcu_cpu_stall_panicking)
+        if (rcu_cpu_stall_suppress)
                return;
-        delta = jiffies - rsp->jiffies_stall;
+        delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
        rnp = rdp->mynode;
-        if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+        if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
                /* We haven't checked in, so go dump stack. */
                print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
 {
-        rcu_cpu_stall_panicking = 1;
+        rcu_cpu_stall_suppress = 1;
        return NOTIFY_DONE;
 }
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+        rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+        rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+        rcu_preempt_stall_reset();
+}
 static struct notifier_block rcu_panic_block = {
        .notifier_call = rcu_panic,
 };
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 }
+void rcu_cpu_stall_reset(void)
+{
+}
 static void __init check_cpu_stall_init(void)
 {
 }
@@ -712,7 +745,7 @@ static void
 rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
        __releases(rcu_get_root(rsp)->lock)
 {
-        struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+        struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
        struct rcu_node *rnp = rcu_get_root(rsp);
        if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
 {
        int i;
-        struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+        struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
        if (rdp->nxtlist == NULL)
                return;  /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
        for (i = 0; i < RCU_NEXT_SIZE; i++)
                rdp->nxttail[i] = &rdp->nxtlist;
        rsp->orphan_qlen += rdp->qlen;
+        rdp->n_cbs_orphaned += rdp->qlen;
        rdp->qlen = 0;
        raw_spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
 }
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
        struct rcu_data *rdp;
        raw_spin_lock_irqsave(&rsp->onofflock, flags);
-        rdp = rsp->rda[smp_processor_id()];
+        rdp = this_cpu_ptr(rsp->rda);
        if (rsp->orphan_cbs_list == NULL) {
                raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
                return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
        *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
        rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
        rdp->qlen += rsp->orphan_qlen;
+        rdp->n_cbs_adopted += rsp->orphan_qlen;
        rsp->orphan_cbs_list = NULL;
        rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
        rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
        unsigned long flags;
        unsigned long mask;
        int need_report = 0;
-        struct rcu_data *rdp = rsp->rda[cpu];
+        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
        struct rcu_node *rnp;
        /* Exclude any attempts to start a new grace period. */
@@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
        /* Update count, and requeue any remaining callbacks. */
        rdp->qlen -= count;
+        rdp->n_cbs_invoked += count;
        if (list != NULL) {
                *tail = rdp->nxtlist;
                rdp->nxtlist = list;
@@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
                cpu = rnp->grplo;
                bit = 1;
                for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
-                        if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+                        if ((rnp->qsmask & bit) != 0 &&
+                            f(per_cpu_ptr(rsp->rda, cpu)))
                                mask |= bit;
                }
                if (mask != 0) {
@@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
         * a quiescent state betweentimes.
         */
        local_irq_save(flags);
-        rdp = rsp->rda[smp_processor_id()];
+        rdp = this_cpu_ptr(rsp->rda);
        rcu_process_gp_end(rsp, rdp);
        check_for_new_grace_period(rsp, rdp);
@@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
        unsigned long flags;
        int i;
-        struct rcu_data *rdp = rsp->rda[cpu];
+        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
        struct rcu_node *rnp = rcu_get_root(rsp);
        /* Set up local state, ensuring consistent view of global state. */
@@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 {
        unsigned long flags;
        unsigned long mask;
-        struct rcu_data *rdp = rsp->rda[cpu];
+        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
        struct rcu_node *rnp = rcu_get_root(rsp);
        /* Set up local state, ensuring consistent view of global state. */
@@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 /*
 * Helper function for rcu_init() that initializes one rcu_state structure.
 */
-static void __init rcu_init_one(struct rcu_state *rsp)
+static void __init rcu_init_one(struct rcu_state *rsp,
+                struct rcu_data __percpu *rda)
 {
        static char *buf[] = { "rcu_node_level_0",
                               "rcu_node_level_1",
@@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                }
        }
+        rsp->rda = rda;
        rnp = rsp->level[NUM_RCU_LVLS - 1];
        for_each_possible_cpu(i) {
                while (i > rnp->grphi)
                        rnp++;
-                rsp->rda[i]->mynode = rnp;
+                per_cpu_ptr(rsp->rda, i)->mynode = rnp;
                rcu_boot_init_percpu_data(i, rsp);
        }
 }
-/*
- * Helper macro for __rcu_init() and __rcu_init_preempt().  To be used
- * nowhere else!  Assigns leaf node pointers into each CPU's rcu_data
- * structure.
- */
-#define RCU_INIT_FLAVOR(rsp, rcu_data) \
-do { \
-        int i; \
-        \
-        for_each_possible_cpu(i) { \
-                (rsp)->rda[i] = &per_cpu(rcu_data, i); \
-        } \
-        rcu_init_one(rsp); \
-} while (0)
 void __init rcu_init(void)
 {
        int cpu;
        rcu_bootup_announce();
-        RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
+        rcu_init_one(&rcu_sched_state, &rcu_sched_data);
-        RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
+        rcu_init_one(&rcu_bh_state, &rcu_bh_data);
        __rcu_init_preempt();
        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed0..91d4170c5c13 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
        long            qlen;           /* # of queued callbacks */
        long            qlen_last_fqs_check;
                                        /* qlen at last check for QS forcing */
+        unsigned long   n_cbs_invoked;  /* count of RCU cbs invoked. */
+        unsigned long   n_cbs_orphaned; /* RCU cbs sent to orphanage. */
+        unsigned long   n_cbs_adopted;  /* RCU cbs adopted from orphanage. */
        unsigned long   n_force_qs_snap;
                                        /* did other CPU force QS recently? */
        long            blimit;         /* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
 #define RCU_STALL_DELAY_DELTA          0
 #endif
-#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_CHECK   (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
+                                        RCU_STALL_DELAY_DELTA)
                                                /* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
                                                /* for rsp->jiffies_stall */
 #define RCU_STALL_RAT_DELAY             2       /* Allow other CPUs time */
                                                /*  to take at least one */
                                                /*  scheduling clock irq */
                                                /*  before ratting on them. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
+#define RCU_CPU_STALL_SUPPRESS_INIT 0
+#else
+#define RCU_CPU_STALL_SUPPRESS_INIT 1
+#endif
-#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))
 /*
 * RCU global state, including node hierarchy.  This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
        struct rcu_node *level[NUM_RCU_LVLS];   /* Hierarchy levels. */
        u32 levelcnt[MAX_RCU_LVLS + 1];         /* # nodes in each level. */
        u8 levelspread[NUM_RCU_LVLS];           /* kids/node in each level. */
-        struct rcu_data *rda[NR_CPUS];          /* array of rdp pointers. */
+        struct rcu_data __percpu *rda;          /* pointer to percpu rcu_data. */
        /* The following fields are guarded by the root rcu_node's lock. */
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static void rcu_print_task_stall(struct rcu_node *rnp);
+static void rcu_preempt_stall_reset(void);
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d9..71a4147473f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
        printk(KERN_INFO
               "\tRCU-based detection of stalled CPUs is disabled.\n");
 #endif
-#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
+#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
        printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
 #endif
 #if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
            (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
                /* Possibly blocking in an RCU read-side critical section. */
-                rdp = rcu_preempt_state.rda[cpu];
+                rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
                rnp = rdp->mynode;
                raw_spin_lock_irqsave(&rnp->lock, flags);
                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
 */
 void __rcu_read_lock(void)
 {
-        ACCESS_ONCE(current->rcu_read_lock_nesting)++;
+        current->rcu_read_lock_nesting++;
        barrier();  /* needed if we ever invoke rcu_read_lock in rcutree.c */
 }
 EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
        struct task_struct *t = current;
        barrier();  /* needed if we ever invoke rcu_read_unlock in rcutree.c */
-        if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
+        --t->rcu_read_lock_nesting;
+        barrier();  /* decrement before load of ->rcu_read_unlock_special */
+        if (t->rcu_read_lock_nesting == 0 &&
            unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
                rcu_read_unlock_special(t);
 #ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
        }
 }
+/*
+ * Suppress preemptible RCU's CPU stall warnings by pushing the
+ * time of the next stall-warning message comfortably far into the
+ * future.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+        rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+}
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 /*
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
 *
 * Control will return to the caller some time after a full grace
 * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
+ * read-side critical sections have completed.  Note, however, that
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * upon return from synchronize_rcu(), the caller might well be executing
- * and may be nested.
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting.  RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
 */
 void synchronize_rcu(void)
 {
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
 */
 static void __init __rcu_init_preempt(void)
 {
-        RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
+        rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
 }
 /*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
 {
 }
+/*
+ * Because preemptible RCU does not exist, there is no need to suppress
+ * its CPU stall warnings.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+}
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 /*
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void)
 }
 /*
- * In classic RCU, call_rcu() is just call_rcu_sched().
- */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
-        call_rcu_sched(head, func);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-/*
 * Wait for an rcu-preempt grace period, but make it happen quickly.
 * But because preemptable RCU does not exist, map to rcu-sched.
 */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738e..d15430b9d122 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
                   rdp->dynticks_fqs);
 #endif /* #ifdef CONFIG_NO_HZ */
        seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
-        seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+        seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
+        seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
+                   rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
 }
 #define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
                   rdp->dynticks_fqs);
 #endif /* #ifdef CONFIG_NO_HZ */
        seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
-        seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
+        seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
+        seq_printf(m, ",%lu,%lu,%lu\n",
+                   rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
 }
 static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
 #ifdef CONFIG_NO_HZ
        seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
 #endif /* #ifdef CONFIG_NO_HZ */
-        seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
+        seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
 #ifdef CONFIG_TREE_PREEMPT_RCU
        seq_puts(m, "\"rcu_preempt:\"\n");
        PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
        struct rcu_data *rdp;
        for_each_possible_cpu(cpu) {
-                rdp = rsp->rda[cpu];
+                rdp = per_cpu_ptr(rsp->rda, cpu);
                if (rdp->beenonline)
                        print_one_rcu_pending(m, rdp);
        }
diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb90832..d42992bccdfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
         */
        cpumask_var_t rto_mask;
        atomic_t rto_count;
-#ifdef CONFIG_SMP
        struct cpupri cpupri;
-#endif
 };
 /*
@@ -437,7 +435,7 @@ struct root_domain {
 */
 static struct root_domain def_root_domain;
-#endif
+#endif /* CONFIG_SMP */
 /*
 * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
         */
        unsigned long nr_uninterruptible;
-        struct task_struct *curr, *idle;
+        struct task_struct *curr, *idle, *stop;
        unsigned long next_balance;
        struct mm_struct *prev_mm;
        u64 clock;
+        u64 clock_task;
        atomic_t nr_iowait;
@@ -520,6 +519,10 @@ struct rq {
        u64 avg_idle;
 #endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+        u64 prev_irq_time;
+#endif
        /* calc_load related fields */
        unsigned long calc_load_update;
        long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
 #endif /* CONFIG_CGROUP_SCHED */
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
 inline void update_rq_clock(struct rq *rq)
 {
-        if (!rq->skip_clock_update)
+        if (!rq->skip_clock_update) {
-                rq->clock = sched_clock_cpu(cpu_of(rq));
+                int cpu = cpu_of(rq);
+                u64 irq_time;
+                rq->clock = sched_clock_cpu(cpu);
+                irq_time = irq_time_cpu(cpu);
+                if (rq->clock - irq_time > rq->clock_task)
+                        rq->clock_task = rq->clock - irq_time;
+                sched_irq_time_avg_update(rq, irq_time);
+        }
 }
 /*
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                size_t cnt, loff_t *ppos)
 {
        char buf[64];
-        char *cmp = buf;
+        char *cmp;
        int neg = 0;
        int i;
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                return -EFAULT;
        buf[cnt] = 0;
+        cmp = strstrip(buf);
        if (strncmp(buf, "NO_", 3) == 0) {
                neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        }
        for (i = 0; sched_feat_names[i]; i++) {
-                int len = strlen(sched_feat_names[i]);
+                if (strcmp(cmp, sched_feat_names[i]) == 0) {
-                if (strncmp(cmp, sched_feat_names[i], len) == 0) {
                        if (neg)
                                sysctl_sched_features &= ~(1UL << i);
                        else
@@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 static const struct sched_class rt_sched_class;
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
 #define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)
@@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
 static void set_load_weight(struct task_struct *p)
 {
-        if (task_has_rt_policy(p)) {
-                p->se.load.weight = 0;
-                p->se.load.inv_weight = WMULT_CONST;
-                return;
-        }
        /*
         * SCHED_IDLE tasks get minimal weight:
         */
@@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
        dec_nr_running(rq);
 }
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_vtime.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+void enable_sched_clock_irqtime(void)
+{
+        sched_clock_irqtime = 1;
+}
+void disable_sched_clock_irqtime(void)
+{
+        sched_clock_irqtime = 0;
+}
+static u64 irq_time_cpu(int cpu)
+{
+        if (!sched_clock_irqtime)
+                return 0;
+        return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+void account_system_vtime(struct task_struct *curr)
+{
+        unsigned long flags;
+        int cpu;
+        u64 now, delta;
+        if (!sched_clock_irqtime)
+                return;
+        local_irq_save(flags);
+        cpu = smp_processor_id();
+        now = sched_clock_cpu(cpu);
+        delta = now - per_cpu(irq_start_time, cpu);
+        per_cpu(irq_start_time, cpu) = now;
+        /*
+         * We do not account for softirq time from ksoftirqd here.
+         * We want to continue accounting softirq time to ksoftirqd thread
+         * in that case, so as not to confuse scheduler with a special task
+         * that does not consume any time, but still wants to run.
+         */
+        if (hardirq_count())
+                per_cpu(cpu_hardirq_time, cpu) += delta;
+        else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+                per_cpu(cpu_softirq_time, cpu) += delta;
+        local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+        if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+                u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+                rq->prev_irq_time = curr_irq_time;
+                sched_rt_avg_update(rq, delta_irq);
+        }
+}
+#else
+static u64 irq_time_cpu(int cpu)
+{
+        return 0;
+}
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+#endif
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+        struct task_struct *old_stop = cpu_rq(cpu)->stop;
+        if (stop) {
+                /*
+                 * Make it appear like a SCHED_FIFO task, it's something
+                 * userspace knows about and won't get confused about.
+                 *
+                 * Also, it will make PI more or less work without too
+                 * much confusion -- but then, stop work should not
+                 * rely on PI working anyway.
+                 */
+                sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+                stop->sched_class = &stop_sched_class;
+        }
+        cpu_rq(cpu)->stop = stop;
+        if (old_stop) {
+                /*
+                 * Reset it back to a normal scheduling class so that
+                 * it can die in pieces.
+                 */
+                old_stop->sched_class = &rt_sched_class;
+        }
+}
 /*
 * __normal_prio - return the priority that is based on the static prio
 */
@@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        if (p->sched_class != &fair_sched_class)
                return 0;
+        if (unlikely(p->policy == SCHED_IDLE))
+                return 0;
        /*
         * Buddy candidates are cache hot:
         */
@@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
         */
        arch_start_context_switch(prev);
-        if (likely(!mm)) {
+        if (!mm) {
                next->active_mm = oldmm;
                atomic_inc(&oldmm->mm_count);
                enter_lazy_tlb(oldmm, next);
        } else
                switch_mm(oldmm, mm, next);
-        if (likely(!prev->mm)) {
+        if (!prev->mm) {
                prev->active_mm = NULL;
                rq->prev_mm = oldmm;
        }
@@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
        if (task_current(rq, p)) {
                update_rq_clock(rq);
-                ns = rq->clock - p->se.exec_start;
+                ns = rq->clock_task - p->se.exec_start;
                if ((s64)ns < 0)
                        ns = 0;
        }
@@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
        tmp = cputime_to_cputime64(cputime);
        if (hardirq_count() - hardirq_offset)
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
-        else if (softirq_count())
+        else if (in_serving_softirq())
                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
        else
                cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3584,7 +3714,7 @@ void scheduler_tick(void)
        curr->sched_class->task_tick(rq, curr, 0);
        raw_spin_unlock(&rq->lock);
-        perf_event_task_tick(curr);
+        perf_event_task_tick();
 #ifdef CONFIG_SMP
        rq->idle_at_tick = idle_cpu(cpu);
@@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq)
                        return p;
        }
-        class = sched_class_highest;
+        for_each_class(class) {
-        for ( ; ; ) {
                p = class->pick_next_task(rq);
                if (p)
                        return p;
-                /*
-                 * Will never be NULL as the idle class always
-                 * returns a non-NULL p:
-                 */
-                class = class->next;
        }
+        BUG(); /* the idle class will always have a runnable task */
 }
 /*
@@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        rq = task_rq_lock(p, &flags);
+        trace_sched_pi_setprio(p, prio);
        oldprio = p->prio;
        prev_class = p->sched_class;
        on_rq = p->se.on_rq;
@@ -4645,7 +4772,7 @@ recheck:
        }
        if (user) {
-                retval = security_task_setscheduler(p, policy, param);
+                retval = security_task_setscheduler(p);
                if (retval)
                        return retval;
        }
@@ -4661,6 +4788,15 @@ recheck:
         */
        rq = __task_rq_lock(p);
+        /*
+         * Changing the policy of the stop threads is a very bad idea
+         */
+        if (p == rq->stop) {
+                __task_rq_unlock(rq);
+                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+                return -EINVAL;
+        }
 #ifdef CONFIG_RT_GROUP_SCHED
        if (user) {
                /*
@@ -4887,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
        if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
-        retval = security_task_setscheduler(p, 0, NULL);
+        retval = security_task_setscheduler(p);
        if (retval)
                goto out_unlock;
        cpuset_cpus_allowed(p, cpus_allowed);
        cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
        retval = set_cpus_allowed_ptr(p, new_mask);
        if (!retval) {
@@ -5337,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        idle->se.exec_start = sched_clock();
        cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+        /*
+         * We're having a chicken and egg problem, even though we are
+         * holding rq->lock, the cpu isn't yet set to this cpu so the
+         * lockdep check in task_group() will fail.
+         *
+         * Similar case to sched_fork(). / Alternatively we could
+         * use task_rq_lock() here and obtain the other rq->lock.
+         *
+         * Silence PROVE_RCU
+         */
+        rcu_read_lock();
        __set_task_cpu(idle, cpu);
+        rcu_read_unlock();
        rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -6514,6 +6662,7 @@ struct s_data {
        cpumask_var_t           nodemask;
        cpumask_var_t           this_sibling_map;
        cpumask_var_t           this_core_map;
+        cpumask_var_t           this_book_map;
        cpumask_var_t           send_covered;
        cpumask_var_t           tmpmask;
        struct sched_group      **sched_group_nodes;
@@ -6525,6 +6674,7 @@ enum s_alloc {
        sa_rootdomain,
        sa_tmpmask,
        sa_send_covered,
+        sa_this_book_map,
        sa_this_core_map,
        sa_this_sibling_map,
        sa_nodemask,
@@ -6560,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
                  struct sched_group **sg, struct cpumask *mask)
 {
        int group;
+#ifdef CONFIG_SCHED_SMT
        cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
        group = cpumask_first(mask);
+#else
+        group = cpu;
+#endif
        if (sg)
                *sg = &per_cpu(sched_group_core, group).sg;
        return group;
 }
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
 static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
-                  struct sched_group **sg, struct cpumask *unused)
+                  struct sched_group **sg, struct cpumask *mask)
 {
+        int group = cpu;
+#ifdef CONFIG_SCHED_MC
+        cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+        group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+        cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+        group = cpumask_first(mask);
+#endif
        if (sg)
-                *sg = &per_cpu(sched_group_core, cpu).sg;
+                *sg = &per_cpu(sched_group_book, group).sg;
-        return cpu;
+        return group;
 }
-#endif
+#endif /* CONFIG_SCHED_BOOK */
 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6594,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
                  struct sched_group **sg, struct cpumask *mask)
 {
        int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+        cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+        group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
        cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
        group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
@@ -6855,6 +7025,9 @@ SD_INIT_FUNC(CPU)
 #ifdef CONFIG_SCHED_MC
 SD_INIT_FUNC(MC)
 #endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
 static int default_relax_domain_level = -1;
@@ -6904,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
                free_cpumask_var(d->tmpmask); /* fall through */
        case sa_send_covered:
                free_cpumask_var(d->send_covered); /* fall through */
+        case sa_this_book_map:
+                free_cpumask_var(d->this_book_map); /* fall through */
        case sa_this_core_map:
                free_cpumask_var(d->this_core_map); /* fall through */
        case sa_this_sibling_map:
@@ -6950,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
                return sa_nodemask;
        if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
                return sa_this_sibling_map;
-        if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+        if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
                return sa_this_core_map;
+        if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+                return sa_this_book_map;
        if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
                return sa_send_covered;
        d->rd = alloc_rootdomain();
@@ -7009,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
        return sd;
 }
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+        const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+        struct sched_domain *parent, int i)
+{
+        struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+        sd = &per_cpu(book_domains, i).sd;
+        SD_INIT(sd, BOOK);
+        set_domain_attribute(sd, attr);
+        cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+        sd->parent = parent;
+        parent->child = sd;
+        cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+        return sd;
+}
 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
        const struct cpumask *cpu_map, struct sched_domain_attr *attr,
        struct sched_domain *parent, int i)
@@ -7066,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
                                                d->send_covered, d->tmpmask);
                break;
 #endif
+#ifdef CONFIG_SCHED_BOOK
+        case SD_LV_BOOK: /* set up book groups */
+                cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+                if (cpu == cpumask_first(d->this_book_map))
+                        init_sched_build_groups(d->this_book_map, cpu_map,
+                                                &cpu_to_book_group,
+                                                d->send_covered, d->tmpmask);
+                break;
+#endif
        case SD_LV_CPU: /* set up physical groups */
                cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
                if (!cpumask_empty(d->nodemask))
@@ -7113,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
                sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+                sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
                sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
                sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
        }
        for_each_cpu(i, cpu_map) {
                build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+                build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
                build_sched_groups(&d, SD_LV_MC, cpu_map, i);
        }
@@ -7149,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                init_sched_groups_power(i, sd);
        }
 #endif
+#ifdef CONFIG_SCHED_BOOK
+        for_each_cpu(i, cpu_map) {
+                sd = &per_cpu(book_domains, i).sd;
+                init_sched_groups_power(i, sd);
+        }
+#endif
        for_each_cpu(i, cpu_map) {
                sd = &per_cpu(phys_domains, i).sd;
@@ -7174,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
                sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+                sd = &per_cpu(book_domains, i).sd;
 #else
                sd = &per_cpu(phys_domains, i).sd;
 #endif
@@ -8078,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        return 1;
- err_free_rq:
+err_free_rq:
        kfree(cfs_rq);
- err:
+err:
        return 0;
 }
@@ -8168,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
        return 1;
- err_free_rq:
+err_free_rq:
        kfree(rt_rq);
- err:
+err:
        return 0;
 }
@@ -8528,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
                raw_spin_unlock(&rt_rq->rt_runtime_lock);
        }
        raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
        read_unlock(&tasklist_lock);
        mutex_unlock(&rt_constraints_mutex);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674ca49d..933f3d1b62ea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
 /*
 * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
 /*
 * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
 unsigned int sysctl_sched_min_granularity = 750000ULL;
 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 static void update_curr(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *curr = cfs_rq->curr;
-        u64 now = rq_of(cfs_rq)->clock;
+        u64 now = rq_of(cfs_rq)->clock_task;
        unsigned long delta_exec;
        if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
        /*
         * We are starting a new run period:
         */
-        se->exec_start = rq_of(cfs_rq)->clock;
+        se->exec_start = rq_of(cfs_rq)->clock_task;
 }
 /**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
        set_task_cpu(p, this_cpu);
        activate_task(this_rq, p, 0);
        check_preempt_curr(this_rq, p, 0);
+        /* re-arm NEWIDLE balancing when moving tasks */
+        src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+        this_rq->idle_stamp = 0;
 }
 /*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         * 2) too many balance attempts have failed.
         */
-        tsk_cache_hot = task_hot(p, rq->clock, sd);
+        tsk_cache_hot = task_hot(p, rq->clock_task, sd);
        if (!tsk_cache_hot ||
                sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
        unsigned long this_load;
        unsigned long this_load_per_task;
        unsigned long this_nr_running;
+        unsigned long this_has_capacity;
        /* Statistics of the busiest group */
        unsigned long max_load;
        unsigned long busiest_load_per_task;
        unsigned long busiest_nr_running;
        unsigned long busiest_group_capacity;
+        unsigned long busiest_has_capacity;
        int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
        unsigned long group_capacity;
        int group_imb; /* Is there an imbalance in the group ? */
+        int group_has_capacity; /* Is there extra capacity in the group? */
 };
 /**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
        u64 total, available;
        total = sched_avg_period() + (rq->clock - rq->age_stamp);
-        available = total - rq->rt_avg;
+        if (unlikely(total < rq->rt_avg)) {
+                /* Ensures that power won't end up being negative */
+                available = 0;
+        } else {
+                available = total - rq->rt_avg;
+        }
        if (unlikely((s64)total < SCHED_LOAD_SCALE))
                total = SCHED_LOAD_SCALE;
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                        int local_group, const struct cpumask *cpus,
                        int *balance, struct sg_lb_stats *sgs)
 {
-        unsigned long load, max_cpu_load, min_cpu_load;
+        unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
        int i;
        unsigned int balance_cpu = -1, first_idle_cpu = 0;
        unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
        /* Tally up the load of all CPUs in the group */
        max_cpu_load = 0;
        min_cpu_load = ~0UL;
+        max_nr_running = 0;
        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
                struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                        load = target_load(i, load_idx);
                } else {
                        load = source_load(i, load_idx);
-                        if (load > max_cpu_load)
+                        if (load > max_cpu_load) {
                                max_cpu_load = load;
+                                max_nr_running = rq->nr_running;
+                        }
                        if (min_cpu_load > load)
                                min_cpu_load = load;
                }
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
        if (sgs->sum_nr_running)
                avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
-        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
                sgs->group_imb = 1;
-        sgs->group_capacity =
+        sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
-                DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
        if (!sgs->group_capacity)
                sgs->group_capacity = fix_small_capacity(sd, group);
+        if (sgs->group_capacity > sgs->sum_nr_running)
+                sgs->group_has_capacity = 1;
 }
 /**
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                /*
                 * In case the child domain prefers tasks go to siblings
                 * first, lower the sg capacity to one so that we'll try
-                 * and move all the excess tasks away.
+                 * and move all the excess tasks away. We lower the capacity
+                 * of a group only if the local group has the capacity to fit
+                 * these excess tasks, i.e. nr_running < group_capacity. The
+                 * extra check prevents the case where you always pull from the
+                 * heaviest group when it is already under-utilized (possible
+                 * when a large weight task outweighs the tasks on the system).
                 */
-                if (prefer_sibling)
+                if (prefer_sibling && !local_group && sds->this_has_capacity)
                        sgs.group_capacity = min(sgs.group_capacity, 1UL);
                if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                        sds->this = sg;
                        sds->this_nr_running = sgs.sum_nr_running;
                        sds->this_load_per_task = sgs.sum_weighted_load;
+                        sds->this_has_capacity = sgs.group_has_capacity;
                } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                        sds->max_load = sgs.avg_load;
                        sds->busiest = sg;
                        sds->busiest_nr_running = sgs.sum_nr_running;
                        sds->busiest_group_capacity = sgs.group_capacity;
                        sds->busiest_load_per_task = sgs.sum_weighted_load;
+                        sds->busiest_has_capacity = sgs.group_has_capacity;
                        sds->group_imb = sgs.group_imb;
                }
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
                return fix_small_imbalance(sds, this_cpu, imbalance);
 }
 /******* find_busiest_group() helpers end here *********************/
 /**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         * 4) This group is more busy than the avg busieness at this
         *    sched_domain.
         * 5) The imbalance is within the specified limit.
+         *
+         * Note: when doing newidle balance, if the local group has excess
+         * capacity (i.e. nr_running < group_capacity) and the busiest group
+         * does not have any capacity, we force a load balance to pull tasks
+         * to the local group. In this case, we skip past checks 3, 4 and 5.
         */
        if (!(*balance))
                goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        if (!sds.busiest || sds.busiest_nr_running == 0)
                goto out_balanced;
+        /*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+        if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+                        !sds.busiest_has_capacity)
+                goto force_balance;
        if (sds.this_load >= sds.max_load)
                goto out_balanced;
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
                goto out_balanced;
+force_balance:
        /* Looks like there is an imbalance. Compute it */
        calculate_imbalance(&sds, this_cpu, imbalance);
        return sds.busiest;
@@ -3031,7 +3068,14 @@ redo:
        if (!ld_moved) {
                schedstat_inc(sd, lb_failed[idle]);
-                sd->nr_balance_failed++;
+                /*
+                 * Increment the failure counter only on periodic balance.
+                 * We do not want newidle balance, which can be very
+                 * frequent, to pollute the failure counter, causing
+                 * excessive cache_hot migrations and active balances.
+                 */
+                if (idle != CPU_NEWLY_IDLE)
+                        sd->nr_balance_failed++;
                if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
                                        this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                interval = msecs_to_jiffies(sd->balance_interval);
                if (time_after(next_balance, sd->last_balance + interval))
                        next_balance = sd->last_balance + interval;
-                if (pulled_task) {
+                if (pulled_task)
-                        this_rq->idle_stamp = 0;
                        break;
-                }
        }
        raw_spin_lock(&this_rq->lock);
@@ -3751,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p)
        update_rq_clock(rq);
-        if (unlikely(task_cpu(p) != this_cpu))
+        if (unlikely(task_cpu(p) != this_cpu)) {
+                rcu_read_lock();
                __set_task_cpu(p, this_cpu);
+                rcu_read_unlock();
+        }
        update_curr(cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..185f920ec1a2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
 * release the lock. Decreases scheduling overhead.
 */
 SCHED_FEAT(OWNER_SPIN, 1)
+/*
+ * Decrement CPU power based on irq activity
+ */
+SCHED_FEAT(NONIRQ_POWER, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d10c80ebb67a..bea7d79f7e9c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
        if (!task_has_rt_policy(curr))
                return;
-        delta_exec = rq->clock - curr->se.exec_start;
+        delta_exec = rq->clock_task - curr->se.exec_start;
        if (unlikely((s64)delta_exec < 0))
                delta_exec = 0;
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
        curr->se.sum_exec_runtime += delta_exec;
        account_group_exec_runtime(curr, delta_exec);
-        curr->se.exec_start = rq->clock;
+        curr->se.exec_start = rq->clock_task;
        cpuacct_charge(curr, delta_exec);
        sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
         * runqueue. Otherwise simply start this RT task
         * on its current runqueue.
         *
-         * We want to avoid overloading runqueues. Even if
+         * We want to avoid overloading runqueues. If the woken
-         * the RT task is of higher priority than the current RT task.
+         * task is a higher priority, then it will stay on this CPU
-         * RT tasks behave differently than other tasks. If
+         * and the lower prio task should be moved to another CPU.
-         * one gets preempted, we try to push it off to another queue.
+         * Even though this will probably make the lower prio task
-         * So trying to keep a preempting RT task on the same
+         * lose its cache, we do not want to bounce a higher task
-         * cache hot CPU will force the running RT task to
+         * around just because it gave up its CPU, perhaps for a
-         * a cold CPU. So we waste all the cache for the lower
+         * lock?
-         * RT task in hopes of saving some of a RT task
+         *
-         * that is just being woken and probably will have
+         * For equal prio tasks, we just let the scheduler sort it out.
-         * cold cache anyway.
         */
        if (unlikely(rt_task(rq->curr)) &&
+            (rq->curr->rt.nr_cpus_allowed < 2 ||
+             rq->curr->prio < p->prio) &&
            (p->rt.nr_cpus_allowed > 1)) {
                int cpu = find_lowest_rq(p);
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
        } while (rt_rq);
        p = rt_task_of(rt_se);
-        p->se.exec_start = rq->clock;
+        p->se.exec_start = rq->clock_task;
        return p;
 }
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
        for_each_leaf_rt_rq(rt_rq, rq) {
                array = &rt_rq->active;
                idx = sched_find_first_bit(array->bitmap);
- next_idx:
+next_idx:
                if (idx >= MAX_RT_PRIO)
                        continue;
                if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
        if (!next_task)
                return 0;
- retry:
+retry:
        if (unlikely(next_task == rq->curr)) {
                WARN_ON(1);
                return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
                         * but possible)
                         */
                }
- skip:
+skip:
                double_unlock_balance(this_rq, src_rq);
        }
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
        if (!task_running(rq, p) &&
            !test_tsk_need_resched(rq->curr) &&
            has_pushable_tasks(rq) &&
-            p->rt.nr_cpus_allowed > 1)
+            p->rt.nr_cpus_allowed > 1 &&
+            rt_task(rq->curr) &&
+            (rq->curr->rt.nr_cpus_allowed < 2 ||
+             rq->curr->prio < p->prio))
                push_rt_tasks(rq);
 }
@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
 {
        struct task_struct *p = rq->curr;
-        p->se.exec_start = rq->clock;
+        p->se.exec_start = rq->clock_task;
        /* The running task is never eligible for pushing */
        dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 000000000000..45bddc0c1048
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
+/*
+ * stop-task scheduling class.
+ *
+ * The stop task is the highest priority task in the system, it preempts
+ * everything and will be preempted by nothing.
+ *
+ * See kernel/stop_machine.c
+ */
+#ifdef CONFIG_SMP
+static int
+select_task_rq_stop(struct rq *rq, struct task_struct *p,
+                    int sd_flag, int flags)
+{
+        return task_cpu(p); /* stop tasks never migrate */
+}
+#endif /* CONFIG_SMP */
+static void
+check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+        resched_task(rq->curr); /* we preempt everything */
+}
+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+        struct task_struct *stop = rq->stop;
+        if (stop && stop->state == TASK_RUNNING)
+                return stop;
+        return NULL;
+}
+static void
+enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+static void
+dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+static void yield_task_stop(struct rq *rq)
+{
+        BUG(); /* the stop task should never yield, it's pointless. */
+}
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+{
+}
+static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+static void set_curr_task_stop(struct rq *rq)
+{
+}
+static void switched_to_stop(struct rq *rq, struct task_struct *p,
+                             int running)
+{
+        BUG(); /* it's impossible to change to this class */
+}
+static void prio_changed_stop(struct rq *rq, struct task_struct *p,
+                              int oldprio, int running)
+{
+        BUG(); /* how!?, what priority? */
+}
+static unsigned int
+get_rr_interval_stop(struct rq *rq, struct task_struct *task)
+{
+        return 0;
+}
+/*
+ * Simple, special scheduling class for the per-CPU stop tasks:
+ */
+static const struct sched_class stop_sched_class = {
+        .next                   = &rt_sched_class,
+        .enqueue_task           = enqueue_task_stop,
+        .dequeue_task           = dequeue_task_stop,
+        .yield_task             = yield_task_stop,
+        .check_preempt_curr     = check_preempt_curr_stop,
+        .pick_next_task         = pick_next_task_stop,
+        .put_prev_task          = put_prev_task_stop,
+#ifdef CONFIG_SMP
+        .select_task_rq         = select_task_rq_stop,
+#endif
+        .set_curr_task          = set_curr_task_stop,
+        .task_tick              = task_tick_stop,
+        .get_rr_interval        = get_rr_interval_stop,
+        .prio_changed           = prio_changed_stop,
+        .switched_to            = switched_to_stop,
+        /* no .task_new for stop tasks */
+};
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..e33fd71ed66a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
 }
 /*
+ * preempt_count and SOFTIRQ_OFFSET usage:
+ * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
+ *   softirq processing.
+ * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ *   on local_bh_disable or local_bh_enable.
+ * This lets us distinguish between whether we are currently processing
+ * softirq and whether we just have bh disabled.
+ */
+/*
 * This one is for softirq.c-internal use,
 * where hardirqs are disabled legitimately:
 */
 #ifdef CONFIG_TRACE_IRQFLAGS
-static void __local_bh_disable(unsigned long ip)
+static void __local_bh_disable(unsigned long ip, unsigned int cnt)
 {
        unsigned long flags;
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
         * We must manually increment preempt_count here and manually
         * call the trace_preempt_off later.
         */
-        preempt_count() += SOFTIRQ_OFFSET;
+        preempt_count() += cnt;
        /*
         * Were softirqs turned off above:
         */
-        if (softirq_count() == SOFTIRQ_OFFSET)
+        if (softirq_count() == cnt)
                trace_softirqs_off(ip);
        raw_local_irq_restore(flags);
-        if (preempt_count() == SOFTIRQ_OFFSET)
+        if (preempt_count() == cnt)
                trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 #else /* !CONFIG_TRACE_IRQFLAGS */
-static inline void __local_bh_disable(unsigned long ip)
+static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
 {
-        add_preempt_count(SOFTIRQ_OFFSET);
+        add_preempt_count(cnt);
        barrier();
 }
 #endif /* CONFIG_TRACE_IRQFLAGS */
 void local_bh_disable(void)
 {
-        __local_bh_disable((unsigned long)__builtin_return_address(0));
+        __local_bh_disable((unsigned long)__builtin_return_address(0),
+                                SOFTIRQ_DISABLE_OFFSET);
 }
 EXPORT_SYMBOL(local_bh_disable);
+static void __local_bh_enable(unsigned int cnt)
+{
+        WARN_ON_ONCE(in_irq());
+        WARN_ON_ONCE(!irqs_disabled());
+        if (softirq_count() == cnt)
+                trace_softirqs_on((unsigned long)__builtin_return_address(0));
+        sub_preempt_count(cnt);
+}
 /*
 * Special-case - softirqs can safely be enabled in
 * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
 */
 void _local_bh_enable(void)
 {
-        WARN_ON_ONCE(in_irq());
+        __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
-        WARN_ON_ONCE(!irqs_disabled());
-        if (softirq_count() == SOFTIRQ_OFFSET)
-                trace_softirqs_on((unsigned long)__builtin_return_address(0));
-        sub_preempt_count(SOFTIRQ_OFFSET);
 }
 EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
        /*
         * Are softirqs going to be turned on now:
         */
-        if (softirq_count() == SOFTIRQ_OFFSET)
+        if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
                trace_softirqs_on(ip);
        /*
         * Keep preemption disabled until we are done with
         * softirq processing:
         */
-        sub_preempt_count(SOFTIRQ_OFFSET - 1);
+        sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
        if (unlikely(!in_interrupt() && local_softirq_pending()))
                do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
        pending = local_softirq_pending();
        account_system_vtime(current);
-        __local_bh_disable((unsigned long)__builtin_return_address(0));
+        __local_bh_disable((unsigned long)__builtin_return_address(0),
+                                SOFTIRQ_OFFSET);
        lockdep_softirq_enter();
        cpu = smp_processor_id();
@@ -212,18 +229,20 @@ restart:
        do {
                if (pending & 1) {
+                        unsigned int vec_nr = h - softirq_vec;
                        int prev_count = preempt_count();
-                        kstat_incr_softirqs_this_cpu(h - softirq_vec);
-                        trace_softirq_entry(h, softirq_vec);
+                        kstat_incr_softirqs_this_cpu(vec_nr);
+                        trace_softirq_entry(vec_nr);
                        h->action(h);
-                        trace_softirq_exit(h, softirq_vec);
+                        trace_softirq_exit(vec_nr);
                        if (unlikely(prev_count != preempt_count())) {
-                                printk(KERN_ERR "huh, entered softirq %td %s %p"
+                                printk(KERN_ERR "huh, entered softirq %u %s %p"
                                       "with preempt_count %08x,"
-                                       " exited with %08x?\n", h - softirq_vec,
+                                       " exited with %08x?\n", vec_nr,
-                                       softirq_to_name[h - softirq_vec],
+                                       softirq_to_name[vec_nr], h->action,
-                                       h->action, prev_count, preempt_count());
+                                       prev_count, preempt_count());
                                preempt_count() = prev_count;
                        }
@@ -245,7 +264,7 @@ restart:
        lockdep_softirq_exit();
        account_system_vtime(current);
-        _local_bh_enable();
+        __local_bh_enable(SOFTIRQ_OFFSET);
 }
 #ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +298,16 @@ void irq_enter(void)
        rcu_irq_enter();
        if (idle_cpu(cpu) && !in_interrupt()) {
-                __irq_enter();
+                /*
+                 * Prevent raise_softirq from needlessly waking up ksoftirqd
+                 * here, as softirq will be serviced on return from interrupt.
+                 */
+                local_bh_disable();
                tick_check_idle(cpu);
-        } else
+                _local_bh_enable();
-                __irq_enter();
+        }
+        __irq_enter();
 }
 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +721,7 @@ static int run_ksoftirqd(void * __bind_cpu)
 {
        set_current_state(TASK_INTERRUPTIBLE);
+        current->flags |= PF_KSOFTIRQD;
        while (!kthread_should_stop()) {
                preempt_disable();
                if (!local_softirq_pending()) {
@@ -886,17 +912,14 @@ int __init __weak early_irq_init(void)
        return 0;
 }
+#ifdef CONFIG_GENERIC_HARDIRQS
 int __init __weak arch_probe_nr_irqs(void)
 {
-        return 0;
+        return NR_IRQS_LEGACY;
 }
 int __init __weak arch_early_irq_init(void)
 {
        return 0;
 }
+#endif
-int __weak arch_init_chip_data(struct irq_desc *desc, int node)
-{
-        return 0;
-}
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd509..c71e07500536 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
 int __init_srcu_struct(struct srcu_struct *sp, const char *name,
                       struct lock_class_key *key)
 {
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Don't re-initialize a lock while it is held. */
        debug_check_no_locks_freed((void *)sp, sizeof(*sp));
        lockdep_init_map(&sp->dep_map, name, key, 0);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
        return init_srcu_struct_fields(sp);
 }
 EXPORT_SYMBOL_GPL(__init_srcu_struct);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4372ccb25127..090c28812ce1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -287,11 +287,12 @@ repeat:
        goto repeat;
 }
+extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
 static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
                                           unsigned long action, void *hcpu)
 {
-        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
        unsigned int cpu = (unsigned long)hcpu;
        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
        struct task_struct *p;
@@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
                                   cpu);
                if (IS_ERR(p))
                        return NOTIFY_BAD;
-                sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
                get_task_struct(p);
+                kthread_bind(p, cpu);
+                sched_set_stop_task(cpu, p);
                stopper->thread = p;
                break;
        case CPU_ONLINE:
-                kthread_bind(stopper->thread, cpu);
                /* strictly unnecessary, as first user will wake it */
                wake_up_process(stopper->thread);
                /* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
        {
                struct cpu_stop_work *work;
+                sched_set_stop_task(cpu, NULL);
                /* kill the stopper */
                kthread_stop(stopper->thread);
                /* drain remaining works */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bad369ec5403..c782fe9924c7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -50,6 +50,7 @@ cond_syscall(compat_sys_sendmsg);
 cond_syscall(sys_recvmsg);
 cond_syscall(sys_recvmmsg);
 cond_syscall(compat_sys_recvmsg);
+cond_syscall(compat_sys_recv);
 cond_syscall(compat_sys_recvfrom);
 cond_syscall(compat_sys_recvmmsg);
 cond_syscall(sys_socketcall);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 4f104515a19b..f8b11a283171 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
        int ret;
        struct kprobe *kps[2] = {&kp, &kp2};
-        kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+        /* addr and flags should be cleared for reusing kprobe. */
+        kp.addr = NULL;
+        kp.flags = 0;
        ret = register_kprobes(kps, 2);
        if (ret < 0) {
                printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
        int ret;
        struct jprobe *jps[2] = {&jp, &jp2};
-        jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+        /* addr and flags should be cleared for reusing kprobe. */
+        jp.kp.addr = NULL;
+        jp.kp.flags = 0;
        ret = register_jprobes(jps, 2);
        if (ret < 0) {
                printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
        int ret;
        struct kretprobe *rps[2] = {&rp, &rp2};
-        rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+        /* addr and flags should be cleared for reusing kprobe. */
+        rp.kp.addr = NULL;
+        rp.kp.flags = 0;
        ret = register_kretprobes(rps, 2);
        if (ret < 0) {
                printk(KERN_ERR "Kprobe smoke test failed: "
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c63116863a80..d2321891538f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -149,10 +149,18 @@ static void ntp_update_offset(long offset)
        time_reftime = get_seconds();
        offset64    = offset;
-        freq_adj    = (offset64 * secs) <<
+        freq_adj    = ntp_update_offset_fll(offset64, secs);
-                        (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
-        freq_adj    += ntp_update_offset_fll(offset64, secs);
+        /*
+         * Clamp update interval to reduce PLL gain with low
+         * sampling rate (e.g. intermittent network connection)
+         * to avoid instability.
+         */
+        if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
+                secs = 1 << (SHIFT_PLL + 1 + time_constant);
+        freq_adj    += (offset64 * secs) <<
+                        (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
        freq_adj    = min(freq_adj + time_freq, MAXFREQ_SCALED);
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade7..68a9ae7679b7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick)
        run_local_timers();
        rcu_check_callbacks(cpu, user_tick);
        printk_tick();
-        perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+        if (in_irq())
+                irq_work_run();
+#endif
        scheduler_tick();
        run_posix_cpu_timers(p);
 }
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 538501c6ea50..e04b8bcdef88 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
        help
          See Documentation/trace/ftrace-design.txt
+config HAVE_C_RECORDMCOUNT
+        bool
+        help
+          C version of recordmcount available?
 config TRACER_MAX_TRACE
        bool
@@ -121,7 +126,7 @@ if FTRACE
 config FUNCTION_TRACER
        bool "Kernel Function Tracer"
        depends on HAVE_FUNCTION_TRACER
-        select FRAME_POINTER
+        select FRAME_POINTER if (!ARM_UNWIND)
        select KALLSYMS
        select GENERIC_TRACER
        select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa7ece649fe1..ebd80d50c474 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -884,10 +884,8 @@ enum {
        FTRACE_ENABLE_CALLS             = (1 << 0),
        FTRACE_DISABLE_CALLS            = (1 << 1),
        FTRACE_UPDATE_TRACE_FUNC        = (1 << 2),
-        FTRACE_ENABLE_MCOUNT            = (1 << 3),
+        FTRACE_START_FUNC_RET           = (1 << 3),
-        FTRACE_DISABLE_MCOUNT           = (1 << 4),
+        FTRACE_STOP_FUNC_RET            = (1 << 4),
-        FTRACE_START_FUNC_RET           = (1 << 5),
-        FTRACE_STOP_FUNC_RET            = (1 << 6),
 };
 static int ftrace_filtered;
@@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command)
 static void ftrace_startup_sysctl(void)
 {
-        int command = FTRACE_ENABLE_MCOUNT;
        if (unlikely(ftrace_disabled))
                return;
@@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void)
        saved_ftrace_func = NULL;
        /* ftrace_start_up is true if we want ftrace running */
        if (ftrace_start_up)
-                command |= FTRACE_ENABLE_CALLS;
+                ftrace_run_update_code(FTRACE_ENABLE_CALLS);
-        ftrace_run_update_code(command);
 }
 static void ftrace_shutdown_sysctl(void)
 {
-        int command = FTRACE_DISABLE_MCOUNT;
        if (unlikely(ftrace_disabled))
                return;
        /* ftrace_start_up is true if ftrace is running */
        if (ftrace_start_up)
-                command |= FTRACE_DISABLE_CALLS;
+                ftrace_run_update_code(FTRACE_DISABLE_CALLS);
-        ftrace_run_update_code(command);
 }
 static cycle_t          ftrace_update_time;
@@ -1368,24 +1358,29 @@ enum {
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
 struct ftrace_iterator {
-        struct ftrace_page      *pg;
+        loff_t                          pos;
-        int                     hidx;
+        loff_t                          func_pos;
-        int                     idx;
+        struct ftrace_page              *pg;
-        unsigned                flags;
+        struct dyn_ftrace               *func;
-        struct trace_parser     parser;
+        struct ftrace_func_probe        *probe;
+        struct trace_parser             parser;
+        int                             hidx;
+        int                             idx;
+        unsigned                        flags;
 };
 static void *
-t_hash_next(struct seq_file *m, void *v, loff_t *pos)
+t_hash_next(struct seq_file *m, loff_t *pos)
 {
        struct ftrace_iterator *iter = m->private;
-        struct hlist_node *hnd = v;
+        struct hlist_node *hnd = NULL;
        struct hlist_head *hhd;
-        WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
        (*pos)++;
+        iter->pos = *pos;
+        if (iter->probe)
+                hnd = &iter->probe->node;
 retry:
        if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
                return NULL;
@@ -1408,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
                }
        }
-        return hnd;
+        if (WARN_ON_ONCE(!hnd))
+                return NULL;
+        iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
+        return iter;
 }
 static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1417,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
        void *p = NULL;
        loff_t l;
-        if (!(iter->flags & FTRACE_ITER_HASH))
+        if (iter->func_pos > *pos)
-                *pos = 0;
+                return NULL;
-        iter->flags |= FTRACE_ITER_HASH;
        iter->hidx = 0;
-        for (l = 0; l <= *pos; ) {
+        for (l = 0; l <= (*pos - iter->func_pos); ) {
-                p = t_hash_next(m, p, &l);
+                p = t_hash_next(m, &l);
                if (!p)
                        break;
        }
-        return p;
+        if (!p)
+                return NULL;
+        /* Only set this if we have an item */
+        iter->flags |= FTRACE_ITER_HASH;
+        return iter;
 }
-static int t_hash_show(struct seq_file *m, void *v)
+static int
+t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
 {
        struct ftrace_func_probe *rec;
-        struct hlist_node *hnd = v;
-        rec = hlist_entry(hnd, struct ftrace_func_probe, node);
+        rec = iter->probe;
+        if (WARN_ON_ONCE(!rec))
+                return -EIO;
        if (rec->ops->print)
                return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1457,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
        struct dyn_ftrace *rec = NULL;
        if (iter->flags & FTRACE_ITER_HASH)
-                return t_hash_next(m, v, pos);
+                return t_hash_next(m, pos);
        (*pos)++;
+        iter->pos = *pos;
        if (iter->flags & FTRACE_ITER_PRINTALL)
-                return NULL;
+                return t_hash_start(m, pos);
 retry:
        if (iter->idx >= iter->pg->index) {
@@ -1491,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
                }
        }
-        return rec;
+        if (!rec)
+                return t_hash_start(m, pos);
+        iter->func_pos = *pos;
+        iter->func = rec;
+        return iter;
+}
+/*
+ * Rewind the read state of @iter: zero both recorded positions and drop
+ * the PRINTALL and HASH iteration flags.  t_start() calls this when the
+ * requested position is behind the last one served (i.e. after an lseek).
+ */
+static void reset_iter_read(struct ftrace_iterator *iter)
+{
+        iter->pos = 0;
+        iter->func_pos = 0;
+        /*
+         * The bits to clear must be OR'ed together.  AND'ing two different
+         * flag bits gives 0, which makes the mask ~0 and clears nothing.
+         */
+        iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
 }
 static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1502,6 +1522,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
        mutex_lock(&ftrace_lock);
        /*
+         * If an lseek was done, then reset and start from beginning.
+         */
+        if (*pos < iter->pos)
+                reset_iter_read(iter);
+        /*
         * For set_ftrace_filter reading, if we have the filter
         * off, we can short cut and just print out that all
         * functions are enabled.
@@ -1518,6 +1544,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
        if (iter->flags & FTRACE_ITER_HASH)
                return t_hash_start(m, pos);
+        /*
+         * Unfortunately, we need to restart at ftrace_pages_start
+         * every time we let go of the ftrace_mutex. This is because
+         * those pointers can change without the lock.
+         */
        iter->pg = ftrace_pages_start;
        iter->idx = 0;
        for (l = 0; l <= *pos; ) {
@@ -1526,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
                        break;
        }
-        if (!p && iter->flags & FTRACE_ITER_FILTER)
+        if (!p) {
-                return t_hash_start(m, pos);
+                if (iter->flags & FTRACE_ITER_FILTER)
+                        return t_hash_start(m, pos);
-        return p;
+                return NULL;
+        }
+        return iter;
 }
 static void t_stop(struct seq_file *m, void *p)
@@ -1540,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p)
 static int t_show(struct seq_file *m, void *v)
 {
        struct ftrace_iterator *iter = m->private;
-        struct dyn_ftrace *rec = v;
+        struct dyn_ftrace *rec;
        if (iter->flags & FTRACE_ITER_HASH)
-                return t_hash_show(m, v);
+                return t_hash_show(m, iter);
        if (iter->flags & FTRACE_ITER_PRINTALL) {
                seq_printf(m, "#### all functions enabled ####\n");
                return 0;
        }
+        rec = iter->func;
        if (!rec)
                return 0;
@@ -1601,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
        ret = ftrace_avail_open(inode, file);
        if (!ret) {
-                m = (struct seq_file *)file->private_data;
+                m = file->private_data;
-                iter = (struct ftrace_iterator *)m->private;
+                iter = m->private;
                iter->flags = FTRACE_ITER_FAILURES;
        }
@@ -2418,7 +2455,7 @@ static const struct file_operations ftrace_filter_fops = {
        .open = ftrace_filter_open,
        .read = seq_read,
        .write = ftrace_filter_write,
-        .llseek = no_llseek,
+        .llseek = ftrace_regex_lseek,
        .release = ftrace_filter_release,
 };
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ad25490f8b40..ec5c71005c14 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2615,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
+/*
+ * Number of entries currently held by a per-cpu buffer: everything ever
+ * written to it, less what has been lost to overwrite and what has
+ * already been read out.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
+{
+        unsigned long written = local_read(&cpu_buffer->entries);
+        unsigned long gone = local_read(&cpu_buffer->overrun) +
+                             cpu_buffer->read;
+        return written - gone;
+}
 /**
 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
 * @buffer: The ring buffer
@@ -2623,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 {
        struct ring_buffer_per_cpu *cpu_buffer;
-        unsigned long ret;
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
                return 0;
        cpu_buffer = buffer->buffers[cpu];
-        ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
-                - cpu_buffer->read;
-        return ret;
+        return rb_num_of_entries(cpu_buffer);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
@@ -2693,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
        /* if you care about this being correct, lock the buffer */
        for_each_buffer_cpu(buffer, cpu) {
                cpu_buffer = buffer->buffers[cpu];
-                entries += (local_read(&cpu_buffer->entries) -
+                entries += rb_num_of_entries(cpu_buffer);
-                            local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
        }
        return entries;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ec59f541156..82d9b8106cd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
 static int tracing_release(struct inode *inode, struct file *file)
 {
-        struct seq_file *m = (struct seq_file *)file->private_data;
+        struct seq_file *m = file->private_data;
        struct trace_iterator *iter;
        int cpu;
@@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu)
 {
        struct dentry *d_percpu = tracing_dentry_percpu();
        struct dentry *d_cpu;
-        /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
+        char cpu_dir[30]; /* 30 characters should be more than enough */
-        char cpu_dir[7];
-        if (cpu > 999 || cpu < 0)
+        snprintf(cpu_dir, 30, "cpu%ld", cpu);
-                return;
-        sprintf(cpu_dir, "cpu%ld", cpu);
        d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
        if (!d_cpu) {
                pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d39b3c5454a5..9021f8c0c0c3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
                    unsigned long ip,
                    unsigned long parent_ip,
                    unsigned long flags, int pc);
+void trace_graph_function(struct trace_array *tr,
+                    unsigned long ip,
+                    unsigned long parent_ip,
+                    unsigned long flags, int pc);
 void trace_default_header(struct seq_file *m);
 void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
 int trace_empty(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 31cc4cb0dbf2..39c059ca670e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
 #include <linux/kprobes.h>
 #include "trace.h"
-static char *perf_trace_buf[4];
+static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
 /*
 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -24,7 +24,7 @@ static int	total_ref_count;
 static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                                 struct perf_event *p_event)
 {
-        struct hlist_head *list;
+        struct hlist_head __percpu *list;
        int ret = -ENOMEM;
        int cpu;
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
        tp_event->perf_events = list;
        if (!total_ref_count) {
-                char *buf;
+                char __percpu *buf;
                int i;
-                for (i = 0; i < 4; i++) {
+                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
-                        buf = (char *)alloc_percpu(perf_trace_t);
+                        buf = (char __percpu *)alloc_percpu(perf_trace_t);
                        if (!buf)
                                goto fail;
@@ -65,7 +65,7 @@ fail:
        if (!total_ref_count) {
                int i;
-                for (i = 0; i < 4; i++) {
+                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
@@ -101,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
        return ret;
 }
-int perf_trace_enable(struct perf_event *p_event)
+int perf_trace_add(struct perf_event *p_event, int flags)
 {
        struct ftrace_event_call *tp_event = p_event->tp_event;
+        struct hlist_head __percpu *pcpu_list;
        struct hlist_head *list;
-        list = tp_event->perf_events;
+        pcpu_list = tp_event->perf_events;
-        if (WARN_ON_ONCE(!list))
+        if (WARN_ON_ONCE(!pcpu_list))
                return -EINVAL;
-        list = this_cpu_ptr(list);
+        if (!(flags & PERF_EF_START))
+                p_event->hw.state = PERF_HES_STOPPED;
+        list = this_cpu_ptr(pcpu_list);
        hlist_add_head_rcu(&p_event->hlist_entry, list);
        return 0;
 }
-void perf_trace_disable(struct perf_event *p_event)
+void perf_trace_del(struct perf_event *p_event, int flags)
 {
        hlist_del_rcu(&p_event->hlist_entry);
 }
@@ -142,7 +146,7 @@ void perf_trace_destroy(struct perf_event *p_event)
        tp_event->perf_events = NULL;
        if (!--total_ref_count) {
-                for (i = 0; i < 4; i++) {
+                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f146328..398c0e8b332c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,21 +600,29 @@ out:
 enum {
        FORMAT_HEADER           = 1,
-        FORMAT_PRINTFMT         = 2,
+        FORMAT_FIELD_SEPERATOR  = 2,
+        FORMAT_PRINTFMT         = 3,
 };
 static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 {
        struct ftrace_event_call *call = m->private;
        struct ftrace_event_field *field;
-        struct list_head *head;
+        struct list_head *common_head = &ftrace_common_fields;
+        struct list_head *head = trace_get_fields(call);
        (*pos)++;
        switch ((unsigned long)v) {
        case FORMAT_HEADER:
-                head = &ftrace_common_fields;
+                if (unlikely(list_empty(common_head)))
+                        return NULL;
+                field = list_entry(common_head->prev,
+                                   struct ftrace_event_field, link);
+                return field;
+        case FORMAT_FIELD_SEPERATOR:
                if (unlikely(list_empty(head)))
                        return NULL;
@@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
                return NULL;
        }
-        head = trace_get_fields(call);
-        /*
-         * To separate common fields from event fields, the
-         * LSB is set on the first event field. Clear it in case.
-         */
-        v = (void *)((unsigned long)v & ~1L);
        field = v;
-        /*
+        if (field->link.prev == common_head)
-         * If this is a common field, and at the end of the list, then
+                return (void *)FORMAT_FIELD_SEPERATOR;
-         * continue with main list.
+        else if (field->link.prev == head)
-         */
-        if (field->link.prev == &ftrace_common_fields) {
-                if (unlikely(list_empty(head)))
-                        return NULL;
-                field = list_entry(head->prev, struct ftrace_event_field, link);
-                /* Set the LSB to notify f_show to print an extra newline */
-                field = (struct ftrace_event_field *)
-                        ((unsigned long)field | 1);
-                return field;
-        }
-        /* If we are done tell f_show to print the format */
-        if (field->link.prev == head)
                return (void *)FORMAT_PRINTFMT;
        field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v)
                seq_printf(m, "format:\n");
                return 0;
+        case FORMAT_FIELD_SEPERATOR:
+                seq_putc(m, '\n');
+                return 0;
        case FORMAT_PRINTFMT:
                seq_printf(m, "\nprint fmt: %s\n",
                           call->print_fmt);
                return 0;
        }
-        /*
-         * To separate common fields from event fields, the
-         * LSB is set on the first event field. Clear it and
-         * print a newline if it is set.
-         */
-        if ((unsigned long)v & 1) {
-                seq_putc(m, '\n');
-                v = (void *)((unsigned long)v & ~1L);
-        }
        field = v;
        /*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6f233698518e..76b05980225c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
 #include "trace.h"
 #include "trace_output.h"
+/* When set, irq functions will be ignored */
+static int ftrace_graph_skip_irqs;
 struct fgraph_cpu_data {
        pid_t           last_pid;
        int             depth;
+        int             depth_irq;
        int             ignore;
        unsigned long   enter_funcs[FTRACE_RETFUNC_DEPTH];
 };
 struct fgraph_data {
-        struct fgraph_cpu_data          *cpu_data;
+        struct fgraph_cpu_data __percpu *cpu_data;
        /* Place to preserve last processed entry. */
        struct ftrace_graph_ent_entry   ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
 #define TRACE_GRAPH_PRINT_PROC          0x8
 #define TRACE_GRAPH_PRINT_DURATION      0x10
 #define TRACE_GRAPH_PRINT_ABS_TIME      0x20
+#define TRACE_GRAPH_PRINT_IRQS          0x40
 static struct tracer_opt trace_opts[] = {
        /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
        { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
        /* Display absolute time of an entry */
        { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
+        /* Display interrupts */
+        { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
        { } /* Empty entry */
 };
 static struct tracer_flags tracer_flags = {
        /* Don't display overruns and proc by default */
        .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
-               TRACE_GRAPH_PRINT_DURATION,
+               TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
        .opts = trace_opts
 };
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
        return 1;
 }
+/*
+ * Returns 0 while ftrace_graph_skip_irqs is clear; otherwise returns
+ * whatever in_irq() reports for the current context.
+ */
+static inline int ftrace_graph_ignore_irqs(void)
+{
+        return ftrace_graph_skip_irqs ? in_irq() : 0;
+}
 int trace_graph_entry(struct ftrace_graph_ent *trace)
 {
        struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
                return 0;
        /* trace it when it is-nested-in or is a function enabled. */
-        if (!(trace->depth || ftrace_graph_addr(trace->func)))
+        if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
+              ftrace_graph_ignore_irqs())
                return 0;
        local_irq_save(flags);
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
                return trace_graph_entry(trace);
 }
+/*
+ * Record a single function hit as a depth-0 graph entry immediately
+ * followed by a depth-0 graph return whose call time and return time
+ * are the same trace_clock_local() sample.
+ */
+static void
+__trace_graph_function(struct trace_array *tr,
+                unsigned long ip, unsigned long flags, int pc)
+{
+        struct ftrace_graph_ent entry_rec = { .func = ip, .depth = 0 };
+        struct ftrace_graph_ret exit_rec = { .func = ip, .depth = 0 };
+        /* One clock sample serves as both call time and return time. */
+        exit_rec.calltime = trace_clock_local();
+        exit_rec.rettime = exit_rec.calltime;
+        __trace_graph_entry(tr, &entry_rec, flags, pc);
+        __trace_graph_return(tr, &exit_rec, flags, pc);
+}
+/*
+ * Public wrapper around __trace_graph_function().  @parent_ip is accepted
+ * so the signature matches trace_function(), but it is not used here.
+ */
+void trace_graph_function(struct trace_array *tr,
+                          unsigned long ip,
+                          unsigned long parent_ip,
+                          unsigned long flags, int pc)
+{
+        __trace_graph_function(tr, ip, flags, pc);
+}
 void __trace_graph_return(struct trace_array *tr,
                                struct ftrace_graph_ret *trace,
                                unsigned long flags,
@@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
        /* Print nsecs (we don't want to exceed 7 numbers) */
        if (len < 7) {
-                snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
+                size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
-                         nsecs_rem);
+                snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
                ret = trace_seq_printf(s, ".%s", nsecs_str);
                if (!ret)
                        return TRACE_TYPE_PARTIAL_LINE;
@@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
        return 0;
 }
+/*
+ * Entry check for irq code
+ *
+ * Decides whether a function-entry record should be hidden from the
+ * output because it belongs to irq code, tracking that state in the
+ * per-cpu depth_irq field of the iterator's private data.
+ *
+ * returns 1 if
+ *  - we are inside irq code
+ *  - we just entered irq code (depth_irq is then set to @depth)
+ *
+ * returns 0 if
+ *  - the TRACE_GRAPH_PRINT_IRQS flag (funcgraph-irqs option) is set
+ *  - the iterator has no private data
+ *  - we are not inside irq code
+ */
+static int
+check_irq_entry(struct trace_iterator *iter, u32 flags,
+                unsigned long addr, int depth)
+{
+        int cpu = iter->cpu;
+        int *depth_irq;
+        struct fgraph_data *data = iter->private;
+        /*
+         * If we are either displaying irqs, or we got called as
+         * a graph event and private data does not exist,
+         * then we bypass the irq check.
+         */
+        if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+            (!data))
+                return 0;
+        depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+        /*
+         * We are inside the irq code (a non-negative depth_irq means
+         * an irq entry has been seen and not yet left).
+         */
+        if (*depth_irq >= 0)
+                return 1;
+        if ((addr < (unsigned long)__irqentry_text_start) ||
+            (addr >= (unsigned long)__irqentry_text_end))
+                return 0; /* addr is outside [__irqentry_text_start, __irqentry_text_end) */
+        /*
+         * We are entering irq code. Remember the depth of this entry
+         * so the matching return can be recognized.
+         */
+        *depth_irq = depth;
+        return 1;
+}
+/*
+ * Return check for irq code
+ *
+ * Decides whether a function-return record should be hidden from the
+ * output because it belongs to irq code, using the per-cpu depth_irq
+ * field of the iterator's private data.
+ *
+ * returns 1 if
+ *  - we are inside irq code
+ *  - we just left irq code (depth_irq is then reset to -1)
+ *
+ * returns 0 if
+ *  - the TRACE_GRAPH_PRINT_IRQS flag (funcgraph-irqs option) is set
+ *  - the iterator has no private data
+ *  - we are not inside irq code
+ */
+static int
+check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
+{
+        int cpu = iter->cpu;
+        int *depth_irq;
+        struct fgraph_data *data = iter->private;
+        /*
+         * If we are either displaying irqs, or we got called as
+         * a graph event and private data does not exist,
+         * then we bypass the irq check.
+         */
+        if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+            (!data))
+                return 0;
+        depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+        /*
+         * We are not inside the irq code (-1 is the "no irq entry
+         * recorded" marker).
+         */
+        if (*depth_irq == -1)
+                return 0;
+        /*
+         * We are inside the irq code, and this is the return of the
+         * irq entry itself. Let's not trace it and clear the entry
+         * depth, since we are out of irq code.
+         *
+         * This condition ensures that we 'leave the irq code' once
+         * we are out of the entry depth. Thus protecting us from
+         * the RETURN entry loss.
+         */
+        if (*depth_irq >= depth) {
+                *depth_irq = -1;
+                return 1;
+        }
+        /*
+         * We are inside the irq code, and this is not the entry.
+         */
+        return 1;
+}
 static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
                        struct trace_iterator *iter, u32 flags)
@@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
        static enum print_line_t ret;
        int cpu = iter->cpu;
+        if (check_irq_entry(iter, flags, call->func, call->depth))
+                return TRACE_TYPE_HANDLED;
        if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
                return TRACE_TYPE_PARTIAL_LINE;
@@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
        int ret;
        int i;
+        if (check_irq_return(iter, flags, trace->depth))
+                return TRACE_TYPE_HANDLED;
        if (data) {
                struct fgraph_cpu_data *cpu_data;
                int cpu = iter->cpu;
@@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
 enum print_line_t
-print_graph_function_flags(struct trace_iterator *iter, u32 flags)
+__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 {
        struct ftrace_graph_ent_entry *field;
        struct fgraph_data *data = iter->private;
@@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 static enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
-        return print_graph_function_flags(iter, tracer_flags.val);
+        return __print_graph_function_flags(iter, tracer_flags.val);
+}
+/*
+ * Print one function-graph line using caller-supplied @flags, with one
+ * extra flag forced on: TRACE_GRAPH_PRINT_DURATION when the latency
+ * format is active, TRACE_GRAPH_PRINT_ABS_TIME otherwise.
+ */
+enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
+                                             u32 flags)
+{
+        u32 forced = (trace_flags & TRACE_ITER_LATENCY_FMT) ?
+                        TRACE_GRAPH_PRINT_DURATION : TRACE_GRAPH_PRINT_ABS_TIME;
+        return __print_graph_function_flags(iter, flags | forced);
 }
 static enum print_line_t
@@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
        seq_printf(s, "#%.*s|||| /                     \n", size, spaces);
 }
-void print_graph_headers_flags(struct seq_file *s, u32 flags)
+static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
 {
        int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
@@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
        print_graph_headers_flags(s, tracer_flags.val);
 }
+/*
+ * Print the function-graph column headers using caller-supplied @flags.
+ * In latency format the generic trace header is printed first and the
+ * duration column is forced on; nothing at all is printed when the
+ * buffers are empty.  Otherwise the absolute-time column is forced on.
+ */
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
+{
+        struct trace_iterator *iter = s->private;
+        if (!(trace_flags & TRACE_ITER_LATENCY_FMT)) {
+                __print_graph_headers_flags(s, flags | TRACE_GRAPH_PRINT_ABS_TIME);
+                return;
+        }
+        /* print nothing if the buffers are empty */
+        if (trace_empty(iter))
+                return;
+        print_trace_header(s, iter);
+        __print_graph_headers_flags(s, flags | TRACE_GRAPH_PRINT_DURATION);
+}
 void graph_trace_open(struct trace_iterator *iter)
 {
        /* pid and depth on the last trace processed */
@@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter)
                pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
                int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+                int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
                *pid = -1;
                *depth = 0;
                *ignore = 0;
+                *depth_irq = -1;
        }
        iter->private = data;
@@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter)
        }
 }
+/*
+ * .set_flag callback of the graph_trace tracer.
+ *
+ * Only TRACE_GRAPH_PRINT_IRQS is acted upon: ftrace_graph_skip_irqs is set
+ * to the inverse of @set.  Any other @bit is left alone, @old_flags is not
+ * used, and the function always returns 0.
+ */
+static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
+{
+        if (bit == TRACE_GRAPH_PRINT_IRQS)
+                ftrace_graph_skip_irqs = !set;
+        return 0;
+}
 static struct trace_event_functions graph_functions = {
        .trace          = print_graph_function_event,
 };
@@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = {
        .print_line     = print_graph_function,
        .print_header   = print_graph_headers,
        .flags          = &tracer_flags,
+        .set_flag       = func_graph_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest       = trace_selftest_startup_function_graph,
 #endif
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 73a6b0601f2e..5cf8c602b880 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp	unsigned long max_sequence;
 #ifdef CONFIG_FUNCTION_TRACER
 /*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the preempt and irqs off function tracers.
+ *
+ * Returns 1 if it is OK to continue, and data->disabled is
+ *            incremented.
+ *         0 if the trace is to be ignored, and data->disabled
+ *            is kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
 */
-static void
+static int func_prolog_dec(struct trace_array *tr,
-irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+                           struct trace_array_cpu **data,
+                           unsigned long *flags)
 {
-        struct trace_array *tr = irqsoff_trace;
-        struct trace_array_cpu *data;
-        unsigned long flags;
        long disabled;
        int cpu;
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
         */
        cpu = raw_smp_processor_id();
        if (likely(!per_cpu(tracing_cpu, cpu)))
-                return;
+                return 0;
-        local_save_flags(flags);
+        local_save_flags(*flags);
        /* slight chance to get a false positive on tracing_cpu */
-        if (!irqs_disabled_flags(flags))
+        if (!irqs_disabled_flags(*flags))
-                return;
+                return 0;
-        data = tr->data[cpu];
+        *data = tr->data[cpu];
-        disabled = atomic_inc_return(&data->disabled);
+        disabled = atomic_inc_return(&(*data)->disabled);
        if (likely(disabled == 1))
-                trace_function(tr, ip, parent_ip, flags, preempt_count());
+                return 1;
+        atomic_dec(&(*data)->disabled);
+        return 0;
+}
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ *
+ * func_prolog_dec() decides whether this call should be recorded.  When it
+ * returns 0 nothing is traced; when it returns non-zero, @data and @flags
+ * have been filled in and data->disabled has been incremented, so it is
+ * decremented again once the function entry has been recorded.
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+        struct trace_array *tr = irqsoff_trace;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        if (!func_prolog_dec(tr, &data, &flags))
+                return;
+        trace_function(tr, ip, parent_ip, flags, preempt_count());
        atomic_dec(&data->disabled);
 }
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
        struct trace_array *tr = irqsoff_trace;
        struct trace_array_cpu *data;
        unsigned long flags;
-        long disabled;
        int ret;
-        int cpu;
        int pc;
-        cpu = raw_smp_processor_id();
+        if (!func_prolog_dec(tr, &data, &flags))
-        if (likely(!per_cpu(tracing_cpu, cpu)))
                return 0;
-        local_save_flags(flags);
+        pc = preempt_count();
-        /* slight chance to get a false positive on tracing_cpu */
+        ret = __trace_graph_entry(tr, trace, flags, pc);
-        if (!irqs_disabled_flags(flags))
-                return 0;
-        data = tr->data[cpu];
-        disabled = atomic_inc_return(&data->disabled);
-        if (likely(disabled == 1)) {
-                pc = preempt_count();
-                ret = __trace_graph_entry(tr, trace, flags, pc);
-        } else
-                ret = 0;
        atomic_dec(&data->disabled);
        return ret;
 }
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
        struct trace_array *tr = irqsoff_trace;
        struct trace_array_cpu *data;
        unsigned long flags;
-        long disabled;
-        int cpu;
        int pc;
-        cpu = raw_smp_processor_id();
+        if (!func_prolog_dec(tr, &data, &flags))
-        if (likely(!per_cpu(tracing_cpu, cpu)))
                return;
-        local_save_flags(flags);
+        pc = preempt_count();
-        /* slight chance to get a false positive on tracing_cpu */
+        __trace_graph_return(tr, trace, flags, pc);
-        if (!irqs_disabled_flags(flags))
-                return;
-        data = tr->data[cpu];
-        disabled = atomic_inc_return(&data->disabled);
-        if (likely(disabled == 1)) {
-                pc = preempt_count();
-                __trace_graph_return(tr, trace, flags, pc);
-        }
        atomic_dec(&data->disabled);
 }
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
 static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
 {
-        u32 flags = GRAPH_TRACER_FLAGS;
-        if (trace_flags & TRACE_ITER_LATENCY_FMT)
-                flags |= TRACE_GRAPH_PRINT_DURATION;
-        else
-                flags |= TRACE_GRAPH_PRINT_ABS_TIME;
        /*
         * In graph mode call the graph tracer output function,
         * otherwise go with the TRACE_FN event handler
         */
        if (is_graph())
-                return print_graph_function_flags(iter, flags);
+                return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
        return TRACE_TYPE_UNHANDLED;
 }
 static void irqsoff_print_header(struct seq_file *s)
 {
-        if (is_graph()) {
+        if (is_graph())
-                struct trace_iterator *iter = s->private;
+                print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
-                u32 flags = GRAPH_TRACER_FLAGS;
+        else
-                if (trace_flags & TRACE_ITER_LATENCY_FMT) {
-                        /* print nothing if the buffers are empty */
-                        if (trace_empty(iter))
-                                return;
-                        print_trace_header(s, iter);
-                        flags |= TRACE_GRAPH_PRINT_DURATION;
-                } else
-                        flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-                print_graph_headers_flags(s, flags);
-        } else
                trace_default_header(s);
 }
 static void
-trace_graph_function(struct trace_array *tr,
-                 unsigned long ip, unsigned long flags, int pc)
-{
-        u64 time = trace_clock_local();
-        struct ftrace_graph_ent ent = {
-                .func  = ip,
-                .depth = 0,
-        };
-        struct ftrace_graph_ret ret = {
-                .func     = ip,
-                .depth    = 0,
-                .calltime = time,
-                .rettime  = time,
-        };
-        __trace_graph_entry(tr, &ent, flags, pc);
-        __trace_graph_return(tr, &ret, flags, pc);
-}
-static void
 __trace_function(struct trace_array *tr,
                 unsigned long ip, unsigned long parent_ip,
                 unsigned long flags, int pc)
 {
-        if (!is_graph())
+        if (is_graph())
+                trace_graph_function(tr, ip, parent_ip, flags, pc);
+        else
                trace_function(tr, ip, parent_ip, flags, pc);
-        else {
-                trace_graph_function(tr, parent_ip, flags, pc);
-                trace_graph_function(tr, ip, flags, pc);
-        }
 }
 #else
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 544301d29dee..b8d2852baa4a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -648,7 +648,7 @@ static int register_trace_probe(struct trace_probe *tp)
        }
        ret = register_probe_event(tp);
        if (ret) {
-                pr_warning("Faild to register probe event(%d)\n", ret);
+                pr_warning("Failed to register probe event(%d)\n", ret);
                goto end;
        }
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4086eae6e81b..7319559ed59f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,48 +31,98 @@ static int			wakeup_rt;
 static arch_spinlock_t wakeup_lock =
        (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static void wakeup_reset(struct trace_array *tr);
 static void __wakeup_reset(struct trace_array *tr);
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
+static void wakeup_graph_return(struct ftrace_graph_ret *trace);
 static int save_lat_flag;
+#define TRACE_DISPLAY_GRAPH     1
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+        /* display latency trace as call graph */
+        { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+        { } /* Empty entry */
+};
+static struct tracer_flags tracer_flags = {
+        .val  = 0,
+        .opts = trace_opts,
+};
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
 #ifdef CONFIG_FUNCTION_TRACER
 /*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the wakeup function tracers.
+ *
+ * Returns 1 if it is OK to continue, and preemption
+ *            is disabled and data->disabled is incremented.
+ *         0 if the trace is to be ignored, and preemption
+ *            is not disabled and data->disabled is
+ *            kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
 */
-static void
+static int
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+func_prolog_preempt_disable(struct trace_array *tr,
+                            struct trace_array_cpu **data,
+                            int *pc)
 {
-        struct trace_array *tr = wakeup_trace;
-        struct trace_array_cpu *data;
-        unsigned long flags;
        long disabled;
        int cpu;
-        int pc;
        if (likely(!wakeup_task))
-                return;
+                return 0;
-        pc = preempt_count();
+        *pc = preempt_count();
        preempt_disable_notrace();
        cpu = raw_smp_processor_id();
        if (cpu != wakeup_current_cpu)
                goto out_enable;
-        data = tr->data[cpu];
+        *data = tr->data[cpu];
-        disabled = atomic_inc_return(&data->disabled);
+        disabled = atomic_inc_return(&(*data)->disabled);
        if (unlikely(disabled != 1))
                goto out;
-        local_irq_save(flags);
+        return 1;
-        trace_function(tr, ip, parent_ip, flags, pc);
+out:
+        atomic_dec(&(*data)->disabled);
+out_enable:
+        preempt_enable_notrace();
+        return 0;
+}
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+        struct trace_array *tr = wakeup_trace;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        int pc;
+        if (!func_prolog_preempt_disable(tr, &data, &pc))
+                return;
+        local_irq_save(flags);
+        trace_function(tr, ip, parent_ip, flags, pc);
        local_irq_restore(flags);
- out:
        atomic_dec(&data->disabled);
- out_enable:
        preempt_enable_notrace();
 }
@@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
 };
 #endif /* CONFIG_FUNCTION_TRACER */
+/*
+ * Register the wakeup tracer's function hooks.
+ *
+ * @graph: zero registers trace_ops with register_ftrace_function();
+ *         non-zero registers the wakeup_graph_return/wakeup_graph_entry
+ *         pair with register_ftrace_graph().
+ *
+ * tracer_enabled is set to 1 only when registration returned 0 and
+ * tracing_is_enabled() is true; in every other case it is cleared.
+ *
+ * Returns the value of the registration call.
+ */
+static int start_func_tracer(int graph)
+{
+        int ret;
+        if (!graph)
+                ret = register_ftrace_function(&trace_ops);
+        else
+                ret = register_ftrace_graph(&wakeup_graph_return,
+                                            &wakeup_graph_entry);
+        if (!ret && tracing_is_enabled())
+                tracer_enabled = 1;
+        else
+                tracer_enabled = 0;
+        return ret;
+}
+/*
+ * Counterpart of start_func_tracer(): clears tracer_enabled first, then
+ * unregisters trace_ops when @graph is zero, or the graph callbacks via
+ * unregister_ftrace_graph() when it is non-zero.
+ */
+static void stop_func_tracer(int graph)
+{
+        tracer_enabled = 0;
+        if (!graph)
+                unregister_ftrace_function(&trace_ops);
+        else
+                unregister_ftrace_graph();
+}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/*
+ * .set_flag callback of the wakeup tracers: switches between function and
+ * function-graph mode when the display-graph option is toggled.
+ *
+ * Returns -EINVAL when @bit does not contain TRACE_DISPLAY_GRAPH, 0 when
+ * the requested state equals the current is_graph() state, and otherwise
+ * the result of start_func_tracer() for the new mode.  @old_flags is not
+ * used.
+ */
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+        if (!(bit & TRACE_DISPLAY_GRAPH))
+                return -EINVAL;
+        /*
+         * Nothing to do if the mode does not change.
+         * NOTE(review): the XOR compare assumes @set is 0 or 1 - confirm
+         * against the caller.
+         */
+        if (!(is_graph() ^ set))
+                return 0;
+        /* !set is the mode being left (given the 0/1 assumption above) */
+        stop_func_tracer(!set);
+        wakeup_reset(wakeup_trace);
+        tracing_max_latency = 0;
+        return start_func_tracer(set);
+}
+/*
+ * Graph-entry callback registered by start_func_tracer() in graph mode.
+ *
+ * Returns 0 without recording anything when
+ * func_prolog_preempt_disable() rejects the call; otherwise returns the
+ * value of __trace_graph_entry().
+ */
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+        struct trace_array *tr = wakeup_trace;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        int pc, ret = 0;
+        if (!func_prolog_preempt_disable(tr, &data, &pc))
+                return 0;
+        local_save_flags(flags);
+        ret = __trace_graph_entry(tr, trace, flags, pc);
+        /* undo the prologue: it incremented ->disabled and disabled preemption */
+        atomic_dec(&data->disabled);
+        preempt_enable_notrace();
+        return ret;
+}
+/*
+ * Graph-return callback registered by start_func_tracer() in graph mode.
+ *
+ * Records the return through __trace_graph_return() unless
+ * func_prolog_preempt_disable() rejects the call.
+ */
+static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+{
+        struct trace_array *tr = wakeup_trace;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        int pc;
+        if (!func_prolog_preempt_disable(tr, &data, &pc))
+                return;
+        local_save_flags(flags);
+        __trace_graph_return(tr, trace, flags, pc);
+        /* undo the prologue: it incremented ->disabled and disabled preemption */
+        atomic_dec(&data->disabled);
+        preempt_enable_notrace();
+        return;
+}
+/*
+ * .open callback of the wakeup tracers: hands the iterator to
+ * graph_trace_open() only while graph display is selected.
+ */
+static void wakeup_trace_open(struct trace_iterator *iter)
+{
+        if (is_graph())
+                graph_trace_open(iter);
+}
+/*
+ * .close callback of the wakeup tracers: calls graph_trace_close() only
+ * when iter->private is set.
+ * NOTE(review): presumably iter->private is non-NULL only when
+ * graph_trace_open() ran for this iterator - confirm.
+ */
+static void wakeup_trace_close(struct trace_iterator *iter)
+{
+        if (iter->private)
+                graph_trace_close(iter);
+}
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
+/*
+ * .print_line callback of the wakeup tracers.
+ *
+ * Returns the result of print_graph_function_flags() with
+ * GRAPH_TRACER_FLAGS in graph mode, TRACE_TYPE_UNHANDLED otherwise.
+ */
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+        /*
+         * In graph mode call the graph tracer output function,
+         * otherwise go with the TRACE_FN event handler
+         */
+        if (is_graph())
+                return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+        return TRACE_TYPE_UNHANDLED;
+}
+/*
+ * .print_header callback of the wakeup tracers: graph headers (with
+ * GRAPH_TRACER_FLAGS) in graph mode, trace_default_header() otherwise.
+ */
+static void wakeup_print_header(struct seq_file *s)
+{
+        if (is_graph())
+                print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+        else
+                trace_default_header(s);
+}
+/*
+ * Record a function entry in the format matching the current display
+ * mode: trace_graph_function() in graph mode, trace_function() otherwise.
+ * All arguments are passed through unchanged.
+ */
+static void
+__trace_function(struct trace_array *tr,
+                 unsigned long ip, unsigned long parent_ip,
+                 unsigned long flags, int pc)
+{
+        if (is_graph())
+                trace_graph_function(tr, ip, parent_ip, flags, pc);
+        else
+                trace_function(tr, ip, parent_ip, flags, pc);
+}
+#else
+#define __trace_function trace_function
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+        return -EINVAL;
+}
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+        return -1;
+}
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+        return TRACE_TYPE_UNHANDLED;
+}
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
+static void wakeup_print_header(struct seq_file *s) { }
+static void wakeup_trace_open(struct trace_iterator *iter) { }
+static void wakeup_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 /*
 * Should this new latency be reported/recorded?
 */
@@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
        /* The task we are waiting for is waking up */
        data = wakeup_trace->data[wakeup_cpu];
-        trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
+        __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
        tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
        T0 = data->preempt_timestamp;
@@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
         * is not called by an assembly function  (where as schedule is)
         * it should be safe to use it here.
         */
-        trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+        __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
 out_locked:
        arch_spin_unlock(&wakeup_lock);
@@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
         */
        smp_wmb();
-        register_ftrace_function(&trace_ops);
+        if (start_func_tracer(is_graph()))
+                printk(KERN_ERR "failed to start wakeup tracer\n");
-        if (tracing_is_enabled())
-                tracer_enabled = 1;
-        else
-                tracer_enabled = 0;
        return;
 fail_deprobe_wake_new:
@@ -320,7 +516,7 @@ fail_deprobe:
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
        tracer_enabled = 0;
-        unregister_ftrace_function(&trace_ops);
+        stop_func_tracer(is_graph());
        unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
        unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
        unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly =
        .start          = wakeup_tracer_start,
        .stop           = wakeup_tracer_stop,
        .print_max      = 1,
+        .print_header   = wakeup_print_header,
+        .print_line     = wakeup_print_line,
+        .flags          = &tracer_flags,
+        .set_flag       = wakeup_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_wakeup,
 #endif
+        .open           = wakeup_trace_open,
+        .close          = wakeup_trace_close,
        .use_max_tr     = 1,
 };
@@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
        .stop           = wakeup_tracer_stop,
        .wait_pipe      = poll_wait_pipe,
        .print_max      = 1,
+        .print_header   = wakeup_print_header,
+        .print_line     = wakeup_print_line,
+        .flags          = &tracer_flags,
+        .set_flag       = wakeup_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_wakeup,
 #endif
+        .open           = wakeup_trace_open,
+        .close          = wakeup_trace_close,
        .use_max_tr     = 1,
 };
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
 {
        int ret, cpu;
+        for_each_possible_cpu(cpu) {
+                spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
+                INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
+        }
        ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
        if (ret)
                goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
        if (ret)
                goto no_creation;
-        for_each_possible_cpu(cpu) {
-                spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
-                INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
-        }
        return 0;
 no_creation:
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea25..e95ee7f31d43 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/jump_label.h>
 extern struct tracepoint __start___tracepoints[];
 extern struct tracepoint __stop___tracepoints[];
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry,
         * is used.
         */
        rcu_assign_pointer(elem->funcs, (*entry)->funcs);
-        elem->state = active;
+        if (!elem->state && active) {
+                jump_label_enable(&elem->state);
+                elem->state = active;
+        } else if (elem->state && !active) {
+                jump_label_disable(&elem->state);
+                elem->state = active;
+        }
 }
 /*
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem)
        if (elem->unregfunc && elem->state)
                elem->unregfunc();
-        elem->state = 0;
+        if (elem->state) {
+                jump_label_disable(&elem->state);
+                elem->state = 0;
+        }
        rcu_assign_pointer(elem->funcs, NULL);
 }
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9c3c52ecc1..bafba687a6d8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
-static int __read_mostly did_panic;
 static int __initdata no_watchdog;
@@ -187,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts)
        return 0;
 }
-static int
-watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
-{
-        did_panic = 1;
-        return NOTIFY_DONE;
-}
-static struct notifier_block panic_block = {
-        .notifier_call = watchdog_panic,
-};
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 static struct perf_event_attr wd_hw_attr = {
        .type           = PERF_TYPE_HARDWARE,
@@ -209,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = {
 };
 /* Callback function for perf event subsystem */
-void watchdog_overflow_callback(struct perf_event *event, int nmi,
+static void watchdog_overflow_callback(struct perf_event *event, int nmi,
                 struct perf_sample_data *data,
                 struct pt_regs *regs)
 {
@@ -371,14 +358,14 @@ static int watchdog_nmi_enable(int cpu)
        /* Try to register using hardware perf events */
        wd_attr = &wd_hw_attr;
        wd_attr->sample_period = hw_nmi_get_sample_period();
-        event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
+        event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
        if (!IS_ERR(event)) {
                printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
                goto out_save;
        }
        printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
-        return -1;
+        return PTR_ERR(event);
        /* success path */
 out_save:
@@ -422,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu)
 static int watchdog_enable(int cpu)
 {
        struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
+        int err;
        /* enable the perf event */
-        if (watchdog_nmi_enable(cpu) != 0)
+        err = watchdog_nmi_enable(cpu);
-                return -1;
+        if (err)
+                return err;
        /* create the watchdog thread */
        if (!p) {
                p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
                if (IS_ERR(p)) {
                        printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
-                        return -1;
+                        return PTR_ERR(p);
                }
                kthread_bind(p, cpu);
                per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -484,6 +473,9 @@ static void watchdog_disable_all_cpus(void)
 {
        int cpu;
+        if (no_watchdog)
+                return;
        for_each_online_cpu(cpu)
                watchdog_disable(cpu);
@@ -526,17 +518,16 @@ static int __cpuinit
 cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
        int hotcpu = (unsigned long)hcpu;
+        int err = 0;
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-                if (watchdog_prepare_cpu(hotcpu))
+                err = watchdog_prepare_cpu(hotcpu);
-                        return NOTIFY_BAD;
                break;
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-                if (watchdog_enable(hotcpu))
+                err = watchdog_enable(hotcpu);
-                        return NOTIFY_BAD;
                break;
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
@@ -549,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
                break;
 #endif /* CONFIG_HOTPLUG_CPU */
        }
-        return NOTIFY_OK;
+        return notifier_from_errno(err);
 }
 static struct notifier_block __cpuinitdata cpu_nfb = {
@@ -565,13 +556,11 @@ static int __init spawn_watchdog_task(void)
                return 0;
        err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-        WARN_ON(err == NOTIFY_BAD);
+        WARN_ON(notifier_to_errno(err));
        cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
        register_cpu_notifier(&cpu_nfb);
-        atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
        return 0;
 }
 early_initcall(spawn_watchdog_task);