Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 15
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/async.c | 141
-rw-r--r--  kernel/audit.c | 3
-rw-r--r--  kernel/audit.h | 26
-rw-r--r--  kernel/audit_tree.c | 237
-rw-r--r--  kernel/audit_watch.c | 274
-rw-r--r--  kernel/auditfilter.c | 39
-rw-r--r--  kernel/auditsc.c | 19
-rw-r--r--  kernel/cgroup.c | 177
-rw-r--r--  kernel/cgroup_freezer.c | 72
-rw-r--r--  kernel/compat.c | 38
-rw-r--r--  kernel/configs.c | 1
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/cpuset.c | 27
-rw-r--r--  kernel/cred.c | 8
-rw-r--r--  kernel/debug/debug_core.c | 145
-rw-r--r--  kernel/debug/debug_core.h | 1
-rw-r--r--  kernel/debug/gdbstub.c | 191
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 156
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 43
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 4
-rw-r--r--  kernel/early_res.c | 590
-rw-r--r--  kernel/exec_domain.c | 22
-rw-r--r--  kernel/exit.c | 21
-rw-r--r--  kernel/fork.c | 39
-rw-r--r--  kernel/futex.c | 68
-rw-r--r--  kernel/futex_compat.c | 2
-rw-r--r--  kernel/gcov/fs.c | 245
-rw-r--r--  kernel/groups.c | 5
-rw-r--r--  kernel/hrtimer.c | 33
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/hw_breakpoint.c | 160
-rw-r--r--  kernel/irq/Kconfig | 53
-rw-r--r--  kernel/irq/Makefile | 3
-rw-r--r--  kernel/irq/autoprobe.c | 15
-rw-r--r--  kernel/irq/chip.c | 378
-rw-r--r--  kernel/irq/dummychip.c | 68
-rw-r--r--  kernel/irq/handle.c | 341
-rw-r--r--  kernel/irq/internals.h | 39
-rw-r--r--  kernel/irq/irqdesc.c | 410
-rw-r--r--  kernel/irq/manage.c | 89
-rw-r--r--  kernel/irq/migration.c | 12
-rw-r--r--  kernel/irq/numa_migrate.c | 120
-rw-r--r--  kernel/irq/proc.c | 26
-rw-r--r--  kernel/irq/resend.c | 5
-rw-r--r--  kernel/irq/spurious.c | 8
-rw-r--r--  kernel/irq_work.c | 164
-rw-r--r--  kernel/jump_label.c | 429
-rw-r--r--  kernel/kexec.c | 10
-rw-r--r--  kernel/kfifo.c | 751
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 34
-rw-r--r--  kernel/kthread.c | 164
-rw-r--r--  kernel/lockdep.c | 53
-rw-r--r--  kernel/module.c | 1098
-rw-r--r--  kernel/mutex.c | 23
-rw-r--r--  kernel/ns_cgroup.c | 8
-rw-r--r--  kernel/padata.c | 755
-rw-r--r--  kernel/panic.c | 60
-rw-r--r--  kernel/params.c | 233
-rw-r--r--  kernel/perf_event.c | 3016
-rw-r--r--  kernel/pid.c | 59
-rw-r--r--  kernel/pm_qos_params.c | 232
-rw-r--r--  kernel/posix-cpu-timers.c | 44
-rw-r--r--  kernel/posix-timers.c | 11
-rw-r--r--  kernel/power/Kconfig | 17
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/hibernate.c | 51
-rw-r--r--  kernel/power/main.c | 76
-rw-r--r--  kernel/power/power.h | 10
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 32
-rw-r--r--  kernel/power/snapshot.c | 118
-rw-r--r--  kernel/power/suspend.c | 13
-rw-r--r--  kernel/power/swap.c | 312
-rw-r--r--  kernel/printk.c | 52
-rw-r--r--  kernel/profile.c | 1
-rw-r--r--  kernel/ptrace.c | 48
-rw-r--r--  kernel/range.c | 4
-rw-r--r--  kernel/rcupdate.c | 168
-rw-r--r--  kernel/rcutiny.c | 35
-rw-r--r--  kernel/rcutiny_plugin.h | 582
-rw-r--r--  kernel/rcutorture.c | 20
-rw-r--r--  kernel/rcutree.c | 94
-rw-r--r--  kernel/rcutree.h | 20
-rw-r--r--  kernel/rcutree_plugin.h | 47
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/resource.c | 153
-rw-r--r--  kernel/rtmutex-tester.c | 6
-rw-r--r--  kernel/sched.c | 730
-rw-r--r--  kernel/sched_clock.c | 95
-rw-r--r--  kernel/sched_cpupri.c | 8
-rw-r--r--  kernel/sched_cpupri.h | 2
-rw-r--r--  kernel/sched_debug.c | 2
-rw-r--r--  kernel/sched_fair.c | 626
-rw-r--r--  kernel/sched_features.h | 5
-rw-r--r--  kernel/sched_rt.c | 43
-rw-r--r--  kernel/sched_stats.h | 27
-rw-r--r--  kernel/sched_stoptask.c | 108
-rw-r--r--  kernel/signal.c | 22
-rw-r--r--  kernel/slow-work-debugfs.c | 227
-rw-r--r--  kernel/slow-work.c | 1068
-rw-r--r--  kernel/slow-work.h | 72
-rw-r--r--  kernel/smp.c | 25
-rw-r--r--  kernel/softirq.c | 91
-rw-r--r--  kernel/softlockup.c | 293
-rw-r--r--  kernel/srcu.c | 2
-rw-r--r--  kernel/stop_machine.c | 16
-rw-r--r--  kernel/sys.c | 204
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 97
-rw-r--r--  kernel/sysctl_check.c | 9
-rw-r--r--  kernel/taskstats.c | 172
-rw-r--r--  kernel/test_kprobes.c | 12
-rw-r--r--  kernel/time.c | 16
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/clocksource.c | 33
-rw-r--r--  kernel/time/ntp.c | 14
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 15
-rw-r--r--  kernel/time/timekeeping.c | 93
-rw-r--r--  kernel/timer.c | 61
-rw-r--r--  kernel/trace/Kconfig | 90
-rw-r--r--  kernel/trace/Makefile | 7
-rw-r--r--  kernel/trace/blktrace.c | 100
-rw-r--r--  kernel/trace/ftrace.c | 149
-rw-r--r--  kernel/trace/kmemtrace.c | 529
-rw-r--r--  kernel/trace/ring_buffer.c | 402
-rw-r--r--  kernel/trace/trace.c | 199
-rw-r--r--  kernel/trace/trace.h | 111
-rw-r--r--  kernel/trace/trace_boot.c | 185
-rw-r--r--  kernel/trace/trace_clock.c | 7
-rw-r--r--  kernel/trace/trace_entries.h | 94
-rw-r--r--  kernel/trace/trace_event_perf.c | 58
-rw-r--r--  kernel/trace/trace_events.c | 431
-rw-r--r--  kernel/trace/trace_events_filter.c | 27
-rw-r--r--  kernel/trace/trace_export.c | 8
-rw-r--r--  kernel/trace/trace_functions.c | 6
-rw-r--r--  kernel/trace/trace_functions_graph.c | 218
-rw-r--r--  kernel/trace/trace_irqsoff.c | 155
-rw-r--r--  kernel/trace/trace_kdb.c | 135
-rw-r--r--  kernel/trace/trace_kprobe.c | 427
-rw-r--r--  kernel/trace/trace_ksym.c | 508
-rw-r--r--  kernel/trace/trace_output.c | 69
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 263
-rw-r--r--  kernel/trace/trace_selftest.c | 87
-rw-r--r--  kernel/trace/trace_stack.c | 9
-rw-r--r--  kernel/trace/trace_syscalls.c | 7
-rw-r--r--  kernel/trace/trace_sysprof.c | 329
-rw-r--r--  kernel/trace/trace_workqueue.c | 10
-rw-r--r--  kernel/tracepoint.c | 14
-rw-r--r--  kernel/tsacct.c | 10
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/user_namespace.c | 44
-rw-r--r--  kernel/wait.c | 6
-rw-r--r--  kernel/watchdog.c | 566
-rw-r--r--  kernel/workqueue.c | 3478
-rw-r--r--  kernel/workqueue_sched.h | 9
162 files changed, 15552 insertions, 10806 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 057472fbc272..0b5ff083fa22 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,8 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o 13 async.o range.o jump_label.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
15obj-y += groups.o 14obj-y += groups.o
16 15
17ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -23,6 +22,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
23CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
24CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
25CFLAGS_REMOVE_perf_event.o = -pg 24CFLAGS_REMOVE_perf_event.o = -pg
25CFLAGS_REMOVE_irq_work.o = -pg
26endif 26endif
27 27
28obj-$(CONFIG_FREEZER) += freezer.o 28obj-$(CONFIG_FREEZER) += freezer.o
@@ -70,14 +70,15 @@ obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_SMP) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
75obj-$(CONFIG_GCOV_KERNEL) += gcov/ 75obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
77obj-$(CONFIG_GCOV_KERNEL) += gcov/
77obj-$(CONFIG_KPROBES) += kprobes.o 78obj-$(CONFIG_KPROBES) += kprobes.o
78obj-$(CONFIG_KGDB) += debug/ 79obj-$(CONFIG_KGDB) += debug/
79obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
81obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
81obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 82obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
82obj-$(CONFIG_SECCOMP) += seccomp.o 83obj-$(CONFIG_SECCOMP) += seccomp.o
83obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -85,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
85obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
86obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
87obj-$(CONFIG_TINY_RCU) += rcutiny.o 88obj-$(CONFIG_TINY_RCU) += rcutiny.o
89obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
88obj-$(CONFIG_RELAY) += relay.o 90obj-$(CONFIG_RELAY) += relay.o
89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -99,8 +101,7 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 101obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 102obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_SMP) += sched_cpupri.o 103obj-$(CONFIG_SMP) += sched_cpupri.o
102obj-$(CONFIG_SLOW_WORK) += slow-work.o 104obj-$(CONFIG_IRQ_WORK) += irq_work.o
103obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
104obj-$(CONFIG_PERF_EVENTS) += perf_event.o 105obj-$(CONFIG_PERF_EVENTS) += perf_event.o
105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 106obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 385b88461c29..fa7eb3de2ddc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
122 spin_unlock(&acct_lock); 122 spin_unlock(&acct_lock);
123 123
124 /* May block */ 124 /* May block */
125 if (vfs_statfs(file->f_path.dentry, &sbuf)) 125 if (vfs_statfs(&file->f_path, &sbuf))
126 return res; 126 return res;
127 suspend = sbuf.f_blocks * SUSPEND; 127 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 128 resume = sbuf.f_blocks * RESUME;
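The acct.c hunk above is a caller-side update for the VFS change in this series: vfs_statfs() now takes a struct path * rather than a dentry. A minimal sketch of the new calling convention, assuming an open struct file is already in hand; the helper name and the use of f_bavail are illustrative, not part of the patch:

#include <linux/fs.h>
#include <linux/statfs.h>

/* Sketch: query filesystem free space via the path-based statfs API. */
static int example_free_blocks(struct file *file, u64 *blocks)
{
	struct kstatfs sbuf;
	int err;

	/* may block; note &file->f_path instead of file->f_path.dentry */
	err = vfs_statfs(&file->f_path, &sbuf);
	if (err)
		return err;

	*blocks = sbuf.f_bavail;
	return 0;
}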
diff --git a/kernel/async.c b/kernel/async.c
index 15319d6c18fe..cd9dbb913c77 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/bug.h>
53#include <linux/module.h> 52#include <linux/module.h>
54#include <linux/wait.h> 53#include <linux/wait.h>
55#include <linux/sched.h> 54#include <linux/sched.h>
56#include <linux/init.h>
57#include <linux/kthread.h>
58#include <linux/delay.h>
59#include <linux/slab.h> 55#include <linux/slab.h>
56#include <linux/workqueue.h>
60#include <asm/atomic.h> 57#include <asm/atomic.h>
61 58
62static async_cookie_t next_cookie = 1; 59static async_cookie_t next_cookie = 1;
63 60
64#define MAX_THREADS 256
65#define MAX_WORK 32768 61#define MAX_WORK 32768
66 62
67static LIST_HEAD(async_pending); 63static LIST_HEAD(async_pending);
68static LIST_HEAD(async_running); 64static LIST_HEAD(async_running);
69static DEFINE_SPINLOCK(async_lock); 65static DEFINE_SPINLOCK(async_lock);
70 66
71static int async_enabled = 0;
72
73struct async_entry { 67struct async_entry {
74 struct list_head list; 68 struct list_head list;
75 async_cookie_t cookie; 69 struct work_struct work;
76 async_func_ptr *func; 70 async_cookie_t cookie;
77 void *data; 71 async_func_ptr *func;
78 struct list_head *running; 72 void *data;
73 struct list_head *running;
79}; 74};
80 75
81static DECLARE_WAIT_QUEUE_HEAD(async_done); 76static DECLARE_WAIT_QUEUE_HEAD(async_done);
82static DECLARE_WAIT_QUEUE_HEAD(async_new);
83 77
84static atomic_t entry_count; 78static atomic_t entry_count;
85static atomic_t thread_count;
86 79
87extern int initcall_debug; 80extern int initcall_debug;
88 81
@@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running)
117 spin_unlock_irqrestore(&async_lock, flags); 110 spin_unlock_irqrestore(&async_lock, flags);
118 return ret; 111 return ret;
119} 112}
113
120/* 114/*
121 * pick the first pending entry and run it 115 * pick the first pending entry and run it
122 */ 116 */
123static void run_one_entry(void) 117static void async_run_entry_fn(struct work_struct *work)
124{ 118{
119 struct async_entry *entry =
120 container_of(work, struct async_entry, work);
125 unsigned long flags; 121 unsigned long flags;
126 struct async_entry *entry;
127 ktime_t calltime, delta, rettime; 122 ktime_t calltime, delta, rettime;
128 123
129 /* 1) pick one task from the pending queue */ 124 /* 1) move self to the running queue */
130
131 spin_lock_irqsave(&async_lock, flags); 125 spin_lock_irqsave(&async_lock, flags);
132 if (list_empty(&async_pending))
133 goto out;
134 entry = list_first_entry(&async_pending, struct async_entry, list);
135
136 /* 2) move it to the running queue */
137 list_move_tail(&entry->list, entry->running); 126 list_move_tail(&entry->list, entry->running);
138 spin_unlock_irqrestore(&async_lock, flags); 127 spin_unlock_irqrestore(&async_lock, flags);
139 128
140 /* 3) run it (and print duration)*/ 129 /* 2) run (and print duration) */
141 if (initcall_debug && system_state == SYSTEM_BOOTING) { 130 if (initcall_debug && system_state == SYSTEM_BOOTING) {
142 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
143 entry->func, task_pid_nr(current)); 132 entry->func, task_pid_nr(current));
@@ -153,31 +142,25 @@ static void run_one_entry(void)
153 (long long)ktime_to_ns(delta) >> 10); 142 (long long)ktime_to_ns(delta) >> 10);
154 } 143 }
155 144
156 /* 4) remove it from the running queue */ 145 /* 3) remove self from the running queue */
157 spin_lock_irqsave(&async_lock, flags); 146 spin_lock_irqsave(&async_lock, flags);
158 list_del(&entry->list); 147 list_del(&entry->list);
159 148
160 /* 5) free the entry */ 149 /* 4) free the entry */
161 kfree(entry); 150 kfree(entry);
162 atomic_dec(&entry_count); 151 atomic_dec(&entry_count);
163 152
164 spin_unlock_irqrestore(&async_lock, flags); 153 spin_unlock_irqrestore(&async_lock, flags);
165 154
166 /* 6) wake up any waiters. */ 155 /* 5) wake up any waiters */
167 wake_up(&async_done); 156 wake_up(&async_done);
168 return;
169
170out:
171 spin_unlock_irqrestore(&async_lock, flags);
172} 157}
173 158
174
175static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) 159static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
176{ 160{
177 struct async_entry *entry; 161 struct async_entry *entry;
178 unsigned long flags; 162 unsigned long flags;
179 async_cookie_t newcookie; 163 async_cookie_t newcookie;
180
181 164
182 /* allow irq-off callers */ 165 /* allow irq-off callers */
183 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); 166 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
186 * If we're out of memory or if there's too much work 169 * If we're out of memory or if there's too much work
187 * pending already, we execute synchronously. 170 * pending already, we execute synchronously.
188 */ 171 */
189 if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { 172 if (!entry || atomic_read(&entry_count) > MAX_WORK) {
190 kfree(entry); 173 kfree(entry);
191 spin_lock_irqsave(&async_lock, flags); 174 spin_lock_irqsave(&async_lock, flags);
192 newcookie = next_cookie++; 175 newcookie = next_cookie++;
@@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
196 ptr(data, newcookie); 179 ptr(data, newcookie);
197 return newcookie; 180 return newcookie;
198 } 181 }
182 INIT_WORK(&entry->work, async_run_entry_fn);
199 entry->func = ptr; 183 entry->func = ptr;
200 entry->data = data; 184 entry->data = data;
201 entry->running = running; 185 entry->running = running;
@@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
205 list_add_tail(&entry->list, &async_pending); 189 list_add_tail(&entry->list, &async_pending);
206 atomic_inc(&entry_count); 190 atomic_inc(&entry_count);
207 spin_unlock_irqrestore(&async_lock, flags); 191 spin_unlock_irqrestore(&async_lock, flags);
208 wake_up(&async_new); 192
193 /* schedule for execution */
194 queue_work(system_unbound_wq, &entry->work);
195
209 return newcookie; 196 return newcookie;
210} 197}
211 198
@@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie)
312 async_synchronize_cookie_domain(cookie, &async_running); 299 async_synchronize_cookie_domain(cookie, &async_running);
313} 300}
314EXPORT_SYMBOL_GPL(async_synchronize_cookie); 301EXPORT_SYMBOL_GPL(async_synchronize_cookie);
315
316
317static int async_thread(void *unused)
318{
319 DECLARE_WAITQUEUE(wq, current);
320 add_wait_queue(&async_new, &wq);
321
322 while (!kthread_should_stop()) {
323 int ret = HZ;
324 set_current_state(TASK_INTERRUPTIBLE);
325 /*
326 * check the list head without lock.. false positives
327 * are dealt with inside run_one_entry() while holding
328 * the lock.
329 */
330 rmb();
331 if (!list_empty(&async_pending))
332 run_one_entry();
333 else
334 ret = schedule_timeout(HZ);
335
336 if (ret == 0) {
337 /*
338 * we timed out, this means we as thread are redundant.
339 * we sign off and die, but we to avoid any races there
340 * is a last-straw check to see if work snuck in.
341 */
342 atomic_dec(&thread_count);
343 wmb(); /* manager must see our departure first */
344 if (list_empty(&async_pending))
345 break;
346 /*
347 * woops work came in between us timing out and us
348 * signing off; we need to stay alive and keep working.
349 */
350 atomic_inc(&thread_count);
351 }
352 }
353 remove_wait_queue(&async_new, &wq);
354
355 return 0;
356}
357
358static int async_manager_thread(void *unused)
359{
360 DECLARE_WAITQUEUE(wq, current);
361 add_wait_queue(&async_new, &wq);
362
363 while (!kthread_should_stop()) {
364 int tc, ec;
365
366 set_current_state(TASK_INTERRUPTIBLE);
367
368 tc = atomic_read(&thread_count);
369 rmb();
370 ec = atomic_read(&entry_count);
371
372 while (tc < ec && tc < MAX_THREADS) {
373 if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
374 tc))) {
375 msleep(100);
376 continue;
377 }
378 atomic_inc(&thread_count);
379 tc++;
380 }
381
382 schedule();
383 }
384 remove_wait_queue(&async_new, &wq);
385
386 return 0;
387}
388
389static int __init async_init(void)
390{
391 async_enabled =
392 !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
393
394 WARN_ON(!async_enabled);
395 return 0;
396}
397
398core_initcall(async_init);
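The async.c diff above deletes the hand-rolled async/mgr kthread pool and instead runs every async entry as a work item on the shared unbound workqueue. A condensed sketch of that pattern, built only from calls that appear in the hunks; the example_* names and the trimmed entry layout are illustrative:

#include <linux/workqueue.h>
#include <linux/slab.h>

struct example_entry {
	struct work_struct work;	/* embedded work item */
	void *data;
};

static void example_run_fn(struct work_struct *work)
{
	struct example_entry *entry = container_of(work, struct example_entry, work);

	/* ... run the deferred callback with entry->data ... */
	kfree(entry);
}

static void example_schedule(void *data)
{
	struct example_entry *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);

	if (!entry)
		return;		/* the real code falls back to running synchronously */

	entry->data = data;
	INIT_WORK(&entry->work, example_run_fn);
	queue_work(system_unbound_wq, &entry->work);	/* unbound: not pinned to a CPU */
}

Because system_unbound_wq provides concurrency on demand, the MAX_THREADS limit, the async_enabled flag, and the manager/worker kthreads in the old code become unnecessary, which is exactly what the hunk removes.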
diff --git a/kernel/audit.c b/kernel/audit.c
index c71bd26631a2..d96045789b54 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,7 +56,6 @@
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#include <linux/netlink.h> 58#include <linux/netlink.h>
59#include <linux/inotify.h>
60#include <linux/freezer.h> 59#include <linux/freezer.h>
61#include <linux/tty.h> 60#include <linux/tty.h>
62 61
@@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
407 audit_hold_skb(skb); 406 audit_hold_skb(skb);
408 } else 407 } else
409 /* drop the extra reference if sent ok */ 408 /* drop the extra reference if sent ok */
410 kfree_skb(skb); 409 consume_skb(skb);
411} 410}
412 411
413static int kauditd_thread(void *dummy) 412static int kauditd_thread(void *dummy)
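The audit.c change above swaps kfree_skb() for consume_skb() on the successful-send path. Both release the buffer; the difference is that consume_skb() records a normal end of life rather than a drop, so drop-monitoring tooling is not misled. A small sketch of the idiom; the function name is illustrative:

#include <linux/skbuff.h>

static void example_send_done(struct sk_buff *skb, int err)
{
	if (err)
		kfree_skb(skb);		/* counted as a drop */
	else
		consume_skb(skb);	/* delivered; not a drop */
}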
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f30..f7206db4e13d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex;
103extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
104extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
105 105
106extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
107
106/* audit watch functions */ 108/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch); 109#ifdef CONFIG_AUDIT_WATCH
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch); 110extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch); 111extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); 112extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule); 113extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
113extern void audit_remove_watch(struct audit_watch *watch); 114extern void audit_remove_watch_rule(struct audit_krule *krule);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch); 115extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch); 116extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
118 117#else
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, 118#define audit_put_watch(w) {}
120 struct audit_watch *watch); 119#define audit_get_watch(w) {}
120#define audit_to_watch(k, p, l, o) (-EINVAL)
121#define audit_add_watch(k, l) (-EINVAL)
122#define audit_remove_watch_rule(k) BUG()
123#define audit_watch_path(w) ""
124#define audit_watch_compare(w, i, d) 0
125
126#endif /* CONFIG_AUDIT_WATCH */
121 127
122#ifdef CONFIG_AUDIT_TREE 128#ifdef CONFIG_AUDIT_TREE
123extern struct audit_chunk *audit_tree_lookup(const struct inode *); 129extern struct audit_chunk *audit_tree_lookup(const struct inode *);
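The audit.h hunk uses the common kernel header pattern of pairing real prototypes with no-op or error-returning stand-ins when the config option is off, so callers compile without their own #ifdefs. A minimal sketch of the pattern with entirely made-up names (CONFIG_EXAMPLE_FEATURE, example_*):

#include <linux/errno.h>

struct example_obj;

#ifdef CONFIG_EXAMPLE_FEATURE
extern int example_attach(struct example_obj *obj);
extern void example_detach(struct example_obj *obj);
#else
#define example_attach(o)	(-EINVAL)	/* feature compiled out */
#define example_detach(o)	{}		/* silently do nothing */
#endif /* CONFIG_EXAMPLE_FEATURE */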
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
1#include "audit.h" 1#include "audit.h"
2#include <linux/inotify.h> 2#include <linux/fsnotify_backend.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
22 22
23struct audit_chunk { 23struct audit_chunk {
24 struct list_head hash; 24 struct list_head hash;
25 struct inotify_watch watch; 25 struct fsnotify_mark mark;
26 struct list_head trees; /* with root here */ 26 struct list_head trees; /* with root here */
27 int dead; 27 int dead;
28 int count; 28 int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
59 * tree is refcounted; one reference for "some rules on rules_list refer to 59 * tree is refcounted; one reference for "some rules on rules_list refer to
60 * it", one for each chunk with pointer to it. 60 * it", one for each chunk with pointer to it.
61 * 61 *
62 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount 62 * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
63 * of watch contributes 1 to .refs). 63 * of watch contributes 1 to .refs).
64 * 64 *
65 * node.index allows to get from node.list to containing chunk. 65 * node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
68 * that makes a difference. Some. 68 * that makes a difference. Some.
69 */ 69 */
70 70
71static struct inotify_handle *rtree_ih; 71static struct fsnotify_group *audit_tree_group;
72 72
73static struct audit_tree *alloc_tree(const char *s) 73static struct audit_tree *alloc_tree(const char *s)
74{ 74{
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
111 return tree->pathname; 111 return tree->pathname;
112} 112}
113 113
114static struct audit_chunk *alloc_chunk(int count)
115{
116 struct audit_chunk *chunk;
117 size_t size;
118 int i;
119
120 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
121 chunk = kzalloc(size, GFP_KERNEL);
122 if (!chunk)
123 return NULL;
124
125 INIT_LIST_HEAD(&chunk->hash);
126 INIT_LIST_HEAD(&chunk->trees);
127 chunk->count = count;
128 atomic_long_set(&chunk->refs, 1);
129 for (i = 0; i < count; i++) {
130 INIT_LIST_HEAD(&chunk->owners[i].list);
131 chunk->owners[i].index = i;
132 }
133 inotify_init_watch(&chunk->watch);
134 return chunk;
135}
136
137static void free_chunk(struct audit_chunk *chunk) 114static void free_chunk(struct audit_chunk *chunk)
138{ 115{
139 int i; 116 int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
157 audit_put_chunk(chunk); 134 audit_put_chunk(chunk);
158} 135}
159 136
137static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
138{
139 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
140 call_rcu(&chunk->head, __put_chunk);
141}
142
143static struct audit_chunk *alloc_chunk(int count)
144{
145 struct audit_chunk *chunk;
146 size_t size;
147 int i;
148
149 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
150 chunk = kzalloc(size, GFP_KERNEL);
151 if (!chunk)
152 return NULL;
153
154 INIT_LIST_HEAD(&chunk->hash);
155 INIT_LIST_HEAD(&chunk->trees);
156 chunk->count = count;
157 atomic_long_set(&chunk->refs, 1);
158 for (i = 0; i < count; i++) {
159 INIT_LIST_HEAD(&chunk->owners[i].list);
160 chunk->owners[i].index = i;
161 }
162 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
163 return chunk;
164}
165
160enum {HASH_SIZE = 128}; 166enum {HASH_SIZE = 128};
161static struct list_head chunk_hash_heads[HASH_SIZE]; 167static struct list_head chunk_hash_heads[HASH_SIZE];
162static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); 168static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
167 return chunk_hash_heads + n % HASH_SIZE; 173 return chunk_hash_heads + n % HASH_SIZE;
168} 174}
169 175
170/* hash_lock is held by caller */ 176/* hash_lock & entry->lock is held by caller */
171static void insert_hash(struct audit_chunk *chunk) 177static void insert_hash(struct audit_chunk *chunk)
172{ 178{
173 struct list_head *list = chunk_hash(chunk->watch.inode); 179 struct fsnotify_mark *entry = &chunk->mark;
180 struct list_head *list;
181
182 if (!entry->i.inode)
183 return;
184 list = chunk_hash(entry->i.inode);
174 list_add_rcu(&chunk->hash, list); 185 list_add_rcu(&chunk->hash, list);
175} 186}
176 187
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
181 struct audit_chunk *p; 192 struct audit_chunk *p;
182 193
183 list_for_each_entry_rcu(p, list, hash) { 194 list_for_each_entry_rcu(p, list, hash) {
184 if (p->watch.inode == inode) { 195 /* mark.inode may have gone NULL, but who cares? */
196 if (p->mark.i.inode == inode) {
185 atomic_long_inc(&p->refs); 197 atomic_long_inc(&p->refs);
186 return p; 198 return p;
187 } 199 }
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
210static void untag_chunk(struct node *p) 222static void untag_chunk(struct node *p)
211{ 223{
212 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark;
213 struct audit_chunk *new; 226 struct audit_chunk *new;
214 struct audit_tree *owner; 227 struct audit_tree *owner;
215 int size = chunk->count - 1; 228 int size = chunk->count - 1;
216 int i, j; 229 int i, j;
217 230
218 if (!pin_inotify_watch(&chunk->watch)) { 231 fsnotify_get_mark(entry);
219 /*
220 * Filesystem is shutting down; all watches are getting
221 * evicted, just take it off the node list for this
222 * tree and let the eviction logics take care of the
223 * rest.
224 */
225 owner = p->owner;
226 if (owner->root == chunk) {
227 list_del_init(&owner->same_root);
228 owner->root = NULL;
229 }
230 list_del_init(&p->list);
231 p->owner = NULL;
232 put_tree(owner);
233 return;
234 }
235 232
236 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
237 234
238 /* 235 spin_lock(&entry->lock);
239 * pin_inotify_watch() succeeded, so the watch won't go away 236 if (chunk->dead || !entry->i.inode) {
240 * from under us. 237 spin_unlock(&entry->lock);
241 */
242 mutex_lock(&chunk->watch.inode->inotify_mutex);
243 if (chunk->dead) {
244 mutex_unlock(&chunk->watch.inode->inotify_mutex);
245 goto out; 238 goto out;
246 } 239 }
247 240
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
256 list_del_init(&p->list); 249 list_del_init(&p->list);
257 list_del_rcu(&chunk->hash); 250 list_del_rcu(&chunk->hash);
258 spin_unlock(&hash_lock); 251 spin_unlock(&hash_lock);
259 inotify_evict_watch(&chunk->watch); 252 spin_unlock(&entry->lock);
260 mutex_unlock(&chunk->watch.inode->inotify_mutex); 253 fsnotify_destroy_mark(entry);
261 put_inotify_watch(&chunk->watch); 254 fsnotify_put_mark(entry);
262 goto out; 255 goto out;
263 } 256 }
264 257
265 new = alloc_chunk(size); 258 new = alloc_chunk(size);
266 if (!new) 259 if (!new)
267 goto Fallback; 260 goto Fallback;
268 if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { 261 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
269 free_chunk(new); 263 free_chunk(new);
270 goto Fallback; 264 goto Fallback;
271 } 265 }
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
298 list_for_each_entry(owner, &new->trees, same_root) 292 list_for_each_entry(owner, &new->trees, same_root)
299 owner->root = new; 293 owner->root = new;
300 spin_unlock(&hash_lock); 294 spin_unlock(&hash_lock);
301 inotify_evict_watch(&chunk->watch); 295 spin_unlock(&entry->lock);
302 mutex_unlock(&chunk->watch.inode->inotify_mutex); 296 fsnotify_destroy_mark(entry);
303 put_inotify_watch(&chunk->watch); 297 fsnotify_put_mark(entry);
304 goto out; 298 goto out;
305 299
306Fallback: 300Fallback:
@@ -314,31 +308,33 @@ Fallback:
314 p->owner = NULL; 308 p->owner = NULL;
315 put_tree(owner); 309 put_tree(owner);
316 spin_unlock(&hash_lock); 310 spin_unlock(&hash_lock);
317 mutex_unlock(&chunk->watch.inode->inotify_mutex); 311 spin_unlock(&entry->lock);
318out: 312out:
319 unpin_inotify_watch(&chunk->watch); 313 fsnotify_put_mark(entry);
320 spin_lock(&hash_lock); 314 spin_lock(&hash_lock);
321} 315}
322 316
323static int create_chunk(struct inode *inode, struct audit_tree *tree) 317static int create_chunk(struct inode *inode, struct audit_tree *tree)
324{ 318{
319 struct fsnotify_mark *entry;
325 struct audit_chunk *chunk = alloc_chunk(1); 320 struct audit_chunk *chunk = alloc_chunk(1);
326 if (!chunk) 321 if (!chunk)
327 return -ENOMEM; 322 return -ENOMEM;
328 323
329 if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { 324 entry = &chunk->mark;
325 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
330 free_chunk(chunk); 326 free_chunk(chunk);
331 return -ENOSPC; 327 return -ENOSPC;
332 } 328 }
333 329
334 mutex_lock(&inode->inotify_mutex); 330 spin_lock(&entry->lock);
335 spin_lock(&hash_lock); 331 spin_lock(&hash_lock);
336 if (tree->goner) { 332 if (tree->goner) {
337 spin_unlock(&hash_lock); 333 spin_unlock(&hash_lock);
338 chunk->dead = 1; 334 chunk->dead = 1;
339 inotify_evict_watch(&chunk->watch); 335 spin_unlock(&entry->lock);
340 mutex_unlock(&inode->inotify_mutex); 336 fsnotify_destroy_mark(entry);
341 put_inotify_watch(&chunk->watch); 337 fsnotify_put_mark(entry);
342 return 0; 338 return 0;
343 } 339 }
344 chunk->owners[0].index = (1U << 31); 340 chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
351 } 347 }
352 insert_hash(chunk); 348 insert_hash(chunk);
353 spin_unlock(&hash_lock); 349 spin_unlock(&hash_lock);
354 mutex_unlock(&inode->inotify_mutex); 350 spin_unlock(&entry->lock);
355 return 0; 351 return 0;
356} 352}
357 353
358/* the first tagged inode becomes root of tree */ 354/* the first tagged inode becomes root of tree */
359static int tag_chunk(struct inode *inode, struct audit_tree *tree) 355static int tag_chunk(struct inode *inode, struct audit_tree *tree)
360{ 356{
361 struct inotify_watch *watch; 357 struct fsnotify_mark *old_entry, *chunk_entry;
362 struct audit_tree *owner; 358 struct audit_tree *owner;
363 struct audit_chunk *chunk, *old; 359 struct audit_chunk *chunk, *old;
364 struct node *p; 360 struct node *p;
365 int n; 361 int n;
366 362
367 if (inotify_find_watch(rtree_ih, inode, &watch) < 0) 363 old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
364 if (!old_entry)
368 return create_chunk(inode, tree); 365 return create_chunk(inode, tree);
369 366
370 old = container_of(watch, struct audit_chunk, watch); 367 old = container_of(old_entry, struct audit_chunk, mark);
371 368
372 /* are we already there? */ 369 /* are we already there? */
373 spin_lock(&hash_lock); 370 spin_lock(&hash_lock);
374 for (n = 0; n < old->count; n++) { 371 for (n = 0; n < old->count; n++) {
375 if (old->owners[n].owner == tree) { 372 if (old->owners[n].owner == tree) {
376 spin_unlock(&hash_lock); 373 spin_unlock(&hash_lock);
377 put_inotify_watch(&old->watch); 374 fsnotify_put_mark(old_entry);
378 return 0; 375 return 0;
379 } 376 }
380 } 377 }
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
382 379
383 chunk = alloc_chunk(old->count + 1); 380 chunk = alloc_chunk(old->count + 1);
384 if (!chunk) { 381 if (!chunk) {
385 put_inotify_watch(&old->watch); 382 fsnotify_put_mark(old_entry);
386 return -ENOMEM; 383 return -ENOMEM;
387 } 384 }
388 385
389 mutex_lock(&inode->inotify_mutex); 386 chunk_entry = &chunk->mark;
390 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 387
391 mutex_unlock(&inode->inotify_mutex); 388 spin_lock(&old_entry->lock);
392 put_inotify_watch(&old->watch); 389 if (!old_entry->i.inode) {
390 /* old_entry is being shot, lets just lie */
391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry);
393 free_chunk(chunk); 393 free_chunk(chunk);
394 return -ENOENT;
395 }
396
397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock);
400 free_chunk(chunk);
401 fsnotify_put_mark(old_entry);
394 return -ENOSPC; 402 return -ENOSPC;
395 } 403 }
404
405 /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
406 spin_lock(&chunk_entry->lock);
396 spin_lock(&hash_lock); 407 spin_lock(&hash_lock);
408
409 /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
397 if (tree->goner) { 410 if (tree->goner) {
398 spin_unlock(&hash_lock); 411 spin_unlock(&hash_lock);
399 chunk->dead = 1; 412 chunk->dead = 1;
400 inotify_evict_watch(&chunk->watch); 413 spin_unlock(&chunk_entry->lock);
401 mutex_unlock(&inode->inotify_mutex); 414 spin_unlock(&old_entry->lock);
402 put_inotify_watch(&old->watch); 415
403 put_inotify_watch(&chunk->watch); 416 fsnotify_destroy_mark(chunk_entry);
417
418 fsnotify_put_mark(chunk_entry);
419 fsnotify_put_mark(old_entry);
404 return 0; 420 return 0;
405 } 421 }
406 list_replace_init(&old->trees, &chunk->trees); 422 list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
426 list_add(&tree->same_root, &chunk->trees); 442 list_add(&tree->same_root, &chunk->trees);
427 } 443 }
428 spin_unlock(&hash_lock); 444 spin_unlock(&hash_lock);
429 inotify_evict_watch(&old->watch); 445 spin_unlock(&chunk_entry->lock);
430 mutex_unlock(&inode->inotify_mutex); 446 spin_unlock(&old_entry->lock);
431 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ 447 fsnotify_destroy_mark(old_entry);
432 put_inotify_watch(&old->watch); /* and kill it */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
449 fsnotify_put_mark(old_entry); /* and kill it */
433 return 0; 450 return 0;
434} 451}
435 452
@@ -584,7 +601,9 @@ void audit_trim_trees(void)
584 601
585 spin_lock(&hash_lock); 602 spin_lock(&hash_lock);
586 list_for_each_entry(node, &tree->chunks, list) { 603 list_for_each_entry(node, &tree->chunks, list) {
587 struct inode *inode = find_chunk(node)->watch.inode; 604 struct audit_chunk *chunk = find_chunk(node);
605 /* this could be NULL if the watch is dieing else where... */
606 struct inode *inode = chunk->mark.i.inode;
588 node->index |= 1U<<31; 607 node->index |= 1U<<31;
589 if (iterate_mounts(compare_root, inode, root_mnt)) 608 if (iterate_mounts(compare_root, inode, root_mnt))
590 node->index &= ~(1U<<31); 609 node->index &= ~(1U<<31);
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list)
846 * Here comes the stuff asynchronous to auditctl operations 865 * Here comes the stuff asynchronous to auditctl operations
847 */ 866 */
848 867
849/* inode->inotify_mutex is locked */
850static void evict_chunk(struct audit_chunk *chunk) 868static void evict_chunk(struct audit_chunk *chunk)
851{ 869{
852 struct audit_tree *owner; 870 struct audit_tree *owner;
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk)
885 mutex_unlock(&audit_filter_mutex); 903 mutex_unlock(&audit_filter_mutex);
886} 904}
887 905
888static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, 906static int audit_tree_handle_event(struct fsnotify_group *group,
889 u32 cookie, const char *dname, struct inode *inode) 907 struct fsnotify_mark *inode_mark,
908 struct fsnotify_mark *vfsmonut_mark,
909 struct fsnotify_event *event)
910{
911 BUG();
912 return -EOPNOTSUPP;
913}
914
915static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
890{ 916{
891 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 917 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
892 918
893 if (mask & IN_IGNORED) { 919 evict_chunk(chunk);
894 evict_chunk(chunk); 920 fsnotify_put_mark(entry);
895 put_inotify_watch(watch);
896 }
897} 921}
898 922
899static void destroy_watch(struct inotify_watch *watch) 923static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
924 struct fsnotify_mark *inode_mark,
925 struct fsnotify_mark *vfsmount_mark,
926 __u32 mask, void *data, int data_type)
900{ 927{
901 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 928 return false;
902 call_rcu(&chunk->head, __put_chunk);
903} 929}
904 930
905static const struct inotify_operations rtree_inotify_ops = { 931static const struct fsnotify_ops audit_tree_ops = {
906 .handle_event = handle_event, 932 .handle_event = audit_tree_handle_event,
907 .destroy_watch = destroy_watch, 933 .should_send_event = audit_tree_send_event,
934 .free_group_priv = NULL,
935 .free_event_priv = NULL,
936 .freeing_mark = audit_tree_freeing_mark,
908}; 937};
909 938
910static int __init audit_tree_init(void) 939static int __init audit_tree_init(void)
911{ 940{
912 int i; 941 int i;
913 942
914 rtree_ih = inotify_init(&rtree_inotify_ops); 943 audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
915 if (IS_ERR(rtree_ih)) 944 if (IS_ERR(audit_tree_group))
916 audit_panic("cannot initialize inotify handle for rectree watches"); 945 audit_panic("cannot initialize fsnotify group for rectree watches");
917 946
918 for (i = 0; i < HASH_SIZE; i++) 947 for (i = 0; i < HASH_SIZE; i++)
919 INIT_LIST_HEAD(&chunk_hash_heads[i]); 948 INIT_LIST_HEAD(&chunk_hash_heads[i]);
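The audit_tree.c conversion replaces the inotify watch embedded in each chunk with an fsnotify mark, so the lifecycle becomes: allocate one fsnotify group at init, then add, destroy, and put marks per inode. A condensed sketch of that lifecycle using only calls and signatures visible in the hunks above; the example_* wrappers are illustrative and error handling is trimmed:

#include <linux/fsnotify_backend.h>

static struct fsnotify_group *example_group;

static void example_setup(const struct fsnotify_ops *ops)
{
	/* one group for all marks; fsnotify_alloc_group() can return an ERR_PTR */
	example_group = fsnotify_alloc_group(ops);
}

static int example_tag_inode(struct fsnotify_mark *mark, struct inode *inode,
			     void (*free_fn)(struct fsnotify_mark *))
{
	fsnotify_init_mark(mark, free_fn);	/* free_fn is the mark's destructor */
	return fsnotify_add_mark(mark, example_group, inode, NULL, 0);
}

static void example_untag(struct fsnotify_mark *mark)
{
	fsnotify_destroy_mark(mark);	/* detach the mark from its inode */
	fsnotify_put_mark(mark);	/* drop our reference */
}

Note the locking the patch settles on: the mark's own spinlock takes over the role of inode->inotify_mutex, with hash_lock continuing to nest inside it.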
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 8df43696f4ba..f0c9b2e7542d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fsnotify_backend.h>
27#include <linux/namei.h> 28#include <linux/namei.h>
28#include <linux/netlink.h> 29#include <linux/netlink.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/slab.h> 31#include <linux/slab.h>
31#include <linux/inotify.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include "audit.h" 33#include "audit.h"
34 34
35/* 35/*
36 * Reference counting: 36 * Reference counting:
37 * 37 *
38 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED 38 * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
39 * event. Each audit_watch holds a reference to its associated parent. 39 * event. Each audit_watch holds a reference to its associated parent.
40 * 40 *
41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to 41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -51,40 +51,61 @@ struct audit_watch {
51 unsigned long ino; /* associated inode number */ 51 unsigned long ino; /* associated inode number */
52 struct audit_parent *parent; /* associated parent */ 52 struct audit_parent *parent; /* associated parent */
53 struct list_head wlist; /* entry in parent->watches list */ 53 struct list_head wlist; /* entry in parent->watches list */
54 struct list_head rules; /* associated rules */ 54 struct list_head rules; /* anchor for krule->rlist */
55}; 55};
56 56
57struct audit_parent { 57struct audit_parent {
58 struct list_head ilist; /* entry in inotify registration list */ 58 struct list_head watches; /* anchor for audit_watch->wlist */
59 struct list_head watches; /* associated watches */ 59 struct fsnotify_mark mark; /* fsnotify mark on the inode */
60 struct inotify_watch wdata; /* inotify watch data */
61 unsigned flags; /* status flags */
62}; 60};
63 61
64/* Inotify handle. */ 62/* fsnotify handle. */
65struct inotify_handle *audit_ih; 63struct fsnotify_group *audit_watch_group;
66 64
67/* 65/* fsnotify events we care about. */
68 * audit_parent status flags: 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
69 * 67 FS_MOVE_SELF | FS_EVENT_ON_CHILD)
70 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
71 * a filesystem event to ensure we're adding audit watches to a valid parent.
72 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
73 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
74 * we can receive while holding nameidata.
75 */
76#define AUDIT_PARENT_INVALID 0x001
77 68
78/* Inotify events we care about. */ 69static void audit_free_parent(struct audit_parent *parent)
79#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF 70{
71 WARN_ON(!list_empty(&parent->watches));
72 kfree(parent);
73}
80 74
81static void audit_free_parent(struct inotify_watch *i_watch) 75static void audit_watch_free_mark(struct fsnotify_mark *entry)
82{ 76{
83 struct audit_parent *parent; 77 struct audit_parent *parent;
84 78
85 parent = container_of(i_watch, struct audit_parent, wdata); 79 parent = container_of(entry, struct audit_parent, mark);
86 WARN_ON(!list_empty(&parent->watches)); 80 audit_free_parent(parent);
87 kfree(parent); 81}
82
83static void audit_get_parent(struct audit_parent *parent)
84{
85 if (likely(parent))
86 fsnotify_get_mark(&parent->mark);
87}
88
89static void audit_put_parent(struct audit_parent *parent)
90{
91 if (likely(parent))
92 fsnotify_put_mark(&parent->mark);
93}
94
95/*
96 * Find and return the audit_parent on the given inode. If found a reference
97 * is taken on this parent.
98 */
99static inline struct audit_parent *audit_find_parent(struct inode *inode)
100{
101 struct audit_parent *parent = NULL;
102 struct fsnotify_mark *entry;
103
104 entry = fsnotify_find_inode_mark(audit_watch_group, inode);
105 if (entry)
106 parent = container_of(entry, struct audit_parent, mark);
107
108 return parent;
88} 109}
89 110
90void audit_get_watch(struct audit_watch *watch) 111void audit_get_watch(struct audit_watch *watch)
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch)
105void audit_remove_watch(struct audit_watch *watch) 126void audit_remove_watch(struct audit_watch *watch)
106{ 127{
107 list_del(&watch->wlist); 128 list_del(&watch->wlist);
108 put_inotify_watch(&watch->parent->wdata); 129 audit_put_parent(watch->parent);
109 watch->parent = NULL; 130 watch->parent = NULL;
110 audit_put_watch(watch); /* match initial get */ 131 audit_put_watch(watch); /* match initial get */
111} 132}
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
115 return watch->path; 136 return watch->path;
116} 137}
117 138
118struct list_head *audit_watch_rules(struct audit_watch *watch) 139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
119{
120 return &watch->rules;
121}
122
123unsigned long audit_watch_inode(struct audit_watch *watch)
124{ 140{
125 return watch->ino; 141 return (watch->ino != (unsigned long)-1) &&
126} 142 (watch->ino == ino) &&
127 143 (watch->dev == dev);
128dev_t audit_watch_dev(struct audit_watch *watch)
129{
130 return watch->dev;
131} 144}
132 145
133/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
134static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct nameidata *ndp)
135{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode;
136 struct audit_parent *parent; 150 struct audit_parent *parent;
137 s32 wd; 151 int ret;
138 152
139 parent = kzalloc(sizeof(*parent), GFP_KERNEL); 153 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
140 if (unlikely(!parent)) 154 if (unlikely(!parent))
141 return ERR_PTR(-ENOMEM); 155 return ERR_PTR(-ENOMEM);
142 156
143 INIT_LIST_HEAD(&parent->watches); 157 INIT_LIST_HEAD(&parent->watches);
144 parent->flags = 0; 158
145 159 fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
146 inotify_init_watch(&parent->wdata); 160 parent->mark.mask = AUDIT_FS_WATCH;
147 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ 161 ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
148 get_inotify_watch(&parent->wdata); 162 if (ret < 0) {
149 wd = inotify_add_watch(audit_ih, &parent->wdata, 163 audit_free_parent(parent);
150 ndp->path.dentry->d_inode, AUDIT_IN_WATCH); 164 return ERR_PTR(ret);
151 if (wd < 0) {
152 audit_free_parent(&parent->wdata);
153 return ERR_PTR(wd);
154 } 165 }
155 166
156 return parent; 167 return parent;
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
179{ 190{
180 struct audit_watch *watch; 191 struct audit_watch *watch;
181 192
182 if (!audit_ih) 193 if (!audit_watch_group)
183 return -EOPNOTSUPP; 194 return -EOPNOTSUPP;
184 195
185 if (path[0] != '/' || path[len-1] == '/' || 196 if (path[0] != '/' || path[len-1] == '/' ||
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
217 228
218 new->dev = old->dev; 229 new->dev = old->dev;
219 new->ino = old->ino; 230 new->ino = old->ino;
220 get_inotify_watch(&old->parent->wdata); 231 audit_get_parent(old->parent);
221 new->parent = old->parent; 232 new->parent = old->parent;
222 233
223out: 234out:
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent,
251 struct audit_entry *oentry, *nentry; 262 struct audit_entry *oentry, *nentry;
252 263
253 mutex_lock(&audit_filter_mutex); 264 mutex_lock(&audit_filter_mutex);
265 /* Run all of the watches on this parent looking for the one that
266 * matches the given dname */
254 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { 267 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
255 if (audit_compare_dname_path(dname, owatch->path, NULL)) 268 if (audit_compare_dname_path(dname, owatch->path, NULL))
256 continue; 269 continue;
257 270
258 /* If the update involves invalidating rules, do the inode-based 271 /* If the update involves invalidating rules, do the inode-based
259 * filtering now, so we don't omit records. */ 272 * filtering now, so we don't omit records. */
260 if (invalidating && current->audit_context) 273 if (invalidating && !audit_dummy_context())
261 audit_filter_inodes(current, current->audit_context); 274 audit_filter_inodes(current, current->audit_context);
262 275
276 /* updating ino will likely change which audit_hash_list we
277 * are on so we need a new watch for the new list */
263 nwatch = audit_dupe_watch(owatch); 278 nwatch = audit_dupe_watch(owatch);
264 if (IS_ERR(nwatch)) { 279 if (IS_ERR(nwatch)) {
265 mutex_unlock(&audit_filter_mutex); 280 mutex_unlock(&audit_filter_mutex);
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent,
275 list_del(&oentry->rule.rlist); 290 list_del(&oentry->rule.rlist);
276 list_del_rcu(&oentry->list); 291 list_del_rcu(&oentry->list);
277 292
278 nentry = audit_dupe_rule(&oentry->rule, nwatch); 293 nentry = audit_dupe_rule(&oentry->rule);
279 if (IS_ERR(nentry)) { 294 if (IS_ERR(nentry)) {
280 list_del(&oentry->rule.list); 295 list_del(&oentry->rule.list);
281 audit_panic("error updating watch, removing"); 296 audit_panic("error updating watch, removing");
282 } else { 297 } else {
283 int h = audit_hash_ino((u32)ino); 298 int h = audit_hash_ino((u32)ino);
299
300 /*
301 * nentry->rule.watch == oentry->rule.watch so
302 * we must drop that reference and set it to our
303 * new watch.
304 */
305 audit_put_watch(nentry->rule.watch);
306 audit_get_watch(nwatch);
307 nentry->rule.watch = nwatch;
284 list_add(&nentry->rule.rlist, &nwatch->rules); 308 list_add(&nentry->rule.rlist, &nwatch->rules);
285 list_add_rcu(&nentry->list, &audit_inode_hash[h]); 309 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
286 list_replace(&oentry->rule.list, 310 list_replace(&oentry->rule.list,
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
312 struct audit_entry *e; 336 struct audit_entry *e;
313 337
314 mutex_lock(&audit_filter_mutex); 338 mutex_lock(&audit_filter_mutex);
315 parent->flags |= AUDIT_PARENT_INVALID;
316 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 339 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
317 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 340 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
318 e = container_of(r, struct audit_entry, rule); 341 e = container_of(r, struct audit_entry, rule);
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
325 audit_remove_watch(w); 348 audit_remove_watch(w);
326 } 349 }
327 mutex_unlock(&audit_filter_mutex); 350 mutex_unlock(&audit_filter_mutex);
328}
329
330/* Unregister inotify watches for parents on in_list.
331 * Generates an IN_IGNORED event. */
332void audit_inotify_unregister(struct list_head *in_list)
333{
334 struct audit_parent *p, *n;
335 351
336 list_for_each_entry_safe(p, n, in_list, ilist) { 352 fsnotify_destroy_mark(&parent->mark);
337 list_del(&p->ilist);
338 inotify_rm_watch(audit_ih, &p->wdata);
339 /* the unpin matching the pin in audit_do_del_rule() */
340 unpin_inotify_watch(&p->wdata);
341 }
342} 353}
343 354
344/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
389 } 400 }
390} 401}
391 402
392/* Associate the given rule with an existing parent inotify_watch. 403/* Associate the given rule with an existing parent.
393 * Caller must hold audit_filter_mutex. */ 404 * Caller must hold audit_filter_mutex. */
394static void audit_add_to_parent(struct audit_krule *krule, 405static void audit_add_to_parent(struct audit_krule *krule,
395 struct audit_parent *parent) 406 struct audit_parent *parent)
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
397 struct audit_watch *w, *watch = krule->watch; 408 struct audit_watch *w, *watch = krule->watch;
398 int watch_found = 0; 409 int watch_found = 0;
399 410
411 BUG_ON(!mutex_is_locked(&audit_filter_mutex));
412
400 list_for_each_entry(w, &parent->watches, wlist) { 413 list_for_each_entry(w, &parent->watches, wlist) {
401 if (strcmp(watch->path, w->path)) 414 if (strcmp(watch->path, w->path))
402 continue; 415 continue;
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
413 } 426 }
414 427
415 if (!watch_found) { 428 if (!watch_found) {
416 get_inotify_watch(&parent->wdata); 429 audit_get_parent(parent);
417 watch->parent = parent; 430 watch->parent = parent;
418 431
419 list_add(&watch->wlist, &parent->watches); 432 list_add(&watch->wlist, &parent->watches);
@@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule,
423 436
424/* Find a matching watch entry, or add this one. 437/* Find a matching watch entry, or add this one.
425 * Caller must hold audit_filter_mutex. */ 438 * Caller must hold audit_filter_mutex. */
426int audit_add_watch(struct audit_krule *krule) 439int audit_add_watch(struct audit_krule *krule, struct list_head **list)
427{ 440{
428 struct audit_watch *watch = krule->watch; 441 struct audit_watch *watch = krule->watch;
429 struct inotify_watch *i_watch;
430 struct audit_parent *parent; 442 struct audit_parent *parent;
431 struct nameidata *ndp = NULL, *ndw = NULL; 443 struct nameidata *ndp = NULL, *ndw = NULL;
432 int ret = 0; 444 int h, ret = 0;
433 445
434 mutex_unlock(&audit_filter_mutex); 446 mutex_unlock(&audit_filter_mutex);
435 447
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule)
441 goto error; 453 goto error;
442 } 454 }
443 455
456 mutex_lock(&audit_filter_mutex);
457
444 /* update watch filter fields */ 458 /* update watch filter fields */
445 if (ndw) { 459 if (ndw) {
446 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; 460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
447 watch->ino = ndw->path.dentry->d_inode->i_ino; 461 watch->ino = ndw->path.dentry->d_inode->i_ino;
448 } 462 }
449 463
450 /* The audit_filter_mutex must not be held during inotify calls because 464 /* either find an old parent or attach a new one */
451 * we hold it during inotify event callback processing. If an existing 465 parent = audit_find_parent(ndp->path.dentry->d_inode);
452 * inotify watch is found, inotify_find_watch() grabs a reference before 466 if (!parent) {
453 * returning.
454 */
455 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
456 &i_watch) < 0) {
457 parent = audit_init_parent(ndp); 467 parent = audit_init_parent(ndp);
458 if (IS_ERR(parent)) { 468 if (IS_ERR(parent)) {
459 /* caller expects mutex locked */
460 mutex_lock(&audit_filter_mutex);
461 ret = PTR_ERR(parent); 469 ret = PTR_ERR(parent);
462 goto error; 470 goto error;
463 } 471 }
464 } else 472 }
465 parent = container_of(i_watch, struct audit_parent, wdata);
466
467 mutex_lock(&audit_filter_mutex);
468 473
469 /* parent was moved before we took audit_filter_mutex */ 474 audit_add_to_parent(krule, parent);
470 if (parent->flags & AUDIT_PARENT_INVALID)
471 ret = -ENOENT;
472 else
473 audit_add_to_parent(krule, parent);
474 475
475 /* match get in audit_init_parent or inotify_find_watch */ 476 /* match get in audit_find_parent or audit_init_parent */
476 put_inotify_watch(&parent->wdata); 477 audit_put_parent(parent);
477 478
479 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h];
478error: 481error:
479 audit_put_nd(ndp, ndw); /* NULL args OK */ 482 audit_put_nd(ndp, ndw); /* NULL args OK */
480 return ret; 483 return ret;
481 484
482} 485}
483 486
484void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) 487void audit_remove_watch_rule(struct audit_krule *krule)
485{ 488{
486 struct audit_watch *watch = krule->watch; 489 struct audit_watch *watch = krule->watch;
487 struct audit_parent *parent = watch->parent; 490 struct audit_parent *parent = watch->parent;
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
492 audit_remove_watch(watch); 495 audit_remove_watch(watch);
493 496
494 if (list_empty(&parent->watches)) { 497 if (list_empty(&parent->watches)) {
495 /* Put parent on the inotify un-registration 498 audit_get_parent(parent);
496 * list. Grab a reference before releasing 499 fsnotify_destroy_mark(&parent->mark);
497 * audit_filter_mutex, to be released in 500 audit_put_parent(parent);
498 * audit_inotify_unregister().
499 * If filesystem is going away, just leave
500 * the sucker alone, eviction will take
501 * care of it. */
502 if (pin_inotify_watch(&parent->wdata))
503 list_add(&parent->ilist, list);
504 } 501 }
505 } 502 }
506} 503}
507 504
508/* Update watch data in audit rules based on inotify events. */ 505static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
509static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, 506 struct fsnotify_mark *inode_mark,
510 u32 cookie, const char *dname, struct inode *inode) 507 struct fsnotify_mark *vfsmount_mark,
508 __u32 mask, void *data, int data_type)
509{
510 return true;
511}
512
513/* Update watch data in audit rules based on fsnotify events. */
514static int audit_watch_handle_event(struct fsnotify_group *group,
515 struct fsnotify_mark *inode_mark,
516 struct fsnotify_mark *vfsmount_mark,
517 struct fsnotify_event *event)
511{ 518{
519 struct inode *inode;
520 __u32 mask = event->mask;
521 const char *dname = event->file_name;
512 struct audit_parent *parent; 522 struct audit_parent *parent;
513 523
514 parent = container_of(i_watch, struct audit_parent, wdata); 524 parent = container_of(inode_mark, struct audit_parent, mark);
515 525
516 if (mask & (IN_CREATE|IN_MOVED_TO) && inode) 526 BUG_ON(group != audit_watch_group);
517 audit_update_watch(parent, dname, inode->i_sb->s_dev, 527
518 inode->i_ino, 0); 528 switch (event->data_type) {
519 else if (mask & (IN_DELETE|IN_MOVED_FROM)) 529 case (FSNOTIFY_EVENT_PATH):
530 inode = event->path.dentry->d_inode;
531 break;
532 case (FSNOTIFY_EVENT_INODE):
533 inode = event->inode;
534 break;
535 default:
536 BUG();
537 inode = NULL;
538 break;
539 };
540
541 if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
542 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
543 else if (mask & (FS_DELETE|FS_MOVED_FROM))
520 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); 544 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
521 /* inotify automatically removes the watch and sends IN_IGNORED */ 545 else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
522 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
523 audit_remove_parent_watches(parent);
524 /* inotify does not remove the watch, so remove it manually */
525 else if(mask & IN_MOVE_SELF) {
526 audit_remove_parent_watches(parent); 546 audit_remove_parent_watches(parent);
527 inotify_remove_watch_locked(audit_ih, i_watch); 547
528 } else if (mask & IN_IGNORED) 548 return 0;
529 put_inotify_watch(i_watch);
530} 549}
531 550
532static const struct inotify_operations audit_inotify_ops = { 551static const struct fsnotify_ops audit_watch_fsnotify_ops = {
533 .handle_event = audit_handle_ievent, 552 .should_send_event = audit_watch_should_send_event,
534 .destroy_watch = audit_free_parent, 553 .handle_event = audit_watch_handle_event,
554 .free_group_priv = NULL,
555 .freeing_mark = NULL,
556 .free_event_priv = NULL,
535}; 557};
536 558
537static int __init audit_watch_init(void) 559static int __init audit_watch_init(void)
538{ 560{
539 audit_ih = inotify_init(&audit_inotify_ops); 561 audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
540 if (IS_ERR(audit_ih)) 562 if (IS_ERR(audit_watch_group)) {
541 audit_panic("cannot initialize inotify handle"); 563 audit_watch_group = NULL;
564 audit_panic("cannot create audit fsnotify group");
565 }
542 return 0; 566 return 0;
543} 567}
544subsys_initcall(audit_watch_init); 568device_initcall(audit_watch_init);
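
Reading aid (not part of the patch): the audit_watch.c hunks above drop the private inotify handle in favour of an fsnotify group. Below is a minimal sketch of that registration pattern, using only the calls and callback signatures visible in the hunks; every name prefixed with example_ is hypothetical.

#include <linux/fsnotify_backend.h>
#include <linux/init.h>
#include <linux/err.h>

static struct fsnotify_group *example_group;

/* Accept every event; real filtering can happen in handle_event. */
static bool example_should_send_event(struct fsnotify_group *group,
				      struct inode *inode,
				      struct fsnotify_mark *inode_mark,
				      struct fsnotify_mark *vfsmount_mark,
				      __u32 mask, void *data, int data_type)
{
	return true;
}

static int example_handle_event(struct fsnotify_group *group,
				struct fsnotify_mark *inode_mark,
				struct fsnotify_mark *vfsmount_mark,
				struct fsnotify_event *event)
{
	/* React to event->mask and event->file_name here. */
	return 0;
}

static const struct fsnotify_ops example_fsnotify_ops = {
	.should_send_event = example_should_send_event,
	.handle_event      = example_handle_event,
	.free_group_priv   = NULL,
	.freeing_mark      = NULL,
	.free_event_priv   = NULL,
};

static int __init example_init(void)
{
	example_group = fsnotify_alloc_group(&example_fsnotify_ops);
	if (IS_ERR(example_group))
		return PTR_ERR(example_group);	/* mirrors the audit error path */
	return 0;
}
device_initcall(example_init);
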
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce08041f578d..eb7675499fb5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e)
71{ 71{
72 int i; 72 int i;
73 struct audit_krule *erule = &e->rule; 73 struct audit_krule *erule = &e->rule;
74
74 /* some rules don't have associated watches */ 75 /* some rules don't have associated watches */
75 if (erule->watch) 76 if (erule->watch)
76 audit_put_watch(erule->watch); 77 audit_put_watch(erule->watch);
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
746 * rule with the new rule in the filterlist, then free the old rule. 747 * rule with the new rule in the filterlist, then free the old rule.
747 * The rlist element is undefined; list manipulations are handled apart from 748 * The rlist element is undefined; list manipulations are handled apart from
748 * the initial copy. */ 749 * the initial copy. */
749struct audit_entry *audit_dupe_rule(struct audit_krule *old, 750struct audit_entry *audit_dupe_rule(struct audit_krule *old)
750 struct audit_watch *watch)
751{ 751{
752 u32 fcount = old->field_count; 752 u32 fcount = old->field_count;
753 struct audit_entry *entry; 753 struct audit_entry *entry;
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
769 new->prio = old->prio; 769 new->prio = old->prio;
770 new->buflen = old->buflen; 770 new->buflen = old->buflen;
771 new->inode_f = old->inode_f; 771 new->inode_f = old->inode_f;
772 new->watch = NULL;
773 new->field_count = old->field_count; 772 new->field_count = old->field_count;
773
774 /* 774 /*
775 * note that we are OK with not refcounting here; audit_match_tree() 775 * note that we are OK with not refcounting here; audit_match_tree()
776 * never dereferences tree and we can't get false positives there 776 * never dereferences tree and we can't get false positives there
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
811 } 811 }
812 } 812 }
813 813
814 if (watch) { 814 if (old->watch) {
815 audit_get_watch(watch); 815 audit_get_watch(old->watch);
816 new->watch = watch; 816 new->watch = old->watch;
817 } 817 }
818 818
819 return entry; 819 return entry;
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
866 struct audit_watch *watch = entry->rule.watch; 866 struct audit_watch *watch = entry->rule.watch;
867 struct audit_tree *tree = entry->rule.tree; 867 struct audit_tree *tree = entry->rule.tree;
868 struct list_head *list; 868 struct list_head *list;
869 int h, err; 869 int err;
870#ifdef CONFIG_AUDITSYSCALL 870#ifdef CONFIG_AUDITSYSCALL
871 int dont_count = 0; 871 int dont_count = 0;
872 872
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry)
889 889
890 if (watch) { 890 if (watch) {
891 /* audit_filter_mutex is dropped and re-taken during this call */ 891 /* audit_filter_mutex is dropped and re-taken during this call */
892 err = audit_add_watch(&entry->rule); 892 err = audit_add_watch(&entry->rule, &list);
893 if (err) { 893 if (err) {
894 mutex_unlock(&audit_filter_mutex); 894 mutex_unlock(&audit_filter_mutex);
895 goto error; 895 goto error;
896 } 896 }
897 /* entry->rule.watch may have changed during audit_add_watch() */
898 watch = entry->rule.watch;
899 h = audit_hash_ino((u32)audit_watch_inode(watch));
900 list = &audit_inode_hash[h];
901 } 897 }
902 if (tree) { 898 if (tree) {
903 err = audit_add_tree_rule(&entry->rule); 899 err = audit_add_tree_rule(&entry->rule);
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
949 struct audit_watch *watch = entry->rule.watch; 945 struct audit_watch *watch = entry->rule.watch;
950 struct audit_tree *tree = entry->rule.tree; 946 struct audit_tree *tree = entry->rule.tree;
951 struct list_head *list; 947 struct list_head *list;
952 LIST_HEAD(inotify_list);
953 int ret = 0; 948 int ret = 0;
954#ifdef CONFIG_AUDITSYSCALL 949#ifdef CONFIG_AUDITSYSCALL
955 int dont_count = 0; 950 int dont_count = 0;
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry)
969 } 964 }
970 965
971 if (e->rule.watch) 966 if (e->rule.watch)
972 audit_remove_watch_rule(&e->rule, &inotify_list); 967 audit_remove_watch_rule(&e->rule);
973 968
974 if (e->rule.tree) 969 if (e->rule.tree)
975 audit_remove_tree_rule(&e->rule); 970 audit_remove_tree_rule(&e->rule);
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
987#endif 982#endif
988 mutex_unlock(&audit_filter_mutex); 983 mutex_unlock(&audit_filter_mutex);
989 984
990 if (!list_empty(&inotify_list))
991 audit_inotify_unregister(&inotify_list);
992
993out: 985out:
994 if (watch) 986 if (watch)
995 audit_put_watch(watch); /* match initial get */ 987 audit_put_watch(watch); /* match initial get */
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r)
1323{ 1315{
1324 struct audit_entry *entry = container_of(r, struct audit_entry, rule); 1316 struct audit_entry *entry = container_of(r, struct audit_entry, rule);
1325 struct audit_entry *nentry; 1317 struct audit_entry *nentry;
1326 struct audit_watch *watch;
1327 struct audit_tree *tree;
1328 int err = 0; 1318 int err = 0;
1329 1319
1330 if (!security_audit_rule_known(r)) 1320 if (!security_audit_rule_known(r))
1331 return 0; 1321 return 0;
1332 1322
1333 watch = r->watch; 1323 nentry = audit_dupe_rule(r);
1334 tree = r->tree;
1335 nentry = audit_dupe_rule(r, watch);
1336 if (IS_ERR(nentry)) { 1324 if (IS_ERR(nentry)) {
1337 /* save the first error encountered for the 1325 /* save the first error encountered for the
1338 * return value */ 1326 * return value */
1339 err = PTR_ERR(nentry); 1327 err = PTR_ERR(nentry);
1340 audit_panic("error updating LSM filters"); 1328 audit_panic("error updating LSM filters");
1341 if (watch) 1329 if (r->watch)
1342 list_del(&r->rlist); 1330 list_del(&r->rlist);
1343 list_del_rcu(&entry->list); 1331 list_del_rcu(&entry->list);
1344 list_del(&r->list); 1332 list_del(&r->list);
1345 } else { 1333 } else {
1346 if (watch) { 1334 if (r->watch || r->tree)
1347 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1348 list_del(&r->rlist);
1349 } else if (tree)
1350 list_replace_init(&r->rlist, &nentry->rule.rlist); 1335 list_replace_init(&r->rlist, &nentry->rule.rlist);
1351 list_replace_rcu(&entry->list, &nentry->list); 1336 list_replace_rcu(&entry->list, &nentry->list);
1352 list_replace(&r->list, &nentry->rule.list); 1337 list_replace(&r->list, &nentry->rule.list);
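
Reading aid (not part of the patch): after this change audit_add_watch() hands the destination hash bucket back through a pointer instead of the caller recomputing audit_hash_ino() itself. A condensed sketch of the new caller pattern follows; example_insert_watch_rule is a hypothetical helper, the real logic lives in audit_add_rule() above.

static int example_insert_watch_rule(struct audit_entry *entry)
{
	struct list_head *list;
	int err;

	/* audit_filter_mutex is dropped and re-taken inside this call. */
	err = audit_add_watch(&entry->rule, &list);
	if (err)
		return err;

	/* 'list' already points at the right audit_inode_hash[] bucket. */
	list_add_rcu(&entry->list, list);
	return 0;
}
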
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3828ad5fb8f1..1b31c130d034 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,7 +65,6 @@
65#include <linux/binfmts.h> 65#include <linux/binfmts.h>
66#include <linux/highmem.h> 66#include <linux/highmem.h>
67#include <linux/syscalls.h> 67#include <linux/syscalls.h>
68#include <linux/inotify.h>
69#include <linux/capability.h> 68#include <linux/capability.h>
70#include <linux/fs_struct.h> 69#include <linux/fs_struct.h>
71 70
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk,
549 } 548 }
550 break; 549 break;
551 case AUDIT_WATCH: 550 case AUDIT_WATCH:
552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) 551 if (name)
553 result = (name->dev == audit_watch_dev(rule->watch) && 552 result = audit_watch_compare(rule->watch, name->ino, name->dev);
554 name->ino == audit_watch_inode(rule->watch));
555 break; 553 break;
556 case AUDIT_DIR: 554 case AUDIT_DIR:
557 if (ctx) 555 if (ctx)
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode)
1726 struct audit_tree_refs *p; 1724 struct audit_tree_refs *p;
1727 struct audit_chunk *chunk; 1725 struct audit_chunk *chunk;
1728 int count; 1726 int count;
1729 if (likely(list_empty(&inode->inotify_watches))) 1727 if (likely(hlist_empty(&inode->i_fsnotify_marks)))
1730 return; 1728 return;
1731 context = current->audit_context; 1729 context = current->audit_context;
1732 p = context->trees; 1730 p = context->trees;
@@ -1769,7 +1767,7 @@ retry:
1769 seq = read_seqbegin(&rename_lock); 1767 seq = read_seqbegin(&rename_lock);
1770 for(;;) { 1768 for(;;) {
1771 struct inode *inode = d->d_inode; 1769 struct inode *inode = d->d_inode;
1772 if (inode && unlikely(!list_empty(&inode->inotify_watches))) { 1770 if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
1773 struct audit_chunk *chunk; 1771 struct audit_chunk *chunk;
1774 chunk = audit_tree_lookup(inode); 1772 chunk = audit_tree_lookup(inode);
1775 if (chunk) { 1773 if (chunk) {
@@ -1837,13 +1835,8 @@ void __audit_getname(const char *name)
1837 context->names[context->name_count].ino = (unsigned long)-1; 1835 context->names[context->name_count].ino = (unsigned long)-1;
1838 context->names[context->name_count].osid = 0; 1836 context->names[context->name_count].osid = 0;
1839 ++context->name_count; 1837 ++context->name_count;
1840 if (!context->pwd.dentry) { 1838 if (!context->pwd.dentry)
1841 read_lock(&current->fs->lock); 1839 get_fs_pwd(current->fs, &context->pwd);
1842 context->pwd = current->fs->pwd;
1843 path_get(&current->fs->pwd);
1844 read_unlock(&current->fs->lock);
1845 }
1846
1847} 1840}
1848 1841
1849/* audit_putname - intercept a putname request 1842/* audit_putname - intercept a putname request
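
Reading aid (not part of the patch): the __audit_getname() hunk swaps the open-coded fs->lock / path_get() sequence for the get_fs_pwd() helper. A small usage sketch under the assumption that the helper copies fs->pwd and takes a path reference which the caller later drops with path_put(); example_snapshot_pwd is a hypothetical name.

#include <linux/fs_struct.h>
#include <linux/path.h>
#include <linux/sched.h>

static void example_snapshot_pwd(struct path *pwd)
{
	/* Copies current->fs->pwd into *pwd and grabs a reference. */
	get_fs_pwd(current->fs, pwd);
}

/* The caller eventually releases it with path_put(pwd). */
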
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3ac6f5b0a64b..5cf366965d0c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,6 @@
52#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
53#include <linux/hash.h> 53#include <linux/hash.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/smp_lock.h>
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
@@ -138,7 +137,7 @@ struct css_id {
138 * is called after synchronize_rcu(). But for safe use, css_is_removed() 137 * is called after synchronize_rcu(). But for safe use, css_is_removed()
139 * css_tryget() should be used for avoiding race. 138 * css_tryget() should be used for avoiding race.
140 */ 139 */
141 struct cgroup_subsys_state *css; 140 struct cgroup_subsys_state __rcu *css;
142 /* 141 /*
143 * ID of this css. 142 * ID of this css.
144 */ 143 */
@@ -244,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)
244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 243 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
245} 244}
246 245
246static int clone_children(const struct cgroup *cgrp)
247{
248 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
249}
250
247/* 251/*
248 * for_each_subsys() allows you to iterate on each subsystem attached to 252 * for_each_subsys() allows you to iterate on each subsystem attached to
249 * an active hierarchy 253 * an active hierarchy
@@ -778,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
778 struct inode *inode = new_inode(sb); 782 struct inode *inode = new_inode(sb);
779 783
780 if (inode) { 784 if (inode) {
785 inode->i_ino = get_next_ino();
781 inode->i_mode = mode; 786 inode->i_mode = mode;
782 inode->i_uid = current_fsuid(); 787 inode->i_uid = current_fsuid();
783 inode->i_gid = current_fsgid(); 788 inode->i_gid = current_fsgid();
@@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1040 seq_puts(seq, ",noprefix"); 1045 seq_puts(seq, ",noprefix");
1041 if (strlen(root->release_agent_path)) 1046 if (strlen(root->release_agent_path))
1042 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1047 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1048 if (clone_children(&root->top_cgroup))
1049 seq_puts(seq, ",clone_children");
1043 if (strlen(root->name)) 1050 if (strlen(root->name))
1044 seq_printf(seq, ",name=%s", root->name); 1051 seq_printf(seq, ",name=%s", root->name);
1045 mutex_unlock(&cgroup_mutex); 1052 mutex_unlock(&cgroup_mutex);
@@ -1050,6 +1057,7 @@ struct cgroup_sb_opts {
1050 unsigned long subsys_bits; 1057 unsigned long subsys_bits;
1051 unsigned long flags; 1058 unsigned long flags;
1052 char *release_agent; 1059 char *release_agent;
1060 bool clone_children;
1053 char *name; 1061 char *name;
1054 /* User explicitly requested empty subsystem */ 1062 /* User explicitly requested empty subsystem */
1055 bool none; 1063 bool none;
@@ -1066,7 +1074,8 @@ struct cgroup_sb_opts {
1066 */ 1074 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1075static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1068{ 1076{
1069 char *token, *o = data ?: "all"; 1077 char *token, *o = data;
1078 bool all_ss = false, one_ss = false;
1070 unsigned long mask = (unsigned long)-1; 1079 unsigned long mask = (unsigned long)-1;
1071 int i; 1080 int i;
1072 bool module_pin_failed = false; 1081 bool module_pin_failed = false;
@@ -1082,30 +1091,37 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1082 while ((token = strsep(&o, ",")) != NULL) { 1091 while ((token = strsep(&o, ",")) != NULL) {
1083 if (!*token) 1092 if (!*token)
1084 return -EINVAL; 1093 return -EINVAL;
1085 if (!strcmp(token, "all")) { 1094 if (!strcmp(token, "none")) {
1086 /* Add all non-disabled subsystems */
1087 opts->subsys_bits = 0;
1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
1092 if (!ss->disabled)
1093 opts->subsys_bits |= 1ul << i;
1094 }
1095 } else if (!strcmp(token, "none")) {
1096 /* Explicitly have no subsystems */ 1095 /* Explicitly have no subsystems */
1097 opts->none = true; 1096 opts->none = true;
1098 } else if (!strcmp(token, "noprefix")) { 1097 continue;
1098 }
1099 if (!strcmp(token, "all")) {
1100 /* Mutually exclusive option 'all' + subsystem name */
1101 if (one_ss)
1102 return -EINVAL;
1103 all_ss = true;
1104 continue;
1105 }
1106 if (!strcmp(token, "noprefix")) {
1099 set_bit(ROOT_NOPREFIX, &opts->flags); 1107 set_bit(ROOT_NOPREFIX, &opts->flags);
1100 } else if (!strncmp(token, "release_agent=", 14)) { 1108 continue;
1109 }
1110 if (!strcmp(token, "clone_children")) {
1111 opts->clone_children = true;
1112 continue;
1113 }
1114 if (!strncmp(token, "release_agent=", 14)) {
1101 /* Specifying two release agents is forbidden */ 1115 /* Specifying two release agents is forbidden */
1102 if (opts->release_agent) 1116 if (opts->release_agent)
1103 return -EINVAL; 1117 return -EINVAL;
1104 opts->release_agent = 1118 opts->release_agent =
1105 kstrndup(token + 14, PATH_MAX, GFP_KERNEL); 1119 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1106 if (!opts->release_agent) 1120 if (!opts->release_agent)
1107 return -ENOMEM; 1121 return -ENOMEM;
1108 } else if (!strncmp(token, "name=", 5)) { 1122 continue;
1123 }
1124 if (!strncmp(token, "name=", 5)) {
1109 const char *name = token + 5; 1125 const char *name = token + 5;
1110 /* Can't specify an empty name */ 1126 /* Can't specify an empty name */
1111 if (!strlen(name)) 1127 if (!strlen(name))
@@ -1123,24 +1139,48 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1123 if (opts->name) 1139 if (opts->name)
1124 return -EINVAL; 1140 return -EINVAL;
1125 opts->name = kstrndup(name, 1141 opts->name = kstrndup(name,
1126 MAX_CGROUP_ROOT_NAMELEN, 1142 MAX_CGROUP_ROOT_NAMELEN - 1,
1127 GFP_KERNEL); 1143 GFP_KERNEL);
1128 if (!opts->name) 1144 if (!opts->name)
1129 return -ENOMEM; 1145 return -ENOMEM;
1130 } else { 1146
1131 struct cgroup_subsys *ss; 1147 continue;
1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1148 }
1133 ss = subsys[i]; 1149
1134 if (ss == NULL) 1150 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1135 continue; 1151 struct cgroup_subsys *ss = subsys[i];
1136 if (!strcmp(token, ss->name)) { 1152 if (ss == NULL)
1137 if (!ss->disabled) 1153 continue;
1138 set_bit(i, &opts->subsys_bits); 1154 if (strcmp(token, ss->name))
1139 break; 1155 continue;
1140 } 1156 if (ss->disabled)
1141 } 1157 continue;
1142 if (i == CGROUP_SUBSYS_COUNT) 1158
1143 return -ENOENT; 1159 /* Mutually exclusive option 'all' + subsystem name */
1160 if (all_ss)
1161 return -EINVAL;
1162 set_bit(i, &opts->subsys_bits);
1163 one_ss = true;
1164
1165 break;
1166 }
1167 if (i == CGROUP_SUBSYS_COUNT)
1168 return -ENOENT;
1169 }
1170
1171 /*
1172 * If the 'all' option was specified select all the subsystems,
1173 * otherwise 'all, 'none' and a subsystem name options were not
1174 * specified, let's default to 'all'
1175 */
1176 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1177 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1178 struct cgroup_subsys *ss = subsys[i];
1179 if (ss == NULL)
1180 continue;
1181 if (ss->disabled)
1182 continue;
1183 set_bit(i, &opts->subsys_bits);
1144 } 1184 }
1145 } 1185 }
1146 1186
@@ -1222,7 +1262,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1222 struct cgroup *cgrp = &root->top_cgroup; 1262 struct cgroup *cgrp = &root->top_cgroup;
1223 struct cgroup_sb_opts opts; 1263 struct cgroup_sb_opts opts;
1224 1264
1225 lock_kernel();
1226 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1265 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1227 mutex_lock(&cgroup_mutex); 1266 mutex_lock(&cgroup_mutex);
1228 1267
@@ -1255,7 +1294,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1255 kfree(opts.name); 1294 kfree(opts.name);
1256 mutex_unlock(&cgroup_mutex); 1295 mutex_unlock(&cgroup_mutex);
1257 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1296 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1258 unlock_kernel();
1259 return ret; 1297 return ret;
1260} 1298}
1261 1299
@@ -1357,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1357 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1358 if (opts->name) 1396 if (opts->name)
1359 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1398 if (opts->clone_children)
1399 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1360 return root; 1400 return root;
1361} 1401}
1362 1402
@@ -1568,7 +1608,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1568 out_err: 1608 out_err:
1569 kfree(opts.release_agent); 1609 kfree(opts.release_agent);
1570 kfree(opts.name); 1610 kfree(opts.name);
1571
1572 return ret; 1611 return ret;
1573} 1612}
1574 1613
@@ -1623,6 +1662,8 @@ static struct file_system_type cgroup_fs_type = {
1623 .kill_sb = cgroup_kill_sb, 1662 .kill_sb = cgroup_kill_sb,
1624}; 1663};
1625 1664
1665static struct kobject *cgroup_kobj;
1666
1626static inline struct cgroup *__d_cgrp(struct dentry *dentry) 1667static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1627{ 1668{
1628 return dentry->d_fsdata; 1669 return dentry->d_fsdata;
@@ -1788,6 +1829,30 @@ out:
1788 return retval; 1829 return retval;
1789} 1830}
1790 1831
1832/**
1833 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
1834 * @from: attach to all cgroups of a given task
1835 * @tsk: the task to be attached
1836 */
1837int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1838{
1839 struct cgroupfs_root *root;
1840 int retval = 0;
1841
1842 cgroup_lock();
1843 for_each_active_root(root) {
1844 struct cgroup *from_cg = task_cgroup_from_root(from, root);
1845
1846 retval = cgroup_attach_task(from_cg, tsk);
1847 if (retval)
1848 break;
1849 }
1850 cgroup_unlock();
1851
1852 return retval;
1853}
1854EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1855
1791/* 1856/*
1792 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1857 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
1793 * held. May take task_lock of task 1858 * held. May take task_lock of task
@@ -1857,6 +1922,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1857 const char *buffer) 1922 const char *buffer)
1858{ 1923{
1859 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 1924 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1925 if (strlen(buffer) >= PATH_MAX)
1926 return -EINVAL;
1860 if (!cgroup_lock_live_group(cgrp)) 1927 if (!cgroup_lock_live_group(cgrp))
1861 return -ENODEV; 1928 return -ENODEV;
1862 strcpy(cgrp->root->release_agent_path, buffer); 1929 strcpy(cgrp->root->release_agent_path, buffer);
@@ -3150,6 +3217,23 @@ fail:
3150 return ret; 3217 return ret;
3151} 3218}
3152 3219
3220static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3221 struct cftype *cft)
3222{
3223 return clone_children(cgrp);
3224}
3225
3226static int cgroup_clone_children_write(struct cgroup *cgrp,
3227 struct cftype *cft,
3228 u64 val)
3229{
3230 if (val)
3231 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3232 else
3233 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3234 return 0;
3235}
3236
3153/* 3237/*
3154 * for the common functions, 'private' gives the type of file 3238 * for the common functions, 'private' gives the type of file
3155 */ 3239 */
@@ -3180,6 +3264,11 @@ static struct cftype files[] = {
3180 .write_string = cgroup_write_event_control, 3264 .write_string = cgroup_write_event_control,
3181 .mode = S_IWUGO, 3265 .mode = S_IWUGO,
3182 }, 3266 },
3267 {
3268 .name = "cgroup.clone_children",
3269 .read_u64 = cgroup_clone_children_read,
3270 .write_u64 = cgroup_clone_children_write,
3271 },
3183}; 3272};
3184 3273
3185static struct cftype cft_release_agent = { 3274static struct cftype cft_release_agent = {
@@ -3309,6 +3398,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3309 if (notify_on_release(parent)) 3398 if (notify_on_release(parent))
3310 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3399 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3311 3400
3401 if (clone_children(parent))
3402 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3403
3312 for_each_subsys(root, ss) { 3404 for_each_subsys(root, ss) {
3313 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3405 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3314 3406
@@ -3323,6 +3415,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3323 goto err_destroy; 3415 goto err_destroy;
3324 } 3416 }
3325 /* At error, ->destroy() callback has to free assigned ID. */ 3417 /* At error, ->destroy() callback has to free assigned ID. */
3418 if (clone_children(parent) && ss->post_clone)
3419 ss->post_clone(ss, cgrp);
3326 } 3420 }
3327 3421
3328 cgroup_lock_hierarchy(root); 3422 cgroup_lock_hierarchy(root);
@@ -3871,9 +3965,18 @@ int __init cgroup_init(void)
3871 hhead = css_set_hash(init_css_set.subsys); 3965 hhead = css_set_hash(init_css_set.subsys);
3872 hlist_add_head(&init_css_set.hlist, hhead); 3966 hlist_add_head(&init_css_set.hlist, hhead);
3873 BUG_ON(!init_root_id(&rootnode)); 3967 BUG_ON(!init_root_id(&rootnode));
3968
3969 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
3970 if (!cgroup_kobj) {
3971 err = -ENOMEM;
3972 goto out;
3973 }
3974
3874 err = register_filesystem(&cgroup_fs_type); 3975 err = register_filesystem(&cgroup_fs_type);
3875 if (err < 0) 3976 if (err < 0) {
3977 kobject_put(cgroup_kobj);
3876 goto out; 3978 goto out;
3979 }
3877 3980
3878 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 3981 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
3879 3982
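
Reading aid (not part of the patch): cgroup_attach_task_all() is newly exported in the hunks above. A minimal caller sketch, assuming only the behaviour stated in its kerneldoc comment (walk every active hierarchy and attach tsk alongside from); example_adopt_cgroups is a hypothetical name.

#include <linux/cgroup.h>
#include <linux/sched.h>

static int example_adopt_cgroups(struct task_struct *from,
				 struct task_struct *tsk)
{
	/* Returns 0 on success, or the first failing hierarchy's error. */
	return cgroup_attach_task_all(from, tsk);
}
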
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e7bebb7c6c38 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51int cgroup_freezing_or_frozen(struct task_struct *task) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
52{ 52{
53 struct freezer *freezer; 53 enum freezer_state state = task_freezer(task)->state;
54 enum freezer_state state; 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
55}
55 56
57int cgroup_freezing_or_frozen(struct task_struct *task)
58{
59 int result;
56 task_lock(task); 60 task_lock(task);
57 freezer = task_freezer(task); 61 result = __cgroup_freezing_or_frozen(task);
58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
62 task_unlock(task); 62 task_unlock(task);
63 63 return result;
64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
65} 64}
66 65
67/* 66/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
154 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
155} 154}
156 155
157/* Task is frozen or will freeze immediately when next it gets woken */
158static bool is_task_frozen_enough(struct task_struct *task)
159{
160 return frozen(task) ||
161 (task_is_stopped_or_traced(task) && freezing(task));
162}
163
164/* 156/*
165 * The call to cgroup_lock() in the freezer.state write method prevents 157 * The call to cgroup_lock() in the freezer.state write method prevents
166 * a write to that file racing against an attach, and hence the 158 * a write to that file racing against an attach, and hence the
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
174 166
175 /* 167 /*
176 * Anything frozen can't move or be moved to/from. 168 * Anything frozen can't move or be moved to/from.
177 *
178 * Since orig_freezer->state == FROZEN means that @task has been
179 * frozen, so it's sufficient to check the latter condition.
180 */ 169 */
181 170
182 if (is_task_frozen_enough(task)) 171 freezer = cgroup_freezer(new_cgroup);
172 if (freezer->state != CGROUP_THAWED)
183 return -EBUSY; 173 return -EBUSY;
184 174
185 freezer = cgroup_freezer(new_cgroup); 175 rcu_read_lock();
186 if (freezer->state == CGROUP_FROZEN) 176 if (__cgroup_freezing_or_frozen(task)) {
177 rcu_read_unlock();
187 return -EBUSY; 178 return -EBUSY;
179 }
180 rcu_read_unlock();
188 181
189 if (threadgroup) { 182 if (threadgroup) {
190 struct task_struct *c; 183 struct task_struct *c;
191 184
192 rcu_read_lock(); 185 rcu_read_lock();
193 list_for_each_entry_rcu(c, &task->thread_group, thread_group) { 186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
194 if (is_task_frozen_enough(c)) { 187 if (__cgroup_freezing_or_frozen(c)) {
195 rcu_read_unlock(); 188 rcu_read_unlock();
196 return -EBUSY; 189 return -EBUSY;
197 } 190 }
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
236/* 229/*
237 * caller must hold freezer->lock 230 * caller must hold freezer->lock
238 */ 231 */
239static void update_freezer_state(struct cgroup *cgroup, 232static void update_if_frozen(struct cgroup *cgroup,
240 struct freezer *freezer) 233 struct freezer *freezer)
241{ 234{
242 struct cgroup_iter it; 235 struct cgroup_iter it;
243 struct task_struct *task; 236 struct task_struct *task;
244 unsigned int nfrozen = 0, ntotal = 0; 237 unsigned int nfrozen = 0, ntotal = 0;
238 enum freezer_state old_state = freezer->state;
245 239
246 cgroup_iter_start(cgroup, &it); 240 cgroup_iter_start(cgroup, &it);
247 while ((task = cgroup_iter_next(cgroup, &it))) { 241 while ((task = cgroup_iter_next(cgroup, &it))) {
248 ntotal++; 242 ntotal++;
249 if (is_task_frozen_enough(task)) 243 if (frozen(task))
250 nfrozen++; 244 nfrozen++;
251 } 245 }
252 246
253 /* 247 if (old_state == CGROUP_THAWED) {
254 * Transition to FROZEN when no new tasks can be added ensures 248 BUG_ON(nfrozen > 0);
255 * that we never exist in the FROZEN state while there are unfrozen 249 } else if (old_state == CGROUP_FREEZING) {
256 * tasks. 250 if (nfrozen == ntotal)
257 */ 251 freezer->state = CGROUP_FROZEN;
258 if (nfrozen == ntotal) 252 } else { /* old_state == CGROUP_FROZEN */
259 freezer->state = CGROUP_FROZEN; 253 BUG_ON(nfrozen != ntotal);
260 else if (nfrozen > 0) 254 }
261 freezer->state = CGROUP_FREEZING; 255
262 else
263 freezer->state = CGROUP_THAWED;
264 cgroup_iter_end(cgroup, &it); 256 cgroup_iter_end(cgroup, &it);
265} 257}
266 258
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
279 if (state == CGROUP_FREEZING) { 271 if (state == CGROUP_FREEZING) {
280 /* We change from FREEZING to FROZEN lazily if the cgroup was 272 /* We change from FREEZING to FROZEN lazily if the cgroup was
281 * only partially frozen when we exitted write. */ 273 * only partially frozen when we exitted write. */
282 update_freezer_state(cgroup, freezer); 274 update_if_frozen(cgroup, freezer);
283 state = freezer->state; 275 state = freezer->state;
284 } 276 }
285 spin_unlock_irq(&freezer->lock); 277 spin_unlock_irq(&freezer->lock);
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
301 while ((task = cgroup_iter_next(cgroup, &it))) { 293 while ((task = cgroup_iter_next(cgroup, &it))) {
302 if (!freeze_task(task, true)) 294 if (!freeze_task(task, true))
303 continue; 295 continue;
304 if (is_task_frozen_enough(task)) 296 if (frozen(task))
305 continue; 297 continue;
306 if (!freezing(task) && !freezer_should_skip(task)) 298 if (!freezing(task) && !freezer_should_skip(task))
307 num_cant_freeze_now++; 299 num_cant_freeze_now++;
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup,
335 327
336 spin_lock_irq(&freezer->lock); 328 spin_lock_irq(&freezer->lock);
337 329
338 update_freezer_state(cgroup, freezer); 330 update_if_frozen(cgroup, freezer);
339 if (goal_state == freezer->state) 331 if (goal_state == freezer->state)
340 goto out; 332 goto out;
341 333
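
Reading aid (not part of the patch): the freezer hunks split the check into an unlocked helper plus callers that take either task_lock() or rcu_read_lock(), matching how they already walk the thread group. A sketch of a hypothetical RCU-side caller living in the same file:

static int example_freezing_or_frozen_rcu(struct task_struct *task)
{
	int ret;

	rcu_read_lock();
	ret = __cgroup_freezing_or_frozen(task);	/* unlocked helper above */
	rcu_read_unlock();
	return ret;
}
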
diff --git a/kernel/compat.c b/kernel/compat.c
index 5adab05a3172..c9e2ec0b34a8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
279 struct compat_rlimit __user *rlim) 279 struct compat_rlimit __user *rlim)
280{ 280{
281 struct rlimit r; 281 struct rlimit r;
282 int ret;
283 mm_segment_t old_fs = get_fs ();
284
285 if (resource >= RLIM_NLIMITS)
286 return -EINVAL;
287 282
288 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || 283 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
289 __get_user(r.rlim_cur, &rlim->rlim_cur) || 284 __get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
294 r.rlim_cur = RLIM_INFINITY; 289 r.rlim_cur = RLIM_INFINITY;
295 if (r.rlim_max == COMPAT_RLIM_INFINITY) 290 if (r.rlim_max == COMPAT_RLIM_INFINITY)
296 r.rlim_max = RLIM_INFINITY; 291 r.rlim_max = RLIM_INFINITY;
297 set_fs(KERNEL_DS); 292 return do_prlimit(current, resource, &r, NULL);
298 ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
299 set_fs(old_fs);
300 return ret;
301} 293}
302 294
303#ifdef COMPAT_RLIM_OLD_INFINITY 295#ifdef COMPAT_RLIM_OLD_INFINITY
@@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
329 321
330#endif 322#endif
331 323
332asmlinkage long compat_sys_getrlimit (unsigned int resource, 324asmlinkage long compat_sys_getrlimit(unsigned int resource,
333 struct compat_rlimit __user *rlim) 325 struct compat_rlimit __user *rlim)
334{ 326{
335 struct rlimit r; 327 struct rlimit r;
336 int ret; 328 int ret;
337 mm_segment_t old_fs = get_fs();
338 329
339 set_fs(KERNEL_DS); 330 ret = do_prlimit(current, resource, NULL, &r);
340 ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
341 set_fs(old_fs);
342 if (!ret) { 331 if (!ret) {
343 if (r.rlim_cur > COMPAT_RLIM_INFINITY) 332 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
344 r.rlim_cur = COMPAT_RLIM_INFINITY; 333 r.rlim_cur = COMPAT_RLIM_INFINITY;
@@ -1137,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1137 1126
1138 return 0; 1127 return 0;
1139} 1128}
1129
1130/*
1131 * Allocate user-space memory for the duration of a single system call,
1132 * in order to marshall parameters inside a compat thunk.
1133 */
1134void __user *compat_alloc_user_space(unsigned long len)
1135{
1136 void __user *ptr;
1137
1138 /* If len would occupy more than half of the entire compat space... */
1139 if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
1140 return NULL;
1141
1142 ptr = arch_compat_alloc_user_space(len);
1143
1144 if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
1145 return NULL;
1146
1147 return ptr;
1148}
1149EXPORT_SYMBOL_GPL(compat_alloc_user_space);
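
Reading aid (not part of the patch): compat_alloc_user_space() is meant to be called from compat syscall thunks, as its comment says. Below is a sketch of the usual marshalling pattern under that assumption; every example_* and compat_example_* name is hypothetical.

#include <linux/compat.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

struct compat_example_args {
	compat_ulong_t a;
	compat_ulong_t b;
};

struct example_args {
	unsigned long a;
	unsigned long b;
};

static long example_native_call(struct example_args __user *args)
{
	/* Stand-in for the real 64-bit syscall handler. */
	return 0;
}

static long example_compat_thunk(struct compat_example_args __user *carg)
{
	struct example_args __user *narg;
	compat_ulong_t a, b;

	if (get_user(a, &carg->a) || get_user(b, &carg->b))
		return -EFAULT;

	/* Scratch area carved out of the user address range; nothing to free. */
	narg = compat_alloc_user_space(sizeof(*narg));
	if (!narg)
		return -EFAULT;

	if (put_user((unsigned long)a, &narg->a) ||
	    put_user((unsigned long)b, &narg->b))
		return -EFAULT;

	return example_native_call(narg);
}
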
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecbf..b4066b44a99d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
66static const struct file_operations ikconfig_file_ops = { 66static const struct file_operations ikconfig_file_ops = {
67 .owner = THIS_MODULE, 67 .owner = THIS_MODULE,
68 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
69 .llseek = default_llseek,
69}; 70};
70 71
71static int __init ikconfig_init(void) 72static int __init ikconfig_init(void)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
235 return -EINVAL; 235 return -EINVAL;
236 236
237 cpu_hotplug_begin(); 237 cpu_hotplug_begin();
238 set_cpu_active(cpu, false);
239 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 238 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
240 if (err) { 239 if (err) {
241 set_cpu_active(cpu, true);
242
243 nr_calls--; 240 nr_calls--;
244 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 241 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
245 printk("%s: attempt to take down CPU %u failed\n", 242 printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
249 246
250 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 247 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
251 if (err) { 248 if (err) {
252 set_cpu_active(cpu, true);
253 /* CPU didn't die: tell everyone. Can't complain. */ 249 /* CPU didn't die: tell everyone. Can't complain. */
254 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 250 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
255 251
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
321 goto out_notify; 317 goto out_notify;
322 BUG_ON(!cpu_online(cpu)); 318 BUG_ON(!cpu_online(cpu));
323 319
324 set_cpu_active(cpu, true);
325
326 /* Now call notifier in preparation. */ 320 /* Now call notifier in preparation. */
327 cpu_notify(CPU_ONLINE | mod, hcpu); 321 cpu_notify(CPU_ONLINE | mod, hcpu);
328 322
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02b9611eadde..51b143e2a07a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -105,7 +105,7 @@ struct cpuset {
105 /* for custom sched domain */ 105 /* for custom sched domain */
106 int relax_domain_level; 106 int relax_domain_level;
107 107
108 /* used for walking a cpuset heirarchy */ 108 /* used for walking a cpuset hierarchy */
109 struct list_head stack_list; 109 struct list_head stack_list;
110}; 110};
111 111
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1397 if (tsk->flags & PF_THREAD_BOUND) 1397 if (tsk->flags & PF_THREAD_BOUND)
1398 return -EINVAL; 1398 return -EINVAL;
1399 1399
1400 ret = security_task_setscheduler(tsk, 0, NULL); 1400 ret = security_task_setscheduler(tsk);
1401 if (ret) 1401 if (ret)
1402 return ret; 1402 return ret;
1403 if (threadgroup) { 1403 if (threadgroup) {
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1405 1405
1406 rcu_read_lock(); 1406 rcu_read_lock();
1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1408 ret = security_task_setscheduler(c, 0, NULL); 1408 ret = security_task_setscheduler(c);
1409 if (ret) { 1409 if (ret) {
1410 rcu_read_unlock(); 1410 rcu_read_unlock();
1411 return ret; 1411 return ret;
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2113 * but making no active use of cpusets. 2113 * but making no active use of cpusets.
2114 * 2114 *
2115 * This routine ensures that top_cpuset.cpus_allowed tracks 2115 * This routine ensures that top_cpuset.cpus_allowed tracks
2116 * cpu_online_map on each CPU hotplug (cpuhp) event. 2116 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2117 * 2117 *
2118 * Called within get_online_cpus(). Needs to call cgroup_lock() 2118 * Called within get_online_cpus(). Needs to call cgroup_lock()
2119 * before calling generate_sched_domains(). 2119 * before calling generate_sched_domains().
2120 */ 2120 */
2121static int cpuset_track_online_cpus(struct notifier_block *unused_nb, 2121void cpuset_update_active_cpus(void)
2122 unsigned long phase, void *unused_cpu)
2123{ 2122{
2124 struct sched_domain_attr *attr; 2123 struct sched_domain_attr *attr;
2125 cpumask_var_t *doms; 2124 cpumask_var_t *doms;
2126 int ndoms; 2125 int ndoms;
2127 2126
2128 switch (phase) {
2129 case CPU_ONLINE:
2130 case CPU_ONLINE_FROZEN:
2131 case CPU_DOWN_PREPARE:
2132 case CPU_DOWN_PREPARE_FROZEN:
2133 case CPU_DOWN_FAILED:
2134 case CPU_DOWN_FAILED_FROZEN:
2135 break;
2136
2137 default:
2138 return NOTIFY_DONE;
2139 }
2140
2141 cgroup_lock(); 2127 cgroup_lock();
2142 mutex_lock(&callback_mutex); 2128 mutex_lock(&callback_mutex);
2143 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2129 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2148 2134
2149 /* Have scheduler rebuild the domains */ 2135 /* Have scheduler rebuild the domains */
2150 partition_sched_domains(ndoms, doms, attr); 2136 partition_sched_domains(ndoms, doms, attr);
2151
2152 return NOTIFY_OK;
2153} 2137}
2154 2138
2155#ifdef CONFIG_MEMORY_HOTPLUG 2139#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
2203 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2187 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2204 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2188 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2205 2189
2206 hotcpu_notifier(cpuset_track_online_cpus, 0);
2207 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2190 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2208 2191
2209 cpuset_wq = create_singlethread_workqueue("cpuset"); 2192 cpuset_wq = create_singlethread_workqueue("cpuset");
diff --git a/kernel/cred.c b/kernel/cred.c
index 60bc8b1e32e6..6a1aa004e376 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -22,10 +22,6 @@
22#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
24#else 24#else
25static inline __attribute__((format(printf, 1, 2)))
26void no_printk(const char *fmt, ...)
27{
28}
29#define kdebug(FMT, ...) \ 25#define kdebug(FMT, ...) \
30 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 26 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
31#endif 27#endif
@@ -329,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
329 325
330/* 326/*
331 * Prepare credentials for current to perform an execve() 327 * Prepare credentials for current to perform an execve()
332 * - The caller must hold current->cred_guard_mutex 328 * - The caller must hold ->cred_guard_mutex
333 */ 329 */
334struct cred *prepare_exec_creds(void) 330struct cred *prepare_exec_creds(void)
335{ 331{
@@ -388,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
388 struct cred *new; 384 struct cred *new;
389 int ret; 385 int ret;
390 386
391 mutex_init(&p->cred_guard_mutex);
392
393 if ( 387 if (
394#ifdef CONFIG_KEYS 388#ifdef CONFIG_KEYS
395 !p->cred->thread_keyring && 389 !p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 8bc5eeffec8a..fec596da9bd0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2000-2001 VERITAS Software Corporation. 6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation 7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> 8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> 9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> 10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. 11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc. 12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -47,6 +47,7 @@
47#include <linux/pid.h> 47#include <linux/pid.h>
48#include <linux/smp.h> 48#include <linux/smp.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/rcupdate.h>
50 51
51#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
52#include <asm/byteorder.h> 53#include <asm/byteorder.h>
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
109 */ 110 */
110atomic_t kgdb_active = ATOMIC_INIT(-1); 111atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active); 112EXPORT_SYMBOL_GPL(kgdb_active);
113static DEFINE_RAW_SPINLOCK(dbg_master_lock);
114static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
112 115
113/* 116/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early 117 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet): 118 * bootup code (which might not have percpu set up yet):
116 */ 119 */
117static atomic_t passive_cpu_wait[NR_CPUS]; 120static atomic_t masters_in_kgdb;
118static atomic_t cpu_in_kgdb[NR_CPUS]; 121static atomic_t slaves_in_kgdb;
119static atomic_t kgdb_break_tasklet_var; 122static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint; 123atomic_t kgdb_setting_breakpoint;
121 124
@@ -457,26 +460,32 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
457 return 1; 460 return 1;
458} 461}
459 462
460static void dbg_cpu_switch(int cpu, int next_cpu) 463static void dbg_touch_watchdogs(void)
461{ 464{
462 /* Mark the cpu we are switching away from as a slave when it 465 touch_softlockup_watchdog_sync();
463 * holds the kgdb_active token. This must be done so that the 466 clocksource_touch_watchdog();
464 * that all the cpus wait in for the debug core will not enter 467 rcu_cpu_stall_reset();
465 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471} 468}
472 469
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) 470static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
471 int exception_state)
474{ 472{
475 unsigned long flags; 473 unsigned long flags;
476 int sstep_tries = 100; 474 int sstep_tries = 100;
477 int error; 475 int error;
478 int i, cpu; 476 int cpu;
479 int trace_on = 0; 477 int trace_on = 0;
478 int online_cpus = num_online_cpus();
479
480 kgdb_info[ks->cpu].enter_kgdb++;
481 kgdb_info[ks->cpu].exception_state |= exception_state;
482
483 if (exception_state == DCPU_WANT_MASTER)
484 atomic_inc(&masters_in_kgdb);
485 else
486 atomic_inc(&slaves_in_kgdb);
487 kgdb_disable_hw_debug(ks->linux_regs);
488
480acquirelock: 489acquirelock:
481 /* 490 /*
482 * Interrupts will be restored by the 'trap return' code, except when 491 * Interrupts will be restored by the 'trap return' code, except when
@@ -489,14 +498,15 @@ acquirelock:
489 kgdb_info[cpu].task = current; 498 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0; 499 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; 500 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497 501
498 if (exception_level == 1) 502 /* Make sure the above info reaches the primary CPU */
503 smp_mb();
504
505 if (exception_level == 1) {
506 if (raw_spin_trylock(&dbg_master_lock))
507 atomic_xchg(&kgdb_active, cpu);
499 goto cpu_master_loop; 508 goto cpu_master_loop;
509 }
500 510
501 /* 511 /*
502 * CPU will loop if it is a slave or request to become a kgdb 512 * CPU will loop if it is a slave or request to become a kgdb
@@ -508,10 +518,12 @@ cpu_loop:
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; 518 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop; 519 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { 520 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) 521 if (raw_spin_trylock(&dbg_master_lock)) {
522 atomic_xchg(&kgdb_active, cpu);
512 break; 523 break;
524 }
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { 525 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu])) 526 if (!raw_spin_is_locked(&dbg_slave_lock))
515 goto return_normal; 527 goto return_normal;
516 } else { 528 } else {
517return_normal: 529return_normal:
@@ -522,9 +534,12 @@ return_normal:
522 arch_kgdb_ops.correct_hw_break(); 534 arch_kgdb_ops.correct_hw_break();
523 if (trace_on) 535 if (trace_on)
524 tracing_on(); 536 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]); 537 kgdb_info[cpu].exception_state &=
526 touch_softlockup_watchdog_sync(); 538 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
527 clocksource_touch_watchdog(); 539 kgdb_info[cpu].enter_kgdb--;
540 smp_mb__before_atomic_dec();
541 atomic_dec(&slaves_in_kgdb);
542 dbg_touch_watchdogs();
528 local_irq_restore(flags); 543 local_irq_restore(flags);
529 return 0; 544 return 0;
530 } 545 }
@@ -541,8 +556,8 @@ return_normal:
541 (kgdb_info[cpu].task && 556 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 557 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1); 558 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync(); 559 raw_spin_unlock(&dbg_master_lock);
545 clocksource_touch_watchdog(); 560 dbg_touch_watchdogs();
546 local_irq_restore(flags); 561 local_irq_restore(flags);
547 562
548 goto acquirelock; 563 goto acquirelock;
@@ -563,16 +578,12 @@ return_normal:
563 if (dbg_io_ops->pre_exception) 578 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception(); 579 dbg_io_ops->pre_exception();
565 580
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /* 581 /*
569 * Get the passive CPU lock which will hold all the non-primary 582 * Get the passive CPU lock which will hold all the non-primary
570 * CPU in a spin state while the debugger is active 583 * CPU in a spin state while the debugger is active
571 */ 584 */
572 if (!kgdb_single_step) { 585 if (!kgdb_single_step)
573 for (i = 0; i < NR_CPUS; i++) 586 raw_spin_lock(&dbg_slave_lock);
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576 587
577#ifdef CONFIG_SMP 588#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */ 589 /* Signal the other CPUs to enter kgdb_wait() */
@@ -583,10 +594,9 @@ return_normal:
583 /* 594 /*
584 * Wait for the other CPUs to be notified and be waiting for us: 595 * Wait for the other CPUs to be notified and be waiting for us:
585 */ 596 */
586 for_each_online_cpu(i) { 597 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) 598 atomic_read(&slaves_in_kgdb)) != online_cpus)
588 cpu_relax(); 599 cpu_relax();
589 }
590 600
591 /* 601 /*
592 * At this point the primary processor is completely 602 * At this point the primary processor is completely
@@ -605,6 +615,8 @@ cpu_master_loop:
605 if (dbg_kdb_mode) { 615 if (dbg_kdb_mode) {
606 kgdb_connected = 1; 616 kgdb_connected = 1;
607 error = kdb_stub(ks); 617 error = kdb_stub(ks);
618 if (error == -1)
619 continue;
608 kgdb_connected = 0; 620 kgdb_connected = 0;
609 } else { 621 } else {
610 error = gdb_serial_stub(ks); 622 error = gdb_serial_stub(ks);
@@ -613,7 +625,8 @@ cpu_master_loop:
613 if (error == DBG_PASS_EVENT) { 625 if (error == DBG_PASS_EVENT) {
614 dbg_kdb_mode = !dbg_kdb_mode; 626 dbg_kdb_mode = !dbg_kdb_mode;
615 } else if (error == DBG_SWITCH_CPU_EVENT) { 627 } else if (error == DBG_SWITCH_CPU_EVENT) {
616 dbg_cpu_switch(cpu, dbg_switch_cpu); 628 kgdb_info[dbg_switch_cpu].exception_state |=
629 DCPU_NEXT_MASTER;
617 goto cpu_loop; 630 goto cpu_loop;
618 } else { 631 } else {
619 kgdb_info[cpu].ret_state = error; 632 kgdb_info[cpu].ret_state = error;
@@ -625,24 +638,11 @@ cpu_master_loop:
625 if (dbg_io_ops->post_exception) 638 if (dbg_io_ops->post_exception)
626 dbg_io_ops->post_exception(); 639 dbg_io_ops->post_exception();
627 640
628 atomic_dec(&cpu_in_kgdb[ks->cpu]);
629
630 if (!kgdb_single_step) { 641 if (!kgdb_single_step) {
631 for (i = NR_CPUS-1; i >= 0; i--) 642 raw_spin_unlock(&dbg_slave_lock);
632 atomic_dec(&passive_cpu_wait[i]); 643 /* Wait till all the CPUs have quit from the debugger. */
633 /* 644 while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
634 * Wait till all the CPUs have quit from the debugger, 645 cpu_relax();
635 * but allow a CPU that hit an exception and is
636 * waiting to become the master to remain in the debug
637 * core.
638 */
639 for_each_online_cpu(i) {
640 while (kgdb_do_roundup &&
641 atomic_read(&cpu_in_kgdb[i]) &&
642 !(kgdb_info[i].exception_state &
643 DCPU_WANT_MASTER))
644 cpu_relax();
645 }
646 } 646 }
647 647
648kgdb_restore: 648kgdb_restore:
@@ -653,12 +653,20 @@ kgdb_restore:
653 else 653 else
654 kgdb_sstep_pid = 0; 654 kgdb_sstep_pid = 0;
655 } 655 }
656 if (arch_kgdb_ops.correct_hw_break)
657 arch_kgdb_ops.correct_hw_break();
656 if (trace_on) 658 if (trace_on)
657 tracing_on(); 659 tracing_on();
660
661 kgdb_info[cpu].exception_state &=
662 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
663 kgdb_info[cpu].enter_kgdb--;
664 smp_mb__before_atomic_dec();
665 atomic_dec(&masters_in_kgdb);
658 /* Free kgdb_active */ 666 /* Free kgdb_active */
659 atomic_set(&kgdb_active, -1); 667 atomic_set(&kgdb_active, -1);
660 touch_softlockup_watchdog_sync(); 668 raw_spin_unlock(&dbg_master_lock);
661 clocksource_touch_watchdog(); 669 dbg_touch_watchdogs();
662 local_irq_restore(flags); 670 local_irq_restore(flags);
663 671
664 return kgdb_info[cpu].ret_state; 672 return kgdb_info[cpu].ret_state;
@@ -676,7 +684,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
676{ 684{
677 struct kgdb_state kgdb_var; 685 struct kgdb_state kgdb_var;
678 struct kgdb_state *ks = &kgdb_var; 686 struct kgdb_state *ks = &kgdb_var;
679 int ret;
680 687
681 ks->cpu = raw_smp_processor_id(); 688 ks->cpu = raw_smp_processor_id();
682 ks->ex_vector = evector; 689 ks->ex_vector = evector;
@@ -687,11 +694,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
687 694
688 if (kgdb_reenter_check(ks)) 695 if (kgdb_reenter_check(ks))
689 return 0; /* Ouch, double exception ! */ 696 return 0; /* Ouch, double exception ! */
690 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; 697 if (kgdb_info[ks->cpu].enter_kgdb != 0)
691 ret = kgdb_cpu_enter(ks, regs); 698 return 0;
692 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | 699
693 DCPU_IS_SLAVE); 700 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
694 return ret;
695} 701}
696 702
697int kgdb_nmicallback(int cpu, void *regs) 703int kgdb_nmicallback(int cpu, void *regs)
@@ -704,12 +710,9 @@ int kgdb_nmicallback(int cpu, void *regs)
704 ks->cpu = cpu; 710 ks->cpu = cpu;
705 ks->linux_regs = regs; 711 ks->linux_regs = regs;
706 712
707 if (!atomic_read(&cpu_in_kgdb[cpu]) && 713 if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
708 atomic_read(&kgdb_active) != -1 && 714 raw_spin_is_locked(&dbg_master_lock)) {
709 atomic_read(&kgdb_active) != cpu) { 715 kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
710 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
711 kgdb_cpu_enter(ks, regs);
712 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
713 return 0; 716 return 0;
714 } 717 }
715#endif 718#endif
@@ -739,7 +742,7 @@ static struct console kgdbcons = {
739}; 742};
740 743
741#ifdef CONFIG_MAGIC_SYSRQ 744#ifdef CONFIG_MAGIC_SYSRQ
742static void sysrq_handle_dbg(int key, struct tty_struct *tty) 745static void sysrq_handle_dbg(int key)
743{ 746{
744 if (!dbg_io_ops) { 747 if (!dbg_io_ops) {
745 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); 748 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index c5d753d80f67..3494c28a7e7a 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -40,6 +40,7 @@ struct debuggerinfo_struct {
40 int exception_state; 40 int exception_state;
41 int ret_state; 41 int ret_state;
42 int irq_depth; 42 int irq_depth;
43 int enter_kgdb;
43}; 44};
44 45
45extern struct debuggerinfo_struct kgdb_info[]; 46extern struct debuggerinfo_struct kgdb_info[];
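
Reading aid (not part of the patch): the debug_core hunks replace the cmpxchg-on-kgdb_active election and the per-CPU passive_cpu_wait counters with two raw spinlocks plus masters_in_kgdb/slaves_in_kgdb counts. A self-contained sketch of just the election step, built from the same primitives; the example_* names are hypothetical.

#include <linux/spinlock.h>
#include <linux/types.h>
#include <asm/atomic.h>

static DEFINE_RAW_SPINLOCK(example_master_lock);
static atomic_t example_active_cpu = ATOMIC_INIT(-1);

/* Exactly one CPU wins the trylock and becomes the debug master. */
static bool example_try_become_master(int cpu)
{
	if (!raw_spin_trylock(&example_master_lock))
		return false;
	atomic_xchg(&example_active_cpu, cpu);
	return true;
}

static void example_release_master(void)
{
	atomic_set(&example_active_cpu, -1);
	raw_spin_unlock(&example_master_lock);
}
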
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index e8fd6868682d..481a7bd2dfe7 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2000-2001 VERITAS Software Corporation. 6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation 7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> 8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> 9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> 10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. 11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc. 12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES +
52 * GDB remote protocol parser: 52 * GDB remote protocol parser:
53 */ 53 */
54 54
55static int hex(char ch)
56{
57 if ((ch >= 'a') && (ch <= 'f'))
58 return ch - 'a' + 10;
59 if ((ch >= '0') && (ch <= '9'))
60 return ch - '0';
61 if ((ch >= 'A') && (ch <= 'F'))
62 return ch - 'A' + 10;
63 return -1;
64}
65
66#ifdef CONFIG_KGDB_KDB 55#ifdef CONFIG_KGDB_KDB
67static int gdbstub_read_wait(void) 56static int gdbstub_read_wait(void)
68{ 57{
@@ -123,8 +112,8 @@ static void get_packet(char *buffer)
123 buffer[count] = 0; 112 buffer[count] = 0;
124 113
125 if (ch == '#') { 114 if (ch == '#') {
126 xmitcsum = hex(gdbstub_read_wait()) << 4; 115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
127 xmitcsum += hex(gdbstub_read_wait()); 116 xmitcsum += hex_to_bin(gdbstub_read_wait());
128 117
129 if (checksum != xmitcsum) 118 if (checksum != xmitcsum)
130 /* failed checksum */ 119 /* failed checksum */
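The two hunks above retire the stub's private hex() helper in favour of the kernel's generic hex_to_bin(). As a rough user-space sketch of the contract both rely on (a valid digit maps to 0..15, anything else to a negative value); the names below are illustrative only, not the kernel API:

/*
 * Illustrative sketch only, not kernel code: a hex-digit decoder with the
 * same contract as the helper the gdb stub now uses.
 */
#include <stdio.h>

static int hex_digit_to_bin(char ch)
{
	if (ch >= '0' && ch <= '9')
		return ch - '0';
	if (ch >= 'a' && ch <= 'f')
		return ch - 'a' + 10;
	if (ch >= 'A' && ch <= 'F')
		return ch - 'A' + 10;
	return -1;			/* not a hex digit */
}

int main(void)
{
	/* Decode a two-digit checksum the way get_packet() does. */
	int csum = (hex_digit_to_bin('a') << 4) + hex_digit_to_bin('5');

	printf("checksum = 0x%02x\n", csum);	/* prints 0xa5 */
	return 0;
}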
@@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len)
236 * buf. Return a pointer to the last char put in buf (null). May 225 * buf. Return a pointer to the last char put in buf (null). May
237 * return an error. 226 * return an error.
238 */ 227 */
239int kgdb_mem2hex(char *mem, char *buf, int count) 228char *kgdb_mem2hex(char *mem, char *buf, int count)
240{ 229{
241 char *tmp; 230 char *tmp;
242 int err; 231 int err;
@@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
248 tmp = buf + count; 237 tmp = buf + count;
249 238
250 err = probe_kernel_read(tmp, mem, count); 239 err = probe_kernel_read(tmp, mem, count);
251 if (!err) { 240 if (err)
252 while (count > 0) { 241 return NULL;
253 buf = pack_hex_byte(buf, *tmp); 242 while (count > 0) {
254 tmp++; 243 buf = pack_hex_byte(buf, *tmp);
255 count--; 244 tmp++;
256 } 245 count--;
257
258 *buf = 0;
259 } 246 }
247 *buf = 0;
260 248
261 return err; 249 return buf;
262} 250}
263 251
264/* 252/*
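kgdb_mem2hex() now hex-encodes in place on a buffer filled by probe_kernel_read() and returns a pointer to the terminating NUL (or NULL on a faulted read) instead of an error code. A small user-space approximation of that return convention, with made-up helper names and no fault handling since plain memory reads cannot fail here:

#include <stdio.h>

static const char hex_asc[] = "0123456789abcdef";

/* Append one byte as two lowercase hex digits; return the new end pointer. */
static char *pack_hex_byte_sketch(char *buf, unsigned char byte)
{
	*buf++ = hex_asc[byte >> 4];
	*buf++ = hex_asc[byte & 0x0f];
	return buf;
}

/*
 * Encode count bytes from mem into buf and return a pointer to the
 * terminating NUL, mirroring the new kgdb_mem2hex() return convention.
 */
static char *mem2hex_sketch(const unsigned char *mem, char *buf, int count)
{
	while (count-- > 0)
		buf = pack_hex_byte_sketch(buf, *mem++);
	*buf = '\0';
	return buf;
}

int main(void)
{
	char out[32];
	char *end = mem2hex_sketch((const unsigned char *)"OK", out, 2);

	printf("%s (len %zu)\n", out, (size_t)(end - out));	/* 4f4b (len 4) */
	return 0;
}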
@@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
280 tmp_hex = tmp_raw - 1; 268 tmp_hex = tmp_raw - 1;
281 while (tmp_hex >= buf) { 269 while (tmp_hex >= buf) {
282 tmp_raw--; 270 tmp_raw--;
283 *tmp_raw = hex(*tmp_hex--); 271 *tmp_raw = hex_to_bin(*tmp_hex--);
284 *tmp_raw |= hex(*tmp_hex--) << 4; 272 *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
285 } 273 }
286 274
287 return probe_kernel_write(mem, tmp_raw, count); 275 return probe_kernel_write(mem, tmp_raw, count);
@@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
304 (*ptr)++; 292 (*ptr)++;
305 } 293 }
306 while (**ptr) { 294 while (**ptr) {
307 hex_val = hex(**ptr); 295 hex_val = hex_to_bin(**ptr);
308 if (hex_val < 0) 296 if (hex_val < 0)
309 break; 297 break;
310 298
@@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
339 return probe_kernel_write(mem, c, size); 327 return probe_kernel_write(mem, c, size);
340} 328}
341 329
330#if DBG_MAX_REG_NUM > 0
331void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
332{
333 int i;
334 int idx = 0;
335 char *ptr = (char *)gdb_regs;
336
337 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
338 dbg_get_reg(i, ptr + idx, regs);
339 idx += dbg_reg_def[i].size;
340 }
341}
342
343void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
344{
345 int i;
346 int idx = 0;
347 char *ptr = (char *)gdb_regs;
348
349 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
350 dbg_set_reg(i, ptr + idx, regs);
351 idx += dbg_reg_def[i].size;
352 }
353}
354#endif /* DBG_MAX_REG_NUM > 0 */
355
342/* Write memory due to an 'M' or 'X' packet. */ 356/* Write memory due to an 'M' or 'X' packet. */
343static int write_mem_msg(int binary) 357static int write_mem_msg(int binary)
344{ 358{
@@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error)
378 * remapped to negative TIDs. 392 * remapped to negative TIDs.
379 */ 393 */
380 394
381#define BUF_THREAD_ID_SIZE 16 395#define BUF_THREAD_ID_SIZE 8
382 396
383static char *pack_threadid(char *pkt, unsigned char *id) 397static char *pack_threadid(char *pkt, unsigned char *id)
384{ 398{
385 char *limit; 399 unsigned char *limit;
400 int lzero = 1;
401
402 limit = id + (BUF_THREAD_ID_SIZE / 2);
403 while (id < limit) {
404 if (!lzero || *id != 0) {
405 pkt = pack_hex_byte(pkt, *id);
406 lzero = 0;
407 }
408 id++;
409 }
386 410
387 limit = pkt + BUF_THREAD_ID_SIZE; 411 if (lzero)
388 while (pkt < limit) 412 pkt = pack_hex_byte(pkt, 0);
389 pkt = pack_hex_byte(pkt, *id++);
390 413
391 return pkt; 414 return pkt;
392} 415}
393 416
394static void int_to_threadref(unsigned char *id, int value) 417static void int_to_threadref(unsigned char *id, int value)
395{ 418{
396 unsigned char *scan; 419 put_unaligned_be32(value, id);
397 int i = 4;
398
399 scan = (unsigned char *)id;
400 while (i--)
401 *scan++ = 0;
402 put_unaligned_be32(value, scan);
403} 420}
404 421
405static struct task_struct *getthread(struct pt_regs *regs, int tid) 422static struct task_struct *getthread(struct pt_regs *regs, int tid)
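The thread-ID packing above shrinks BUF_THREAD_ID_SIZE to 8 and suppresses leading zero bytes, while int_to_threadref() now simply stores the value big-endian. A standalone sketch of the same behaviour, with put_unaligned_be32() replaced by explicit shifts because this is plain user-space C:

#include <stdio.h>

#define BUF_THREAD_ID_SIZE 8

static const char hex_asc[] = "0123456789abcdef";

static char *pack_hex_byte_sketch(char *pkt, unsigned char byte)
{
	*pkt++ = hex_asc[byte >> 4];
	*pkt++ = hex_asc[byte & 0x0f];
	return pkt;
}

/* Store a 32-bit value big-endian, as the new int_to_threadref() does. */
static void int_to_threadref_sketch(unsigned char *id, int value)
{
	id[0] = (value >> 24) & 0xff;
	id[1] = (value >> 16) & 0xff;
	id[2] = (value >> 8) & 0xff;
	id[3] = value & 0xff;
}

/* Hex-encode the id, skipping leading zero bytes but never emitting "". */
static char *pack_threadid_sketch(char *pkt, const unsigned char *id)
{
	const unsigned char *limit = id + BUF_THREAD_ID_SIZE / 2;
	int lzero = 1;

	while (id < limit) {
		if (!lzero || *id != 0) {
			pkt = pack_hex_byte_sketch(pkt, *id);
			lzero = 0;
		}
		id++;
	}
	if (lzero)
		pkt = pack_hex_byte_sketch(pkt, 0);
	return pkt;
}

int main(void)
{
	unsigned char id[BUF_THREAD_ID_SIZE / 2];
	char out[BUF_THREAD_ID_SIZE + 1];
	char *end;

	int_to_threadref_sketch(id, 4242);	/* 0x1092 */
	end = pack_threadid_sketch(out, id);
	*end = '\0';
	printf("%s\n", out);	/* "1092" instead of "00001092" */
	return 0;
}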
@@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
463 pack_hex_byte(&remcom_out_buffer[1], ks->signo); 480 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
464} 481}
465 482
466/* Handle the 'g' get registers request */ 483static void gdb_get_regs_helper(struct kgdb_state *ks)
467static void gdb_cmd_getregs(struct kgdb_state *ks)
468{ 484{
469 struct task_struct *thread; 485 struct task_struct *thread;
470 void *local_debuggerinfo; 486 void *local_debuggerinfo;
@@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
505 */ 521 */
506 sleeping_thread_to_gdb_regs(gdb_regs, thread); 522 sleeping_thread_to_gdb_regs(gdb_regs, thread);
507 } 523 }
524}
525
526/* Handle the 'g' get registers request */
527static void gdb_cmd_getregs(struct kgdb_state *ks)
528{
529 gdb_get_regs_helper(ks);
508 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); 530 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
509} 531}
510 532
@@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks)
527 char *ptr = &remcom_in_buffer[1]; 549 char *ptr = &remcom_in_buffer[1];
528 unsigned long length; 550 unsigned long length;
529 unsigned long addr; 551 unsigned long addr;
530 int err; 552 char *err;
531 553
532 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && 554 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
533 kgdb_hex2long(&ptr, &length) > 0) { 555 kgdb_hex2long(&ptr, &length) > 0) {
534 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); 556 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
535 if (err) 557 if (!err)
536 error_packet(remcom_out_buffer, err); 558 error_packet(remcom_out_buffer, -EINVAL);
537 } else { 559 } else {
538 error_packet(remcom_out_buffer, -EINVAL); 560 error_packet(remcom_out_buffer, -EINVAL);
539 } 561 }
@@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks)
550 strcpy(remcom_out_buffer, "OK"); 572 strcpy(remcom_out_buffer, "OK");
551} 573}
552 574
575#if DBG_MAX_REG_NUM > 0
576static char *gdb_hex_reg_helper(int regnum, char *out)
577{
578 int i;
579 int offset = 0;
580
581 for (i = 0; i < regnum; i++)
582 offset += dbg_reg_def[i].size;
583 return kgdb_mem2hex((char *)gdb_regs + offset, out,
584 dbg_reg_def[i].size);
585}
586
587/* Handle the 'p' individual register get */
588static void gdb_cmd_reg_get(struct kgdb_state *ks)
589{
590 unsigned long regnum;
591 char *ptr = &remcom_in_buffer[1];
592
593 kgdb_hex2long(&ptr, &regnum);
594 if (regnum >= DBG_MAX_REG_NUM) {
595 error_packet(remcom_out_buffer, -EINVAL);
596 return;
597 }
598 gdb_get_regs_helper(ks);
599 gdb_hex_reg_helper(regnum, remcom_out_buffer);
600}
601
602/* Handle the 'P' individual register set */
603static void gdb_cmd_reg_set(struct kgdb_state *ks)
604{
605 unsigned long regnum;
606 char *ptr = &remcom_in_buffer[1];
607 int i = 0;
608
609 kgdb_hex2long(&ptr, &regnum);
610 if (*ptr++ != '=' ||
611 !(!kgdb_usethread || kgdb_usethread == current) ||
612 !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
613 error_packet(remcom_out_buffer, -EINVAL);
614 return;
615 }
616 memset(gdb_regs, 0, sizeof(gdb_regs));
617 while (i < sizeof(gdb_regs) * 2)
618 if (hex_to_bin(ptr[i]) >= 0)
619 i++;
620 else
621 break;
622 i = i / 2;
623 kgdb_hex2mem(ptr, (char *)gdb_regs, i);
624 dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
625 strcpy(remcom_out_buffer, "OK");
626}
627#endif /* DBG_MAX_REG_NUM > 0 */
628
553/* Handle the 'X' memory binary write bytes */ 629/* Handle the 'X' memory binary write bytes */
554static void gdb_cmd_binwrite(struct kgdb_state *ks) 630static void gdb_cmd_binwrite(struct kgdb_state *ks)
555{ 631{
@@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
612{ 688{
613 struct task_struct *g; 689 struct task_struct *g;
614 struct task_struct *p; 690 struct task_struct *p;
615 unsigned char thref[8]; 691 unsigned char thref[BUF_THREAD_ID_SIZE];
616 char *ptr; 692 char *ptr;
617 int i; 693 int i;
618 int cpu; 694 int cpu;
@@ -632,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
632 for_each_online_cpu(cpu) { 708 for_each_online_cpu(cpu) {
633 ks->thr_query = 0; 709 ks->thr_query = 0;
634 int_to_threadref(thref, -cpu - 2); 710 int_to_threadref(thref, -cpu - 2);
635 pack_threadid(ptr, thref); 711 ptr = pack_threadid(ptr, thref);
636 ptr += BUF_THREAD_ID_SIZE;
637 *(ptr++) = ','; 712 *(ptr++) = ',';
638 i++; 713 i++;
639 } 714 }
@@ -642,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
642 do_each_thread(g, p) { 717 do_each_thread(g, p) {
643 if (i >= ks->thr_query && !finished) { 718 if (i >= ks->thr_query && !finished) {
644 int_to_threadref(thref, p->pid); 719 int_to_threadref(thref, p->pid);
645 pack_threadid(ptr, thref); 720 ptr = pack_threadid(ptr, thref);
646 ptr += BUF_THREAD_ID_SIZE;
647 *(ptr++) = ','; 721 *(ptr++) = ',';
648 ks->thr_query++; 722 ks->thr_query++;
649 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) 723 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
@@ -858,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
858 int error = 0; 932 int error = 0;
859 int tmp; 933 int tmp;
860 934
861 /* Clear the out buffer. */ 935 /* Initialize comm buffer and globals. */
862 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); 936 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
937 kgdb_usethread = kgdb_info[ks->cpu].task;
938 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
939 ks->pass_exception = 0;
863 940
864 if (kgdb_connected) { 941 if (kgdb_connected) {
865 unsigned char thref[8]; 942 unsigned char thref[BUF_THREAD_ID_SIZE];
866 char *ptr; 943 char *ptr;
867 944
868 /* Reply to host that an exception has occurred */ 945 /* Reply to host that an exception has occurred */
@@ -876,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks)
876 put_packet(remcom_out_buffer); 953 put_packet(remcom_out_buffer);
877 } 954 }
878 955
879 kgdb_usethread = kgdb_info[ks->cpu].task;
880 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
881 ks->pass_exception = 0;
882
883 while (1) { 956 while (1) {
884 error = 0; 957 error = 0;
885 958
@@ -904,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
904 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ 977 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
905 gdb_cmd_memwrite(ks); 978 gdb_cmd_memwrite(ks);
906 break; 979 break;
980#if DBG_MAX_REG_NUM > 0
981 case 'p': /* pXX Return gdb register XX (in hex) */
982 gdb_cmd_reg_get(ks);
983 break;
984 case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
985 gdb_cmd_reg_set(ks);
986 break;
987#endif /* DBG_MAX_REG_NUM > 0 */
907 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ 988 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
908 gdb_cmd_binwrite(ks); 989 gdb_cmd_binwrite(ks);
909 break; 990 break;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 75bd9b3ebbb7..20059ef4459a 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv)
274 int i, bpno; 274 int i, bpno;
275 kdb_bp_t *bp, *bp_check; 275 kdb_bp_t *bp, *bp_check;
276 int diag; 276 int diag;
277 int free;
278 char *symname = NULL; 277 char *symname = NULL;
279 long offset = 0ul; 278 long offset = 0ul;
280 int nextarg; 279 int nextarg;
@@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv)
305 /* 304 /*
306 * Find an empty bp structure to allocate 305 * Find an empty bp structure to allocate
307 */ 306 */
308 free = KDB_MAXBPT;
309 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { 307 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
310 if (bp->bp_free) 308 if (bp->bp_free)
311 break; 309 break;
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index bf6e8270e957..dd0b1b7dd02c 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks)
86 } 86 }
87 /* Set initial kdb state variables */ 87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS); 88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu; 89 kdb_initial_cpu = atomic_read(&kgdb_active);
90 kdb_current_task = kgdb_info[ks->cpu].task; 90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; 91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */ 92 /* Remove any breakpoints as needed by kdb and clear single step */
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks)
105 ks->pass_exception = 1; 105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC); 106 KDB_FLAG_SET(CATASTROPHIC);
107 } 107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 108 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT); 109 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS); 110 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index c9b7f4f90bba..96fdaac46a80 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...)
823 823
824 return r; 824 return r;
825} 825}
826 826EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ebe4a287419e..d7bda21a106b 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value)
312 312
313 if (endp == arg) { 313 if (endp == arg) {
314 /* 314 /*
315 * Try base 16, for us folks too lazy to type the 315 * Also try base 16, for us folks too lazy to type the
316 * leading 0x... 316 * leading 0x...
317 */ 317 */
318 val = simple_strtoul(arg, &endp, 16); 318 val = simple_strtoul(arg, &endp, 16);
@@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value)
325 return 0; 325 return 0;
326} 326}
327 327
328int kdbgetu64arg(const char *arg, u64 *value)
329{
330 char *endp;
331 u64 val;
332
333 val = simple_strtoull(arg, &endp, 0);
334
335 if (endp == arg) {
336
337 val = simple_strtoull(arg, &endp, 16);
338 if (endp == arg)
339 return KDB_BADINT;
340 }
341
342 *value = val;
343
344 return 0;
345}
346
328/* 347/*
329 * kdb_set - This function implements the 'set' command. Alter an 348 * kdb_set - This function implements the 'set' command. Alter an
330 * existing environment variable or create a new one. 349 * existing environment variable or create a new one.
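The new kdbgetu64arg() mirrors kdbgetularg(): parse with the usual base-0 rules first, then retry in base 16 so a bare hex string without the 0x prefix is still accepted. A user-space equivalent using strtoull(), with a stand-in error code rather than the real KDB_BADINT:

#include <stdio.h>
#include <stdlib.h>

#define PARSE_BADINT (-15)	/* stand-in for KDB_BADINT */

/*
 * Try base 0 (0x.., 0.., decimal) first, then fall back to plain base 16
 * so "deadbeef" parses without its 0x prefix.
 */
static int parse_u64_arg(const char *arg, unsigned long long *value)
{
	char *endp;
	unsigned long long val;

	val = strtoull(arg, &endp, 0);
	if (endp == arg) {
		val = strtoull(arg, &endp, 16);
		if (endp == arg)
			return PARSE_BADINT;
	}
	*value = val;
	return 0;
}

int main(void)
{
	unsigned long long v;

	if (!parse_u64_arg("deadbeef", &v))
		printf("parsed 0x%llx\n", v);	/* parsed 0xdeadbeef */
	return 0;
}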
@@ -1730,13 +1749,13 @@ static int kdb_go(int argc, const char **argv)
1730 int nextarg; 1749 int nextarg;
1731 long offset; 1750 long offset;
1732 1751
1752 if (raw_smp_processor_id() != kdb_initial_cpu) {
1753 kdb_printf("go must execute on the entry cpu, "
1754 "please use \"cpu %d\" and then execute go\n",
1755 kdb_initial_cpu);
1756 return KDB_BADCPUNUM;
1757 }
1733 if (argc == 1) { 1758 if (argc == 1) {
1734 if (raw_smp_processor_id() != kdb_initial_cpu) {
1735 kdb_printf("go <address> must be issued from the "
1736 "initial cpu, do cpu %d first\n",
1737 kdb_initial_cpu);
1738 return KDB_ARGCOUNT;
1739 }
1740 nextarg = 1; 1759 nextarg = 1;
1741 diag = kdbgetaddrarg(argc, argv, &nextarg, 1760 diag = kdbgetaddrarg(argc, argv, &nextarg,
1742 &addr, &offset, NULL); 1761 &addr, &offset, NULL);
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv)
1770 */ 1789 */
1771static int kdb_rd(int argc, const char **argv) 1790static int kdb_rd(int argc, const char **argv)
1772{ 1791{
1773 int diag = kdb_check_regs(); 1792 int len = kdb_check_regs();
1774 if (diag) 1793#if DBG_MAX_REG_NUM > 0
1775 return diag; 1794 int i;
1795 char *rname;
1796 int rsize;
1797 u64 reg64;
1798 u32 reg32;
1799 u16 reg16;
1800 u8 reg8;
1801
1802 if (len)
1803 return len;
1804
1805 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1806 rsize = dbg_reg_def[i].size * 2;
1807 if (rsize > 16)
1808 rsize = 2;
1809 if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
1810 len = 0;
1811 kdb_printf("\n");
1812 }
1813 if (len)
1814 len += kdb_printf(" ");
1815 switch(dbg_reg_def[i].size * 8) {
1816 case 8:
1817 rname = dbg_get_reg(i, &reg8, kdb_current_regs);
1818 if (!rname)
1819 break;
1820 len += kdb_printf("%s: %02x", rname, reg8);
1821 break;
1822 case 16:
1823 rname = dbg_get_reg(i, &reg16, kdb_current_regs);
1824 if (!rname)
1825 break;
1826 len += kdb_printf("%s: %04x", rname, reg16);
1827 break;
1828 case 32:
1829 rname = dbg_get_reg(i, &reg32, kdb_current_regs);
1830 if (!rname)
1831 break;
1832 len += kdb_printf("%s: %08x", rname, reg32);
1833 break;
1834 case 64:
1835 rname = dbg_get_reg(i, &reg64, kdb_current_regs);
1836 if (!rname)
1837 break;
1838 len += kdb_printf("%s: %016llx", rname, reg64);
1839 break;
1840 default:
1841 len += kdb_printf("%s: ??", dbg_reg_def[i].name);
1842 }
1843 }
1844 kdb_printf("\n");
1845#else
1846 if (len)
1847 return len;
1776 1848
1777 kdb_dumpregs(kdb_current_regs); 1849 kdb_dumpregs(kdb_current_regs);
1850#endif
1778 return 0; 1851 return 0;
1779} 1852}
1780 1853
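The rewritten 'rd' command prints each register at its natural width by switching on dbg_reg_def[i].size. A reduced user-space version of that size dispatch is sketched below; the register names and values are fabricated for illustration:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical register store: name, width in bytes, raw value. */
struct reg {
	const char *name;
	int size;
	uint64_t value;
};

static const struct reg regs[] = {
	{ "al", 1, 0x7f }, { "cs", 2, 0x0010 },
	{ "eflags", 4, 0x246 }, { "rip", 8, 0xffffffff81000000ULL },
};

int main(void)
{
	size_t i;

	/* Print each register at its natural width, like kdb's 8/16/32/64 switch. */
	for (i = 0; i < sizeof(regs) / sizeof(regs[0]); i++) {
		switch (regs[i].size * 8) {
		case 8:
			printf("%s: %02x\n", regs[i].name,
			       (unsigned int)(regs[i].value & 0xff));
			break;
		case 16:
			printf("%s: %04x\n", regs[i].name,
			       (unsigned int)(regs[i].value & 0xffff));
			break;
		case 32:
			printf("%s: %08x\n", regs[i].name,
			       (unsigned int)(regs[i].value & 0xffffffffu));
			break;
		case 64:
			printf("%s: %016llx\n", regs[i].name,
			       (unsigned long long)regs[i].value);
			break;
		default:
			printf("%s: ??\n", regs[i].name);
		}
	}
	return 0;
}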
@@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv)
1782 * kdb_rm - This function implements the 'rm' (register modify) command. 1855 * kdb_rm - This function implements the 'rm' (register modify) command.
1783 * rm register-name new-contents 1856 * rm register-name new-contents
1784 * Remarks: 1857 * Remarks:
1785 * Currently doesn't allow modification of control or 1858 * Allows register modification with the same restrictions as gdb
1786 * debug registers.
1787 */ 1859 */
1788static int kdb_rm(int argc, const char **argv) 1860static int kdb_rm(int argc, const char **argv)
1789{ 1861{
1862#if DBG_MAX_REG_NUM > 0
1790 int diag; 1863 int diag;
1791 int ind = 0; 1864 const char *rname;
1792 unsigned long contents; 1865 int i;
1866 u64 reg64;
1867 u32 reg32;
1868 u16 reg16;
1869 u8 reg8;
1793 1870
1794 if (argc != 2) 1871 if (argc != 2)
1795 return KDB_ARGCOUNT; 1872 return KDB_ARGCOUNT;
1796 /* 1873 /*
1797 * Allow presence or absence of leading '%' symbol. 1874 * Allow presence or absence of leading '%' symbol.
1798 */ 1875 */
1799 if (argv[1][0] == '%') 1876 rname = argv[1];
1800 ind = 1; 1877 if (*rname == '%')
1878 rname++;
1801 1879
1802 diag = kdbgetularg(argv[2], &contents); 1880 diag = kdbgetu64arg(argv[2], &reg64);
1803 if (diag) 1881 if (diag)
1804 return diag; 1882 return diag;
1805 1883
1806 diag = kdb_check_regs(); 1884 diag = kdb_check_regs();
1807 if (diag) 1885 if (diag)
1808 return diag; 1886 return diag;
1887
1888 diag = KDB_BADREG;
1889 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1890 if (strcmp(rname, dbg_reg_def[i].name) == 0) {
1891 diag = 0;
1892 break;
1893 }
1894 }
1895 if (!diag) {
1896 switch(dbg_reg_def[i].size * 8) {
1897 case 8:
1898 reg8 = reg64;
1899 dbg_set_reg(i, &reg8, kdb_current_regs);
1900 break;
1901 case 16:
1902 reg16 = reg64;
1903 dbg_set_reg(i, &reg16, kdb_current_regs);
1904 break;
1905 case 32:
1906 reg32 = reg64;
1907 dbg_set_reg(i, &reg32, kdb_current_regs);
1908 break;
1909 case 64:
1910 dbg_set_reg(i, &reg64, kdb_current_regs);
1911 break;
1912 }
1913 }
1914 return diag;
1915#else
1809 kdb_printf("ERROR: Register set currently not implemented\n"); 1916 kdb_printf("ERROR: Register set currently not implemented\n");
1810 return 0; 1917 return 0;
1918#endif
1811} 1919}
1812 1920
1813#if defined(CONFIG_MAGIC_SYSRQ) 1921#if defined(CONFIG_MAGIC_SYSRQ)
@@ -1821,7 +1929,7 @@ static int kdb_sr(int argc, const char **argv)
1821 if (argc != 1) 1929 if (argc != 1)
1822 return KDB_ARGCOUNT; 1930 return KDB_ARGCOUNT;
1823 kdb_trap_printk++; 1931 kdb_trap_printk++;
1824 __handle_sysrq(*argv[1], NULL, 0); 1932 __handle_sysrq(*argv[1], false);
1825 kdb_trap_printk--; 1933 kdb_trap_printk--;
1826 1934
1827 return 0; 1935 return 0;
@@ -2440,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val)
2440 */ 2548 */
2441static int kdb_summary(int argc, const char **argv) 2549static int kdb_summary(int argc, const char **argv)
2442{ 2550{
2551 struct timespec now;
2443 struct kdb_tm tm; 2552 struct kdb_tm tm;
2444 struct sysinfo val; 2553 struct sysinfo val;
2445 2554
@@ -2454,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv)
2454 kdb_printf("domainname %s\n", init_uts_ns.name.domainname); 2563 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2455 kdb_printf("ccversion %s\n", __stringify(CCVERSION)); 2564 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2456 2565
2457 kdb_gmtime(&xtime, &tm); 2566 now = __current_kernel_time();
2567 kdb_gmtime(&now, &tm);
2458 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " 2568 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2459 "tz_minuteswest %d\n", 2569 "tz_minuteswest %d\n",
2460 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, 2570 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
@@ -2673,6 +2783,8 @@ int kdb_register_repeat(char *cmd,
2673 2783
2674 return 0; 2784 return 0;
2675} 2785}
2786EXPORT_SYMBOL_GPL(kdb_register_repeat);
2787
2676 2788
2677/* 2789/*
2678 * kdb_register - Compatibility register function for commands that do 2790 * kdb_register - Compatibility register function for commands that do
@@ -2695,6 +2807,7 @@ int kdb_register(char *cmd,
2695 return kdb_register_repeat(cmd, func, usage, help, minlen, 2807 return kdb_register_repeat(cmd, func, usage, help, minlen,
2696 KDB_REPEAT_NONE); 2808 KDB_REPEAT_NONE);
2697} 2809}
2810EXPORT_SYMBOL_GPL(kdb_register);
2698 2811
2699/* 2812/*
2700 * kdb_unregister - This function is used to unregister a kernel 2813 * kdb_unregister - This function is used to unregister a kernel
@@ -2713,7 +2826,7 @@ int kdb_unregister(char *cmd)
2713 /* 2826 /*
2714 * find the command. 2827 * find the command.
2715 */ 2828 */
2716 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { 2829 for_each_kdbcmd(kp, i) {
2717 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { 2830 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2718 kp->cmd_name = NULL; 2831 kp->cmd_name = NULL;
2719 return 0; 2832 return 0;
@@ -2723,6 +2836,7 @@ int kdb_unregister(char *cmd)
2723 /* Couldn't find it. */ 2836 /* Couldn't find it. */
2724 return 1; 2837 return 1;
2725} 2838}
2839EXPORT_SYMBOL_GPL(kdb_unregister);
2726 2840
2727/* Initialize the kdb command table. */ 2841/* Initialize the kdb command table. */
2728static void __init kdb_inittab(void) 2842static void __init kdb_inittab(void)
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 97d3ba69775d..35d69ed1dfb5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -15,29 +15,6 @@
15#include <linux/kgdb.h> 15#include <linux/kgdb.h>
16#include "../debug_core.h" 16#include "../debug_core.h"
17 17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */ 18/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
@@ -93,17 +70,6 @@
93 */ 70 */
94#define KDB_MAXBPT 16 71#define KDB_MAXBPT 16
95 72
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */ 73/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab { 74typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */ 75 unsigned long value; /* Address of symbol */
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len); 89extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124 90
125/* Exported Symbols for kernel loadable modules to use. */ 91/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t); 92extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t); 93extern int kdb_putarea_size(unsigned long, void *, size_t);
133 94
@@ -144,9 +105,8 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t); 105extern int kdb_putword(unsigned long, unsigned long, size_t);
145 106
146extern int kdbgetularg(const char *, unsigned long *); 107extern int kdbgetularg(const char *, unsigned long *);
147extern int kdb_set(int, const char **); 108extern int kdbgetu64arg(const char *, u64 *);
148extern char *kdbgetenv(const char *); 109extern char *kdbgetenv(const char *);
149extern int kdbgetintenv(const char *, int *);
150extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, 110extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
151 long *, char **); 111 long *, char **);
152extern int kdbgetsymval(const char *, kdb_symtab_t *); 112extern int kdbgetsymval(const char *, kdb_symtab_t *);
@@ -257,7 +217,6 @@ extern void kdb_ps1(const struct task_struct *p);
257extern void kdb_print_nameval(const char *name, unsigned long val); 217extern void kdb_print_nameval(const char *name, unsigned long val);
258extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
259extern void kdb_meminfo_proc_show(void); 219extern void kdb_meminfo_proc_show(void);
260extern const char *kdb_walk_kallsyms(loff_t *pos);
261extern char *kdb_getstr(char *, size_t, char *); 220extern char *kdb_getstr(char *, size_t, char *);
262 221
263/* Defines for kdb_symbol_print */ 222/* Defines for kdb_symbol_print */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 45344d5c53dd..6b2485dcb050 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -82,8 +82,8 @@ static char *kdb_name_table[100]; /* arbitrary size */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) 82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{ 83{
84 int ret = 0; 84 int ret = 0;
85 unsigned long symbolsize; 85 unsigned long symbolsize = 0;
86 unsigned long offset; 86 unsigned long offset = 0;
87#define knt1_size 128 /* must be >= kallsyms table size */ 87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL; 88 char *knt1 = NULL;
89 89
diff --git a/kernel/early_res.c b/kernel/early_res.c
deleted file mode 100644
index 7bfae887f211..000000000000
--- a/kernel/early_res.c
+++ /dev/null
@@ -1,590 +0,0 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10#include <linux/slab.h>
11#include <linux/kmemleak.h>
12
13/*
14 * Early reserved memory areas.
15 */
16/*
17 * need to make sure this one is big enough before
18 * find_fw_memmap_area could be used
19 */
20#define MAX_EARLY_RES_X 32
21
22struct early_res {
23 u64 start, end;
24 char name[15];
25 char overlap_ok;
26};
27static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
28
29static int max_early_res __initdata = MAX_EARLY_RES_X;
30static struct early_res *early_res __initdata = &early_res_x[0];
31static int early_res_count __initdata;
32
33static int __init find_overlapped_early(u64 start, u64 end)
34{
35 int i;
36 struct early_res *r;
37
38 for (i = 0; i < max_early_res && early_res[i].end; i++) {
39 r = &early_res[i];
40 if (end > r->start && start < r->end)
41 break;
42 }
43
44 return i;
45}
46
47/*
48 * Drop the i-th range from the early reservation map,
49 * by copying any higher ranges down one over it, and
50 * clearing what had been the last slot.
51 */
52static void __init drop_range(int i)
53{
54 int j;
55
56 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
57 ;
58
59 memmove(&early_res[i], &early_res[i + 1],
60 (j - 1 - i) * sizeof(struct early_res));
61
62 early_res[j - 1].end = 0;
63 early_res_count--;
64}
65
66static void __init drop_range_partial(int i, u64 start, u64 end)
67{
68 u64 common_start, common_end;
69 u64 old_start, old_end;
70
71 old_start = early_res[i].start;
72 old_end = early_res[i].end;
73 common_start = max(old_start, start);
74 common_end = min(old_end, end);
75
76 /* no overlap ? */
77 if (common_start >= common_end)
78 return;
79
80 if (old_start < common_start) {
81 /* make head segment */
82 early_res[i].end = common_start;
83 if (old_end > common_end) {
84 char name[15];
85
86 /*
87 * Save a local copy of the name, since the
88 * early_res array could get resized inside
89 * reserve_early_without_check() ->
90 * __check_and_double_early_res(), which would
91 * make the current name pointer invalid.
92 */
93 strncpy(name, early_res[i].name,
94 sizeof(early_res[i].name) - 1);
95 /* add another for left over on tail */
96 reserve_early_without_check(common_end, old_end, name);
97 }
98 return;
99 } else {
100 if (old_end > common_end) {
101 /* reuse the entry for tail left */
102 early_res[i].start = common_end;
103 return;
104 }
105 /* all covered */
106 drop_range(i);
107 }
108}
109
110/*
111 * Split any existing ranges that:
112 * 1) are marked 'overlap_ok', and
113 * 2) overlap with the stated range [start, end)
114 * into whatever portion (if any) of the existing range is entirely
115 * below or entirely above the stated range. Drop the portion
116 * of the existing range that overlaps with the stated range,
117 * which will allow the caller of this routine to then add that
118 * stated range without conflicting with any existing range.
119 */
120static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
121{
122 int i;
123 struct early_res *r;
124 u64 lower_start, lower_end;
125 u64 upper_start, upper_end;
126 char name[15];
127
128 for (i = 0; i < max_early_res && early_res[i].end; i++) {
129 r = &early_res[i];
130
131 /* Continue past non-overlapping ranges */
132 if (end <= r->start || start >= r->end)
133 continue;
134
135 /*
136 * Leave non-ok overlaps as is; let caller
137 * panic "Overlapping early reservations"
138 * when it hits this overlap.
139 */
140 if (!r->overlap_ok)
141 return;
142
143 /*
144 * We have an ok overlap. We will drop it from the early
145 * reservation map, and add back in any non-overlapping
146 * portions (lower or upper) as separate, overlap_ok,
147 * non-overlapping ranges.
148 */
149
150 /* 1. Note any non-overlapping (lower or upper) ranges. */
151 strncpy(name, r->name, sizeof(name) - 1);
152
153 lower_start = lower_end = 0;
154 upper_start = upper_end = 0;
155 if (r->start < start) {
156 lower_start = r->start;
157 lower_end = start;
158 }
159 if (r->end > end) {
160 upper_start = end;
161 upper_end = r->end;
162 }
163
164 /* 2. Drop the original ok overlapping range */
165 drop_range(i);
166
167 i--; /* resume for-loop on copied down entry */
168
169 /* 3. Add back in any non-overlapping ranges. */
170 if (lower_end)
171 reserve_early_overlap_ok(lower_start, lower_end, name);
172 if (upper_end)
173 reserve_early_overlap_ok(upper_start, upper_end, name);
174 }
175}
176
177static void __init __reserve_early(u64 start, u64 end, char *name,
178 int overlap_ok)
179{
180 int i;
181 struct early_res *r;
182
183 i = find_overlapped_early(start, end);
184 if (i >= max_early_res)
185 panic("Too many early reservations");
186 r = &early_res[i];
187 if (r->end)
188 panic("Overlapping early reservations "
189 "%llx-%llx %s to %llx-%llx %s\n",
190 start, end - 1, name ? name : "", r->start,
191 r->end - 1, r->name);
192 r->start = start;
193 r->end = end;
194 r->overlap_ok = overlap_ok;
195 if (name)
196 strncpy(r->name, name, sizeof(r->name) - 1);
197 early_res_count++;
198}
199
200/*
201 * A few early reservations come here.
202 *
203 * The 'overlap_ok' in the name of this routine does -not- mean it
204 * is ok for these reservations to overlap an earlier reservation.
205 * Rather it means that it is ok for subsequent reservations to
206 * overlap this one.
207 *
208 * Use this entry point to reserve early ranges when you are doing
209 * so out of "Paranoia", reserving perhaps more memory than you need,
210 * just in case, and don't mind a subsequent overlapping reservation
211 * that is known to be needed.
212 *
213 * The drop_overlaps_that_are_ok() call here isn't really needed.
214 * It would be needed if we had two colliding 'overlap_ok'
215 * reservations, so that the second such would not panic on the
216 * overlap with the first. We don't have any such as of this
217 * writing, but might as well tolerate such if it happens in
218 * the future.
219 */
220void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
221{
222 drop_overlaps_that_are_ok(start, end);
223 __reserve_early(start, end, name, 1);
224}
225
226static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
227{
228 u64 start, end, size, mem;
229 struct early_res *new;
230
231 /* do we have enough slots left ? */
232 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
233 return;
234
235 /* double it */
236 mem = -1ULL;
237 size = sizeof(struct early_res) * max_early_res * 2;
238 if (early_res == early_res_x)
239 start = 0;
240 else
241 start = early_res[0].end;
242 end = ex_start;
243 if (start + size < end)
244 mem = find_fw_memmap_area(start, end, size,
245 sizeof(struct early_res));
246 if (mem == -1ULL) {
247 start = ex_end;
248 end = get_max_mapped();
249 if (start + size < end)
250 mem = find_fw_memmap_area(start, end, size,
251 sizeof(struct early_res));
252 }
253 if (mem == -1ULL)
254 panic("can not find more space for early_res array");
255
256 new = __va(mem);
257 /* save the first one for own */
258 new[0].start = mem;
259 new[0].end = mem + size;
260 new[0].overlap_ok = 0;
261 /* copy old to new */
262 if (early_res == early_res_x) {
263 memcpy(&new[1], &early_res[0],
264 sizeof(struct early_res) * max_early_res);
265 memset(&new[max_early_res+1], 0,
266 sizeof(struct early_res) * (max_early_res - 1));
267 early_res_count++;
268 } else {
269 memcpy(&new[1], &early_res[1],
270 sizeof(struct early_res) * (max_early_res - 1));
271 memset(&new[max_early_res], 0,
272 sizeof(struct early_res) * max_early_res);
273 }
274 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
275 early_res = new;
276 max_early_res *= 2;
277 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
278 max_early_res, mem, mem + size - 1);
279}
280
281/*
282 * Most early reservations come here.
283 *
284 * We first have drop_overlaps_that_are_ok() drop any pre-existing
285 * 'overlap_ok' ranges, so that we can then reserve this memory
286 * range without risk of panic'ing on an overlapping overlap_ok
287 * early reservation.
288 */
289void __init reserve_early(u64 start, u64 end, char *name)
290{
291 if (start >= end)
292 return;
293
294 __check_and_double_early_res(start, end);
295
296 drop_overlaps_that_are_ok(start, end);
297 __reserve_early(start, end, name, 0);
298}
299
300void __init reserve_early_without_check(u64 start, u64 end, char *name)
301{
302 struct early_res *r;
303
304 if (start >= end)
305 return;
306
307 __check_and_double_early_res(start, end);
308
309 r = &early_res[early_res_count];
310
311 r->start = start;
312 r->end = end;
313 r->overlap_ok = 0;
314 if (name)
315 strncpy(r->name, name, sizeof(r->name) - 1);
316 early_res_count++;
317}
318
319void __init free_early(u64 start, u64 end)
320{
321 struct early_res *r;
322 int i;
323
324 kmemleak_free_part(__va(start), end - start);
325
326 i = find_overlapped_early(start, end);
327 r = &early_res[i];
328 if (i >= max_early_res || r->end != end || r->start != start)
329 panic("free_early on not reserved area: %llx-%llx!",
330 start, end - 1);
331
332 drop_range(i);
333}
334
335void __init free_early_partial(u64 start, u64 end)
336{
337 struct early_res *r;
338 int i;
339
340 kmemleak_free_part(__va(start), end - start);
341
342 if (start == end)
343 return;
344
345 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
346 return;
347
348try_next:
349 i = find_overlapped_early(start, end);
350 if (i >= max_early_res)
351 return;
352
353 r = &early_res[i];
354 /* hole ? */
355 if (r->end >= end && r->start <= start) {
356 drop_range_partial(i, start, end);
357 return;
358 }
359
360 drop_range_partial(i, start, end);
361 goto try_next;
362}
363
364#ifdef CONFIG_NO_BOOTMEM
365static void __init subtract_early_res(struct range *range, int az)
366{
367 int i, count;
368 u64 final_start, final_end;
369 int idx = 0;
370
371 count = 0;
372 for (i = 0; i < max_early_res && early_res[i].end; i++)
373 count++;
374
375 /* need to skip first one ?*/
376 if (early_res != early_res_x)
377 idx = 1;
378
379#define DEBUG_PRINT_EARLY_RES 1
380
381#if DEBUG_PRINT_EARLY_RES
382 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
383#endif
384 for (i = idx; i < count; i++) {
385 struct early_res *r = &early_res[i];
386#if DEBUG_PRINT_EARLY_RES
387 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
388 r->start, r->end, r->name);
389#endif
390 final_start = PFN_DOWN(r->start);
391 final_end = PFN_UP(r->end);
392 if (final_start >= final_end)
393 continue;
394 subtract_range(range, az, final_start, final_end);
395 }
396
397}
398
399int __init get_free_all_memory_range(struct range **rangep, int nodeid)
400{
401 int i, count;
402 u64 start = 0, end;
403 u64 size;
404 u64 mem;
405 struct range *range;
406 int nr_range;
407
408 count = 0;
409 for (i = 0; i < max_early_res && early_res[i].end; i++)
410 count++;
411
412 count *= 2;
413
414 size = sizeof(struct range) * count;
415 end = get_max_mapped();
416#ifdef MAX_DMA32_PFN
417 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
418 start = MAX_DMA32_PFN << PAGE_SHIFT;
419#endif
420 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
421 if (mem == -1ULL)
422 panic("can not find more space for range free");
423
424 range = __va(mem);
425 /* use early_node_map[] and early_res to get range array at first */
426 memset(range, 0, size);
427 nr_range = 0;
428
429 /* need to go over early_node_map to find out good range for node */
430 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
431#ifdef CONFIG_X86_32
432 subtract_range(range, count, max_low_pfn, -1ULL);
433#endif
434 subtract_early_res(range, count);
435 nr_range = clean_sort_range(range, count);
436
437 /* need to clear it ? */
438 if (nodeid == MAX_NUMNODES) {
439 memset(&early_res[0], 0,
440 sizeof(struct early_res) * max_early_res);
441 early_res = NULL;
442 max_early_res = 0;
443 }
444
445 *rangep = range;
446 return nr_range;
447}
448#else
449void __init early_res_to_bootmem(u64 start, u64 end)
450{
451 int i, count;
452 u64 final_start, final_end;
453 int idx = 0;
454
455 count = 0;
456 for (i = 0; i < max_early_res && early_res[i].end; i++)
457 count++;
458
459 /* need to skip first one ?*/
460 if (early_res != early_res_x)
461 idx = 1;
462
463 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
464 count - idx, max_early_res, start, end);
465 for (i = idx; i < count; i++) {
466 struct early_res *r = &early_res[i];
467 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
468 r->start, r->end, r->name);
469 final_start = max(start, r->start);
470 final_end = min(end, r->end);
471 if (final_start >= final_end) {
472 printk(KERN_CONT "\n");
473 continue;
474 }
475 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
476 final_start, final_end);
477 reserve_bootmem_generic(final_start, final_end - final_start,
478 BOOTMEM_DEFAULT);
479 }
480 /* clear them */
481 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
482 early_res = NULL;
483 max_early_res = 0;
484 early_res_count = 0;
485}
486#endif
487
488/* Check for already reserved areas */
489static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
490{
491 int i;
492 u64 addr = *addrp;
493 int changed = 0;
494 struct early_res *r;
495again:
496 i = find_overlapped_early(addr, addr + size);
497 r = &early_res[i];
498 if (i < max_early_res && r->end) {
499 *addrp = addr = round_up(r->end, align);
500 changed = 1;
501 goto again;
502 }
503 return changed;
504}
505
506/* Check for already reserved areas */
507static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
508{
509 int i;
510 u64 addr = *addrp, last;
511 u64 size = *sizep;
512 int changed = 0;
513again:
514 last = addr + size;
515 for (i = 0; i < max_early_res && early_res[i].end; i++) {
516 struct early_res *r = &early_res[i];
517 if (last > r->start && addr < r->start) {
518 size = r->start - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last > r->end && addr < r->end) {
523 addr = round_up(r->end, align);
524 size = last - addr;
525 changed = 1;
526 goto again;
527 }
528 if (last <= r->end && addr >= r->start) {
529 (*sizep)++;
530 return 0;
531 }
532 }
533 if (changed) {
534 *addrp = addr;
535 *sizep = size;
536 }
537 return changed;
538}
539
540/*
541 * Find a free area with specified alignment in a specific range.
542 * only the area between start and end is an active range from early_node_map,
543 * so they are good as RAM
544 */
545u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
546 u64 size, u64 align)
547{
548 u64 addr, last;
549
550 addr = round_up(ei_start, align);
551 if (addr < start)
552 addr = round_up(start, align);
553 if (addr >= ei_last)
554 goto out;
555 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
556 ;
557 last = addr + size;
558 if (last > ei_last)
559 goto out;
560 if (last > end)
561 goto out;
562
563 return addr;
564
565out:
566 return -1ULL;
567}
568
569u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
570 u64 *sizep, u64 align)
571{
572 u64 addr, last;
573
574 addr = round_up(ei_start, align);
575 if (addr < start)
576 addr = round_up(start, align);
577 if (addr >= ei_last)
578 goto out;
579 *sizep = ei_last - addr;
580 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
581 ;
582 last = addr + *sizep;
583 if (last > ei_last)
584 goto out;
585
586 return addr;
587
588out:
589 return -1ULL;
590}
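Much of the deleted early_res allocator revolves around one half-open range overlap test, end > r->start && start < r->end, used by find_overlapped_early() and the drop/split helpers above. A tiny standalone demonstration of that predicate, with a reduced stand-in for struct early_res:

#include <stdio.h>

/* Reduced form of struct early_res: a half-open [start, end) byte range. */
struct res_range {
	unsigned long long start, end;
};

/*
 * Two half-open ranges overlap exactly when each one starts before the
 * other ends; ranges that merely touch at a boundary do not overlap.
 */
static int ranges_overlap(struct res_range a, struct res_range b)
{
	return a.end > b.start && a.start < b.end;
}

int main(void)
{
	struct res_range reserved = { 0x1000, 0x2000 };
	struct res_range probe1 = { 0x1800, 0x3000 };	/* overlaps */
	struct res_range probe2 = { 0x2000, 0x3000 };	/* only touches */

	printf("%d %d\n", ranges_overlap(reserved, probe1),
	       ranges_overlap(reserved, probe2));	/* prints "1 0" */
	return 0;
}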
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index dd62f8e714ca..0dbeae374225 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -134,23 +134,14 @@ unregister:
134 return 0; 134 return 0;
135} 135}
136 136
137int 137int __set_personality(unsigned int personality)
138__set_personality(unsigned int personality)
139{ 138{
140 struct exec_domain *ep, *oep; 139 struct exec_domain *oep = current_thread_info()->exec_domain;
141
142 ep = lookup_exec_domain(personality);
143 if (ep == current_thread_info()->exec_domain) {
144 current->personality = personality;
145 module_put(ep->module);
146 return 0;
147 }
148 140
141 current_thread_info()->exec_domain = lookup_exec_domain(personality);
149 current->personality = personality; 142 current->personality = personality;
150 oep = current_thread_info()->exec_domain;
151 current_thread_info()->exec_domain = ep;
152
153 module_put(oep->module); 143 module_put(oep->module);
144
154 return 0; 145 return 0;
155} 146}
156 147
@@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
192{ 183{
193 unsigned int old = current->personality; 184 unsigned int old = current->personality;
194 185
195 if (personality != 0xffffffff) { 186 if (personality != 0xffffffff)
196 set_personality(personality); 187 set_personality(personality);
197 if (current->personality != personality)
198 return -EINVAL;
199 }
200 188
201 return old; 189 return old;
202} 190}
diff --git a/kernel/exit.c b/kernel/exit.c
index ceffc67b564a..b194febf5799 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -149,9 +150,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
149{ 150{
150 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 151 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
151 152
152#ifdef CONFIG_PERF_EVENTS 153 perf_event_delayed_put(tsk);
153 WARN_ON_ONCE(tsk->perf_event_ctxp);
154#endif
155 trace_sched_process_free(tsk); 154 trace_sched_process_free(tsk);
156 put_task_struct(tsk); 155 put_task_struct(tsk);
157} 156}
@@ -689,6 +688,8 @@ static void exit_mm(struct task_struct * tsk)
689 enter_lazy_tlb(mm, current); 688 enter_lazy_tlb(mm, current);
690 /* We don't want this task to be frozen prematurely */ 689 /* We don't want this task to be frozen prematurely */
691 clear_freeze_flag(tsk); 690 clear_freeze_flag(tsk);
691 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
692 atomic_dec(&mm->oom_disable_count);
692 task_unlock(tsk); 693 task_unlock(tsk);
693 mm_update_next_owner(mm); 694 mm_update_next_owner(mm);
694 mmput(mm); 695 mmput(mm);
@@ -702,6 +703,8 @@ static void exit_mm(struct task_struct * tsk)
702 * space. 703 * space.
703 */ 704 */
704static struct task_struct *find_new_reaper(struct task_struct *father) 705static struct task_struct *find_new_reaper(struct task_struct *father)
706 __releases(&tasklist_lock)
707 __acquires(&tasklist_lock)
705{ 708{
706 struct pid_namespace *pid_ns = task_active_pid_ns(father); 709 struct pid_namespace *pid_ns = task_active_pid_ns(father);
707 struct task_struct *thread; 710 struct task_struct *thread;
@@ -771,9 +774,12 @@ static void forget_original_parent(struct task_struct *father)
771 struct task_struct *p, *n, *reaper; 774 struct task_struct *p, *n, *reaper;
772 LIST_HEAD(dead_children); 775 LIST_HEAD(dead_children);
773 776
774 exit_ptrace(father);
775
776 write_lock_irq(&tasklist_lock); 777 write_lock_irq(&tasklist_lock);
778 /*
779 * Note that exit_ptrace() and find_new_reaper() might
780 * drop tasklist_lock and reacquire it.
781 */
782 exit_ptrace(father);
777 reaper = find_new_reaper(father); 783 reaper = find_new_reaper(father);
778 784
779 list_for_each_entry_safe(p, n, &father->children, sibling) { 785 list_for_each_entry_safe(p, n, &father->children, sibling) {
@@ -1383,8 +1389,7 @@ static int wait_task_stopped(struct wait_opts *wo,
1383 if (!unlikely(wo->wo_flags & WNOWAIT)) 1389 if (!unlikely(wo->wo_flags & WNOWAIT))
1384 *p_code = 0; 1390 *p_code = 0;
1385 1391
1386 /* don't need the RCU readlock here as we're holding a spinlock */ 1392 uid = task_uid(p);
1387 uid = __task_cred(p)->uid;
1388unlock_sig: 1393unlock_sig:
1389 spin_unlock_irq(&p->sighand->siglock); 1394 spin_unlock_irq(&p->sighand->siglock);
1390 if (!exit_code) 1395 if (!exit_code)
@@ -1457,7 +1462,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1457 } 1462 }
1458 if (!unlikely(wo->wo_flags & WNOWAIT)) 1463 if (!unlikely(wo->wo_flags & WNOWAIT))
1459 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1464 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1460 uid = __task_cred(p)->uid; 1465 uid = task_uid(p);
1461 spin_unlock_irq(&p->sighand->siglock); 1466 spin_unlock_irq(&p->sighand->siglock);
1462 1467
1463 pid = task_pid_vnr(p); 1468 pid = task_pid_vnr(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14ba047..3b159c5991b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -65,6 +65,7 @@
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h>
68 69
69#include <asm/pgtable.h> 70#include <asm/pgtable.h>
70#include <asm/pgalloc.h> 71#include <asm/pgalloc.h>
@@ -300,7 +301,7 @@ out:
300#ifdef CONFIG_MMU 301#ifdef CONFIG_MMU
301static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 302static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
302{ 303{
303 struct vm_area_struct *mpnt, *tmp, **pprev; 304 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
304 struct rb_node **rb_link, *rb_parent; 305 struct rb_node **rb_link, *rb_parent;
305 int retval; 306 int retval;
306 unsigned long charge; 307 unsigned long charge;
@@ -328,6 +329,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (retval) 329 if (retval)
329 goto out; 330 goto out;
330 331
332 prev = NULL;
331 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 333 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
332 struct file *file; 334 struct file *file;
333 335
@@ -355,11 +357,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
355 if (IS_ERR(pol)) 357 if (IS_ERR(pol))
356 goto fail_nomem_policy; 358 goto fail_nomem_policy;
357 vma_set_policy(tmp, pol); 359 vma_set_policy(tmp, pol);
360 tmp->vm_mm = mm;
358 if (anon_vma_fork(tmp, mpnt)) 361 if (anon_vma_fork(tmp, mpnt))
359 goto fail_nomem_anon_vma_fork; 362 goto fail_nomem_anon_vma_fork;
360 tmp->vm_flags &= ~VM_LOCKED; 363 tmp->vm_flags &= ~VM_LOCKED;
361 tmp->vm_mm = mm; 364 tmp->vm_next = tmp->vm_prev = NULL;
362 tmp->vm_next = NULL;
363 file = tmp->vm_file; 365 file = tmp->vm_file;
364 if (file) { 366 if (file) {
365 struct inode *inode = file->f_path.dentry->d_inode; 367 struct inode *inode = file->f_path.dentry->d_inode;
@@ -392,6 +394,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
392 */ 394 */
393 *pprev = tmp; 395 *pprev = tmp;
394 pprev = &tmp->vm_next; 396 pprev = &tmp->vm_next;
397 tmp->vm_prev = prev;
398 prev = tmp;
395 399
396 __vma_link_rb(mm, tmp, rb_link, rb_parent); 400 __vma_link_rb(mm, tmp, rb_link, rb_parent);
397 rb_link = &tmp->vm_rb.rb_right; 401 rb_link = &tmp->vm_rb.rb_right;
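The dup_mmap() hunks above thread a vm_prev pointer through the copied VMA list by carrying a prev variable across loop iterations. The same pattern on a toy linked list, shown here as a hedged sketch rather than kernel code:

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a VMA: only the list linkage matters here. */
struct node {
	int id;
	struct node *next;
	struct node *prev;
};

/*
 * Copy a singly linked source list while threading both next and prev
 * pointers in the copy, using the "prev = NULL; ... tmp->prev = prev;
 * prev = tmp;" pattern dup_mmap() now uses to keep vm_prev consistent.
 */
static struct node *copy_list(const struct node *src)
{
	struct node *head = NULL, **pprev = &head, *prev = NULL;

	for (; src; src = src->next) {
		struct node *tmp = malloc(sizeof(*tmp));

		if (!tmp)
			exit(1);
		tmp->id = src->id;
		tmp->next = NULL;
		*pprev = tmp;		/* link into the forward chain */
		pprev = &tmp->next;
		tmp->prev = prev;	/* back-link to the previous copy */
		prev = tmp;
	}
	return head;
}

int main(void)
{
	struct node c = { 3, NULL, NULL }, b = { 2, &c, NULL }, a = { 1, &b, NULL };
	struct node *copy = copy_list(&a);

	/* Walk forward and report each node's prev id (0 means none). */
	for (; copy; copy = copy->next)
		printf("node %d prev %d\n", copy->id,
		       copy->prev ? copy->prev->id : 0);
	return 0;	/* copies intentionally not freed in this sketch */
}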
@@ -485,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
485 mm->cached_hole_size = ~0UL; 489 mm->cached_hole_size = ~0UL;
486 mm_init_aio(mm); 490 mm_init_aio(mm);
487 mm_init_owner(mm, p); 491 mm_init_owner(mm, p);
492 atomic_set(&mm->oom_disable_count, 0);
488 493
489 if (likely(!mm_alloc_pgd(mm))) { 494 if (likely(!mm_alloc_pgd(mm))) {
490 mm->def_flags = 0; 495 mm->def_flags = 0;
@@ -738,6 +743,8 @@ good_mm:
738 /* Initializing for Swap token stuff */ 743 /* Initializing for Swap token stuff */
739 mm->token_priority = 0; 744 mm->token_priority = 0;
740 mm->last_interval = 0; 745 mm->last_interval = 0;
746 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
747 atomic_inc(&mm->oom_disable_count);
741 748
742 tsk->mm = mm; 749 tsk->mm = mm;
743 tsk->active_mm = mm; 750 tsk->active_mm = mm;
@@ -752,13 +759,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
752 struct fs_struct *fs = current->fs; 759 struct fs_struct *fs = current->fs;
753 if (clone_flags & CLONE_FS) { 760 if (clone_flags & CLONE_FS) {
754 /* tsk->fs is already what we want */ 761 /* tsk->fs is already what we want */
755 write_lock(&fs->lock); 762 spin_lock(&fs->lock);
756 if (fs->in_exec) { 763 if (fs->in_exec) {
757 write_unlock(&fs->lock); 764 spin_unlock(&fs->lock);
758 return -EAGAIN; 765 return -EAGAIN;
759 } 766 }
760 fs->users++; 767 fs->users++;
761 write_unlock(&fs->lock); 768 spin_unlock(&fs->lock);
762 return 0; 769 return 0;
763 } 770 }
764 tsk->fs = copy_fs_struct(fs); 771 tsk->fs = copy_fs_struct(fs);
@@ -899,6 +906,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
899 tty_audit_fork(sig); 906 tty_audit_fork(sig);
900 907
901 sig->oom_adj = current->signal->oom_adj; 908 sig->oom_adj = current->signal->oom_adj;
909 sig->oom_score_adj = current->signal->oom_score_adj;
910
911 mutex_init(&sig->cred_guard_mutex);
902 912
903 return 0; 913 return 0;
904} 914}
@@ -907,7 +917,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
907{ 917{
908 unsigned long new_flags = p->flags; 918 unsigned long new_flags = p->flags;
909 919
910 new_flags &= ~PF_SUPERPRIV; 920 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
911 new_flags |= PF_FORKNOEXEC; 921 new_flags |= PF_FORKNOEXEC;
912 new_flags |= PF_STARTING; 922 new_flags |= PF_STARTING;
913 p->flags = new_flags; 923 p->flags = new_flags;
@@ -1295,8 +1305,13 @@ bad_fork_cleanup_io:
1295bad_fork_cleanup_namespaces: 1305bad_fork_cleanup_namespaces:
1296 exit_task_namespaces(p); 1306 exit_task_namespaces(p);
1297bad_fork_cleanup_mm: 1307bad_fork_cleanup_mm:
1298 if (p->mm) 1308 if (p->mm) {
1309 task_lock(p);
1310 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1311 atomic_dec(&p->mm->oom_disable_count);
1312 task_unlock(p);
1299 mmput(p->mm); 1313 mmput(p->mm);
1314 }
1300bad_fork_cleanup_signal: 1315bad_fork_cleanup_signal:
1301 if (!(clone_flags & CLONE_THREAD)) 1316 if (!(clone_flags & CLONE_THREAD))
1302 free_signal_struct(p->signal); 1317 free_signal_struct(p->signal);
@@ -1675,13 +1690,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1675 1690
1676 if (new_fs) { 1691 if (new_fs) {
1677 fs = current->fs; 1692 fs = current->fs;
1678 write_lock(&fs->lock); 1693 spin_lock(&fs->lock);
1679 current->fs = new_fs; 1694 current->fs = new_fs;
1680 if (--fs->users) 1695 if (--fs->users)
1681 new_fs = NULL; 1696 new_fs = NULL;
1682 else 1697 else
1683 new_fs = fs; 1698 new_fs = fs;
1684 write_unlock(&fs->lock); 1699 spin_unlock(&fs->lock);
1685 } 1700 }
1686 1701
1687 if (new_mm) { 1702 if (new_mm) {
@@ -1689,6 +1704,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1689 active_mm = current->active_mm; 1704 active_mm = current->active_mm;
1690 current->mm = new_mm; 1705 current->mm = new_mm;
1691 current->active_mm = new_mm; 1706 current->active_mm = new_mm;
1707 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1708 atomic_dec(&mm->oom_disable_count);
1709 atomic_inc(&new_mm->oom_disable_count);
1710 }
1692 activate_mm(active_mm, new_mm); 1711 activate_mm(active_mm, new_mm);
1693 new_mm = mm; 1712 new_mm = mm;
1694 } 1713 }
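
The fork.c hunks above keep mm->oom_disable_count in step with the number of attached tasks whose oom_score_adj is OOM_SCORE_ADJ_MIN, open-coding the adjustment at every point where a task gains, loses, or swaps an mm. A minimal sketch of that rule follows; the helper name is hypothetical (the patch has no such helper), while the fields it touches are the ones used in the hunks above.

/*
 * Sketch only: the accounting rule the hunks above implement by hand at
 * each call site. move_oom_disable_count() is an invented name.
 */
#include <linux/oom.h>
#include <linux/sched.h>

static void move_oom_disable_count(struct task_struct *tsk,
                                   struct mm_struct *old_mm,
                                   struct mm_struct *new_mm)
{
        /* Only OOM-immune tasks contribute to the per-mm count. */
        if (tsk->signal->oom_score_adj != OOM_SCORE_ADJ_MIN)
                return;

        task_lock(tsk);         /* keep tsk->mm stable vs. the OOM killer */
        if (old_mm)
                atomic_dec(&old_mm->oom_disable_count);
        if (new_mm)
                atomic_inc(&new_mm->oom_disable_count);
        task_unlock(tsk);
}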
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..6c683b37f2ce 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -91,6 +91,7 @@ struct futex_pi_state {
91 91
92/** 92/**
93 * struct futex_q - The hashed futex queue entry, one per waiting task 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @list: priority-sorted list of tasks waiting on this futex
94 * @task: the task waiting on the futex 95 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock 96 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on 97 * @key: the key the futex is hashed on
@@ -104,7 +105,7 @@ struct futex_pi_state {
104 * 105 *
105 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 106 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 107 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
107 * The order of wakup is always to make the first condition true, then 108 * The order of wakeup is always to make the first condition true, then
108 * the second. 109 * the second.
109 * 110 *
110 * PI futexes are typically woken before they are removed from the hash list via 111 * PI futexes are typically woken before they are removed from the hash list via
@@ -168,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key)
168 169
169 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
170 case FUT_OFF_INODE: 171 case FUT_OFF_INODE:
171 atomic_inc(&key->shared.inode->i_count); 172 ihold(key->shared.inode);
172 break; 173 break;
173 case FUT_OFF_MMSHARED: 174 case FUT_OFF_MMSHARED:
174 atomic_inc(&key->private.mm->mm_count); 175 atomic_inc(&key->private.mm->mm_count);
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key)
295 * Slow path to fixup the fault we just took in the atomic write 296 * Slow path to fixup the fault we just took in the atomic write
296 * access to @uaddr. 297 * access to @uaddr.
297 * 298 *
298 * We have no generic implementation of a non destructive write to the 299 * We have no generic implementation of a non-destructive write to the
299 * user address. We know that we faulted in the atomic pagefault 300 * user address. We know that we faulted in the atomic pagefault
300 * disabled section so we can as well avoid the #PF overhead by 301 * disabled section so we can as well avoid the #PF overhead by
301 * calling get_user_pages() right away. 302 * calling get_user_pages() right away.
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
515 */ 516 */
516 pi_state = this->pi_state; 517 pi_state = this->pi_state;
517 /* 518 /*
518 * Userspace might have messed up non PI and PI futexes 519 * Userspace might have messed up non-PI and PI futexes
519 */ 520 */
520 if (unlikely(!pi_state)) 521 if (unlikely(!pi_state))
521 return -EINVAL; 522 return -EINVAL;
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q)
736 737
737 /* 738 /*
738 * We set q->lock_ptr = NULL _before_ we wake up the task. If 739 * We set q->lock_ptr = NULL _before_ we wake up the task. If
739 * a non futex wake up happens on another CPU then the task 740 * a non-futex wake up happens on another CPU then the task
740 * might exit and p would dereference a non existing task 741 * might exit and p would dereference a non-existing task
741 * struct. Prevent this by holding a reference on p across the 742 * struct. Prevent this by holding a reference on p across the
742 * wake up. 743 * wake up.
743 */ 744 */
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1131 1132
1132/** 1133/**
1133 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1134 * uaddr1: source futex user address 1135 * @uaddr1: source futex user address
1135 * uaddr2: target futex user address 1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
1136 * nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1137 * @uaddr2: target futex user address
1137 * nr_requeue: number of waiters to requeue (0-INT_MAX) 1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1138 * requeue_pi: if we are attempting to requeue from a non-pi futex to a 1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1139 * pi futex (pi to pi requeue is not supported) 1142 * pi futex (pi to pi requeue is not supported)
1140 * 1143 *
1141 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
@@ -1360,10 +1363,10 @@ out:
1360 1363
1361/* The key must be already stored in q->key. */ 1364/* The key must be already stored in q->key. */
1362static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 1365static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1366 __acquires(&hb->lock)
1363{ 1367{
1364 struct futex_hash_bucket *hb; 1368 struct futex_hash_bucket *hb;
1365 1369
1366 get_futex_key_refs(&q->key);
1367 hb = hash_futex(&q->key); 1370 hb = hash_futex(&q->key);
1368 q->lock_ptr = &hb->lock; 1371 q->lock_ptr = &hb->lock;
1369 1372
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1373 1376
1374static inline void 1377static inline void
1375queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1378queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1379 __releases(&hb->lock)
1376{ 1380{
1377 spin_unlock(&hb->lock); 1381 spin_unlock(&hb->lock);
1378 drop_futex_key_refs(&q->key);
1379} 1382}
1380 1383
1381/** 1384/**
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1391 * an example). 1394 * an example).
1392 */ 1395 */
1393static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1396static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1397 __releases(&hb->lock)
1394{ 1398{
1395 int prio; 1399 int prio;
1396 1400
@@ -1471,6 +1475,7 @@ retry:
1471 * and dropped here. 1475 * and dropped here.
1472 */ 1476 */
1473static void unqueue_me_pi(struct futex_q *q) 1477static void unqueue_me_pi(struct futex_q *q)
1478 __releases(q->lock_ptr)
1474{ 1479{
1475 WARN_ON(plist_node_empty(&q->list)); 1480 WARN_ON(plist_node_empty(&q->list));
1476 plist_del(&q->list, &q->list.plist); 1481 plist_del(&q->list, &q->list.plist);
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q)
1480 q->pi_state = NULL; 1485 q->pi_state = NULL;
1481 1486
1482 spin_unlock(q->lock_ptr); 1487 spin_unlock(q->lock_ptr);
1483
1484 drop_futex_key_refs(&q->key);
1485} 1488}
1486 1489
1487/* 1490/*
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1812 } 1815 }
1813 1816
1814retry: 1817retry:
1815 /* Prepare to wait on uaddr. */ 1818 /*
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs.
1821 */
1816 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1817 if (ret) 1823 if (ret)
1818 goto out; 1824 goto out;
@@ -1822,28 +1828,27 @@ retry:
1822 1828
1823 /* If we were woken (and unqueued), we succeeded, whatever. */ 1829 /* If we were woken (and unqueued), we succeeded, whatever. */
1824 ret = 0; 1830 ret = 0;
1831 /* unqueue_me() drops q.key ref */
1825 if (!unqueue_me(&q)) 1832 if (!unqueue_me(&q))
1826 goto out_put_key; 1833 goto out;
1827 ret = -ETIMEDOUT; 1834 ret = -ETIMEDOUT;
1828 if (to && !to->task) 1835 if (to && !to->task)
1829 goto out_put_key; 1836 goto out;
1830 1837
1831 /* 1838 /*
1832 * We expect signal_pending(current), but we might be the 1839 * We expect signal_pending(current), but we might be the
1833 * victim of a spurious wakeup as well. 1840 * victim of a spurious wakeup as well.
1834 */ 1841 */
1835 if (!signal_pending(current)) { 1842 if (!signal_pending(current))
1836 put_futex_key(fshared, &q.key);
1837 goto retry; 1843 goto retry;
1838 }
1839 1844
1840 ret = -ERESTARTSYS; 1845 ret = -ERESTARTSYS;
1841 if (!abs_time) 1846 if (!abs_time)
1842 goto out_put_key; 1847 goto out;
1843 1848
1844 restart = &current_thread_info()->restart_block; 1849 restart = &current_thread_info()->restart_block;
1845 restart->fn = futex_wait_restart; 1850 restart->fn = futex_wait_restart;
1846 restart->futex.uaddr = (u32 *)uaddr; 1851 restart->futex.uaddr = uaddr;
1847 restart->futex.val = val; 1852 restart->futex.val = val;
1848 restart->futex.time = abs_time->tv64; 1853 restart->futex.time = abs_time->tv64;
1849 restart->futex.bitset = bitset; 1854 restart->futex.bitset = bitset;
@@ -1856,8 +1861,6 @@ retry:
1856 1861
1857 ret = -ERESTART_RESTARTBLOCK; 1862 ret = -ERESTART_RESTARTBLOCK;
1858 1863
1859out_put_key:
1860 put_futex_key(fshared, &q.key);
1861out: 1864out:
1862 if (to) { 1865 if (to) {
1863 hrtimer_cancel(&to->timer); 1866 hrtimer_cancel(&to->timer);
@@ -1869,7 +1872,7 @@ out:
1869 1872
1870static long futex_wait_restart(struct restart_block *restart) 1873static long futex_wait_restart(struct restart_block *restart)
1871{ 1874{
1872 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1875 u32 __user *uaddr = restart->futex.uaddr;
1873 int fshared = 0; 1876 int fshared = 0;
1874 ktime_t t, *tp = NULL; 1877 ktime_t t, *tp = NULL;
1875 1878
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2236 q.rt_waiter = &rt_waiter; 2239 q.rt_waiter = &rt_waiter;
2237 q.requeue_pi_key = &key2; 2240 q.requeue_pi_key = &key2;
2238 2241
2239 /* Prepare to wait on uaddr. */ 2242 /*
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count.
2245 */
2240 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2241 if (ret) 2247 if (ret)
2242 goto out_key2; 2248 goto out_key2;
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2254 * In order for us to be here, we know our q.key == key2, and since 2260 * In order for us to be here, we know our q.key == key2, and since
2255 * we took the hb->lock above, we also know that futex_requeue() has 2261 * we took the hb->lock above, we also know that futex_requeue() has
2256 * completed and we no longer have to concern ourselves with a wakeup 2262 * completed and we no longer have to concern ourselves with a wakeup
2257 * race with the atomic proxy lock acquition by the requeue code. 2263 * race with the atomic proxy lock acquisition by the requeue code. The
2264 * futex_requeue dropped our key1 reference and incremented our key2
2265 * reference count.
2258 */ 2266 */
2259 2267
2260 /* Check if the requeue code acquired the second futex for us. */ 2268 /* Check if the requeue code acquired the second futex for us. */
@@ -2458,7 +2466,7 @@ retry:
2458 */ 2466 */
2459static inline int fetch_robust_entry(struct robust_list __user **entry, 2467static inline int fetch_robust_entry(struct robust_list __user **entry,
2460 struct robust_list __user * __user *head, 2468 struct robust_list __user * __user *head,
2461 int *pi) 2469 unsigned int *pi)
2462{ 2470{
2463 unsigned long uentry; 2471 unsigned long uentry;
2464 2472
@@ -2647,7 +2655,7 @@ static int __init futex_init(void)
2647 * of the complex code paths. Also we want to prevent 2655 * of the complex code paths. Also we want to prevent
2648 * registration of robust lists in that case. NULL is 2656 * registration of robust lists in that case. NULL is
2649 * guaranteed to fault and we get -EFAULT on functional 2657 * guaranteed to fault and we get -EFAULT on functional
2650 * implementation, the non functional ones will return 2658 * implementation, the non-functional ones will return
2651 * -ENOSYS. 2659 * -ENOSYS.
2652 */ 2660 */
2653 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2661 curval = cmpxchg_futex_value_locked(NULL, 0, 0);
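
Besides dropping the extra key reference previously taken in queue_lock() and released in queue_unlock(), the futex.c hunks above add sparse lock annotations to functions that deliberately return with a lock held or release a lock taken elsewhere. A minimal sketch of that annotation style, assuming only <linux/spinlock.h>; the function names are made up.

#include <linux/spinlock.h>

/* Returns with *lock held; __acquires() tells sparse this is intentional. */
static void example_lock(spinlock_t *lock)
        __acquires(lock)
{
        spin_lock(lock);
}

/* Releases a lock taken elsewhere; __releases() silences the imbalance warning. */
static void example_unlock(spinlock_t *lock)
        __releases(lock)
{
        spin_unlock(lock);
}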
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d49afb2395e5..06da4dfc339b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
19 */ 19 */
20static inline int 20static inline int
21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, 21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
22 compat_uptr_t __user *head, int *pi) 22 compat_uptr_t __user *head, unsigned int *pi)
23{ 23{
24 if (get_user(*uentry, head)) 24 if (get_user(*uentry, head))
25 return -EFAULT; 25 return -EFAULT;
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index ef3c3f88a7a3..9bd0934f6c33 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -33,10 +33,11 @@
33 * @children: child nodes 33 * @children: child nodes
34 * @all: list head for list of all nodes 34 * @all: list head for list of all nodes
35 * @parent: parent node 35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory 36 * @loaded_info: array of pointers to profiling data sets for loaded object
37 * @ghost: when an object file containing profiling data is unloaded we keep a 37 * files.
38 * copy of the profiling data here to allow collecting coverage data 38 * @num_loaded: number of profiling data sets for loaded object files.
39 * for cleanup code. Such a node is called a "ghost". 39 * @unloaded_info: accumulated copy of profiling data sets for unloaded
40 * object files. Used only when gcov_persist=1.
40 * @dentry: main debugfs entry, either a directory or data file 41 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links 42 * @links: associated symbolic links
42 * @name: data file basename 43 * @name: data file basename
@@ -51,10 +52,11 @@ struct gcov_node {
51 struct list_head children; 52 struct list_head children;
52 struct list_head all; 53 struct list_head all;
53 struct gcov_node *parent; 54 struct gcov_node *parent;
54 struct gcov_info *info; 55 struct gcov_info **loaded_info;
55 struct gcov_info *ghost; 56 struct gcov_info *unloaded_info;
56 struct dentry *dentry; 57 struct dentry *dentry;
57 struct dentry **links; 58 struct dentry **links;
59 int num_loaded;
58 char name[0]; 60 char name[0];
59}; 61};
60 62
@@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = {
136}; 138};
137 139
138/* 140/*
139 * Return the profiling data set for a given node. This can either be the 141 * Return a profiling data set associated with the given node. This is
140 * original profiling data structure or a duplicate (also called "ghost") 142 * either a data set for a loaded object file or a data set copy in case
141 * in case the associated object file has been unloaded. 143 * all associated object files have been unloaded.
142 */ 144 */
143static struct gcov_info *get_node_info(struct gcov_node *node) 145static struct gcov_info *get_node_info(struct gcov_node *node)
144{ 146{
145 if (node->info) 147 if (node->num_loaded > 0)
146 return node->info; 148 return node->loaded_info[0];
147 149
148 return node->ghost; 150 return node->unloaded_info;
151}
152
153/*
154 * Return a newly allocated profiling data set which contains the sum of
155 * all profiling data associated with the given node.
156 */
157static struct gcov_info *get_accumulated_info(struct gcov_node *node)
158{
159 struct gcov_info *info;
160 int i = 0;
161
162 if (node->unloaded_info)
163 info = gcov_info_dup(node->unloaded_info);
164 else
165 info = gcov_info_dup(node->loaded_info[i++]);
166 if (!info)
167 return NULL;
168 for (; i < node->num_loaded; i++)
169 gcov_info_add(info, node->loaded_info[i]);
170
171 return info;
149} 172}
150 173
151/* 174/*
@@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file)
163 mutex_lock(&node_lock); 186 mutex_lock(&node_lock);
164 /* 187 /*
165 * Read from a profiling data copy to minimize reference tracking 188 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access. 189 * complexity and concurrent access and to keep accumulating multiple
190 * profiling data sets associated with one node simple.
167 */ 191 */
168 info = gcov_info_dup(get_node_info(node)); 192 info = get_accumulated_info(node);
169 if (!info) 193 if (!info)
170 goto out_unlock; 194 goto out_unlock;
171 iter = gcov_iter_new(info); 195 iter = gcov_iter_new(info);
@@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name)
225 return NULL; 249 return NULL;
226} 250}
227 251
252/*
253 * Reset all profiling data associated with the specified node.
254 */
255static void reset_node(struct gcov_node *node)
256{
257 int i;
258
259 if (node->unloaded_info)
260 gcov_info_reset(node->unloaded_info);
261 for (i = 0; i < node->num_loaded; i++)
262 gcov_info_reset(node->loaded_info[i]);
263}
264
228static void remove_node(struct gcov_node *node); 265static void remove_node(struct gcov_node *node);
229 266
230/* 267/*
231 * write() implementation for gcov data files. Reset profiling data for the 268 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is 269 * corresponding file. If all associated object files have been unloaded,
233 * a "ghost" node), remove the debug fs node as well. 270 * remove the debug fs node as well.
234 */ 271 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr, 272static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos) 273 size_t len, loff_t *pos)
@@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
245 node = get_node_by_name(info->filename); 282 node = get_node_by_name(info->filename);
246 if (node) { 283 if (node) {
247 /* Reset counts or remove node for unloaded modules. */ 284 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost) 285 if (node->num_loaded == 0)
249 remove_node(node); 286 remove_node(node);
250 else 287 else
251 gcov_info_reset(node->info); 288 reset_node(node);
252 } 289 }
253 /* Reset counts for open file. */ 290 /* Reset counts for open file. */
254 gcov_info_reset(info); 291 gcov_info_reset(info);
@@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info,
378 INIT_LIST_HEAD(&node->list); 415 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children); 416 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all); 417 INIT_LIST_HEAD(&node->all);
381 node->info = info; 418 if (node->loaded_info) {
419 node->loaded_info[0] = info;
420 node->num_loaded = 1;
421 }
382 node->parent = parent; 422 node->parent = parent;
383 if (name) 423 if (name)
384 strcpy(node->name, name); 424 strcpy(node->name, name);
@@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent,
394 struct gcov_node *node; 434 struct gcov_node *node;
395 435
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); 436 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) { 437 if (!node)
398 pr_warning("out of memory\n"); 438 goto err_nomem;
399 return NULL; 439 if (info) {
440 node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
441 GFP_KERNEL);
442 if (!node->loaded_info)
443 goto err_nomem;
400 } 444 }
401 init_node(node, info, name, parent); 445 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */ 446 /* Differentiate between gcov data file nodes and directory nodes. */
@@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent,
416 list_add(&node->all, &all_head); 460 list_add(&node->all, &all_head);
417 461
418 return node; 462 return node;
463
464err_nomem:
465 kfree(node);
466 pr_warning("out of memory\n");
467 return NULL;
419} 468}
420 469
421/* Remove symbolic links associated with node. */ 470/* Remove symbolic links associated with node. */
@@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node)
441 list_del(&node->all); 490 list_del(&node->all);
442 debugfs_remove(node->dentry); 491 debugfs_remove(node->dentry);
443 remove_links(node); 492 remove_links(node);
444 if (node->ghost) 493 kfree(node->loaded_info);
445 gcov_info_free(node->ghost); 494 if (node->unloaded_info)
495 gcov_info_free(node->unloaded_info);
446 kfree(node); 496 kfree(node);
447} 497}
448 498
@@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent,
477 527
478/* 528/*
479 * write() implementation for reset file. Reset all profiling data to zero 529 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes. 530 * and remove nodes for which all associated object files are unloaded.
481 */ 531 */
482static ssize_t reset_write(struct file *file, const char __user *addr, 532static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos) 533 size_t len, loff_t *pos)
@@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr,
487 mutex_lock(&node_lock); 537 mutex_lock(&node_lock);
488restart: 538restart:
489 list_for_each_entry(node, &all_head, all) { 539 list_for_each_entry(node, &all_head, all) {
490 if (node->info) 540 if (node->num_loaded > 0)
491 gcov_info_reset(node->info); 541 reset_node(node);
492 else if (list_empty(&node->children)) { 542 else if (list_empty(&node->children)) {
493 remove_node(node); 543 remove_node(node);
494 /* Several nodes may have gone - restart loop. */ 544 /* Several nodes may have gone - restart loop. */
@@ -511,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
511static const struct file_operations gcov_reset_fops = { 561static const struct file_operations gcov_reset_fops = {
512 .write = reset_write, 562 .write = reset_write,
513 .read = reset_read, 563 .read = reset_read,
564 .llseek = noop_llseek,
514}; 565};
515 566
516/* 567/*
@@ -564,37 +615,115 @@ err_remove:
564} 615}
565 616
566/* 617/*
567 * The profiling data set associated with this node is being unloaded. Store a 618 * Associate a profiling data set with an existing node. Needs to be called
568 * copy of the profiling data and turn this node into a "ghost". 619 * with node_lock held.
569 */ 620 */
570static int ghost_node(struct gcov_node *node) 621static void add_info(struct gcov_node *node, struct gcov_info *info)
571{ 622{
572 node->ghost = gcov_info_dup(node->info); 623 struct gcov_info **loaded_info;
573 if (!node->ghost) { 624 int num = node->num_loaded;
574 pr_warning("could not save data for '%s' (out of memory)\n", 625
575 node->info->filename); 626 /*
576 return -ENOMEM; 627 * Prepare new array. This is done first to simplify cleanup in
628 * case the new data set is incompatible, the node only contains
629 * unloaded data sets and there's not enough memory for the array.
630 */
631 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
632 if (!loaded_info) {
633 pr_warning("could not add '%s' (out of memory)\n",
634 info->filename);
635 return;
636 }
637 memcpy(loaded_info, node->loaded_info,
638 num * sizeof(struct gcov_info *));
639 loaded_info[num] = info;
640 /* Check if the new data set is compatible. */
641 if (num == 0) {
642 /*
643 * A module was unloaded, modified and reloaded. The new
644 * data set replaces the copy of the last one.
645 */
646 if (!gcov_info_is_compatible(node->unloaded_info, info)) {
647 pr_warning("discarding saved data for %s "
648 "(incompatible version)\n", info->filename);
649 gcov_info_free(node->unloaded_info);
650 node->unloaded_info = NULL;
651 }
652 } else {
653 /*
654 * Two different versions of the same object file are loaded.
655 * The initial one takes precedence.
656 */
657 if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
658 pr_warning("could not add '%s' (incompatible "
659 "version)\n", info->filename);
660 kfree(loaded_info);
661 return;
662 }
577 } 663 }
578 node->info = NULL; 664 /* Overwrite previous array. */
665 kfree(node->loaded_info);
666 node->loaded_info = loaded_info;
667 node->num_loaded = num + 1;
668}
579 669
580 return 0; 670/*
671 * Return the index of a profiling data set associated with a node.
672 */
673static int get_info_index(struct gcov_node *node, struct gcov_info *info)
674{
675 int i;
676
677 for (i = 0; i < node->num_loaded; i++) {
678 if (node->loaded_info[i] == info)
679 return i;
680 }
681 return -ENOENT;
581} 682}
582 683
583/* 684/*
584 * Profiling data for this node has been loaded again. Add profiling data 685 * Save the data of a profiling data set which is being unloaded.
585 * from previous instantiation and turn this node into a regular node.
586 */ 686 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info) 687static void save_info(struct gcov_node *node, struct gcov_info *info)
588{ 688{
589 if (gcov_info_is_compatible(node->ghost, info)) 689 if (node->unloaded_info)
590 gcov_info_add(info, node->ghost); 690 gcov_info_add(node->unloaded_info, info);
591 else { 691 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n", 692 node->unloaded_info = gcov_info_dup(info);
693 if (!node->unloaded_info) {
694 pr_warning("could not save data for '%s' "
695 "(out of memory)\n", info->filename);
696 }
697 }
698}
699
700/*
701 * Disassociate a profiling data set from a node. Needs to be called with
702 * node_lock held.
703 */
704static void remove_info(struct gcov_node *node, struct gcov_info *info)
705{
706 int i;
707
708 i = get_info_index(node, info);
709 if (i < 0) {
710 pr_warning("could not remove '%s' (not found)\n",
593 info->filename); 711 info->filename);
712 return;
594 } 713 }
595 gcov_info_free(node->ghost); 714 if (gcov_persist)
596 node->ghost = NULL; 715 save_info(node, info);
597 node->info = info; 716 /* Shrink array. */
717 node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
718 node->num_loaded--;
719 if (node->num_loaded > 0)
720 return;
721 /* Last loaded data set was removed. */
722 kfree(node->loaded_info);
723 node->loaded_info = NULL;
724 node->num_loaded = 0;
725 if (!node->unloaded_info)
726 remove_node(node);
598} 727}
599 728
600/* 729/*
@@ -609,30 +738,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
609 node = get_node_by_name(info->filename); 738 node = get_node_by_name(info->filename);
610 switch (action) { 739 switch (action) {
611 case GCOV_ADD: 740 case GCOV_ADD:
612 /* Add new node or revive ghost. */ 741 if (node)
613 if (!node) { 742 add_info(node, info);
743 else
614 add_node(info); 744 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break; 745 break;
624 case GCOV_REMOVE: 746 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */ 747 if (node)
626 if (!node) { 748 remove_info(node, info);
749 else {
627 pr_warning("could not remove '%s' (not found)\n", 750 pr_warning("could not remove '%s' (not found)\n",
628 info->filename); 751 info->filename);
629 break;
630 } 752 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break; 753 break;
637 } 754 }
638 mutex_unlock(&node_lock); 755 mutex_unlock(&node_lock);
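
The gcov/fs.c rework replaces the single info/ghost pair with an array of data sets for loaded object files plus one accumulated copy for unloaded ones. A compact restatement of the node states this implies; the helper name is purely hypothetical (the real code open-codes these checks in add_info() and remove_info()), and struct gcov_node is the file-local type shown above.

/*
 * Sketch of the node states implied by the new layout:
 *
 *   num_loaded > 0,  unloaded_info == NULL  only loaded object files
 *   num_loaded > 0,  unloaded_info != NULL  reloaded after an unload
 *                                           (gcov_persist=1 kept a copy)
 *   num_loaded == 0, unloaded_info != NULL  all objects unloaded, copy kept
 *   num_loaded == 0, unloaded_info == NULL  nothing left, node gets removed
 */
static bool gcov_node_has_data(const struct gcov_node *node)
{
        return node->num_loaded > 0 || node->unloaded_info != NULL;
}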
diff --git a/kernel/groups.c b/kernel/groups.c
index 53b1916c9492..253dc0f35cf4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
143 right = group_info->ngroups; 143 right = group_info->ngroups;
144 while (left < right) { 144 while (left < right) {
145 unsigned int mid = (left+right)/2; 145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid); 146 if (grp > GROUP_AT(group_info, mid))
147 if (cmp > 0)
148 left = mid + 1; 147 left = mid + 1;
149 else if (cmp < 0) 148 else if (grp < GROUP_AT(group_info, mid))
150 right = mid; 149 right = mid;
151 else 150 else
152 return 1; 151 return 1;
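
The groups.c hunk exists because gid_t is unsigned: subtracting two gids can wrap around instead of going negative, which mis-steers the binary search, so the comparison is done directly. A userspace illustration of the failure mode (not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned int grp = 0;                   /* gid being looked up        */
        unsigned int mid = 0x90000000u;         /* gid at the probe position  */
        int cmp = grp - mid;                    /* wraps to 0x70000000: positive */

        /* Old logic: cmp > 0 means "search the upper half", wrong here. */
        printf("cmp = %d, but grp > mid is %d\n", cmp, grp > mid);
        return 0;
}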
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..72206cf5c6cf 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
90 do { 90 do {
91 seq = read_seqbegin(&xtime_lock); 91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time(); 92 xts = __current_kernel_time();
93 tom = wall_to_monotonic; 93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq)); 94 } while (read_seqretry(&xtime_lock, seq));
95 95
96 xtim = timespec_to_ktime(xts); 96 xtim = timespec_to_ktime(xts);
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
144static int hrtimer_get_target(int this_cpu, int pinned) 144static int hrtimer_get_target(int this_cpu, int pinned)
145{ 145{
146#ifdef CONFIG_NO_HZ 146#ifdef CONFIG_NO_HZ
147 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { 147 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
148 int preferred_cpu = get_nohz_load_balancer(); 148 return get_nohz_timer_target();
149
150 if (preferred_cpu >= 0)
151 return preferred_cpu;
152 }
153#endif 149#endif
154 return this_cpu; 150 return this_cpu;
155} 151}
@@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
612static void retrigger_next_event(void *arg) 608static void retrigger_next_event(void *arg)
613{ 609{
614 struct hrtimer_cpu_base *base; 610 struct hrtimer_cpu_base *base;
615 struct timespec realtime_offset; 611 struct timespec realtime_offset, wtm;
616 unsigned long seq; 612 unsigned long seq;
617 613
618 if (!hrtimer_hres_active()) 614 if (!hrtimer_hres_active())
@@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg)
620 616
621 do { 617 do {
622 seq = read_seqbegin(&xtime_lock); 618 seq = read_seqbegin(&xtime_lock);
623 set_normalized_timespec(&realtime_offset, 619 wtm = __get_wall_to_monotonic();
624 -wall_to_monotonic.tv_sec,
625 -wall_to_monotonic.tv_nsec);
626 } while (read_seqretry(&xtime_lock, seq)); 620 } while (read_seqretry(&xtime_lock, seq));
621 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
627 622
628 base = &__get_cpu_var(hrtimer_bases); 623 base = &__get_cpu_var(hrtimer_bases);
629 624
@@ -936,6 +931,7 @@ static inline int
936remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) 931remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
937{ 932{
938 if (hrtimer_is_queued(timer)) { 933 if (hrtimer_is_queued(timer)) {
934 unsigned long state;
939 int reprogram; 935 int reprogram;
940 936
941 /* 937 /*
@@ -949,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
949 debug_deactivate(timer); 945 debug_deactivate(timer);
950 timer_stats_hrtimer_clear_start_info(timer); 946 timer_stats_hrtimer_clear_start_info(timer);
951 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 947 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
952 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 948 /*
953 reprogram); 949 * We must preserve the CALLBACK state flag here,
950 * otherwise we could move the timer base in
951 * switch_hrtimer_base.
952 */
953 state = timer->state & HRTIMER_STATE_CALLBACK;
954 __remove_hrtimer(timer, base, state, reprogram);
954 return 1; 955 return 1;
955 } 956 }
956 return 0; 957 return 0;
@@ -1096,11 +1097,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
1096 */ 1097 */
1097ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 1098ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1098{ 1099{
1099 struct hrtimer_clock_base *base;
1100 unsigned long flags; 1100 unsigned long flags;
1101 ktime_t rem; 1101 ktime_t rem;
1102 1102
1103 base = lock_hrtimer_base(timer, &flags); 1103 lock_hrtimer_base(timer, &flags);
1104 rem = hrtimer_expires_remaining(timer); 1104 rem = hrtimer_expires_remaining(timer);
1105 unlock_hrtimer_base(timer, &flags); 1105 unlock_hrtimer_base(timer, &flags);
1106 1106
@@ -1237,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1237 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1237 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1238 enqueue_hrtimer(timer, base); 1238 enqueue_hrtimer(timer, base);
1239 } 1239 }
1240
1241 WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
1242
1240 timer->state &= ~HRTIMER_STATE_CALLBACK; 1243 timer->state &= ~HRTIMER_STATE_CALLBACK;
1241} 1244}
1242 1245
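
Two patterns recur in the hrtimer.c hunks: xtime_lock readers now only snapshot wall_to_monotonic inside the seqlock retry loop and do the negation afterwards, and remove_hrtimer() preserves HRTIMER_STATE_CALLBACK so a running callback cannot have its base migrated underneath it. A minimal sketch of the first pattern; the function name is hypothetical.

#include <linux/seqlock.h>
#include <linux/time.h>

/* Hypothetical helper: copy the shared value inside the retry loop only. */
static struct timespec snapshot_negated(seqlock_t *lock,
                                        const struct timespec *shared_wtm)
{
        struct timespec wtm, offset;
        unsigned long seq;

        do {
                seq = read_seqbegin(lock);
                wtm = *shared_wtm;              /* raw copy, nothing else */
        } while (read_seqretry(lock, seq));

        /* Derived work happens after the loop, as in retrigger_next_event(). */
        set_normalized_timespec(&offset, -wtm.tv_sec, -wtm.tv_nsec);
        return offset;
}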
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac2..53ead174da2f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
99 " disables this message.\n"); 99 " disables this message.\n");
100 sched_show_task(t); 100 sched_show_task(t);
101 __debug_show_held_locks(t); 101 debug_show_held_locks(t);
102 102
103 touch_nmi_watchdog(); 103 touch_nmi_watchdog();
104 104
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
111 * periodically exit the critical section and enter a new one. 111 * periodically exit the critical section and enter a new one.
112 * 112 *
113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
114 * exit the grace period. For classic RCU, a reschedule is required. 114 * to exit the grace period. For classic RCU, a reschedule is required.
115 */ 115 */
116static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 116static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
117{ 117{
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 7a56b22e0602..2c9120f0afca 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -41,6 +41,7 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/list.h>
44#include <linux/cpu.h> 45#include <linux/cpu.h>
45#include <linux/smp.h> 46#include <linux/smp.h>
46 47
@@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62 63
63static int nr_slots[TYPE_MAX]; 64static int nr_slots[TYPE_MAX];
64 65
66/* Keep track of the breakpoints attached to tasks */
67static LIST_HEAD(bp_task_head);
68
65static int constraints_initialized; 69static int constraints_initialized;
66 70
67/* Gather the number of total pinned and un-pinned bp in a cpuset */ 71/* Gather the number of total pinned and un-pinned bp in a cpuset */
@@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
103 return 0; 107 return 0;
104} 108}
105 109
106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) 110/*
111 * Count the number of breakpoints of the same type and same task.
112 * The given event must be not on the list.
113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
107{ 115{
108 struct perf_event_context *ctx = tsk->perf_event_ctxp; 116 struct task_struct *tsk = bp->hw.bp_target;
109 struct list_head *list; 117 struct perf_event *iter;
110 struct perf_event *bp;
111 unsigned long flags;
112 int count = 0; 118 int count = 0;
113 119
114 if (WARN_ONCE(!ctx, "No perf context for this task")) 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
115 return 0; 121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
116 122 count += hw_breakpoint_weight(iter);
117 list = &ctx->event_list;
118
119 raw_spin_lock_irqsave(&ctx->lock, flags);
120
121 /*
122 * The current breakpoint counter is not included in the list
123 * at the open() callback time
124 */
125 list_for_each_entry(bp, list, event_entry) {
126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
129 } 123 }
130 124
131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
132
133 return count; 125 return count;
134} 126}
135 127
@@ -142,14 +134,14 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
142 enum bp_type_idx type) 134 enum bp_type_idx type)
143{ 135{
144 int cpu = bp->cpu; 136 int cpu = bp->cpu;
145 struct task_struct *tsk = bp->ctx->task; 137 struct task_struct *tsk = bp->hw.bp_target;
146 138
147 if (cpu >= 0) { 139 if (cpu >= 0) {
148 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); 140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
149 if (!tsk) 141 if (!tsk)
150 slots->pinned += max_task_bp_pinned(cpu, type); 142 slots->pinned += max_task_bp_pinned(cpu, type);
151 else 143 else
152 slots->pinned += task_bp_pinned(tsk, type); 144 slots->pinned += task_bp_pinned(bp, type);
153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
154 146
155 return; 147 return;
@@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
162 if (!tsk) 154 if (!tsk)
163 nr += max_task_bp_pinned(cpu, type); 155 nr += max_task_bp_pinned(cpu, type);
164 else 156 else
165 nr += task_bp_pinned(tsk, type); 157 nr += task_bp_pinned(bp, type);
166 158
167 if (nr > slots->pinned) 159 if (nr > slots->pinned)
168 slots->pinned = nr; 160 slots->pinned = nr;
@@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
188/* 180/*
189 * Add a pinned breakpoint for the given task in our constraint table 181 * Add a pinned breakpoint for the given task in our constraint table
190 */ 182 */
191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, 183static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
192 enum bp_type_idx type, int weight) 184 enum bp_type_idx type, int weight)
193{ 185{
194 unsigned int *tsk_pinned; 186 unsigned int *tsk_pinned;
@@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
196 int old_idx = 0; 188 int old_idx = 0;
197 int idx = 0; 189 int idx = 0;
198 190
199 old_count = task_bp_pinned(tsk, type); 191 old_count = task_bp_pinned(bp, type);
200 old_idx = old_count - 1; 192 old_idx = old_count - 1;
201 idx = old_idx + weight; 193 idx = old_idx + weight;
202 194
195 /* tsk_pinned[n] is the number of tasks having n breakpoints */
203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); 196 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
204 if (enable) { 197 if (enable) {
205 tsk_pinned[idx]++; 198 tsk_pinned[idx]++;
@@ -220,25 +213,43 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
220 int weight) 213 int weight)
221{ 214{
222 int cpu = bp->cpu; 215 int cpu = bp->cpu;
223 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->hw.bp_target;
217
218 /* Pinned counter cpu profiling */
219 if (!tsk) {
220
221 if (enable)
222 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
223 else
224 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
225 return;
226 }
224 227
225 /* Pinned counter task profiling */ 228 /* Pinned counter task profiling */
226 if (tsk) {
227 if (cpu >= 0) {
228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
229 return;
230 }
231 229
230 if (!enable)
231 list_del(&bp->hw.bp_list);
232
233 if (cpu >= 0) {
234 toggle_bp_task_slot(bp, cpu, enable, type, weight);
235 } else {
232 for_each_online_cpu(cpu) 236 for_each_online_cpu(cpu)
233 toggle_bp_task_slot(tsk, cpu, enable, type, weight); 237 toggle_bp_task_slot(bp, cpu, enable, type, weight);
234 return;
235 } 238 }
236 239
237 /* Pinned counter cpu profiling */
238 if (enable) 240 if (enable)
239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; 241 list_add_tail(&bp->hw.bp_list, &bp_task_head);
240 else 242}
241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; 243
244/*
245 * Function to perform processor-specific cleanup during unregistration
246 */
247__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
248{
249 /*
250 * A weak stub function here for those archs that don't define
251 * it inside arch/.../kernel/hw_breakpoint.c
252 */
242} 253}
243 254
244/* 255/*
@@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp)
301 weight = hw_breakpoint_weight(bp); 312 weight = hw_breakpoint_weight(bp);
302 313
303 fetch_bp_busy_slots(&slots, bp, type); 314 fetch_bp_busy_slots(&slots, bp, type);
315 /*
316 * Simulate the addition of this breakpoint to the constraints
317 * and see the result.
318 */
304 fetch_this_slot(&slots, weight); 319 fetch_this_slot(&slots, weight);
305 320
306 /* Flexible counters need to keep at least one slot */ 321 /* Flexible counters need to keep at least one slot */
@@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp)
339{ 354{
340 mutex_lock(&nr_bp_mutex); 355 mutex_lock(&nr_bp_mutex);
341 356
357 arch_unregister_hw_breakpoint(bp);
342 __release_bp_slot(bp); 358 __release_bp_slot(bp);
343 359
344 mutex_unlock(&nr_bp_mutex); 360 mutex_unlock(&nr_bp_mutex);
@@ -417,7 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
417 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
418 struct task_struct *tsk) 434 struct task_struct *tsk)
419{ 435{
420 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
421} 437}
422EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
423 439
@@ -499,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
499 get_online_cpus(); 515 get_online_cpus();
500 for_each_online_cpu(cpu) { 516 for_each_online_cpu(cpu) {
501 pevent = per_cpu_ptr(cpu_events, cpu); 517 pevent = per_cpu_ptr(cpu_events, cpu);
502 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
503 519
504 *pevent = bp; 520 *pevent = bp;
505 521
@@ -549,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
549 .priority = 0x7fffffff 565 .priority = 0x7fffffff
550}; 566};
551 567
568static void bp_perf_event_destroy(struct perf_event *event)
569{
570 release_bp_slot(event);
571}
572
573static int hw_breakpoint_event_init(struct perf_event *bp)
574{
575 int err;
576
577 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
578 return -ENOENT;
579
580 err = register_perf_hw_breakpoint(bp);
581 if (err)
582 return err;
583
584 bp->destroy = bp_perf_event_destroy;
585
586 return 0;
587}
588
589static int hw_breakpoint_add(struct perf_event *bp, int flags)
590{
591 if (!(flags & PERF_EF_START))
592 bp->hw.state = PERF_HES_STOPPED;
593
594 return arch_install_hw_breakpoint(bp);
595}
596
597static void hw_breakpoint_del(struct perf_event *bp, int flags)
598{
599 arch_uninstall_hw_breakpoint(bp);
600}
601
602static void hw_breakpoint_start(struct perf_event *bp, int flags)
603{
604 bp->hw.state = 0;
605}
606
607static void hw_breakpoint_stop(struct perf_event *bp, int flags)
608{
609 bp->hw.state = PERF_HES_STOPPED;
610}
611
612static struct pmu perf_breakpoint = {
613 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
614
615 .event_init = hw_breakpoint_event_init,
616 .add = hw_breakpoint_add,
617 .del = hw_breakpoint_del,
618 .start = hw_breakpoint_start,
619 .stop = hw_breakpoint_stop,
620 .read = hw_breakpoint_pmu_read,
621};
622
552static int __init init_hw_breakpoint(void) 623static int __init init_hw_breakpoint(void)
553{ 624{
554 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
@@ -570,6 +641,8 @@ static int __init init_hw_breakpoint(void)
570 641
571 constraints_initialized = 1; 642 constraints_initialized = 1;
572 643
644 perf_pmu_register(&perf_breakpoint);
645
573 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
574 647
575 err_alloc: 648 err_alloc:
@@ -585,8 +658,3 @@ static int __init init_hw_breakpoint(void)
585core_initcall(init_hw_breakpoint); 658core_initcall(init_hw_breakpoint);
586 659
587 660
588struct pmu perf_ops_bp = {
589 .enable = arch_install_hw_breakpoint,
590 .disable = arch_uninstall_hw_breakpoint,
591 .read = hw_breakpoint_pmu_read,
592};
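
With the hw_breakpoint.c changes, breakpoints are exposed through a regular struct pmu registered via perf_pmu_register(), and kernel callers pass a task_struct pointer (or NULL for a CPU-wide counter) instead of a pid. A usage sketch under those assumptions; the helper name is invented, and the attr setup mirrors register_user_hw_breakpoint() above.

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>

/* Hypothetical caller: watch a 4-byte data write in a given task. */
static struct perf_event *watch_task_write(struct task_struct *tsk,
                                           unsigned long addr,
                                           perf_overflow_handler_t handler)
{
        struct perf_event_attr attr;

        hw_breakpoint_init(&attr);              /* PERF_TYPE_BREAKPOINT defaults */
        attr.bp_addr = addr;
        attr.bp_len  = HW_BREAKPOINT_LEN_4;
        attr.bp_type = HW_BREAKPOINT_W;

        /* cpu == -1: follow the task; tsk == NULL would mean a per-CPU counter */
        return perf_event_create_kernel_counter(&attr, -1, tsk, handler);
}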
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
new file mode 100644
index 000000000000..31d766bf5d2e
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,53 @@
1config HAVE_GENERIC_HARDIRQS
2 def_bool n
3
4if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem"
6#
7# Interrupt subsystem related configuration options
8#
9config GENERIC_HARDIRQS
10 def_bool y
11
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n
18
19# Options selectable by the architecture code
20config HAVE_SPARSE_IRQ
21 def_bool n
22
23config GENERIC_IRQ_PROBE
24 def_bool n
25
26config GENERIC_PENDING_IRQ
27 def_bool n
28
29config AUTO_IRQ_AFFINITY
30 def_bool n
31
32config IRQ_PER_CPU
33 def_bool n
34
35config HARDIRQS_SW_RESEND
36 def_bool n
37
38config SPARSE_IRQ
39 bool "Support sparse irq numbering"
40 depends on HAVE_SPARSE_IRQ
41 ---help---
42
43 Sparse irq numbering is useful for distro kernels that want
44 to define a high CONFIG_NR_CPUS value but still want to have
45 low kernel memory footprint on smaller machines.
46
47 ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
48 out the interrupt descriptors in a more NUMA-friendly way. )
49
50 If you don't know what to do here, say N.
51
52endmenu
53endif
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 7d047808419d..54329cd7b3ee 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,7 +1,6 @@
1 1
2obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 6obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 2295a31ef110..505798f86c36 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -57,9 +57,10 @@ unsigned long probe_irq_on(void)
57 * Some chips need to know about probing in 57 * Some chips need to know about probing in
58 * progress: 58 * progress:
59 */ 59 */
60 if (desc->chip->set_type) 60 if (desc->irq_data.chip->irq_set_type)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 desc->chip->startup(i); 62 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data);
63 } 64 }
64 raw_spin_unlock_irq(&desc->lock); 65 raw_spin_unlock_irq(&desc->lock);
65 } 66 }
@@ -76,7 +77,7 @@ unsigned long probe_irq_on(void)
76 raw_spin_lock_irq(&desc->lock); 77 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 80 if (desc->irq_data.chip->irq_startup(&desc->irq_data))
80 desc->status |= IRQ_PENDING; 81 desc->status |= IRQ_PENDING;
81 } 82 }
82 raw_spin_unlock_irq(&desc->lock); 83 raw_spin_unlock_irq(&desc->lock);
@@ -98,7 +99,7 @@ unsigned long probe_irq_on(void)
98 /* It triggered already - consider it spurious. */ 99 /* It triggered already - consider it spurious. */
99 if (!(status & IRQ_WAITING)) { 100 if (!(status & IRQ_WAITING)) {
100 desc->status = status & ~IRQ_AUTODETECT; 101 desc->status = status & ~IRQ_AUTODETECT;
101 desc->chip->shutdown(i); 102 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
102 } else 103 } else
103 if (i < 32) 104 if (i < 32)
104 mask |= 1 << i; 105 mask |= 1 << i;
@@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val)
137 mask |= 1 << i; 138 mask |= 1 << i;
138 139
139 desc->status = status & ~IRQ_AUTODETECT; 140 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 141 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
141 } 142 }
142 raw_spin_unlock_irq(&desc->lock); 143 raw_spin_unlock_irq(&desc->lock);
143 } 144 }
@@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val)
181 nr_of_irqs++; 182 nr_of_irqs++;
182 } 183 }
183 desc->status = status & ~IRQ_AUTODETECT; 184 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 185 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
185 } 186 }
186 raw_spin_unlock_irq(&desc->lock); 187 raw_spin_unlock_irq(&desc->lock);
187 } 188 }
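
The autoprobe.c hunk above and the chip.c changes below move every irqchip callback from an irq-number signature to a struct irq_data one; drivers recover the irq number and their private data from the irq_data instead of looking them up again. A minimal sketch of a converted callback, with an invented driver and register layout.

#include <linux/io.h>
#include <linux/irq.h>

/* Invented per-chip state; registered earlier via set_irq_chip_data(). */
struct demo_chip_priv {
        void __iomem    *mask_reg;
        u32             bit;
};

/* New style: the callback receives a struct irq_data, not an irq number. */
static void demo_irq_mask(struct irq_data *d)
{
        struct demo_chip_priv *p = d->chip_data;  /* was get_irq_chip_data(irq) */

        writel(p->bit, p->mask_reg);
}

static struct irq_chip demo_chip = {
        .name           = "demo",
        .irq_mask       = demo_irq_mask,
};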
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b7091d5ca2f8..baa5c4acad83 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,108 +18,6 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22{
23 struct irq_desc *desc;
24 unsigned long flags;
25
26 desc = irq_to_desc(irq);
27 if (!desc) {
28 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
29 return;
30 }
31
32 /* Ensure we don't have left over values from a previous use of this irq */
33 raw_spin_lock_irqsave(&desc->lock, flags);
34 desc->status = IRQ_DISABLED;
35 desc->chip = &no_irq_chip;
36 desc->handle_irq = handle_bad_irq;
37 desc->depth = 1;
38 desc->msi_desc = NULL;
39 desc->handler_data = NULL;
40 if (!keep_chip_data)
41 desc->chip_data = NULL;
42 desc->action = NULL;
43 desc->irq_count = 0;
44 desc->irqs_unhandled = 0;
45#ifdef CONFIG_SMP
46 cpumask_setall(desc->affinity);
47#ifdef CONFIG_GENERIC_PENDING_IRQ
48 cpumask_clear(desc->pending_mask);
49#endif
50#endif
51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52}
53
54/**
55 * dynamic_irq_init - initialize a dynamically allocated irq
56 * @irq: irq number to initialize
57 */
58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
75{
76 struct irq_desc *desc = irq_to_desc(irq);
77 unsigned long flags;
78
79 if (!desc) {
80 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
81 return;
82 }
83
84 raw_spin_lock_irqsave(&desc->lock, flags);
85 if (desc->action) {
86 raw_spin_unlock_irqrestore(&desc->lock, flags);
87 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
88 irq);
89 return;
90 }
91 desc->msi_desc = NULL;
92 desc->handler_data = NULL;
93 if (!keep_chip_data)
94 desc->chip_data = NULL;
95 desc->handle_irq = handle_bad_irq;
96 desc->chip = &no_irq_chip;
97 desc->name = NULL;
98 clear_kstat_irqs(desc);
99 raw_spin_unlock_irqrestore(&desc->lock, flags);
100}
101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
122
123/** 21/**
124 * set_irq_chip - set the irq chip for an irq 22 * set_irq_chip - set the irq chip for an irq
125 * @irq: irq number 23 * @irq: irq number
@@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
140 38
141 raw_spin_lock_irqsave(&desc->lock, flags); 39 raw_spin_lock_irqsave(&desc->lock, flags);
142 irq_chip_set_defaults(chip); 40 irq_chip_set_defaults(chip);
143 desc->chip = chip; 41 desc->irq_data.chip = chip;
144 raw_spin_unlock_irqrestore(&desc->lock, flags); 42 raw_spin_unlock_irqrestore(&desc->lock, flags);
145 43
146 return 0; 44 return 0;
@@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data)
193 } 91 }
194 92
195 raw_spin_lock_irqsave(&desc->lock, flags); 93 raw_spin_lock_irqsave(&desc->lock, flags);
196 desc->handler_data = data; 94 desc->irq_data.handler_data = data;
197 raw_spin_unlock_irqrestore(&desc->lock, flags); 95 raw_spin_unlock_irqrestore(&desc->lock, flags);
198 return 0; 96 return 0;
199} 97}
@@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
218 } 116 }
219 117
220 raw_spin_lock_irqsave(&desc->lock, flags); 118 raw_spin_lock_irqsave(&desc->lock, flags);
221 desc->msi_desc = entry; 119 desc->irq_data.msi_desc = entry;
222 if (entry) 120 if (entry)
223 entry->irq = irq; 121 entry->irq = irq;
224 raw_spin_unlock_irqrestore(&desc->lock, flags); 122 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data)
243 return -EINVAL; 141 return -EINVAL;
244 } 142 }
245 143
246 if (!desc->chip) { 144 if (!desc->irq_data.chip) {
247 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); 145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
248 return -EINVAL; 146 return -EINVAL;
249 } 147 }
250 148
251 raw_spin_lock_irqsave(&desc->lock, flags); 149 raw_spin_lock_irqsave(&desc->lock, flags);
252 desc->chip_data = data; 150 desc->irq_data.chip_data = data;
253 raw_spin_unlock_irqrestore(&desc->lock, flags); 151 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 152
255 return 0; 153 return 0;
256} 154}
257EXPORT_SYMBOL(set_irq_chip_data); 155EXPORT_SYMBOL(set_irq_chip_data);
258 156
157struct irq_data *irq_get_irq_data(unsigned int irq)
158{
159 struct irq_desc *desc = irq_to_desc(irq);
160
161 return desc ? &desc->irq_data : NULL;
162}
163EXPORT_SYMBOL_GPL(irq_get_irq_data);
164
259/** 165/**
260 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq 166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
261 * 167 *
@@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread);
287/* 193/*
288 * default enable function 194 * default enable function
289 */ 195 */
290static void default_enable(unsigned int irq) 196static void default_enable(struct irq_data *data)
291{ 197{
292 struct irq_desc *desc = irq_to_desc(irq); 198 struct irq_desc *desc = irq_data_to_desc(data);
293 199
294 desc->chip->unmask(irq); 200 desc->irq_data.chip->irq_unmask(&desc->irq_data);
295 desc->status &= ~IRQ_MASKED; 201 desc->status &= ~IRQ_MASKED;
296} 202}
297 203
298/* 204/*
299 * default disable function 205 * default disable function
300 */ 206 */
301static void default_disable(unsigned int irq) 207static void default_disable(struct irq_data *data)
302{ 208{
303} 209}
304 210
305/* 211/*
306 * default startup function 212 * default startup function
307 */ 213 */
308static unsigned int default_startup(unsigned int irq) 214static unsigned int default_startup(struct irq_data *data)
309{ 215{
310 struct irq_desc *desc = irq_to_desc(irq); 216 struct irq_desc *desc = irq_data_to_desc(data);
311 217
312 desc->chip->enable(irq); 218 desc->irq_data.chip->irq_enable(data);
313 return 0; 219 return 0;
314} 220}
315 221
316/* 222/*
317 * default shutdown function 223 * default shutdown function
318 */ 224 */
319static void default_shutdown(unsigned int irq) 225static void default_shutdown(struct irq_data *data)
320{ 226{
321 struct irq_desc *desc = irq_to_desc(irq); 227 struct irq_desc *desc = irq_data_to_desc(data);
322 228
323 desc->chip->mask(irq); 229 desc->irq_data.chip->irq_mask(&desc->irq_data);
324 desc->status |= IRQ_MASKED; 230 desc->status |= IRQ_MASKED;
325} 231}
326 232
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
234/* Temporary migration helpers */
235static void compat_irq_mask(struct irq_data *data)
236{
237 data->chip->mask(data->irq);
238}
239
240static void compat_irq_unmask(struct irq_data *data)
241{
242 data->chip->unmask(data->irq);
243}
244
245static void compat_irq_ack(struct irq_data *data)
246{
247 data->chip->ack(data->irq);
248}
249
250static void compat_irq_mask_ack(struct irq_data *data)
251{
252 data->chip->mask_ack(data->irq);
253}
254
255static void compat_irq_eoi(struct irq_data *data)
256{
257 data->chip->eoi(data->irq);
258}
259
260static void compat_irq_enable(struct irq_data *data)
261{
262 data->chip->enable(data->irq);
263}
264
265static void compat_irq_disable(struct irq_data *data)
266{
267 data->chip->disable(data->irq);
268}
269
270static void compat_irq_shutdown(struct irq_data *data)
271{
272 data->chip->shutdown(data->irq);
273}
274
275static unsigned int compat_irq_startup(struct irq_data *data)
276{
277 return data->chip->startup(data->irq);
278}
279
280static int compat_irq_set_affinity(struct irq_data *data,
281 const struct cpumask *dest, bool force)
282{
283 return data->chip->set_affinity(data->irq, dest);
284}
285
286static int compat_irq_set_type(struct irq_data *data, unsigned int type)
287{
288 return data->chip->set_type(data->irq, type);
289}
290
291static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
292{
293 return data->chip->set_wake(data->irq, on);
294}
295
296static int compat_irq_retrigger(struct irq_data *data)
297{
298 return data->chip->retrigger(data->irq);
299}
300
301static void compat_bus_lock(struct irq_data *data)
302{
303 data->chip->bus_lock(data->irq);
304}
305
306static void compat_bus_sync_unlock(struct irq_data *data)
307{
308 data->chip->bus_sync_unlock(data->irq);
309}
310#endif
311
327/* 312/*
328 * Fixup enable/disable function pointers 313 * Fixup enable/disable function pointers
329 */ 314 */
330void irq_chip_set_defaults(struct irq_chip *chip) 315void irq_chip_set_defaults(struct irq_chip *chip)
331{ 316{
332 if (!chip->enable) 317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
333 chip->enable = default_enable;
334 if (!chip->disable)
335 chip->disable = default_disable;
336 if (!chip->startup)
337 chip->startup = default_startup;
338 /* 318 /*
339 * We use chip->disable, when the user provided its own. When 319 * Compat fixup functions need to be before we set the
340 * we have default_disable set for chip->disable, then we need 320 * defaults for enable/disable/startup/shutdown
321 */
322 if (chip->enable)
323 chip->irq_enable = compat_irq_enable;
324 if (chip->disable)
325 chip->irq_disable = compat_irq_disable;
326 if (chip->shutdown)
327 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup)
329 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
341 * to use default_shutdown, otherwise the irq line is not 343 * to use default_shutdown, otherwise the irq line is not
342 * disabled on free_irq(): 344 * disabled on free_irq():
343 */ 345 */
344 if (!chip->shutdown) 346 if (!chip->irq_shutdown)
345 chip->shutdown = chip->disable != default_disable ? 347 chip->irq_shutdown = chip->irq_disable != default_disable ?
346 chip->disable : default_shutdown; 348 chip->irq_disable : default_shutdown;
347 if (!chip->name) 349
348 chip->name = chip->typename; 350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
349 if (!chip->end) 351 if (!chip->end)
350 chip->end = dummy_irq_chip.end; 352 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock)
360 chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
361 if (chip->mask)
362 chip->irq_mask = compat_irq_mask;
363 if (chip->unmask)
364 chip->irq_unmask = compat_irq_unmask;
365 if (chip->ack)
366 chip->irq_ack = compat_irq_ack;
367 if (chip->mask_ack)
368 chip->irq_mask_ack = compat_irq_mask_ack;
369 if (chip->eoi)
370 chip->irq_eoi = compat_irq_eoi;
371 if (chip->set_affinity)
372 chip->irq_set_affinity = compat_irq_set_affinity;
373 if (chip->set_type)
374 chip->irq_set_type = compat_irq_set_type;
375 if (chip->set_wake)
376 chip->irq_set_wake = compat_irq_set_wake;
377 if (chip->retrigger)
378 chip->irq_retrigger = compat_irq_retrigger;
379#endif
351} 380}
352 381
353static inline void mask_ack_irq(struct irq_desc *desc, int irq) 382static inline void mask_ack_irq(struct irq_desc *desc)
354{ 383{
355 if (desc->chip->mask_ack) 384 if (desc->irq_data.chip->irq_mask_ack)
356 desc->chip->mask_ack(irq); 385 desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
357 else { 386 else {
358 desc->chip->mask(irq); 387 desc->irq_data.chip->irq_mask(&desc->irq_data);
359 if (desc->chip->ack) 388 if (desc->irq_data.chip->irq_ack)
360 desc->chip->ack(irq); 389 desc->irq_data.chip->irq_ack(&desc->irq_data);
361 } 390 }
362 desc->status |= IRQ_MASKED; 391 desc->status |= IRQ_MASKED;
363} 392}
364 393
365static inline void mask_irq(struct irq_desc *desc, int irq) 394static inline void mask_irq(struct irq_desc *desc)
366{ 395{
367 if (desc->chip->mask) { 396 if (desc->irq_data.chip->irq_mask) {
368 desc->chip->mask(irq); 397 desc->irq_data.chip->irq_mask(&desc->irq_data);
369 desc->status |= IRQ_MASKED; 398 desc->status |= IRQ_MASKED;
370 } 399 }
371} 400}
372 401
373static inline void unmask_irq(struct irq_desc *desc, int irq) 402static inline void unmask_irq(struct irq_desc *desc)
374{ 403{
375 if (desc->chip->unmask) { 404 if (desc->irq_data.chip->irq_unmask) {
376 desc->chip->unmask(irq); 405 desc->irq_data.chip->irq_unmask(&desc->irq_data);
377 desc->status &= ~IRQ_MASKED; 406 desc->status &= ~IRQ_MASKED;
378 } 407 }
379} 408}
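The compat_irq_* trampolines above only bridge chips that still provide the old unsigned-int callbacks; converted chips implement the irq_data based methods directly and pull their private state out of irq_data. A hedged sketch of such a converted chip (register layout, names and the BIT() mapping are invented, not taken from the patch):

#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/irq.h>

struct my_pic {				/* hypothetical controller */
	void __iomem *mask_set;		/* write 1 to mask a source */
	void __iomem *mask_clr;		/* write 1 to unmask a source */
	unsigned int irq_base;
};

static void my_pic_irq_mask(struct irq_data *d)
{
	struct my_pic *pic = d->chip_data;	/* stored via set_irq_chip_data() */

	writel(BIT(d->irq - pic->irq_base), pic->mask_set);
}

static void my_pic_irq_unmask(struct irq_data *d)
{
	struct my_pic *pic = d->chip_data;

	writel(BIT(d->irq - pic->irq_base), pic->mask_clr);
}

static struct irq_chip my_pic_chip = {
	.name		= "my-pic",
	.irq_mask	= my_pic_irq_mask,
	.irq_unmask	= my_pic_irq_unmask,
};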
@@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
476 irqreturn_t action_ret; 505 irqreturn_t action_ret;
477 506
478 raw_spin_lock(&desc->lock); 507 raw_spin_lock(&desc->lock);
479 mask_ack_irq(desc, irq); 508 mask_ack_irq(desc);
480 509
481 if (unlikely(desc->status & IRQ_INPROGRESS)) 510 if (unlikely(desc->status & IRQ_INPROGRESS))
482 goto out_unlock; 511 goto out_unlock;
@@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
502 desc->status &= ~IRQ_INPROGRESS; 531 desc->status &= ~IRQ_INPROGRESS;
503 532
504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) 533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
505 unmask_irq(desc, irq); 534 unmask_irq(desc);
506out_unlock: 535out_unlock:
507 raw_spin_unlock(&desc->lock); 536 raw_spin_unlock(&desc->lock);
508} 537}
@@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
539 action = desc->action; 568 action = desc->action;
540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
541 desc->status |= IRQ_PENDING; 570 desc->status |= IRQ_PENDING;
542 mask_irq(desc, irq); 571 mask_irq(desc);
543 goto out; 572 goto out;
544 } 573 }
545 574
@@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
554 raw_spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
555 desc->status &= ~IRQ_INPROGRESS; 584 desc->status &= ~IRQ_INPROGRESS;
556out: 585out:
557 desc->chip->eoi(irq); 586 desc->irq_data.chip->irq_eoi(&desc->irq_data);
558 587
559 raw_spin_unlock(&desc->lock); 588 raw_spin_unlock(&desc->lock);
560} 589}
@@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
590 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
591 !desc->action)) { 620 !desc->action)) {
592 desc->status |= (IRQ_PENDING | IRQ_MASKED); 621 desc->status |= (IRQ_PENDING | IRQ_MASKED);
593 mask_ack_irq(desc, irq); 622 mask_ack_irq(desc);
594 goto out_unlock; 623 goto out_unlock;
595 } 624 }
596 kstat_incr_irqs_this_cpu(irq, desc); 625 kstat_incr_irqs_this_cpu(irq, desc);
597 626
598 /* Start handling the irq */ 627 /* Start handling the irq */
599 if (desc->chip->ack) 628 desc->irq_data.chip->irq_ack(&desc->irq_data);
600 desc->chip->ack(irq);
601 629
602 /* Mark the IRQ currently in progress.*/ 630 /* Mark the IRQ currently in progress.*/
603 desc->status |= IRQ_INPROGRESS; 631 desc->status |= IRQ_INPROGRESS;
@@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
607 irqreturn_t action_ret; 635 irqreturn_t action_ret;
608 636
609 if (unlikely(!action)) { 637 if (unlikely(!action)) {
610 mask_irq(desc, irq); 638 mask_irq(desc);
611 goto out_unlock; 639 goto out_unlock;
612 } 640 }
613 641
@@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
619 if (unlikely((desc->status & 647 if (unlikely((desc->status &
620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
621 (IRQ_PENDING | IRQ_MASKED))) { 649 (IRQ_PENDING | IRQ_MASKED))) {
622 unmask_irq(desc, irq); 650 unmask_irq(desc);
623 } 651 }
624 652
625 desc->status &= ~IRQ_PENDING; 653 desc->status &= ~IRQ_PENDING;
@@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
650 678
651 kstat_incr_irqs_this_cpu(irq, desc); 679 kstat_incr_irqs_this_cpu(irq, desc);
652 680
653 if (desc->chip->ack) 681 if (desc->irq_data.chip->irq_ack)
654 desc->chip->ack(irq); 682 desc->irq_data.chip->irq_ack(&desc->irq_data);
655 683
656 action_ret = handle_IRQ_event(irq, desc->action); 684 action_ret = handle_IRQ_event(irq, desc->action);
657 if (!noirqdebug) 685 if (!noirqdebug)
658 note_interrupt(irq, desc, action_ret); 686 note_interrupt(irq, desc, action_ret);
659 687
660 if (desc->chip->eoi) 688 if (desc->irq_data.chip->irq_eoi)
661 desc->chip->eoi(irq); 689 desc->irq_data.chip->irq_eoi(&desc->irq_data);
662} 690}
663 691
664void 692void
@@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
676 704
677 if (!handle) 705 if (!handle)
678 handle = handle_bad_irq; 706 handle = handle_bad_irq;
679 else if (desc->chip == &no_irq_chip) { 707 else if (desc->irq_data.chip == &no_irq_chip) {
680 printk(KERN_WARNING "Trying to install %sinterrupt handler " 708 printk(KERN_WARNING "Trying to install %sinterrupt handler "
681 "for IRQ%d\n", is_chained ? "chained " : "", irq); 709 "for IRQ%d\n", is_chained ? "chained " : "", irq);
682 /* 710 /*
@@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
686 * prevent us to setup the interrupt at all. Switch it to 714 * prevent us to setup the interrupt at all. Switch it to
687 * dummy_irq_chip for easy transition. 715 * dummy_irq_chip for easy transition.
688 */ 716 */
689 desc->chip = &dummy_irq_chip; 717 desc->irq_data.chip = &dummy_irq_chip;
690 } 718 }
691 719
692 chip_bus_lock(irq, desc); 720 chip_bus_lock(desc);
693 raw_spin_lock_irqsave(&desc->lock, flags); 721 raw_spin_lock_irqsave(&desc->lock, flags);
694 722
695 /* Uninstall? */ 723 /* Uninstall? */
696 if (handle == handle_bad_irq) { 724 if (handle == handle_bad_irq) {
697 if (desc->chip != &no_irq_chip) 725 if (desc->irq_data.chip != &no_irq_chip)
698 mask_ack_irq(desc, irq); 726 mask_ack_irq(desc);
699 desc->status |= IRQ_DISABLED; 727 desc->status |= IRQ_DISABLED;
700 desc->depth = 1; 728 desc->depth = 1;
701 } 729 }
@@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
706 desc->status &= ~IRQ_DISABLED; 734 desc->status &= ~IRQ_DISABLED;
707 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
708 desc->depth = 0; 736 desc->depth = 0;
709 desc->chip->startup(irq); 737 desc->irq_data.chip->irq_startup(&desc->irq_data);
710 } 738 }
711 raw_spin_unlock_irqrestore(&desc->lock, flags); 739 raw_spin_unlock_irqrestore(&desc->lock, flags);
712 chip_bus_sync_unlock(irq, desc); 740 chip_bus_sync_unlock(desc);
713} 741}
714EXPORT_SYMBOL_GPL(__set_irq_handler); 742EXPORT_SYMBOL_GPL(__set_irq_handler);
715 743
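For completeness, a hedged sketch of how a board file would wire such a converted chip to a line through the interfaces touched above; my_pic_chip is the hypothetical chip from the previous sketch:

#include <linux/irq.h>

extern struct irq_chip my_pic_chip;	/* hypothetical, see earlier sketch */

static void my_pic_map_line(unsigned int irq)
{
	set_irq_chip_and_handler_name(irq, &my_pic_chip, handle_level_irq,
				      "level");
}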
@@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
729 __set_irq_handler(irq, handle, 0, name); 757 __set_irq_handler(irq, handle, 0, name);
730} 758}
731 759
732void set_irq_noprobe(unsigned int irq) 760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
733{ 761{
734 struct irq_desc *desc = irq_to_desc(irq); 762 struct irq_desc *desc = irq_to_desc(irq);
735 unsigned long flags; 763 unsigned long flags;
736 764
737 if (!desc) { 765 if (!desc)
738 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
739 return; 766 return;
740 }
741
742 raw_spin_lock_irqsave(&desc->lock, flags);
743 desc->status |= IRQ_NOPROBE;
744 raw_spin_unlock_irqrestore(&desc->lock, flags);
745}
746
747void set_irq_probe(unsigned int irq)
748{
749 struct irq_desc *desc = irq_to_desc(irq);
750 unsigned long flags;
751 767
752 if (!desc) { 768 /* Sanitize flags */
753 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); 769 set &= IRQF_MODIFY_MASK;
754 return; 770 clr &= IRQF_MODIFY_MASK;
755 }
756 771
757 raw_spin_lock_irqsave(&desc->lock, flags); 772 raw_spin_lock_irqsave(&desc->lock, flags);
758 desc->status &= ~IRQ_NOPROBE; 773 desc->status &= ~clr;
774 desc->status |= set;
759 raw_spin_unlock_irqrestore(&desc->lock, flags); 775 raw_spin_unlock_irqrestore(&desc->lock, flags);
760} 776}
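With irq_modify_status() replacing the open-coded set_irq_noprobe()/set_irq_probe() above, the old entry points can survive as thin wrappers. Roughly (the real definitions live in include/linux/irq.h, outside this diff, and this assumes IRQ_NOPROBE is covered by IRQF_MODIFY_MASK):

static inline void set_irq_noprobe(unsigned int irq)
{
	irq_modify_status(irq, 0, IRQ_NOPROBE);
}

static inline void set_irq_probe(unsigned int irq)
{
	irq_modify_status(irq, IRQ_NOPROBE, 0);
}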
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
new file mode 100644
index 000000000000..20dc5474947e
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the dummy interrupt chip implementation
6 */
7#include <linux/interrupt.h>
8#include <linux/irq.h>
9
10#include "internals.h"
11
12/*
13 * What should we do if we get a hw irq event on an illegal vector?
14 * Each architecture has to answer this themself.
15 */
16static void ack_bad(struct irq_data *data)
17{
18 struct irq_desc *desc = irq_data_to_desc(data);
19
20 print_irq_desc(data->irq, desc);
21 ack_bad_irq(data->irq);
22}
23
24/*
25 * NOP functions
26 */
27static void noop(struct irq_data *data) { }
28
29static unsigned int noop_ret(struct irq_data *data)
30{
31 return 0;
32}
33
34#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
35static void compat_noop(unsigned int irq) { }
36#define END_INIT .end = compat_noop
37#else
38#define END_INIT
39#endif
40
41/*
42 * Generic no controller implementation
43 */
44struct irq_chip no_irq_chip = {
45 .name = "none",
46 .irq_startup = noop_ret,
47 .irq_shutdown = noop,
48 .irq_enable = noop,
49 .irq_disable = noop,
50 .irq_ack = ack_bad,
51 END_INIT
52};
53
54/*
55 * Generic dummy implementation which can be used for
56 * real dumb interrupt sources
57 */
58struct irq_chip dummy_irq_chip = {
59 .name = "dummy",
60 .irq_startup = noop_ret,
61 .irq_shutdown = noop,
62 .irq_enable = noop,
63 .irq_disable = noop,
64 .irq_ack = noop,
65 .irq_mask = noop,
66 .irq_unmask = noop,
67 END_INIT
68};
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27e5c6911223..e2347eb63306 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,24 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/random.h> 14#include <linux/random.h>
15#include <linux/sched.h>
18#include <linux/interrupt.h> 16#include <linux/interrupt.h>
19#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 18
21#include <linux/hash.h>
22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 19#include <trace/events/irq.h>
24 20
25#include "internals.h" 21#include "internals.h"
26 22
27/*
28 * lockdep: we want to handle all irq_desc locks as a single lock-class:
29 */
30struct lock_class_key irq_desc_lock_class;
31
32/** 23/**
33 * handle_bad_irq - handle spurious and unhandled irqs 24 * handle_bad_irq - handle spurious and unhandled irqs
34 * @irq: the interrupt number 25 * @irq: the interrupt number
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
43 ack_bad_irq(irq); 34 ack_bad_irq(irq);
44} 35}
45 36
46#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
47static void __init init_irq_default_affinity(void)
48{
49 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
50 cpumask_setall(irq_default_affinity);
51}
52#else
53static void __init init_irq_default_affinity(void)
54{
55}
56#endif
57
58/*
59 * Linux has a controller-independent interrupt architecture.
60 * Every controller has a 'controller-template', that is used
61 * by the main code to do the right thing. Each driver-visible
62 * interrupt source is transparently wired to the appropriate
63 * controller. Thus drivers need not be aware of the
64 * interrupt-controller.
65 *
66 * The code is designed to be easily extended with new/different
67 * interrupt controllers, without having to do assembly magic or
68 * having to touch the generic code.
69 *
70 * Controller mappings for all interrupt sources:
71 */
72int nr_irqs = NR_IRQS;
73EXPORT_SYMBOL_GPL(nr_irqs);
74
75#ifdef CONFIG_SPARSE_IRQ
76
77static struct irq_desc irq_desc_init = {
78 .irq = -1,
79 .status = IRQ_DISABLED,
80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq,
82 .depth = 1,
83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84};
85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{
88 void *ptr;
89
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92
93 /*
94 * don't overwite if can not get new one
95 * init_copy_kstat_irqs() could still use old one
96 */
97 if (ptr) {
98 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
99 desc->kstat_irqs = ptr;
100 }
101}
102
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106
107 raw_spin_lock_init(&desc->lock);
108 desc->irq = irq;
109#ifdef CONFIG_SMP
110 desc->node = node;
111#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1);
117 }
118 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1);
121 }
122 init_desc_masks(desc);
123 arch_init_chip_data(desc, node);
124}
125
126/*
127 * Protect the sparse_irqs:
128 */
129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
130
131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
151
152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
153 [0 ... NR_IRQS_LEGACY-1] = {
154 .irq = -1,
155 .status = IRQ_DISABLED,
156 .chip = &no_irq_chip,
157 .handle_irq = handle_bad_irq,
158 .depth = 1,
159 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
160 }
161};
162
163static unsigned int *kstat_irqs_legacy;
164
165int __init early_irq_init(void)
166{
167 struct irq_desc *desc;
168 int legacy_count;
169 int node;
170 int i;
171
172 init_irq_default_affinity();
173
174 /* initialize nr_irqs based on nr_cpu_ids */
175 arch_probe_nr_irqs();
176 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
177
178 desc = irq_desc_legacy;
179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
180 node = first_online_node;
181
182 /* allocate based on nr_cpu_ids */
183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
184 sizeof(int), GFP_NOWAIT, node);
185
186 for (i = 0; i < legacy_count; i++) {
187 desc[i].irq = i;
188#ifdef CONFIG_SMP
189 desc[i].node = node;
190#endif
191 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
193 alloc_desc_masks(&desc[i], node, true);
194 init_desc_masks(&desc[i]);
195 set_irq_desc(i, &desc[i]);
196 }
197
198 return arch_early_irq_init();
199}
200
201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
202{
203 struct irq_desc *desc;
204 unsigned long flags;
205
206 if (irq >= nr_irqs) {
207 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
208 irq, nr_irqs);
209 return NULL;
210 }
211
212 desc = irq_to_desc(irq);
213 if (desc)
214 return desc;
215
216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
217
218 /* We have to check it to avoid races with another CPU */
219 desc = irq_to_desc(irq);
220 if (desc)
221 goto out_unlock;
222
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224
225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
226 if (!desc) {
227 printk(KERN_ERR "can not alloc irq_desc\n");
228 BUG_ON(1);
229 }
230 init_one_irq_desc(irq, desc, node);
231
232 set_irq_desc(irq, desc);
233
234out_unlock:
235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
236
237 return desc;
238}
239
240#else /* !CONFIG_SPARSE_IRQ */
241
242struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
243 [0 ... NR_IRQS-1] = {
244 .status = IRQ_DISABLED,
245 .chip = &no_irq_chip,
246 .handle_irq = handle_bad_irq,
247 .depth = 1,
248 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
249 }
250};
251
252static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
253int __init early_irq_init(void)
254{
255 struct irq_desc *desc;
256 int count;
257 int i;
258
259 init_irq_default_affinity();
260
261 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
262
263 desc = irq_desc;
264 count = ARRAY_SIZE(irq_desc);
265
266 for (i = 0; i < count; i++) {
267 desc[i].irq = i;
268 alloc_desc_masks(&desc[i], 0, true);
269 init_desc_masks(&desc[i]);
270 desc[i].kstat_irqs = kstat_irqs_all[i];
271 }
272 return arch_early_irq_init();
273}
274
275struct irq_desc *irq_to_desc(unsigned int irq)
276{
277 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
278}
279
280struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
281{
282 return irq_to_desc(irq);
283}
284#endif /* !CONFIG_SPARSE_IRQ */
285
286void clear_kstat_irqs(struct irq_desc *desc)
287{
288 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
289}
290
291/*
292 * What should we do if we get a hw irq event on an illegal vector?
293 * Each architecture has to answer this themself.
294 */
295static void ack_bad(unsigned int irq)
296{
297 struct irq_desc *desc = irq_to_desc(irq);
298
299 print_irq_desc(irq, desc);
300 ack_bad_irq(irq);
301}
302
303/*
304 * NOP functions
305 */
306static void noop(unsigned int irq)
307{
308}
309
310static unsigned int noop_ret(unsigned int irq)
311{
312 return 0;
313}
314
315/*
316 * Generic no controller implementation
317 */
318struct irq_chip no_irq_chip = {
319 .name = "none",
320 .startup = noop_ret,
321 .shutdown = noop,
322 .enable = noop,
323 .disable = noop,
324 .ack = ack_bad,
325 .end = noop,
326};
327
328/*
329 * Generic dummy implementation which can be used for
330 * real dumb interrupt sources
331 */
332struct irq_chip dummy_irq_chip = {
333 .name = "dummy",
334 .startup = noop_ret,
335 .shutdown = noop,
336 .enable = noop,
337 .disable = noop,
338 .ack = noop,
339 .mask = noop,
340 .unmask = noop,
341 .end = noop,
342};
343
344/* 37/*
345 * Special, empty irq handler: 38 * Special, empty irq handler:
346 */ 39 */
@@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq)
457 /* 150 /*
458 * No locking required for CPU-local interrupts: 151 * No locking required for CPU-local interrupts:
459 */ 152 */
460 if (desc->chip->ack) 153 if (desc->irq_data.chip->ack)
461 desc->chip->ack(irq); 154 desc->irq_data.chip->ack(irq);
462 if (likely(!(desc->status & IRQ_DISABLED))) { 155 if (likely(!(desc->status & IRQ_DISABLED))) {
463 action_ret = handle_IRQ_event(irq, desc->action); 156 action_ret = handle_IRQ_event(irq, desc->action);
464 if (!noirqdebug) 157 if (!noirqdebug)
465 note_interrupt(irq, desc, action_ret); 158 note_interrupt(irq, desc, action_ret);
466 } 159 }
467 desc->chip->end(irq); 160 desc->irq_data.chip->end(irq);
468 return 1; 161 return 1;
469 } 162 }
470 163
471 raw_spin_lock(&desc->lock); 164 raw_spin_lock(&desc->lock);
472 if (desc->chip->ack) 165 if (desc->irq_data.chip->ack)
473 desc->chip->ack(irq); 166 desc->irq_data.chip->ack(irq);
474 /* 167 /*
475 * REPLAY is when Linux resends an IRQ that was dropped earlier 168 * REPLAY is when Linux resends an IRQ that was dropped earlier
476 * WAITING is used by probe to mark irqs that are being tested 169 * WAITING is used by probe to mark irqs that are being tested
@@ -530,27 +223,9 @@ out:
530 * The ->end() handler has to deal with interrupts which got 223 * The ->end() handler has to deal with interrupts which got
531 * disabled while the handler was running. 224 * disabled while the handler was running.
532 */ 225 */
533 desc->chip->end(irq); 226 desc->irq_data.chip->end(irq);
534 raw_spin_unlock(&desc->lock); 227 raw_spin_unlock(&desc->lock);
535 228
536 return 1; 229 return 1;
537} 230}
538#endif 231#endif
539
540void early_init_irq_lock_class(void)
541{
542 struct irq_desc *desc;
543 int i;
544
545 for_each_irq_desc(i, desc) {
546 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
547 }
548}
549
550unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
551{
552 struct irq_desc *desc = irq_to_desc(irq);
553 return desc ? desc->kstat_irqs[cpu] : 0;
554}
555EXPORT_SYMBOL(kstat_irqs_cpu);
556
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c63f3bc88f0b..4571ae7e085a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,9 +1,12 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 */ 3 */
4#include <linux/irqdesc.h>
4 5
5extern int noirqdebug; 6extern int noirqdebug;
6 7
8#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
9
7/* Set default functions for irq_chip structures: */ 10/* Set default functions for irq_chip structures: */
8extern void irq_chip_set_defaults(struct irq_chip *chip); 11extern void irq_chip_set_defaults(struct irq_chip *chip);
9 12
@@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
15extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 18extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 19extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 20
18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 21extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23/* Resending of interrupts :*/
24void replace_irq_desc(unsigned int irq, struct irq_desc *desc); 24void check_irq_resend(struct irq_desc *desc, unsigned int irq);
25#endif
26 25
27#ifdef CONFIG_PROC_FS 26#ifdef CONFIG_PROC_FS
28extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 27extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
28extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
29extern void register_handler_proc(unsigned int irq, struct irqaction *action); 29extern void register_handler_proc(unsigned int irq, struct irqaction *action);
30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); 30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
31#else 31#else
32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } 32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void register_handler_proc(unsigned int irq, 34static inline void register_handler_proc(unsigned int irq,
34 struct irqaction *action) { } 35 struct irqaction *action) { }
35static inline void unregister_handler_proc(unsigned int irq, 36static inline void unregister_handler_proc(unsigned int irq,
@@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq);
40 41
41extern void irq_set_thread_affinity(struct irq_desc *desc); 42extern void irq_set_thread_affinity(struct irq_desc *desc);
42 43
44#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
45static inline void irq_end(unsigned int irq, struct irq_desc *desc)
46{
47 if (desc->irq_data.chip && desc->irq_data.chip->end)
48 desc->irq_data.chip->end(irq);
49}
50#else
51static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
52#endif
53
43/* Inline functions for support of irq chips on slow busses */ 54/* Inline functions for support of irq chips on slow busses */
44static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) 55static inline void chip_bus_lock(struct irq_desc *desc)
45{ 56{
46 if (unlikely(desc->chip->bus_lock)) 57 if (unlikely(desc->irq_data.chip->irq_bus_lock))
47 desc->chip->bus_lock(irq); 58 desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
48} 59}
49 60
50static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) 61static inline void chip_bus_sync_unlock(struct irq_desc *desc)
51{ 62{
52 if (unlikely(desc->chip->bus_sync_unlock)) 63 if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
53 desc->chip->bus_sync_unlock(irq); 64 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
54} 65}
55 66
56/* 67/*
@@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
67 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); 78 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
68 printk("->handle_irq(): %p, ", desc->handle_irq); 79 printk("->handle_irq(): %p, ", desc->handle_irq);
69 print_symbol("%s\n", (unsigned long)desc->handle_irq); 80 print_symbol("%s\n", (unsigned long)desc->handle_irq);
70 printk("->chip(): %p, ", desc->chip); 81 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
71 print_symbol("%s\n", (unsigned long)desc->chip); 82 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
72 printk("->action(): %p\n", desc->action); 83 printk("->action(): %p\n", desc->action);
73 if (desc->action) { 84 if (desc->action) {
74 printk("->action->handler(): %p, ", desc->action->handler); 85 printk("->action->handler(): %p, ", desc->action->handler);
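chip_bus_lock()/chip_bus_sync_unlock() now hand struct irq_data to the bus hooks as well. For context, a hedged sketch of the kind of slow-bus controller these hooks exist for: an imaginary I2C GPIO expander whose mask register can only be written from sleepable context, so irq_mask only updates a cache and the write is deferred to irq_bus_sync_unlock (device, register address and field names are made up):

#include <linux/bitops.h>
#include <linux/i2c.h>
#include <linux/irq.h>
#include <linux/mutex.h>

struct my_expander {			/* hypothetical I2C expander */
	struct i2c_client *client;
	struct mutex lock;
	u8 mask_cache;			/* shadow of the chip's mask register */
	unsigned int irq_base;
};

static void my_exp_irq_bus_lock(struct irq_data *d)
{
	struct my_expander *exp = d->chip_data;

	mutex_lock(&exp->lock);
}

static void my_exp_irq_bus_sync_unlock(struct irq_data *d)
{
	struct my_expander *exp = d->chip_data;

	/* Sleeping is fine here: we are not under desc->lock */
	i2c_smbus_write_byte_data(exp->client, 0x01 /* made-up reg */,
				  exp->mask_cache);
	mutex_unlock(&exp->lock);
}

static void my_exp_irq_mask(struct irq_data *d)
{
	struct my_expander *exp = d->chip_data;

	/* only update the cache; the write happens in bus_sync_unlock */
	exp->mask_cache |= BIT(d->irq - exp->irq_base);
}

static void my_exp_irq_unmask(struct irq_data *d)
{
	struct my_expander *exp = d->chip_data;

	exp->mask_cache &= ~BIT(d->irq - exp->irq_base);
}

static struct irq_chip my_exp_chip = {
	.name			= "my-expander",
	.irq_mask		= my_exp_irq_mask,
	.irq_unmask		= my_exp_irq_unmask,
	.irq_bus_lock		= my_exp_irq_bus_lock,
	.irq_bus_sync_unlock	= my_exp_irq_bus_sync_unlock,
};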
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
new file mode 100644
index 000000000000..9988d03797f5
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,410 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the interrupt descriptor management code
6 *
7 * Detailed information is available in Documentation/DocBook/genericirq
8 *
9 */
10#include <linux/irq.h>
11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h>
16#include <linux/bitmap.h>
17
18#include "internals.h"
19
20/*
21 * lockdep: we want to handle all irq_desc locks as a single lock-class:
22 */
23static struct lock_class_key irq_desc_lock_class;
24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
26static void __init init_irq_default_affinity(void)
27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
29 cpumask_setall(irq_default_affinity);
30}
31#else
32static void __init init_irq_default_affinity(void)
33{
34}
35#endif
36
37#ifdef CONFIG_SMP
38static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
39{
40 if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
41 return -ENOMEM;
42
43#ifdef CONFIG_GENERIC_PENDING_IRQ
44 if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
45 free_cpumask_var(desc->irq_data.affinity);
46 return -ENOMEM;
47 }
48#endif
49 return 0;
50}
51
52static void desc_smp_init(struct irq_desc *desc, int node)
53{
54 desc->irq_data.node = node;
55 cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
56#ifdef CONFIG_GENERIC_PENDING_IRQ
57 cpumask_clear(desc->pending_mask);
58#endif
59}
60
61static inline int desc_node(struct irq_desc *desc)
62{
63 return desc->irq_data.node;
64}
65
66#else
67static inline int
68alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
69static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif
72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{
75 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL;
78 desc->irq_data.handler_data = NULL;
79 desc->irq_data.msi_desc = NULL;
80 desc->status = IRQ_DEFAULT_INIT_FLAGS;
81 desc->handle_irq = handle_bad_irq;
82 desc->depth = 1;
83 desc->irq_count = 0;
84 desc->irqs_unhandled = 0;
85 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
87 desc_smp_init(desc, node);
88}
89
90int nr_irqs = NR_IRQS;
91EXPORT_SYMBOL_GPL(nr_irqs);
92
93static DEFINE_MUTEX(sparse_irq_lock);
94static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
95
96#ifdef CONFIG_SPARSE_IRQ
97
98static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
99
100static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
101{
102 radix_tree_insert(&irq_desc_tree, irq, desc);
103}
104
105struct irq_desc *irq_to_desc(unsigned int irq)
106{
107 return radix_tree_lookup(&irq_desc_tree, irq);
108}
109
110static void delete_irq_desc(unsigned int irq)
111{
112 radix_tree_delete(&irq_desc_tree, irq);
113}
114
115#ifdef CONFIG_SMP
116static void free_masks(struct irq_desc *desc)
117{
118#ifdef CONFIG_GENERIC_PENDING_IRQ
119 free_cpumask_var(desc->pending_mask);
120#endif
121 free_cpumask_var(desc->irq_data.affinity);
122}
123#else
124static inline void free_masks(struct irq_desc *desc) { }
125#endif
126
127static struct irq_desc *alloc_desc(int irq, int node)
128{
129 struct irq_desc *desc;
130 gfp_t gfp = GFP_KERNEL;
131
132 desc = kzalloc_node(sizeof(*desc), gfp, node);
133 if (!desc)
134 return NULL;
135 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
137 gfp, node);
138 if (!desc->kstat_irqs)
139 goto err_desc;
140
141 if (alloc_masks(desc, gfp, node))
142 goto err_kstat;
143
144 raw_spin_lock_init(&desc->lock);
145 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
146
147 desc_set_defaults(irq, desc, node);
148
149 return desc;
150
151err_kstat:
152 kfree(desc->kstat_irqs);
153err_desc:
154 kfree(desc);
155 return NULL;
156}
157
158static void free_desc(unsigned int irq)
159{
160 struct irq_desc *desc = irq_to_desc(irq);
161
162 unregister_irq_proc(irq, desc);
163
164 mutex_lock(&sparse_irq_lock);
165 delete_irq_desc(irq);
166 mutex_unlock(&sparse_irq_lock);
167
168 free_masks(desc);
169 kfree(desc->kstat_irqs);
170 kfree(desc);
171}
172
173static int alloc_descs(unsigned int start, unsigned int cnt, int node)
174{
175 struct irq_desc *desc;
176 int i;
177
178 for (i = 0; i < cnt; i++) {
179 desc = alloc_desc(start + i, node);
180 if (!desc)
181 goto err;
182 mutex_lock(&sparse_irq_lock);
183 irq_insert_desc(start + i, desc);
184 mutex_unlock(&sparse_irq_lock);
185 }
186 return start;
187
188err:
189 for (i--; i >= 0; i--)
190 free_desc(start + i);
191
192 mutex_lock(&sparse_irq_lock);
193 bitmap_clear(allocated_irqs, start, cnt);
194 mutex_unlock(&sparse_irq_lock);
195 return -ENOMEM;
196}
197
198struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
199{
200 int res = irq_alloc_descs(irq, irq, 1, node);
201
202 if (res == -EEXIST || res == irq)
203 return irq_to_desc(irq);
204 return NULL;
205}
206
207int __init early_irq_init(void)
208{
209 int i, initcnt, node = first_online_node;
210 struct irq_desc *desc;
211
212 init_irq_default_affinity();
213
214 /* Let arch update nr_irqs and return the nr of preallocated irqs */
215 initcnt = arch_probe_nr_irqs();
216 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
217
218 for (i = 0; i < initcnt; i++) {
219 desc = alloc_desc(i, node);
220 set_bit(i, allocated_irqs);
221 irq_insert_desc(i, desc);
222 }
223 return arch_early_irq_init();
224}
225
226#else /* !CONFIG_SPARSE_IRQ */
227
228struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
229 [0 ... NR_IRQS-1] = {
230 .status = IRQ_DEFAULT_INIT_FLAGS,
231 .handle_irq = handle_bad_irq,
232 .depth = 1,
233 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
234 }
235};
236
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void)
239{
240 int count, i, node = first_online_node;
241 struct irq_desc *desc;
242
243 init_irq_default_affinity();
244
245 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
246
247 desc = irq_desc;
248 count = ARRAY_SIZE(irq_desc);
249
250 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i];
254 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
257 }
258 return arch_early_irq_init();
259}
260
261struct irq_desc *irq_to_desc(unsigned int irq)
262{
263 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
264}
265
266struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
267{
268 return irq_to_desc(irq);
269}
270
271static void free_desc(unsigned int irq)
272{
273 dynamic_irq_cleanup(irq);
274}
275
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{
278 return start;
279}
280#endif /* !CONFIG_SPARSE_IRQ */
281
282/* Dynamic interrupt handling */
283
284/**
285 * irq_free_descs - free irq descriptors
286 * @from: Start of descriptor range
287 * @cnt: Number of consecutive irqs to free
288 */
289void irq_free_descs(unsigned int from, unsigned int cnt)
290{
291 int i;
292
293 if (from >= nr_irqs || (from + cnt) > nr_irqs)
294 return;
295
296 for (i = 0; i < cnt; i++)
297 free_desc(from + i);
298
299 mutex_lock(&sparse_irq_lock);
300 bitmap_clear(allocated_irqs, from, cnt);
301 mutex_unlock(&sparse_irq_lock);
302}
303
304/**
305 * irq_alloc_descs - allocate and initialize a range of irq descriptors
306 * @irq: Allocate for specific irq number if irq >= 0
307 * @from: Start the search from this irq number
308 * @cnt: Number of consecutive irqs to allocate.
309 * @node: Preferred node on which the irq descriptor should be allocated
310 *
311 * Returns the first irq number or error code
312 */
313int __ref
314irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
315{
316 int start, ret;
317
318 if (!cnt)
319 return -EINVAL;
320
321 mutex_lock(&sparse_irq_lock);
322
323 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
324 ret = -EEXIST;
325 if (irq >=0 && start != irq)
326 goto err;
327
328 ret = -ENOMEM;
329 if (start >= nr_irqs)
330 goto err;
331
332 bitmap_set(allocated_irqs, start, cnt);
333 mutex_unlock(&sparse_irq_lock);
334 return alloc_descs(start, cnt, node);
335
336err:
337 mutex_unlock(&sparse_irq_lock);
338 return ret;
339}
340
341/**
342 * irq_reserve_irqs - mark irqs allocated
343 * @from: mark from irq number
344 * @cnt: number of irqs to mark
345 *
346 * Returns 0 on success or an appropriate error code
347 */
348int irq_reserve_irqs(unsigned int from, unsigned int cnt)
349{
350 unsigned int start;
351 int ret = 0;
352
353 if (!cnt || (from + cnt) > nr_irqs)
354 return -EINVAL;
355
356 mutex_lock(&sparse_irq_lock);
357 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
358 if (start == from)
359 bitmap_set(allocated_irqs, start, cnt);
360 else
361 ret = -EEXIST;
362 mutex_unlock(&sparse_irq_lock);
363 return ret;
364}
365
366/**
367 * irq_get_next_irq - get next allocated irq number
368 * @offset: where to start the search
369 *
370 * Returns next irq number after offset or nr_irqs if none is found.
371 */
372unsigned int irq_get_next_irq(unsigned int offset)
373{
374 return find_next_bit(allocated_irqs, nr_irqs, offset);
375}
376
377/**
378 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
379 * @irq: irq number to initialize
380 */
381void dynamic_irq_cleanup(unsigned int irq)
382{
383 struct irq_desc *desc = irq_to_desc(irq);
384 unsigned long flags;
385
386 raw_spin_lock_irqsave(&desc->lock, flags);
387 desc_set_defaults(irq, desc, desc_node(desc));
388 raw_spin_unlock_irqrestore(&desc->lock, flags);
389}
390
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{
393 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0;
395}
396
397#ifdef CONFIG_GENERIC_HARDIRQS
398unsigned int kstat_irqs(unsigned int irq)
399{
400 struct irq_desc *desc = irq_to_desc(irq);
401 int cpu;
402 int sum = 0;
403
404 if (!desc)
405 return 0;
406 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu];
408 return sum;
409}
410#endif /* CONFIG_GENERIC_HARDIRQS */
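The allocator above replaces the ad-hoc descriptor bookkeeping that used to live in handle.c. An illustrative sketch of the intended calling convention (the board-level helpers are hypothetical): passing irq = -1 asks irq_alloc_descs() for any free range starting at 'from', and a negative return is an error code:

#include <linux/irq.h>

/* hypothetical: carve out a block of interrupt numbers for a device */
static int my_board_alloc_irqs(unsigned int cnt, int node)
{
	int base = irq_alloc_descs(-1, 0, cnt, node);	/* any free range */

	if (base < 0)
		return base;		/* -EEXIST or -ENOMEM */

	/* descriptors base .. base + cnt - 1 now exist and are reserved */
	return base;
}

static void my_board_free_irqs(unsigned int base, unsigned int cnt)
{
	irq_free_descs(base, cnt);
}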
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e1497481fe8a..644e8d5fa367 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 73{
74 struct irq_desc *desc = irq_to_desc(irq); 74 struct irq_desc *desc = irq_to_desc(irq);
75 75
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || 76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
77 !desc->chip->set_affinity) 77 !desc->irq_data.chip->irq_set_affinity)
78 return 0; 78 return 0;
79 79
80 return 1; 80 return 1;
@@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc)
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
110{ 110{
111 struct irq_desc *desc = irq_to_desc(irq); 111 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip;
112 unsigned long flags; 113 unsigned long flags;
113 114
114 if (!desc->chip->set_affinity) 115 if (!chip->irq_set_affinity)
115 return -EINVAL; 116 return -EINVAL;
116 117
117 raw_spin_lock_irqsave(&desc->lock, flags); 118 raw_spin_lock_irqsave(&desc->lock, flags);
118 119
119#ifdef CONFIG_GENERIC_PENDING_IRQ 120#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 121 if (desc->status & IRQ_MOVE_PCNTXT) {
121 if (!desc->chip->set_affinity(irq, cpumask)) { 122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
122 cpumask_copy(desc->affinity, cpumask); 123 cpumask_copy(desc->irq_data.affinity, cpumask);
123 irq_set_thread_affinity(desc); 124 irq_set_thread_affinity(desc);
124 } 125 }
125 } 126 }
@@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
128 cpumask_copy(desc->pending_mask, cpumask); 129 cpumask_copy(desc->pending_mask, cpumask);
129 } 130 }
130#else 131#else
131 if (!desc->chip->set_affinity(irq, cpumask)) { 132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
132 cpumask_copy(desc->affinity, cpumask); 133 cpumask_copy(desc->irq_data.affinity, cpumask);
133 irq_set_thread_affinity(desc); 134 irq_set_thread_affinity(desc);
134 } 135 }
135#endif 136#endif
@@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * one of the targets is online. 169 * one of the targets is online.
169 */ 170 */
170 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
171 if (cpumask_any_and(desc->affinity, cpu_online_mask) 172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
172 < nr_cpu_ids) 173 < nr_cpu_ids)
173 goto set_affinity; 174 goto set_affinity;
174 else 175 else
175 desc->status &= ~IRQ_AFFINITY_SET; 176 desc->status &= ~IRQ_AFFINITY_SET;
176 } 177 }
177 178
178 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); 179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
179set_affinity: 180set_affinity:
180 desc->chip->set_affinity(irq, desc->affinity); 181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
181 182
182 return 0; 183 return 0;
183} 184}
@@ -216,14 +217,14 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
216void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) 217void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
217{ 218{
218 if (suspend) { 219 if (suspend) {
219 if (!desc->action || (desc->action->flags & IRQF_TIMER)) 220 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
220 return; 221 return;
221 desc->status |= IRQ_SUSPENDED; 222 desc->status |= IRQ_SUSPENDED;
222 } 223 }
223 224
224 if (!desc->depth++) { 225 if (!desc->depth++) {
225 desc->status |= IRQ_DISABLED; 226 desc->status |= IRQ_DISABLED;
226 desc->chip->disable(irq); 227 desc->irq_data.chip->irq_disable(&desc->irq_data);
227 } 228 }
228} 229}
229 230
@@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq)
246 if (!desc) 247 if (!desc)
247 return; 248 return;
248 249
249 chip_bus_lock(irq, desc); 250 chip_bus_lock(desc);
250 raw_spin_lock_irqsave(&desc->lock, flags); 251 raw_spin_lock_irqsave(&desc->lock, flags);
251 __disable_irq(desc, irq, false); 252 __disable_irq(desc, irq, false);
252 raw_spin_unlock_irqrestore(&desc->lock, flags); 253 raw_spin_unlock_irqrestore(&desc->lock, flags);
253 chip_bus_sync_unlock(irq, desc); 254 chip_bus_sync_unlock(desc);
254} 255}
255EXPORT_SYMBOL(disable_irq_nosync); 256EXPORT_SYMBOL(disable_irq_nosync);
256 257
@@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
313 * IRQ line is re-enabled. 314 * IRQ line is re-enabled.
314 * 315 *
315 * This function may be called from IRQ context only when 316 * This function may be called from IRQ context only when
316 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! 317 * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
317 */ 318 */
318void enable_irq(unsigned int irq) 319void enable_irq(unsigned int irq)
319{ 320{
@@ -323,11 +324,11 @@ void enable_irq(unsigned int irq)
323 if (!desc) 324 if (!desc)
324 return; 325 return;
325 326
326 chip_bus_lock(irq, desc); 327 chip_bus_lock(desc);
327 raw_spin_lock_irqsave(&desc->lock, flags); 328 raw_spin_lock_irqsave(&desc->lock, flags);
328 __enable_irq(desc, irq, false); 329 __enable_irq(desc, irq, false);
329 raw_spin_unlock_irqrestore(&desc->lock, flags); 330 raw_spin_unlock_irqrestore(&desc->lock, flags);
330 chip_bus_sync_unlock(irq, desc); 331 chip_bus_sync_unlock(desc);
331} 332}
332EXPORT_SYMBOL(enable_irq); 333EXPORT_SYMBOL(enable_irq);
333 334
@@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
336 struct irq_desc *desc = irq_to_desc(irq); 337 struct irq_desc *desc = irq_to_desc(irq);
337 int ret = -ENXIO; 338 int ret = -ENXIO;
338 339
339 if (desc->chip->set_wake) 340 if (desc->irq_data.chip->irq_set_wake)
340 ret = desc->chip->set_wake(irq, on); 341 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
341 342
342 return ret; 343 return ret;
343} 344}
@@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
429} 430}
430 431
431int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 432int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
432 unsigned long flags) 433 unsigned long flags)
433{ 434{
434 int ret; 435 int ret;
435 struct irq_chip *chip = desc->chip; 436 struct irq_chip *chip = desc->irq_data.chip;
436 437
437 if (!chip || !chip->set_type) { 438 if (!chip || !chip->irq_set_type) {
438 /* 439 /*
439 * IRQF_TRIGGER_* but the PIC does not support multiple 440 * IRQF_TRIGGER_* but the PIC does not support multiple
440 * flow-types? 441 * flow-types?
@@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
445 } 446 }
446 447
447 /* caller masked out all except trigger mode flags */ 448 /* caller masked out all except trigger mode flags */
448 ret = chip->set_type(irq, flags); 449 ret = chip->irq_set_type(&desc->irq_data, flags);
449 450
450 if (ret) 451 if (ret)
451 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 452 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
452 (int)flags, irq, chip->set_type); 453 flags, irq, chip->irq_set_type);
453 else { 454 else {
454 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) 455 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
455 flags |= IRQ_LEVEL; 456 flags |= IRQ_LEVEL;
@@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 458 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
458 desc->status |= flags; 459 desc->status |= flags;
459 460
460 if (chip != desc->chip) 461 if (chip != desc->irq_data.chip)
461 irq_chip_set_defaults(desc->chip); 462 irq_chip_set_defaults(desc->irq_data.chip);
462 } 463 }
463 464
464 return ret; 465 return ret;
@@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
507static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 508static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
508{ 509{
509again: 510again:
510 chip_bus_lock(irq, desc); 511 chip_bus_lock(desc);
511 raw_spin_lock_irq(&desc->lock); 512 raw_spin_lock_irq(&desc->lock);
512 513
513 /* 514 /*
@@ -521,17 +522,17 @@ again:
521 */ 522 */
522 if (unlikely(desc->status & IRQ_INPROGRESS)) { 523 if (unlikely(desc->status & IRQ_INPROGRESS)) {
523 raw_spin_unlock_irq(&desc->lock); 524 raw_spin_unlock_irq(&desc->lock);
524 chip_bus_sync_unlock(irq, desc); 525 chip_bus_sync_unlock(desc);
525 cpu_relax(); 526 cpu_relax();
526 goto again; 527 goto again;
527 } 528 }
528 529
529 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 530 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
530 desc->status &= ~IRQ_MASKED; 531 desc->status &= ~IRQ_MASKED;
531 desc->chip->unmask(irq); 532 desc->irq_data.chip->irq_unmask(&desc->irq_data);
532 } 533 }
533 raw_spin_unlock_irq(&desc->lock); 534 raw_spin_unlock_irq(&desc->lock);
534 chip_bus_sync_unlock(irq, desc); 535 chip_bus_sync_unlock(desc);
535} 536}
536 537
537#ifdef CONFIG_SMP 538#ifdef CONFIG_SMP
@@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
556 } 557 }
557 558
558 raw_spin_lock_irq(&desc->lock); 559 raw_spin_lock_irq(&desc->lock);
559 cpumask_copy(mask, desc->affinity); 560 cpumask_copy(mask, desc->irq_data.affinity);
560 raw_spin_unlock_irq(&desc->lock); 561 raw_spin_unlock_irq(&desc->lock);
561 562
562 set_cpus_allowed_ptr(current, mask); 563 set_cpus_allowed_ptr(current, mask);
@@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657 if (!desc) 658 if (!desc)
658 return -EINVAL; 659 return -EINVAL;
659 660
660 if (desc->chip == &no_irq_chip) 661 if (desc->irq_data.chip == &no_irq_chip)
661 return -ENOSYS; 662 return -ENOSYS;
662 /* 663 /*
663 * Some drivers like serial.c use request_irq() heavily, 664 * Some drivers like serial.c use request_irq() heavily,
@@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
752 } 753 }
753 754
754 if (!shared) { 755 if (!shared) {
755 irq_chip_set_defaults(desc->chip); 756 irq_chip_set_defaults(desc->irq_data.chip);
756 757
757 init_waitqueue_head(&desc->wait_for_threads); 758 init_waitqueue_head(&desc->wait_for_threads);
758 759
@@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
779 if (!(desc->status & IRQ_NOAUTOEN)) { 780 if (!(desc->status & IRQ_NOAUTOEN)) {
780 desc->depth = 0; 781 desc->depth = 0;
781 desc->status &= ~IRQ_DISABLED; 782 desc->status &= ~IRQ_DISABLED;
782 desc->chip->startup(irq); 783 desc->irq_data.chip->irq_startup(&desc->irq_data);
783 } else 784 } else
784 /* Undo nested disables: */ 785 /* Undo nested disables: */
785 desc->depth = 1; 786 desc->depth = 1;
@@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
912 913
913 /* Currently used only by UML, might disappear one day: */ 914 /* Currently used only by UML, might disappear one day: */
914#ifdef CONFIG_IRQ_RELEASE_METHOD 915#ifdef CONFIG_IRQ_RELEASE_METHOD
915 if (desc->chip->release) 916 if (desc->irq_data.chip->release)
916 desc->chip->release(irq, dev_id); 917 desc->irq_data.chip->release(irq, dev_id);
917#endif 918#endif
918 919
919 /* If this was the last handler, shut down the IRQ line: */ 920 /* If this was the last handler, shut down the IRQ line: */
920 if (!desc->action) { 921 if (!desc->action) {
921 desc->status |= IRQ_DISABLED; 922 desc->status |= IRQ_DISABLED;
922 if (desc->chip->shutdown) 923 if (desc->irq_data.chip->irq_shutdown)
923 desc->chip->shutdown(irq); 924 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
924 else 925 else
925 desc->chip->disable(irq); 926 desc->irq_data.chip->irq_disable(&desc->irq_data);
926 } 927 }
927 928
928#ifdef CONFIG_SMP 929#ifdef CONFIG_SMP
@@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id)
997 if (!desc) 998 if (!desc)
998 return; 999 return;
999 1000
1000 chip_bus_lock(irq, desc); 1001 chip_bus_lock(desc);
1001 kfree(__free_irq(irq, dev_id)); 1002 kfree(__free_irq(irq, dev_id));
1002 chip_bus_sync_unlock(irq, desc); 1003 chip_bus_sync_unlock(desc);
1003} 1004}
1004EXPORT_SYMBOL(free_irq); 1005EXPORT_SYMBOL(free_irq);
1005 1006
@@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1086 action->name = devname; 1087 action->name = devname;
1087 action->dev_id = dev_id; 1088 action->dev_id = dev_id;
1088 1089
1089 chip_bus_lock(irq, desc); 1090 chip_bus_lock(desc);
1090 retval = __setup_irq(irq, desc, action); 1091 retval = __setup_irq(irq, desc, action);
1091 chip_bus_sync_unlock(irq, desc); 1092 chip_bus_sync_unlock(desc);
1092 1093
1093 if (retval) 1094 if (retval)
1094 kfree(action); 1095 kfree(action);
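The affinity path above also shows the new chip prototype: irq_set_affinity() now receives the irq_data plus a 'force' flag, and on a zero return the core copies the mask into irq_data.affinity itself. A hedged sketch of a driver-side callback written against that contract (the routing register layout is invented):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/io.h>
#include <linux/irq.h>

struct my_pic {				/* hypothetical controller */
	void __iomem *route_base;	/* one 32-bit routing register per line */
	unsigned int irq_base;
};

static int my_pic_irq_set_affinity(struct irq_data *d,
				   const struct cpumask *mask, bool force)
{
	struct my_pic *pic = d->chip_data;
	unsigned int cpu = cpumask_any_and(mask, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		return -EINVAL;

	writel(cpu, pic->route_base + 4 * (d->irq - pic->irq_base));
	return 0;	/* 0 lets the core update irq_data.affinity */
}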
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 241962280836..1d2541940480 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,6 +7,7 @@
7void move_masked_irq(int irq) 7void move_masked_irq(int irq)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_to_desc(irq);
10 struct irq_chip *chip = desc->irq_data.chip;
10 11
11 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
12 return; 13 return;
@@ -24,7 +25,7 @@ void move_masked_irq(int irq)
24 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
25 return; 26 return;
26 27
27 if (!desc->chip->set_affinity) 28 if (!chip->irq_set_affinity)
28 return; 29 return;
29 30
30 assert_raw_spin_locked(&desc->lock); 31 assert_raw_spin_locked(&desc->lock);
@@ -43,8 +44,9 @@ void move_masked_irq(int irq)
43 */ 44 */
44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
45 < nr_cpu_ids)) 46 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 47 if (!chip->irq_set_affinity(&desc->irq_data,
47 cpumask_copy(desc->affinity, desc->pending_mask); 48 desc->pending_mask, false)) {
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc); 50 irq_set_thread_affinity(desc);
49 } 51 }
50 52
@@ -61,8 +63,8 @@ void move_native_irq(int irq)
61 if (unlikely(desc->status & IRQ_DISABLED)) 63 if (unlikely(desc->status & IRQ_DISABLED))
62 return; 64 return;
63 65
64 desc->chip->mask(irq); 66 desc->irq_data.chip->irq_mask(&desc->irq_data);
65 move_masked_irq(irq); 67 move_masked_irq(irq);
66 desc->chip->unmask(irq); 68 desc->irq_data.chip->irq_unmask(&desc->irq_data);
67} 69}
68 70
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
deleted file mode 100644
index 65d3845665ac..000000000000
--- a/kernel/irq/numa_migrate.c
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/slab.h>
10#include <linux/module.h>
11#include <linux/random.h>
12#include <linux/interrupt.h>
13#include <linux/kernel_stat.h>
14
15#include "internals.h"
16
17static void init_copy_kstat_irqs(struct irq_desc *old_desc,
18 struct irq_desc *desc,
19 int node, int nr)
20{
21 init_kstat_irqs(desc, node, nr);
22
23 if (desc->kstat_irqs != old_desc->kstat_irqs)
24 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
25 nr * sizeof(*desc->kstat_irqs));
26}
27
28static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
29{
30 if (old_desc->kstat_irqs == desc->kstat_irqs)
31 return;
32
33 kfree(old_desc->kstat_irqs);
34 old_desc->kstat_irqs = NULL;
35}
36
37static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
38 struct irq_desc *desc, int node)
39{
40 memcpy(desc, old_desc, sizeof(struct irq_desc));
41 if (!alloc_desc_masks(desc, node, false)) {
42 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
43 "for migration.\n", irq);
44 return false;
45 }
46 raw_spin_lock_init(&desc->lock);
47 desc->node = node;
48 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
49 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
50 init_copy_desc_masks(old_desc, desc);
51 arch_init_copy_chip_data(old_desc, desc, node);
52 return true;
53}
54
55static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
56{
57 free_kstat_irqs(old_desc, desc);
58 free_desc_masks(old_desc, desc);
59 arch_free_chip_data(old_desc, desc);
60}
61
62static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
63 int node)
64{
65 struct irq_desc *desc;
66 unsigned int irq;
67 unsigned long flags;
68
69 irq = old_desc->irq;
70
71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
72
73 /* We have to check it to avoid races with another CPU */
74 desc = irq_to_desc(irq);
75
76 if (desc && old_desc != desc)
77 goto out_unlock;
78
79 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
80 if (!desc) {
81 printk(KERN_ERR "irq %d: can not get new irq_desc "
82 "for migration.\n", irq);
83 /* still use old one */
84 desc = old_desc;
85 goto out_unlock;
86 }
87 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
88 /* still use old one */
89 kfree(desc);
90 desc = old_desc;
91 goto out_unlock;
92 }
93
94 replace_irq_desc(irq, desc);
95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
96
97 /* free the old one */
98 free_one_irq_desc(old_desc, desc);
99 kfree(old_desc);
100
101 return desc;
102
103out_unlock:
104 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
105
106 return desc;
107}
108
109struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
110{
111 /* those static or target node is -1, do not move them */
112 if (desc->irq < NR_IRQS_LEGACY || node == -1)
113 return desc;
114
115 if (desc->node != node)
116 desc = __real_move_irq_desc(desc, node);
117
118 return desc;
119}
120
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd2..01b1d3a88983 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir;
21static int irq_affinity_proc_show(struct seq_file *m, void *v) 21static int irq_affinity_proc_show(struct seq_file *m, void *v)
22{ 22{
23 struct irq_desc *desc = irq_to_desc((long)m->private); 23 struct irq_desc *desc = irq_to_desc((long)m->private);
24 const struct cpumask *mask = desc->affinity; 24 const struct cpumask *mask = desc->irq_data.affinity;
25 25
26#ifdef CONFIG_GENERIC_PENDING_IRQ 26#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 27 if (desc->status & IRQ_MOVE_PENDING)
@@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 65 cpumask_var_t new_value;
66 int err; 66 int err;
67 67
68 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
69 irq_balancing_disabled(irq)) 69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
@@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
185{ 185{
186 struct irq_desc *desc = irq_to_desc((long) m->private); 186 struct irq_desc *desc = irq_to_desc((long) m->private);
187 187
188 seq_printf(m, "%d\n", desc->node); 188 seq_printf(m, "%d\n", desc->irq_data.node);
189 return 0; 189 return 0;
190} 190}
191 191
@@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
269{ 269{
270 char name [MAX_NAMELEN]; 270 char name [MAX_NAMELEN];
271 271
272 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 272 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
273 return; 273 return;
274 274
275 memset(name, 0, MAX_NAMELEN); 275 memset(name, 0, MAX_NAMELEN);
@@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
297 &irq_spurious_proc_fops, (void *)(long)irq); 297 &irq_spurious_proc_fops, (void *)(long)irq);
298} 298}
299 299
300void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
301{
302 char name [MAX_NAMELEN];
303
304 if (!root_irq_dir || !desc->dir)
305 return;
306#ifdef CONFIG_SMP
307 remove_proc_entry("smp_affinity", desc->dir);
308 remove_proc_entry("affinity_hint", desc->dir);
309 remove_proc_entry("node", desc->dir);
310#endif
311 remove_proc_entry("spurious", desc->dir);
312
313 memset(name, 0, MAX_NAMELEN);
314 sprintf(name, "%u", irq);
315 remove_proc_entry(name, root_irq_dir);
316}
317
300#undef MAX_NAMELEN 318#undef MAX_NAMELEN
301 319
302void unregister_handler_proc(unsigned int irq, struct irqaction *action) 320void unregister_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 090c3763f3a2..891115a929aa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
60 /* 60 /*
61 * Make sure the interrupt is enabled, before resending it: 61 * Make sure the interrupt is enabled, before resending it:
62 */ 62 */
63 desc->chip->enable(irq); 63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64 64
65 /* 65 /*
66 * We do not resend level type interrupts. Level type 66 * We do not resend level type interrupts. Level type
@@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { 73 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
74#ifdef CONFIG_HARDIRQS_SW_RESEND 75#ifdef CONFIG_HARDIRQS_SW_RESEND
75 /* Set it pending and activate the softirq: */ 76 /* Set it pending and activate the softirq: */
76 set_bit(irq, irqs_resend); 77 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 89fb90ae534f..3089d3b9d5f3 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -14,6 +14,8 @@
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16 16
17#include "internals.h"
18
17static int irqfixup __read_mostly; 19static int irqfixup __read_mostly;
18 20
19#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
@@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc)
78 * If we did actual work for the real IRQ line we must let the 80 * If we did actual work for the real IRQ line we must let the
79 * IRQ controller clean up too 81 * IRQ controller clean up too
80 */ 82 */
81 if (work && desc->chip && desc->chip->end) 83 if (work)
82 desc->chip->end(irq); 84 irq_end(irq, desc);
83 raw_spin_unlock(&desc->lock); 85 raw_spin_unlock(&desc->lock);
84 86
85 return ok; 87 return ok;
@@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
255 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
256 desc->depth++; 258 desc->depth++;
257 desc->chip->disable(irq); 259 desc->irq_data.chip->irq_disable(&desc->irq_data);
258 260
259 mod_timer(&poll_spurious_irq_timer, 261 mod_timer(&poll_spurious_irq_timer,
260 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 000000000000..f16763ff8481
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
3 *
4 * Provides a framework for enqueueing and running callbacks from hardirq
5 * context. The enqueueing is NMI-safe.
6 */
7
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/irq_work.h>
11#include <linux/hardirq.h>
12
13/*
14 * An entry can be in one of four states:
15 *
16 * free NULL, 0 -> {claimed} : free to be used
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */
24
25#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL
28
29static inline bool irq_work_is_set(struct irq_work *entry, int flags)
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49
50/*
51 * Claim the entry so that no one else will poke at it.
52 */
53static bool irq_work_claim(struct irq_work *entry)
54{
55 struct irq_work *next, *nflags;
56
57 do {
58 next = entry->next;
59 if ((unsigned long)next & IRQ_WORK_PENDING)
60 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS);
62 } while (cmpxchg(&entry->next, next, nflags) != next);
63
64 return true;
65}
66
67
68void __weak arch_irq_work_raise(void)
69{
70 /*
71 * Lame architectures will get the timer tick callback
72 */
73}
74
75/*
76 * Queue the entry and raise the IPI if needed.
77 */
78static void __irq_work_queue(struct irq_work *entry)
79{
80 struct irq_work **head, *next;
81
82 head = &get_cpu_var(irq_work_list);
83
84 do {
85 next = *head;
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry))
92 arch_irq_work_raise();
93
94 put_cpu_var(irq_work_list);
95}
96
97/*
98 * Enqueue the irq_work @entry, returns true on success, failure when the
99 * @entry was already enqueued by someone else.
100 *
101 * Can be re-enqueued while the callback is still in progress.
102 */
103bool irq_work_queue(struct irq_work *entry)
104{
105 if (!irq_work_claim(entry)) {
106 /*
107 * Already enqueued, can't do!
108 */
109 return false;
110 }
111
112 __irq_work_queue(entry);
113 return true;
114}
115EXPORT_SYMBOL_GPL(irq_work_queue);
116
117/*
118 * Run the irq_work entries on this cpu. Must be run from hardirq
119 * context with local IRQs disabled.
120 */
121void irq_work_run(void)
122{
123 struct irq_work *list, **head;
124
125 head = &__get_cpu_var(irq_work_list);
126 if (*head == NULL)
127 return;
128
129 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled());
131
132 list = xchg(head, NULL);
133 while (list != NULL) {
134 struct irq_work *entry = list;
135
136 list = irq_work_next(list);
137
138 /*
139 * Clear the PENDING bit, after this point the @entry
140 * can be re-used.
141 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY);
143 entry->func(entry);
144 /*
145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile.
147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
149 }
150}
151EXPORT_SYMBOL_GPL(irq_work_run);
152
153/*
154 * Synchronize against the irq_work @entry, ensures the entry is not
155 * currently in use.
156 */
157void irq_work_sync(struct irq_work *entry)
158{
159 WARN_ON_ONCE(irqs_disabled());
160
161 while (irq_work_is_set(entry, IRQ_WORK_BUSY))
162 cpu_relax();
163}
164EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000000..7be868bf25c6
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,429 @@
1/*
2 * jump label support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 *
6 */
7#include <linux/jump_label.h>
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h>
14#include <linux/sort.h>
15#include <linux/err.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
 23/* mutex to protect coming/going of the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex);
25
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42static int jump_label_cmp(const void *a, const void *b)
43{
44 const struct jump_entry *jea = a;
45 const struct jump_entry *jeb = b;
46
47 if (jea->key < jeb->key)
48 return -1;
49
50 if (jea->key > jeb->key)
51 return 1;
52
53 return 0;
54}
55
56static void
57sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
58{
59 unsigned long size;
60
61 size = (((unsigned long)stop - (unsigned long)start)
62 / sizeof(struct jump_entry));
63 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
64}
65
66static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
67{
68 struct hlist_head *head;
69 struct hlist_node *node;
70 struct jump_label_entry *e;
71 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
72
73 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
74 hlist_for_each_entry(e, node, head, hlist) {
75 if (key == e->key)
76 return e;
77 }
78 return NULL;
79}
80
81static struct jump_label_entry *
82add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
83{
84 struct hlist_head *head;
85 struct jump_label_entry *e;
86 u32 hash;
87
88 e = get_jump_label_entry(key);
89 if (e)
90 return ERR_PTR(-EEXIST);
91
92 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
93 if (!e)
94 return ERR_PTR(-ENOMEM);
95
96 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
97 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
98 e->key = key;
99 e->table = table;
100 e->nr_entries = nr_entries;
101 INIT_HLIST_HEAD(&(e->modules));
102 hlist_add_head(&e->hlist, head);
103 return e;
104}
105
106static int
107build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
108{
109 struct jump_entry *iter, *iter_begin;
110 struct jump_label_entry *entry;
111 int count;
112
113 sort_jump_label_entries(start, stop);
114 iter = start;
115 while (iter < stop) {
116 entry = get_jump_label_entry(iter->key);
117 if (!entry) {
118 iter_begin = iter;
119 count = 0;
120 while ((iter < stop) &&
121 (iter->key == iter_begin->key)) {
122 iter++;
123 count++;
124 }
125 entry = add_jump_label_entry(iter_begin->key,
126 count, iter_begin);
127 if (IS_ERR(entry))
128 return PTR_ERR(entry);
129 } else {
130 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
131 return -1;
132 }
133 }
134 return 0;
135}
136
137/***
138 * jump_label_update - update jump label text
 139 * @key - key value associated with a jump label
140 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
141 *
142 * Will enable/disable the jump for jump label @key, depending on the
143 * value of @type.
144 *
145 */
146
147void jump_label_update(unsigned long key, enum jump_label_type type)
148{
149 struct jump_entry *iter;
150 struct jump_label_entry *entry;
151 struct hlist_node *module_node;
152 struct jump_label_module_entry *e_module;
153 int count;
154
155 mutex_lock(&jump_label_mutex);
156 entry = get_jump_label_entry((jump_label_t)key);
157 if (entry) {
158 count = entry->nr_entries;
159 iter = entry->table;
160 while (count--) {
161 if (kernel_text_address(iter->code))
162 arch_jump_label_transform(iter, type);
163 iter++;
164 }
 165 /* enable/disable jump labels in modules */
166 hlist_for_each_entry(e_module, module_node, &(entry->modules),
167 hlist) {
168 count = e_module->nr_entries;
169 iter = e_module->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 }
176 }
177 mutex_unlock(&jump_label_mutex);
178}
179
180static int addr_conflict(struct jump_entry *entry, void *start, void *end)
181{
182 if (entry->code <= (unsigned long)end &&
183 entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
184 return 1;
185
186 return 0;
187}
188
189#ifdef CONFIG_MODULES
190
191static int module_conflict(void *start, void *end)
192{
193 struct hlist_head *head;
194 struct hlist_node *node, *node_next, *module_node, *module_node_next;
195 struct jump_label_entry *e;
196 struct jump_label_module_entry *e_module;
197 struct jump_entry *iter;
198 int i, count;
199 int conflict = 0;
200
201 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
202 head = &jump_label_table[i];
203 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
204 hlist_for_each_entry_safe(e_module, module_node,
205 module_node_next,
206 &(e->modules), hlist) {
207 count = e_module->nr_entries;
208 iter = e_module->table;
209 while (count--) {
210 if (addr_conflict(iter, start, end)) {
211 conflict = 1;
212 goto out;
213 }
214 iter++;
215 }
216 }
217 }
218 }
219out:
220 return conflict;
221}
222
223#endif
224
225/***
226 * jump_label_text_reserved - check if addr range is reserved
227 * @start: start text addr
228 * @end: end text addr
229 *
230 * checks if the text addr located between @start and @end
231 * overlaps with any of the jump label patch addresses. Code
232 * that wants to modify kernel text should first verify that
233 * it does not overlap with any of the jump label addresses.
234 *
235 * returns 1 if there is an overlap, 0 otherwise
236 */
237int jump_label_text_reserved(void *start, void *end)
238{
239 struct jump_entry *iter;
240 struct jump_entry *iter_start = __start___jump_table;
 241 struct jump_entry *iter_stop = __stop___jump_table;
242 int conflict = 0;
243
244 mutex_lock(&jump_label_mutex);
245 iter = iter_start;
246 while (iter < iter_stop) {
247 if (addr_conflict(iter, start, end)) {
248 conflict = 1;
249 goto out;
250 }
251 iter++;
252 }
253
254 /* now check modules */
255#ifdef CONFIG_MODULES
256 conflict = module_conflict(start, end);
257#endif
258out:
259 mutex_unlock(&jump_label_mutex);
260 return conflict;
261}
262
263static __init int init_jump_label(void)
264{
265 int ret;
266 struct jump_entry *iter_start = __start___jump_table;
267 struct jump_entry *iter_stop = __stop___jump_table;
268 struct jump_entry *iter;
269
270 mutex_lock(&jump_label_mutex);
271 ret = build_jump_label_hashtable(__start___jump_table,
272 __stop___jump_table);
273 iter = iter_start;
274 while (iter < iter_stop) {
275 arch_jump_label_text_poke_early(iter->code);
276 iter++;
277 }
278 mutex_unlock(&jump_label_mutex);
279 return ret;
280}
281early_initcall(init_jump_label);
282
283#ifdef CONFIG_MODULES
284
285static struct jump_label_module_entry *
286add_jump_label_module_entry(struct jump_label_entry *entry,
287 struct jump_entry *iter_begin,
288 int count, struct module *mod)
289{
290 struct jump_label_module_entry *e;
291
292 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
293 if (!e)
294 return ERR_PTR(-ENOMEM);
295 e->mod = mod;
296 e->nr_entries = count;
297 e->table = iter_begin;
298 hlist_add_head(&e->hlist, &entry->modules);
299 return e;
300}
301
302static int add_jump_label_module(struct module *mod)
303{
304 struct jump_entry *iter, *iter_begin;
305 struct jump_label_entry *entry;
306 struct jump_label_module_entry *module_entry;
307 int count;
308
309 /* if the module doesn't have jump label entries, just return */
310 if (!mod->num_jump_entries)
311 return 0;
312
313 sort_jump_label_entries(mod->jump_entries,
314 mod->jump_entries + mod->num_jump_entries);
315 iter = mod->jump_entries;
316 while (iter < mod->jump_entries + mod->num_jump_entries) {
317 entry = get_jump_label_entry(iter->key);
318 iter_begin = iter;
319 count = 0;
320 while ((iter < mod->jump_entries + mod->num_jump_entries) &&
321 (iter->key == iter_begin->key)) {
322 iter++;
323 count++;
324 }
325 if (!entry) {
326 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
327 if (IS_ERR(entry))
328 return PTR_ERR(entry);
329 }
330 module_entry = add_jump_label_module_entry(entry, iter_begin,
331 count, mod);
332 if (IS_ERR(module_entry))
333 return PTR_ERR(module_entry);
334 }
335 return 0;
336}
337
338static void remove_jump_label_module(struct module *mod)
339{
340 struct hlist_head *head;
341 struct hlist_node *node, *node_next, *module_node, *module_node_next;
342 struct jump_label_entry *e;
343 struct jump_label_module_entry *e_module;
344 int i;
345
346 /* if the module doesn't have jump label entries, just return */
347 if (!mod->num_jump_entries)
348 return;
349
350 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
351 head = &jump_label_table[i];
352 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
353 hlist_for_each_entry_safe(e_module, module_node,
354 module_node_next,
355 &(e->modules), hlist) {
356 if (e_module->mod == mod) {
357 hlist_del(&e_module->hlist);
358 kfree(e_module);
359 }
360 }
361 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
362 hlist_del(&e->hlist);
363 kfree(e);
364 }
365 }
366 }
367}
368
369static int
370jump_label_module_notify(struct notifier_block *self, unsigned long val,
371 void *data)
372{
373 struct module *mod = data;
374 int ret = 0;
375
376 switch (val) {
377 case MODULE_STATE_COMING:
378 mutex_lock(&jump_label_mutex);
379 ret = add_jump_label_module(mod);
380 if (ret)
381 remove_jump_label_module(mod);
382 mutex_unlock(&jump_label_mutex);
383 break;
384 case MODULE_STATE_GOING:
385 mutex_lock(&jump_label_mutex);
386 remove_jump_label_module(mod);
387 mutex_unlock(&jump_label_mutex);
388 break;
389 }
390 return ret;
391}
392
393/***
 394 * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop()
395 * @mod: module to patch
396 *
397 * Allow for run-time selection of the optimal nops. Before the module
398 * loads patch these with arch_get_jump_label_nop(), which is specified by
399 * the arch specific jump label code.
400 */
401void jump_label_apply_nops(struct module *mod)
402{
403 struct jump_entry *iter;
404
405 /* if the module doesn't have jump label entries, just return */
406 if (!mod->num_jump_entries)
407 return;
408
409 iter = mod->jump_entries;
410 while (iter < mod->jump_entries + mod->num_jump_entries) {
411 arch_jump_label_text_poke_early(iter->code);
412 iter++;
413 }
414}
415
416struct notifier_block jump_label_module_nb = {
417 .notifier_call = jump_label_module_notify,
418 .priority = 0,
419};
420
421static __init int init_jump_label_module(void)
422{
423 return register_module_notifier(&jump_label_module_nb);
424}
425early_initcall(init_jump_label_module);
426
427#endif /* CONFIG_MODULES */
428
429#endif
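jump_label_text_reserved() above exists so that other text-patching code can refuse to touch addresses a jump label may rewrite; the kprobes hunk later in this series calls it in exactly that way before registering a probe. A hedged sketch of the intended call pattern (patch_text_range() and its body are illustrative, not part of this patch):

        #include <linux/jump_label.h>
        #include <linux/errno.h>

        /* Refuse to modify kernel text that overlaps a jump label patch site. */
        static int patch_text_range(void *start, void *end)
        {
                if (jump_label_text_reserved(start, end))
                        return -EBUSY;

                /* ... safe to patch [start, end) here, e.g. via text_poke() ... */
                return 0;
        }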
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 131b1703936f..b55045bc7563 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
151 image->nr_segments = nr_segments; 151 image->nr_segments = nr_segments;
152 segment_bytes = nr_segments * sizeof(*segments); 152 segment_bytes = nr_segments * sizeof(*segments);
153 result = copy_from_user(image->segment, segments, segment_bytes); 153 result = copy_from_user(image->segment, segments, segment_bytes);
154 if (result) 154 if (result) {
155 result = -EFAULT;
155 goto out; 156 goto out;
157 }
156 158
157 /* 159 /*
158 * Verify we have good destination addresses. The caller is 160 * Verify we have good destination addresses. The caller is
@@ -814,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image,
814 816
815 ptr = kmap(page); 817 ptr = kmap(page);
816 /* Start with a clear page */ 818 /* Start with a clear page */
817 memset(ptr, 0, PAGE_SIZE); 819 clear_page(ptr);
818 ptr += maddr & ~PAGE_MASK; 820 ptr += maddr & ~PAGE_MASK;
819 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
820 if (mchunk > mbytes) 822 if (mchunk > mbytes)
@@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image,
827 result = copy_from_user(ptr, buf, uchunk); 829 result = copy_from_user(ptr, buf, uchunk);
828 kunmap(page); 830 kunmap(page);
829 if (result) { 831 if (result) {
830 result = (result < 0) ? result : -EIO; 832 result = -EFAULT;
831 goto out; 833 goto out;
832 } 834 }
833 ubytes -= uchunk; 835 ubytes -= uchunk;
@@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image,
882 kexec_flush_icache_page(page); 884 kexec_flush_icache_page(page);
883 kunmap(page); 885 kunmap(page);
884 if (result) { 886 if (result) {
885 result = (result < 0) ? result : -EIO; 887 result = -EFAULT;
886 goto out; 888 goto out;
887 } 889 }
888 ubytes -= uchunk; 890 ubytes -= uchunk;
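The kexec hunks above drop the old `(result < 0) ? result : -EIO` mapping because copy_from_user() never returns a negative value: it returns the number of bytes it could not copy, and any non-zero result should simply become -EFAULT. A minimal sketch of that idiom (the helper name and buffer are illustrative):

        #include <linux/uaccess.h>
        #include <linux/errno.h>

        /* copy_from_user() returns the count of uncopied bytes, never < 0. */
        static int fetch_from_user(void *dst, const void __user *src, size_t len)
        {
                if (copy_from_user(dst, src, len))
                        return -EFAULT;         /* partial or failed copy */
                return 0;
        }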
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 35edbe22e9a9..01a0700e873f 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,8 +1,7 @@
1/* 1/*
2 * A generic kernel FIFO implementation. 2 * A generic kernel FIFO implementation
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> 4 * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
6 * 5 *
7 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -11,7 +10,7 @@
11 * 10 *
12 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details. 14 * GNU General Public License for more details.
16 * 15 *
17 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
@@ -24,422 +23,586 @@
24#include <linux/module.h> 23#include <linux/module.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/err.h> 25#include <linux/err.h>
27#include <linux/kfifo.h>
28#include <linux/log2.h> 26#include <linux/log2.h>
29#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/kfifo.h>
30 29
31static void _kfifo_init(struct kfifo *fifo, void *buffer, 30/*
32 unsigned int size) 31 * internal helper to calculate the unused elements in a fifo
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
39
40/**
41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
43 * @buffer: the preallocated buffer to be used.
44 * @size: the size of the internal buffer, this has to be a power of 2.
45 *
46 */ 32 */
47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) 33static inline unsigned int kfifo_unused(struct __kfifo *fifo)
48{ 34{
49 /* size must be a power of 2 */ 35 return (fifo->mask + 1) - (fifo->in - fifo->out);
50 BUG_ON(!is_power_of_2(size));
51
52 _kfifo_init(fifo, buffer, size);
53} 36}
54EXPORT_SYMBOL(kfifo_init);
55 37
56/** 38int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
57 * kfifo_alloc - allocates a new FIFO internal buffer 39 size_t esize, gfp_t gfp_mask)
58 * @fifo: the fifo to assign then new buffer
59 * @size: the size of the buffer to be allocated, this have to be a power of 2.
60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
61 *
62 * This function dynamically allocates a new fifo internal buffer
63 *
64 * The size will be rounded-up to a power of 2.
65 * The buffer will be release with kfifo_free().
66 * Return 0 if no error, otherwise the an error code
67 */
68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 40{
70 unsigned char *buffer;
71
72 /* 41 /*
73 * round up to the next power of 2, since our 'let the indices 42 * round down to the next power of 2, since our 'let the indices
74 * wrap' technique works only in this case. 43 * wrap' technique works only in this case.
75 */ 44 */
76 if (!is_power_of_2(size)) { 45 if (!is_power_of_2(size))
77 BUG_ON(size > 0x80000000); 46 size = rounddown_pow_of_two(size);
78 size = roundup_pow_of_two(size); 47
48 fifo->in = 0;
49 fifo->out = 0;
50 fifo->esize = esize;
51
52 if (size < 2) {
53 fifo->data = NULL;
54 fifo->mask = 0;
55 return -EINVAL;
79 } 56 }
80 57
81 buffer = kmalloc(size, gfp_mask); 58 fifo->data = kmalloc(size * esize, gfp_mask);
82 if (!buffer) { 59
83 _kfifo_init(fifo, NULL, 0); 60 if (!fifo->data) {
61 fifo->mask = 0;
84 return -ENOMEM; 62 return -ENOMEM;
85 } 63 }
86 64 fifo->mask = size - 1;
87 _kfifo_init(fifo, buffer, size);
88 65
89 return 0; 66 return 0;
90} 67}
91EXPORT_SYMBOL(kfifo_alloc); 68EXPORT_SYMBOL(__kfifo_alloc);
92 69
93/** 70void __kfifo_free(struct __kfifo *fifo)
94 * kfifo_free - frees the FIFO internal buffer
95 * @fifo: the fifo to be freed.
96 */
97void kfifo_free(struct kfifo *fifo)
98{ 71{
99 kfree(fifo->buffer); 72 kfree(fifo->data);
100 _kfifo_init(fifo, NULL, 0); 73 fifo->in = 0;
74 fifo->out = 0;
75 fifo->esize = 0;
76 fifo->data = NULL;
77 fifo->mask = 0;
101} 78}
102EXPORT_SYMBOL(kfifo_free); 79EXPORT_SYMBOL(__kfifo_free);
103 80
104/** 81int __kfifo_init(struct __kfifo *fifo, void *buffer,
105 * kfifo_skip - skip output data 82 unsigned int size, size_t esize)
106 * @fifo: the fifo to be used.
107 * @len: number of bytes to skip
108 */
109void kfifo_skip(struct kfifo *fifo, unsigned int len)
110{ 83{
111 if (len < kfifo_len(fifo)) { 84 size /= esize;
112 __kfifo_add_out(fifo, len); 85
113 return; 86 if (!is_power_of_2(size))
87 size = rounddown_pow_of_two(size);
88
89 fifo->in = 0;
90 fifo->out = 0;
91 fifo->esize = esize;
92 fifo->data = buffer;
93
94 if (size < 2) {
95 fifo->mask = 0;
96 return -EINVAL;
114 } 97 }
115 kfifo_reset_out(fifo); 98 fifo->mask = size - 1;
99
100 return 0;
116} 101}
117EXPORT_SYMBOL(kfifo_skip); 102EXPORT_SYMBOL(__kfifo_init);
118 103
119static inline void __kfifo_in_data(struct kfifo *fifo, 104static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
120 const void *from, unsigned int len, unsigned int off) 105 unsigned int len, unsigned int off)
121{ 106{
107 unsigned int size = fifo->mask + 1;
108 unsigned int esize = fifo->esize;
122 unsigned int l; 109 unsigned int l;
123 110
111 off &= fifo->mask;
112 if (esize != 1) {
113 off *= esize;
114 size *= esize;
115 len *= esize;
116 }
117 l = min(len, size - off);
118
119 memcpy(fifo->data + off, src, l);
120 memcpy(fifo->data, src + l, len - l);
124 /* 121 /*
125 * Ensure that we sample the fifo->out index -before- we 122 * make sure that the data in the fifo is up to date before
126 * start putting bytes into the kfifo. 123 * incrementing the fifo->in index counter
127 */ 124 */
125 smp_wmb();
126}
128 127
129 smp_mb(); 128unsigned int __kfifo_in(struct __kfifo *fifo,
130 129 const void *buf, unsigned int len)
131 off = __kfifo_off(fifo, fifo->in + off); 130{
131 unsigned int l;
132 132
133 /* first put the data starting from fifo->in to buffer end */ 133 l = kfifo_unused(fifo);
134 l = min(len, fifo->size - off); 134 if (len > l)
135 memcpy(fifo->buffer + off, from, l); 135 len = l;
136 136
137 /* then put the rest (if any) at the beginning of the buffer */ 137 kfifo_copy_in(fifo, buf, len, fifo->in);
138 memcpy(fifo->buffer, from + l, len - l); 138 fifo->in += len;
139 return len;
139} 140}
141EXPORT_SYMBOL(__kfifo_in);
140 142
141static inline void __kfifo_out_data(struct kfifo *fifo, 143static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
142 void *to, unsigned int len, unsigned int off) 144 unsigned int len, unsigned int off)
143{ 145{
146 unsigned int size = fifo->mask + 1;
147 unsigned int esize = fifo->esize;
144 unsigned int l; 148 unsigned int l;
145 149
150 off &= fifo->mask;
151 if (esize != 1) {
152 off *= esize;
153 size *= esize;
154 len *= esize;
155 }
156 l = min(len, size - off);
157
158 memcpy(dst, fifo->data + off, l);
159 memcpy(dst + l, fifo->data, len - l);
146 /* 160 /*
147 * Ensure that we sample the fifo->in index -before- we 161 * make sure that the data is copied before
148 * start removing bytes from the kfifo. 162 * incrementing the fifo->out index counter
149 */ 163 */
164 smp_wmb();
165}
150 166
151 smp_rmb(); 167unsigned int __kfifo_out_peek(struct __kfifo *fifo,
168 void *buf, unsigned int len)
169{
170 unsigned int l;
152 171
153 off = __kfifo_off(fifo, fifo->out + off); 172 l = fifo->in - fifo->out;
173 if (len > l)
174 len = l;
154 175
155 /* first get the data from fifo->out until the end of the buffer */ 176 kfifo_copy_out(fifo, buf, len, fifo->out);
156 l = min(len, fifo->size - off); 177 return len;
157 memcpy(to, fifo->buffer + off, l); 178}
179EXPORT_SYMBOL(__kfifo_out_peek);
158 180
159 /* then get the rest (if any) from the beginning of the buffer */ 181unsigned int __kfifo_out(struct __kfifo *fifo,
160 memcpy(to + l, fifo->buffer, len - l); 182 void *buf, unsigned int len)
183{
184 len = __kfifo_out_peek(fifo, buf, len);
185 fifo->out += len;
186 return len;
161} 187}
188EXPORT_SYMBOL(__kfifo_out);
162 189
163static inline int __kfifo_from_user_data(struct kfifo *fifo, 190static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off, 191 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout) 192 unsigned int *copied)
166{ 193{
194 unsigned int size = fifo->mask + 1;
195 unsigned int esize = fifo->esize;
167 unsigned int l; 196 unsigned int l;
168 int ret; 197 unsigned long ret;
169 198
199 off &= fifo->mask;
200 if (esize != 1) {
201 off *= esize;
202 size *= esize;
203 len *= esize;
204 }
205 l = min(len, size - off);
206
207 ret = copy_from_user(fifo->data + off, from, l);
208 if (unlikely(ret))
209 ret = DIV_ROUND_UP(ret + len - l, esize);
210 else {
211 ret = copy_from_user(fifo->data, from + l, len - l);
212 if (unlikely(ret))
213 ret = DIV_ROUND_UP(ret, esize);
214 }
170 /* 215 /*
171 * Ensure that we sample the fifo->out index -before- we 216 * make sure that the data in the fifo is up to date before
172 * start putting bytes into the kfifo. 217 * incrementing the fifo->in index counter
173 */ 218 */
219 smp_wmb();
220 *copied = len - ret;
221 /* return the number of elements which are not copied */
222 return ret;
223}
174 224
175 smp_mb(); 225int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
226 unsigned long len, unsigned int *copied)
227{
228 unsigned int l;
229 unsigned long ret;
230 unsigned int esize = fifo->esize;
231 int err;
176 232
177 off = __kfifo_off(fifo, fifo->in + off); 233 if (esize != 1)
234 len /= esize;
178 235
179 /* first put the data starting from fifo->in to buffer end */ 236 l = kfifo_unused(fifo);
180 l = min(len, fifo->size - off); 237 if (len > l)
181 ret = copy_from_user(fifo->buffer + off, from, l); 238 len = l;
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
187 239
188 /* then put the rest (if any) at the beginning of the buffer */ 240 ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
189 ret = copy_from_user(fifo->buffer, from + l, len - l); 241 if (unlikely(ret)) {
190 *lenout += ret ? ret : len - l; 242 len -= ret;
191 return ret ? -EFAULT : 0; 243 err = -EFAULT;
244 } else
245 err = 0;
246 fifo->in += len;
247 return err;
192} 248}
249EXPORT_SYMBOL(__kfifo_from_user);
193 250
194static inline int __kfifo_to_user_data(struct kfifo *fifo, 251static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout) 252 unsigned int len, unsigned int off, unsigned int *copied)
196{ 253{
197 unsigned int l; 254 unsigned int l;
198 int ret; 255 unsigned long ret;
199 256 unsigned int size = fifo->mask + 1;
257 unsigned int esize = fifo->esize;
258
259 off &= fifo->mask;
260 if (esize != 1) {
261 off *= esize;
262 size *= esize;
263 len *= esize;
264 }
265 l = min(len, size - off);
266
267 ret = copy_to_user(to, fifo->data + off, l);
268 if (unlikely(ret))
269 ret = DIV_ROUND_UP(ret + len - l, esize);
270 else {
271 ret = copy_to_user(to + l, fifo->data, len - l);
272 if (unlikely(ret))
273 ret = DIV_ROUND_UP(ret, esize);
274 }
200 /* 275 /*
201 * Ensure that we sample the fifo->in index -before- we 276 * make sure that the data is copied before
202 * start removing bytes from the kfifo. 277 * incrementing the fifo->out index counter
203 */ 278 */
279 smp_wmb();
280 *copied = len - ret;
281 /* return the number of elements which are not copied */
282 return ret;
283}
204 284
205 smp_rmb(); 285int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
286 unsigned long len, unsigned int *copied)
287{
288 unsigned int l;
289 unsigned long ret;
290 unsigned int esize = fifo->esize;
291 int err;
206 292
207 off = __kfifo_off(fifo, fifo->out + off); 293 if (esize != 1)
294 len /= esize;
208 295
209 /* first get the data from fifo->out until the end of the buffer */ 296 l = fifo->in - fifo->out;
210 l = min(len, fifo->size - off); 297 if (len > l)
211 ret = copy_to_user(to, fifo->buffer + off, l); 298 len = l;
212 *lenout = l; 299 ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
213 if (unlikely(ret)) { 300 if (unlikely(ret)) {
214 *lenout -= ret; 301 len -= ret;
215 return -EFAULT; 302 err = -EFAULT;
216 } 303 } else
304 err = 0;
305 fifo->out += len;
306 return err;
307}
308EXPORT_SYMBOL(__kfifo_to_user);
217 309
218 /* then get the rest (if any) from the beginning of the buffer */ 310static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
219 len -= l; 311 int nents, unsigned int len)
220 ret = copy_to_user(to + l, fifo->buffer, len); 312{
221 if (unlikely(ret)) { 313 int n;
222 *lenout += len - ret; 314 unsigned int l;
223 return -EFAULT; 315 unsigned int off;
316 struct page *page;
317
318 if (!nents)
319 return 0;
320
321 if (!len)
322 return 0;
323
324 n = 0;
325 page = virt_to_page(buf);
326 off = offset_in_page(buf);
327 l = 0;
328
329 while (len >= l + PAGE_SIZE - off) {
330 struct page *npage;
331
332 l += PAGE_SIZE;
333 buf += PAGE_SIZE;
334 npage = virt_to_page(buf);
335 if (page_to_phys(page) != page_to_phys(npage) - l) {
336 sg_set_page(sgl, page, l - off, off);
337 sgl = sg_next(sgl);
338 if (++n == nents || sgl == NULL)
339 return n;
340 page = npage;
341 len -= l - off;
342 l = off = 0;
343 }
224 } 344 }
225 *lenout += len; 345 sg_set_page(sgl, page, len, off);
226 return 0; 346 return n + 1;
227} 347}
228 348
229unsigned int __kfifo_in_n(struct kfifo *fifo, 349static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
230 const void *from, unsigned int len, unsigned int recsize) 350 int nents, unsigned int len, unsigned int off)
231{ 351{
232 if (kfifo_avail(fifo) < len + recsize) 352 unsigned int size = fifo->mask + 1;
233 return len + 1; 353 unsigned int esize = fifo->esize;
354 unsigned int l;
355 unsigned int n;
234 356
235 __kfifo_in_data(fifo, from, len, recsize); 357 off &= fifo->mask;
236 return 0; 358 if (esize != 1) {
359 off *= esize;
360 size *= esize;
361 len *= esize;
362 }
363 l = min(len, size - off);
364
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367
368 return n;
237} 369}
238EXPORT_SYMBOL(__kfifo_in_n);
239 370
240/** 371unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
241 * kfifo_in - puts some data into the FIFO 372 struct scatterlist *sgl, int nents, unsigned int len)
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{ 373{
256 len = min(kfifo_avail(fifo), len); 374 unsigned int l;
257 375
258 __kfifo_in_data(fifo, from, len, 0); 376 l = kfifo_unused(fifo);
259 __kfifo_add_in(fifo, len); 377 if (len > l)
260 return len; 378 len = l;
379
380 return setup_sgl(fifo, sgl, nents, len, fifo->in);
261} 381}
262EXPORT_SYMBOL(kfifo_in); 382EXPORT_SYMBOL(__kfifo_dma_in_prepare);
263 383
264unsigned int __kfifo_in_generic(struct kfifo *fifo, 384unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize) 385 struct scatterlist *sgl, int nents, unsigned int len)
266{ 386{
267 return __kfifo_in_rec(fifo, from, len, recsize); 387 unsigned int l;
388
389 l = fifo->in - fifo->out;
390 if (len > l)
391 len = l;
392
393 return setup_sgl(fifo, sgl, nents, len, fifo->out);
268} 394}
269EXPORT_SYMBOL(__kfifo_in_generic); 395EXPORT_SYMBOL(__kfifo_dma_out_prepare);
270 396
271unsigned int __kfifo_out_n(struct kfifo *fifo, 397unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
272 void *to, unsigned int len, unsigned int recsize)
273{ 398{
274 if (kfifo_len(fifo) < len + recsize) 399 unsigned int max = (1 << (recsize << 3)) - 1;
275 return len;
276 400
277 __kfifo_out_data(fifo, to, len, recsize); 401 if (len > max)
278 __kfifo_add_out(fifo, len + recsize); 402 return max;
279 return 0; 403 return len;
280} 404}
281EXPORT_SYMBOL(__kfifo_out_n);
282 405
283/** 406#define __KFIFO_PEEK(data, out, mask) \
284 * kfifo_out - gets some data from the FIFO 407 ((data)[(out) & (mask)])
285 * @fifo: the fifo to be used. 408/*
 286 * @to: where the data must be copied. 409 * __kfifo_peek_n internal helper function for determining the length of
287 * @len: the size of the destination buffer. 410 * the next record in the fifo
288 *
289 * This function copies at most @len bytes from the FIFO into the
290 * @to buffer and returns the number of copied bytes.
291 *
292 * Note that with only one concurrent reader and one concurrent
293 * writer, you don't need extra locking to use these functions.
294 */ 411 */
295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) 412static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
296{ 413{
297 len = min(kfifo_len(fifo), len); 414 unsigned int l;
415 unsigned int mask = fifo->mask;
416 unsigned char *data = fifo->data;
298 417
299 __kfifo_out_data(fifo, to, len, 0); 418 l = __KFIFO_PEEK(data, fifo->out, mask);
300 __kfifo_add_out(fifo, len);
301 419
302 return len; 420 if (--recsize)
421 l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
422
423 return l;
303} 424}
304EXPORT_SYMBOL(kfifo_out);
305 425
306/** 426#define __KFIFO_POKE(data, in, mask, val) \
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it 427 ( \
308 * @fifo: the fifo to be used. 428 (data)[(in) & (mask)] = (unsigned char)(val) \
309 * @to: where the data must be copied. 429 )
310 * @len: the size of the destination buffer. 430
311 * @offset: offset into the fifo 431/*
312 * 432 * __kfifo_poke_n internal helper function for storeing the length of
313 * This function copies at most @len bytes at @offset from the FIFO 433 * the record into the fifo
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */ 434 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, 435static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
318 unsigned offset)
319{ 436{
320 len = min(kfifo_len(fifo), len + offset); 437 unsigned int mask = fifo->mask;
438 unsigned char *data = fifo->data;
321 439
322 __kfifo_out_data(fifo, to, len, offset); 440 __KFIFO_POKE(data, fifo->in, mask, n);
323 return len; 441
442 if (recsize > 1)
443 __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
324} 444}
325EXPORT_SYMBOL(kfifo_out_peek);
326 445
327unsigned int __kfifo_out_generic(struct kfifo *fifo, 446unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{ 447{
331 return __kfifo_out_rec(fifo, to, len, recsize, total); 448 return __kfifo_peek_n(fifo, recsize);
332} 449}
333EXPORT_SYMBOL(__kfifo_out_generic); 450EXPORT_SYMBOL(__kfifo_len_r);
334 451
335unsigned int __kfifo_from_user_n(struct kfifo *fifo, 452unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
336 const void __user *from, unsigned int len, unsigned int recsize) 453 unsigned int len, size_t recsize)
337{ 454{
338 unsigned total; 455 if (len + recsize > kfifo_unused(fifo))
456 return 0;
339 457
340 if (kfifo_avail(fifo) < len + recsize) 458 __kfifo_poke_n(fifo, len, recsize);
341 return len + 1;
342 459
343 __kfifo_from_user_data(fifo, from, len, recsize, &total); 460 kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
344 return total; 461 fifo->in += len + recsize;
462 return len;
345} 463}
346EXPORT_SYMBOL(__kfifo_from_user_n); 464EXPORT_SYMBOL(__kfifo_in_r);
347 465
348/** 466static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
349 * kfifo_from_user - puts some data from user space into the FIFO 467 void *buf, unsigned int len, size_t recsize, unsigned int *n)
350 * @fifo: the fifo to be used. 468{
351 * @from: pointer to the data to be added. 469 *n = __kfifo_peek_n(fifo, recsize);
352 * @len: the length of the data to be added. 470
353 * @total: the actual returned data length. 471 if (len > *n)
354 * 472 len = *n;
355 * This function copies at most @len bytes from the @from into the 473
356 * FIFO depending and returns -EFAULT/0. 474 kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
357 * 475 return len;
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
371} 476}
372EXPORT_SYMBOL(kfifo_from_user);
373 477
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo, 478unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
375 const void __user *from, unsigned int len, unsigned int recsize) 479 unsigned int len, size_t recsize)
376{ 480{
377 return __kfifo_from_user_rec(fifo, from, len, recsize); 481 unsigned int n;
482
483 if (fifo->in == fifo->out)
484 return 0;
485
486 return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
378} 487}
379EXPORT_SYMBOL(__kfifo_from_user_generic); 488EXPORT_SYMBOL(__kfifo_out_peek_r);
380 489
381unsigned int __kfifo_to_user_n(struct kfifo *fifo, 490unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
382 void __user *to, unsigned int len, unsigned int reclen, 491 unsigned int len, size_t recsize)
383 unsigned int recsize)
384{ 492{
385 unsigned int ret, total; 493 unsigned int n;
386 494
387 if (kfifo_len(fifo) < reclen + recsize) 495 if (fifo->in == fifo->out)
388 return len; 496 return 0;
389 497
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); 498 len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
499 fifo->out += n + recsize;
500 return len;
501}
502EXPORT_SYMBOL(__kfifo_out_r);
391 503
392 if (likely(ret == 0)) 504void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
393 __kfifo_add_out(fifo, reclen + recsize); 505{
506 unsigned int n;
394 507
395 return total; 508 n = __kfifo_peek_n(fifo, recsize);
509 fifo->out += n + recsize;
396} 510}
397EXPORT_SYMBOL(__kfifo_to_user_n); 511EXPORT_SYMBOL(__kfifo_skip_r);
398 512
399/** 513int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
400 * kfifo_to_user - gets data from the FIFO and write it to user space 514 unsigned long len, unsigned int *copied, size_t recsize)
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{ 515{
415 int ret; 516 unsigned long ret;
416 len = min(kfifo_len(fifo), len); 517
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); 518 len = __kfifo_max_r(len, recsize);
418 __kfifo_add_out(fifo, *lenout); 519
419 return ret; 520 if (len + recsize > kfifo_unused(fifo)) {
521 *copied = 0;
522 return 0;
523 }
524
525 __kfifo_poke_n(fifo, len, recsize);
526
527 ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
528 if (unlikely(ret)) {
529 *copied = 0;
530 return -EFAULT;
531 }
532 fifo->in += len + recsize;
533 return 0;
420} 534}
421EXPORT_SYMBOL(kfifo_to_user); 535EXPORT_SYMBOL(__kfifo_from_user_r);
422 536
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo, 537int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
424 void __user *to, unsigned int len, unsigned int recsize, 538 unsigned long len, unsigned int *copied, size_t recsize)
425 unsigned int *total)
426{ 539{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total); 540 unsigned long ret;
541 unsigned int n;
542
543 if (fifo->in == fifo->out) {
544 *copied = 0;
545 return 0;
546 }
547
548 n = __kfifo_peek_n(fifo, recsize);
549 if (len > n)
550 len = n;
551
552 ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
553 if (unlikely(ret)) {
554 *copied = 0;
555 return -EFAULT;
556 }
557 fifo->out += n + recsize;
558 return 0;
428} 559}
429EXPORT_SYMBOL(__kfifo_to_user_generic); 560EXPORT_SYMBOL(__kfifo_to_user_r);
430 561
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) 562unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
563 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
432{ 564{
433 if (recsize == 0) 565 if (!nents)
434 return kfifo_avail(fifo); 566 BUG();
435 567
436 return __kfifo_peek_n(fifo, recsize); 568 len = __kfifo_max_r(len, recsize);
569
570 if (len + recsize > kfifo_unused(fifo))
571 return 0;
572
573 return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
437} 574}
438EXPORT_SYMBOL(__kfifo_peek_generic); 575EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
439 576
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) 577void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
578 unsigned int len, size_t recsize)
441{ 579{
442 __kfifo_skip_rec(fifo, recsize); 580 len = __kfifo_max_r(len, recsize);
581 __kfifo_poke_n(fifo, len, recsize);
582 fifo->in += len + recsize;
443} 583}
444EXPORT_SYMBOL(__kfifo_skip_generic); 584EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
445 585
586unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
587 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
588{
589 if (!nents)
590 BUG();
591
592 len = __kfifo_max_r(len, recsize);
593
594 if (len + recsize > fifo->in - fifo->out)
595 return 0;
596
597 return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
598}
599EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
600
601void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
602{
603 unsigned int len;
604
605 len = __kfifo_peek_n(fifo, recsize);
606 fifo->out += len + recsize;
607}
608EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
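The rewritten kfifo above keeps fifo->in and fifo->out as free-running counters and forces the buffer size to a power of two, so the real index is just `counter & mask` and the fill level is `in - out` even after the counters wrap; kfifo_unused() and the copy helpers are all built on that invariant. Below is a small stand-alone user-space model of the same index scheme for a single producer and single consumer; the names and the fixed 8-byte buffer are illustrative, not the kernel API.

        #include <stdio.h>
        #include <string.h>

        #define FIFO_SIZE 8                     /* must be a power of two */
        #define FIFO_MASK (FIFO_SIZE - 1)

        struct model_fifo {
                unsigned char data[FIFO_SIZE];
                unsigned int in;                /* free-running write counter */
                unsigned int out;               /* free-running read counter */
        };

        /* Mirrors kfifo_unused(): capacity minus the used element count. */
        static unsigned int model_unused(const struct model_fifo *f)
        {
                return FIFO_SIZE - (f->in - f->out);
        }

        /* Mirrors kfifo_copy_in() + __kfifo_in() for esize == 1. */
        static unsigned int model_in(struct model_fifo *f, const void *src,
                                     unsigned int len)
        {
                unsigned int off = f->in & FIFO_MASK;
                unsigned int l;

                if (len > model_unused(f))
                        len = model_unused(f);
                l = len < FIFO_SIZE - off ? len : FIFO_SIZE - off;
                memcpy(f->data + off, src, l);                   /* up to the end */
                memcpy(f->data, (const char *)src + l, len - l); /* wrap to start */
                f->in += len;
                return len;
        }

        int main(void)
        {
                struct model_fifo f = { .in = 0, .out = 0 };

                printf("stored %u of 5 bytes\n", model_in(&f, "hello", 5));
                return 0;
        }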
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6e9b19667a8d..9cd0591c96a2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -153,7 +153,9 @@ static int ____call_usermodehelper(void *data)
153 goto fail; 153 goto fail;
154 } 154 }
155 155
156 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 156 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp);
157 159
158 /* Exec failed? */ 160 /* Exec failed? */
159fail: 161fail:
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..99865c33a60d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
47#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/jump_label.h>
50 51
51#include <asm-generic/sections.h> 52#include <asm-generic/sections.h>
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
@@ -73,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
73/* NOTE: change this value only with kprobe_mutex held */ 74/* NOTE: change this value only with kprobe_mutex held */
74static bool kprobes_all_disarmed; 75static bool kprobes_all_disarmed;
75 76
76static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 77/* This protects kprobe_table and optimizing_list */
78static DEFINE_MUTEX(kprobe_mutex);
77static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
78static struct { 80static struct {
79 spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -399,7 +401,7 @@ static inline int kprobe_optready(struct kprobe *p)
399 * Return an optimized kprobe whose optimizing code replaces 401 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint). 402 * instructions including addr (exclude breakpoint).
401 */ 403 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 404static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{ 405{
404 int i; 406 int i;
405 struct kprobe *p = NULL; 407 struct kprobe *p = NULL;
@@ -594,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
594} 596}
595 597
596#ifdef CONFIG_SYSCTL 598#ifdef CONFIG_SYSCTL
599/* This should be called with kprobe_mutex locked */
597static void __kprobes optimize_all_kprobes(void) 600static void __kprobes optimize_all_kprobes(void)
598{ 601{
599 struct hlist_head *head; 602 struct hlist_head *head;
@@ -606,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
606 return; 609 return;
607 610
608 kprobes_allow_optimization = true; 611 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 612 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i]; 613 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist) 614 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p)) 615 if (!kprobe_disabled(p))
614 optimize_kprobe(p); 616 optimize_kprobe(p);
615 } 617 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n"); 618 printk(KERN_INFO "Kprobes globally optimized\n");
618} 619}
619 620
621/* This should be called with kprobe_mutex locked */
620static void __kprobes unoptimize_all_kprobes(void) 622static void __kprobes unoptimize_all_kprobes(void)
621{ 623{
622 struct hlist_head *head; 624 struct hlist_head *head;
@@ -831,6 +833,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
831 833
832void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 834void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
833 struct hlist_head **head, unsigned long *flags) 835 struct hlist_head **head, unsigned long *flags)
836__acquires(hlist_lock)
834{ 837{
835 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 838 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
836 spinlock_t *hlist_lock; 839 spinlock_t *hlist_lock;
@@ -842,6 +845,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
842 845
843static void __kprobes kretprobe_table_lock(unsigned long hash, 846static void __kprobes kretprobe_table_lock(unsigned long hash,
844 unsigned long *flags) 847 unsigned long *flags)
848__acquires(hlist_lock)
845{ 849{
846 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 850 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
847 spin_lock_irqsave(hlist_lock, *flags); 851 spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +853,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
849 853
850void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 854void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
851 unsigned long *flags) 855 unsigned long *flags)
856__releases(hlist_lock)
852{ 857{
853 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 858 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
854 spinlock_t *hlist_lock; 859 spinlock_t *hlist_lock;
@@ -857,7 +862,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
857 spin_unlock_irqrestore(hlist_lock, *flags); 862 spin_unlock_irqrestore(hlist_lock, *flags);
858} 863}
859 864
860void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 865static void __kprobes kretprobe_table_unlock(unsigned long hash,
866 unsigned long *flags)
867__releases(hlist_lock)
861{ 868{
862 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 869 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
863 spin_unlock_irqrestore(hlist_lock, *flags); 870 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1141,7 +1148,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1141 preempt_disable(); 1148 preempt_disable();
1142 if (!kernel_text_address((unsigned long) p->addr) || 1149 if (!kernel_text_address((unsigned long) p->addr) ||
1143 in_kprobes_functions((unsigned long) p->addr) || 1150 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) { 1151 ftrace_text_reserved(p->addr, p->addr) ||
1152 jump_label_text_reserved(p->addr, p->addr)) {
1145 preempt_enable(); 1153 preempt_enable();
1146 return -EINVAL; 1154 return -EINVAL;
1147 } 1155 }
@@ -1339,18 +1347,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1339 if (num <= 0) 1347 if (num <= 0)
1340 return -EINVAL; 1348 return -EINVAL;
1341 for (i = 0; i < num; i++) { 1349 for (i = 0; i < num; i++) {
1342 unsigned long addr; 1350 unsigned long addr, offset;
1343 jp = jps[i]; 1351 jp = jps[i];
1344 addr = arch_deref_entry_point(jp->entry); 1352 addr = arch_deref_entry_point(jp->entry);
1345 1353
1346 if (!kernel_text_address(addr)) 1354 /* Verify probepoint is a function entry point */
1347 ret = -EINVAL; 1355 if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
1348 else { 1356 offset == 0) {
1349 /* Todo: Verify probepoint is a function entry point */
1350 jp->kp.pre_handler = setjmp_pre_handler; 1357 jp->kp.pre_handler = setjmp_pre_handler;
1351 jp->kp.break_handler = longjmp_break_handler; 1358 jp->kp.break_handler = longjmp_break_handler;
1352 ret = register_kprobe(&jp->kp); 1359 ret = register_kprobe(&jp->kp);
1353 } 1360 } else
1361 ret = -EINVAL;
1362
1354 if (ret < 0) { 1363 if (ret < 0) {
1355 if (i > 0) 1364 if (i > 0)
1356 unregister_jprobes(jps, i); 1365 unregister_jprobes(jps, i);
@@ -1992,6 +2001,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1992static const struct file_operations fops_kp = { 2001static const struct file_operations fops_kp = {
1993 .read = read_enabled_file_bool, 2002 .read = read_enabled_file_bool,
1994 .write = write_enabled_file_bool, 2003 .write = write_enabled_file_bool,
2004 .llseek = default_llseek,
1995}; 2005};
1996 2006
1997static int __kprobes debugfs_kprobe_init(void) 2007static int __kprobes debugfs_kprobe_init(void)
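With this hunk, register_jprobes() stops accepting any address that merely lies in kernel text and instead requires, via kallsyms_lookup_size_offset(), that the jprobe entry resolve to a function entry point (offset 0); anything else now fails with -EINVAL. For reference, a conforming user looks roughly like the sketch below, patterned on the usual samples/kprobes jprobe example; the do_fork() symbol and mirror signature are illustrative for kernels of this era and are not part of this patch.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

/* Mirror routine: must match the probed function's signature exactly. */
static long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
		     struct pt_regs *regs, unsigned long stack_size,
		     int __user *parent_tidptr, int __user *child_tidptr)
{
	pr_info("jprobe: clone_flags=0x%lx stack_size=0x%lx\n",
		clone_flags, stack_size);
	jprobe_return();	/* mandatory: hands control back to do_fork() */
	return 0;
}

static struct jprobe my_jprobe = {
	.entry	= jdo_fork,	/* must be a real function entry now */
	.kp	= { .symbol_name = "do_fork" },
};

static int __init jprobe_init(void)
{
	return register_jprobe(&my_jprobe);
}

static void __exit jprobe_exit(void)
{
	unregister_jprobe(&my_jprobe);
}

module_init(jprobe_init);
module_exit(jprobe_exit);
MODULE_LICENSE("GPL");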
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c780175..2dc3786349d1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,8 @@
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
18#include <linux/freezer.h>
17#include <trace/events/sched.h> 19#include <trace/events/sched.h>
18 20
19static DEFINE_SPINLOCK(kthread_create_lock); 21static DEFINE_SPINLOCK(kthread_create_lock);
@@ -35,6 +37,7 @@ struct kthread_create_info
35 37
36struct kthread { 38struct kthread {
37 int should_stop; 39 int should_stop;
40 void *data;
38 struct completion exited; 41 struct completion exited;
39}; 42};
40 43
@@ -54,6 +57,19 @@ int kthread_should_stop(void)
54} 57}
55EXPORT_SYMBOL(kthread_should_stop); 58EXPORT_SYMBOL(kthread_should_stop);
56 59
60/**
61 * kthread_data - return data value specified on kthread creation
62 * @task: kthread task in question
63 *
64 * Return the data value specified when kthread @task was created.
65 * The caller is responsible for ensuring the validity of @task when
66 * calling this function.
67 */
68void *kthread_data(struct task_struct *task)
69{
70 return to_kthread(task)->data;
71}
72
57static int kthread(void *_create) 73static int kthread(void *_create)
58{ 74{
59 /* Copy data: it's on kthread's stack */ 75 /* Copy data: it's on kthread's stack */
@@ -64,6 +80,7 @@ static int kthread(void *_create)
64 int ret; 80 int ret;
65 81
66 self.should_stop = 0; 82 self.should_stop = 0;
83 self.data = data;
67 init_completion(&self.exited); 84 init_completion(&self.exited);
68 current->vfork_done = &self.exited; 85 current->vfork_done = &self.exited;
69 86
@@ -247,3 +264,150 @@ int kthreadd(void *unused)
247 264
248 return 0; 265 return 0;
249} 266}
267
268/**
269 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker
271 *
272 * This function can be used as @threadfn to kthread_create() or
273 * kthread_run() with @worker_ptr argument pointing to an initialized
274 * kthread_worker. The started kthread will process work_list until
275 * the it is stopped with kthread_stop(). A kthread can also call
276 * this function directly after extra initialization.
277 *
278 * Different kthreads can be used for the same kthread_worker as long
279 * as there's only one kthread attached to it at any given time. A
280 * kthread_worker without an attached kthread simply collects queued
281 * kthread_works.
282 */
283int kthread_worker_fn(void *worker_ptr)
284{
285 struct kthread_worker *worker = worker_ptr;
286 struct kthread_work *work;
287
288 WARN_ON(worker->task);
289 worker->task = current;
290repeat:
291 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
292
293 if (kthread_should_stop()) {
294 __set_current_state(TASK_RUNNING);
295 spin_lock_irq(&worker->lock);
296 worker->task = NULL;
297 spin_unlock_irq(&worker->lock);
298 return 0;
299 }
300
301 work = NULL;
302 spin_lock_irq(&worker->lock);
303 if (!list_empty(&worker->work_list)) {
304 work = list_first_entry(&worker->work_list,
305 struct kthread_work, node);
306 list_del_init(&work->node);
307 }
308 spin_unlock_irq(&worker->lock);
309
310 if (work) {
311 __set_current_state(TASK_RUNNING);
312 work->func(work);
313 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
314 work->done_seq = work->queue_seq;
315 smp_mb(); /* mb worker-b1 paired with flush-b0 */
316 if (atomic_read(&work->flushing))
317 wake_up_all(&work->done);
318 } else if (!freezing(current))
319 schedule();
320
321 try_to_freeze();
322 goto repeat;
323}
324EXPORT_SYMBOL_GPL(kthread_worker_fn);
325
326/**
327 * queue_kthread_work - queue a kthread_work
328 * @worker: target kthread_worker
329 * @work: kthread_work to queue
330 *
 331 * Queue @work to the kthread_worker @worker for async execution.
 332 * Returns %true if @work was successfully queued, %false if it was
 333 * already pending.
334 */
335bool queue_kthread_work(struct kthread_worker *worker,
336 struct kthread_work *work)
337{
338 bool ret = false;
339 unsigned long flags;
340
341 spin_lock_irqsave(&worker->lock, flags);
342 if (list_empty(&work->node)) {
343 list_add_tail(&work->node, &worker->work_list);
344 work->queue_seq++;
345 if (likely(worker->task))
346 wake_up_process(worker->task);
347 ret = true;
348 }
349 spin_unlock_irqrestore(&worker->lock, flags);
350 return ret;
351}
352EXPORT_SYMBOL_GPL(queue_kthread_work);
353
354/**
355 * flush_kthread_work - flush a kthread_work
356 * @work: work to flush
357 *
358 * If @work is queued or executing, wait for it to finish execution.
359 */
360void flush_kthread_work(struct kthread_work *work)
361{
362 int seq = work->queue_seq;
363
364 atomic_inc(&work->flushing);
365
366 /*
367 * mb flush-b0 paired with worker-b1, to make sure either
368 * worker sees the above increment or we see done_seq update.
369 */
370 smp_mb__after_atomic_inc();
371
372 /* A - B <= 0 tests whether B is in front of A regardless of overflow */
373 wait_event(work->done, seq - work->done_seq <= 0);
374 atomic_dec(&work->flushing);
375
376 /*
377 * rmb flush-b1 paired with worker-b0, to make sure our caller
378 * sees every change made by work->func().
379 */
380 smp_mb__after_atomic_dec();
381}
382EXPORT_SYMBOL_GPL(flush_kthread_work);
383
384struct kthread_flush_work {
385 struct kthread_work work;
386 struct completion done;
387};
388
389static void kthread_flush_work_fn(struct kthread_work *work)
390{
391 struct kthread_flush_work *fwork =
392 container_of(work, struct kthread_flush_work, work);
393 complete(&fwork->done);
394}
395
396/**
397 * flush_kthread_worker - flush all current works on a kthread_worker
398 * @worker: worker to flush
399 *
400 * Wait until all currently executing or pending works on @worker are
401 * finished.
402 */
403void flush_kthread_worker(struct kthread_worker *worker)
404{
405 struct kthread_flush_work fwork = {
406 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
407 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
408 };
409
410 queue_kthread_work(worker, &fwork.work);
411 wait_for_completion(&fwork.done);
412}
413EXPORT_SYMBOL_GPL(flush_kthread_worker);
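Taken together, kthread_worker_fn(), queue_kthread_work(), flush_kthread_work() and flush_kthread_worker() form a minimal single-thread work queue. A rough usage sketch follows; it assumes the companion declarations added to include/linux/kthread.h in the same series (struct kthread_worker, struct kthread_work, init_kthread_worker(), init_kthread_work()), which are not part of this hunk, and every other name below is made up.

#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/err.h>

struct frob_item {
	struct kthread_work work;
	int payload;			/* hypothetical per-item data */
};

static struct kthread_worker frob_worker;
static struct task_struct *frob_task;
static struct frob_item frob_item;

static void frob_work_fn(struct kthread_work *work)
{
	struct frob_item *item = container_of(work, struct frob_item, work);

	pr_info("frob: handling payload %d\n", item->payload);
}

static int frob_start(void)
{
	init_kthread_worker(&frob_worker);

	/* The worker thread does nothing but run kthread_worker_fn(). */
	frob_task = kthread_run(kthread_worker_fn, &frob_worker, "frobd");
	if (IS_ERR(frob_task))
		return PTR_ERR(frob_task);

	/* Per the hunk above, kthread_data(frob_task) now returns &frob_worker. */

	init_kthread_work(&frob_item.work, frob_work_fn);
	frob_item.payload = 42;
	queue_kthread_work(&frob_worker, &frob_item.work);

	/* Wait for this item only ... */
	flush_kthread_work(&frob_item.work);
	/* ... or for everything queued so far. */
	flush_kthread_worker(&frob_worker);
	return 0;
}

static void frob_stop(void)
{
	flush_kthread_worker(&frob_worker);
	kthread_stop(frob_task);	/* kthread_worker_fn() returns 0 here */
}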
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 54286798c37b..42ba65dff7d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
146 146
147static inline u64 lockstat_clock(void) 147static inline u64 lockstat_clock(void)
148{ 148{
149 return cpu_clock(smp_processor_id()); 149 return local_clock();
150} 150}
151 151
152static int lock_point(unsigned long points[], unsigned long ip) 152static int lock_point(unsigned long points[], unsigned long ip)
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
639 } 639 }
640#endif 640#endif
641 641
642 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
643 debug_locks_off();
644 printk(KERN_ERR
645 "BUG: looking up invalid subclass: %u\n", subclass);
646 printk(KERN_ERR
647 "turning off the locking correctness validator.\n");
648 dump_stack();
649 return NULL;
650 }
651
642 /* 652 /*
643 * Static locks do not have their class-keys yet - for them the key 653 * Static locks do not have their class-keys yet - for them the key
644 * is the lock object itself: 654 * is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
774 raw_local_irq_restore(flags); 784 raw_local_irq_restore(flags);
775 785
776 if (!subclass || force) 786 if (!subclass || force)
777 lock->class_cache = class; 787 lock->class_cache[0] = class;
788 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
789 lock->class_cache[subclass] = class;
778 790
779 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 791 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
780 return NULL; 792 return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2679void lockdep_init_map(struct lockdep_map *lock, const char *name, 2691void lockdep_init_map(struct lockdep_map *lock, const char *name,
2680 struct lock_class_key *key, int subclass) 2692 struct lock_class_key *key, int subclass)
2681{ 2693{
2682 lock->class_cache = NULL; 2694 int i;
2695
2696 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2697 lock->class_cache[i] = NULL;
2698
2683#ifdef CONFIG_LOCK_STAT 2699#ifdef CONFIG_LOCK_STAT
2684 lock->cpu = raw_smp_processor_id(); 2700 lock->cpu = raw_smp_processor_id();
2685#endif 2701#endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2739 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2755 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2740 return 0; 2756 return 0;
2741 2757
2742 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
2743 debug_locks_off();
2744 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2745 printk("turning off the locking correctness validator.\n");
2746 dump_stack();
2747 return 0;
2748 }
2749
2750 if (lock->key == &__lockdep_no_validate__) 2758 if (lock->key == &__lockdep_no_validate__)
2751 check = 1; 2759 check = 1;
2752 2760
2753 if (!subclass) 2761 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
2754 class = lock->class_cache; 2762 class = lock->class_cache[subclass];
2755 /* 2763 /*
2756 * Not cached yet or subclass? 2764 * Not cached?
2757 */ 2765 */
2758 if (unlikely(!class)) { 2766 if (unlikely(!class)) {
2759 class = register_lock_class(lock, subclass, 0); 2767 class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2918 return 1; 2926 return 1;
2919 2927
2920 if (hlock->references) { 2928 if (hlock->references) {
2921 struct lock_class *class = lock->class_cache; 2929 struct lock_class *class = lock->class_cache[0];
2922 2930
2923 if (!class) 2931 if (!class)
2924 class = look_up_lock_class(lock, 0); 2932 class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3559 if (list_empty(head)) 3567 if (list_empty(head))
3560 continue; 3568 continue;
3561 list_for_each_entry_safe(class, next, head, hash_entry) { 3569 list_for_each_entry_safe(class, next, head, hash_entry) {
3562 if (unlikely(class == lock->class_cache)) { 3570 int match = 0;
3571
3572 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
3573 match |= class == lock->class_cache[j];
3574
3575 if (unlikely(match)) {
3563 if (debug_locks_off_graph_unlock()) 3576 if (debug_locks_off_graph_unlock())
3564 WARN_ON(1); 3577 WARN_ON(1);
3565 goto out_restore; 3578 goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
3775 * Careful: only use this function if you are sure that 3788 * Careful: only use this function if you are sure that
3776 * the task cannot run in parallel! 3789 * the task cannot run in parallel!
3777 */ 3790 */
3778void __debug_show_held_locks(struct task_struct *task) 3791void debug_show_held_locks(struct task_struct *task)
3779{ 3792{
3780 if (unlikely(!debug_locks)) { 3793 if (unlikely(!debug_locks)) {
3781 printk("INFO: lockdep is turned off.\n"); 3794 printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
3783 } 3796 }
3784 lockdep_print_held_locks(task); 3797 lockdep_print_held_locks(task);
3785} 3798}
3786EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3787
3788void debug_show_held_locks(struct task_struct *task)
3789{
3790 __debug_show_held_locks(task);
3791}
3792EXPORT_SYMBOL_GPL(debug_show_held_locks); 3799EXPORT_SYMBOL_GPL(debug_show_held_locks);
3793 3800
3794void lockdep_sys_exit(void) 3801void lockdep_sys_exit(void)
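The class_cache[] change matters for the _nested() lock APIs: each distinct subclass maps to its own lock class, and with a single cache slot every acquisition with subclass > 0 missed the cache and fell back to register_lock_class(). A minimal sketch of the pattern that now hits class_cache[1]; the struct and function names are invented, and real code would also order the two locks (e.g. by address) to avoid ABBA deadlocks.

#include <linux/spinlock.h>

struct frob {
	spinlock_t lock;	/* both instances share one lock class */
	int value;
};

static void frob_transfer(struct frob *from, struct frob *to, int amount)
{
	spin_lock(&from->lock);			/* subclass 0 -> class_cache[0] */
	spin_lock_nested(&to->lock, SINGLE_DEPTH_NESTING);
						/* subclass 1 -> class_cache[1] */
	from->value -= amount;
	to->value += amount;
	spin_unlock(&to->lock);
	spin_unlock(&from->lock);
}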
diff --git a/kernel/module.c b/kernel/module.c
index 6c562828c85c..437a74a7524a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,6 +1,6 @@
1/* 1/*
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU General Public License as published by
@@ -55,6 +55,7 @@
55#include <linux/async.h> 55#include <linux/async.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h>
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 61#include <trace/events/module.h>
@@ -110,6 +111,20 @@ int unregister_module_notifier(struct notifier_block * nb)
110} 111}
111EXPORT_SYMBOL(unregister_module_notifier); 112EXPORT_SYMBOL(unregister_module_notifier);
112 113
114struct load_info {
115 Elf_Ehdr *hdr;
116 unsigned long len;
117 Elf_Shdr *sechdrs;
118 char *secstrings, *strtab;
119 unsigned long *strmap;
120 unsigned long symoffs, stroffs;
121 struct _ddebug *debug;
122 unsigned int num_debug;
123 struct {
124 unsigned int sym, str, mod, vers, info, pcpu;
125 } index;
126};
127
113/* We require a truly strong try_module_get(): 0 means failure due to 128/* We require a truly strong try_module_get(): 0 means failure due to
114 ongoing or failed initialization etc. */ 129 ongoing or failed initialization etc. */
115static inline int strong_try_module_get(struct module *mod) 130static inline int strong_try_module_get(struct module *mod)
@@ -140,42 +155,38 @@ void __module_put_and_exit(struct module *mod, long code)
140EXPORT_SYMBOL(__module_put_and_exit); 155EXPORT_SYMBOL(__module_put_and_exit);
141 156
142/* Find a module section: 0 means not found. */ 157/* Find a module section: 0 means not found. */
143static unsigned int find_sec(Elf_Ehdr *hdr, 158static unsigned int find_sec(const struct load_info *info, const char *name)
144 Elf_Shdr *sechdrs,
145 const char *secstrings,
146 const char *name)
147{ 159{
148 unsigned int i; 160 unsigned int i;
149 161
150 for (i = 1; i < hdr->e_shnum; i++) 162 for (i = 1; i < info->hdr->e_shnum; i++) {
163 Elf_Shdr *shdr = &info->sechdrs[i];
151 /* Alloc bit cleared means "ignore it." */ 164 /* Alloc bit cleared means "ignore it." */
152 if ((sechdrs[i].sh_flags & SHF_ALLOC) 165 if ((shdr->sh_flags & SHF_ALLOC)
153 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) 166 && strcmp(info->secstrings + shdr->sh_name, name) == 0)
154 return i; 167 return i;
168 }
155 return 0; 169 return 0;
156} 170}
157 171
158/* Find a module section, or NULL. */ 172/* Find a module section, or NULL. */
159static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, 173static void *section_addr(const struct load_info *info, const char *name)
160 const char *secstrings, const char *name)
161{ 174{
162 /* Section 0 has sh_addr 0. */ 175 /* Section 0 has sh_addr 0. */
163 return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; 176 return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
164} 177}
165 178
166/* Find a module section, or NULL. Fill in number of "objects" in section. */ 179/* Find a module section, or NULL. Fill in number of "objects" in section. */
167static void *section_objs(Elf_Ehdr *hdr, 180static void *section_objs(const struct load_info *info,
168 Elf_Shdr *sechdrs,
169 const char *secstrings,
170 const char *name, 181 const char *name,
171 size_t object_size, 182 size_t object_size,
172 unsigned int *num) 183 unsigned int *num)
173{ 184{
174 unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); 185 unsigned int sec = find_sec(info, name);
175 186
176 /* Section 0 has sh_addr 0 and sh_size 0. */ 187 /* Section 0 has sh_addr 0 and sh_size 0. */
177 *num = sechdrs[sec].sh_size / object_size; 188 *num = info->sechdrs[sec].sh_size / object_size;
178 return (void *)sechdrs[sec].sh_addr; 189 return (void *)info->sechdrs[sec].sh_addr;
179} 190}
180 191
181/* Provided by the linker */ 192/* Provided by the linker */
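The practical effect of struct load_info shows up in the helpers just above: find_sec(), section_addr() and section_objs() drop from four-plus parameters to an (info, name) pair, and every later consumer can follow suit. As a sketch of what that buys, a new-style helper needs nothing beyond the info pointer; module_has_section() below is hypothetical and not part of this patch.

/* Hypothetical helper in the post-refactor style. */
static bool module_has_section(const struct load_info *info, const char *name)
{
	/* find_sec() returns 0 for "not found"; section 0 is never a match. */
	return find_sec(info, name) != 0;
}

A caller inside load_module() could then write, say, module_has_section(info, ".data..percpu") the same way find_pcpusec() already does.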
@@ -227,7 +238,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
227 unsigned int symnum, void *data), void *data) 238 unsigned int symnum, void *data), void *data)
228{ 239{
229 struct module *mod; 240 struct module *mod;
230 const struct symsearch arr[] = { 241 static const struct symsearch arr[] = {
231 { __start___ksymtab, __stop___ksymtab, __start___kcrctab, 242 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
232 NOT_GPL_ONLY, false }, 243 NOT_GPL_ONLY, false },
233 { __start___ksymtab_gpl, __stop___ksymtab_gpl, 244 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
@@ -392,7 +403,8 @@ static int percpu_modalloc(struct module *mod,
392 mod->percpu = __alloc_reserved_percpu(size, align); 403 mod->percpu = __alloc_reserved_percpu(size, align);
393 if (!mod->percpu) { 404 if (!mod->percpu) {
394 printk(KERN_WARNING 405 printk(KERN_WARNING
395 "Could not allocate %lu bytes percpu data\n", size); 406 "%s: Could not allocate %lu bytes percpu data\n",
407 mod->name, size);
396 return -ENOMEM; 408 return -ENOMEM;
397 } 409 }
398 mod->percpu_size = size; 410 mod->percpu_size = size;
@@ -404,11 +416,9 @@ static void percpu_modfree(struct module *mod)
404 free_percpu(mod->percpu); 416 free_percpu(mod->percpu);
405} 417}
406 418
407static unsigned int find_pcpusec(Elf_Ehdr *hdr, 419static unsigned int find_pcpusec(struct load_info *info)
408 Elf_Shdr *sechdrs,
409 const char *secstrings)
410{ 420{
411 return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); 421 return find_sec(info, ".data..percpu");
412} 422}
413 423
414static void percpu_modcopy(struct module *mod, 424static void percpu_modcopy(struct module *mod,
@@ -468,9 +478,7 @@ static inline int percpu_modalloc(struct module *mod,
468static inline void percpu_modfree(struct module *mod) 478static inline void percpu_modfree(struct module *mod)
469{ 479{
470} 480}
471static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 481static unsigned int find_pcpusec(struct load_info *info)
472 Elf_Shdr *sechdrs,
473 const char *secstrings)
474{ 482{
475 return 0; 483 return 0;
476} 484}
@@ -524,21 +532,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
524EXPORT_TRACEPOINT_SYMBOL(module_get); 532EXPORT_TRACEPOINT_SYMBOL(module_get);
525 533
526/* Init the unload section of the module. */ 534/* Init the unload section of the module. */
527static void module_unload_init(struct module *mod) 535static int module_unload_init(struct module *mod)
528{ 536{
529 int cpu; 537 mod->refptr = alloc_percpu(struct module_ref);
538 if (!mod->refptr)
539 return -ENOMEM;
530 540
531 INIT_LIST_HEAD(&mod->source_list); 541 INIT_LIST_HEAD(&mod->source_list);
532 INIT_LIST_HEAD(&mod->target_list); 542 INIT_LIST_HEAD(&mod->target_list);
533 for_each_possible_cpu(cpu) {
534 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
535 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
536 }
537 543
538 /* Hold reference count during initialization. */ 544 /* Hold reference count during initialization. */
539 __this_cpu_write(mod->refptr->incs, 1); 545 __this_cpu_write(mod->refptr->incs, 1);
540 /* Backwards compatibility macros put refcount during init. */ 546 /* Backwards compatibility macros put refcount during init. */
541 mod->waiter = current; 547 mod->waiter = current;
548
549 return 0;
542} 550}
543 551
544/* Does a already use b? */ 552/* Does a already use b? */
@@ -618,6 +626,8 @@ static void module_unload_free(struct module *mod)
618 kfree(use); 626 kfree(use);
619 } 627 }
620 mutex_unlock(&module_mutex); 628 mutex_unlock(&module_mutex);
629
630 free_percpu(mod->refptr);
621} 631}
622 632
623#ifdef CONFIG_MODULE_FORCE_UNLOAD 633#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -891,8 +901,9 @@ int ref_module(struct module *a, struct module *b)
891} 901}
892EXPORT_SYMBOL_GPL(ref_module); 902EXPORT_SYMBOL_GPL(ref_module);
893 903
894static inline void module_unload_init(struct module *mod) 904static inline int module_unload_init(struct module *mod)
895{ 905{
906 return 0;
896} 907}
897#endif /* CONFIG_MODULE_UNLOAD */ 908#endif /* CONFIG_MODULE_UNLOAD */
898 909
@@ -1051,10 +1062,9 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1051#endif /* CONFIG_MODVERSIONS */ 1062#endif /* CONFIG_MODVERSIONS */
1052 1063
1053/* Resolve a symbol for this module. I.e. if we find one, record usage. */ 1064/* Resolve a symbol for this module. I.e. if we find one, record usage. */
1054static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, 1065static const struct kernel_symbol *resolve_symbol(struct module *mod,
1055 unsigned int versindex, 1066 const struct load_info *info,
1056 const char *name, 1067 const char *name,
1057 struct module *mod,
1058 char ownername[]) 1068 char ownername[])
1059{ 1069{
1060 struct module *owner; 1070 struct module *owner;
@@ -1068,7 +1078,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1068 if (!sym) 1078 if (!sym)
1069 goto unlock; 1079 goto unlock;
1070 1080
1071 if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { 1081 if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
1082 owner)) {
1072 sym = ERR_PTR(-EINVAL); 1083 sym = ERR_PTR(-EINVAL);
1073 goto getname; 1084 goto getname;
1074 } 1085 }
@@ -1087,21 +1098,20 @@ unlock:
1087 return sym; 1098 return sym;
1088} 1099}
1089 1100
1090static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, 1101static const struct kernel_symbol *
1091 unsigned int versindex, 1102resolve_symbol_wait(struct module *mod,
1092 const char *name, 1103 const struct load_info *info,
1093 struct module *mod) 1104 const char *name)
1094{ 1105{
1095 const struct kernel_symbol *ksym; 1106 const struct kernel_symbol *ksym;
1096 char ownername[MODULE_NAME_LEN]; 1107 char owner[MODULE_NAME_LEN];
1097 1108
1098 if (wait_event_interruptible_timeout(module_wq, 1109 if (wait_event_interruptible_timeout(module_wq,
1099 !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, 1110 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
1100 mod, ownername)) || 1111 || PTR_ERR(ksym) != -EBUSY,
1101 PTR_ERR(ksym) != -EBUSY,
1102 30 * HZ) <= 0) { 1112 30 * HZ) <= 0) {
1103 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", 1113 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1104 mod->name, ownername); 1114 mod->name, owner);
1105 } 1115 }
1106 return ksym; 1116 return ksym;
1107} 1117}
@@ -1110,8 +1120,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
1110 * /sys/module/foo/sections stuff 1120 * /sys/module/foo/sections stuff
1111 * J. Corbet <corbet@lwn.net> 1121 * J. Corbet <corbet@lwn.net>
1112 */ 1122 */
1113#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1123#ifdef CONFIG_SYSFS
1114 1124
1125#ifdef CONFIG_KALLSYMS
1115static inline bool sect_empty(const Elf_Shdr *sect) 1126static inline bool sect_empty(const Elf_Shdr *sect)
1116{ 1127{
1117 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; 1128 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
@@ -1148,8 +1159,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
1148 kfree(sect_attrs); 1159 kfree(sect_attrs);
1149} 1160}
1150 1161
1151static void add_sect_attrs(struct module *mod, unsigned int nsect, 1162static void add_sect_attrs(struct module *mod, const struct load_info *info)
1152 char *secstrings, Elf_Shdr *sechdrs)
1153{ 1163{
1154 unsigned int nloaded = 0, i, size[2]; 1164 unsigned int nloaded = 0, i, size[2];
1155 struct module_sect_attrs *sect_attrs; 1165 struct module_sect_attrs *sect_attrs;
@@ -1157,8 +1167,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1157 struct attribute **gattr; 1167 struct attribute **gattr;
1158 1168
1159 /* Count loaded sections and allocate structures */ 1169 /* Count loaded sections and allocate structures */
1160 for (i = 0; i < nsect; i++) 1170 for (i = 0; i < info->hdr->e_shnum; i++)
1161 if (!sect_empty(&sechdrs[i])) 1171 if (!sect_empty(&info->sechdrs[i]))
1162 nloaded++; 1172 nloaded++;
1163 size[0] = ALIGN(sizeof(*sect_attrs) 1173 size[0] = ALIGN(sizeof(*sect_attrs)
1164 + nloaded * sizeof(sect_attrs->attrs[0]), 1174 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1175,11 +1185,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1175 sect_attrs->nsections = 0; 1185 sect_attrs->nsections = 0;
1176 sattr = &sect_attrs->attrs[0]; 1186 sattr = &sect_attrs->attrs[0];
1177 gattr = &sect_attrs->grp.attrs[0]; 1187 gattr = &sect_attrs->grp.attrs[0];
1178 for (i = 0; i < nsect; i++) { 1188 for (i = 0; i < info->hdr->e_shnum; i++) {
1179 if (sect_empty(&sechdrs[i])) 1189 Elf_Shdr *sec = &info->sechdrs[i];
1190 if (sect_empty(sec))
1180 continue; 1191 continue;
1181 sattr->address = sechdrs[i].sh_addr; 1192 sattr->address = sec->sh_addr;
1182 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1193 sattr->name = kstrdup(info->secstrings + sec->sh_name,
1183 GFP_KERNEL); 1194 GFP_KERNEL);
1184 if (sattr->name == NULL) 1195 if (sattr->name == NULL)
1185 goto out; 1196 goto out;
@@ -1247,8 +1258,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1247 kfree(notes_attrs); 1258 kfree(notes_attrs);
1248} 1259}
1249 1260
1250static void add_notes_attrs(struct module *mod, unsigned int nsect, 1261static void add_notes_attrs(struct module *mod, const struct load_info *info)
1251 char *secstrings, Elf_Shdr *sechdrs)
1252{ 1262{
1253 unsigned int notes, loaded, i; 1263 unsigned int notes, loaded, i;
1254 struct module_notes_attrs *notes_attrs; 1264 struct module_notes_attrs *notes_attrs;
@@ -1260,9 +1270,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1260 1270
1261 /* Count notes sections and allocate structures. */ 1271 /* Count notes sections and allocate structures. */
1262 notes = 0; 1272 notes = 0;
1263 for (i = 0; i < nsect; i++) 1273 for (i = 0; i < info->hdr->e_shnum; i++)
1264 if (!sect_empty(&sechdrs[i]) && 1274 if (!sect_empty(&info->sechdrs[i]) &&
1265 (sechdrs[i].sh_type == SHT_NOTE)) 1275 (info->sechdrs[i].sh_type == SHT_NOTE))
1266 ++notes; 1276 ++notes;
1267 1277
1268 if (notes == 0) 1278 if (notes == 0)
@@ -1276,15 +1286,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1276 1286
1277 notes_attrs->notes = notes; 1287 notes_attrs->notes = notes;
1278 nattr = &notes_attrs->attrs[0]; 1288 nattr = &notes_attrs->attrs[0];
1279 for (loaded = i = 0; i < nsect; ++i) { 1289 for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
1280 if (sect_empty(&sechdrs[i])) 1290 if (sect_empty(&info->sechdrs[i]))
1281 continue; 1291 continue;
1282 if (sechdrs[i].sh_type == SHT_NOTE) { 1292 if (info->sechdrs[i].sh_type == SHT_NOTE) {
1283 sysfs_bin_attr_init(nattr); 1293 sysfs_bin_attr_init(nattr);
1284 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1294 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1285 nattr->attr.mode = S_IRUGO; 1295 nattr->attr.mode = S_IRUGO;
1286 nattr->size = sechdrs[i].sh_size; 1296 nattr->size = info->sechdrs[i].sh_size;
1287 nattr->private = (void *) sechdrs[i].sh_addr; 1297 nattr->private = (void *) info->sechdrs[i].sh_addr;
1288 nattr->read = module_notes_read; 1298 nattr->read = module_notes_read;
1289 ++nattr; 1299 ++nattr;
1290 } 1300 }
@@ -1315,8 +1325,8 @@ static void remove_notes_attrs(struct module *mod)
1315 1325
1316#else 1326#else
1317 1327
1318static inline void add_sect_attrs(struct module *mod, unsigned int nsect, 1328static inline void add_sect_attrs(struct module *mod,
1319 char *sectstrings, Elf_Shdr *sechdrs) 1329 const struct load_info *info)
1320{ 1330{
1321} 1331}
1322 1332
@@ -1324,17 +1334,16 @@ static inline void remove_sect_attrs(struct module *mod)
1324{ 1334{
1325} 1335}
1326 1336
1327static inline void add_notes_attrs(struct module *mod, unsigned int nsect, 1337static inline void add_notes_attrs(struct module *mod,
1328 char *sectstrings, Elf_Shdr *sechdrs) 1338 const struct load_info *info)
1329{ 1339{
1330} 1340}
1331 1341
1332static inline void remove_notes_attrs(struct module *mod) 1342static inline void remove_notes_attrs(struct module *mod)
1333{ 1343{
1334} 1344}
1335#endif 1345#endif /* CONFIG_KALLSYMS */
1336 1346
1337#ifdef CONFIG_SYSFS
1338static void add_usage_links(struct module *mod) 1347static void add_usage_links(struct module *mod)
1339{ 1348{
1340#ifdef CONFIG_MODULE_UNLOAD 1349#ifdef CONFIG_MODULE_UNLOAD
@@ -1439,6 +1448,7 @@ out:
1439} 1448}
1440 1449
1441static int mod_sysfs_setup(struct module *mod, 1450static int mod_sysfs_setup(struct module *mod,
1451 const struct load_info *info,
1442 struct kernel_param *kparam, 1452 struct kernel_param *kparam,
1443 unsigned int num_params) 1453 unsigned int num_params)
1444{ 1454{
@@ -1463,6 +1473,8 @@ static int mod_sysfs_setup(struct module *mod,
1463 goto out_unreg_param; 1473 goto out_unreg_param;
1464 1474
1465 add_usage_links(mod); 1475 add_usage_links(mod);
1476 add_sect_attrs(mod, info);
1477 add_notes_attrs(mod, info);
1466 1478
1467 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1479 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1468 return 0; 1480 return 0;
@@ -1479,33 +1491,26 @@ out:
1479 1491
1480static void mod_sysfs_fini(struct module *mod) 1492static void mod_sysfs_fini(struct module *mod)
1481{ 1493{
1494 remove_notes_attrs(mod);
1495 remove_sect_attrs(mod);
1482 kobject_put(&mod->mkobj.kobj); 1496 kobject_put(&mod->mkobj.kobj);
1483} 1497}
1484 1498
1485#else /* CONFIG_SYSFS */ 1499#else /* !CONFIG_SYSFS */
1486
1487static inline int mod_sysfs_init(struct module *mod)
1488{
1489 return 0;
1490}
1491 1500
1492static inline int mod_sysfs_setup(struct module *mod, 1501static int mod_sysfs_setup(struct module *mod,
1502 const struct load_info *info,
1493 struct kernel_param *kparam, 1503 struct kernel_param *kparam,
1494 unsigned int num_params) 1504 unsigned int num_params)
1495{ 1505{
1496 return 0; 1506 return 0;
1497} 1507}
1498 1508
1499static inline int module_add_modinfo_attrs(struct module *mod) 1509static void mod_sysfs_fini(struct module *mod)
1500{
1501 return 0;
1502}
1503
1504static inline void module_remove_modinfo_attrs(struct module *mod)
1505{ 1510{
1506} 1511}
1507 1512
1508static void mod_sysfs_fini(struct module *mod) 1513static void module_remove_modinfo_attrs(struct module *mod)
1509{ 1514{
1510} 1515}
1511 1516
@@ -1515,7 +1520,7 @@ static void del_usage_links(struct module *mod)
1515 1520
1516#endif /* CONFIG_SYSFS */ 1521#endif /* CONFIG_SYSFS */
1517 1522
1518static void mod_kobject_remove(struct module *mod) 1523static void mod_sysfs_teardown(struct module *mod)
1519{ 1524{
1520 del_usage_links(mod); 1525 del_usage_links(mod);
1521 module_remove_modinfo_attrs(mod); 1526 module_remove_modinfo_attrs(mod);
@@ -1533,6 +1538,7 @@ static int __unlink_module(void *_mod)
1533{ 1538{
1534 struct module *mod = _mod; 1539 struct module *mod = _mod;
1535 list_del(&mod->list); 1540 list_del(&mod->list);
1541 module_bug_cleanup(mod);
1536 return 0; 1542 return 0;
1537} 1543}
1538 1544
@@ -1545,9 +1551,7 @@ static void free_module(struct module *mod)
1545 mutex_lock(&module_mutex); 1551 mutex_lock(&module_mutex);
1546 stop_machine(__unlink_module, mod, NULL); 1552 stop_machine(__unlink_module, mod, NULL);
1547 mutex_unlock(&module_mutex); 1553 mutex_unlock(&module_mutex);
1548 remove_notes_attrs(mod); 1554 mod_sysfs_teardown(mod);
1549 remove_sect_attrs(mod);
1550 mod_kobject_remove(mod);
1551 1555
1552 /* Remove dynamic debug info */ 1556 /* Remove dynamic debug info */
1553 ddebug_remove_module(mod->name); 1557 ddebug_remove_module(mod->name);
@@ -1565,10 +1569,7 @@ static void free_module(struct module *mod)
1565 module_free(mod, mod->module_init); 1569 module_free(mod, mod->module_init);
1566 kfree(mod->args); 1570 kfree(mod->args);
1567 percpu_modfree(mod); 1571 percpu_modfree(mod);
1568#if defined(CONFIG_MODULE_UNLOAD) 1572
1569 if (mod->refptr)
1570 free_percpu(mod->refptr);
1571#endif
1572 /* Free lock-classes: */ 1573 /* Free lock-classes: */
1573 lockdep_free_key_range(mod->module_core, mod->core_size); 1574 lockdep_free_key_range(mod->module_core, mod->core_size);
1574 1575
@@ -1634,25 +1635,23 @@ static int verify_export_symbols(struct module *mod)
1634} 1635}
1635 1636
1636/* Change all symbols so that st_value encodes the pointer directly. */ 1637/* Change all symbols so that st_value encodes the pointer directly. */
1637static int simplify_symbols(Elf_Shdr *sechdrs, 1638static int simplify_symbols(struct module *mod, const struct load_info *info)
1638 unsigned int symindex, 1639{
1639 const char *strtab, 1640 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
1640 unsigned int versindex, 1641 Elf_Sym *sym = (void *)symsec->sh_addr;
1641 unsigned int pcpuindex,
1642 struct module *mod)
1643{
1644 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
1645 unsigned long secbase; 1642 unsigned long secbase;
1646 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1643 unsigned int i;
1647 int ret = 0; 1644 int ret = 0;
1648 const struct kernel_symbol *ksym; 1645 const struct kernel_symbol *ksym;
1649 1646
1650 for (i = 1; i < n; i++) { 1647 for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
1648 const char *name = info->strtab + sym[i].st_name;
1649
1651 switch (sym[i].st_shndx) { 1650 switch (sym[i].st_shndx) {
1652 case SHN_COMMON: 1651 case SHN_COMMON:
1653 /* We compiled with -fno-common. These are not 1652 /* We compiled with -fno-common. These are not
1654 supposed to happen. */ 1653 supposed to happen. */
1655 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); 1654 DEBUGP("Common symbol: %s\n", name);
1656 printk("%s: please compile with -fno-common\n", 1655 printk("%s: please compile with -fno-common\n",
1657 mod->name); 1656 mod->name);
1658 ret = -ENOEXEC; 1657 ret = -ENOEXEC;
@@ -1665,9 +1664,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1665 break; 1664 break;
1666 1665
1667 case SHN_UNDEF: 1666 case SHN_UNDEF:
1668 ksym = resolve_symbol_wait(sechdrs, versindex, 1667 ksym = resolve_symbol_wait(mod, info, name);
1669 strtab + sym[i].st_name,
1670 mod);
1671 /* Ok if resolved. */ 1668 /* Ok if resolved. */
1672 if (ksym && !IS_ERR(ksym)) { 1669 if (ksym && !IS_ERR(ksym)) {
1673 sym[i].st_value = ksym->value; 1670 sym[i].st_value = ksym->value;
@@ -1679,17 +1676,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1679 break; 1676 break;
1680 1677
1681 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", 1678 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1682 mod->name, strtab + sym[i].st_name, 1679 mod->name, name, PTR_ERR(ksym));
1683 PTR_ERR(ksym));
1684 ret = PTR_ERR(ksym) ?: -ENOENT; 1680 ret = PTR_ERR(ksym) ?: -ENOENT;
1685 break; 1681 break;
1686 1682
1687 default: 1683 default:
1688 /* Divert to percpu allocation if a percpu var. */ 1684 /* Divert to percpu allocation if a percpu var. */
1689 if (sym[i].st_shndx == pcpuindex) 1685 if (sym[i].st_shndx == info->index.pcpu)
1690 secbase = (unsigned long)mod_percpu(mod); 1686 secbase = (unsigned long)mod_percpu(mod);
1691 else 1687 else
1692 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1688 secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
1693 sym[i].st_value += secbase; 1689 sym[i].st_value += secbase;
1694 break; 1690 break;
1695 } 1691 }
@@ -1698,6 +1694,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1698 return ret; 1694 return ret;
1699} 1695}
1700 1696
1697static int apply_relocations(struct module *mod, const struct load_info *info)
1698{
1699 unsigned int i;
1700 int err = 0;
1701
1702 /* Now do relocations. */
1703 for (i = 1; i < info->hdr->e_shnum; i++) {
1704 unsigned int infosec = info->sechdrs[i].sh_info;
1705
1706 /* Not a valid relocation section? */
1707 if (infosec >= info->hdr->e_shnum)
1708 continue;
1709
1710 /* Don't bother with non-allocated sections */
1711 if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
1712 continue;
1713
1714 if (info->sechdrs[i].sh_type == SHT_REL)
1715 err = apply_relocate(info->sechdrs, info->strtab,
1716 info->index.sym, i, mod);
1717 else if (info->sechdrs[i].sh_type == SHT_RELA)
1718 err = apply_relocate_add(info->sechdrs, info->strtab,
1719 info->index.sym, i, mod);
1720 if (err < 0)
1721 break;
1722 }
1723 return err;
1724}
1725
1701/* Additional bytes needed by arch in front of individual sections */ 1726/* Additional bytes needed by arch in front of individual sections */
1702unsigned int __weak arch_mod_section_prepend(struct module *mod, 1727unsigned int __weak arch_mod_section_prepend(struct module *mod,
1703 unsigned int section) 1728 unsigned int section)
@@ -1722,10 +1747,7 @@ static long get_offset(struct module *mod, unsigned int *size,
1722 might -- code, read-only data, read-write data, small data. Tally 1747 might -- code, read-only data, read-write data, small data. Tally
1723 sizes, and place the offsets into sh_entsize fields: high bit means it 1748 sizes, and place the offsets into sh_entsize fields: high bit means it
1724 belongs in init. */ 1749 belongs in init. */
1725static void layout_sections(struct module *mod, 1750static void layout_sections(struct module *mod, struct load_info *info)
1726 const Elf_Ehdr *hdr,
1727 Elf_Shdr *sechdrs,
1728 const char *secstrings)
1729{ 1751{
1730 static unsigned long const masks[][2] = { 1752 static unsigned long const masks[][2] = {
1731 /* NOTE: all executable code must be the first section 1753 /* NOTE: all executable code must be the first section
@@ -1738,21 +1760,22 @@ static void layout_sections(struct module *mod,
1738 }; 1760 };
1739 unsigned int m, i; 1761 unsigned int m, i;
1740 1762
1741 for (i = 0; i < hdr->e_shnum; i++) 1763 for (i = 0; i < info->hdr->e_shnum; i++)
1742 sechdrs[i].sh_entsize = ~0UL; 1764 info->sechdrs[i].sh_entsize = ~0UL;
1743 1765
1744 DEBUGP("Core section allocation order:\n"); 1766 DEBUGP("Core section allocation order:\n");
1745 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1767 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1746 for (i = 0; i < hdr->e_shnum; ++i) { 1768 for (i = 0; i < info->hdr->e_shnum; ++i) {
1747 Elf_Shdr *s = &sechdrs[i]; 1769 Elf_Shdr *s = &info->sechdrs[i];
1770 const char *sname = info->secstrings + s->sh_name;
1748 1771
1749 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1772 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1750 || (s->sh_flags & masks[m][1]) 1773 || (s->sh_flags & masks[m][1])
1751 || s->sh_entsize != ~0UL 1774 || s->sh_entsize != ~0UL
1752 || strstarts(secstrings + s->sh_name, ".init")) 1775 || strstarts(sname, ".init"))
1753 continue; 1776 continue;
1754 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1777 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
 1755 DEBUGP("\t%s\n", secstrings + s->sh_name); 1778 DEBUGP("\t%s\n", sname);
1756 } 1779 }
1757 if (m == 0) 1780 if (m == 0)
1758 mod->core_text_size = mod->core_size; 1781 mod->core_text_size = mod->core_size;
@@ -1760,17 +1783,18 @@ static void layout_sections(struct module *mod,
1760 1783
1761 DEBUGP("Init section allocation order:\n"); 1784 DEBUGP("Init section allocation order:\n");
1762 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1785 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1763 for (i = 0; i < hdr->e_shnum; ++i) { 1786 for (i = 0; i < info->hdr->e_shnum; ++i) {
1764 Elf_Shdr *s = &sechdrs[i]; 1787 Elf_Shdr *s = &info->sechdrs[i];
1788 const char *sname = info->secstrings + s->sh_name;
1765 1789
1766 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1790 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1767 || (s->sh_flags & masks[m][1]) 1791 || (s->sh_flags & masks[m][1])
1768 || s->sh_entsize != ~0UL 1792 || s->sh_entsize != ~0UL
1769 || !strstarts(secstrings + s->sh_name, ".init")) 1793 || !strstarts(sname, ".init"))
1770 continue; 1794 continue;
1771 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1795 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1772 | INIT_OFFSET_MASK); 1796 | INIT_OFFSET_MASK);
1773 DEBUGP("\t%s\n", secstrings + s->sh_name); 1797 DEBUGP("\t%s\n", sname);
1774 } 1798 }
1775 if (m == 0) 1799 if (m == 0)
1776 mod->init_text_size = mod->init_size; 1800 mod->init_text_size = mod->init_size;
@@ -1809,33 +1833,28 @@ static char *next_string(char *string, unsigned long *secsize)
1809 return string; 1833 return string;
1810} 1834}
1811 1835
1812static char *get_modinfo(Elf_Shdr *sechdrs, 1836static char *get_modinfo(struct load_info *info, const char *tag)
1813 unsigned int info,
1814 const char *tag)
1815{ 1837{
1816 char *p; 1838 char *p;
1817 unsigned int taglen = strlen(tag); 1839 unsigned int taglen = strlen(tag);
1818 unsigned long size = sechdrs[info].sh_size; 1840 Elf_Shdr *infosec = &info->sechdrs[info->index.info];
1841 unsigned long size = infosec->sh_size;
1819 1842
1820 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { 1843 for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
1821 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') 1844 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
1822 return p + taglen + 1; 1845 return p + taglen + 1;
1823 } 1846 }
1824 return NULL; 1847 return NULL;
1825} 1848}
1826 1849
1827static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, 1850static void setup_modinfo(struct module *mod, struct load_info *info)
1828 unsigned int infoindex)
1829{ 1851{
1830 struct module_attribute *attr; 1852 struct module_attribute *attr;
1831 int i; 1853 int i;
1832 1854
1833 for (i = 0; (attr = modinfo_attrs[i]); i++) { 1855 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1834 if (attr->setup) 1856 if (attr->setup)
1835 attr->setup(mod, 1857 attr->setup(mod, get_modinfo(info, attr->attr.name));
1836 get_modinfo(sechdrs,
1837 infoindex,
1838 attr->attr.name));
1839 } 1858 }
1840} 1859}
1841 1860
@@ -1876,11 +1895,10 @@ static int is_exported(const char *name, unsigned long value,
1876} 1895}
1877 1896
1878/* As per nm */ 1897/* As per nm */
1879static char elf_type(const Elf_Sym *sym, 1898static char elf_type(const Elf_Sym *sym, const struct load_info *info)
1880 Elf_Shdr *sechdrs,
1881 const char *secstrings,
1882 struct module *mod)
1883{ 1899{
1900 const Elf_Shdr *sechdrs = info->sechdrs;
1901
1884 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { 1902 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
1885 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) 1903 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
1886 return 'v'; 1904 return 'v';
@@ -1910,8 +1928,10 @@ static char elf_type(const Elf_Sym *sym,
1910 else 1928 else
1911 return 'b'; 1929 return 'b';
1912 } 1930 }
1913 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) 1931 if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
1932 ".debug")) {
1914 return 'n'; 1933 return 'n';
1934 }
1915 return '?'; 1935 return '?';
1916} 1936}
1917 1937
@@ -1936,127 +1956,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1936 return true; 1956 return true;
1937} 1957}
1938 1958
1939static unsigned long layout_symtab(struct module *mod, 1959static void layout_symtab(struct module *mod, struct load_info *info)
1940 Elf_Shdr *sechdrs,
1941 unsigned int symindex,
1942 unsigned int strindex,
1943 const Elf_Ehdr *hdr,
1944 const char *secstrings,
1945 unsigned long *pstroffs,
1946 unsigned long *strmap)
1947{ 1960{
1948 unsigned long symoffs; 1961 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
1949 Elf_Shdr *symsect = sechdrs + symindex; 1962 Elf_Shdr *strsect = info->sechdrs + info->index.str;
1950 Elf_Shdr *strsect = sechdrs + strindex;
1951 const Elf_Sym *src; 1963 const Elf_Sym *src;
1952 const char *strtab;
1953 unsigned int i, nsrc, ndst; 1964 unsigned int i, nsrc, ndst;
1954 1965
1955 /* Put symbol section at end of init part of module. */ 1966 /* Put symbol section at end of init part of module. */
1956 symsect->sh_flags |= SHF_ALLOC; 1967 symsect->sh_flags |= SHF_ALLOC;
1957 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 1968 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1958 symindex) | INIT_OFFSET_MASK; 1969 info->index.sym) | INIT_OFFSET_MASK;
1959 DEBUGP("\t%s\n", secstrings + symsect->sh_name); 1970 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
1960 1971
1961 src = (void *)hdr + symsect->sh_offset; 1972 src = (void *)info->hdr + symsect->sh_offset;
1962 nsrc = symsect->sh_size / sizeof(*src); 1973 nsrc = symsect->sh_size / sizeof(*src);
1963 strtab = (void *)hdr + strsect->sh_offset;
1964 for (ndst = i = 1; i < nsrc; ++i, ++src) 1974 for (ndst = i = 1; i < nsrc; ++i, ++src)
1965 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { 1975 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
1966 unsigned int j = src->st_name; 1976 unsigned int j = src->st_name;
1967 1977
1968 while(!__test_and_set_bit(j, strmap) && strtab[j]) 1978 while (!__test_and_set_bit(j, info->strmap)
1979 && info->strtab[j])
1969 ++j; 1980 ++j;
1970 ++ndst; 1981 ++ndst;
1971 } 1982 }
1972 1983
1973 /* Append room for core symbols at end of core part. */ 1984 /* Append room for core symbols at end of core part. */
1974 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 1985 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1975 mod->core_size = symoffs + ndst * sizeof(Elf_Sym); 1986 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
1976 1987
1977 /* Put string table section at end of init part of module. */ 1988 /* Put string table section at end of init part of module. */
1978 strsect->sh_flags |= SHF_ALLOC; 1989 strsect->sh_flags |= SHF_ALLOC;
1979 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 1990 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1980 strindex) | INIT_OFFSET_MASK; 1991 info->index.str) | INIT_OFFSET_MASK;
1981 DEBUGP("\t%s\n", secstrings + strsect->sh_name); 1992 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
1982 1993
1983 /* Append room for core symbols' strings at end of core part. */ 1994 /* Append room for core symbols' strings at end of core part. */
1984 *pstroffs = mod->core_size; 1995 info->stroffs = mod->core_size;
1985 __set_bit(0, strmap); 1996 __set_bit(0, info->strmap);
1986 mod->core_size += bitmap_weight(strmap, strsect->sh_size); 1997 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
1987
1988 return symoffs;
1989} 1998}
1990 1999
1991static void add_kallsyms(struct module *mod, 2000static void add_kallsyms(struct module *mod, const struct load_info *info)
1992 Elf_Shdr *sechdrs,
1993 unsigned int shnum,
1994 unsigned int symindex,
1995 unsigned int strindex,
1996 unsigned long symoffs,
1997 unsigned long stroffs,
1998 const char *secstrings,
1999 unsigned long *strmap)
2000{ 2001{
2001 unsigned int i, ndst; 2002 unsigned int i, ndst;
2002 const Elf_Sym *src; 2003 const Elf_Sym *src;
2003 Elf_Sym *dst; 2004 Elf_Sym *dst;
2004 char *s; 2005 char *s;
2006 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
2005 2007
2006 mod->symtab = (void *)sechdrs[symindex].sh_addr; 2008 mod->symtab = (void *)symsec->sh_addr;
2007 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 2009 mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
2008 mod->strtab = (void *)sechdrs[strindex].sh_addr; 2010 /* Make sure we get permanent strtab: don't use info->strtab. */
2011 mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
2009 2012
2010 /* Set types up while we still have access to sections. */ 2013 /* Set types up while we still have access to sections. */
2011 for (i = 0; i < mod->num_symtab; i++) 2014 for (i = 0; i < mod->num_symtab; i++)
2012 mod->symtab[i].st_info 2015 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2013 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
2014 2016
2015 mod->core_symtab = dst = mod->module_core + symoffs; 2017 mod->core_symtab = dst = mod->module_core + info->symoffs;
2016 src = mod->symtab; 2018 src = mod->symtab;
2017 *dst = *src; 2019 *dst = *src;
2018 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2020 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2019 if (!is_core_symbol(src, sechdrs, shnum)) 2021 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2020 continue; 2022 continue;
2021 dst[ndst] = *src; 2023 dst[ndst] = *src;
2022 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); 2024 dst[ndst].st_name = bitmap_weight(info->strmap,
2025 dst[ndst].st_name);
2023 ++ndst; 2026 ++ndst;
2024 } 2027 }
2025 mod->core_num_syms = ndst; 2028 mod->core_num_syms = ndst;
2026 2029
2027 mod->core_strtab = s = mod->module_core + stroffs; 2030 mod->core_strtab = s = mod->module_core + info->stroffs;
2028 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) 2031 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2029 if (test_bit(i, strmap)) 2032 if (test_bit(i, info->strmap))
2030 *++s = mod->strtab[i]; 2033 *++s = mod->strtab[i];
2031} 2034}
2032#else 2035#else
2033static inline unsigned long layout_symtab(struct module *mod, 2036static inline void layout_symtab(struct module *mod, struct load_info *info)
2034 Elf_Shdr *sechdrs,
2035 unsigned int symindex,
2036 unsigned int strindex,
2037 const Elf_Ehdr *hdr,
2038 const char *secstrings,
2039 unsigned long *pstroffs,
2040 unsigned long *strmap)
2041{ 2037{
2042 return 0;
2043} 2038}
2044 2039
2045static inline void add_kallsyms(struct module *mod, 2040static void add_kallsyms(struct module *mod, const struct load_info *info)
2046 Elf_Shdr *sechdrs,
2047 unsigned int shnum,
2048 unsigned int symindex,
2049 unsigned int strindex,
2050 unsigned long symoffs,
2051 unsigned long stroffs,
2052 const char *secstrings,
2053 const unsigned long *strmap)
2054{ 2041{
2055} 2042}
2056#endif /* CONFIG_KALLSYMS */ 2043#endif /* CONFIG_KALLSYMS */
2057 2044
2058static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) 2045static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
2059{ 2046{
2047 if (!debug)
2048 return;
2060#ifdef CONFIG_DYNAMIC_DEBUG 2049#ifdef CONFIG_DYNAMIC_DEBUG
2061 if (ddebug_add_module(debug, num, debug->modname)) 2050 if (ddebug_add_module(debug, num, debug->modname))
2062 printk(KERN_ERR "dynamic debug error adding module: %s\n", 2051 printk(KERN_ERR "dynamic debug error adding module: %s\n",
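The strmap bitmap is the subtle piece of the KALLSYMS path above: layout_symtab() sets one bit for every .strtab byte that belongs to a string surviving into the core image (plus byte 0), so the compacted table's size is bitmap_weight() over the whole map, and add_kallsyms() renumbers each kept symbol's st_name by counting the set bits below its old offset. Restated as a standalone helper, this is just a paraphrase of the dst[ndst].st_name assignment above, not new functionality.

#include <linux/bitmap.h>

/*
 * New offset of a kept string in the compacted core strtab: the number
 * of kept bytes (set bits in strmap) that precede its old offset.
 */
static unsigned long core_strtab_offset(const unsigned long *strmap,
					unsigned long old_st_name)
{
	return bitmap_weight(strmap, old_st_name);
}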
@@ -2087,65 +2076,47 @@ static void *module_alloc_update_bounds(unsigned long size)
2087} 2076}
2088 2077
2089#ifdef CONFIG_DEBUG_KMEMLEAK 2078#ifdef CONFIG_DEBUG_KMEMLEAK
2090static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2079static void kmemleak_load_module(const struct module *mod,
2091 Elf_Shdr *sechdrs, char *secstrings) 2080 const struct load_info *info)
2092{ 2081{
2093 unsigned int i; 2082 unsigned int i;
2094 2083
2095 /* only scan the sections containing data */ 2084 /* only scan the sections containing data */
2096 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); 2085 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2097 2086
2098 for (i = 1; i < hdr->e_shnum; i++) { 2087 for (i = 1; i < info->hdr->e_shnum; i++) {
2099 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2088 const char *name = info->secstrings + info->sechdrs[i].sh_name;
2089 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
2100 continue; 2090 continue;
2101 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 2091 if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
2102 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2103 continue; 2092 continue;
2104 2093
2105 kmemleak_scan_area((void *)sechdrs[i].sh_addr, 2094 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
2106 sechdrs[i].sh_size, GFP_KERNEL); 2095 info->sechdrs[i].sh_size, GFP_KERNEL);
2107 } 2096 }
2108} 2097}
2109#else 2098#else
2110static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2099static inline void kmemleak_load_module(const struct module *mod,
2111 Elf_Shdr *sechdrs, char *secstrings) 2100 const struct load_info *info)
2112{ 2101{
2113} 2102}
2114#endif 2103#endif
2115 2104
2116/* Allocate and load the module: note that size of section 0 is always 2105/* Sets info->hdr and info->len. */
2117 zero, and we rely on this for optional sections. */ 2106static int copy_and_check(struct load_info *info,
2118static noinline struct module *load_module(void __user *umod, 2107 const void __user *umod, unsigned long len,
2119 unsigned long len, 2108 const char __user *uargs)
2120 const char __user *uargs)
2121{ 2109{
2110 int err;
2122 Elf_Ehdr *hdr; 2111 Elf_Ehdr *hdr;
2123 Elf_Shdr *sechdrs;
2124 char *secstrings, *args, *modmagic, *strtab = NULL;
2125 char *staging;
2126 unsigned int i;
2127 unsigned int symindex = 0;
2128 unsigned int strindex = 0;
2129 unsigned int modindex, versindex, infoindex, pcpuindex;
2130 struct module *mod;
2131 long err = 0;
2132 void *ptr = NULL; /* Stops spurious gcc warning */
2133 unsigned long symoffs, stroffs, *strmap;
2134 void __percpu *percpu;
2135 struct _ddebug *debug = NULL;
2136 unsigned int num_debug = 0;
2137 2112
2138 mm_segment_t old_fs;
2139
2140 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2141 umod, len, uargs);
2142 if (len < sizeof(*hdr)) 2113 if (len < sizeof(*hdr))
2143 return ERR_PTR(-ENOEXEC); 2114 return -ENOEXEC;
2144 2115
2145 /* Suck in entire file: we'll want most of it. */ 2116 /* Suck in entire file: we'll want most of it. */
2146 /* vmalloc barfs on "unusual" numbers. Check here */ 2117 /* vmalloc barfs on "unusual" numbers. Check here */
2147 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 2118 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2148 return ERR_PTR(-ENOMEM); 2119 return -ENOMEM;
2149 2120
2150 if (copy_from_user(hdr, umod, len) != 0) { 2121 if (copy_from_user(hdr, umod, len) != 0) {
2151 err = -EFAULT; 2122 err = -EFAULT;
@@ -2153,135 +2124,230 @@ static noinline struct module *load_module(void __user *umod,
2153 } 2124 }
2154 2125
2155 /* Sanity checks against insmoding binaries or wrong arch, 2126 /* Sanity checks against insmoding binaries or wrong arch,
2156 weird elf version */ 2127 weird elf version */
2157 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2128 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
2158 || hdr->e_type != ET_REL 2129 || hdr->e_type != ET_REL
2159 || !elf_check_arch(hdr) 2130 || !elf_check_arch(hdr)
2160 || hdr->e_shentsize != sizeof(*sechdrs)) { 2131 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2161 err = -ENOEXEC; 2132 err = -ENOEXEC;
2162 goto free_hdr; 2133 goto free_hdr;
2163 } 2134 }
2164 2135
2165 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) 2136 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
2166 goto truncated; 2137 err = -ENOEXEC;
2138 goto free_hdr;
2139 }
2167 2140
2168 /* Convenience variables */ 2141 info->hdr = hdr;
2169 sechdrs = (void *)hdr + hdr->e_shoff; 2142 info->len = len;
2170 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 2143 return 0;
2171 sechdrs[0].sh_addr = 0;
2172 2144
2173 for (i = 1; i < hdr->e_shnum; i++) { 2145free_hdr:
2174 if (sechdrs[i].sh_type != SHT_NOBITS 2146 vfree(hdr);
2175 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) 2147 return err;
2176 goto truncated; 2148}
2149
2150static void free_copy(struct load_info *info)
2151{
2152 vfree(info->hdr);
2153}
2154
2155static int rewrite_section_headers(struct load_info *info)
2156{
2157 unsigned int i;
2158
2159 /* This should always be true, but let's be sure. */
2160 info->sechdrs[0].sh_addr = 0;
2161
2162 for (i = 1; i < info->hdr->e_shnum; i++) {
2163 Elf_Shdr *shdr = &info->sechdrs[i];
2164 if (shdr->sh_type != SHT_NOBITS
2165 && info->len < shdr->sh_offset + shdr->sh_size) {
2166 printk(KERN_ERR "Module len %lu truncated\n",
2167 info->len);
2168 return -ENOEXEC;
2169 }
2177 2170
2178 /* Mark all sections sh_addr with their address in the 2171 /* Mark all sections sh_addr with their address in the
2179 temporary image. */ 2172 temporary image. */
2180 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; 2173 shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
2181 2174
2182 /* Internal symbols and strings. */
2183 if (sechdrs[i].sh_type == SHT_SYMTAB) {
2184 symindex = i;
2185 strindex = sechdrs[i].sh_link;
2186 strtab = (char *)hdr + sechdrs[strindex].sh_offset;
2187 }
2188#ifndef CONFIG_MODULE_UNLOAD 2175#ifndef CONFIG_MODULE_UNLOAD
2189 /* Don't load .exit sections */ 2176 /* Don't load .exit sections */
2190 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) 2177 if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
2191 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 2178 shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
2192#endif 2179#endif
2193 } 2180 }
2194 2181
2195 modindex = find_sec(hdr, sechdrs, secstrings, 2182 /* Track but don't keep modinfo and version sections. */
2196 ".gnu.linkonce.this_module"); 2183 info->index.vers = find_sec(info, "__versions");
2197 if (!modindex) { 2184 info->index.info = find_sec(info, ".modinfo");
2185 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2186 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
2187 return 0;
2188}
2189
2190/*
2191 * Set up our basic convenience variables (pointers to section headers,
2192 * search for module section index etc), and do some basic section
2193 * verification.
2194 *
2195 * Return the temporary module pointer (we'll replace it with the final
2196 * one when we move the module sections around).
2197 */
2198static struct module *setup_load_info(struct load_info *info)
2199{
2200 unsigned int i;
2201 int err;
2202 struct module *mod;
2203
2204 /* Set up the convenience variables */
2205 info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
2206 info->secstrings = (void *)info->hdr
2207 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2208
2209 err = rewrite_section_headers(info);
2210 if (err)
2211 return ERR_PTR(err);
2212
2213 /* Find internal symbols and strings. */
2214 for (i = 1; i < info->hdr->e_shnum; i++) {
2215 if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
2216 info->index.sym = i;
2217 info->index.str = info->sechdrs[i].sh_link;
2218 info->strtab = (char *)info->hdr
2219 + info->sechdrs[info->index.str].sh_offset;
2220 break;
2221 }
2222 }
2223
2224 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2225 if (!info->index.mod) {
2198 printk(KERN_WARNING "No module found in object\n"); 2226 printk(KERN_WARNING "No module found in object\n");
2199 err = -ENOEXEC; 2227 return ERR_PTR(-ENOEXEC);
2200 goto free_hdr;
2201 } 2228 }
2202 /* This is temporary: point mod into copy of data. */ 2229 /* This is temporary: point mod into copy of data. */
2203 mod = (void *)sechdrs[modindex].sh_addr; 2230 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2204 2231
2205 if (symindex == 0) { 2232 if (info->index.sym == 0) {
2206 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 2233 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
2207 mod->name); 2234 mod->name);
2208 err = -ENOEXEC; 2235 return ERR_PTR(-ENOEXEC);
2209 goto free_hdr;
2210 } 2236 }
2211 2237
2212 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 2238 info->index.pcpu = find_pcpusec(info);
2213 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
2214 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
2215
2216 /* Don't keep modinfo and version sections. */
2217 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2218 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2219 2239
2220 /* Check module struct version now, before we try to use module. */ 2240 /* Check module struct version now, before we try to use module. */
2221 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2241 if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
2222 err = -ENOEXEC; 2242 return ERR_PTR(-ENOEXEC);
2223 goto free_hdr; 2243
2224 } 2244 return mod;
2245}
2246
2247static int check_modinfo(struct module *mod, struct load_info *info)
2248{
2249 const char *modmagic = get_modinfo(info, "vermagic");
2250 int err;
2225 2251
2226 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
2227 /* This is allowed: modprobe --force will invalidate it. */ 2252 /* This is allowed: modprobe --force will invalidate it. */
2228 if (!modmagic) { 2253 if (!modmagic) {
2229 err = try_to_force_load(mod, "bad vermagic"); 2254 err = try_to_force_load(mod, "bad vermagic");
2230 if (err) 2255 if (err)
2231 goto free_hdr; 2256 return err;
2232 } else if (!same_magic(modmagic, vermagic, versindex)) { 2257 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2233 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 2258 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
2234 mod->name, modmagic, vermagic); 2259 mod->name, modmagic, vermagic);
2235 err = -ENOEXEC; 2260 return -ENOEXEC;
2236 goto free_hdr;
2237 } 2261 }
2238 2262
2239 staging = get_modinfo(sechdrs, infoindex, "staging"); 2263 if (get_modinfo(info, "staging")) {
2240 if (staging) {
2241 add_taint_module(mod, TAINT_CRAP); 2264 add_taint_module(mod, TAINT_CRAP);
2242 printk(KERN_WARNING "%s: module is from the staging directory," 2265 printk(KERN_WARNING "%s: module is from the staging directory,"
2243 " the quality is unknown, you have been warned.\n", 2266 " the quality is unknown, you have been warned.\n",
2244 mod->name); 2267 mod->name);
2245 } 2268 }
2246 2269
2247 /* Now copy in args */ 2270 /* Set up license info based on the info section */
2248 args = strndup_user(uargs, ~0UL >> 1); 2271 set_license(mod, get_modinfo(info, "license"));
2249 if (IS_ERR(args)) {
2250 err = PTR_ERR(args);
2251 goto free_hdr;
2252 }
2253 2272
2254 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) 2273 return 0;
2255 * sizeof(long), GFP_KERNEL); 2274}
2256 if (!strmap) {
2257 err = -ENOMEM;
2258 goto free_mod;
2259 }
2260 2275
2261 mod->state = MODULE_STATE_COMING; 2276static void find_module_sections(struct module *mod, struct load_info *info)
2277{
2278 mod->kp = section_objs(info, "__param",
2279 sizeof(*mod->kp), &mod->num_kp);
2280 mod->syms = section_objs(info, "__ksymtab",
2281 sizeof(*mod->syms), &mod->num_syms);
2282 mod->crcs = section_addr(info, "__kcrctab");
2283 mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
2284 sizeof(*mod->gpl_syms),
2285 &mod->num_gpl_syms);
2286 mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
2287 mod->gpl_future_syms = section_objs(info,
2288 "__ksymtab_gpl_future",
2289 sizeof(*mod->gpl_future_syms),
2290 &mod->num_gpl_future_syms);
2291 mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
2262 2292
2263 /* Allow arches to frob section contents and sizes. */ 2293#ifdef CONFIG_UNUSED_SYMBOLS
2264 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); 2294 mod->unused_syms = section_objs(info, "__ksymtab_unused",
2265 if (err < 0) 2295 sizeof(*mod->unused_syms),
2266 goto free_mod; 2296 &mod->num_unused_syms);
2297 mod->unused_crcs = section_addr(info, "__kcrctab_unused");
2298 mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
2299 sizeof(*mod->unused_gpl_syms),
2300 &mod->num_unused_gpl_syms);
2301 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2302#endif
2303#ifdef CONFIG_CONSTRUCTORS
2304 mod->ctors = section_objs(info, ".ctors",
2305 sizeof(*mod->ctors), &mod->num_ctors);
2306#endif
2267 2307
2268 if (pcpuindex) { 2308#ifdef CONFIG_TRACEPOINTS
2269 /* We have a special allocation for this section. */ 2309 mod->tracepoints = section_objs(info, "__tracepoints",
2270 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, 2310 sizeof(*mod->tracepoints),
2271 sechdrs[pcpuindex].sh_addralign); 2311 &mod->num_tracepoints);
2272 if (err) 2312#endif
2273 goto free_mod; 2313#ifdef HAVE_JUMP_LABEL
2274 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2314 mod->jump_entries = section_objs(info, "__jump_table",
2275 } 2315 sizeof(*mod->jump_entries),
2276 /* Keep this around for failure path. */ 2316 &mod->num_jump_entries);
2277 percpu = mod_percpu(mod); 2317#endif
2318#ifdef CONFIG_EVENT_TRACING
2319 mod->trace_events = section_objs(info, "_ftrace_events",
2320 sizeof(*mod->trace_events),
2321 &mod->num_trace_events);
2322 /*
2323 * This section contains pointers to allocated objects in the trace
2324 * code and not scanning it leads to false positives.
2325 */
2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2327 mod->num_trace_events, GFP_KERNEL);
2328#endif
2329#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2330 /* sechdrs[0].sh_size is always zero */
2331 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
2332 sizeof(*mod->ftrace_callsites),
2333 &mod->num_ftrace_callsites);
2334#endif
2278 2335
2279 /* Determine total sizes, and put offsets in sh_entsize. For now 2336 mod->extable = section_objs(info, "__ex_table",
2280 this is done generically; there doesn't appear to be any 2337 sizeof(*mod->extable), &mod->num_exentries);
2281 special cases for the architectures. */ 2338
2282 layout_sections(mod, hdr, sechdrs, secstrings); 2339 if (section_addr(info, "__obsparm"))
2283 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, 2340 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2284 secstrings, &stroffs, strmap); 2341 mod->name);
2342
2343 info->debug = section_objs(info, "__verbose",
2344 sizeof(*info->debug), &info->num_debug);
2345}
2346
2347static int move_module(struct module *mod, struct load_info *info)
2348{
2349 int i;
2350 void *ptr;
2285 2351
2286 /* Do the allocs. */ 2352 /* Do the allocs. */
2287 ptr = module_alloc_update_bounds(mod->core_size); 2353 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2291,10 +2357,9 @@ static noinline struct module *load_module(void __user *umod,
2291 * leak. 2357 * leak.
2292 */ 2358 */
2293 kmemleak_not_leak(ptr); 2359 kmemleak_not_leak(ptr);
2294 if (!ptr) { 2360 if (!ptr)
2295 err = -ENOMEM; 2361 return -ENOMEM;
2296 goto free_percpu; 2362
2297 }
2298 memset(ptr, 0, mod->core_size); 2363 memset(ptr, 0, mod->core_size);
2299 mod->module_core = ptr; 2364 mod->module_core = ptr;
2300 2365
@@ -2307,50 +2372,40 @@ static noinline struct module *load_module(void __user *umod,
2307 */ 2372 */
2308 kmemleak_ignore(ptr); 2373 kmemleak_ignore(ptr);
2309 if (!ptr && mod->init_size) { 2374 if (!ptr && mod->init_size) {
2310 err = -ENOMEM; 2375 module_free(mod, mod->module_core);
2311 goto free_core; 2376 return -ENOMEM;
2312 } 2377 }
2313 memset(ptr, 0, mod->init_size); 2378 memset(ptr, 0, mod->init_size);
2314 mod->module_init = ptr; 2379 mod->module_init = ptr;
2315 2380
2316 /* Transfer each section which specifies SHF_ALLOC */ 2381 /* Transfer each section which specifies SHF_ALLOC */
2317 DEBUGP("final section addresses:\n"); 2382 DEBUGP("final section addresses:\n");
2318 for (i = 0; i < hdr->e_shnum; i++) { 2383 for (i = 0; i < info->hdr->e_shnum; i++) {
2319 void *dest; 2384 void *dest;
2385 Elf_Shdr *shdr = &info->sechdrs[i];
2320 2386
2321 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2387 if (!(shdr->sh_flags & SHF_ALLOC))
2322 continue; 2388 continue;
2323 2389
2324 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) 2390 if (shdr->sh_entsize & INIT_OFFSET_MASK)
2325 dest = mod->module_init 2391 dest = mod->module_init
2326 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); 2392 + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
2327 else 2393 else
2328 dest = mod->module_core + sechdrs[i].sh_entsize; 2394 dest = mod->module_core + shdr->sh_entsize;
2329 2395
2330 if (sechdrs[i].sh_type != SHT_NOBITS) 2396 if (shdr->sh_type != SHT_NOBITS)
2331 memcpy(dest, (void *)sechdrs[i].sh_addr, 2397 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2332 sechdrs[i].sh_size);
2333 /* Update sh_addr to point to copy in image. */ 2398 /* Update sh_addr to point to copy in image. */
2334 sechdrs[i].sh_addr = (unsigned long)dest; 2399 shdr->sh_addr = (unsigned long)dest;
2335 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); 2400 DEBUGP("\t0x%lx %s\n",
2336 } 2401 shdr->sh_addr, info->secstrings + shdr->sh_name);
2337 /* Module has been moved. */
2338 mod = (void *)sechdrs[modindex].sh_addr;
2339 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2340
2341#if defined(CONFIG_MODULE_UNLOAD)
2342 mod->refptr = alloc_percpu(struct module_ref);
2343 if (!mod->refptr) {
2344 err = -ENOMEM;
2345 goto free_init;
2346 } 2402 }
2347#endif
2348 /* Now we've moved module, initialize linked lists, etc. */
2349 module_unload_init(mod);
2350 2403
2351 /* Set up license info based on the info section */ 2404 return 0;
2352 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2405}
2353 2406
2407static int check_module_license_and_versions(struct module *mod)
2408{
2354 /* 2409 /*
2355 * ndiswrapper is under GPL by itself, but loads proprietary modules. 2410 * ndiswrapper is under GPL by itself, but loads proprietary modules.
2356 * Don't use add_taint_module(), as it would prevent ndiswrapper from 2411 * Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -2363,77 +2418,6 @@ static noinline struct module *load_module(void __user *umod,
2363 if (strcmp(mod->name, "driverloader") == 0) 2418 if (strcmp(mod->name, "driverloader") == 0)
2364 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2419 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2365 2420
2366 /* Set up MODINFO_ATTR fields */
2367 setup_modinfo(mod, sechdrs, infoindex);
2368
2369 /* Fix up syms, so that st_value is a pointer to location. */
2370 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
2371 mod);
2372 if (err < 0)
2373 goto cleanup;
2374
2375 /* Now we've got everything in the final locations, we can
2376 * find optional sections. */
2377 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2378 sizeof(*mod->kp), &mod->num_kp);
2379 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2380 sizeof(*mod->syms), &mod->num_syms);
2381 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
2382 mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
2383 sizeof(*mod->gpl_syms),
2384 &mod->num_gpl_syms);
2385 mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
2386 mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
2387 "__ksymtab_gpl_future",
2388 sizeof(*mod->gpl_future_syms),
2389 &mod->num_gpl_future_syms);
2390 mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
2391 "__kcrctab_gpl_future");
2392
2393#ifdef CONFIG_UNUSED_SYMBOLS
2394 mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
2395 "__ksymtab_unused",
2396 sizeof(*mod->unused_syms),
2397 &mod->num_unused_syms);
2398 mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
2399 "__kcrctab_unused");
2400 mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
2401 "__ksymtab_unused_gpl",
2402 sizeof(*mod->unused_gpl_syms),
2403 &mod->num_unused_gpl_syms);
2404 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2405 "__kcrctab_unused_gpl");
2406#endif
2407#ifdef CONFIG_CONSTRUCTORS
2408 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2409 sizeof(*mod->ctors), &mod->num_ctors);
2410#endif
2411
2412#ifdef CONFIG_TRACEPOINTS
2413 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2414 "__tracepoints",
2415 sizeof(*mod->tracepoints),
2416 &mod->num_tracepoints);
2417#endif
2418#ifdef CONFIG_EVENT_TRACING
2419 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2420 "_ftrace_events",
2421 sizeof(*mod->trace_events),
2422 &mod->num_trace_events);
2423 /*
2424 * This section contains pointers to allocated objects in the trace
2425 * code and not scanning it leads to false positives.
2426 */
2427 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2428 mod->num_trace_events, GFP_KERNEL);
2429#endif
2430#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2431 /* sechdrs[0].sh_size is always zero */
2432 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2433 "__mcount_loc",
2434 sizeof(*mod->ftrace_callsites),
2435 &mod->num_ftrace_callsites);
2436#endif
2437#ifdef CONFIG_MODVERSIONS 2421#ifdef CONFIG_MODVERSIONS
2438 if ((mod->num_syms && !mod->crcs) 2422 if ((mod->num_syms && !mod->crcs)
2439 || (mod->num_gpl_syms && !mod->gpl_crcs) 2423 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2443,56 +2427,16 @@ static noinline struct module *load_module(void __user *umod,
2443 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2427 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2444#endif 2428#endif
2445 ) { 2429 ) {
2446 err = try_to_force_load(mod, 2430 return try_to_force_load(mod,
2447 "no versions for exported symbols"); 2431 "no versions for exported symbols");
2448 if (err)
2449 goto cleanup;
2450 } 2432 }
2451#endif 2433#endif
2434 return 0;
2435}
2452 2436
2453 /* Now do relocations. */ 2437static void flush_module_icache(const struct module *mod)
2454 for (i = 1; i < hdr->e_shnum; i++) { 2438{
2455 const char *strtab = (char *)sechdrs[strindex].sh_addr; 2439 mm_segment_t old_fs;
2456 unsigned int info = sechdrs[i].sh_info;
2457
2458 /* Not a valid relocation section? */
2459 if (info >= hdr->e_shnum)
2460 continue;
2461
2462 /* Don't bother with non-allocated sections */
2463 if (!(sechdrs[info].sh_flags & SHF_ALLOC))
2464 continue;
2465
2466 if (sechdrs[i].sh_type == SHT_REL)
2467 err = apply_relocate(sechdrs, strtab, symindex, i,mod);
2468 else if (sechdrs[i].sh_type == SHT_RELA)
2469 err = apply_relocate_add(sechdrs, strtab, symindex, i,
2470 mod);
2471 if (err < 0)
2472 goto cleanup;
2473 }
2474
2475 /* Set up and sort exception table */
2476 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2477 sizeof(*mod->extable), &mod->num_exentries);
2478 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2479
2480 /* Finally, copy percpu area over. */
2481 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2482 sechdrs[pcpuindex].sh_size);
2483
2484 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2485 symoffs, stroffs, secstrings, strmap);
2486 kfree(strmap);
2487 strmap = NULL;
2488
2489 if (!mod->taints)
2490 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2491 sizeof(*debug), &num_debug);
2492
2493 err = module_finalize(hdr, sechdrs, mod);
2494 if (err < 0)
2495 goto cleanup;
2496 2440
2497 /* flush the icache in correct context */ 2441 /* flush the icache in correct context */
2498 old_fs = get_fs(); 2442 old_fs = get_fs();
@@ -2511,11 +2455,160 @@ static noinline struct module *load_module(void __user *umod,
2511 (unsigned long)mod->module_core + mod->core_size); 2455 (unsigned long)mod->module_core + mod->core_size);
2512 2456
2513 set_fs(old_fs); 2457 set_fs(old_fs);
2458}
2514 2459
2515 mod->args = args; 2460static struct module *layout_and_allocate(struct load_info *info)
2516 if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) 2461{
2517 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2462 /* Module within temporary copy. */
2518 mod->name); 2463 struct module *mod;
2464 Elf_Shdr *pcpusec;
2465 int err;
2466
2467 mod = setup_load_info(info);
2468 if (IS_ERR(mod))
2469 return mod;
2470
2471 err = check_modinfo(mod, info);
2472 if (err)
2473 return ERR_PTR(err);
2474
2475 /* Allow arches to frob section contents and sizes. */
2476 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2477 info->secstrings, mod);
2478 if (err < 0)
2479 goto out;
2480
2481 pcpusec = &info->sechdrs[info->index.pcpu];
2482 if (pcpusec->sh_size) {
2483 /* We have a special allocation for this section. */
2484 err = percpu_modalloc(mod,
2485 pcpusec->sh_size, pcpusec->sh_addralign);
2486 if (err)
2487 goto out;
2488 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2489 }
2490
2491 /* Determine total sizes, and put offsets in sh_entsize. For now
2492 this is done generically; there doesn't appear to be any
2493 special cases for the architectures. */
2494 layout_sections(mod, info);
2495
2496 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2497 * sizeof(long), GFP_KERNEL);
2498 if (!info->strmap) {
2499 err = -ENOMEM;
2500 goto free_percpu;
2501 }
2502 layout_symtab(mod, info);
2503
2504 /* Allocate and move to the final place */
2505 err = move_module(mod, info);
2506 if (err)
2507 goto free_strmap;
2508
2509 /* Module has been copied to its final place now: return it. */
2510 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2511 kmemleak_load_module(mod, info);
2512 return mod;
2513
2514free_strmap:
2515 kfree(info->strmap);
2516free_percpu:
2517 percpu_modfree(mod);
2518out:
2519 return ERR_PTR(err);
2520}
2521
2522/* mod is no longer valid after this! */
2523static void module_deallocate(struct module *mod, struct load_info *info)
2524{
2525 kfree(info->strmap);
2526 percpu_modfree(mod);
2527 module_free(mod, mod->module_init);
2528 module_free(mod, mod->module_core);
2529}
2530
2531static int post_relocation(struct module *mod, const struct load_info *info)
2532{
2533 /* Sort exception table now relocations are done. */
2534 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2535
2536 /* Copy relocated percpu area over. */
2537 percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
2538 info->sechdrs[info->index.pcpu].sh_size);
2539
2540 /* Setup kallsyms-specific fields. */
2541 add_kallsyms(mod, info);
2542
2543 /* Arch-specific module finalizing. */
2544 return module_finalize(info->hdr, info->sechdrs, mod);
2545}
2546
2547/* Allocate and load the module: note that size of section 0 is always
2548 zero, and we rely on this for optional sections. */
2549static struct module *load_module(void __user *umod,
2550 unsigned long len,
2551 const char __user *uargs)
2552{
2553 struct load_info info = { NULL, };
2554 struct module *mod;
2555 long err;
2556
2557 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2558 umod, len, uargs);
2559
2560 /* Copy in the blobs from userspace, check they are vaguely sane. */
2561 err = copy_and_check(&info, umod, len, uargs);
2562 if (err)
2563 return ERR_PTR(err);
2564
2565 /* Figure out module layout, and allocate all the memory. */
2566 mod = layout_and_allocate(&info);
2567 if (IS_ERR(mod)) {
2568 err = PTR_ERR(mod);
2569 goto free_copy;
2570 }
2571
2572 /* Now module is in final location, initialize linked lists, etc. */
2573 err = module_unload_init(mod);
2574 if (err)
2575 goto free_module;
2576
2577 /* Now we've got everything in the final locations, we can
2578 * find optional sections. */
2579 find_module_sections(mod, &info);
2580
2581 err = check_module_license_and_versions(mod);
2582 if (err)
2583 goto free_unload;
2584
2585 /* Set up MODINFO_ATTR fields */
2586 setup_modinfo(mod, &info);
2587
2588 /* Fix up syms, so that st_value is a pointer to location. */
2589 err = simplify_symbols(mod, &info);
2590 if (err < 0)
2591 goto free_modinfo;
2592
2593 err = apply_relocations(mod, &info);
2594 if (err < 0)
2595 goto free_modinfo;
2596
2597 err = post_relocation(mod, &info);
2598 if (err < 0)
2599 goto free_modinfo;
2600
2601 flush_module_icache(mod);
2602
2603 /* Now copy in args */
2604 mod->args = strndup_user(uargs, ~0UL >> 1);
2605 if (IS_ERR(mod->args)) {
2606 err = PTR_ERR(mod->args);
2607 goto free_arch_cleanup;
2608 }
2609
2610 /* Mark state as coming so strong_try_module_get() ignores us. */
2611 mod->state = MODULE_STATE_COMING;
2519 2612
2520 /* Now sew it into the lists so we can get lockdep and oops 2613 /* Now sew it into the lists so we can get lockdep and oops
2521 * info during argument parsing. No one should access us, since 2614
@@ -2530,70 +2623,61 @@ static noinline struct module *load_module(void __user *umod,
2530 goto unlock; 2623 goto unlock;
2531 } 2624 }
2532 2625
2533 if (debug) 2626 /* This has to be done once we're sure module name is unique. */
2534 dynamic_debug_setup(debug, num_debug); 2627 if (!mod->taints)
2628 dynamic_debug_setup(info.debug, info.num_debug);
2535 2629
2536 /* Find duplicate symbols */ 2630 /* Find duplicate symbols */
2537 err = verify_export_symbols(mod); 2631 err = verify_export_symbols(mod);
2538 if (err < 0) 2632 if (err < 0)
2539 goto ddebug; 2633 goto ddebug;
2540 2634
2635 module_bug_finalize(info.hdr, info.sechdrs, mod);
2541 list_add_rcu(&mod->list, &modules); 2636 list_add_rcu(&mod->list, &modules);
2542 mutex_unlock(&module_mutex); 2637 mutex_unlock(&module_mutex);
2543 2638
2639 /* Module is ready to execute: parsing args may do that. */
2544 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2640 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2545 if (err < 0) 2641 if (err < 0)
2546 goto unlink; 2642 goto unlink;
2547 2643
2548 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2644 /* Link in to sysfs. */
2645 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
2549 if (err < 0) 2646 if (err < 0)
2550 goto unlink; 2647 goto unlink;
2551 2648
2552 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2649 /* Get rid of temporary copy and strmap. */
2553 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2650 kfree(info.strmap);
2554 2651 free_copy(&info);
2555 /* Get rid of temporary copy */
2556 vfree(hdr);
2557
2558 trace_module_load(mod);
2559 2652
2560 /* Done! */ 2653 /* Done! */
2654 trace_module_load(mod);
2561 return mod; 2655 return mod;
2562 2656
2563 unlink: 2657 unlink:
2564 mutex_lock(&module_mutex); 2658 mutex_lock(&module_mutex);
2565 /* Unlink carefully: kallsyms could be walking list. */ 2659 /* Unlink carefully: kallsyms could be walking list. */
2566 list_del_rcu(&mod->list); 2660 list_del_rcu(&mod->list);
2661 module_bug_cleanup(mod);
2662
2567 ddebug: 2663 ddebug:
2568 dynamic_debug_remove(debug); 2664 if (!mod->taints)
2665 dynamic_debug_remove(info.debug);
2569 unlock: 2666 unlock:
2570 mutex_unlock(&module_mutex); 2667 mutex_unlock(&module_mutex);
2571 synchronize_sched(); 2668 synchronize_sched();
2669 kfree(mod->args);
2670 free_arch_cleanup:
2572 module_arch_cleanup(mod); 2671 module_arch_cleanup(mod);
2573 cleanup: 2672 free_modinfo:
2574 free_modinfo(mod); 2673 free_modinfo(mod);
2674 free_unload:
2575 module_unload_free(mod); 2675 module_unload_free(mod);
2576#if defined(CONFIG_MODULE_UNLOAD) 2676 free_module:
2577 free_percpu(mod->refptr); 2677 module_deallocate(mod, &info);
2578 free_init: 2678 free_copy:
2579#endif 2679 free_copy(&info);
2580 module_free(mod, mod->module_init);
2581 free_core:
2582 module_free(mod, mod->module_core);
2583 /* mod will be freed with core. Don't access it beyond this line! */
2584 free_percpu:
2585 free_percpu(percpu);
2586 free_mod:
2587 kfree(args);
2588 kfree(strmap);
2589 free_hdr:
2590 vfree(hdr);
2591 return ERR_PTR(err); 2680 return ERR_PTR(err);
2592
2593 truncated:
2594 printk(KERN_ERR "Module len %lu truncated\n", len);
2595 err = -ENOEXEC;
2596 goto free_hdr;
2597} 2681}
2598 2682
2599/* Call module constructors. */ 2683/* Call module constructors. */
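
The module.c diff above splits the old monolithic load_module() into staged helpers (copy_and_check(), layout_and_allocate(), find_module_sections(), post_relocation(), flush_module_icache(), and so on), with each failing stage unwinding through a chain of goto labels in reverse order of setup. Below is a minimal userspace sketch of that unwind pattern only, with hypothetical stage names; it is not kernel code and not part of the patch.

/* Staged pipeline with goto-based unwinding, modelled after the new
 * load_module() structure.  Each stage acquires one resource; on failure
 * control jumps to the label that releases everything acquired so far. */
#include <stdio.h>
#include <stdlib.h>

struct image {
	void *copy;	/* stands in for the vmalloc'd user copy */
	void *layout;	/* stands in for the laid-out module memory */
	char *args;	/* stands in for the duplicated argument string */
};

static int copy_stage(struct image *img)   { img->copy = malloc(16);   return img->copy ? 0 : -1; }
static int layout_stage(struct image *img) { img->layout = malloc(16); return img->layout ? 0 : -1; }
static int args_stage(struct image *img)   { img->args = malloc(16);   return img->args ? 0 : -1; }

static int load(struct image *img)
{
	int err;

	err = copy_stage(img);
	if (err)
		goto out;		/* nothing to unwind yet */

	err = layout_stage(img);
	if (err)
		goto free_copy;		/* undo the copy stage only */

	err = args_stage(img);
	if (err)
		goto free_layout;	/* undo layout, then the copy */

	return 0;			/* success: keep everything */

free_layout:
	free(img->layout);
free_copy:
	free(img->copy);
out:
	return err;
}

int main(void)
{
	struct image img = { 0 };

	printf("load() -> %d\n", load(&img));
	return 0;
}
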
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4c0b7b3e6d2e..200407c1502f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -36,15 +36,6 @@
36# include <asm/mutex.h> 36# include <asm/mutex.h>
37#endif 37#endif
38 38
39/***
40 * mutex_init - initialize the mutex
41 * @lock: the mutex to be initialized
42 * @key: the lock_class_key for the class; used by mutex lock debugging
43 *
44 * Initialize the mutex to unlocked state.
45 *
46 * It is not allowed to initialize an already locked mutex.
47 */
48void 39void
49__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 40__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
50{ 41{
@@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
68static __used noinline void __sched 59static __used noinline void __sched
69__mutex_lock_slowpath(atomic_t *lock_count); 60__mutex_lock_slowpath(atomic_t *lock_count);
70 61
71/*** 62/**
72 * mutex_lock - acquire the mutex 63 * mutex_lock - acquire the mutex
73 * @lock: the mutex to be acquired 64 * @lock: the mutex to be acquired
74 * 65 *
@@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock);
105 96
106static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 97static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
107 98
108/*** 99/**
109 * mutex_unlock - release the mutex 100 * mutex_unlock - release the mutex
110 * @lock: the mutex to be released 101 * @lock: the mutex to be released
111 * 102 *
@@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count);
364static noinline int __sched 355static noinline int __sched
365__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 356__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
366 357
367/*** 358/**
368 * mutex_lock_interruptible - acquire the mutex, interruptable 359 * mutex_lock_interruptible - acquire the mutex, interruptible
369 * @lock: the mutex to be acquired 360 * @lock: the mutex to be acquired
370 * 361 *
371 * Lock the mutex like mutex_lock(), and return 0 if the mutex has 362 * Lock the mutex like mutex_lock(), and return 0 if the mutex has
@@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
456 return prev == 1; 447 return prev == 1;
457} 448}
458 449
459/*** 450/**
460 * mutex_trylock - try acquire the mutex, without waiting 451 * mutex_trylock - try to acquire the mutex, without waiting
461 * @lock: the mutex to be acquired 452 * @lock: the mutex to be acquired
462 * 453 *
463 * Try to acquire the mutex atomically. Returns 1 if the mutex 454 * Try to acquire the mutex atomically. Returns 1 if the mutex
464 * has been acquired successfully, and 0 on contention. 455 * has been acquired successfully, and 0 on contention.
465 * 456 *
466 * NOTE: this function follows the spin_trylock() convention, so 457 * NOTE: this function follows the spin_trylock() convention, so
467 * it is negated to the down_trylock() return values! Be careful 458 * it is negated from the down_trylock() return values! Be careful
468 * about this when converting semaphore users to mutexes. 459 * about this when converting semaphore users to mutexes.
469 * 460 *
470 * This function must not be used in interrupt context. The 461 * This function must not be used in interrupt context. The
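
The kerneldoc fix above restates that mutex_trylock() returns 1 on success while down_trylock() returns 0 on success. The userspace model below only illustrates that inverted convention; the two model_* helpers are made up for the example and are not the kernel primitives.

/* Userspace model (not kernel code) of the two trylock return conventions. */
#include <stdio.h>
#include <stdbool.h>

static bool taken;

static int model_mutex_trylock(void)	/* 1 = acquired, 0 = contended */
{
	if (taken)
		return 0;
	taken = true;
	return 1;
}

static int model_down_trylock(void)	/* 0 = acquired, non-zero = contended */
{
	if (taken)
		return 1;
	taken = true;
	return 0;
}

int main(void)
{
	if (model_mutex_trylock())
		printf("mutex-style: acquired (non-zero means success)\n");
	taken = false;
	if (model_down_trylock() == 0)
		printf("semaphore-style: acquired (zero means success)\n");
	return 0;
}
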
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec8efe0..2c98ad94ba0e 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
85 return ERR_PTR(-EPERM); 85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current)) 86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM); 87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
88 96
89 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
90 if (!ns_cgroup) 98 if (!ns_cgroup)
diff --git a/kernel/padata.c b/kernel/padata.c
index fdd8ae609ce3..751019415d23 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -26,18 +26,19 @@
26#include <linux/mutex.h> 26#include <linux/mutex.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sysfs.h>
29#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
30 31
31#define MAX_SEQ_NR INT_MAX - NR_CPUS 32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
32#define MAX_OBJ_NUM 1000 33#define MAX_OBJ_NUM 1000
33 34
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{ 36{
36 int cpu, target_cpu; 37 int cpu, target_cpu;
37 38
38 target_cpu = cpumask_first(pd->cpumask); 39 target_cpu = cpumask_first(pd->cpumask.pcpu);
39 for (cpu = 0; cpu < cpu_index; cpu++) 40 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask); 41 target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
41 42
42 return target_cpu; 43 return target_cpu;
43} 44}
@@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata)
53 * Hash the sequence numbers to the cpus by taking 54 * Hash the sequence numbers to the cpus by taking
54 * seq_nr mod. number of cpus in use. 55 * seq_nr mod. number of cpus in use.
55 */ 56 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); 57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
57 58
58 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
59} 60}
60 61
61static void padata_parallel_worker(struct work_struct *work) 62static void padata_parallel_worker(struct work_struct *parallel_work)
62{ 63{
63 struct padata_queue *queue; 64 struct padata_parallel_queue *pqueue;
64 struct parallel_data *pd; 65 struct parallel_data *pd;
65 struct padata_instance *pinst; 66 struct padata_instance *pinst;
66 LIST_HEAD(local_list); 67 LIST_HEAD(local_list);
67 68
68 local_bh_disable(); 69 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork); 70 pqueue = container_of(parallel_work,
70 pd = queue->pd; 71 struct padata_parallel_queue, work);
72 pd = pqueue->pd;
71 pinst = pd->pinst; 73 pinst = pd->pinst;
72 74
73 spin_lock(&queue->parallel.lock); 75 spin_lock(&pqueue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list); 76 list_replace_init(&pqueue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock); 77 spin_unlock(&pqueue->parallel.lock);
76 78
77 while (!list_empty(&local_list)) { 79 while (!list_empty(&local_list)) {
78 struct padata_priv *padata; 80 struct padata_priv *padata;
@@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work)
94 * @pinst: padata instance 96 * @pinst: padata instance
95 * @padata: object to be parallelized 97 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on, 98 * @cb_cpu: cpu the serialization callback function will run on,
97 * must be in the cpumask of padata. 99 * must be in the serial cpumask of padata (i.e. cpumask.cbcpu).
98 * 100 *
99 * The parallelization callback function will run with BHs off. 101 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel 102 * Note: Every object which is parallelized by padata_do_parallel
@@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu) 106 struct padata_priv *padata, int cb_cpu)
105{ 107{
106 int target_cpu, err; 108 int target_cpu, err;
107 struct padata_queue *queue; 109 struct padata_parallel_queue *queue;
108 struct parallel_data *pd; 110 struct parallel_data *pd;
109 111
110 rcu_read_lock_bh(); 112 rcu_read_lock_bh();
111 113
112 pd = rcu_dereference(pinst->pd); 114 pd = rcu_dereference(pinst->pd);
113 115
114 err = 0; 116 err = -EINVAL;
115 if (!(pinst->flags & PADATA_INIT)) 117 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
118 goto out;
119
120 if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
116 goto out; 121 goto out;
117 122
118 err = -EBUSY; 123 err = -EBUSY;
@@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst,
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) 127 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out; 128 goto out;
124 129
125 err = -EINVAL; 130 err = 0;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt); 131 atomic_inc(&pd->refcnt);
131 padata->pd = pd; 132 padata->pd = pd;
132 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
@@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst,
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr); 138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138 139
139 target_cpu = padata_cpu_hash(padata); 140 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu); 141 queue = per_cpu_ptr(pd->pqueue, target_cpu);
141 142
142 spin_lock(&queue->parallel.lock); 143 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list); 144 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock); 145 spin_unlock(&queue->parallel.lock);
145 146
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork); 147 queue_work_on(target_cpu, pinst->wq, &queue->work);
147 148
148out: 149out:
149 rcu_read_unlock_bh(); 150 rcu_read_unlock_bh();
@@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel);
171 */ 172 */
172static struct padata_priv *padata_get_next(struct parallel_data *pd) 173static struct padata_priv *padata_get_next(struct parallel_data *pd)
173{ 174{
174 int cpu, num_cpus, empty, calc_seq_nr; 175 int cpu, num_cpus;
175 int seq_nr, next_nr, overrun, next_overrun; 176 int next_nr, next_index;
176 struct padata_queue *queue, *next_queue; 177 struct padata_parallel_queue *queue, *next_queue;
177 struct padata_priv *padata; 178 struct padata_priv *padata;
178 struct padata_list *reorder; 179 struct padata_list *reorder;
179 180
180 empty = 0; 181 num_cpus = cpumask_weight(pd->cpumask.pcpu);
181 next_nr = -1;
182 next_overrun = 0;
183 next_queue = NULL;
184
185 num_cpus = cpumask_weight(pd->cpumask);
186
187 for_each_cpu(cpu, pd->cpumask) {
188 queue = per_cpu_ptr(pd->queue, cpu);
189 reorder = &queue->reorder;
190
191 /*
192 * Calculate the seq_nr of the object that should be
193 * next in this reorder queue.
194 */
195 overrun = 0;
196 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
197 + queue->cpu_index;
198 182
199 if (unlikely(calc_seq_nr > pd->max_seq_nr)) { 183 /*
200 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; 184 * Calculate the percpu reorder queue and the sequence
201 overrun = 1; 185 * number of the next object.
202 } 186 */
203 187 next_nr = pd->processed;
204 if (!list_empty(&reorder->list)) { 188 next_index = next_nr % num_cpus;
205 padata = list_entry(reorder->list.next, 189 cpu = padata_index_to_cpu(pd, next_index);
206 struct padata_priv, list); 190 next_queue = per_cpu_ptr(pd->pqueue, cpu);
207 191
208 seq_nr = padata->seq_nr; 192 if (unlikely(next_nr > pd->max_seq_nr)) {
209 BUG_ON(calc_seq_nr != seq_nr); 193 next_nr = next_nr - pd->max_seq_nr - 1;
210 } else { 194 next_index = next_nr % num_cpus;
211 seq_nr = calc_seq_nr; 195 cpu = padata_index_to_cpu(pd, next_index);
212 empty++; 196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
213 } 197 pd->processed = 0;
214
215 if (next_nr < 0 || seq_nr < next_nr
216 || (next_overrun && !overrun)) {
217 next_nr = seq_nr;
218 next_overrun = overrun;
219 next_queue = queue;
220 }
221 } 198 }
222 199
223 padata = NULL; 200 padata = NULL;
224 201
225 if (empty == num_cpus)
226 goto out;
227
228 reorder = &next_queue->reorder; 202 reorder = &next_queue->reorder;
229 203
230 if (!list_empty(&reorder->list)) { 204 if (!list_empty(&reorder->list)) {
231 padata = list_entry(reorder->list.next, 205 padata = list_entry(reorder->list.next,
232 struct padata_priv, list); 206 struct padata_priv, list);
233 207
234 if (unlikely(next_overrun)) { 208 BUG_ON(next_nr != padata->seq_nr);
235 for_each_cpu(cpu, pd->cpumask) {
236 queue = per_cpu_ptr(pd->queue, cpu);
237 atomic_set(&queue->num_obj, 0);
238 }
239 }
240 209
241 spin_lock(&reorder->lock); 210 spin_lock(&reorder->lock);
242 list_del_init(&padata->list); 211 list_del_init(&padata->list);
243 atomic_dec(&pd->reorder_objects); 212 atomic_dec(&pd->reorder_objects);
244 spin_unlock(&reorder->lock); 213 spin_unlock(&reorder->lock);
245 214
246 atomic_inc(&next_queue->num_obj); 215 pd->processed++;
247 216
248 goto out; 217 goto out;
249 } 218 }
250 219
251 queue = per_cpu_ptr(pd->queue, smp_processor_id()); 220 queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
252 if (queue->cpu_index == next_queue->cpu_index) { 221 if (queue->cpu_index == next_queue->cpu_index) {
253 padata = ERR_PTR(-ENODATA); 222 padata = ERR_PTR(-ENODATA);
254 goto out; 223 goto out;
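
The rewritten padata_get_next() above drops the per-queue num_obj counters and the scan over every reorder queue: because sequence numbers are dealt out round-robin over the parallel cpumask, the queue holding the next in-order object is simply pd->processed modulo the number of CPUs in use (with a wrap at max_seq_nr). The userspace snippet below only demonstrates that index calculation; it is illustrative, not kernel code.

/* Round-robin mapping from "objects processed so far" to the reorder
 * queue that must hold the next in-order object. */
#include <stdio.h>

int main(void)
{
	unsigned int num_cpus = 4;	/* weight of the parallel cpumask */
	unsigned int processed;

	for (processed = 0; processed < 8; processed++)
		printf("object #%u -> reorder queue index %u\n",
		       processed, processed % num_cpus);
	return 0;
}
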
@@ -262,7 +231,7 @@ out:
262static void padata_reorder(struct parallel_data *pd) 231static void padata_reorder(struct parallel_data *pd)
263{ 232{
264 struct padata_priv *padata; 233 struct padata_priv *padata;
265 struct padata_queue *queue; 234 struct padata_serial_queue *squeue;
266 struct padata_instance *pinst = pd->pinst; 235 struct padata_instance *pinst = pd->pinst;
267 236
268 /* 237 /*
@@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd)
301 return; 270 return;
302 } 271 }
303 272
304 queue = per_cpu_ptr(pd->queue, padata->cb_cpu); 273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
305 274
306 spin_lock(&queue->serial.lock); 275 spin_lock(&squeue->serial.lock);
307 list_add_tail(&padata->list, &queue->serial.list); 276 list_add_tail(&padata->list, &squeue->serial.list);
308 spin_unlock(&queue->serial.lock); 277 spin_unlock(&squeue->serial.lock);
309 278
310 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); 279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
311 } 280 }
312 281
313 spin_unlock_bh(&pd->lock); 282 spin_unlock_bh(&pd->lock);
@@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg)
333 padata_reorder(pd); 302 padata_reorder(pd);
334} 303}
335 304
336static void padata_serial_worker(struct work_struct *work) 305static void padata_serial_worker(struct work_struct *serial_work)
337{ 306{
338 struct padata_queue *queue; 307 struct padata_serial_queue *squeue;
339 struct parallel_data *pd; 308 struct parallel_data *pd;
340 LIST_HEAD(local_list); 309 LIST_HEAD(local_list);
341 310
342 local_bh_disable(); 311 local_bh_disable();
343 queue = container_of(work, struct padata_queue, swork); 312 squeue = container_of(serial_work, struct padata_serial_queue, work);
344 pd = queue->pd; 313 pd = squeue->pd;
345 314
346 spin_lock(&queue->serial.lock); 315 spin_lock(&squeue->serial.lock);
347 list_replace_init(&queue->serial.list, &local_list); 316 list_replace_init(&squeue->serial.list, &local_list);
348 spin_unlock(&queue->serial.lock); 317 spin_unlock(&squeue->serial.lock);
349 318
350 while (!list_empty(&local_list)) { 319 while (!list_empty(&local_list)) {
351 struct padata_priv *padata; 320 struct padata_priv *padata;
@@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work)
372void padata_do_serial(struct padata_priv *padata) 341void padata_do_serial(struct padata_priv *padata)
373{ 342{
374 int cpu; 343 int cpu;
375 struct padata_queue *queue; 344 struct padata_parallel_queue *pqueue;
376 struct parallel_data *pd; 345 struct parallel_data *pd;
377 346
378 pd = padata->pd; 347 pd = padata->pd;
379 348
380 cpu = get_cpu(); 349 cpu = get_cpu();
381 queue = per_cpu_ptr(pd->queue, cpu); 350 pqueue = per_cpu_ptr(pd->pqueue, cpu);
382 351
383 spin_lock(&queue->reorder.lock); 352 spin_lock(&pqueue->reorder.lock);
384 atomic_inc(&pd->reorder_objects); 353 atomic_inc(&pd->reorder_objects);
385 list_add_tail(&padata->list, &queue->reorder.list); 354 list_add_tail(&padata->list, &pqueue->reorder.list);
386 spin_unlock(&queue->reorder.lock); 355 spin_unlock(&pqueue->reorder.lock);
387 356
388 put_cpu(); 357 put_cpu();
389 358
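
For context, the two entry points updated above are used as a pair: a client embeds struct padata_priv in its request, submits it with padata_do_parallel(), does the heavy work in its parallel callback, and hands the object back with padata_do_serial() so completion runs in submission order on cb_cpu. The sketch below assumes the ->parallel and ->serial callback members of struct padata_priv, which are not visible in this hunk; treat it as an illustration rather than part of the patch.

#include <linux/kernel.h>
#include <linux/padata.h>

struct my_request {
	struct padata_priv padata;	/* recovered via container_of() in the callbacks */
	/* ... caller-private state ... */
};

/* Runs with BHs off on a CPU from the parallel (pcpu) cpumask. */
static void my_parallel(struct padata_priv *padata)
{
	struct my_request *req = container_of(padata, struct my_request, padata);

	pr_debug("processing request %p\n", req);	/* stand-in for the real work */
	padata_do_serial(padata);			/* queue for in-order completion */
}

/* Runs on the requested cb_cpu, in the original submission order. */
static void my_serial(struct padata_priv *padata)
{
	/* ... complete the request ... */
}

static int my_submit(struct padata_instance *pinst,
		     struct my_request *req, int cb_cpu)
{
	req->padata.parallel = my_parallel;
	req->padata.serial = my_serial;
	/* cb_cpu must lie in the serial (cbcpu) cpumask, as checked above. */
	return padata_do_parallel(pinst, &req->padata, cb_cpu);
}
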
@@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata)
391} 360}
392EXPORT_SYMBOL(padata_do_serial); 361EXPORT_SYMBOL(padata_do_serial);
393 362
394/* Allocate and initialize the internal cpumask dependent resources. */ 363static int padata_setup_cpumasks(struct parallel_data *pd,
395static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, 364 const struct cpumask *pcpumask,
396 const struct cpumask *cpumask) 365 const struct cpumask *cbcpumask)
397{ 366{
398 int cpu, cpu_index, num_cpus; 367 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
399 struct padata_queue *queue; 368 return -ENOMEM;
400 struct parallel_data *pd;
401
402 cpu_index = 0;
403 369
404 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); 370 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
405 if (!pd) 371 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
406 goto err; 372 free_cpumask_var(pd->cpumask.cbcpu);
373 return -ENOMEM;
374 }
407 375
408 pd->queue = alloc_percpu(struct padata_queue); 376 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
409 if (!pd->queue) 377 return 0;
410 goto err_free_pd; 378}
411 379
412 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) 380static void __padata_list_init(struct padata_list *pd_list)
413 goto err_free_queue; 381{
382 INIT_LIST_HEAD(&pd_list->list);
383 spin_lock_init(&pd_list->lock);
384}
414 385
415 cpumask_and(pd->cpumask, cpumask, cpu_active_mask); 386/* Initialize all percpu queues used by serial workers */
387static void padata_init_squeues(struct parallel_data *pd)
388{
389 int cpu;
390 struct padata_serial_queue *squeue;
416 391
417 for_each_cpu(cpu, pd->cpumask) { 392 for_each_cpu(cpu, pd->cpumask.cbcpu) {
418 queue = per_cpu_ptr(pd->queue, cpu); 393 squeue = per_cpu_ptr(pd->squeue, cpu);
394 squeue->pd = pd;
395 __padata_list_init(&squeue->serial);
396 INIT_WORK(&squeue->work, padata_serial_worker);
397 }
398}
419 399
420 queue->pd = pd; 400/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd)
402{
403 int cpu_index, num_cpus, cpu;
404 struct padata_parallel_queue *pqueue;
421 405
422 queue->cpu_index = cpu_index; 406 cpu_index = 0;
407 for_each_cpu(cpu, pd->cpumask.pcpu) {
408 pqueue = per_cpu_ptr(pd->pqueue, cpu);
409 pqueue->pd = pd;
410 pqueue->cpu_index = cpu_index;
423 cpu_index++; 411 cpu_index++;
424 412
425 INIT_LIST_HEAD(&queue->reorder.list); 413 __padata_list_init(&pqueue->reorder);
426 INIT_LIST_HEAD(&queue->parallel.list); 414 __padata_list_init(&pqueue->parallel);
427 INIT_LIST_HEAD(&queue->serial.list); 415 INIT_WORK(&pqueue->work, padata_parallel_worker);
428 spin_lock_init(&queue->reorder.lock); 416 atomic_set(&pqueue->num_obj, 0);
429 spin_lock_init(&queue->parallel.lock);
430 spin_lock_init(&queue->serial.lock);
431
432 INIT_WORK(&queue->pwork, padata_parallel_worker);
433 INIT_WORK(&queue->swork, padata_serial_worker);
434 atomic_set(&queue->num_obj, 0);
435 } 417 }
436 418
437 num_cpus = cpumask_weight(pd->cpumask); 419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
438 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; 420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421}
422
423/* Allocate and initialize the internal cpumask dependent resources. */
424static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
425 const struct cpumask *pcpumask,
426 const struct cpumask *cbcpumask)
427{
428 struct parallel_data *pd;
439 429
430 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
431 if (!pd)
432 goto err;
433
434 pd->pqueue = alloc_percpu(struct padata_parallel_queue);
435 if (!pd->pqueue)
436 goto err_free_pd;
437
438 pd->squeue = alloc_percpu(struct padata_serial_queue);
439 if (!pd->squeue)
440 goto err_free_pqueue;
441 if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
442 goto err_free_squeue;
443
444 padata_init_pqueues(pd);
445 padata_init_squeues(pd);
440 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
441 atomic_set(&pd->seq_nr, -1); 447 atomic_set(&pd->seq_nr, -1);
442 atomic_set(&pd->reorder_objects, 0); 448 atomic_set(&pd->reorder_objects, 0);
@@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
446 452
447 return pd; 453 return pd;
448 454
449err_free_queue: 455err_free_squeue:
450 free_percpu(pd->queue); 456 free_percpu(pd->squeue);
457err_free_pqueue:
458 free_percpu(pd->pqueue);
451err_free_pd: 459err_free_pd:
452 kfree(pd); 460 kfree(pd);
453err: 461err:
@@ -456,8 +464,10 @@ err:
456 464
457static void padata_free_pd(struct parallel_data *pd) 465static void padata_free_pd(struct parallel_data *pd)
458{ 466{
459 free_cpumask_var(pd->cpumask); 467 free_cpumask_var(pd->cpumask.pcpu);
460 free_percpu(pd->queue); 468 free_cpumask_var(pd->cpumask.cbcpu);
469 free_percpu(pd->pqueue);
470 free_percpu(pd->squeue);
461 kfree(pd); 471 kfree(pd);
462} 472}
463 473
@@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd)
465static void padata_flush_queues(struct parallel_data *pd) 475static void padata_flush_queues(struct parallel_data *pd)
466{ 476{
467 int cpu; 477 int cpu;
468 struct padata_queue *queue; 478 struct padata_parallel_queue *pqueue;
479 struct padata_serial_queue *squeue;
469 480
470 for_each_cpu(cpu, pd->cpumask) { 481 for_each_cpu(cpu, pd->cpumask.pcpu) {
471 queue = per_cpu_ptr(pd->queue, cpu); 482 pqueue = per_cpu_ptr(pd->pqueue, cpu);
472 flush_work(&queue->pwork); 483 flush_work(&pqueue->work);
473 } 484 }
474 485
475 del_timer_sync(&pd->timer); 486 del_timer_sync(&pd->timer);
@@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd)
477 if (atomic_read(&pd->reorder_objects)) 488 if (atomic_read(&pd->reorder_objects))
478 padata_reorder(pd); 489 padata_reorder(pd);
479 490
480 for_each_cpu(cpu, pd->cpumask) { 491 for_each_cpu(cpu, pd->cpumask.cbcpu) {
481 queue = per_cpu_ptr(pd->queue, cpu); 492 squeue = per_cpu_ptr(pd->squeue, cpu);
482 flush_work(&queue->swork); 493 flush_work(&squeue->work);
483 } 494 }
484 495
485 BUG_ON(atomic_read(&pd->refcnt) != 0); 496 BUG_ON(atomic_read(&pd->refcnt) != 0);
486} 497}
487 498
499static void __padata_start(struct padata_instance *pinst)
500{
501 pinst->flags |= PADATA_INIT;
502}
503
504static void __padata_stop(struct padata_instance *pinst)
505{
506 if (!(pinst->flags & PADATA_INIT))
507 return;
508
509 pinst->flags &= ~PADATA_INIT;
510
511 synchronize_rcu();
512
513 get_online_cpus();
514 padata_flush_queues(pinst->pd);
515 put_online_cpus();
516}
517
488/* Replace the internal control structure with a new one. */ 518/* Replace the internal control structure with a new one. */
489static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
490 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
491{ 521{
492 struct parallel_data *pd_old = pinst->pd; 522 struct parallel_data *pd_old = pinst->pd;
523 int notification_mask = 0;
493 524
494 pinst->flags |= PADATA_RESET; 525 pinst->flags |= PADATA_RESET;
495 526
@@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst,
497 528
498 synchronize_rcu(); 529 synchronize_rcu();
499 530
531 if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
532 notification_mask |= PADATA_CPU_PARALLEL;
533 if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
534 notification_mask |= PADATA_CPU_SERIAL;
535
500 padata_flush_queues(pd_old); 536 padata_flush_queues(pd_old);
501 padata_free_pd(pd_old); 537 padata_free_pd(pd_old);
502 538
539 if (notification_mask)
540 blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
541 notification_mask,
542 &pd_new->cpumask);
543
503 pinst->flags &= ~PADATA_RESET; 544 pinst->flags &= ~PADATA_RESET;
504} 545}
505 546
506/** 547/**
507 * padata_set_cpumask - set the cpumask that padata should use 548 * padata_register_cpumask_notifier - Registers a notifier that will be called
549 * if either pcpu or cbcpu or both cpumasks change.
508 * 550 *
509 * @pinst: padata instance 551 * @pinst: A pointer to padata instance
510 * @cpumask: the cpumask to use 552 * @nblock: A pointer to notifier block.
511 */ 553 */
512int padata_set_cpumask(struct padata_instance *pinst, 554int padata_register_cpumask_notifier(struct padata_instance *pinst,
513 cpumask_var_t cpumask) 555 struct notifier_block *nblock)
514{ 556{
557 return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
558 nblock);
559}
560EXPORT_SYMBOL(padata_register_cpumask_notifier);
561
562/**
563 * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
564 * registered earlier using padata_register_cpumask_notifier
565 *
566 * @pinst: A pointer to the padata instance.
567 * @nblock: A pointer to the notifier block.
568 */
569int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
570 struct notifier_block *nblock)
571{
572 return blocking_notifier_chain_unregister(
573 &pinst->cpumask_change_notifier,
574 nblock);
575}
576EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
577
578
579/* If cpumask contains no active cpu, we mark the instance as invalid. */
580static bool padata_validate_cpumask(struct padata_instance *pinst,
581 const struct cpumask *cpumask)
582{
583 if (!cpumask_intersects(cpumask, cpu_active_mask)) {
584 pinst->flags |= PADATA_INVALID;
585 return false;
586 }
587
588 pinst->flags &= ~PADATA_INVALID;
589 return true;
590}
591
592static int __padata_set_cpumasks(struct padata_instance *pinst,
593 cpumask_var_t pcpumask,
594 cpumask_var_t cbcpumask)
595{
596 int valid;
515 struct parallel_data *pd; 597 struct parallel_data *pd;
516 int err = 0; 598
599 valid = padata_validate_cpumask(pinst, pcpumask);
600 if (!valid) {
601 __padata_stop(pinst);
602 goto out_replace;
603 }
604
605 valid = padata_validate_cpumask(pinst, cbcpumask);
606 if (!valid)
607 __padata_stop(pinst);
608
609out_replace:
610 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
611 if (!pd)
612 return -ENOMEM;
613
614 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
615 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
616
617 padata_replace(pinst, pd);
618
619 if (valid)
620 __padata_start(pinst);
621
622 return 0;
623}
624
625/**
626 * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
627 * one is used by parallel workers and the second one
628 * by the workers doing serialization.
629 *
630 * @pinst: padata instance
631 * @pcpumask: the cpumask to use for parallel workers
632 * @cbcpumask: the cpumask to use for serial workers
633 */
634int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
635 cpumask_var_t cbcpumask)
636{
637 int err;
517 638
518 mutex_lock(&pinst->lock); 639 mutex_lock(&pinst->lock);
640 get_online_cpus();
519 641
642 err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
643
644 put_online_cpus();
645 mutex_unlock(&pinst->lock);
646
647 return err;
648
649}
650EXPORT_SYMBOL(padata_set_cpumasks);
651
652/**
653 * padata_set_cpumask: Set the cpumask specified by @cpumask_type to the
654 * value of @cpumask.
655 *
656 * @pinst: padata instance
657 * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
658 * to parallel and serial cpumasks respectively.
659 * @cpumask: the cpumask to use
660 */
661int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
662 cpumask_var_t cpumask)
663{
664 struct cpumask *serial_mask, *parallel_mask;
665 int err = -EINVAL;
666
667 mutex_lock(&pinst->lock);
520 get_online_cpus(); 668 get_online_cpus();
521 669
522 pd = padata_alloc_pd(pinst, cpumask); 670 switch (cpumask_type) {
523 if (!pd) { 671 case PADATA_CPU_PARALLEL:
524 err = -ENOMEM; 672 serial_mask = pinst->cpumask.cbcpu;
525 goto out; 673 parallel_mask = cpumask;
674 break;
675 case PADATA_CPU_SERIAL:
676 parallel_mask = pinst->cpumask.pcpu;
677 serial_mask = cpumask;
678 break;
679 default:
680 goto out;
526 } 681 }
527 682
528 cpumask_copy(pinst->cpumask, cpumask); 683 err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
529
530 padata_replace(pinst, pd);
531 684
532out: 685out:
533 put_online_cpus(); 686 put_online_cpus();
534
535 mutex_unlock(&pinst->lock); 687 mutex_unlock(&pinst->lock);
536 688
537 return err; 689 return err;
@@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
543 struct parallel_data *pd; 695 struct parallel_data *pd;
544 696
545 if (cpumask_test_cpu(cpu, cpu_active_mask)) { 697 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
546 pd = padata_alloc_pd(pinst, pinst->cpumask); 698 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
699 pinst->cpumask.cbcpu);
547 if (!pd) 700 if (!pd)
548 return -ENOMEM; 701 return -ENOMEM;
549 702
550 padata_replace(pinst, pd); 703 padata_replace(pinst, pd);
704
705 if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
706 padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
707 __padata_start(pinst);
551 } 708 }
552 709
553 return 0; 710 return 0;
554} 711}
555 712
556/** 713/**
557 * padata_add_cpu - add a cpu to the padata cpumask 714 * padata_add_cpu - add a cpu to one or both (parallel and serial)
715 * padata cpumasks.
558 * 716 *
559 * @pinst: padata instance 717 * @pinst: padata instance
560 * @cpu: cpu to add 718 * @cpu: cpu to add
719 * @mask: bitmask of flags specifying to which cpumask @cpu should be added.
720 * The @mask may be any combination of the following flags:
721 * PADATA_CPU_SERIAL - serial cpumask
722 * PADATA_CPU_PARALLEL - parallel cpumask
561 */ 723 */
562int padata_add_cpu(struct padata_instance *pinst, int cpu) 724
725int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
563{ 726{
564 int err; 727 int err;
565 728
729 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
730 return -EINVAL;
731
566 mutex_lock(&pinst->lock); 732 mutex_lock(&pinst->lock);
567 733
568 get_online_cpus(); 734 get_online_cpus();
569 cpumask_set_cpu(cpu, pinst->cpumask); 735 if (mask & PADATA_CPU_SERIAL)
736 cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
737 if (mask & PADATA_CPU_PARALLEL)
738 cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
739
570 err = __padata_add_cpu(pinst, cpu); 740 err = __padata_add_cpu(pinst, cpu);
571 put_online_cpus(); 741 put_online_cpus();
572 742
@@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu);
578 748
579static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) 749static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
580{ 750{
581 struct parallel_data *pd; 751 struct parallel_data *pd = NULL;
582 752
583 if (cpumask_test_cpu(cpu, cpu_online_mask)) { 753 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
584 pd = padata_alloc_pd(pinst, pinst->cpumask); 754
755 if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
756 !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
757 __padata_stop(pinst);
758
759 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
760 pinst->cpumask.cbcpu);
585 if (!pd) 761 if (!pd)
586 return -ENOMEM; 762 return -ENOMEM;
587 763
@@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
591 return 0; 767 return 0;
592} 768}
593 769
594/** 770/**
595 * padata_remove_cpu - remove a cpu from the padata cpumask 771 * padata_remove_cpu - remove a cpu from one or both (serial and parallel)
772 * padata cpumasks.
596 * 773 *
597 * @pinst: padata instance 774 * @pinst: padata instance
598 * @cpu: cpu to remove 775 * @cpu: cpu to remove
776 * @mask: bitmask specifying from which cpumask @cpu should be removed
777 * The @mask may be any combination of the following flags:
778 * PADATA_CPU_SERIAL - serial cpumask
779 * PADATA_CPU_PARALLEL - parallel cpumask
599 */ 780 */
600int padata_remove_cpu(struct padata_instance *pinst, int cpu) 781int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
601{ 782{
602 int err; 783 int err;
603 784
785 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
786 return -EINVAL;
787
604 mutex_lock(&pinst->lock); 788 mutex_lock(&pinst->lock);
605 789
606 get_online_cpus(); 790 get_online_cpus();
607 cpumask_clear_cpu(cpu, pinst->cpumask); 791 if (mask & PADATA_CPU_SERIAL)
792 cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
793 if (mask & PADATA_CPU_PARALLEL)
794 cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
795
608 err = __padata_remove_cpu(pinst, cpu); 796 err = __padata_remove_cpu(pinst, cpu);
609 put_online_cpus(); 797 put_online_cpus();
610 798
@@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu);
619 * 807 *
620 * @pinst: padata instance to start 808 * @pinst: padata instance to start
621 */ 809 */
622void padata_start(struct padata_instance *pinst) 810int padata_start(struct padata_instance *pinst)
623{ 811{
812 int err = 0;
813
624 mutex_lock(&pinst->lock); 814 mutex_lock(&pinst->lock);
625 pinst->flags |= PADATA_INIT; 815
816 if (pinst->flags & PADATA_INVALID)
817 err = -EINVAL;
818
819 __padata_start(pinst);
820
626 mutex_unlock(&pinst->lock); 821 mutex_unlock(&pinst->lock);
822
823 return err;
627} 824}
628EXPORT_SYMBOL(padata_start); 825EXPORT_SYMBOL(padata_start);
629 826
@@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start);
635void padata_stop(struct padata_instance *pinst) 832void padata_stop(struct padata_instance *pinst)
636{ 833{
637 mutex_lock(&pinst->lock); 834 mutex_lock(&pinst->lock);
638 pinst->flags &= ~PADATA_INIT; 835 __padata_stop(pinst);
639 mutex_unlock(&pinst->lock); 836 mutex_unlock(&pinst->lock);
640} 837}
641EXPORT_SYMBOL(padata_stop); 838EXPORT_SYMBOL(padata_stop);
642 839
643#ifdef CONFIG_HOTPLUG_CPU 840#ifdef CONFIG_HOTPLUG_CPU
841
842static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
843{
844 return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
845 cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
846}
847
848
644static int padata_cpu_callback(struct notifier_block *nfb, 849static int padata_cpu_callback(struct notifier_block *nfb,
645 unsigned long action, void *hcpu) 850 unsigned long action, void *hcpu)
646{ 851{
@@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
653 switch (action) { 858 switch (action) {
654 case CPU_ONLINE: 859 case CPU_ONLINE:
655 case CPU_ONLINE_FROZEN: 860 case CPU_ONLINE_FROZEN:
656 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 861 if (!pinst_has_cpu(pinst, cpu))
657 break; 862 break;
658 mutex_lock(&pinst->lock); 863 mutex_lock(&pinst->lock);
659 err = __padata_add_cpu(pinst, cpu); 864 err = __padata_add_cpu(pinst, cpu);
@@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
664 869
665 case CPU_DOWN_PREPARE: 870 case CPU_DOWN_PREPARE:
666 case CPU_DOWN_PREPARE_FROZEN: 871 case CPU_DOWN_PREPARE_FROZEN:
667 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 872 if (!pinst_has_cpu(pinst, cpu))
668 break; 873 break;
669 mutex_lock(&pinst->lock); 874 mutex_lock(&pinst->lock);
670 err = __padata_remove_cpu(pinst, cpu); 875 err = __padata_remove_cpu(pinst, cpu);
@@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
675 880
676 case CPU_UP_CANCELED: 881 case CPU_UP_CANCELED:
677 case CPU_UP_CANCELED_FROZEN: 882 case CPU_UP_CANCELED_FROZEN:
678 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 883 if (!pinst_has_cpu(pinst, cpu))
679 break; 884 break;
680 mutex_lock(&pinst->lock); 885 mutex_lock(&pinst->lock);
681 __padata_remove_cpu(pinst, cpu); 886 __padata_remove_cpu(pinst, cpu);
@@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
683 888
684 case CPU_DOWN_FAILED: 889 case CPU_DOWN_FAILED:
685 case CPU_DOWN_FAILED_FROZEN: 890 case CPU_DOWN_FAILED_FROZEN:
686 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 891 if (!pinst_has_cpu(pinst, cpu))
687 break; 892 break;
688 mutex_lock(&pinst->lock); 893 mutex_lock(&pinst->lock);
689 __padata_add_cpu(pinst, cpu); 894 __padata_add_cpu(pinst, cpu);
@@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb,
694} 899}
695#endif 900#endif
696 901
902static void __padata_free(struct padata_instance *pinst)
903{
904#ifdef CONFIG_HOTPLUG_CPU
905 unregister_hotcpu_notifier(&pinst->cpu_notifier);
906#endif
907
908 padata_stop(pinst);
909 padata_free_pd(pinst->pd);
910 free_cpumask_var(pinst->cpumask.pcpu);
911 free_cpumask_var(pinst->cpumask.cbcpu);
912 kfree(pinst);
913}
914
915#define kobj2pinst(_kobj) \
916 container_of(_kobj, struct padata_instance, kobj)
917#define attr2pentry(_attr) \
918 container_of(_attr, struct padata_sysfs_entry, attr)
919
920static void padata_sysfs_release(struct kobject *kobj)
921{
922 struct padata_instance *pinst = kobj2pinst(kobj);
923 __padata_free(pinst);
924}
925
926struct padata_sysfs_entry {
927 struct attribute attr;
928 ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
929 ssize_t (*store)(struct padata_instance *, struct attribute *,
930 const char *, size_t);
931};
932
933static ssize_t show_cpumask(struct padata_instance *pinst,
934 struct attribute *attr, char *buf)
935{
936 struct cpumask *cpumask;
937 ssize_t len;
938
939 mutex_lock(&pinst->lock);
940 if (!strcmp(attr->name, "serial_cpumask"))
941 cpumask = pinst->cpumask.cbcpu;
942 else
943 cpumask = pinst->cpumask.pcpu;
944
945 len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
946 nr_cpu_ids);
947 if (PAGE_SIZE - len < 2)
948 len = -EINVAL;
949 else
950 len += sprintf(buf + len, "\n");
951
952 mutex_unlock(&pinst->lock);
953 return len;
954}
955
956static ssize_t store_cpumask(struct padata_instance *pinst,
957 struct attribute *attr,
958 const char *buf, size_t count)
959{
960 cpumask_var_t new_cpumask;
961 ssize_t ret;
962 int mask_type;
963
964 if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
965 return -ENOMEM;
966
967 ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
968 nr_cpumask_bits);
969 if (ret < 0)
970 goto out;
971
972 mask_type = !strcmp(attr->name, "serial_cpumask") ?
973 PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
974 ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
975 if (!ret)
976 ret = count;
977
978out:
979 free_cpumask_var(new_cpumask);
980 return ret;
981}
982
983#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
984 static struct padata_sysfs_entry _name##_attr = \
985 __ATTR(_name, 0644, _show_name, _store_name)
986#define PADATA_ATTR_RO(_name, _show_name) \
987 static struct padata_sysfs_entry _name##_attr = \
988 __ATTR(_name, 0400, _show_name, NULL)
989
990PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
991PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
992
993/*
994 * Padata sysfs provides the following objects:
995 * serial_cpumask [RW] - cpumask for serial workers
996 * parallel_cpumask [RW] - cpumask for parallel workers
997 */
998static struct attribute *padata_default_attrs[] = {
999 &serial_cpumask_attr.attr,
1000 &parallel_cpumask_attr.attr,
1001 NULL,
1002};
1003
1004static ssize_t padata_sysfs_show(struct kobject *kobj,
1005 struct attribute *attr, char *buf)
1006{
1007 struct padata_instance *pinst;
1008 struct padata_sysfs_entry *pentry;
1009 ssize_t ret = -EIO;
1010
1011 pinst = kobj2pinst(kobj);
1012 pentry = attr2pentry(attr);
1013 if (pentry->show)
1014 ret = pentry->show(pinst, attr, buf);
1015
1016 return ret;
1017}
1018
1019static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
1020 const char *buf, size_t count)
1021{
1022 struct padata_instance *pinst;
1023 struct padata_sysfs_entry *pentry;
1024 ssize_t ret = -EIO;
1025
1026 pinst = kobj2pinst(kobj);
1027 pentry = attr2pentry(attr);
1028 if (pentry->store)
1029 ret = pentry->store(pinst, attr, buf, count);
1030
1031 return ret;
1032}
1033
1034static const struct sysfs_ops padata_sysfs_ops = {
1035 .show = padata_sysfs_show,
1036 .store = padata_sysfs_store,
1037};
1038
1039static struct kobj_type padata_attr_type = {
1040 .sysfs_ops = &padata_sysfs_ops,
1041 .default_attrs = padata_default_attrs,
1042 .release = padata_sysfs_release,
1043};
1044
697/** 1045/**
698 * padata_alloc - allocate and initialize a padata instance 1046 * padata_alloc_possible - Allocate and initialize padata instance.
1047 * Use the cpu_possible_mask for serial and
1048 * parallel workers.
699 * 1049 *
700 * @cpumask: cpumask that padata uses for parallelization
701 * @wq: workqueue to use for the allocated padata instance 1050 * @wq: workqueue to use for the allocated padata instance
702 */ 1051 */
703struct padata_instance *padata_alloc(const struct cpumask *cpumask, 1052struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
704 struct workqueue_struct *wq) 1053{
1054 return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
1055}
1056EXPORT_SYMBOL(padata_alloc_possible);
1057
1058/**
1059 * padata_alloc - allocate and initialize a padata instance and specify
1060 * cpumasks for serial and parallel workers.
1061 *
1062 * @wq: workqueue to use for the allocated padata instance
1063 * @pcpumask: cpumask that will be used for padata parallelization
1064 * @cbcpumask: cpumask that will be used for padata serialization
1065 */
1066struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1067 const struct cpumask *pcpumask,
1068 const struct cpumask *cbcpumask)
705{ 1069{
706 struct padata_instance *pinst; 1070 struct padata_instance *pinst;
707 struct parallel_data *pd; 1071 struct parallel_data *pd = NULL;
708 1072
709 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); 1073 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
710 if (!pinst) 1074 if (!pinst)
711 goto err; 1075 goto err;
712 1076
713 get_online_cpus(); 1077 get_online_cpus();
714 1078 if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
715 pd = padata_alloc_pd(pinst, cpumask);
716 if (!pd)
717 goto err_free_inst; 1079 goto err_free_inst;
1080 if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
1081 free_cpumask_var(pinst->cpumask.pcpu);
1082 goto err_free_inst;
1083 }
1084 if (!padata_validate_cpumask(pinst, pcpumask) ||
1085 !padata_validate_cpumask(pinst, cbcpumask))
1086 goto err_free_masks;
718 1087
719 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) 1088 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
720 goto err_free_pd; 1089 if (!pd)
1090 goto err_free_masks;
721 1091
722 rcu_assign_pointer(pinst->pd, pd); 1092 rcu_assign_pointer(pinst->pd, pd);
723 1093
724 pinst->wq = wq; 1094 pinst->wq = wq;
725 1095
726 cpumask_copy(pinst->cpumask, cpumask); 1096 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
1097 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
727 1098
728 pinst->flags = 0; 1099 pinst->flags = 0;
729 1100
@@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
735 1106
736 put_online_cpus(); 1107 put_online_cpus();
737 1108
1109 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
1110 kobject_init(&pinst->kobj, &padata_attr_type);
738 mutex_init(&pinst->lock); 1111 mutex_init(&pinst->lock);
739 1112
740 return pinst; 1113 return pinst;
741 1114
742err_free_pd: 1115err_free_masks:
743 padata_free_pd(pd); 1116 free_cpumask_var(pinst->cpumask.pcpu);
1117 free_cpumask_var(pinst->cpumask.cbcpu);
744err_free_inst: 1118err_free_inst:
745 kfree(pinst); 1119 kfree(pinst);
746 put_online_cpus(); 1120 put_online_cpus();
@@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc);
756 */ 1130 */
757void padata_free(struct padata_instance *pinst) 1131void padata_free(struct padata_instance *pinst)
758{ 1132{
759 padata_stop(pinst); 1133 kobject_put(&pinst->kobj);
760
761 synchronize_rcu();
762
763#ifdef CONFIG_HOTPLUG_CPU
764 unregister_hotcpu_notifier(&pinst->cpu_notifier);
765#endif
766 get_online_cpus();
767 padata_flush_queues(pinst->pd);
768 put_online_cpus();
769
770 padata_free_pd(pinst->pd);
771 free_cpumask_var(pinst->cpumask);
772 kfree(pinst);
773} 1134}
774EXPORT_SYMBOL(padata_free); 1135EXPORT_SYMBOL(padata_free);
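
To make the reworked padata interface above concrete, here is a minimal usage sketch against the post-patch API: separate parallel/serial cpumasks at allocation time, padata_set_cpumask() with a PADATA_CPU_* selector, padata_start() returning an error for invalid masks, and the cpumask-change notifier. The workqueue name, module names and the notifier body are illustrative assumptions, not taken from this patch.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/padata.h>

static struct padata_instance *pinst;
static struct workqueue_struct *wq;

/* Called with PADATA_CPU_PARALLEL and/or PADATA_CPU_SERIAL in 'mask'
 * whenever padata_replace() installs a parallel_data with new cpumasks. */
static int my_cpumask_change(struct notifier_block *nb,
			     unsigned long mask, void *data)
{
	pr_info("padata cpumasks changed, mask=0x%lx\n", mask);
	return NOTIFY_OK;
}

static struct notifier_block my_nb = {
	.notifier_call = my_cpumask_change,
};

static int __init my_init(void)
{
	cpumask_var_t pmask;
	int err;

	wq = create_workqueue("my_padata_wq");	/* name is made up */
	if (!wq)
		return -ENOMEM;

	/* Both worker cpumasks default to cpu_possible_mask. */
	pinst = padata_alloc_possible(wq);
	if (!pinst) {
		destroy_workqueue(wq);
		return -ENOMEM;
	}

	padata_register_cpumask_notifier(pinst, &my_nb);

	err = padata_start(pinst);	/* -EINVAL if the instance is marked invalid */
	if (err)
		goto out_free;

	/* Example of narrowing the parallel workers to CPU 0 at runtime. */
	if (alloc_cpumask_var(&pmask, GFP_KERNEL)) {
		cpumask_clear(pmask);
		cpumask_set_cpu(0, pmask);
		padata_set_cpumask(pinst, PADATA_CPU_PARALLEL, pmask);
		free_cpumask_var(pmask);
	}
	return 0;

out_free:
	padata_unregister_cpumask_notifier(pinst, &my_nb);
	padata_free(pinst);
	destroy_workqueue(wq);
	return err;
}

static void __exit my_exit(void)
{
	padata_stop(pinst);
	padata_unregister_cpumask_notifier(pinst, &my_nb);
	padata_free(pinst);
	destroy_workqueue(wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

Note that after this change padata_free() only drops the sysfs kobject reference; the actual teardown happens in the kobject release callback shown in the hunk above.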
diff --git a/kernel/panic.c b/kernel/panic.c
index 3b16cd93fa7d..4c13b1a88ebb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,9 @@
24#include <linux/nmi.h> 24#include <linux/nmi.h>
25#include <linux/dmi.h> 25#include <linux/dmi.h>
26 26
27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18
29
27int panic_on_oops; 30int panic_on_oops;
28static unsigned long tainted_mask; 31static unsigned long tainted_mask;
29static int pause_on_oops; 32static int pause_on_oops;
@@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 39
37EXPORT_SYMBOL(panic_notifier_list); 40EXPORT_SYMBOL(panic_notifier_list);
38 41
39/* Returns how long it waited in ms */ 42static long no_blink(int state)
40long (*panic_blink)(long time);
41EXPORT_SYMBOL(panic_blink);
42
43static void panic_blink_one_second(void)
44{ 43{
45 static long i = 0, end; 44 return 0;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67} 45}
68 46
47/* Returns how long it waited in ms */
48long (*panic_blink)(int state);
49EXPORT_SYMBOL(panic_blink);
50
69/** 51/**
70 * panic - halt the system 52 * panic - halt the system
71 * @fmt: The text string to print 53 * @fmt: The text string to print
@@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...)
78{ 60{
79 static char buf[1024]; 61 static char buf[1024];
80 va_list args; 62 va_list args;
81 long i; 63 long i, i_next = 0;
64 int state = 0;
82 65
83 /* 66 /*
84 * It's possible to come here directly from a panic-assertion and 67 * It's possible to come here directly from a panic-assertion and
@@ -117,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...)
117 100
118 bust_spinlocks(0); 101 bust_spinlocks(0);
119 102
103 if (!panic_blink)
104 panic_blink = no_blink;
105
120 if (panic_timeout > 0) { 106 if (panic_timeout > 0) {
121 /* 107 /*
122 * Delay timeout seconds before rebooting the machine. 108 * Delay timeout seconds before rebooting the machine.
@@ -124,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...)
124 */ 110 */
125 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 111 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
126 112
127 for (i = 0; i < panic_timeout; i++) { 113 for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
128 touch_nmi_watchdog(); 114 touch_nmi_watchdog();
129 panic_blink_one_second(); 115 if (i >= i_next) {
116 i += panic_blink(state ^= 1);
117 i_next = i + 3600 / PANIC_BLINK_SPD;
118 }
119 mdelay(PANIC_TIMER_STEP);
130 } 120 }
131 /* 121 /*
132 * This will not be a clean reboot, with everything 122 * This will not be a clean reboot, with everything
@@ -152,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...)
152 } 142 }
153#endif 143#endif
154 local_irq_enable(); 144 local_irq_enable();
155 while (1) { 145 for (i = 0; ; i += PANIC_TIMER_STEP) {
156 touch_softlockup_watchdog(); 146 touch_softlockup_watchdog();
157 panic_blink_one_second(); 147 if (i >= i_next) {
148 i += panic_blink(state ^= 1);
149 i_next = i + 3600 / PANIC_BLINK_SPD;
150 }
151 mdelay(PANIC_TIMER_STEP);
158 } 152 }
159} 153}
160 154
@@ -344,7 +338,7 @@ static int init_oops_id(void)
344} 338}
345late_initcall(init_oops_id); 339late_initcall(init_oops_id);
346 340
347static void print_oops_end_marker(void) 341void print_oops_end_marker(void)
348{ 342{
349 init_oops_id(); 343 init_oops_id();
350 printk(KERN_WARNING "---[ end trace %016llx ]---\n", 344 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
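
The panic_blink change above alters the hook's contract: it now receives an alternating on/off state and returns the time (in ms) it spent, which panic() credits against its PANIC_TIMER_STEP-paced timeout loop. A minimal sketch of a provider under the new prototype follows; my_board_set_led() is a hypothetical platform helper stubbed out here, not something this patch defines.

#include <linux/kernel.h>	/* declares: long (*panic_blink)(int state); */
#include <linux/init.h>

/* Hypothetical LED helper; a real board would poke a GPIO or LED register. */
static void my_board_set_led(int on)
{
}

/* New contract: 'state' toggles 0/1 roughly PANIC_BLINK_SPD times per
 * second, and the return value is how many ms the hook itself consumed. */
static long my_panic_blink(int state)
{
	my_board_set_led(state);
	return 0;	/* we burned no measurable time */
}

static int __init my_blink_init(void)
{
	panic_blink = my_panic_blink;
	return 0;
}
late_initcall(my_blink_init);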
diff --git a/kernel/params.c b/kernel/params.c
index 0b30ecd53a52..08107d181758 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,6 +31,42 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock);
36
37/* This just allows us to keep track of which parameters are kmalloced. */
38struct kmalloced_param {
39 struct list_head list;
40 char val[];
41};
42static LIST_HEAD(kmalloced_params);
43
44static void *kmalloc_parameter(unsigned int size)
45{
46 struct kmalloced_param *p;
47
48 p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
49 if (!p)
50 return NULL;
51
52 list_add(&p->list, &kmalloced_params);
53 return p->val;
54}
55
56/* Does nothing if parameter wasn't kmalloced above. */
57static void maybe_kfree_parameter(void *param)
58{
59 struct kmalloced_param *p;
60
61 list_for_each_entry(p, &kmalloced_params, list) {
62 if (p->val == param) {
63 list_del(&p->list);
64 kfree(p);
65 break;
66 }
67 }
68}
69
34static inline char dash2underscore(char c) 70static inline char dash2underscore(char c)
35{ 71{
36 if (c == '-') 72 if (c == '-')
@@ -49,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname)
49 85
50static int parse_one(char *param, 86static int parse_one(char *param,
51 char *val, 87 char *val,
52 struct kernel_param *params, 88 const struct kernel_param *params,
53 unsigned num_params, 89 unsigned num_params,
54 int (*handle_unknown)(char *param, char *val)) 90 int (*handle_unknown)(char *param, char *val))
55{ 91{
56 unsigned int i; 92 unsigned int i;
93 int err;
57 94
58 /* Find parameter */ 95 /* Find parameter */
59 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
60 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
98 /* Noone handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL;
61 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
62 params[i].set); 102 params[i].ops->set);
63 return params[i].set(val, &params[i]); 103 mutex_lock(&param_lock);
104 err = params[i].ops->set(val, &params[i]);
105 mutex_unlock(&param_lock);
106 return err;
64 } 107 }
65 } 108 }
66 109
@@ -128,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val)
128/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 171/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
129int parse_args(const char *name, 172int parse_args(const char *name,
130 char *args, 173 char *args,
131 struct kernel_param *params, 174 const struct kernel_param *params,
132 unsigned num, 175 unsigned num,
133 int (*unknown)(char *param, char *val)) 176 int (*unknown)(char *param, char *val))
134{ 177{
@@ -176,22 +219,29 @@ int parse_args(const char *name,
176 219
177/* Lazy bastard, eh? */ 220/* Lazy bastard, eh? */
178#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ 221#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
179 int param_set_##name(const char *val, struct kernel_param *kp) \ 222 int param_set_##name(const char *val, const struct kernel_param *kp) \
180 { \ 223 { \
181 tmptype l; \ 224 tmptype l; \
182 int ret; \ 225 int ret; \
183 \ 226 \
184 if (!val) return -EINVAL; \
185 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
186 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret == -EINVAL || ((type)l != l)) \
187 return -EINVAL; \ 229 return -EINVAL; \
188 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
189 return 0; \ 231 return 0; \
190 } \ 232 } \
191 int param_get_##name(char *buffer, struct kernel_param *kp) \ 233 int param_get_##name(char *buffer, const struct kernel_param *kp) \
192 { \ 234 { \
193 return sprintf(buffer, format, *((type *)kp->arg)); \ 235 return sprintf(buffer, format, *((type *)kp->arg)); \
194 } 236 } \
237 struct kernel_param_ops param_ops_##name = { \
238 .set = param_set_##name, \
239 .get = param_get_##name, \
240 }; \
241 EXPORT_SYMBOL(param_set_##name); \
242 EXPORT_SYMBOL(param_get_##name); \
243 EXPORT_SYMBOL(param_ops_##name)
244
195 245
196STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); 246STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
197STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); 247STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
@@ -201,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
201STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); 251STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
202STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); 252STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
203 253
204int param_set_charp(const char *val, struct kernel_param *kp) 254int param_set_charp(const char *val, const struct kernel_param *kp)
205{ 255{
206 if (!val) {
207 printk(KERN_ERR "%s: string parameter expected\n",
208 kp->name);
209 return -EINVAL;
210 }
211
212 if (strlen(val) > 1024) { 256 if (strlen(val) > 1024) {
213 printk(KERN_ERR "%s: string parameter too long\n", 257 printk(KERN_ERR "%s: string parameter too long\n",
214 kp->name); 258 kp->name);
215 return -ENOSPC; 259 return -ENOSPC;
216 } 260 }
217 261
218 /* This is a hack. We can't need to strdup in early boot, and we 262 maybe_kfree_parameter(*(char **)kp->arg);
263
264 /* This is a hack. We can't kmalloc in early boot, and we
219 * don't need to; this mangled commandline is preserved. */ 265 * don't need to; this mangled commandline is preserved. */
220 if (slab_is_available()) { 266 if (slab_is_available()) {
221 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 267 *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
222 if (!*(char **)kp->arg) 268 if (!*(char **)kp->arg)
223 return -ENOMEM; 269 return -ENOMEM;
270 strcpy(*(char **)kp->arg, val);
224 } else 271 } else
225 *(const char **)kp->arg = val; 272 *(const char **)kp->arg = val;
226 273
227 return 0; 274 return 0;
228} 275}
276EXPORT_SYMBOL(param_set_charp);
229 277
230int param_get_charp(char *buffer, struct kernel_param *kp) 278int param_get_charp(char *buffer, const struct kernel_param *kp)
231{ 279{
232 return sprintf(buffer, "%s", *((char **)kp->arg)); 280 return sprintf(buffer, "%s", *((char **)kp->arg));
233} 281}
282EXPORT_SYMBOL(param_get_charp);
283
284static void param_free_charp(void *arg)
285{
286 maybe_kfree_parameter(*((char **)arg));
287}
288
289struct kernel_param_ops param_ops_charp = {
290 .set = param_set_charp,
291 .get = param_get_charp,
292 .free = param_free_charp,
293};
294EXPORT_SYMBOL(param_ops_charp);
234 295
235/* Actually could be a bool or an int, for historical reasons. */ 296/* Actually could be a bool or an int, for historical reasons. */
236int param_set_bool(const char *val, struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
237{ 298{
238 bool v; 299 bool v;
239 300
@@ -258,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp)
258 *(int *)kp->arg = v; 319 *(int *)kp->arg = v;
259 return 0; 320 return 0;
260} 321}
322EXPORT_SYMBOL(param_set_bool);
261 323
262int param_get_bool(char *buffer, struct kernel_param *kp) 324int param_get_bool(char *buffer, const struct kernel_param *kp)
263{ 325{
264 bool val; 326 bool val;
265 if (kp->flags & KPARAM_ISBOOL) 327 if (kp->flags & KPARAM_ISBOOL)
@@ -270,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp)
270 /* Y and N chosen as being relatively non-coder friendly */ 332 /* Y and N chosen as being relatively non-coder friendly */
271 return sprintf(buffer, "%c", val ? 'Y' : 'N'); 333 return sprintf(buffer, "%c", val ? 'Y' : 'N');
272} 334}
335EXPORT_SYMBOL(param_get_bool);
336
337struct kernel_param_ops param_ops_bool = {
338 .set = param_set_bool,
339 .get = param_get_bool,
340};
341EXPORT_SYMBOL(param_ops_bool);
273 342
274/* This one must be bool. */ 343/* This one must be bool. */
275int param_set_invbool(const char *val, struct kernel_param *kp) 344int param_set_invbool(const char *val, const struct kernel_param *kp)
276{ 345{
277 int ret; 346 int ret;
278 bool boolval; 347 bool boolval;
@@ -285,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp)
285 *(bool *)kp->arg = !boolval; 354 *(bool *)kp->arg = !boolval;
286 return ret; 355 return ret;
287} 356}
357EXPORT_SYMBOL(param_set_invbool);
288 358
289int param_get_invbool(char *buffer, struct kernel_param *kp) 359int param_get_invbool(char *buffer, const struct kernel_param *kp)
290{ 360{
291 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); 361 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
292} 362}
363EXPORT_SYMBOL(param_get_invbool);
364
365struct kernel_param_ops param_ops_invbool = {
366 .set = param_set_invbool,
367 .get = param_get_invbool,
368};
369EXPORT_SYMBOL(param_ops_invbool);
293 370
294/* We break the rule and mangle the string. */ 371/* We break the rule and mangle the string. */
295static int param_array(const char *name, 372static int param_array(const char *name,
296 const char *val, 373 const char *val,
297 unsigned int min, unsigned int max, 374 unsigned int min, unsigned int max,
298 void *elem, int elemsize, 375 void *elem, int elemsize,
299 int (*set)(const char *, struct kernel_param *kp), 376 int (*set)(const char *, const struct kernel_param *kp),
300 u16 flags, 377 u16 flags,
301 unsigned int *num) 378 unsigned int *num)
302{ 379{
@@ -309,12 +386,6 @@ static int param_array(const char *name,
309 kp.arg = elem; 386 kp.arg = elem;
310 kp.flags = flags; 387 kp.flags = flags;
311 388
312 /* No equals sign? */
313 if (!val) {
314 printk(KERN_ERR "%s: expects arguments\n", name);
315 return -EINVAL;
316 }
317
318 *num = 0; 389 *num = 0;
319 /* We expect a comma-separated list of values. */ 390 /* We expect a comma-separated list of values. */
320 do { 391 do {
@@ -330,6 +401,7 @@ static int param_array(const char *name,
330 /* nul-terminate and parse */ 401 /* nul-terminate and parse */
331 save = val[len]; 402 save = val[len];
332 ((char *)val)[len] = '\0'; 403 ((char *)val)[len] = '\0';
404 BUG_ON(!mutex_is_locked(&param_lock));
333 ret = set(val, &kp); 405 ret = set(val, &kp);
334 406
335 if (ret != 0) 407 if (ret != 0)
@@ -347,17 +419,17 @@ static int param_array(const char *name,
347 return 0; 419 return 0;
348} 420}
349 421
350int param_array_set(const char *val, struct kernel_param *kp) 422static int param_array_set(const char *val, const struct kernel_param *kp)
351{ 423{
352 const struct kparam_array *arr = kp->arr; 424 const struct kparam_array *arr = kp->arr;
353 unsigned int temp_num; 425 unsigned int temp_num;
354 426
355 return param_array(kp->name, val, 1, arr->max, arr->elem, 427 return param_array(kp->name, val, 1, arr->max, arr->elem,
356 arr->elemsize, arr->set, kp->flags, 428 arr->elemsize, arr->ops->set, kp->flags,
357 arr->num ?: &temp_num); 429 arr->num ?: &temp_num);
358} 430}
359 431
360int param_array_get(char *buffer, struct kernel_param *kp) 432static int param_array_get(char *buffer, const struct kernel_param *kp)
361{ 433{
362 int i, off, ret; 434 int i, off, ret;
363 const struct kparam_array *arr = kp->arr; 435 const struct kparam_array *arr = kp->arr;
@@ -368,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp)
368 if (i) 440 if (i)
369 buffer[off++] = ','; 441 buffer[off++] = ',';
370 p.arg = arr->elem + arr->elemsize * i; 442 p.arg = arr->elem + arr->elemsize * i;
371 ret = arr->get(buffer + off, &p); 443 BUG_ON(!mutex_is_locked(&param_lock));
444 ret = arr->ops->get(buffer + off, &p);
372 if (ret < 0) 445 if (ret < 0)
373 return ret; 446 return ret;
374 off += ret; 447 off += ret;
@@ -377,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp)
377 return off; 450 return off;
378} 451}
379 452
380int param_set_copystring(const char *val, struct kernel_param *kp) 453static void param_array_free(void *arg)
454{
455 unsigned int i;
456 const struct kparam_array *arr = arg;
457
458 if (arr->ops->free)
459 for (i = 0; i < (arr->num ? *arr->num : arr->max); i++)
460 arr->ops->free(arr->elem + arr->elemsize * i);
461}
462
463struct kernel_param_ops param_array_ops = {
464 .set = param_array_set,
465 .get = param_array_get,
466 .free = param_array_free,
467};
468EXPORT_SYMBOL(param_array_ops);
469
470int param_set_copystring(const char *val, const struct kernel_param *kp)
381{ 471{
382 const struct kparam_string *kps = kp->str; 472 const struct kparam_string *kps = kp->str;
383 473
384 if (!val) {
385 printk(KERN_ERR "%s: missing param set value\n", kp->name);
386 return -EINVAL;
387 }
388 if (strlen(val)+1 > kps->maxlen) { 474 if (strlen(val)+1 > kps->maxlen) {
389 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", 475 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
390 kp->name, kps->maxlen-1); 476 kp->name, kps->maxlen-1);
@@ -393,12 +479,20 @@ int param_set_copystring(const char *val, struct kernel_param *kp)
393 strcpy(kps->string, val); 479 strcpy(kps->string, val);
394 return 0; 480 return 0;
395} 481}
482EXPORT_SYMBOL(param_set_copystring);
396 483
397int param_get_string(char *buffer, struct kernel_param *kp) 484int param_get_string(char *buffer, const struct kernel_param *kp)
398{ 485{
399 const struct kparam_string *kps = kp->str; 486 const struct kparam_string *kps = kp->str;
400 return strlcpy(buffer, kps->string, kps->maxlen); 487 return strlcpy(buffer, kps->string, kps->maxlen);
401} 488}
489EXPORT_SYMBOL(param_get_string);
490
491struct kernel_param_ops param_ops_string = {
492 .set = param_set_copystring,
493 .get = param_get_string,
494};
495EXPORT_SYMBOL(param_ops_string);
402 496
403/* sysfs output in /sys/modules/XYZ/parameters/ */ 497/* sysfs output in /sys/modules/XYZ/parameters/ */
404#define to_module_attr(n) container_of(n, struct module_attribute, attr) 498#define to_module_attr(n) container_of(n, struct module_attribute, attr)
@@ -409,7 +503,7 @@ extern struct kernel_param __start___param[], __stop___param[];
409struct param_attribute 503struct param_attribute
410{ 504{
411 struct module_attribute mattr; 505 struct module_attribute mattr;
412 struct kernel_param *param; 506 const struct kernel_param *param;
413}; 507};
414 508
415struct module_param_attrs 509struct module_param_attrs
@@ -428,10 +522,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
428 int count; 522 int count;
429 struct param_attribute *attribute = to_param_attr(mattr); 523 struct param_attribute *attribute = to_param_attr(mattr);
430 524
431 if (!attribute->param->get) 525 if (!attribute->param->ops->get)
432 return -EPERM; 526 return -EPERM;
433 527
434 count = attribute->param->get(buf, attribute->param); 528 mutex_lock(&param_lock);
529 count = attribute->param->ops->get(buf, attribute->param);
530 mutex_unlock(&param_lock);
435 if (count > 0) { 531 if (count > 0) {
436 strcat(buf, "\n"); 532 strcat(buf, "\n");
437 ++count; 533 ++count;
@@ -447,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
447 int err; 543 int err;
448 struct param_attribute *attribute = to_param_attr(mattr); 544 struct param_attribute *attribute = to_param_attr(mattr);
449 545
450 if (!attribute->param->set) 546 if (!attribute->param->ops->set)
451 return -EPERM; 547 return -EPERM;
452 548
453 err = attribute->param->set(buf, attribute->param); 549 mutex_lock(&param_lock);
550 err = attribute->param->ops->set(buf, attribute->param);
551 mutex_unlock(&param_lock);
454 if (!err) 552 if (!err)
455 return len; 553 return len;
456 return err; 554 return err;
@@ -464,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
464#endif 562#endif
465 563
466#ifdef CONFIG_SYSFS 564#ifdef CONFIG_SYSFS
565void __kernel_param_lock(void)
566{
567 mutex_lock(&param_lock);
568}
569EXPORT_SYMBOL(__kernel_param_lock);
570
571void __kernel_param_unlock(void)
572{
573 mutex_unlock(&param_lock);
574}
575EXPORT_SYMBOL(__kernel_param_unlock);
576
467/* 577/*
468 * add_sysfs_param - add a parameter to sysfs 578 * add_sysfs_param - add a parameter to sysfs
469 * @mk: struct module_kobject 579 * @mk: struct module_kobject
@@ -475,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
475 * if there's an error. 585 * if there's an error.
476 */ 586 */
477static __modinit int add_sysfs_param(struct module_kobject *mk, 587static __modinit int add_sysfs_param(struct module_kobject *mk,
478 struct kernel_param *kp, 588 const struct kernel_param *kp,
479 const char *name) 589 const char *name)
480{ 590{
481 struct module_param_attrs *new; 591 struct module_param_attrs *new;
@@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk)
557 * /sys/module/[mod->name]/parameters/ 667 * /sys/module/[mod->name]/parameters/
558 */ 668 */
559int module_param_sysfs_setup(struct module *mod, 669int module_param_sysfs_setup(struct module *mod,
560 struct kernel_param *kparam, 670 const struct kernel_param *kparam,
561 unsigned int num_params) 671 unsigned int num_params)
562{ 672{
563 int i, err; 673 int i, err;
@@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod)
602 712
603void destroy_params(const struct kernel_param *params, unsigned num) 713void destroy_params(const struct kernel_param *params, unsigned num)
604{ 714{
605 /* FIXME: This should free kmalloced charp parameters. It doesn't. */ 715 unsigned int i;
716
717 for (i = 0; i < num; i++)
718 if (params[i].ops->free)
719 params[i].ops->free(params[i].arg);
606} 720}
607 721
608static void __init kernel_add_sysfs_param(const char *name, 722static void __init kernel_add_sysfs_param(const char *name,
@@ -768,28 +882,3 @@ static int __init param_sysfs_init(void)
768subsys_initcall(param_sysfs_init); 882subsys_initcall(param_sysfs_init);
769 883
770#endif /* CONFIG_SYSFS */ 884#endif /* CONFIG_SYSFS */
771
772EXPORT_SYMBOL(param_set_byte);
773EXPORT_SYMBOL(param_get_byte);
774EXPORT_SYMBOL(param_set_short);
775EXPORT_SYMBOL(param_get_short);
776EXPORT_SYMBOL(param_set_ushort);
777EXPORT_SYMBOL(param_get_ushort);
778EXPORT_SYMBOL(param_set_int);
779EXPORT_SYMBOL(param_get_int);
780EXPORT_SYMBOL(param_set_uint);
781EXPORT_SYMBOL(param_get_uint);
782EXPORT_SYMBOL(param_set_long);
783EXPORT_SYMBOL(param_get_long);
784EXPORT_SYMBOL(param_set_ulong);
785EXPORT_SYMBOL(param_get_ulong);
786EXPORT_SYMBOL(param_set_charp);
787EXPORT_SYMBOL(param_get_charp);
788EXPORT_SYMBOL(param_set_bool);
789EXPORT_SYMBOL(param_get_bool);
790EXPORT_SYMBOL(param_set_invbool);
791EXPORT_SYMBOL(param_get_invbool);
792EXPORT_SYMBOL(param_array_set);
793EXPORT_SYMBOL(param_array_get);
794EXPORT_SYMBOL(param_set_copystring);
795EXPORT_SYMBOL(param_get_string);
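
The params.c rework above moves every parameter type behind a struct kernel_param_ops (set/get/free) and serializes the handlers under param_lock, which is what finally lets destroy_params() free kmalloced charp values. Below is a hedged sketch of a custom ops implementation against that structure; it assumes a module_param_cb()-style registration macro accompanies this series, and all names are illustrative.

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static unsigned int my_threshold = 10;

/* Custom setter: accept the value only if it stays below 100.
 * Called with param_lock held after this change. */
static int my_set_threshold(const char *val, const struct kernel_param *kp)
{
	unsigned long v;
	int err = strict_strtoul(val, 0, &v);

	if (err || v >= 100)
		return -EINVAL;
	*(unsigned int *)kp->arg = v;
	return 0;
}

static int my_get_threshold(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%u", *(unsigned int *)kp->arg);
}

static struct kernel_param_ops my_threshold_ops = {
	.set = my_set_threshold,
	.get = my_get_threshold,
};

/* Assumed macro: module_param_cb(name, ops, arg, perm). */
module_param_cb(threshold, &my_threshold_ops, &my_threshold, 0644);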
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ff86c558af4c..517d827f4982 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
31#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 33#include <linux/ftrace_event.h>
34#include <linux/hw_breakpoint.h>
35 34
36#include <asm/irq_regs.h> 35#include <asm/irq_regs.h>
37 36
38/* 37atomic_t perf_task_events __read_mostly;
39 * Each CPU has a list of per CPU events:
40 */
41static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
42
43int perf_max_events __read_mostly = 1;
44static int perf_reserved_percpu __read_mostly;
45static int perf_overcommit __read_mostly = 1;
46
47static atomic_t nr_events __read_mostly;
48static atomic_t nr_mmap_events __read_mostly; 38static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 39static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 40static atomic_t nr_task_events __read_mostly;
51 41
42static LIST_HEAD(pmus);
43static DEFINE_MUTEX(pmus_lock);
44static struct srcu_struct pmus_srcu;
45
52/* 46/*
53 * perf event paranoia level: 47 * perf event paranoia level:
54 * -1 - not paranoid at all 48 * -1 - not paranoid at all
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
67 61
68static atomic64_t perf_event_id; 62static atomic64_t perf_event_id;
69 63
70/* 64void __weak perf_event_print_debug(void) { }
71 * Lock for (sysadmin-configurable) event reservations:
72 */
73static DEFINE_SPINLOCK(perf_resource_lock);
74 65
75/* 66extern __weak const char *perf_pmu_name(void)
76 * Architecture provided APIs - weak aliases:
77 */
78extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
79{ 67{
80 return NULL; 68 return "pmu";
81} 69}
82 70
83void __weak hw_perf_disable(void) { barrier(); } 71void perf_pmu_disable(struct pmu *pmu)
84void __weak hw_perf_enable(void) { barrier(); } 72{
85 73 int *count = this_cpu_ptr(pmu->pmu_disable_count);
86void __weak perf_event_print_debug(void) { } 74 if (!(*count)++)
87 75 pmu->pmu_disable(pmu);
88static DEFINE_PER_CPU(int, perf_disable_count); 76}
89 77
90void perf_disable(void) 78void perf_pmu_enable(struct pmu *pmu)
91{ 79{
92 if (!__get_cpu_var(perf_disable_count)++) 80 int *count = this_cpu_ptr(pmu->pmu_disable_count);
93 hw_perf_disable(); 81 if (!--(*count))
82 pmu->pmu_enable(pmu);
94} 83}
95 84
96void perf_enable(void) 85static DEFINE_PER_CPU(struct list_head, rotation_list);
86
87/*
88 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
89 * because they're strictly cpu affine and rotate_start is called with IRQs
90 * disabled, while rotate_context is called from IRQ context.
91 */
92static void perf_pmu_rotate_start(struct pmu *pmu)
97{ 93{
98 if (!--__get_cpu_var(perf_disable_count)) 94 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
99 hw_perf_enable(); 95 struct list_head *head = &__get_cpu_var(rotation_list);
96
97 WARN_ON(!irqs_disabled());
98
99 if (list_empty(&cpuctx->rotation_list))
100 list_add(&cpuctx->rotation_list, head);
100} 101}
101 102
102static void get_ctx(struct perf_event_context *ctx) 103static void get_ctx(struct perf_event_context *ctx)
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event)
151 * the context could get moved to another task. 152 * the context could get moved to another task.
152 */ 153 */
153static struct perf_event_context * 154static struct perf_event_context *
154perf_lock_task_context(struct task_struct *task, unsigned long *flags) 155perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
155{ 156{
156 struct perf_event_context *ctx; 157 struct perf_event_context *ctx;
157 158
158 rcu_read_lock(); 159 rcu_read_lock();
159 retry: 160retry:
160 ctx = rcu_dereference(task->perf_event_ctxp); 161 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
161 if (ctx) { 162 if (ctx) {
162 /* 163 /*
163 * If this context is a clone of another, it might 164 * If this context is a clone of another, it might
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
170 * can't get swapped on us any more. 171 * can't get swapped on us any more.
171 */ 172 */
172 raw_spin_lock_irqsave(&ctx->lock, *flags); 173 raw_spin_lock_irqsave(&ctx->lock, *flags);
173 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 174 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
174 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 175 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
175 goto retry; 176 goto retry;
176 } 177 }
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189 * can't get swapped to another task. This also increments its 190 * can't get swapped to another task. This also increments its
190 * reference count so that the context can't get freed. 191 * reference count so that the context can't get freed.
191 */ 192 */
192static struct perf_event_context *perf_pin_task_context(struct task_struct *task) 193static struct perf_event_context *
194perf_pin_task_context(struct task_struct *task, int ctxn)
193{ 195{
194 struct perf_event_context *ctx; 196 struct perf_event_context *ctx;
195 unsigned long flags; 197 unsigned long flags;
196 198
197 ctx = perf_lock_task_context(task, &flags); 199 ctx = perf_lock_task_context(task, ctxn, &flags);
198 if (ctx) { 200 if (ctx) {
199 ++ctx->pin_count; 201 ++ctx->pin_count;
200 raw_spin_unlock_irqrestore(&ctx->lock, flags); 202 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -214,7 +216,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
214 216
215static inline u64 perf_clock(void) 217static inline u64 perf_clock(void)
216{ 218{
217 return cpu_clock(raw_smp_processor_id()); 219 return local_clock();
218} 220}
219 221
220/* 222/*
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
302 } 304 }
303 305
304 list_add_rcu(&event->event_entry, &ctx->event_list); 306 list_add_rcu(&event->event_entry, &ctx->event_list);
307 if (!ctx->nr_events)
308 perf_pmu_rotate_start(ctx->pmu);
305 ctx->nr_events++; 309 ctx->nr_events++;
306 if (event->attr.inherit_stat) 310 if (event->attr.inherit_stat)
307 ctx->nr_stat++; 311 ctx->nr_stat++;
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
311{ 315{
312 struct perf_event *group_leader = event->group_leader; 316 struct perf_event *group_leader = event->group_leader;
313 317
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); 318 /*
319 * We can have double attach due to group movement in perf_event_open.
320 */
321 if (event->attach_state & PERF_ATTACH_GROUP)
322 return;
323
315 event->attach_state |= PERF_ATTACH_GROUP; 324 event->attach_state |= PERF_ATTACH_GROUP;
316 325
317 if (group_leader == event) 326 if (group_leader == event)
@@ -402,11 +411,31 @@ static void perf_group_detach(struct perf_event *event)
402 } 411 }
403} 412}
404 413
414static inline int
415event_filter_match(struct perf_event *event)
416{
417 return event->cpu == -1 || event->cpu == smp_processor_id();
418}
419
405static void 420static void
406event_sched_out(struct perf_event *event, 421event_sched_out(struct perf_event *event,
407 struct perf_cpu_context *cpuctx, 422 struct perf_cpu_context *cpuctx,
408 struct perf_event_context *ctx) 423 struct perf_event_context *ctx)
409{ 424{
425 u64 delta;
426 /*
427 * An event which could not be activated because of
428 * filter mismatch still needs to have its timings
429 * maintained, otherwise bogus information is returned
430 * via read() for time_enabled, time_running:
431 */
432 if (event->state == PERF_EVENT_STATE_INACTIVE
433 && !event_filter_match(event)) {
434 delta = ctx->time - event->tstamp_stopped;
435 event->tstamp_running += delta;
436 event->tstamp_stopped = ctx->time;
437 }
438
410 if (event->state != PERF_EVENT_STATE_ACTIVE) 439 if (event->state != PERF_EVENT_STATE_ACTIVE)
411 return; 440 return;
412 441
@@ -416,7 +445,7 @@ event_sched_out(struct perf_event *event,
416 event->state = PERF_EVENT_STATE_OFF; 445 event->state = PERF_EVENT_STATE_OFF;
417 } 446 }
418 event->tstamp_stopped = ctx->time; 447 event->tstamp_stopped = ctx->time;
419 event->pmu->disable(event); 448 event->pmu->del(event, 0);
420 event->oncpu = -1; 449 event->oncpu = -1;
421 450
422 if (!is_software_event(event)) 451 if (!is_software_event(event))
@@ -432,9 +461,7 @@ group_sched_out(struct perf_event *group_event,
432 struct perf_event_context *ctx) 461 struct perf_event_context *ctx)
433{ 462{
434 struct perf_event *event; 463 struct perf_event *event;
435 464 int state = group_event->state;
436 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
437 return;
438 465
439 event_sched_out(group_event, cpuctx, ctx); 466 event_sched_out(group_event, cpuctx, ctx);
440 467
@@ -444,10 +471,16 @@ group_sched_out(struct perf_event *group_event,
444 list_for_each_entry(event, &group_event->sibling_list, group_entry) 471 list_for_each_entry(event, &group_event->sibling_list, group_entry)
445 event_sched_out(event, cpuctx, ctx); 472 event_sched_out(event, cpuctx, ctx);
446 473
447 if (group_event->attr.exclusive) 474 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
448 cpuctx->exclusive = 0; 475 cpuctx->exclusive = 0;
449} 476}
450 477
478static inline struct perf_cpu_context *
479__get_cpu_context(struct perf_event_context *ctx)
480{
481 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
482}
483
451/* 484/*
452 * Cross CPU call to remove a performance event 485 * Cross CPU call to remove a performance event
453 * 486 *
@@ -456,9 +489,9 @@ group_sched_out(struct perf_event *group_event,
456 */ 489 */
457static void __perf_event_remove_from_context(void *info) 490static void __perf_event_remove_from_context(void *info)
458{ 491{
459 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
460 struct perf_event *event = info; 492 struct perf_event *event = info;
461 struct perf_event_context *ctx = event->ctx; 493 struct perf_event_context *ctx = event->ctx;
494 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
462 495
463 /* 496 /*
464 * If this is a task context, we need to check whether it is 497 * If this is a task context, we need to check whether it is
@@ -469,27 +502,11 @@ static void __perf_event_remove_from_context(void *info)
469 return; 502 return;
470 503
471 raw_spin_lock(&ctx->lock); 504 raw_spin_lock(&ctx->lock);
472 /*
473 * Protect the list operation against NMI by disabling the
474 * events on a global level.
475 */
476 perf_disable();
477 505
478 event_sched_out(event, cpuctx, ctx); 506 event_sched_out(event, cpuctx, ctx);
479 507
480 list_del_event(event, ctx); 508 list_del_event(event, ctx);
481 509
482 if (!ctx->task) {
483 /*
484 * Allow more per task events with respect to the
485 * reservation:
486 */
487 cpuctx->max_pertask =
488 min(perf_max_events - ctx->nr_events,
489 perf_max_events - perf_reserved_percpu);
490 }
491
492 perf_enable();
493 raw_spin_unlock(&ctx->lock); 510 raw_spin_unlock(&ctx->lock);
494} 511}
495 512
@@ -554,8 +571,8 @@ retry:
554static void __perf_event_disable(void *info) 571static void __perf_event_disable(void *info)
555{ 572{
556 struct perf_event *event = info; 573 struct perf_event *event = info;
557 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
558 struct perf_event_context *ctx = event->ctx; 574 struct perf_event_context *ctx = event->ctx;
575 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
559 576
560 /* 577 /*
561 * If this is a per-task event, need to check whether this 578 * If this is a per-task event, need to check whether this
@@ -610,7 +627,7 @@ void perf_event_disable(struct perf_event *event)
610 return; 627 return;
611 } 628 }
612 629
613 retry: 630retry:
614 task_oncpu_function_call(task, __perf_event_disable, event); 631 task_oncpu_function_call(task, __perf_event_disable, event);
615 632
616 raw_spin_lock_irq(&ctx->lock); 633 raw_spin_lock_irq(&ctx->lock);
@@ -649,7 +666,7 @@ event_sched_in(struct perf_event *event,
649 */ 666 */
650 smp_wmb(); 667 smp_wmb();
651 668
652 if (event->pmu->enable(event)) { 669 if (event->pmu->add(event, PERF_EF_START)) {
653 event->state = PERF_EVENT_STATE_INACTIVE; 670 event->state = PERF_EVENT_STATE_INACTIVE;
654 event->oncpu = -1; 671 event->oncpu = -1;
655 return -EAGAIN; 672 return -EAGAIN;
@@ -673,23 +690,17 @@ group_sched_in(struct perf_event *group_event,
673 struct perf_event_context *ctx) 690 struct perf_event_context *ctx)
674{ 691{
675 struct perf_event *event, *partial_group = NULL; 692 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu; 693 struct pmu *pmu = group_event->pmu;
677 bool txn = false; 694 u64 now = ctx->time;
678 int ret; 695 bool simulate = false;
679 696
680 if (group_event->state == PERF_EVENT_STATE_OFF) 697 if (group_event->state == PERF_EVENT_STATE_OFF)
681 return 0; 698 return 0;
682 699
683 /* Check if group transaction availabe */ 700 pmu->start_txn(pmu);
684 if (pmu->start_txn)
685 txn = true;
686
687 if (txn)
688 pmu->start_txn(pmu);
689 701
690 if (event_sched_in(group_event, cpuctx, ctx)) { 702 if (event_sched_in(group_event, cpuctx, ctx)) {
691 if (txn) 703 pmu->cancel_txn(pmu);
692 pmu->cancel_txn(pmu);
693 return -EAGAIN; 704 return -EAGAIN;
694 } 705 }
695 706
@@ -703,29 +714,38 @@ group_sched_in(struct perf_event *group_event,
703 } 714 }
704 } 715 }
705 716
706 if (!txn) 717 if (!pmu->commit_txn(pmu))
707 return 0; 718 return 0;
708 719
709 ret = pmu->commit_txn(pmu);
710 if (!ret) {
711 pmu->cancel_txn(pmu);
712 return 0;
713 }
714
715group_error: 720group_error:
716 /* 721 /*
717 * Groups can be scheduled in as one unit only, so undo any 722 * Groups can be scheduled in as one unit only, so undo any
718 * partial group before returning: 723 * partial group before returning:
724 * The events up to the failed event are scheduled out normally,
725 * tstamp_stopped will be updated.
726 *
727 * The failed events and the remaining siblings need to have
728 * their timings updated as if they had gone thru event_sched_in()
729 * and event_sched_out(). This is required to get consistent timings
730 * across the group. This also takes care of the case where the group
731 * could never be scheduled by ensuring tstamp_stopped is set to mark
732 * the time the event was actually stopped, such that time delta
733 * calculation in update_event_times() is correct.
719 */ 734 */
720 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 735 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
721 if (event == partial_group) 736 if (event == partial_group)
722 break; 737 simulate = true;
723 event_sched_out(event, cpuctx, ctx); 738
739 if (simulate) {
740 event->tstamp_running += now - event->tstamp_stopped;
741 event->tstamp_stopped = now;
742 } else {
743 event_sched_out(event, cpuctx, ctx);
744 }
724 } 745 }
725 event_sched_out(group_event, cpuctx, ctx); 746 event_sched_out(group_event, cpuctx, ctx);
726 747
727 if (txn) 748 pmu->cancel_txn(pmu);
728 pmu->cancel_txn(pmu);
729 749
730 return -EAGAIN; 750 return -EAGAIN;
731} 751}
@@ -778,10 +798,10 @@ static void add_event_to_ctx(struct perf_event *event,
778 */ 798 */
779static void __perf_install_in_context(void *info) 799static void __perf_install_in_context(void *info)
780{ 800{
781 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
782 struct perf_event *event = info; 801 struct perf_event *event = info;
783 struct perf_event_context *ctx = event->ctx; 802 struct perf_event_context *ctx = event->ctx;
784 struct perf_event *leader = event->group_leader; 803 struct perf_event *leader = event->group_leader;
804 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
785 int err; 805 int err;
786 806
787 /* 807 /*
@@ -801,12 +821,6 @@ static void __perf_install_in_context(void *info)
801 ctx->is_active = 1; 821 ctx->is_active = 1;
802 update_context_time(ctx); 822 update_context_time(ctx);
803 823
804 /*
805 * Protect the list operation against NMI by disabling the
806 * events on a global level. NOP for non NMI based events.
807 */
808 perf_disable();
809
810 add_event_to_ctx(event, ctx); 824 add_event_to_ctx(event, ctx);
811 825
812 if (event->cpu != -1 && event->cpu != smp_processor_id()) 826 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -844,12 +858,7 @@ static void __perf_install_in_context(void *info)
844 } 858 }
845 } 859 }
846 860
847 if (!err && !ctx->task && cpuctx->max_pertask) 861unlock:
848 cpuctx->max_pertask--;
849
850 unlock:
851 perf_enable();
852
853 raw_spin_unlock(&ctx->lock); 862 raw_spin_unlock(&ctx->lock);
854} 863}
855 864
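Throughout this patch, &__get_cpu_var(perf_cpu_context) becomes __get_cpu_context(ctx): with one struct perf_cpu_context per pmu instead of a single global per-cpu instance, the cpu context has to be reached through the context's pmu. The helper is not part of this hunk; it presumably amounts to:

static struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
        /* each pmu carries its own per-cpu contexts; pick this cpu's */
        return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

Note also that perf_install_in_context() now sets event->ctx = ctx itself, which matches the "if (event->ctx)" guard added to free_event() further down: the context is attached at install time rather than at allocation time.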
@@ -872,6 +881,8 @@ perf_install_in_context(struct perf_event_context *ctx,
872{ 881{
873 struct task_struct *task = ctx->task; 882 struct task_struct *task = ctx->task;
874 883
884 event->ctx = ctx;
885
875 if (!task) { 886 if (!task) {
876 /* 887 /*
877 * Per cpu events are installed via an smp call and 888 * Per cpu events are installed via an smp call and
@@ -920,10 +931,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
920 931
921 event->state = PERF_EVENT_STATE_INACTIVE; 932 event->state = PERF_EVENT_STATE_INACTIVE;
922 event->tstamp_enabled = ctx->time - event->total_time_enabled; 933 event->tstamp_enabled = ctx->time - event->total_time_enabled;
923 list_for_each_entry(sub, &event->sibling_list, group_entry) 934 list_for_each_entry(sub, &event->sibling_list, group_entry) {
924 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 935 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
925 sub->tstamp_enabled = 936 sub->tstamp_enabled =
926 ctx->time - sub->total_time_enabled; 937 ctx->time - sub->total_time_enabled;
938 }
939 }
927} 940}
928 941
929/* 942/*
@@ -932,9 +945,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
932static void __perf_event_enable(void *info) 945static void __perf_event_enable(void *info)
933{ 946{
934 struct perf_event *event = info; 947 struct perf_event *event = info;
935 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
936 struct perf_event_context *ctx = event->ctx; 948 struct perf_event_context *ctx = event->ctx;
937 struct perf_event *leader = event->group_leader; 949 struct perf_event *leader = event->group_leader;
950 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
938 int err; 951 int err;
939 952
940 /* 953 /*
@@ -968,12 +981,10 @@ static void __perf_event_enable(void *info)
968 if (!group_can_go_on(event, cpuctx, 1)) { 981 if (!group_can_go_on(event, cpuctx, 1)) {
969 err = -EEXIST; 982 err = -EEXIST;
970 } else { 983 } else {
971 perf_disable();
972 if (event == leader) 984 if (event == leader)
973 err = group_sched_in(event, cpuctx, ctx); 985 err = group_sched_in(event, cpuctx, ctx);
974 else 986 else
975 err = event_sched_in(event, cpuctx, ctx); 987 err = event_sched_in(event, cpuctx, ctx);
976 perf_enable();
977 } 988 }
978 989
979 if (err) { 990 if (err) {
@@ -989,7 +1000,7 @@ static void __perf_event_enable(void *info)
989 } 1000 }
990 } 1001 }
991 1002
992 unlock: 1003unlock:
993 raw_spin_unlock(&ctx->lock); 1004 raw_spin_unlock(&ctx->lock);
994} 1005}
995 1006
@@ -1030,7 +1041,7 @@ void perf_event_enable(struct perf_event *event)
1030 if (event->state == PERF_EVENT_STATE_ERROR) 1041 if (event->state == PERF_EVENT_STATE_ERROR)
1031 event->state = PERF_EVENT_STATE_OFF; 1042 event->state = PERF_EVENT_STATE_OFF;
1032 1043
1033 retry: 1044retry:
1034 raw_spin_unlock_irq(&ctx->lock); 1045 raw_spin_unlock_irq(&ctx->lock);
1035 task_oncpu_function_call(task, __perf_event_enable, event); 1046 task_oncpu_function_call(task, __perf_event_enable, event);
1036 1047
@@ -1050,7 +1061,7 @@ void perf_event_enable(struct perf_event *event)
1050 if (event->state == PERF_EVENT_STATE_OFF) 1061 if (event->state == PERF_EVENT_STATE_OFF)
1051 __perf_event_mark_enabled(event, ctx); 1062 __perf_event_mark_enabled(event, ctx);
1052 1063
1053 out: 1064out:
1054 raw_spin_unlock_irq(&ctx->lock); 1065 raw_spin_unlock_irq(&ctx->lock);
1055} 1066}
1056 1067
@@ -1081,26 +1092,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1081 struct perf_event *event; 1092 struct perf_event *event;
1082 1093
1083 raw_spin_lock(&ctx->lock); 1094 raw_spin_lock(&ctx->lock);
1095 perf_pmu_disable(ctx->pmu);
1084 ctx->is_active = 0; 1096 ctx->is_active = 0;
1085 if (likely(!ctx->nr_events)) 1097 if (likely(!ctx->nr_events))
1086 goto out; 1098 goto out;
1087 update_context_time(ctx); 1099 update_context_time(ctx);
1088 1100
1089 perf_disable();
1090 if (!ctx->nr_active) 1101 if (!ctx->nr_active)
1091 goto out_enable; 1102 goto out;
1092 1103
1093 if (event_type & EVENT_PINNED) 1104 if (event_type & EVENT_PINNED) {
1094 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1105 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1095 group_sched_out(event, cpuctx, ctx); 1106 group_sched_out(event, cpuctx, ctx);
1107 }
1096 1108
1097 if (event_type & EVENT_FLEXIBLE) 1109 if (event_type & EVENT_FLEXIBLE) {
1098 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1110 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1099 group_sched_out(event, cpuctx, ctx); 1111 group_sched_out(event, cpuctx, ctx);
1100 1112 }
1101 out_enable: 1113out:
1102 perf_enable(); 1114 perf_pmu_enable(ctx->pmu);
1103 out:
1104 raw_spin_unlock(&ctx->lock); 1115 raw_spin_unlock(&ctx->lock);
1105} 1116}
1106 1117
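perf_disable()/perf_enable() used to freeze every pmu in the system; ctx_sched_out() now only quiesces the pmu that owns this context. The perf_pmu_disable()/perf_pmu_enable() pair is introduced elsewhere in the series; a sketch consistent with its use here is a per-cpu nesting count wrapped around the pmu's own hooks (pmu_disable_count is assumed to be a per-cpu int allocated when the pmu registers):

void perf_pmu_disable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);

        if (!(*count)++)                /* only the outermost caller touches the hardware */
                pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);

        if (!--(*count))
                pmu->pmu_enable(pmu);
}

Because the disable/enable pair now brackets the whole function, the separate out_enable: label collapses into the common out: exit path.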
@@ -1155,9 +1166,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
1155 * In order to keep per-task stats reliable we need to flip the event 1166 * In order to keep per-task stats reliable we need to flip the event
1156 * values when we flip the contexts. 1167 * values when we flip the contexts.
1157 */ 1168 */
1158 value = atomic64_read(&next_event->count); 1169 value = local64_read(&next_event->count);
1159 value = atomic64_xchg(&event->count, value); 1170 value = local64_xchg(&event->count, value);
1160 atomic64_set(&next_event->count, value); 1171 local64_set(&next_event->count, value);
1161 1172
1162 swap(event->total_time_enabled, next_event->total_time_enabled); 1173 swap(event->total_time_enabled, next_event->total_time_enabled);
1163 swap(event->total_time_running, next_event->total_time_running); 1174 swap(event->total_time_running, next_event->total_time_running);
@@ -1198,34 +1209,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1198 } 1209 }
1199} 1210}
1200 1211
1201/* 1212void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1202 * Called from scheduler to remove the events of the current task, 1213 struct task_struct *next)
1203 * with interrupts disabled.
1204 *
1205 * We stop each event and update the event value in event->count.
1206 *
1207 * This does not protect us against NMI, but disable()
1208 * sets the disabled bit in the control field of event _before_
1209 * accessing the event control register. If a NMI hits, then it will
1210 * not restart the event.
1211 */
1212void perf_event_task_sched_out(struct task_struct *task,
1213 struct task_struct *next)
1214{ 1214{
1215 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1215 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1216 struct perf_event_context *ctx = task->perf_event_ctxp;
1217 struct perf_event_context *next_ctx; 1216 struct perf_event_context *next_ctx;
1218 struct perf_event_context *parent; 1217 struct perf_event_context *parent;
1218 struct perf_cpu_context *cpuctx;
1219 int do_switch = 1; 1219 int do_switch = 1;
1220 1220
1221 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1221 if (likely(!ctx))
1222 return;
1222 1223
1223 if (likely(!ctx || !cpuctx->task_ctx)) 1224 cpuctx = __get_cpu_context(ctx);
1225 if (!cpuctx->task_ctx)
1224 return; 1226 return;
1225 1227
1226 rcu_read_lock(); 1228 rcu_read_lock();
1227 parent = rcu_dereference(ctx->parent_ctx); 1229 parent = rcu_dereference(ctx->parent_ctx);
1228 next_ctx = next->perf_event_ctxp; 1230 next_ctx = next->perf_event_ctxp[ctxn];
1229 if (parent && next_ctx && 1231 if (parent && next_ctx &&
1230 rcu_dereference(next_ctx->parent_ctx) == parent) { 1232 rcu_dereference(next_ctx->parent_ctx) == parent) {
1231 /* 1233 /*
@@ -1244,8 +1246,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1244 * XXX do we need a memory barrier of sorts 1246 * XXX do we need a memory barrier of sorts
1245 * wrt to rcu_dereference() of perf_event_ctxp 1247 * wrt to rcu_dereference() of perf_event_ctxp
1246 */ 1248 */
1247 task->perf_event_ctxp = next_ctx; 1249 task->perf_event_ctxp[ctxn] = next_ctx;
1248 next->perf_event_ctxp = ctx; 1250 next->perf_event_ctxp[ctxn] = ctx;
1249 ctx->task = next; 1251 ctx->task = next;
1250 next_ctx->task = task; 1252 next_ctx->task = task;
1251 do_switch = 0; 1253 do_switch = 0;
@@ -1263,10 +1265,35 @@ void perf_event_task_sched_out(struct task_struct *task,
1263 } 1265 }
1264} 1266}
1265 1267
1268#define for_each_task_context_nr(ctxn) \
1269 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1270
1271/*
1272 * Called from scheduler to remove the events of the current task,
1273 * with interrupts disabled.
1274 *
1275 * We stop each event and update the event value in event->count.
1276 *
1277 * This does not protect us against NMI, but disable()
1278 * sets the disabled bit in the control field of event _before_
1279 * accessing the event control register. If a NMI hits, then it will
1280 * not restart the event.
1281 */
1282void __perf_event_task_sched_out(struct task_struct *task,
1283 struct task_struct *next)
1284{
1285 int ctxn;
1286
1287 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1288
1289 for_each_task_context_nr(ctxn)
1290 perf_event_context_sched_out(task, ctxn, next);
1291}
1292
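The scheduler hook is split in two: perf_event_context_sched_out() handles one context, and __perf_event_task_sched_out() walks every context number a task can have. This relies on task_struct carrying an array of context pointers indexed by context class rather than the old single perf_event_ctxp pointer; the companion header change is assumed to look roughly like:

/* Which task context a pmu's events live in; sketch of the header side. */
enum perf_event_task_context {
        perf_invalid_context = -1,      /* pmu has no per-task context */
        perf_hw_context = 0,            /* hardware pmus share one task context */
        perf_sw_context,                /* software/tracepoint events share another */
        perf_nr_task_contexts,
};

/* in struct task_struct, replacing the old single pointer: */
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];

The PERF_COUNT_SW_CONTEXT_SWITCHES software event stays in the outer function, so it still fires exactly once per context switch.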
1266static void task_ctx_sched_out(struct perf_event_context *ctx, 1293static void task_ctx_sched_out(struct perf_event_context *ctx,
1267 enum event_type_t event_type) 1294 enum event_type_t event_type)
1268{ 1295{
1269 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1296 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1270 1297
1271 if (!cpuctx->task_ctx) 1298 if (!cpuctx->task_ctx)
1272 return; 1299 return;
@@ -1281,14 +1308,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1281/* 1308/*
1282 * Called with IRQs disabled 1309 * Called with IRQs disabled
1283 */ 1310 */
1284static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1285{
1286 task_ctx_sched_out(ctx, EVENT_ALL);
1287}
1288
1289/*
1290 * Called with IRQs disabled
1291 */
1292static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 1311static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1293 enum event_type_t event_type) 1312 enum event_type_t event_type)
1294{ 1313{
@@ -1339,9 +1358,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1339 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1358 if (event->cpu != -1 && event->cpu != smp_processor_id())
1340 continue; 1359 continue;
1341 1360
1342 if (group_can_go_on(event, cpuctx, can_add_hw)) 1361 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1343 if (group_sched_in(event, cpuctx, ctx)) 1362 if (group_sched_in(event, cpuctx, ctx))
1344 can_add_hw = 0; 1363 can_add_hw = 0;
1364 }
1345 } 1365 }
1346} 1366}
1347 1367
@@ -1357,8 +1377,6 @@ ctx_sched_in(struct perf_event_context *ctx,
1357 1377
1358 ctx->timestamp = perf_clock(); 1378 ctx->timestamp = perf_clock();
1359 1379
1360 perf_disable();
1361
1362 /* 1380 /*
1363 * First go through the list and put on any pinned groups 1381 * First go through the list and put on any pinned groups
1364 * in order to give them the best chance of going on. 1382 * in order to give them the best chance of going on.
@@ -1370,8 +1388,7 @@ ctx_sched_in(struct perf_event_context *ctx,
1370 if (event_type & EVENT_FLEXIBLE) 1388 if (event_type & EVENT_FLEXIBLE)
1371 ctx_flexible_sched_in(ctx, cpuctx); 1389 ctx_flexible_sched_in(ctx, cpuctx);
1372 1390
1373 perf_enable(); 1391out:
1374 out:
1375 raw_spin_unlock(&ctx->lock); 1392 raw_spin_unlock(&ctx->lock);
1376} 1393}
1377 1394
@@ -1383,43 +1400,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1383 ctx_sched_in(ctx, cpuctx, event_type); 1400 ctx_sched_in(ctx, cpuctx, event_type);
1384} 1401}
1385 1402
1386static void task_ctx_sched_in(struct task_struct *task, 1403static void task_ctx_sched_in(struct perf_event_context *ctx,
1387 enum event_type_t event_type) 1404 enum event_type_t event_type)
1388{ 1405{
1389 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1406 struct perf_cpu_context *cpuctx;
1390 struct perf_event_context *ctx = task->perf_event_ctxp;
1391 1407
1392 if (likely(!ctx)) 1408 cpuctx = __get_cpu_context(ctx);
1393 return;
1394 if (cpuctx->task_ctx == ctx) 1409 if (cpuctx->task_ctx == ctx)
1395 return; 1410 return;
1411
1396 ctx_sched_in(ctx, cpuctx, event_type); 1412 ctx_sched_in(ctx, cpuctx, event_type);
1397 cpuctx->task_ctx = ctx; 1413 cpuctx->task_ctx = ctx;
1398} 1414}
1399/*
1400 * Called from scheduler to add the events of the current task
1401 * with interrupts disabled.
1402 *
1403 * We restore the event value and then enable it.
1404 *
1405 * This does not protect us against NMI, but enable()
1406 * sets the enabled bit in the control field of event _before_
1407 * accessing the event control register. If a NMI hits, then it will
1408 * keep the event running.
1409 */
1410void perf_event_task_sched_in(struct task_struct *task)
1411{
1412 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1413 struct perf_event_context *ctx = task->perf_event_ctxp;
1414 1415
1415 if (likely(!ctx)) 1416void perf_event_context_sched_in(struct perf_event_context *ctx)
1416 return; 1417{
1418 struct perf_cpu_context *cpuctx;
1417 1419
1420 cpuctx = __get_cpu_context(ctx);
1418 if (cpuctx->task_ctx == ctx) 1421 if (cpuctx->task_ctx == ctx)
1419 return; 1422 return;
1420 1423
1421 perf_disable(); 1424 perf_pmu_disable(ctx->pmu);
1422
1423 /* 1425 /*
1424 * We want to keep the following priority order: 1426 * We want to keep the following priority order:
1425 * cpu pinned (that don't need to move), task pinned, 1427 * cpu pinned (that don't need to move), task pinned,
@@ -1433,7 +1435,37 @@ void perf_event_task_sched_in(struct task_struct *task)
1433 1435
1434 cpuctx->task_ctx = ctx; 1436 cpuctx->task_ctx = ctx;
1435 1437
1436 perf_enable(); 1438 /*
1439 * Since these rotations are per-cpu, we need to ensure the
1440 * cpu-context we got scheduled on is actually rotating.
1441 */
1442 perf_pmu_rotate_start(ctx->pmu);
1443 perf_pmu_enable(ctx->pmu);
1444}
1445
1446/*
1447 * Called from scheduler to add the events of the current task
1448 * with interrupts disabled.
1449 *
1450 * We restore the event value and then enable it.
1451 *
1452 * This does not protect us against NMI, but enable()
1453 * sets the enabled bit in the control field of event _before_
1454 * accessing the event control register. If a NMI hits, then it will
1455 * keep the event running.
1456 */
1457void __perf_event_task_sched_in(struct task_struct *task)
1458{
1459 struct perf_event_context *ctx;
1460 int ctxn;
1461
1462 for_each_task_context_nr(ctxn) {
1463 ctx = task->perf_event_ctxp[ctxn];
1464 if (likely(!ctx))
1465 continue;
1466
1467 perf_event_context_sched_in(ctx);
1468 }
1437} 1469}
1438 1470
1439#define MAX_INTERRUPTS (~0ULL) 1471#define MAX_INTERRUPTS (~0ULL)
@@ -1513,22 +1545,6 @@ do { \
1513 return div64_u64(dividend, divisor); 1545 return div64_u64(dividend, divisor);
1514} 1546}
1515 1547
1516static void perf_event_stop(struct perf_event *event)
1517{
1518 if (!event->pmu->stop)
1519 return event->pmu->disable(event);
1520
1521 return event->pmu->stop(event);
1522}
1523
1524static int perf_event_start(struct perf_event *event)
1525{
1526 if (!event->pmu->start)
1527 return event->pmu->enable(event);
1528
1529 return event->pmu->start(event);
1530}
1531
1532static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1548static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1533{ 1549{
1534 struct hw_perf_event *hwc = &event->hw; 1550 struct hw_perf_event *hwc = &event->hw;
@@ -1547,16 +1563,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1547 1563
1548 hwc->sample_period = sample_period; 1564 hwc->sample_period = sample_period;
1549 1565
1550 if (atomic64_read(&hwc->period_left) > 8*sample_period) { 1566 if (local64_read(&hwc->period_left) > 8*sample_period) {
1551 perf_disable(); 1567 event->pmu->stop(event, PERF_EF_UPDATE);
1552 perf_event_stop(event); 1568 local64_set(&hwc->period_left, 0);
1553 atomic64_set(&hwc->period_left, 0); 1569 event->pmu->start(event, PERF_EF_RELOAD);
1554 perf_event_start(event);
1555 perf_enable();
1556 } 1570 }
1557} 1571}
1558 1572
1559static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1573static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1560{ 1574{
1561 struct perf_event *event; 1575 struct perf_event *event;
1562 struct hw_perf_event *hwc; 1576 struct hw_perf_event *hwc;
@@ -1581,23 +1595,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1581 */ 1595 */
1582 if (interrupts == MAX_INTERRUPTS) { 1596 if (interrupts == MAX_INTERRUPTS) {
1583 perf_log_throttle(event, 1); 1597 perf_log_throttle(event, 1);
1584 perf_disable(); 1598 event->pmu->start(event, 0);
1585 event->pmu->unthrottle(event);
1586 perf_enable();
1587 } 1599 }
1588 1600
1589 if (!event->attr.freq || !event->attr.sample_freq) 1601 if (!event->attr.freq || !event->attr.sample_freq)
1590 continue; 1602 continue;
1591 1603
1592 perf_disable();
1593 event->pmu->read(event); 1604 event->pmu->read(event);
1594 now = atomic64_read(&event->count); 1605 now = local64_read(&event->count);
1595 delta = now - hwc->freq_count_stamp; 1606 delta = now - hwc->freq_count_stamp;
1596 hwc->freq_count_stamp = now; 1607 hwc->freq_count_stamp = now;
1597 1608
1598 if (delta > 0) 1609 if (delta > 0)
1599 perf_adjust_period(event, TICK_NSEC, delta); 1610 perf_adjust_period(event, period, delta);
1600 perf_enable();
1601 } 1611 }
1602 raw_spin_unlock(&ctx->lock); 1612 raw_spin_unlock(&ctx->lock);
1603} 1613}
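perf_adjust_period() and the unthrottle path now use the pmu's ->stop()/->start() callbacks directly; the perf_event_stop()/perf_event_start() wrappers and the ->unthrottle() method are gone. The PERF_EF_* values are flags for those callbacks, defined in the companion header change; the exact values below are an assumption:

/* Flags for pmu->start()/->stop() (and ->add()/->del()); sketch. */
#define PERF_EF_START   0x01    /* ->add(): start the event right away */
#define PERF_EF_RELOAD  0x02    /* ->start(): reload the sample period */
#define PERF_EF_UPDATE  0x04    /* ->stop(): fold the hw count into event->count first */

So stop(event, PERF_EF_UPDATE) + local64_set(&hwc->period_left, 0) + start(event, PERF_EF_RELOAD) reprograms the counter with the new period, while start(event, 0) is the unthrottle case: resume without touching the saved state.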
@@ -1615,32 +1625,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
1615 raw_spin_unlock(&ctx->lock); 1625 raw_spin_unlock(&ctx->lock);
1616} 1626}
1617 1627
1618void perf_event_task_tick(struct task_struct *curr) 1628/*
1629 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
1630 * because they're strictly cpu affine and rotate_start is called with IRQs
1631 * disabled, while rotate_context is called from IRQ context.
1632 */
1633static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1619{ 1634{
1620 struct perf_cpu_context *cpuctx; 1635 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
1621 struct perf_event_context *ctx; 1636 struct perf_event_context *ctx = NULL;
1622 int rotate = 0; 1637 int rotate = 0, remove = 1;
1623 1638
1624 if (!atomic_read(&nr_events)) 1639 if (cpuctx->ctx.nr_events) {
1625 return; 1640 remove = 0;
1626 1641 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1627 cpuctx = &__get_cpu_var(perf_cpu_context); 1642 rotate = 1;
1628 if (cpuctx->ctx.nr_events && 1643 }
1629 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1630 rotate = 1;
1631 1644
1632 ctx = curr->perf_event_ctxp; 1645 ctx = cpuctx->task_ctx;
1633 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) 1646 if (ctx && ctx->nr_events) {
1634 rotate = 1; 1647 remove = 0;
1648 if (ctx->nr_events != ctx->nr_active)
1649 rotate = 1;
1650 }
1635 1651
1636 perf_ctx_adjust_freq(&cpuctx->ctx); 1652 perf_pmu_disable(cpuctx->ctx.pmu);
1653 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1637 if (ctx) 1654 if (ctx)
1638 perf_ctx_adjust_freq(ctx); 1655 perf_ctx_adjust_freq(ctx, interval);
1639 1656
1640 if (!rotate) 1657 if (!rotate)
1641 return; 1658 goto done;
1642 1659
1643 perf_disable();
1644 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1660 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1645 if (ctx) 1661 if (ctx)
1646 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1662 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1651,8 +1667,27 @@ void perf_event_task_tick(struct task_struct *curr)
1651 1667
1652 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1668 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1653 if (ctx) 1669 if (ctx)
1654 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1670 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1655 perf_enable(); 1671
1672done:
1673 if (remove)
1674 list_del_init(&cpuctx->rotation_list);
1675
1676 perf_pmu_enable(cpuctx->ctx.pmu);
1677}
1678
1679void perf_event_task_tick(void)
1680{
1681 struct list_head *head = &__get_cpu_var(rotation_list);
1682 struct perf_cpu_context *cpuctx, *tmp;
1683
1684 WARN_ON(!irqs_disabled());
1685
1686 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
1687 if (cpuctx->jiffies_interval == 1 ||
1688 !(jiffies % cpuctx->jiffies_interval))
1689 perf_rotate_context(cpuctx);
1690 }
1656} 1691}
1657 1692
1658static int event_enable_on_exec(struct perf_event *event, 1693static int event_enable_on_exec(struct perf_event *event,
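Rotation no longer hangs off the current task: perf_event_task_tick() walks a per-cpu rotation_list of cpu contexts, each rotating at its own jiffies_interval, and perf_rotate_context() takes a context off the list ("remove") once it has no events left, so idle pmus stop being visited on every tick. For that list to be populated, perf_pmu_rotate_start(), called from perf_event_context_sched_in() above, has to enqueue the pmu's cpu context on the current cpu; a sketch consistent with that usage:

static void perf_pmu_rotate_start(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
        struct list_head *head = &__get_cpu_var(rotation_list);

        WARN_ON(!irqs_disabled());      /* rotation_list is only touched with IRQs off */

        if (list_empty(&cpuctx->rotation_list))
                list_add(&cpuctx->rotation_list, head);
}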
@@ -1674,20 +1709,18 @@ static int event_enable_on_exec(struct perf_event *event,
1674 * Enable all of a task's events that have been marked enable-on-exec. 1709 * Enable all of a task's events that have been marked enable-on-exec.
1675 * This expects task == current. 1710 * This expects task == current.
1676 */ 1711 */
1677static void perf_event_enable_on_exec(struct task_struct *task) 1712static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1678{ 1713{
1679 struct perf_event_context *ctx;
1680 struct perf_event *event; 1714 struct perf_event *event;
1681 unsigned long flags; 1715 unsigned long flags;
1682 int enabled = 0; 1716 int enabled = 0;
1683 int ret; 1717 int ret;
1684 1718
1685 local_irq_save(flags); 1719 local_irq_save(flags);
1686 ctx = task->perf_event_ctxp;
1687 if (!ctx || !ctx->nr_events) 1720 if (!ctx || !ctx->nr_events)
1688 goto out; 1721 goto out;
1689 1722
1690 __perf_event_task_sched_out(ctx); 1723 task_ctx_sched_out(ctx, EVENT_ALL);
1691 1724
1692 raw_spin_lock(&ctx->lock); 1725 raw_spin_lock(&ctx->lock);
1693 1726
@@ -1711,8 +1744,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1711 1744
1712 raw_spin_unlock(&ctx->lock); 1745 raw_spin_unlock(&ctx->lock);
1713 1746
1714 perf_event_task_sched_in(task); 1747 perf_event_context_sched_in(ctx);
1715 out: 1748out:
1716 local_irq_restore(flags); 1749 local_irq_restore(flags);
1717} 1750}
1718 1751
@@ -1721,9 +1754,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1721 */ 1754 */
1722static void __perf_event_read(void *info) 1755static void __perf_event_read(void *info)
1723{ 1756{
1724 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1725 struct perf_event *event = info; 1757 struct perf_event *event = info;
1726 struct perf_event_context *ctx = event->ctx; 1758 struct perf_event_context *ctx = event->ctx;
1759 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1727 1760
1728 /* 1761 /*
1729 * If this is a task context, we need to check whether it is 1762 * If this is a task context, we need to check whether it is
@@ -1743,6 +1776,11 @@ static void __perf_event_read(void *info)
1743 event->pmu->read(event); 1776 event->pmu->read(event);
1744} 1777}
1745 1778
1779static inline u64 perf_event_count(struct perf_event *event)
1780{
1781 return local64_read(&event->count) + atomic64_read(&event->child_count);
1782}
1783
1746static u64 perf_event_read(struct perf_event *event) 1784static u64 perf_event_read(struct perf_event *event)
1747{ 1785{
1748 /* 1786 /*
@@ -1757,20 +1795,234 @@ static u64 perf_event_read(struct perf_event *event)
1757 unsigned long flags; 1795 unsigned long flags;
1758 1796
1759 raw_spin_lock_irqsave(&ctx->lock, flags); 1797 raw_spin_lock_irqsave(&ctx->lock, flags);
1760 update_context_time(ctx); 1798 /*
1799 * may read while context is not active
1800 * (e.g., thread is blocked), in that case
1801 * we cannot update context time
1802 */
1803 if (ctx->is_active)
1804 update_context_time(ctx);
1761 update_event_times(event); 1805 update_event_times(event);
1762 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1806 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1763 } 1807 }
1764 1808
1765 return atomic64_read(&event->count); 1809 return perf_event_count(event);
1766} 1810}
1767 1811
1768/* 1812/*
1769 * Initialize the perf_event context in a task_struct: 1813 * Callchain support
1770 */ 1814 */
1815
1816struct callchain_cpus_entries {
1817 struct rcu_head rcu_head;
1818 struct perf_callchain_entry *cpu_entries[0];
1819};
1820
1821static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
1822static atomic_t nr_callchain_events;
1823static DEFINE_MUTEX(callchain_mutex);
1824struct callchain_cpus_entries *callchain_cpus_entries;
1825
1826
1827__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
1828 struct pt_regs *regs)
1829{
1830}
1831
1832__weak void perf_callchain_user(struct perf_callchain_entry *entry,
1833 struct pt_regs *regs)
1834{
1835}
1836
1837static void release_callchain_buffers_rcu(struct rcu_head *head)
1838{
1839 struct callchain_cpus_entries *entries;
1840 int cpu;
1841
1842 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
1843
1844 for_each_possible_cpu(cpu)
1845 kfree(entries->cpu_entries[cpu]);
1846
1847 kfree(entries);
1848}
1849
1850static void release_callchain_buffers(void)
1851{
1852 struct callchain_cpus_entries *entries;
1853
1854 entries = callchain_cpus_entries;
1855 rcu_assign_pointer(callchain_cpus_entries, NULL);
1856 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
1857}
1858
1859static int alloc_callchain_buffers(void)
1860{
1861 int cpu;
1862 int size;
1863 struct callchain_cpus_entries *entries;
1864
1865 /*
1866 * We can't use the percpu allocation API for data that can be
1867 * accessed from NMI. Use a temporary manual per cpu allocation
1868 * until that gets sorted out.
1869 */
1870 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
1871 num_possible_cpus();
1872
1873 entries = kzalloc(size, GFP_KERNEL);
1874 if (!entries)
1875 return -ENOMEM;
1876
1877 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
1878
1879 for_each_possible_cpu(cpu) {
1880 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
1881 cpu_to_node(cpu));
1882 if (!entries->cpu_entries[cpu])
1883 goto fail;
1884 }
1885
1886 rcu_assign_pointer(callchain_cpus_entries, entries);
1887
1888 return 0;
1889
1890fail:
1891 for_each_possible_cpu(cpu)
1892 kfree(entries->cpu_entries[cpu]);
1893 kfree(entries);
1894
1895 return -ENOMEM;
1896}
1897
1898static int get_callchain_buffers(void)
1899{
1900 int err = 0;
1901 int count;
1902
1903 mutex_lock(&callchain_mutex);
1904
1905 count = atomic_inc_return(&nr_callchain_events);
1906 if (WARN_ON_ONCE(count < 1)) {
1907 err = -EINVAL;
1908 goto exit;
1909 }
1910
1911 if (count > 1) {
1912 /* If the allocation failed, give up */
1913 if (!callchain_cpus_entries)
1914 err = -ENOMEM;
1915 goto exit;
1916 }
1917
1918 err = alloc_callchain_buffers();
1919 if (err)
1920 release_callchain_buffers();
1921exit:
1922 mutex_unlock(&callchain_mutex);
1923
1924 return err;
1925}
1926
1927static void put_callchain_buffers(void)
1928{
1929 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
1930 release_callchain_buffers();
1931 mutex_unlock(&callchain_mutex);
1932 }
1933}
1934
1935static int get_recursion_context(int *recursion)
1936{
1937 int rctx;
1938
1939 if (in_nmi())
1940 rctx = 3;
1941 else if (in_irq())
1942 rctx = 2;
1943 else if (in_softirq())
1944 rctx = 1;
1945 else
1946 rctx = 0;
1947
1948 if (recursion[rctx])
1949 return -1;
1950
1951 recursion[rctx]++;
1952 barrier();
1953
1954 return rctx;
1955}
1956
1957static inline void put_recursion_context(int *recursion, int rctx)
1958{
1959 barrier();
1960 recursion[rctx]--;
1961}
1962
1963static struct perf_callchain_entry *get_callchain_entry(int *rctx)
1964{
1965 int cpu;
1966 struct callchain_cpus_entries *entries;
1967
1968 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
1969 if (*rctx == -1)
1970 return NULL;
1971
1972 entries = rcu_dereference(callchain_cpus_entries);
1973 if (!entries)
1974 return NULL;
1975
1976 cpu = smp_processor_id();
1977
1978 return &entries->cpu_entries[cpu][*rctx];
1979}
1980
1771static void 1981static void
1772__perf_event_init_context(struct perf_event_context *ctx, 1982put_callchain_entry(int rctx)
1773 struct task_struct *task) 1983{
1984 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
1985}
1986
1987static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1988{
1989 int rctx;
1990 struct perf_callchain_entry *entry;
1991
1992
1993 entry = get_callchain_entry(&rctx);
1994 if (rctx == -1)
1995 return NULL;
1996
1997 if (!entry)
1998 goto exit_put;
1999
2000 entry->nr = 0;
2001
2002 if (!user_mode(regs)) {
2003 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2004 perf_callchain_kernel(entry, regs);
2005 if (current->mm)
2006 regs = task_pt_regs(current);
2007 else
2008 regs = NULL;
2009 }
2010
2011 if (regs) {
2012 perf_callchain_store(entry, PERF_CONTEXT_USER);
2013 perf_callchain_user(entry, regs);
2014 }
2015
2016exit_put:
2017 put_callchain_entry(rctx);
2018
2019 return entry;
2020}
2021
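The callchain code that used to live in each architecture is now generic: an arch only fills in perf_callchain_kernel()/perf_callchain_user() (the __weak stubs above are the fallback), while buffer management and recursion protection are handled here. Two pieces referenced above come from the companion header change; sketches of what they are assumed to be:

/* one recursion slot per context level: task, softirq, hardirq, NMI */
#define PERF_NR_CONTEXTS        4

static inline void
perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
        /* append one frame, bounded by the fixed-size entry */
        if (entry->nr < PERF_MAX_STACK_DEPTH)
                entry->ip[entry->nr++] = ip;
}

get_recursion_context() maps the preemption state to one of the four slots, so an NMI that interrupts an IRQ that interrupted a task gets its own perf_callchain_entry and cannot trample the ones below it. The per-cpu entries are published with rcu_assign_pointer() and freed via call_rcu(), so the last user can tear them down in put_callchain_buffers() without racing against NMI readers.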
2022/*
2023 * Initialize the perf_event context in a task_struct:
2024 */
2025static void __perf_event_init_context(struct perf_event_context *ctx)
1774{ 2026{
1775 raw_spin_lock_init(&ctx->lock); 2027 raw_spin_lock_init(&ctx->lock);
1776 mutex_init(&ctx->mutex); 2028 mutex_init(&ctx->mutex);
@@ -1778,45 +2030,38 @@ __perf_event_init_context(struct perf_event_context *ctx,
1778 INIT_LIST_HEAD(&ctx->flexible_groups); 2030 INIT_LIST_HEAD(&ctx->flexible_groups);
1779 INIT_LIST_HEAD(&ctx->event_list); 2031 INIT_LIST_HEAD(&ctx->event_list);
1780 atomic_set(&ctx->refcount, 1); 2032 atomic_set(&ctx->refcount, 1);
1781 ctx->task = task;
1782} 2033}
1783 2034
1784static struct perf_event_context *find_get_context(pid_t pid, int cpu) 2035static struct perf_event_context *
2036alloc_perf_context(struct pmu *pmu, struct task_struct *task)
1785{ 2037{
1786 struct perf_event_context *ctx; 2038 struct perf_event_context *ctx;
1787 struct perf_cpu_context *cpuctx;
1788 struct task_struct *task;
1789 unsigned long flags;
1790 int err;
1791 2039
1792 if (pid == -1 && cpu != -1) { 2040 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1793 /* Must be root to operate on a CPU event: */ 2041 if (!ctx)
1794 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2042 return NULL;
1795 return ERR_PTR(-EACCES);
1796
1797 if (cpu < 0 || cpu >= nr_cpumask_bits)
1798 return ERR_PTR(-EINVAL);
1799 2043
1800 /* 2044 __perf_event_init_context(ctx);
1801 * We could be clever and allow to attach a event to an 2045 if (task) {
1802 * offline CPU and activate it when the CPU comes up, but 2046 ctx->task = task;
1803 * that's for later. 2047 get_task_struct(task);
1804 */ 2048 }
1805 if (!cpu_online(cpu)) 2049 ctx->pmu = pmu;
1806 return ERR_PTR(-ENODEV);
1807 2050
1808 cpuctx = &per_cpu(perf_cpu_context, cpu); 2051 return ctx;
1809 ctx = &cpuctx->ctx; 2052}
1810 get_ctx(ctx);
1811 2053
1812 return ctx; 2054static struct task_struct *
1813 } 2055find_lively_task_by_vpid(pid_t vpid)
2056{
2057 struct task_struct *task;
2058 int err;
1814 2059
1815 rcu_read_lock(); 2060 rcu_read_lock();
1816 if (!pid) 2061 if (!vpid)
1817 task = current; 2062 task = current;
1818 else 2063 else
1819 task = find_task_by_vpid(pid); 2064 task = find_task_by_vpid(vpid);
1820 if (task) 2065 if (task)
1821 get_task_struct(task); 2066 get_task_struct(task);
1822 rcu_read_unlock(); 2067 rcu_read_unlock();
@@ -1836,36 +2081,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1836 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2081 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1837 goto errout; 2082 goto errout;
1838 2083
1839 retry: 2084 return task;
1840 ctx = perf_lock_task_context(task, &flags); 2085errout:
2086 put_task_struct(task);
2087 return ERR_PTR(err);
2088
2089}
2090
2091static struct perf_event_context *
2092find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2093{
2094 struct perf_event_context *ctx;
2095 struct perf_cpu_context *cpuctx;
2096 unsigned long flags;
2097 int ctxn, err;
2098
2099 if (!task && cpu != -1) {
2100 /* Must be root to operate on a CPU event: */
2101 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2102 return ERR_PTR(-EACCES);
2103
2104 if (cpu < 0 || cpu >= nr_cpumask_bits)
2105 return ERR_PTR(-EINVAL);
2106
2107 /*
2108 * We could be clever and allow to attach a event to an
2109 * offline CPU and activate it when the CPU comes up, but
2110 * that's for later.
2111 */
2112 if (!cpu_online(cpu))
2113 return ERR_PTR(-ENODEV);
2114
2115 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2116 ctx = &cpuctx->ctx;
2117 get_ctx(ctx);
2118
2119 return ctx;
2120 }
2121
2122 err = -EINVAL;
2123 ctxn = pmu->task_ctx_nr;
2124 if (ctxn < 0)
2125 goto errout;
2126
2127retry:
2128 ctx = perf_lock_task_context(task, ctxn, &flags);
1841 if (ctx) { 2129 if (ctx) {
1842 unclone_ctx(ctx); 2130 unclone_ctx(ctx);
1843 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1844 } 2132 }
1845 2133
1846 if (!ctx) { 2134 if (!ctx) {
1847 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 2135 ctx = alloc_perf_context(pmu, task);
1848 err = -ENOMEM; 2136 err = -ENOMEM;
1849 if (!ctx) 2137 if (!ctx)
1850 goto errout; 2138 goto errout;
1851 __perf_event_init_context(ctx, task); 2139
1852 get_ctx(ctx); 2140 get_ctx(ctx);
1853 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 2141
2142 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
1854 /* 2143 /*
1855 * We raced with some other task; use 2144 * We raced with some other task; use
1856 * the context they set. 2145 * the context they set.
1857 */ 2146 */
2147 put_task_struct(task);
1858 kfree(ctx); 2148 kfree(ctx);
1859 goto retry; 2149 goto retry;
1860 } 2150 }
1861 get_task_struct(task);
1862 } 2151 }
1863 2152
1864 put_task_struct(task);
1865 return ctx; 2153 return ctx;
1866 2154
1867 errout: 2155errout:
1868 put_task_struct(task);
1869 return ERR_PTR(err); 2156 return ERR_PTR(err);
1870} 2157}
1871 2158
@@ -1882,32 +2169,36 @@ static void free_event_rcu(struct rcu_head *head)
1882 kfree(event); 2169 kfree(event);
1883} 2170}
1884 2171
1885static void perf_pending_sync(struct perf_event *event); 2172static void perf_buffer_put(struct perf_buffer *buffer);
1886static void perf_mmap_data_put(struct perf_mmap_data *data);
1887 2173
1888static void free_event(struct perf_event *event) 2174static void free_event(struct perf_event *event)
1889{ 2175{
1890 perf_pending_sync(event); 2176 irq_work_sync(&event->pending);
1891 2177
1892 if (!event->parent) { 2178 if (!event->parent) {
1893 atomic_dec(&nr_events); 2179 if (event->attach_state & PERF_ATTACH_TASK)
1894 if (event->attr.mmap) 2180 jump_label_dec(&perf_task_events);
2181 if (event->attr.mmap || event->attr.mmap_data)
1895 atomic_dec(&nr_mmap_events); 2182 atomic_dec(&nr_mmap_events);
1896 if (event->attr.comm) 2183 if (event->attr.comm)
1897 atomic_dec(&nr_comm_events); 2184 atomic_dec(&nr_comm_events);
1898 if (event->attr.task) 2185 if (event->attr.task)
1899 atomic_dec(&nr_task_events); 2186 atomic_dec(&nr_task_events);
2187 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2188 put_callchain_buffers();
1900 } 2189 }
1901 2190
1902 if (event->data) { 2191 if (event->buffer) {
1903 perf_mmap_data_put(event->data); 2192 perf_buffer_put(event->buffer);
1904 event->data = NULL; 2193 event->buffer = NULL;
1905 } 2194 }
1906 2195
1907 if (event->destroy) 2196 if (event->destroy)
1908 event->destroy(event); 2197 event->destroy(event);
1909 2198
1910 put_ctx(event->ctx); 2199 if (event->ctx)
2200 put_ctx(event->ctx);
2201
1911 call_rcu(&event->rcu_head, free_event_rcu); 2202 call_rcu(&event->rcu_head, free_event_rcu);
1912} 2203}
1913 2204
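free_event() now waits for deferred work with irq_work_sync() instead of the removed perf_pending_sync() (the hand-rolled pending-list machinery is deleted further down). That assumes event->pending has become a struct irq_work initialized at allocation time, something like:

        /* sketch: in perf_event_alloc(), with event->pending now a struct irq_work */
        init_irq_work(&event->pending, perf_pending_event);

The jump_label_dec(&perf_task_events) presumably pairs with a jump_label_inc() on the allocation side: it lets the scheduler entry points be patched out entirely while no task events exist, instead of checking a counter on every context switch. The new "if (event->ctx)" guard before put_ctx() matches the earlier change that attaches the context in perf_install_in_context(), so an event that fails early can be freed before it ever had a context.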
@@ -2126,13 +2417,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2126static unsigned int perf_poll(struct file *file, poll_table *wait) 2417static unsigned int perf_poll(struct file *file, poll_table *wait)
2127{ 2418{
2128 struct perf_event *event = file->private_data; 2419 struct perf_event *event = file->private_data;
2129 struct perf_mmap_data *data; 2420 struct perf_buffer *buffer;
2130 unsigned int events = POLL_HUP; 2421 unsigned int events = POLL_HUP;
2131 2422
2132 rcu_read_lock(); 2423 rcu_read_lock();
2133 data = rcu_dereference(event->data); 2424 buffer = rcu_dereference(event->buffer);
2134 if (data) 2425 if (buffer)
2135 events = atomic_xchg(&data->poll, 0); 2426 events = atomic_xchg(&buffer->poll, 0);
2136 rcu_read_unlock(); 2427 rcu_read_unlock();
2137 2428
2138 poll_wait(file, &event->waitq, wait); 2429 poll_wait(file, &event->waitq, wait);
@@ -2143,7 +2434,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
2143static void perf_event_reset(struct perf_event *event) 2434static void perf_event_reset(struct perf_event *event)
2144{ 2435{
2145 (void)perf_event_read(event); 2436 (void)perf_event_read(event);
2146 atomic64_set(&event->count, 0); 2437 local64_set(&event->count, 0);
2147 perf_event_update_userpage(event); 2438 perf_event_update_userpage(event);
2148} 2439}
2149 2440
@@ -2186,15 +2477,13 @@ static void perf_event_for_each(struct perf_event *event,
2186static int perf_event_period(struct perf_event *event, u64 __user *arg) 2477static int perf_event_period(struct perf_event *event, u64 __user *arg)
2187{ 2478{
2188 struct perf_event_context *ctx = event->ctx; 2479 struct perf_event_context *ctx = event->ctx;
2189 unsigned long size;
2190 int ret = 0; 2480 int ret = 0;
2191 u64 value; 2481 u64 value;
2192 2482
2193 if (!event->attr.sample_period) 2483 if (!event->attr.sample_period)
2194 return -EINVAL; 2484 return -EINVAL;
2195 2485
2196 size = copy_from_user(&value, arg, sizeof(value)); 2486 if (copy_from_user(&value, arg, sizeof(value)))
2197 if (size != sizeof(value))
2198 return -EFAULT; 2487 return -EFAULT;
2199 2488
2200 if (!value) 2489 if (!value)
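The perf_event_period() change is a bug fix, not just a cleanup: copy_from_user() returns the number of bytes it could not copy, so the old "size != sizeof(value)" test treated a fully successful copy (return value 0) as a fault. The idiomatic form is the one the patch switches to:

        if (copy_from_user(&value, arg, sizeof(value)))
                return -EFAULT;         /* any nonzero return means bytes were left uncopied */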
@@ -2328,6 +2617,9 @@ int perf_event_task_disable(void)
2328 2617
2329static int perf_event_index(struct perf_event *event) 2618static int perf_event_index(struct perf_event *event)
2330{ 2619{
2620 if (event->hw.state & PERF_HES_STOPPED)
2621 return 0;
2622
2331 if (event->state != PERF_EVENT_STATE_ACTIVE) 2623 if (event->state != PERF_EVENT_STATE_ACTIVE)
2332 return 0; 2624 return 0;
2333 2625
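perf_event_index() now also returns 0 while the hardware is stopped, so the index exported through the mmap'ed user page never points user space at a counter that is not currently live. PERF_HES_STOPPED is one of the new hw_perf_event::state flags that mirror the PERF_EF_* start/stop flags; only the STOPPED bit is visible in this hunk, and the values below are an assumption:

/* hw_perf_event::state flags; sketch, values assumed. */
#define PERF_HES_STOPPED        0x01    /* the counter is not currently counting */
#define PERF_HES_UPTODATE       0x02    /* event->count already reflects the hw count */
#define PERF_HES_ARCH           0x04    /* arch-private state */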
@@ -2342,14 +2634,14 @@ static int perf_event_index(struct perf_event *event)
2342void perf_event_update_userpage(struct perf_event *event) 2634void perf_event_update_userpage(struct perf_event *event)
2343{ 2635{
2344 struct perf_event_mmap_page *userpg; 2636 struct perf_event_mmap_page *userpg;
2345 struct perf_mmap_data *data; 2637 struct perf_buffer *buffer;
2346 2638
2347 rcu_read_lock(); 2639 rcu_read_lock();
2348 data = rcu_dereference(event->data); 2640 buffer = rcu_dereference(event->buffer);
2349 if (!data) 2641 if (!buffer)
2350 goto unlock; 2642 goto unlock;
2351 2643
2352 userpg = data->user_page; 2644 userpg = buffer->user_page;
2353 2645
2354 /* 2646 /*
2355 * Disable preemption so as to not let the corresponding user-space 2647 * Disable preemption so as to not let the corresponding user-space
@@ -2359,9 +2651,9 @@ void perf_event_update_userpage(struct perf_event *event)
2359 ++userpg->lock; 2651 ++userpg->lock;
2360 barrier(); 2652 barrier();
2361 userpg->index = perf_event_index(event); 2653 userpg->index = perf_event_index(event);
2362 userpg->offset = atomic64_read(&event->count); 2654 userpg->offset = perf_event_count(event);
2363 if (event->state == PERF_EVENT_STATE_ACTIVE) 2655 if (event->state == PERF_EVENT_STATE_ACTIVE)
2364 userpg->offset -= atomic64_read(&event->hw.prev_count); 2656 userpg->offset -= local64_read(&event->hw.prev_count);
2365 2657
2366 userpg->time_enabled = event->total_time_enabled + 2658 userpg->time_enabled = event->total_time_enabled +
2367 atomic64_read(&event->child_total_time_enabled); 2659 atomic64_read(&event->child_total_time_enabled);
@@ -2376,6 +2668,25 @@ unlock:
2376 rcu_read_unlock(); 2668 rcu_read_unlock();
2377} 2669}
2378 2670
2671static unsigned long perf_data_size(struct perf_buffer *buffer);
2672
2673static void
2674perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
2675{
2676 long max_size = perf_data_size(buffer);
2677
2678 if (watermark)
2679 buffer->watermark = min(max_size, watermark);
2680
2681 if (!buffer->watermark)
2682 buffer->watermark = max_size / 2;
2683
2684 if (flags & PERF_BUFFER_WRITABLE)
2685 buffer->writable = 1;
2686
2687 atomic_set(&buffer->refcount, 1);
2688}
2689
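Alongside the perf_mmap_data -> perf_buffer rename, the watermark/writable/refcount setup is factored into perf_buffer_init(), shared by both the page-based and the vmalloc-based perf_buffer_alloc() below, and the allocator no longer needs the event at all, only (nr_pages, watermark, cpu, flags). The only flag used in this file is the writable bit, passed by perf_mmap() further down when the vma is mapped VM_WRITE; its definition is assumed to be simply:

/* flags for perf_buffer_alloc(); sketch, value assumed */
#define PERF_BUFFER_WRITABLE    0x01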
2379#ifndef CONFIG_PERF_USE_VMALLOC 2690#ifndef CONFIG_PERF_USE_VMALLOC
2380 2691
2381/* 2692/*
@@ -2383,15 +2694,15 @@ unlock:
2383 */ 2694 */
2384 2695
2385static struct page * 2696static struct page *
2386perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2697perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2387{ 2698{
2388 if (pgoff > data->nr_pages) 2699 if (pgoff > buffer->nr_pages)
2389 return NULL; 2700 return NULL;
2390 2701
2391 if (pgoff == 0) 2702 if (pgoff == 0)
2392 return virt_to_page(data->user_page); 2703 return virt_to_page(buffer->user_page);
2393 2704
2394 return virt_to_page(data->data_pages[pgoff - 1]); 2705 return virt_to_page(buffer->data_pages[pgoff - 1]);
2395} 2706}
2396 2707
2397static void *perf_mmap_alloc_page(int cpu) 2708static void *perf_mmap_alloc_page(int cpu)
@@ -2407,42 +2718,44 @@ static void *perf_mmap_alloc_page(int cpu)
2407 return page_address(page); 2718 return page_address(page);
2408} 2719}
2409 2720
2410static struct perf_mmap_data * 2721static struct perf_buffer *
2411perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2722perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2412{ 2723{
2413 struct perf_mmap_data *data; 2724 struct perf_buffer *buffer;
2414 unsigned long size; 2725 unsigned long size;
2415 int i; 2726 int i;
2416 2727
2417 size = sizeof(struct perf_mmap_data); 2728 size = sizeof(struct perf_buffer);
2418 size += nr_pages * sizeof(void *); 2729 size += nr_pages * sizeof(void *);
2419 2730
2420 data = kzalloc(size, GFP_KERNEL); 2731 buffer = kzalloc(size, GFP_KERNEL);
2421 if (!data) 2732 if (!buffer)
2422 goto fail; 2733 goto fail;
2423 2734
2424 data->user_page = perf_mmap_alloc_page(event->cpu); 2735 buffer->user_page = perf_mmap_alloc_page(cpu);
2425 if (!data->user_page) 2736 if (!buffer->user_page)
2426 goto fail_user_page; 2737 goto fail_user_page;
2427 2738
2428 for (i = 0; i < nr_pages; i++) { 2739 for (i = 0; i < nr_pages; i++) {
2429 data->data_pages[i] = perf_mmap_alloc_page(event->cpu); 2740 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
2430 if (!data->data_pages[i]) 2741 if (!buffer->data_pages[i])
2431 goto fail_data_pages; 2742 goto fail_data_pages;
2432 } 2743 }
2433 2744
2434 data->nr_pages = nr_pages; 2745 buffer->nr_pages = nr_pages;
2435 2746
2436 return data; 2747 perf_buffer_init(buffer, watermark, flags);
2748
2749 return buffer;
2437 2750
2438fail_data_pages: 2751fail_data_pages:
2439 for (i--; i >= 0; i--) 2752 for (i--; i >= 0; i--)
2440 free_page((unsigned long)data->data_pages[i]); 2753 free_page((unsigned long)buffer->data_pages[i]);
2441 2754
2442 free_page((unsigned long)data->user_page); 2755 free_page((unsigned long)buffer->user_page);
2443 2756
2444fail_user_page: 2757fail_user_page:
2445 kfree(data); 2758 kfree(buffer);
2446 2759
2447fail: 2760fail:
2448 return NULL; 2761 return NULL;
@@ -2456,17 +2769,17 @@ static void perf_mmap_free_page(unsigned long addr)
2456 __free_page(page); 2769 __free_page(page);
2457} 2770}
2458 2771
2459static void perf_mmap_data_free(struct perf_mmap_data *data) 2772static void perf_buffer_free(struct perf_buffer *buffer)
2460{ 2773{
2461 int i; 2774 int i;
2462 2775
2463 perf_mmap_free_page((unsigned long)data->user_page); 2776 perf_mmap_free_page((unsigned long)buffer->user_page);
2464 for (i = 0; i < data->nr_pages; i++) 2777 for (i = 0; i < buffer->nr_pages; i++)
2465 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2778 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
2466 kfree(data); 2779 kfree(buffer);
2467} 2780}
2468 2781
2469static inline int page_order(struct perf_mmap_data *data) 2782static inline int page_order(struct perf_buffer *buffer)
2470{ 2783{
2471 return 0; 2784 return 0;
2472} 2785}
@@ -2479,18 +2792,18 @@ static inline int page_order(struct perf_mmap_data *data)
2479 * Required for architectures that have d-cache aliasing issues. 2792 * Required for architectures that have d-cache aliasing issues.
2480 */ 2793 */
2481 2794
2482static inline int page_order(struct perf_mmap_data *data) 2795static inline int page_order(struct perf_buffer *buffer)
2483{ 2796{
2484 return data->page_order; 2797 return buffer->page_order;
2485} 2798}
2486 2799
2487static struct page * 2800static struct page *
2488perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2801perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2489{ 2802{
2490 if (pgoff > (1UL << page_order(data))) 2803 if (pgoff > (1UL << page_order(buffer)))
2491 return NULL; 2804 return NULL;
2492 2805
2493 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2806 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
2494} 2807}
2495 2808
2496static void perf_mmap_unmark_page(void *addr) 2809static void perf_mmap_unmark_page(void *addr)
@@ -2500,57 +2813,59 @@ static void perf_mmap_unmark_page(void *addr)
2500 page->mapping = NULL; 2813 page->mapping = NULL;
2501} 2814}
2502 2815
2503static void perf_mmap_data_free_work(struct work_struct *work) 2816static void perf_buffer_free_work(struct work_struct *work)
2504{ 2817{
2505 struct perf_mmap_data *data; 2818 struct perf_buffer *buffer;
2506 void *base; 2819 void *base;
2507 int i, nr; 2820 int i, nr;
2508 2821
2509 data = container_of(work, struct perf_mmap_data, work); 2822 buffer = container_of(work, struct perf_buffer, work);
2510 nr = 1 << page_order(data); 2823 nr = 1 << page_order(buffer);
2511 2824
2512 base = data->user_page; 2825 base = buffer->user_page;
2513 for (i = 0; i < nr + 1; i++) 2826 for (i = 0; i < nr + 1; i++)
2514 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2827 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2515 2828
2516 vfree(base); 2829 vfree(base);
2517 kfree(data); 2830 kfree(buffer);
2518} 2831}
2519 2832
2520static void perf_mmap_data_free(struct perf_mmap_data *data) 2833static void perf_buffer_free(struct perf_buffer *buffer)
2521{ 2834{
2522 schedule_work(&data->work); 2835 schedule_work(&buffer->work);
2523} 2836}
2524 2837
2525static struct perf_mmap_data * 2838static struct perf_buffer *
2526perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2839perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2527{ 2840{
2528 struct perf_mmap_data *data; 2841 struct perf_buffer *buffer;
2529 unsigned long size; 2842 unsigned long size;
2530 void *all_buf; 2843 void *all_buf;
2531 2844
2532 size = sizeof(struct perf_mmap_data); 2845 size = sizeof(struct perf_buffer);
2533 size += sizeof(void *); 2846 size += sizeof(void *);
2534 2847
2535 data = kzalloc(size, GFP_KERNEL); 2848 buffer = kzalloc(size, GFP_KERNEL);
2536 if (!data) 2849 if (!buffer)
2537 goto fail; 2850 goto fail;
2538 2851
2539 INIT_WORK(&data->work, perf_mmap_data_free_work); 2852 INIT_WORK(&buffer->work, perf_buffer_free_work);
2540 2853
2541 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); 2854 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2542 if (!all_buf) 2855 if (!all_buf)
2543 goto fail_all_buf; 2856 goto fail_all_buf;
2544 2857
2545 data->user_page = all_buf; 2858 buffer->user_page = all_buf;
2546 data->data_pages[0] = all_buf + PAGE_SIZE; 2859 buffer->data_pages[0] = all_buf + PAGE_SIZE;
2547 data->page_order = ilog2(nr_pages); 2860 buffer->page_order = ilog2(nr_pages);
2548 data->nr_pages = 1; 2861 buffer->nr_pages = 1;
2862
2863 perf_buffer_init(buffer, watermark, flags);
2549 2864
2550 return data; 2865 return buffer;
2551 2866
2552fail_all_buf: 2867fail_all_buf:
2553 kfree(data); 2868 kfree(buffer);
2554 2869
2555fail: 2870fail:
2556 return NULL; 2871 return NULL;
@@ -2558,15 +2873,15 @@ fail:
2558 2873
2559#endif 2874#endif
2560 2875
2561static unsigned long perf_data_size(struct perf_mmap_data *data) 2876static unsigned long perf_data_size(struct perf_buffer *buffer)
2562{ 2877{
2563 return data->nr_pages << (PAGE_SHIFT + page_order(data)); 2878 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
2564} 2879}
2565 2880
2566static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2881static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2567{ 2882{
2568 struct perf_event *event = vma->vm_file->private_data; 2883 struct perf_event *event = vma->vm_file->private_data;
2569 struct perf_mmap_data *data; 2884 struct perf_buffer *buffer;
2570 int ret = VM_FAULT_SIGBUS; 2885 int ret = VM_FAULT_SIGBUS;
2571 2886
2572 if (vmf->flags & FAULT_FLAG_MKWRITE) { 2887 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -2576,14 +2891,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2576 } 2891 }
2577 2892
2578 rcu_read_lock(); 2893 rcu_read_lock();
2579 data = rcu_dereference(event->data); 2894 buffer = rcu_dereference(event->buffer);
2580 if (!data) 2895 if (!buffer)
2581 goto unlock; 2896 goto unlock;
2582 2897
2583 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 2898 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2584 goto unlock; 2899 goto unlock;
2585 2900
2586 vmf->page = perf_mmap_to_page(data, vmf->pgoff); 2901 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
2587 if (!vmf->page) 2902 if (!vmf->page)
2588 goto unlock; 2903 goto unlock;
2589 2904
@@ -2598,52 +2913,35 @@ unlock:
2598 return ret; 2913 return ret;
2599} 2914}
2600 2915
2601static void 2916static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
2602perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2603{
2604 long max_size = perf_data_size(data);
2605
2606 if (event->attr.watermark) {
2607 data->watermark = min_t(long, max_size,
2608 event->attr.wakeup_watermark);
2609 }
2610
2611 if (!data->watermark)
2612 data->watermark = max_size / 2;
2613
2614 atomic_set(&data->refcount, 1);
2615 rcu_assign_pointer(event->data, data);
2616}
2617
2618static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2619{ 2917{
2620 struct perf_mmap_data *data; 2918 struct perf_buffer *buffer;
2621 2919
2622 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2920 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
2623 perf_mmap_data_free(data); 2921 perf_buffer_free(buffer);
2624} 2922}
2625 2923
2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) 2924static struct perf_buffer *perf_buffer_get(struct perf_event *event)
2627{ 2925{
2628 struct perf_mmap_data *data; 2926 struct perf_buffer *buffer;
2629 2927
2630 rcu_read_lock(); 2928 rcu_read_lock();
2631 data = rcu_dereference(event->data); 2929 buffer = rcu_dereference(event->buffer);
2632 if (data) { 2930 if (buffer) {
2633 if (!atomic_inc_not_zero(&data->refcount)) 2931 if (!atomic_inc_not_zero(&buffer->refcount))
2634 data = NULL; 2932 buffer = NULL;
2635 } 2933 }
2636 rcu_read_unlock(); 2934 rcu_read_unlock();
2637 2935
2638 return data; 2936 return buffer;
2639} 2937}
2640 2938
2641static void perf_mmap_data_put(struct perf_mmap_data *data) 2939static void perf_buffer_put(struct perf_buffer *buffer)
2642{ 2940{
2643 if (!atomic_dec_and_test(&data->refcount)) 2941 if (!atomic_dec_and_test(&buffer->refcount))
2644 return; 2942 return;
2645 2943
2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2944 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
2647} 2945}
2648 2946
2649static void perf_mmap_open(struct vm_area_struct *vma) 2947static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2658,16 +2956,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2658 struct perf_event *event = vma->vm_file->private_data; 2956 struct perf_event *event = vma->vm_file->private_data;
2659 2957
2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2958 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2661 unsigned long size = perf_data_size(event->data); 2959 unsigned long size = perf_data_size(event->buffer);
2662 struct user_struct *user = event->mmap_user; 2960 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data; 2961 struct perf_buffer *buffer = event->buffer;
2664 2962
2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2963 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2666 vma->vm_mm->locked_vm -= event->mmap_locked; 2964 vma->vm_mm->locked_vm -= event->mmap_locked;
2667 rcu_assign_pointer(event->data, NULL); 2965 rcu_assign_pointer(event->buffer, NULL);
2668 mutex_unlock(&event->mmap_mutex); 2966 mutex_unlock(&event->mmap_mutex);
2669 2967
2670 perf_mmap_data_put(data); 2968 perf_buffer_put(buffer);
2671 free_uid(user); 2969 free_uid(user);
2672 } 2970 }
2673} 2971}
@@ -2685,11 +2983,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2685 unsigned long user_locked, user_lock_limit; 2983 unsigned long user_locked, user_lock_limit;
2686 struct user_struct *user = current_user(); 2984 struct user_struct *user = current_user();
2687 unsigned long locked, lock_limit; 2985 unsigned long locked, lock_limit;
2688 struct perf_mmap_data *data; 2986 struct perf_buffer *buffer;
2689 unsigned long vma_size; 2987 unsigned long vma_size;
2690 unsigned long nr_pages; 2988 unsigned long nr_pages;
2691 long user_extra, extra; 2989 long user_extra, extra;
2692 int ret = 0; 2990 int ret = 0, flags = 0;
2693 2991
2694 /* 2992 /*
2695 * Don't allow mmap() of inherited per-task counters. This would 2993 * Don't allow mmap() of inherited per-task counters. This would
@@ -2706,7 +3004,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2706 nr_pages = (vma_size / PAGE_SIZE) - 1; 3004 nr_pages = (vma_size / PAGE_SIZE) - 1;
2707 3005
2708 /* 3006 /*
2709 * If we have data pages ensure they're a power-of-two number, so we 3007 * If we have buffer pages ensure they're a power-of-two number, so we
2710 * can do bitmasks instead of modulo. 3008 * can do bitmasks instead of modulo.
2711 */ 3009 */
2712 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 3010 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -2720,9 +3018,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2720 3018
2721 WARN_ON_ONCE(event->ctx->parent_ctx); 3019 WARN_ON_ONCE(event->ctx->parent_ctx);
2722 mutex_lock(&event->mmap_mutex); 3020 mutex_lock(&event->mmap_mutex);
2723 if (event->data) { 3021 if (event->buffer) {
2724 if (event->data->nr_pages == nr_pages) 3022 if (event->buffer->nr_pages == nr_pages)
2725 atomic_inc(&event->data->refcount); 3023 atomic_inc(&event->buffer->refcount);
2726 else 3024 else
2727 ret = -EINVAL; 3025 ret = -EINVAL;
2728 goto unlock; 3026 goto unlock;
@@ -2752,17 +3050,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2752 goto unlock; 3050 goto unlock;
2753 } 3051 }
2754 3052
2755 WARN_ON(event->data); 3053 WARN_ON(event->buffer);
3054
3055 if (vma->vm_flags & VM_WRITE)
3056 flags |= PERF_BUFFER_WRITABLE;
2756 3057
2757 data = perf_mmap_data_alloc(event, nr_pages); 3058 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
2758 if (!data) { 3059 event->cpu, flags);
3060 if (!buffer) {
2759 ret = -ENOMEM; 3061 ret = -ENOMEM;
2760 goto unlock; 3062 goto unlock;
2761 } 3063 }
2762 3064 rcu_assign_pointer(event->buffer, buffer);
2763 perf_mmap_data_init(event, data);
2764 if (vma->vm_flags & VM_WRITE)
2765 event->data->writable = 1;
2766 3065
2767 atomic_long_add(user_extra, &user->locked_vm); 3066 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra; 3067 event->mmap_locked = extra;
@@ -2824,16 +3123,7 @@ void perf_event_wakeup(struct perf_event *event)
2824 } 3123 }
2825} 3124}
2826 3125
2827/* 3126static void perf_pending_event(struct irq_work *entry)
2828 * Pending wakeups
2829 *
2830 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2831 *
2832 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2833 * single linked list and use cmpxchg() to add entries lockless.
2834 */
2835
2836static void perf_pending_event(struct perf_pending_entry *entry)
2837{ 3127{
2838 struct perf_event *event = container_of(entry, 3128 struct perf_event *event = container_of(entry,
2839 struct perf_event, pending); 3129 struct perf_event, pending);
@@ -2849,104 +3139,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
2849 } 3139 }
2850} 3140}
2851 3141
2852#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2853
2854static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2855 PENDING_TAIL,
2856};
2857
2858static void perf_pending_queue(struct perf_pending_entry *entry,
2859 void (*func)(struct perf_pending_entry *))
2860{
2861 struct perf_pending_entry **head;
2862
2863 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2864 return;
2865
2866 entry->func = func;
2867
2868 head = &get_cpu_var(perf_pending_head);
2869
2870 do {
2871 entry->next = *head;
2872 } while (cmpxchg(head, entry->next, entry) != entry->next);
2873
2874 set_perf_event_pending();
2875
2876 put_cpu_var(perf_pending_head);
2877}
2878
2879static int __perf_pending_run(void)
2880{
2881 struct perf_pending_entry *list;
2882 int nr = 0;
2883
2884 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2885 while (list != PENDING_TAIL) {
2886 void (*func)(struct perf_pending_entry *);
2887 struct perf_pending_entry *entry = list;
2888
2889 list = list->next;
2890
2891 func = entry->func;
2892 entry->next = NULL;
2893 /*
2894 * Ensure we observe the unqueue before we issue the wakeup,
2895 * so that we won't be waiting forever.
2896 * -- see perf_not_pending().
2897 */
2898 smp_wmb();
2899
2900 func(entry);
2901 nr++;
2902 }
2903
2904 return nr;
2905}
2906
2907static inline int perf_not_pending(struct perf_event *event)
2908{
2909 /*
2910 * If we flush on whatever cpu we run, there is a chance we don't
2911 * need to wait.
2912 */
2913 get_cpu();
2914 __perf_pending_run();
2915 put_cpu();
2916
2917 /*
2918 * Ensure we see the proper queue state before going to sleep
2919 * so that we do not miss the wakeup. -- see perf_pending_handle()
2920 */
2921 smp_rmb();
2922 return event->pending.next == NULL;
2923}
2924
2925static void perf_pending_sync(struct perf_event *event)
2926{
2927 wait_event(event->waitq, perf_not_pending(event));
2928}
2929
2930void perf_event_do_pending(void)
2931{
2932 __perf_pending_run();
2933}
2934
2935/*
2936 * Callchain support -- arch specific
2937 */
2938
2939__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2940{
2941 return NULL;
2942}
2943
2944__weak
2945void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2946{
2947}
2948
2949
2950/* 3142/*
2951 * We assume there is only KVM supporting the callbacks. 3143 * We assume there is only KVM supporting the callbacks.
2952 * Later on, we might change it to a list if there is 3144 * Later on, we might change it to a list if there is
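The block removed above implemented deferred wakeups as a lock-free singly linked list pushed with cmpxchg() and drained with xchg(); the replacement simply queues an irq_work. A stand-alone sketch of the old push/drain pattern, assuming C11 atomics in place of the kernel primitives and using illustrative names throughout:

#include <stdatomic.h>
#include <stdio.h>

struct pending_entry {
        struct pending_entry *next;
        void (*func)(struct pending_entry *);
};

#define PENDING_TAIL ((struct pending_entry *)-1UL)

static _Atomic(struct pending_entry *) pending_head = PENDING_TAIL;

static void pending_queue(struct pending_entry *entry,
                          void (*func)(struct pending_entry *))
{
        struct pending_entry *head;

        entry->func = func;
        head = atomic_load(&pending_head);
        do {
                entry->next = head;     /* link onto the current head    */
        } while (!atomic_compare_exchange_weak(&pending_head, &head, entry));
}

static void pending_run(void)
{
        /* steal the whole list in one atomic exchange */
        struct pending_entry *list = atomic_exchange(&pending_head, PENDING_TAIL);

        while (list != PENDING_TAIL) {
                struct pending_entry *entry = list;

                list = list->next;
                entry->func(entry);
        }
}

static void say_hello(struct pending_entry *entry)
{
        printf("ran entry %p\n", (void *)entry);
}

int main(void)
{
        struct pending_entry a, b;

        pending_queue(&a, say_hello);
        pending_queue(&b, say_hello);
        pending_run();
        return 0;
}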
@@ -2971,15 +3163,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2971/* 3163/*
2972 * Output 3164 * Output
2973 */ 3165 */
2974static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, 3166static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
2975 unsigned long offset, unsigned long head) 3167 unsigned long offset, unsigned long head)
2976{ 3168{
2977 unsigned long mask; 3169 unsigned long mask;
2978 3170
2979 if (!data->writable) 3171 if (!buffer->writable)
2980 return true; 3172 return true;
2981 3173
2982 mask = perf_data_size(data) - 1; 3174 mask = perf_data_size(buffer) - 1;
2983 3175
2984 offset = (offset - tail) & mask; 3176 offset = (offset - tail) & mask;
2985 head = (head - tail) & mask; 3177 head = (head - tail) & mask;
@@ -2992,12 +3184,11 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2992 3184
2993static void perf_output_wakeup(struct perf_output_handle *handle) 3185static void perf_output_wakeup(struct perf_output_handle *handle)
2994{ 3186{
2995 atomic_set(&handle->data->poll, POLL_IN); 3187 atomic_set(&handle->buffer->poll, POLL_IN);
2996 3188
2997 if (handle->nmi) { 3189 if (handle->nmi) {
2998 handle->event->pending_wakeup = 1; 3190 handle->event->pending_wakeup = 1;
2999 perf_pending_queue(&handle->event->pending, 3191 irq_work_queue(&handle->event->pending);
3000 perf_pending_event);
3001 } else 3192 } else
3002 perf_event_wakeup(handle->event); 3193 perf_event_wakeup(handle->event);
3003} 3194}
@@ -3012,48 +3203,48 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
3012 */ 3203 */
3013static void perf_output_get_handle(struct perf_output_handle *handle) 3204static void perf_output_get_handle(struct perf_output_handle *handle)
3014{ 3205{
3015 struct perf_mmap_data *data = handle->data; 3206 struct perf_buffer *buffer = handle->buffer;
3016 3207
3017 preempt_disable(); 3208 preempt_disable();
3018 local_inc(&data->nest); 3209 local_inc(&buffer->nest);
3019 handle->wakeup = local_read(&data->wakeup); 3210 handle->wakeup = local_read(&buffer->wakeup);
3020} 3211}
3021 3212
3022static void perf_output_put_handle(struct perf_output_handle *handle) 3213static void perf_output_put_handle(struct perf_output_handle *handle)
3023{ 3214{
3024 struct perf_mmap_data *data = handle->data; 3215 struct perf_buffer *buffer = handle->buffer;
3025 unsigned long head; 3216 unsigned long head;
3026 3217
3027again: 3218again:
3028 head = local_read(&data->head); 3219 head = local_read(&buffer->head);
3029 3220
3030 /* 3221 /*
3031 * IRQ/NMI can happen here, which means we can miss a head update. 3222 * IRQ/NMI can happen here, which means we can miss a head update.
3032 */ 3223 */
3033 3224
3034 if (!local_dec_and_test(&data->nest)) 3225 if (!local_dec_and_test(&buffer->nest))
3035 goto out; 3226 goto out;
3036 3227
3037 /* 3228 /*
3038 * Publish the known good head. Rely on the full barrier implied 3229 * Publish the known good head. Rely on the full barrier implied
3039 * by atomic_dec_and_test() to order the data->head read and this 3230 * by atomic_dec_and_test() to order the buffer->head read and this
3040 * write. 3231 * write.
3041 */ 3232 */
3042 data->user_page->data_head = head; 3233 buffer->user_page->data_head = head;
3043 3234
3044 /* 3235 /*
3045 * Now check if we missed an update, rely on the (compiler) 3236 * Now check if we missed an update, rely on the (compiler)
3046 * barrier in atomic_dec_and_test() to re-read data->head. 3237 * barrier in atomic_dec_and_test() to re-read buffer->head.
3047 */ 3238 */
3048 if (unlikely(head != local_read(&data->head))) { 3239 if (unlikely(head != local_read(&buffer->head))) {
3049 local_inc(&data->nest); 3240 local_inc(&buffer->nest);
3050 goto again; 3241 goto again;
3051 } 3242 }
3052 3243
3053 if (handle->wakeup != local_read(&data->wakeup)) 3244 if (handle->wakeup != local_read(&buffer->wakeup))
3054 perf_output_wakeup(handle); 3245 perf_output_wakeup(handle);
3055 3246
3056 out: 3247out:
3057 preempt_enable(); 3248 preempt_enable();
3058} 3249}
3059 3250
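perf_output_put_handle() above only publishes data_head from the outermost writer, tracked by the nest count, and re-checks head afterwards in case an IRQ/NMI writer moved it between the read and the publish. A single-threaded sketch of that shape, where a recursive call stands in for an interrupt and all names are made up for illustration:

#include <stdio.h>

static unsigned long head;        /* writer's private cursor            */
static unsigned long data_head;   /* the value a reader would see       */
static int nest;                  /* how deeply nested the writers are  */

static void output_end(void);

static void write_record(unsigned long size, int simulate_irq)
{
        nest++;                           /* perf_output_get_handle()    */
        head += size;                     /* reserve and copy the record */
        if (simulate_irq)
                write_record(16, 0);      /* an "interrupt" logs its own record */
        output_end();
}

static void output_end(void)              /* perf_output_put_handle()    */
{
        unsigned long observed;

again:
        observed = head;
        if (--nest)                       /* nested writer: leave publishing */
                return;                   /* to the outermost one            */

        data_head = observed;             /* publish the known-good head     */

        /*
         * The real code must recheck: an interrupt may have advanced head
         * after we read it.  A synchronous demo can never hit this path.
         */
        if (observed != head) {
                nest++;
                goto again;
        }
}

int main(void)
{
        write_record(64, 1);
        printf("head=%lu, published data_head=%lu\n", head, data_head);
        return 0;
}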
@@ -3070,12 +3261,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3070 buf += size; 3261 buf += size;
3071 handle->size -= size; 3262 handle->size -= size;
3072 if (!handle->size) { 3263 if (!handle->size) {
3073 struct perf_mmap_data *data = handle->data; 3264 struct perf_buffer *buffer = handle->buffer;
3074 3265
3075 handle->page++; 3266 handle->page++;
3076 handle->page &= data->nr_pages - 1; 3267 handle->page &= buffer->nr_pages - 1;
3077 handle->addr = data->data_pages[handle->page]; 3268 handle->addr = buffer->data_pages[handle->page];
3078 handle->size = PAGE_SIZE << page_order(data); 3269 handle->size = PAGE_SIZE << page_order(buffer);
3079 } 3270 }
3080 } while (len); 3271 } while (len);
3081} 3272}
@@ -3084,7 +3275,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3084 struct perf_event *event, unsigned int size, 3275 struct perf_event *event, unsigned int size,
3085 int nmi, int sample) 3276 int nmi, int sample)
3086{ 3277{
3087 struct perf_mmap_data *data; 3278 struct perf_buffer *buffer;
3088 unsigned long tail, offset, head; 3279 unsigned long tail, offset, head;
3089 int have_lost; 3280 int have_lost;
3090 struct { 3281 struct {
@@ -3100,19 +3291,19 @@ int perf_output_begin(struct perf_output_handle *handle,
3100 if (event->parent) 3291 if (event->parent)
3101 event = event->parent; 3292 event = event->parent;
3102 3293
3103 data = rcu_dereference(event->data); 3294 buffer = rcu_dereference(event->buffer);
3104 if (!data) 3295 if (!buffer)
3105 goto out; 3296 goto out;
3106 3297
3107 handle->data = data; 3298 handle->buffer = buffer;
3108 handle->event = event; 3299 handle->event = event;
3109 handle->nmi = nmi; 3300 handle->nmi = nmi;
3110 handle->sample = sample; 3301 handle->sample = sample;
3111 3302
3112 if (!data->nr_pages) 3303 if (!buffer->nr_pages)
3113 goto out; 3304 goto out;
3114 3305
3115 have_lost = local_read(&data->lost); 3306 have_lost = local_read(&buffer->lost);
3116 if (have_lost) 3307 if (have_lost)
3117 size += sizeof(lost_event); 3308 size += sizeof(lost_event);
3118 3309
@@ -3124,30 +3315,30 @@ int perf_output_begin(struct perf_output_handle *handle,
3124 * tail pointer. So that all reads will be completed before the 3315 * tail pointer. So that all reads will be completed before the
3125 * write is issued. 3316 * write is issued.
3126 */ 3317 */
3127 tail = ACCESS_ONCE(data->user_page->data_tail); 3318 tail = ACCESS_ONCE(buffer->user_page->data_tail);
3128 smp_rmb(); 3319 smp_rmb();
3129 offset = head = local_read(&data->head); 3320 offset = head = local_read(&buffer->head);
3130 head += size; 3321 head += size;
3131 if (unlikely(!perf_output_space(data, tail, offset, head))) 3322 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
3132 goto fail; 3323 goto fail;
3133 } while (local_cmpxchg(&data->head, offset, head) != offset); 3324 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
3134 3325
3135 if (head - local_read(&data->wakeup) > data->watermark) 3326 if (head - local_read(&buffer->wakeup) > buffer->watermark)
3136 local_add(data->watermark, &data->wakeup); 3327 local_add(buffer->watermark, &buffer->wakeup);
3137 3328
3138 handle->page = offset >> (PAGE_SHIFT + page_order(data)); 3329 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
3139 handle->page &= data->nr_pages - 1; 3330 handle->page &= buffer->nr_pages - 1;
3140 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); 3331 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
3141 handle->addr = data->data_pages[handle->page]; 3332 handle->addr = buffer->data_pages[handle->page];
3142 handle->addr += handle->size; 3333 handle->addr += handle->size;
3143 handle->size = (PAGE_SIZE << page_order(data)) - handle->size; 3334 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
3144 3335
3145 if (have_lost) { 3336 if (have_lost) {
3146 lost_event.header.type = PERF_RECORD_LOST; 3337 lost_event.header.type = PERF_RECORD_LOST;
3147 lost_event.header.misc = 0; 3338 lost_event.header.misc = 0;
3148 lost_event.header.size = sizeof(lost_event); 3339 lost_event.header.size = sizeof(lost_event);
3149 lost_event.id = event->id; 3340 lost_event.id = event->id;
3150 lost_event.lost = local_xchg(&data->lost, 0); 3341 lost_event.lost = local_xchg(&buffer->lost, 0);
3151 3342
3152 perf_output_put(handle, lost_event); 3343 perf_output_put(handle, lost_event);
3153 } 3344 }
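The loop above reserves space by snapshotting the consumer's data_tail, computing the new head, checking that the record still fits, and claiming the bytes with a compare-and-swap so a concurrent (for example NMI) writer cannot take the same range. A rough user-space sketch of the same reservation scheme, using C11 atomics and invented helper names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BUF_SIZE 4096UL                 /* must be a power of two */

static _Atomic unsigned long rb_head;   /* producer cursor        */
static _Atomic unsigned long rb_tail;   /* consumer cursor        */

static bool rb_reserve(unsigned long size, unsigned long *offset)
{
        unsigned long tail, old_head, new_head;

        old_head = atomic_load(&rb_head);
        do {
                /* acquire pairs with the consumer publishing its tail */
                tail = atomic_load_explicit(&rb_tail, memory_order_acquire);
                new_head = old_head + size;
                /* would the record overwrite bytes not yet consumed? */
                if (new_head - tail > BUF_SIZE)
                        return false;   /* caller accounts a "lost" record */
        } while (!atomic_compare_exchange_weak(&rb_head, &old_head, new_head));

        *offset = old_head & (BUF_SIZE - 1);
        return true;
}

int main(void)
{
        unsigned long off;

        if (rb_reserve(128, &off))
                printf("reserved 128 bytes at offset %lu\n", off);
        return 0;
}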
@@ -3155,7 +3346,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3155 return 0; 3346 return 0;
3156 3347
3157fail: 3348fail:
3158 local_inc(&data->lost); 3349 local_inc(&buffer->lost);
3159 perf_output_put_handle(handle); 3350 perf_output_put_handle(handle);
3160out: 3351out:
3161 rcu_read_unlock(); 3352 rcu_read_unlock();
@@ -3166,15 +3357,15 @@ out:
3166void perf_output_end(struct perf_output_handle *handle) 3357void perf_output_end(struct perf_output_handle *handle)
3167{ 3358{
3168 struct perf_event *event = handle->event; 3359 struct perf_event *event = handle->event;
3169 struct perf_mmap_data *data = handle->data; 3360 struct perf_buffer *buffer = handle->buffer;
3170 3361
3171 int wakeup_events = event->attr.wakeup_events; 3362 int wakeup_events = event->attr.wakeup_events;
3172 3363
3173 if (handle->sample && wakeup_events) { 3364 if (handle->sample && wakeup_events) {
3174 int events = local_inc_return(&data->events); 3365 int events = local_inc_return(&buffer->events);
3175 if (events >= wakeup_events) { 3366 if (events >= wakeup_events) {
3176 local_sub(wakeup_events, &data->events); 3367 local_sub(wakeup_events, &buffer->events);
3177 local_inc(&data->wakeup); 3368 local_inc(&buffer->wakeup);
3178 } 3369 }
3179 } 3370 }
3180 3371
@@ -3211,7 +3402,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3211 u64 values[4]; 3402 u64 values[4];
3212 int n = 0; 3403 int n = 0;
3213 3404
3214 values[n++] = atomic64_read(&event->count); 3405 values[n++] = perf_event_count(event);
3215 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3406 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3216 values[n++] = event->total_time_enabled + 3407 values[n++] = event->total_time_enabled +
3217 atomic64_read(&event->child_total_time_enabled); 3408 atomic64_read(&event->child_total_time_enabled);
@@ -3248,7 +3439,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3248 if (leader != event) 3439 if (leader != event)
3249 leader->pmu->read(leader); 3440 leader->pmu->read(leader);
3250 3441
3251 values[n++] = atomic64_read(&leader->count); 3442 values[n++] = perf_event_count(leader);
3252 if (read_format & PERF_FORMAT_ID) 3443 if (read_format & PERF_FORMAT_ID)
3253 values[n++] = primary_event_id(leader); 3444 values[n++] = primary_event_id(leader);
3254 3445
@@ -3260,7 +3451,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3260 if (sub != event) 3451 if (sub != event)
3261 sub->pmu->read(sub); 3452 sub->pmu->read(sub);
3262 3453
3263 values[n++] = atomic64_read(&sub->count); 3454 values[n++] = perf_event_count(sub);
3264 if (read_format & PERF_FORMAT_ID) 3455 if (read_format & PERF_FORMAT_ID)
3265 values[n++] = primary_event_id(sub); 3456 values[n++] = primary_event_id(sub);
3266 3457
@@ -3441,14 +3632,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
3441 struct perf_output_handle handle; 3632 struct perf_output_handle handle;
3442 struct perf_event_header header; 3633 struct perf_event_header header;
3443 3634
3635 /* protect the callchain buffers */
3636 rcu_read_lock();
3637
3444 perf_prepare_sample(&header, data, event, regs); 3638 perf_prepare_sample(&header, data, event, regs);
3445 3639
3446 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 3640 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3447 return; 3641 goto exit;
3448 3642
3449 perf_output_sample(&handle, &header, data, event); 3643 perf_output_sample(&handle, &header, data, event);
3450 3644
3451 perf_output_end(&handle); 3645 perf_output_end(&handle);
3646
3647exit:
3648 rcu_read_unlock();
3452} 3649}
3453 3650
3454/* 3651/*
@@ -3491,7 +3688,7 @@ perf_event_read_event(struct perf_event *event,
3491/* 3688/*
3492 * task tracking -- fork/exit 3689 * task tracking -- fork/exit
3493 * 3690 *
3494 * enabled by: attr.comm | attr.mmap | attr.task 3691 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
3495 */ 3692 */
3496 3693
3497struct perf_task_event { 3694struct perf_task_event {
@@ -3541,7 +3738,8 @@ static int perf_event_task_match(struct perf_event *event)
3541 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3738 if (event->cpu != -1 && event->cpu != smp_processor_id())
3542 return 0; 3739 return 0;
3543 3740
3544 if (event->attr.comm || event->attr.mmap || event->attr.task) 3741 if (event->attr.comm || event->attr.mmap ||
3742 event->attr.mmap_data || event->attr.task)
3545 return 1; 3743 return 1;
3546 3744
3547 return 0; 3745 return 0;
@@ -3561,16 +3759,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3561static void perf_event_task_event(struct perf_task_event *task_event) 3759static void perf_event_task_event(struct perf_task_event *task_event)
3562{ 3760{
3563 struct perf_cpu_context *cpuctx; 3761 struct perf_cpu_context *cpuctx;
3564 struct perf_event_context *ctx = task_event->task_ctx; 3762 struct perf_event_context *ctx;
3763 struct pmu *pmu;
3764 int ctxn;
3565 3765
3566 rcu_read_lock(); 3766 rcu_read_lock();
3567 cpuctx = &get_cpu_var(perf_cpu_context); 3767 list_for_each_entry_rcu(pmu, &pmus, entry) {
3568 perf_event_task_ctx(&cpuctx->ctx, task_event); 3768 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3569 if (!ctx) 3769 perf_event_task_ctx(&cpuctx->ctx, task_event);
3570 ctx = rcu_dereference(current->perf_event_ctxp); 3770
3571 if (ctx) 3771 ctx = task_event->task_ctx;
3572 perf_event_task_ctx(ctx, task_event); 3772 if (!ctx) {
3573 put_cpu_var(perf_cpu_context); 3773 ctxn = pmu->task_ctx_nr;
3774 if (ctxn < 0)
3775 goto next;
3776 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3777 }
3778 if (ctx)
3779 perf_event_task_ctx(ctx, task_event);
3780next:
3781 put_cpu_ptr(pmu->pmu_cpu_context);
3782 }
3574 rcu_read_unlock(); 3783 rcu_read_unlock();
3575} 3784}
3576 3785
@@ -3675,8 +3884,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3675{ 3884{
3676 struct perf_cpu_context *cpuctx; 3885 struct perf_cpu_context *cpuctx;
3677 struct perf_event_context *ctx; 3886 struct perf_event_context *ctx;
3678 unsigned int size;
3679 char comm[TASK_COMM_LEN]; 3887 char comm[TASK_COMM_LEN];
3888 unsigned int size;
3889 struct pmu *pmu;
3890 int ctxn;
3680 3891
3681 memset(comm, 0, sizeof(comm)); 3892 memset(comm, 0, sizeof(comm));
3682 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 3893 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3688,21 +3899,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3688 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3899 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3689 3900
3690 rcu_read_lock(); 3901 rcu_read_lock();
3691 cpuctx = &get_cpu_var(perf_cpu_context); 3902 list_for_each_entry_rcu(pmu, &pmus, entry) {
3692 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3903 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3693 ctx = rcu_dereference(current->perf_event_ctxp); 3904 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3694 if (ctx) 3905
3695 perf_event_comm_ctx(ctx, comm_event); 3906 ctxn = pmu->task_ctx_nr;
3696 put_cpu_var(perf_cpu_context); 3907 if (ctxn < 0)
3908 goto next;
3909
3910 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3911 if (ctx)
3912 perf_event_comm_ctx(ctx, comm_event);
3913next:
3914 put_cpu_ptr(pmu->pmu_cpu_context);
3915 }
3697 rcu_read_unlock(); 3916 rcu_read_unlock();
3698} 3917}
3699 3918
3700void perf_event_comm(struct task_struct *task) 3919void perf_event_comm(struct task_struct *task)
3701{ 3920{
3702 struct perf_comm_event comm_event; 3921 struct perf_comm_event comm_event;
3922 struct perf_event_context *ctx;
3923 int ctxn;
3703 3924
3704 if (task->perf_event_ctxp) 3925 for_each_task_context_nr(ctxn) {
3705 perf_event_enable_on_exec(task); 3926 ctx = task->perf_event_ctxp[ctxn];
3927 if (!ctx)
3928 continue;
3929
3930 perf_event_enable_on_exec(ctx);
3931 }
3706 3932
3707 if (!atomic_read(&nr_comm_events)) 3933 if (!atomic_read(&nr_comm_events))
3708 return; 3934 return;
@@ -3766,7 +3992,8 @@ static void perf_event_mmap_output(struct perf_event *event,
3766} 3992}
3767 3993
3768static int perf_event_mmap_match(struct perf_event *event, 3994static int perf_event_mmap_match(struct perf_event *event,
3769 struct perf_mmap_event *mmap_event) 3995 struct perf_mmap_event *mmap_event,
3996 int executable)
3770{ 3997{
3771 if (event->state < PERF_EVENT_STATE_INACTIVE) 3998 if (event->state < PERF_EVENT_STATE_INACTIVE)
3772 return 0; 3999 return 0;
@@ -3774,19 +4001,21 @@ static int perf_event_mmap_match(struct perf_event *event,
3774 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4001 if (event->cpu != -1 && event->cpu != smp_processor_id())
3775 return 0; 4002 return 0;
3776 4003
3777 if (event->attr.mmap) 4004 if ((!executable && event->attr.mmap_data) ||
4005 (executable && event->attr.mmap))
3778 return 1; 4006 return 1;
3779 4007
3780 return 0; 4008 return 0;
3781} 4009}
3782 4010
3783static void perf_event_mmap_ctx(struct perf_event_context *ctx, 4011static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3784 struct perf_mmap_event *mmap_event) 4012 struct perf_mmap_event *mmap_event,
4013 int executable)
3785{ 4014{
3786 struct perf_event *event; 4015 struct perf_event *event;
3787 4016
3788 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4017 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3789 if (perf_event_mmap_match(event, mmap_event)) 4018 if (perf_event_mmap_match(event, mmap_event, executable))
3790 perf_event_mmap_output(event, mmap_event); 4019 perf_event_mmap_output(event, mmap_event);
3791 } 4020 }
3792} 4021}
@@ -3801,6 +4030,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3801 char tmp[16]; 4030 char tmp[16];
3802 char *buf = NULL; 4031 char *buf = NULL;
3803 const char *name; 4032 const char *name;
4033 struct pmu *pmu;
4034 int ctxn;
3804 4035
3805 memset(tmp, 0, sizeof(tmp)); 4036 memset(tmp, 0, sizeof(tmp));
3806 4037
@@ -3830,6 +4061,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3830 if (!vma->vm_mm) { 4061 if (!vma->vm_mm) {
3831 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 4062 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3832 goto got_name; 4063 goto got_name;
4064 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4065 vma->vm_end >= vma->vm_mm->brk) {
4066 name = strncpy(tmp, "[heap]", sizeof(tmp));
4067 goto got_name;
4068 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4069 vma->vm_end >= vma->vm_mm->start_stack) {
4070 name = strncpy(tmp, "[stack]", sizeof(tmp));
4071 goto got_name;
3833 } 4072 }
3834 4073
3835 name = strncpy(tmp, "//anon", sizeof(tmp)); 4074 name = strncpy(tmp, "//anon", sizeof(tmp));
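The new branches above label anonymous mappings as [heap] or [stack] by comparing the vma range against the mm's brk and initial stack bounds. A small sketch of that classification with the mm fields mocked as plain variables (the struct and function names are illustrative):

#include <stdio.h>

struct mm_bounds {
        unsigned long start_brk, brk;
        unsigned long start_stack;
};

static const char *anon_vma_name(unsigned long vm_start, unsigned long vm_end,
                                 const struct mm_bounds *mm)
{
        if (vm_start <= mm->start_brk && vm_end >= mm->brk)
                return "[heap]";
        if (vm_start <= mm->start_stack && vm_end >= mm->start_stack)
                return "[stack]";
        return "//anon";
}

int main(void)
{
        struct mm_bounds mm = {
                .start_brk   = 0x1000000, .brk = 0x1008000,
                .start_stack = 0x7fff0000,
        };

        printf("%s\n", anon_vma_name(0x1000000, 0x1010000, &mm));  /* [heap]  */
        printf("%s\n", anon_vma_name(0x7ffe0000, 0x7fff1000, &mm)); /* [stack] */
        return 0;
}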
@@ -3845,18 +4084,29 @@ got_name:
3845 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4084 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3846 4085
3847 rcu_read_lock(); 4086 rcu_read_lock();
3848 cpuctx = &get_cpu_var(perf_cpu_context); 4087 list_for_each_entry_rcu(pmu, &pmus, entry) {
3849 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 4088 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3850 ctx = rcu_dereference(current->perf_event_ctxp); 4089 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
3851 if (ctx) 4090 vma->vm_flags & VM_EXEC);
3852 perf_event_mmap_ctx(ctx, mmap_event); 4091
3853 put_cpu_var(perf_cpu_context); 4092 ctxn = pmu->task_ctx_nr;
4093 if (ctxn < 0)
4094 goto next;
4095
4096 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4097 if (ctx) {
4098 perf_event_mmap_ctx(ctx, mmap_event,
4099 vma->vm_flags & VM_EXEC);
4100 }
4101next:
4102 put_cpu_ptr(pmu->pmu_cpu_context);
4103 }
3854 rcu_read_unlock(); 4104 rcu_read_unlock();
3855 4105
3856 kfree(buf); 4106 kfree(buf);
3857} 4107}
3858 4108
3859void __perf_event_mmap(struct vm_area_struct *vma) 4109void perf_event_mmap(struct vm_area_struct *vma)
3860{ 4110{
3861 struct perf_mmap_event mmap_event; 4111 struct perf_mmap_event mmap_event;
3862 4112
@@ -3932,8 +4182,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3932 struct hw_perf_event *hwc = &event->hw; 4182 struct hw_perf_event *hwc = &event->hw;
3933 int ret = 0; 4183 int ret = 0;
3934 4184
3935 throttle = (throttle && event->pmu->unthrottle != NULL);
3936
3937 if (!throttle) { 4185 if (!throttle) {
3938 hwc->interrupts++; 4186 hwc->interrupts++;
3939 } else { 4187 } else {
@@ -3976,8 +4224,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3976 event->pending_kill = POLL_HUP; 4224 event->pending_kill = POLL_HUP;
3977 if (nmi) { 4225 if (nmi) {
3978 event->pending_disable = 1; 4226 event->pending_disable = 1;
3979 perf_pending_queue(&event->pending, 4227 irq_work_queue(&event->pending);
3980 perf_pending_event);
3981 } else 4228 } else
3982 perf_event_disable(event); 4229 perf_event_disable(event);
3983 } 4230 }
@@ -4001,6 +4248,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
4001 * Generic software event infrastructure 4248 * Generic software event infrastructure
4002 */ 4249 */
4003 4250
4251struct swevent_htable {
4252 struct swevent_hlist *swevent_hlist;
4253 struct mutex hlist_mutex;
4254 int hlist_refcount;
4255
4256 /* Recursion avoidance in each contexts */
4257 int recursion[PERF_NR_CONTEXTS];
4258};
4259
4260static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4261
4004/* 4262/*
4005 * We directly increment event->count and keep a second value in 4263 * We directly increment event->count and keep a second value in
4006 * event->hw.period_left to count intervals. This period event 4264 * event->hw.period_left to count intervals. This period event
@@ -4018,14 +4276,14 @@ static u64 perf_swevent_set_period(struct perf_event *event)
4018 hwc->last_period = hwc->sample_period; 4276 hwc->last_period = hwc->sample_period;
4019 4277
4020again: 4278again:
4021 old = val = atomic64_read(&hwc->period_left); 4279 old = val = local64_read(&hwc->period_left);
4022 if (val < 0) 4280 if (val < 0)
4023 return 0; 4281 return 0;
4024 4282
4025 nr = div64_u64(period + val, period); 4283 nr = div64_u64(period + val, period);
4026 offset = nr * period; 4284 offset = nr * period;
4027 val -= offset; 4285 val -= offset;
4028 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) 4286 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
4029 goto again; 4287 goto again;
4030 4288
4031 return nr; 4289 return nr;
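perf_swevent_set_period() lets period_left climb from -sample_period toward zero; once it is non-negative, (period + left) / period whole periods have elapsed and the remainder is carried forward. A worked single-threaded sketch of that arithmetic, dropping the cmpxchg retry and using plain integers in place of local64_t:

#include <stdio.h>

static long long period_left;

static unsigned long long swevent_set_period(unsigned long long period)
{
        long long val = period_left;
        unsigned long long nr;

        if (val < 0)
                return 0;                             /* no overflow yet      */

        nr = (period + val) / period;                 /* full periods elapsed */
        period_left = val - (long long)(nr * period); /* negative remainder   */
        return nr;
}

int main(void)
{
        unsigned long long period = 100;
        unsigned long long nr;

        period_left = -(long long)period;             /* arm the first period */
        period_left += 250;                           /* 250 events arrive    */

        nr = swevent_set_period(period);
        printf("overflows=%llu, new period_left=%lld\n", nr, period_left);
        /* prints: overflows=2, new period_left=-50 */
        return 0;
}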
@@ -4058,13 +4316,13 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4058 } 4316 }
4059} 4317}
4060 4318
4061static void perf_swevent_add(struct perf_event *event, u64 nr, 4319static void perf_swevent_event(struct perf_event *event, u64 nr,
4062 int nmi, struct perf_sample_data *data, 4320 int nmi, struct perf_sample_data *data,
4063 struct pt_regs *regs) 4321 struct pt_regs *regs)
4064{ 4322{
4065 struct hw_perf_event *hwc = &event->hw; 4323 struct hw_perf_event *hwc = &event->hw;
4066 4324
4067 atomic64_add(nr, &event->count); 4325 local64_add(nr, &event->count);
4068 4326
4069 if (!regs) 4327 if (!regs)
4070 return; 4328 return;
@@ -4075,7 +4333,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4075 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4333 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4076 return perf_swevent_overflow(event, 1, nmi, data, regs); 4334 return perf_swevent_overflow(event, 1, nmi, data, regs);
4077 4335
4078 if (atomic64_add_negative(nr, &hwc->period_left)) 4336 if (local64_add_negative(nr, &hwc->period_left))
4079 return; 4337 return;
4080 4338
4081 perf_swevent_overflow(event, 0, nmi, data, regs); 4339 perf_swevent_overflow(event, 0, nmi, data, regs);
@@ -4084,6 +4342,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4084static int perf_exclude_event(struct perf_event *event, 4342static int perf_exclude_event(struct perf_event *event,
4085 struct pt_regs *regs) 4343 struct pt_regs *regs)
4086{ 4344{
4345 if (event->hw.state & PERF_HES_STOPPED)
4346 return 0;
4347
4087 if (regs) { 4348 if (regs) {
4088 if (event->attr.exclude_user && user_mode(regs)) 4349 if (event->attr.exclude_user && user_mode(regs))
4089 return 1; 4350 return 1;
@@ -4130,11 +4391,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4130 4391
4131/* For the read side: events when they trigger */ 4392/* For the read side: events when they trigger */
4132static inline struct hlist_head * 4393static inline struct hlist_head *
4133find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4394find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4134{ 4395{
4135 struct swevent_hlist *hlist; 4396 struct swevent_hlist *hlist;
4136 4397
4137 hlist = rcu_dereference(ctx->swevent_hlist); 4398 hlist = rcu_dereference(swhash->swevent_hlist);
4138 if (!hlist) 4399 if (!hlist)
4139 return NULL; 4400 return NULL;
4140 4401
@@ -4143,7 +4404,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4143 4404
4144/* For the event head insertion and removal in the hlist */ 4405/* For the event head insertion and removal in the hlist */
4145static inline struct hlist_head * 4406static inline struct hlist_head *
4146find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 4407find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4147{ 4408{
4148 struct swevent_hlist *hlist; 4409 struct swevent_hlist *hlist;
4149 u32 event_id = event->attr.config; 4410 u32 event_id = event->attr.config;
@@ -4154,7 +4415,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4154 * and release. Which makes the protected version suitable here. 4415 * and release. Which makes the protected version suitable here.
4155 * The context lock guarantees that. 4416 * The context lock guarantees that.
4156 */ 4417 */
4157 hlist = rcu_dereference_protected(ctx->swevent_hlist, 4418 hlist = rcu_dereference_protected(swhash->swevent_hlist,
4158 lockdep_is_held(&event->ctx->lock)); 4419 lockdep_is_held(&event->ctx->lock));
4159 if (!hlist) 4420 if (!hlist)
4160 return NULL; 4421 return NULL;
@@ -4167,23 +4428,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4167 struct perf_sample_data *data, 4428 struct perf_sample_data *data,
4168 struct pt_regs *regs) 4429 struct pt_regs *regs)
4169{ 4430{
4170 struct perf_cpu_context *cpuctx; 4431 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4171 struct perf_event *event; 4432 struct perf_event *event;
4172 struct hlist_node *node; 4433 struct hlist_node *node;
4173 struct hlist_head *head; 4434 struct hlist_head *head;
4174 4435
4175 cpuctx = &__get_cpu_var(perf_cpu_context);
4176
4177 rcu_read_lock(); 4436 rcu_read_lock();
4178 4437 head = find_swevent_head_rcu(swhash, type, event_id);
4179 head = find_swevent_head_rcu(cpuctx, type, event_id);
4180
4181 if (!head) 4438 if (!head)
4182 goto end; 4439 goto end;
4183 4440
4184 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4441 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4185 if (perf_swevent_match(event, type, event_id, data, regs)) 4442 if (perf_swevent_match(event, type, event_id, data, regs))
4186 perf_swevent_add(event, nr, nmi, data, regs); 4443 perf_swevent_event(event, nr, nmi, data, regs);
4187 } 4444 }
4188end: 4445end:
4189 rcu_read_unlock(); 4446 rcu_read_unlock();
@@ -4191,36 +4448,18 @@ end:
4191 4448
4192int perf_swevent_get_recursion_context(void) 4449int perf_swevent_get_recursion_context(void)
4193{ 4450{
4194 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4451 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4195 int rctx;
4196
4197 if (in_nmi())
4198 rctx = 3;
4199 else if (in_irq())
4200 rctx = 2;
4201 else if (in_softirq())
4202 rctx = 1;
4203 else
4204 rctx = 0;
4205 4452
4206 if (cpuctx->recursion[rctx]) 4453 return get_recursion_context(swhash->recursion);
4207 return -1;
4208
4209 cpuctx->recursion[rctx]++;
4210 barrier();
4211
4212 return rctx;
4213} 4454}
4214EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4455EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4215 4456
4216void perf_swevent_put_recursion_context(int rctx) 4457void inline perf_swevent_put_recursion_context(int rctx)
4217{ 4458{
4218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4459 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4219 barrier();
4220 cpuctx->recursion[rctx]--;
4221}
4222EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4223 4460
4461 put_recursion_context(swhash->recursion, rctx);
4462}
4224 4463
4225void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4464void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4226 struct pt_regs *regs, u64 addr) 4465 struct pt_regs *regs, u64 addr)
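The recursion protection being moved above keeps one counter per context level (task, softirq, hardirq, NMI) so a software event raised while the same level is already handling one is dropped rather than recursing. A minimal sketch of that guard, with a plain array standing in for the per-cpu state and the level passed in by hand since user space has no in_nmi()/in_irq():

#include <stdio.h>

#define NR_CONTEXTS 4

static int recursion[NR_CONTEXTS];

static int get_recursion_context(int level)
{
        if (recursion[level])
                return -1;              /* already inside this context: bail */
        recursion[level]++;
        return level;                   /* "rctx" handed back to the caller  */
}

static void put_recursion_context(int rctx)
{
        recursion[rctx]--;
}

int main(void)
{
        int rctx = get_recursion_context(0);

        printf("first entry: %d\n", rctx);                          /*  0 */
        printf("recursive entry: %d\n", get_recursion_context(0));  /* -1 */
        if (rctx >= 0)
                put_recursion_context(rctx);
        return 0;
}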
@@ -4245,20 +4484,20 @@ static void perf_swevent_read(struct perf_event *event)
4245{ 4484{
4246} 4485}
4247 4486
4248static int perf_swevent_enable(struct perf_event *event) 4487static int perf_swevent_add(struct perf_event *event, int flags)
4249{ 4488{
4489 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4250 struct hw_perf_event *hwc = &event->hw; 4490 struct hw_perf_event *hwc = &event->hw;
4251 struct perf_cpu_context *cpuctx;
4252 struct hlist_head *head; 4491 struct hlist_head *head;
4253 4492
4254 cpuctx = &__get_cpu_var(perf_cpu_context);
4255
4256 if (hwc->sample_period) { 4493 if (hwc->sample_period) {
4257 hwc->last_period = hwc->sample_period; 4494 hwc->last_period = hwc->sample_period;
4258 perf_swevent_set_period(event); 4495 perf_swevent_set_period(event);
4259 } 4496 }
4260 4497
4261 head = find_swevent_head(cpuctx, event); 4498 hwc->state = !(flags & PERF_EF_START);
4499
4500 head = find_swevent_head(swhash, event);
4262 if (WARN_ON_ONCE(!head)) 4501 if (WARN_ON_ONCE(!head))
4263 return -EINVAL; 4502 return -EINVAL;
4264 4503
@@ -4267,202 +4506,27 @@ static int perf_swevent_enable(struct perf_event *event)
4267 return 0; 4506 return 0;
4268} 4507}
4269 4508
4270static void perf_swevent_disable(struct perf_event *event) 4509static void perf_swevent_del(struct perf_event *event, int flags)
4271{ 4510{
4272 hlist_del_rcu(&event->hlist_entry); 4511 hlist_del_rcu(&event->hlist_entry);
4273} 4512}
4274 4513
4275static void perf_swevent_void(struct perf_event *event) 4514static void perf_swevent_start(struct perf_event *event, int flags)
4276{
4277}
4278
4279static int perf_swevent_int(struct perf_event *event)
4280{
4281 return 0;
4282}
4283
4284static const struct pmu perf_ops_generic = {
4285 .enable = perf_swevent_enable,
4286 .disable = perf_swevent_disable,
4287 .start = perf_swevent_int,
4288 .stop = perf_swevent_void,
4289 .read = perf_swevent_read,
4290 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4291};
4292
4293/*
4294 * hrtimer based swevent callback
4295 */
4296
4297static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4298{ 4515{
4299 enum hrtimer_restart ret = HRTIMER_RESTART; 4516 event->hw.state = 0;
4300 struct perf_sample_data data;
4301 struct pt_regs *regs;
4302 struct perf_event *event;
4303 u64 period;
4304
4305 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4306 event->pmu->read(event);
4307
4308 perf_sample_data_init(&data, 0);
4309 data.period = event->hw.last_period;
4310 regs = get_irq_regs();
4311
4312 if (regs && !perf_exclude_event(event, regs)) {
4313 if (!(event->attr.exclude_idle && current->pid == 0))
4314 if (perf_event_overflow(event, 0, &data, regs))
4315 ret = HRTIMER_NORESTART;
4316 }
4317
4318 period = max_t(u64, 10000, event->hw.sample_period);
4319 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4320
4321 return ret;
4322} 4517}
4323 4518
4324static void perf_swevent_start_hrtimer(struct perf_event *event) 4519static void perf_swevent_stop(struct perf_event *event, int flags)
4325{ 4520{
4326 struct hw_perf_event *hwc = &event->hw; 4521 event->hw.state = PERF_HES_STOPPED;
4327
4328 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4329 hwc->hrtimer.function = perf_swevent_hrtimer;
4330 if (hwc->sample_period) {
4331 u64 period;
4332
4333 if (hwc->remaining) {
4334 if (hwc->remaining < 0)
4335 period = 10000;
4336 else
4337 period = hwc->remaining;
4338 hwc->remaining = 0;
4339 } else {
4340 period = max_t(u64, 10000, hwc->sample_period);
4341 }
4342 __hrtimer_start_range_ns(&hwc->hrtimer,
4343 ns_to_ktime(period), 0,
4344 HRTIMER_MODE_REL, 0);
4345 }
4346}
4347
4348static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4349{
4350 struct hw_perf_event *hwc = &event->hw;
4351
4352 if (hwc->sample_period) {
4353 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4354 hwc->remaining = ktime_to_ns(remaining);
4355
4356 hrtimer_cancel(&hwc->hrtimer);
4357 }
4358}
4359
4360/*
4361 * Software event: cpu wall time clock
4362 */
4363
4364static void cpu_clock_perf_event_update(struct perf_event *event)
4365{
4366 int cpu = raw_smp_processor_id();
4367 s64 prev;
4368 u64 now;
4369
4370 now = cpu_clock(cpu);
4371 prev = atomic64_xchg(&event->hw.prev_count, now);
4372 atomic64_add(now - prev, &event->count);
4373}
4374
4375static int cpu_clock_perf_event_enable(struct perf_event *event)
4376{
4377 struct hw_perf_event *hwc = &event->hw;
4378 int cpu = raw_smp_processor_id();
4379
4380 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4381 perf_swevent_start_hrtimer(event);
4382
4383 return 0;
4384}
4385
4386static void cpu_clock_perf_event_disable(struct perf_event *event)
4387{
4388 perf_swevent_cancel_hrtimer(event);
4389 cpu_clock_perf_event_update(event);
4390}
4391
4392static void cpu_clock_perf_event_read(struct perf_event *event)
4393{
4394 cpu_clock_perf_event_update(event);
4395}
4396
4397static const struct pmu perf_ops_cpu_clock = {
4398 .enable = cpu_clock_perf_event_enable,
4399 .disable = cpu_clock_perf_event_disable,
4400 .read = cpu_clock_perf_event_read,
4401};
4402
4403/*
4404 * Software event: task time clock
4405 */
4406
4407static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4408{
4409 u64 prev;
4410 s64 delta;
4411
4412 prev = atomic64_xchg(&event->hw.prev_count, now);
4413 delta = now - prev;
4414 atomic64_add(delta, &event->count);
4415}
4416
4417static int task_clock_perf_event_enable(struct perf_event *event)
4418{
4419 struct hw_perf_event *hwc = &event->hw;
4420 u64 now;
4421
4422 now = event->ctx->time;
4423
4424 atomic64_set(&hwc->prev_count, now);
4425
4426 perf_swevent_start_hrtimer(event);
4427
4428 return 0;
4429}
4430
4431static void task_clock_perf_event_disable(struct perf_event *event)
4432{
4433 perf_swevent_cancel_hrtimer(event);
4434 task_clock_perf_event_update(event, event->ctx->time);
4435
4436}
4437
4438static void task_clock_perf_event_read(struct perf_event *event)
4439{
4440 u64 time;
4441
4442 if (!in_nmi()) {
4443 update_context_time(event->ctx);
4444 time = event->ctx->time;
4445 } else {
4446 u64 now = perf_clock();
4447 u64 delta = now - event->ctx->timestamp;
4448 time = event->ctx->time + delta;
4449 }
4450
4451 task_clock_perf_event_update(event, time);
4452} 4522}
4453 4523
4454static const struct pmu perf_ops_task_clock = {
4455 .enable = task_clock_perf_event_enable,
4456 .disable = task_clock_perf_event_disable,
4457 .read = task_clock_perf_event_read,
4458};
4459
4460/* Deref the hlist from the update side */ 4524/* Deref the hlist from the update side */
4461static inline struct swevent_hlist * 4525static inline struct swevent_hlist *
4462swevent_hlist_deref(struct perf_cpu_context *cpuctx) 4526swevent_hlist_deref(struct swevent_htable *swhash)
4463{ 4527{
4464 return rcu_dereference_protected(cpuctx->swevent_hlist, 4528 return rcu_dereference_protected(swhash->swevent_hlist,
4465 lockdep_is_held(&cpuctx->hlist_mutex)); 4529 lockdep_is_held(&swhash->hlist_mutex));
4466} 4530}
4467 4531
4468static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4532static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4473,27 +4537,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4473 kfree(hlist); 4537 kfree(hlist);
4474} 4538}
4475 4539
4476static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4540static void swevent_hlist_release(struct swevent_htable *swhash)
4477{ 4541{
4478 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 4542 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4479 4543
4480 if (!hlist) 4544 if (!hlist)
4481 return; 4545 return;
4482 4546
4483 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4547 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4484 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4548 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4485} 4549}
4486 4550
4487static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4551static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4488{ 4552{
4489 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4553 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4490 4554
4491 mutex_lock(&cpuctx->hlist_mutex); 4555 mutex_lock(&swhash->hlist_mutex);
4492 4556
4493 if (!--cpuctx->hlist_refcount) 4557 if (!--swhash->hlist_refcount)
4494 swevent_hlist_release(cpuctx); 4558 swevent_hlist_release(swhash);
4495 4559
4496 mutex_unlock(&cpuctx->hlist_mutex); 4560 mutex_unlock(&swhash->hlist_mutex);
4497} 4561}
4498 4562
4499static void swevent_hlist_put(struct perf_event *event) 4563static void swevent_hlist_put(struct perf_event *event)
@@ -4511,12 +4575,12 @@ static void swevent_hlist_put(struct perf_event *event)
4511 4575
4512static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 4576static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4513{ 4577{
4514 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4578 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4515 int err = 0; 4579 int err = 0;
4516 4580
4517 mutex_lock(&cpuctx->hlist_mutex); 4581 mutex_lock(&swhash->hlist_mutex);
4518 4582
4519 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 4583 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4520 struct swevent_hlist *hlist; 4584 struct swevent_hlist *hlist;
4521 4585
4522 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4586 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4524,11 +4588,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4524 err = -ENOMEM; 4588 err = -ENOMEM;
4525 goto exit; 4589 goto exit;
4526 } 4590 }
4527 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 4591 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4528 } 4592 }
4529 cpuctx->hlist_refcount++; 4593 swhash->hlist_refcount++;
4530 exit: 4594exit:
4531 mutex_unlock(&cpuctx->hlist_mutex); 4595 mutex_unlock(&swhash->hlist_mutex);
4532 4596
4533 return err; 4597 return err;
4534} 4598}
@@ -4552,7 +4616,7 @@ static int swevent_hlist_get(struct perf_event *event)
4552 put_online_cpus(); 4616 put_online_cpus();
4553 4617
4554 return 0; 4618 return 0;
4555 fail: 4619fail:
4556 for_each_possible_cpu(cpu) { 4620 for_each_possible_cpu(cpu) {
4557 if (cpu == failed_cpu) 4621 if (cpu == failed_cpu)
4558 break; 4622 break;
@@ -4563,17 +4627,64 @@ static int swevent_hlist_get(struct perf_event *event)
4563 return err; 4627 return err;
4564} 4628}
4565 4629
4566#ifdef CONFIG_EVENT_TRACING 4630atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4631
4632static void sw_perf_event_destroy(struct perf_event *event)
4633{
4634 u64 event_id = event->attr.config;
4635
4636 WARN_ON(event->parent);
4637
4638 jump_label_dec(&perf_swevent_enabled[event_id]);
4639 swevent_hlist_put(event);
4640}
4641
4642static int perf_swevent_init(struct perf_event *event)
4643{
4644 int event_id = event->attr.config;
4645
4646 if (event->attr.type != PERF_TYPE_SOFTWARE)
4647 return -ENOENT;
4648
4649 switch (event_id) {
4650 case PERF_COUNT_SW_CPU_CLOCK:
4651 case PERF_COUNT_SW_TASK_CLOCK:
4652 return -ENOENT;
4653
4654 default:
4655 break;
4656 }
4657
4658 if (event_id > PERF_COUNT_SW_MAX)
4659 return -ENOENT;
4567 4660
4568static const struct pmu perf_ops_tracepoint = { 4661 if (!event->parent) {
4569 .enable = perf_trace_enable, 4662 int err;
4570 .disable = perf_trace_disable, 4663
4571 .start = perf_swevent_int, 4664 err = swevent_hlist_get(event);
4572 .stop = perf_swevent_void, 4665 if (err)
4666 return err;
4667
4668 jump_label_inc(&perf_swevent_enabled[event_id]);
4669 event->destroy = sw_perf_event_destroy;
4670 }
4671
4672 return 0;
4673}
4674
4675static struct pmu perf_swevent = {
4676 .task_ctx_nr = perf_sw_context,
4677
4678 .event_init = perf_swevent_init,
4679 .add = perf_swevent_add,
4680 .del = perf_swevent_del,
4681 .start = perf_swevent_start,
4682 .stop = perf_swevent_stop,
4573 .read = perf_swevent_read, 4683 .read = perf_swevent_read,
4574 .unthrottle = perf_swevent_void,
4575}; 4684};
4576 4685
4686#ifdef CONFIG_EVENT_TRACING
4687
4577static int perf_tp_filter_match(struct perf_event *event, 4688static int perf_tp_filter_match(struct perf_event *event,
4578 struct perf_sample_data *data) 4689 struct perf_sample_data *data)
4579{ 4690{
@@ -4601,7 +4712,7 @@ static int perf_tp_event_match(struct perf_event *event,
4601} 4712}
4602 4713
4603void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 4714void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4604 struct pt_regs *regs, struct hlist_head *head) 4715 struct pt_regs *regs, struct hlist_head *head, int rctx)
4605{ 4716{
4606 struct perf_sample_data data; 4717 struct perf_sample_data data;
4607 struct perf_event *event; 4718 struct perf_event *event;
@@ -4615,12 +4726,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4615 perf_sample_data_init(&data, addr); 4726 perf_sample_data_init(&data, addr);
4616 data.raw = &raw; 4727 data.raw = &raw;
4617 4728
4618 rcu_read_lock();
4619 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4729 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4620 if (perf_tp_event_match(event, &data, regs)) 4730 if (perf_tp_event_match(event, &data, regs))
4621 perf_swevent_add(event, count, 1, &data, regs); 4731 perf_swevent_event(event, count, 1, &data, regs);
4622 } 4732 }
4623 rcu_read_unlock(); 4733
4734 perf_swevent_put_recursion_context(rctx);
4624} 4735}
4625EXPORT_SYMBOL_GPL(perf_tp_event); 4736EXPORT_SYMBOL_GPL(perf_tp_event);
4626 4737
@@ -4629,10 +4740,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
4629 perf_trace_destroy(event); 4740 perf_trace_destroy(event);
4630} 4741}
4631 4742
4632static const struct pmu *tp_perf_event_init(struct perf_event *event) 4743static int perf_tp_event_init(struct perf_event *event)
4633{ 4744{
4634 int err; 4745 int err;
4635 4746
4747 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4748 return -ENOENT;
4749
4636 /* 4750 /*
4637 * Raw tracepoint data is a severe data leak, only allow root to 4751 * Raw tracepoint data is a severe data leak, only allow root to
4638 * have these. 4752 * have these.
@@ -4640,15 +4754,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4640 if ((event->attr.sample_type & PERF_SAMPLE_RAW) && 4754 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4641 perf_paranoid_tracepoint_raw() && 4755 perf_paranoid_tracepoint_raw() &&
4642 !capable(CAP_SYS_ADMIN)) 4756 !capable(CAP_SYS_ADMIN))
4643 return ERR_PTR(-EPERM); 4757 return -EPERM;
4644 4758
4645 err = perf_trace_init(event); 4759 err = perf_trace_init(event);
4646 if (err) 4760 if (err)
4647 return NULL; 4761 return err;
4648 4762
4649 event->destroy = tp_perf_event_destroy; 4763 event->destroy = tp_perf_event_destroy;
4650 4764
4651 return &perf_ops_tracepoint; 4765 return 0;
4766}
4767
4768static struct pmu perf_tracepoint = {
4769 .task_ctx_nr = perf_sw_context,
4770
4771 .event_init = perf_tp_event_init,
4772 .add = perf_trace_add,
4773 .del = perf_trace_del,
4774 .start = perf_swevent_start,
4775 .stop = perf_swevent_stop,
4776 .read = perf_swevent_read,
4777};
4778
4779static inline void perf_tp_register(void)
4780{
4781 perf_pmu_register(&perf_tracepoint);
4652} 4782}
4653 4783
4654static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4784static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4676,9 +4806,8 @@ static void perf_event_free_filter(struct perf_event *event)
4676 4806
4677#else 4807#else
4678 4808
4679static const struct pmu *tp_perf_event_init(struct perf_event *event) 4809static inline void perf_tp_register(void)
4680{ 4810{
4681 return NULL;
4682} 4811}
4683 4812
4684static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4813static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4693,105 +4822,389 @@ static void perf_event_free_filter(struct perf_event *event)
4693#endif /* CONFIG_EVENT_TRACING */ 4822#endif /* CONFIG_EVENT_TRACING */
4694 4823
4695#ifdef CONFIG_HAVE_HW_BREAKPOINT 4824#ifdef CONFIG_HAVE_HW_BREAKPOINT
4696static void bp_perf_event_destroy(struct perf_event *event) 4825void perf_bp_event(struct perf_event *bp, void *data)
4697{ 4826{
4698 release_bp_slot(event); 4827 struct perf_sample_data sample;
4828 struct pt_regs *regs = data;
4829
4830 perf_sample_data_init(&sample, bp->attr.bp_addr);
4831
4832 if (!bp->hw.state && !perf_exclude_event(bp, regs))
4833 perf_swevent_event(bp, 1, 1, &sample, regs);
4699} 4834}
4835#endif
4700 4836
4701static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4837/*
4838 * hrtimer based swevent callback
4839 */
4840
4841static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4702{ 4842{
4703 int err; 4843 enum hrtimer_restart ret = HRTIMER_RESTART;
4844 struct perf_sample_data data;
4845 struct pt_regs *regs;
4846 struct perf_event *event;
4847 u64 period;
4704 4848
4705 err = register_perf_hw_breakpoint(bp); 4849 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4706 if (err) 4850 event->pmu->read(event);
4707 return ERR_PTR(err);
4708 4851
4709 bp->destroy = bp_perf_event_destroy; 4852 perf_sample_data_init(&data, 0);
4853 data.period = event->hw.last_period;
4854 regs = get_irq_regs();
4710 4855
4711 return &perf_ops_bp; 4856 if (regs && !perf_exclude_event(event, regs)) {
4857 if (!(event->attr.exclude_idle && current->pid == 0))
4858 if (perf_event_overflow(event, 0, &data, regs))
4859 ret = HRTIMER_NORESTART;
4860 }
4861
4862 period = max_t(u64, 10000, event->hw.sample_period);
4863 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4864
4865 return ret;
4712} 4866}
4713 4867
4714void perf_bp_event(struct perf_event *bp, void *data) 4868static void perf_swevent_start_hrtimer(struct perf_event *event)
4715{ 4869{
4716 struct perf_sample_data sample; 4870 struct hw_perf_event *hwc = &event->hw;
4717 struct pt_regs *regs = data;
4718 4871
4719 perf_sample_data_init(&sample, bp->attr.bp_addr); 4872 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4873 hwc->hrtimer.function = perf_swevent_hrtimer;
4874 if (hwc->sample_period) {
4875 s64 period = local64_read(&hwc->period_left);
4720 4876
4721 if (!perf_exclude_event(bp, regs)) 4877 if (period) {
4722 perf_swevent_add(bp, 1, 1, &sample, regs); 4878 if (period < 0)
4879 period = 10000;
4880
4881 local64_set(&hwc->period_left, 0);
4882 } else {
4883 period = max_t(u64, 10000, hwc->sample_period);
4884 }
4885 __hrtimer_start_range_ns(&hwc->hrtimer,
4886 ns_to_ktime(period), 0,
4887 HRTIMER_MODE_REL_PINNED, 0);
4888 }
4723} 4889}
4724#else 4890
4725static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4891static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4726{ 4892{
4727 return NULL; 4893 struct hw_perf_event *hwc = &event->hw;
4894
4895 if (hwc->sample_period) {
4896 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4897 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4898
4899 hrtimer_cancel(&hwc->hrtimer);
4900 }
4728} 4901}
4729 4902
4730void perf_bp_event(struct perf_event *bp, void *regs) 4903/*
4904 * Software event: cpu wall time clock
4905 */
4906
4907static void cpu_clock_event_update(struct perf_event *event)
4731{ 4908{
4909 s64 prev;
4910 u64 now;
4911
4912 now = local_clock();
4913 prev = local64_xchg(&event->hw.prev_count, now);
4914 local64_add(now - prev, &event->count);
4732} 4915}
4733#endif
4734 4916
4735atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4917static void cpu_clock_event_start(struct perf_event *event, int flags)
4918{
4919 local64_set(&event->hw.prev_count, local_clock());
4920 perf_swevent_start_hrtimer(event);
4921}
4736 4922
4737static void sw_perf_event_destroy(struct perf_event *event) 4923static void cpu_clock_event_stop(struct perf_event *event, int flags)
4738{ 4924{
4739 u64 event_id = event->attr.config; 4925 perf_swevent_cancel_hrtimer(event);
4926 cpu_clock_event_update(event);
4927}
4740 4928
4741 WARN_ON(event->parent); 4929static int cpu_clock_event_add(struct perf_event *event, int flags)
4930{
4931 if (flags & PERF_EF_START)
4932 cpu_clock_event_start(event, flags);
4742 4933
4743 atomic_dec(&perf_swevent_enabled[event_id]); 4934 return 0;
4744 swevent_hlist_put(event);
4745} 4935}
4746 4936
4747static const struct pmu *sw_perf_event_init(struct perf_event *event) 4937static void cpu_clock_event_del(struct perf_event *event, int flags)
4748{ 4938{
4749 const struct pmu *pmu = NULL; 4939 cpu_clock_event_stop(event, flags);
4750 u64 event_id = event->attr.config; 4940}
4941
4942static void cpu_clock_event_read(struct perf_event *event)
4943{
4944 cpu_clock_event_update(event);
4945}
4946
4947static int cpu_clock_event_init(struct perf_event *event)
4948{
4949 if (event->attr.type != PERF_TYPE_SOFTWARE)
4950 return -ENOENT;
4951
4952 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
4953 return -ENOENT;
4954
4955 return 0;
4956}
4957
4958static struct pmu perf_cpu_clock = {
4959 .task_ctx_nr = perf_sw_context,
4960
4961 .event_init = cpu_clock_event_init,
4962 .add = cpu_clock_event_add,
4963 .del = cpu_clock_event_del,
4964 .start = cpu_clock_event_start,
4965 .stop = cpu_clock_event_stop,
4966 .read = cpu_clock_event_read,
4967};
4968
4969/*
4970 * Software event: task time clock
4971 */
4972
4973static void task_clock_event_update(struct perf_event *event, u64 now)
4974{
4975 u64 prev;
4976 s64 delta;
4977
4978 prev = local64_xchg(&event->hw.prev_count, now);
4979 delta = now - prev;
4980 local64_add(delta, &event->count);
4981}
4982
4983static void task_clock_event_start(struct perf_event *event, int flags)
4984{
4985 local64_set(&event->hw.prev_count, event->ctx->time);
4986 perf_swevent_start_hrtimer(event);
4987}
4988
4989static void task_clock_event_stop(struct perf_event *event, int flags)
4990{
4991 perf_swevent_cancel_hrtimer(event);
4992 task_clock_event_update(event, event->ctx->time);
4993}
4994
4995static int task_clock_event_add(struct perf_event *event, int flags)
4996{
4997 if (flags & PERF_EF_START)
4998 task_clock_event_start(event, flags);
4999
5000 return 0;
5001}
5002
5003static void task_clock_event_del(struct perf_event *event, int flags)
5004{
5005 task_clock_event_stop(event, PERF_EF_UPDATE);
5006}
5007
5008static void task_clock_event_read(struct perf_event *event)
5009{
5010 u64 time;
5011
5012 if (!in_nmi()) {
5013 update_context_time(event->ctx);
5014 time = event->ctx->time;
5015 } else {
5016 u64 now = perf_clock();
5017 u64 delta = now - event->ctx->timestamp;
5018 time = event->ctx->time + delta;
5019 }
5020
5021 task_clock_event_update(event, time);
5022}
5023
5024static int task_clock_event_init(struct perf_event *event)
5025{
5026 if (event->attr.type != PERF_TYPE_SOFTWARE)
5027 return -ENOENT;
5028
5029 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5030 return -ENOENT;
5031
5032 return 0;
5033}
5034
5035static struct pmu perf_task_clock = {
5036 .task_ctx_nr = perf_sw_context,
5037
5038 .event_init = task_clock_event_init,
5039 .add = task_clock_event_add,
5040 .del = task_clock_event_del,
5041 .start = task_clock_event_start,
5042 .stop = task_clock_event_stop,
5043 .read = task_clock_event_read,
5044};
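Both clock events above accumulate time by swapping prev_count for the current clock and adding the difference to count, so no interval is counted twice even when start, stop and read interleave. A user-space sketch of that accounting, assuming C11 atomics and CLOCK_MONOTONIC in place of local64_t and the kernel clocks:

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static _Atomic unsigned long long prev_count;
static _Atomic unsigned long long count;

static unsigned long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void clock_event_update(void)
{
        unsigned long long now  = now_ns();
        unsigned long long prev = atomic_exchange(&prev_count, now);

        atomic_fetch_add(&count, now - prev);   /* credit the elapsed slice */
}

int main(void)
{
        atomic_store(&prev_count, now_ns());    /* ...event_start()         */
        for (volatile int i = 0; i < 1000000; i++)
                ;                               /* some "work"              */
        clock_event_update();                   /* ...event_read()/stop()   */
        printf("accumulated %llu ns\n",
               (unsigned long long)atomic_load(&count));
        return 0;
}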
5045
5046static void perf_pmu_nop_void(struct pmu *pmu)
5047{
5048}
5049
5050static int perf_pmu_nop_int(struct pmu *pmu)
5051{
5052 return 0;
5053}
5054
5055static void perf_pmu_start_txn(struct pmu *pmu)
5056{
5057 perf_pmu_disable(pmu);
5058}
5059
5060static int perf_pmu_commit_txn(struct pmu *pmu)
5061{
5062 perf_pmu_enable(pmu);
5063 return 0;
5064}
5065
5066static void perf_pmu_cancel_txn(struct pmu *pmu)
5067{
5068 perf_pmu_enable(pmu);
5069}
5070
5071/*
5072 * Ensures all contexts with the same task_ctx_nr have the same
5073 * pmu_cpu_context too.
5074 */
5075static void *find_pmu_context(int ctxn)
5076{
5077 struct pmu *pmu;
5078
5079 if (ctxn < 0)
5080 return NULL;
5081
5082 list_for_each_entry(pmu, &pmus, entry) {
5083 if (pmu->task_ctx_nr == ctxn)
5084 return pmu->pmu_cpu_context;
5085 }
5086
5087 return NULL;
5088}
5089
5090static void free_pmu_context(void * __percpu cpu_context)
5091{
5092 struct pmu *pmu;
4751 5093
5094 mutex_lock(&pmus_lock);
4752 /* 5095 /*
4753 * Software events (currently) can't in general distinguish 5096 * Like a real lame refcount.
4754 * between user, kernel and hypervisor events.
4755 * However, context switches and cpu migrations are considered
4756 * to be kernel events, and page faults are never hypervisor
4757 * events.
4758 */ 5097 */
4759 switch (event_id) { 5098 list_for_each_entry(pmu, &pmus, entry) {
4760 case PERF_COUNT_SW_CPU_CLOCK: 5099 if (pmu->pmu_cpu_context == cpu_context)
4761 pmu = &perf_ops_cpu_clock; 5100 goto out;
5101 }
4762 5102
4763 break; 5103 free_percpu(cpu_context);
4764 case PERF_COUNT_SW_TASK_CLOCK: 5104out:
4765 /* 5105 mutex_unlock(&pmus_lock);
4766 * If the user instantiates this as a per-cpu event, 5106}
4767 * use the cpu_clock event instead.
4768 */
4769 if (event->ctx->task)
4770 pmu = &perf_ops_task_clock;
4771 else
4772 pmu = &perf_ops_cpu_clock;
4773 5107
4774 break; 5108int perf_pmu_register(struct pmu *pmu)
4775 case PERF_COUNT_SW_PAGE_FAULTS: 5109{
4776 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 5110 int cpu, ret;
4777 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4778 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4779 case PERF_COUNT_SW_CPU_MIGRATIONS:
4780 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4781 case PERF_COUNT_SW_EMULATION_FAULTS:
4782 if (!event->parent) {
4783 int err;
4784
4785 err = swevent_hlist_get(event);
4786 if (err)
4787 return ERR_PTR(err);
4788 5111
4789 atomic_inc(&perf_swevent_enabled[event_id]); 5112 mutex_lock(&pmus_lock);
4790 event->destroy = sw_perf_event_destroy; 5113 ret = -ENOMEM;
5114 pmu->pmu_disable_count = alloc_percpu(int);
5115 if (!pmu->pmu_disable_count)
5116 goto unlock;
5117
5118 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5119 if (pmu->pmu_cpu_context)
5120 goto got_cpu_context;
5121
5122 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5123 if (!pmu->pmu_cpu_context)
5124 goto free_pdc;
5125
5126 for_each_possible_cpu(cpu) {
5127 struct perf_cpu_context *cpuctx;
5128
5129 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5130 __perf_event_init_context(&cpuctx->ctx);
5131 cpuctx->ctx.type = cpu_context;
5132 cpuctx->ctx.pmu = pmu;
5133 cpuctx->jiffies_interval = 1;
5134 INIT_LIST_HEAD(&cpuctx->rotation_list);
5135 }
5136
5137got_cpu_context:
5138 if (!pmu->start_txn) {
5139 if (pmu->pmu_enable) {
5140 /*
5141 * If we have pmu_enable/pmu_disable calls, install
5142 * transaction stubs that use that to try and batch
5143 * hardware accesses.
5144 */
5145 pmu->start_txn = perf_pmu_start_txn;
5146 pmu->commit_txn = perf_pmu_commit_txn;
5147 pmu->cancel_txn = perf_pmu_cancel_txn;
5148 } else {
5149 pmu->start_txn = perf_pmu_nop_void;
5150 pmu->commit_txn = perf_pmu_nop_int;
5151 pmu->cancel_txn = perf_pmu_nop_void;
5152 }
5153 }
5154
5155 if (!pmu->pmu_enable) {
5156 pmu->pmu_enable = perf_pmu_nop_void;
5157 pmu->pmu_disable = perf_pmu_nop_void;
5158 }
5159
5160 list_add_rcu(&pmu->entry, &pmus);
5161 ret = 0;
5162unlock:
5163 mutex_unlock(&pmus_lock);
5164
5165 return ret;
5166
5167free_pdc:
5168 free_percpu(pmu->pmu_disable_count);
5169 goto unlock;
5170}
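
A driver that leaves start_txn/commit_txn/cancel_txn (and even pmu_enable/pmu_disable) unset gets the stub callbacks above installed for free by perf_pmu_register(). A minimal registration sketch under that assumption; the demo_* names are hypothetical and not part of this patch:

    #include <linux/perf_event.h>
    #include <linux/module.h>

    /* Hypothetical callbacks, for illustration only. */
    static int demo_event_init(struct perf_event *event)
    {
            return -ENOENT;         /* a real driver matches its own event type here */
    }
    static int  demo_event_add(struct perf_event *event, int flags)   { return 0; }
    static void demo_event_del(struct perf_event *event, int flags)   { }
    static void demo_event_start(struct perf_event *event, int flags) { }
    static void demo_event_stop(struct perf_event *event, int flags)  { }
    static void demo_event_read(struct perf_event *event)             { }

    static struct pmu demo_pmu = {
            .task_ctx_nr = perf_sw_context,
            .event_init  = demo_event_init,
            .add         = demo_event_add,
            .del         = demo_event_del,
            .start       = demo_event_start,
            .stop        = demo_event_stop,
            .read        = demo_event_read,
            /* start_txn/commit_txn/cancel_txn and pmu_enable/pmu_disable are
             * left NULL on purpose: perf_pmu_register() fills in the stubs. */
    };

    static int __init demo_pmu_init(void)
    {
            return perf_pmu_register(&demo_pmu);
    }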
5171
5172void perf_pmu_unregister(struct pmu *pmu)
5173{
5174 mutex_lock(&pmus_lock);
5175 list_del_rcu(&pmu->entry);
5176 mutex_unlock(&pmus_lock);
5177
5178 /*
5179 * We dereference the pmu list under both SRCU and regular RCU, so
5180 * synchronize against both of those.
5181 */
5182 synchronize_srcu(&pmus_srcu);
5183 synchronize_rcu();
5184
5185 free_percpu(pmu->pmu_disable_count);
5186 free_pmu_context(pmu->pmu_cpu_context);
5187}
5188
5189struct pmu *perf_init_event(struct perf_event *event)
5190{
5191 struct pmu *pmu = NULL;
5192 int idx;
5193
5194 idx = srcu_read_lock(&pmus_srcu);
5195 list_for_each_entry_rcu(pmu, &pmus, entry) {
5196 int ret = pmu->event_init(event);
5197 if (!ret)
5198 goto unlock;
5199
5200 if (ret != -ENOENT) {
5201 pmu = ERR_PTR(ret);
5202 goto unlock;
4791 } 5203 }
4792 pmu = &perf_ops_generic;
4793 break;
4794 } 5204 }
5205 pmu = ERR_PTR(-ENOENT);
5206unlock:
5207 srcu_read_unlock(&pmus_srcu, idx);
4795 5208
4796 return pmu; 5209 return pmu;
4797} 5210}
@@ -4800,20 +5213,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4800 * Allocate and initialize an event structure 5213 * Allocate and initialize an event structure
4801 */ 5214 */
4802static struct perf_event * 5215static struct perf_event *
4803perf_event_alloc(struct perf_event_attr *attr, 5216perf_event_alloc(struct perf_event_attr *attr, int cpu,
4804 int cpu, 5217 struct task_struct *task,
4805 struct perf_event_context *ctx, 5218 struct perf_event *group_leader,
4806 struct perf_event *group_leader, 5219 struct perf_event *parent_event,
4807 struct perf_event *parent_event, 5220 perf_overflow_handler_t overflow_handler)
4808 perf_overflow_handler_t overflow_handler, 5221{
4809 gfp_t gfpflags) 5222 struct pmu *pmu;
4810{
4811 const struct pmu *pmu;
4812 struct perf_event *event; 5223 struct perf_event *event;
4813 struct hw_perf_event *hwc; 5224 struct hw_perf_event *hwc;
4814 long err; 5225 long err;
4815 5226
4816 event = kzalloc(sizeof(*event), gfpflags); 5227 event = kzalloc(sizeof(*event), GFP_KERNEL);
4817 if (!event) 5228 if (!event)
4818 return ERR_PTR(-ENOMEM); 5229 return ERR_PTR(-ENOMEM);
4819 5230
@@ -4831,6 +5242,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4831 INIT_LIST_HEAD(&event->event_entry); 5242 INIT_LIST_HEAD(&event->event_entry);
4832 INIT_LIST_HEAD(&event->sibling_list); 5243 INIT_LIST_HEAD(&event->sibling_list);
4833 init_waitqueue_head(&event->waitq); 5244 init_waitqueue_head(&event->waitq);
5245 init_irq_work(&event->pending, perf_pending_event);
4834 5246
4835 mutex_init(&event->mmap_mutex); 5247 mutex_init(&event->mmap_mutex);
4836 5248
@@ -4838,7 +5250,6 @@ perf_event_alloc(struct perf_event_attr *attr,
4838 event->attr = *attr; 5250 event->attr = *attr;
4839 event->group_leader = group_leader; 5251 event->group_leader = group_leader;
4840 event->pmu = NULL; 5252 event->pmu = NULL;
4841 event->ctx = ctx;
4842 event->oncpu = -1; 5253 event->oncpu = -1;
4843 5254
4844 event->parent = parent_event; 5255 event->parent = parent_event;
@@ -4848,6 +5259,17 @@ perf_event_alloc(struct perf_event_attr *attr,
4848 5259
4849 event->state = PERF_EVENT_STATE_INACTIVE; 5260 event->state = PERF_EVENT_STATE_INACTIVE;
4850 5261
5262 if (task) {
5263 event->attach_state = PERF_ATTACH_TASK;
5264#ifdef CONFIG_HAVE_HW_BREAKPOINT
5265 /*
5266 * hw_breakpoint is a bit difficult here..
5267 */
5268 if (attr->type == PERF_TYPE_BREAKPOINT)
5269 event->hw.bp_target = task;
5270#endif
5271 }
5272
4851 if (!overflow_handler && parent_event) 5273 if (!overflow_handler && parent_event)
4852 overflow_handler = parent_event->overflow_handler; 5274 overflow_handler = parent_event->overflow_handler;
4853 5275
@@ -4864,7 +5286,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4864 hwc->sample_period = 1; 5286 hwc->sample_period = 1;
4865 hwc->last_period = hwc->sample_period; 5287 hwc->last_period = hwc->sample_period;
4866 5288
4867 atomic64_set(&hwc->period_left, hwc->sample_period); 5289 local64_set(&hwc->period_left, hwc->sample_period);
4868 5290
4869 /* 5291 /*
4870 * we currently do not support PERF_FORMAT_GROUP on inherited events 5292 * we currently do not support PERF_FORMAT_GROUP on inherited events
@@ -4872,29 +5294,8 @@ perf_event_alloc(struct perf_event_attr *attr,
4872 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 5294 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4873 goto done; 5295 goto done;
4874 5296
4875 switch (attr->type) { 5297 pmu = perf_init_event(event);
4876 case PERF_TYPE_RAW:
4877 case PERF_TYPE_HARDWARE:
4878 case PERF_TYPE_HW_CACHE:
4879 pmu = hw_perf_event_init(event);
4880 break;
4881
4882 case PERF_TYPE_SOFTWARE:
4883 pmu = sw_perf_event_init(event);
4884 break;
4885 5298
4886 case PERF_TYPE_TRACEPOINT:
4887 pmu = tp_perf_event_init(event);
4888 break;
4889
4890 case PERF_TYPE_BREAKPOINT:
4891 pmu = bp_perf_event_init(event);
4892 break;
4893
4894
4895 default:
4896 break;
4897 }
4898done: 5299done:
4899 err = 0; 5300 err = 0;
4900 if (!pmu) 5301 if (!pmu)
@@ -4912,13 +5313,21 @@ done:
4912 event->pmu = pmu; 5313 event->pmu = pmu;
4913 5314
4914 if (!event->parent) { 5315 if (!event->parent) {
4915 atomic_inc(&nr_events); 5316 if (event->attach_state & PERF_ATTACH_TASK)
4916 if (event->attr.mmap) 5317 jump_label_inc(&perf_task_events);
5318 if (event->attr.mmap || event->attr.mmap_data)
4917 atomic_inc(&nr_mmap_events); 5319 atomic_inc(&nr_mmap_events);
4918 if (event->attr.comm) 5320 if (event->attr.comm)
4919 atomic_inc(&nr_comm_events); 5321 atomic_inc(&nr_comm_events);
4920 if (event->attr.task) 5322 if (event->attr.task)
4921 atomic_inc(&nr_task_events); 5323 atomic_inc(&nr_task_events);
5324 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
5325 err = get_callchain_buffers();
5326 if (err) {
5327 free_event(event);
5328 return ERR_PTR(err);
5329 }
5330 }
4922 } 5331 }
4923 5332
4924 return event; 5333 return event;
@@ -5007,7 +5416,7 @@ err_size:
5007static int 5416static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5417perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
5009{ 5418{
5010 struct perf_mmap_data *data = NULL, *old_data = NULL; 5419 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
5011 int ret = -EINVAL; 5420 int ret = -EINVAL;
5012 5421
5013 if (!output_event) 5422 if (!output_event)
@@ -5037,19 +5446,19 @@ set:
5037 5446
5038 if (output_event) { 5447 if (output_event) {
5039 /* get the buffer we want to redirect to */ 5448 /* get the buffer we want to redirect to */
5040 data = perf_mmap_data_get(output_event); 5449 buffer = perf_buffer_get(output_event);
5041 if (!data) 5450 if (!buffer)
5042 goto unlock; 5451 goto unlock;
5043 } 5452 }
5044 5453
5045 old_data = event->data; 5454 old_buffer = event->buffer;
5046 rcu_assign_pointer(event->data, data); 5455 rcu_assign_pointer(event->buffer, buffer);
5047 ret = 0; 5456 ret = 0;
5048unlock: 5457unlock:
5049 mutex_unlock(&event->mmap_mutex); 5458 mutex_unlock(&event->mmap_mutex);
5050 5459
5051 if (old_data) 5460 if (old_buffer)
5052 perf_mmap_data_put(old_data); 5461 perf_buffer_put(old_buffer);
5053out: 5462out:
5054 return ret; 5463 return ret;
5055} 5464}
@@ -5066,12 +5475,16 @@ SYSCALL_DEFINE5(perf_event_open,
5066 struct perf_event_attr __user *, attr_uptr, 5475 struct perf_event_attr __user *, attr_uptr,
5067 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5476 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5068{ 5477{
5069 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 5478 struct perf_event *group_leader = NULL, *output_event = NULL;
5479 struct perf_event *event, *sibling;
5070 struct perf_event_attr attr; 5480 struct perf_event_attr attr;
5071 struct perf_event_context *ctx; 5481 struct perf_event_context *ctx;
5072 struct file *event_file = NULL; 5482 struct file *event_file = NULL;
5073 struct file *group_file = NULL; 5483 struct file *group_file = NULL;
5484 struct task_struct *task = NULL;
5485 struct pmu *pmu;
5074 int event_fd; 5486 int event_fd;
5487 int move_group = 0;
5075 int fput_needed = 0; 5488 int fput_needed = 0;
5076 int err; 5489 int err;
5077 5490
@@ -5097,20 +5510,11 @@ SYSCALL_DEFINE5(perf_event_open,
5097 if (event_fd < 0) 5510 if (event_fd < 0)
5098 return event_fd; 5511 return event_fd;
5099 5512
5100 /*
5101 * Get the target context (task or percpu):
5102 */
5103 ctx = find_get_context(pid, cpu);
5104 if (IS_ERR(ctx)) {
5105 err = PTR_ERR(ctx);
5106 goto err_fd;
5107 }
5108
5109 if (group_fd != -1) { 5513 if (group_fd != -1) {
5110 group_leader = perf_fget_light(group_fd, &fput_needed); 5514 group_leader = perf_fget_light(group_fd, &fput_needed);
5111 if (IS_ERR(group_leader)) { 5515 if (IS_ERR(group_leader)) {
5112 err = PTR_ERR(group_leader); 5516 err = PTR_ERR(group_leader);
5113 goto err_put_context; 5517 goto err_fd;
5114 } 5518 }
5115 group_file = group_leader->filp; 5519 group_file = group_leader->filp;
5116 if (flags & PERF_FLAG_FD_OUTPUT) 5520 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5119,6 +5523,58 @@ SYSCALL_DEFINE5(perf_event_open,
5119 group_leader = NULL; 5523 group_leader = NULL;
5120 } 5524 }
5121 5525
5526 if (pid != -1) {
5527 task = find_lively_task_by_vpid(pid);
5528 if (IS_ERR(task)) {
5529 err = PTR_ERR(task);
5530 goto err_group_fd;
5531 }
5532 }
5533
5534 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
5535 if (IS_ERR(event)) {
5536 err = PTR_ERR(event);
5537 goto err_task;
5538 }
5539
5540 /*
5541 * Special case software events and allow them to be part of
5542 * any hardware group.
5543 */
5544 pmu = event->pmu;
5545
5546 if (group_leader &&
5547 (is_software_event(event) != is_software_event(group_leader))) {
5548 if (is_software_event(event)) {
5549 /*
5550 * If event and group_leader are not both a software
5551 * event, and event is, then group leader is not.
5552 *
5553 * Allow the addition of software events to !software
5554 * groups; this is safe because software events never
5555 * fail to schedule.
5556 */
5557 pmu = group_leader->pmu;
5558 } else if (is_software_event(group_leader) &&
5559 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
5560 /*
5561 * In case the group is a pure software group, and we
5562 * try to add a hardware event, move the whole group to
5563 * the hardware context.
5564 */
5565 move_group = 1;
5566 }
5567 }
5568
5569 /*
5570 * Get the target context (task or percpu):
5571 */
5572 ctx = find_get_context(pmu, task, cpu);
5573 if (IS_ERR(ctx)) {
5574 err = PTR_ERR(ctx);
5575 goto err_alloc;
5576 }
5577
5122 /* 5578 /*
5123 * Look up the group leader (we will attach this event to it): 5579 * Look up the group leader (we will attach this event to it):
5124 */ 5580 */
@@ -5130,42 +5586,66 @@ SYSCALL_DEFINE5(perf_event_open,
5130 * becoming part of another group-sibling): 5586 * becoming part of another group-sibling):
5131 */ 5587 */
5132 if (group_leader->group_leader != group_leader) 5588 if (group_leader->group_leader != group_leader)
5133 goto err_put_context; 5589 goto err_context;
5134 /* 5590 /*
5135 * Do not allow attaching to a group in a different 5591 * Do not allow attaching to a group in a different
5136 * task or CPU context: 5592 * task or CPU context:
5137 */ 5593 */
5138 if (group_leader->ctx != ctx) 5594 if (move_group) {
5139 goto err_put_context; 5595 if (group_leader->ctx->type != ctx->type)
5596 goto err_context;
5597 } else {
5598 if (group_leader->ctx != ctx)
5599 goto err_context;
5600 }
5601
5140 /* 5602 /*
5141 * Only a group leader can be exclusive or pinned 5603 * Only a group leader can be exclusive or pinned
5142 */ 5604 */
5143 if (attr.exclusive || attr.pinned) 5605 if (attr.exclusive || attr.pinned)
5144 goto err_put_context; 5606 goto err_context;
5145 }
5146
5147 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5148 NULL, NULL, GFP_KERNEL);
5149 if (IS_ERR(event)) {
5150 err = PTR_ERR(event);
5151 goto err_put_context;
5152 } 5607 }
5153 5608
5154 if (output_event) { 5609 if (output_event) {
5155 err = perf_event_set_output(event, output_event); 5610 err = perf_event_set_output(event, output_event);
5156 if (err) 5611 if (err)
5157 goto err_free_put_context; 5612 goto err_context;
5158 } 5613 }
5159 5614
5160 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 5615 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5161 if (IS_ERR(event_file)) { 5616 if (IS_ERR(event_file)) {
5162 err = PTR_ERR(event_file); 5617 err = PTR_ERR(event_file);
5163 goto err_free_put_context; 5618 goto err_context;
5619 }
5620
5621 if (move_group) {
5622 struct perf_event_context *gctx = group_leader->ctx;
5623
5624 mutex_lock(&gctx->mutex);
5625 perf_event_remove_from_context(group_leader);
5626 list_for_each_entry(sibling, &group_leader->sibling_list,
5627 group_entry) {
5628 perf_event_remove_from_context(sibling);
5629 put_ctx(gctx);
5630 }
5631 mutex_unlock(&gctx->mutex);
5632 put_ctx(gctx);
5164 } 5633 }
5165 5634
5166 event->filp = event_file; 5635 event->filp = event_file;
5167 WARN_ON_ONCE(ctx->parent_ctx); 5636 WARN_ON_ONCE(ctx->parent_ctx);
5168 mutex_lock(&ctx->mutex); 5637 mutex_lock(&ctx->mutex);
5638
5639 if (move_group) {
5640 perf_install_in_context(ctx, group_leader, cpu);
5641 get_ctx(ctx);
5642 list_for_each_entry(sibling, &group_leader->sibling_list,
5643 group_entry) {
5644 perf_install_in_context(ctx, sibling, cpu);
5645 get_ctx(ctx);
5646 }
5647 }
5648
5169 perf_install_in_context(ctx, event, cpu); 5649 perf_install_in_context(ctx, event, cpu);
5170 ++ctx->generation; 5650 ++ctx->generation;
5171 mutex_unlock(&ctx->mutex); 5651 mutex_unlock(&ctx->mutex);
@@ -5186,11 +5666,15 @@ SYSCALL_DEFINE5(perf_event_open,
5186 fd_install(event_fd, event_file); 5666 fd_install(event_fd, event_file);
5187 return event_fd; 5667 return event_fd;
5188 5668
5189err_free_put_context: 5669err_context:
5670 put_ctx(ctx);
5671err_alloc:
5190 free_event(event); 5672 free_event(event);
5191err_put_context: 5673err_task:
5674 if (task)
5675 put_task_struct(task);
5676err_group_fd:
5192 fput_light(group_file, fput_needed); 5677 fput_light(group_file, fput_needed);
5193 put_ctx(ctx);
5194err_fd: 5678err_fd:
5195 put_unused_fd(event_fd); 5679 put_unused_fd(event_fd);
5196 return err; 5680 return err;
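
For reference, the grouping rules above are what make the following userspace pattern work: a software sibling can always join a hardware-led group, and with move_group a hardware event can now pull a pure-software group over into the hardware context. A hedged sketch with error handling trimmed:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int perf_open(struct perf_event_attr *attr, int group_fd)
    {
            return syscall(__NR_perf_event_open, attr, 0 /* this task */,
                           -1 /* any cpu */, group_fd, 0);
    }

    int main(void)
    {
            struct perf_event_attr hw = {
                    .type   = PERF_TYPE_HARDWARE,
                    .size   = sizeof(struct perf_event_attr),
                    .config = PERF_COUNT_HW_CPU_CYCLES,
            };
            struct perf_event_attr sw = {
                    .type   = PERF_TYPE_SOFTWARE,
                    .size   = sizeof(struct perf_event_attr),
                    .config = PERF_COUNT_SW_CONTEXT_SWITCHES,
            };
            int leader  = perf_open(&hw, -1);       /* hardware group leader */
            int sibling = perf_open(&sw, leader);   /* software sibling, always accepted */

            return (leader < 0 || sibling < 0) ? 1 : 0;
    }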
@@ -5201,32 +5685,31 @@ err_fd:
5201 * 5685 *
5202 * @attr: attributes of the counter to create 5686 * @attr: attributes of the counter to create
5203 * @cpu: cpu in which the counter is bound 5687 * @cpu: cpu in which the counter is bound
5204 * @pid: task to profile 5688 * @task: task to profile (NULL for percpu)
5205 */ 5689 */
5206struct perf_event * 5690struct perf_event *
5207perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 5691perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5208 pid_t pid, 5692 struct task_struct *task,
5209 perf_overflow_handler_t overflow_handler) 5693 perf_overflow_handler_t overflow_handler)
5210{ 5694{
5211 struct perf_event *event;
5212 struct perf_event_context *ctx; 5695 struct perf_event_context *ctx;
5696 struct perf_event *event;
5213 int err; 5697 int err;
5214 5698
5215 /* 5699 /*
5216 * Get the target context (task or percpu): 5700 * Get the target context (task or percpu):
5217 */ 5701 */
5218 5702
5219 ctx = find_get_context(pid, cpu); 5703 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
5220 if (IS_ERR(ctx)) {
5221 err = PTR_ERR(ctx);
5222 goto err_exit;
5223 }
5224
5225 event = perf_event_alloc(attr, cpu, ctx, NULL,
5226 NULL, overflow_handler, GFP_KERNEL);
5227 if (IS_ERR(event)) { 5704 if (IS_ERR(event)) {
5228 err = PTR_ERR(event); 5705 err = PTR_ERR(event);
5229 goto err_put_context; 5706 goto err;
5707 }
5708
5709 ctx = find_get_context(event->pmu, task, cpu);
5710 if (IS_ERR(ctx)) {
5711 err = PTR_ERR(ctx);
5712 goto err_free;
5230 } 5713 }
5231 5714
5232 event->filp = NULL; 5715 event->filp = NULL;
@@ -5244,112 +5727,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5244 5727
5245 return event; 5728 return event;
5246 5729
5247 err_put_context: 5730err_free:
5248 put_ctx(ctx); 5731 free_event(event);
5249 err_exit: 5732err:
5250 return ERR_PTR(err); 5733 return ERR_PTR(err);
5251} 5734}
5252EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 5735EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
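
The exported helper now takes a task_struct pointer instead of a pid, so in-kernel users pass the task directly (or NULL for a per-cpu counter). A rough caller sketch under the new signature; demo_task_counter and the attribute values are illustrative only, and the overflow handler is omitted:

    static struct perf_event *demo_task_counter(struct task_struct *task)
    {
            struct perf_event_attr attr = {
                    .type   = PERF_TYPE_SOFTWARE,
                    .size   = sizeof(struct perf_event_attr),
                    .config = PERF_COUNT_SW_TASK_CLOCK,
            };

            /* cpu == -1: the counter follows the task rather than a cpu */
            return perf_event_create_kernel_counter(&attr, -1, task, NULL);
    }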
5253 5736
5254/*
5255 * inherit a event from parent task to child task:
5256 */
5257static struct perf_event *
5258inherit_event(struct perf_event *parent_event,
5259 struct task_struct *parent,
5260 struct perf_event_context *parent_ctx,
5261 struct task_struct *child,
5262 struct perf_event *group_leader,
5263 struct perf_event_context *child_ctx)
5264{
5265 struct perf_event *child_event;
5266
5267 /*
5268 * Instead of creating recursive hierarchies of events,
5269 * we link inherited events back to the original parent,
5270 * which has a filp for sure, which we use as the reference
5271 * count:
5272 */
5273 if (parent_event->parent)
5274 parent_event = parent_event->parent;
5275
5276 child_event = perf_event_alloc(&parent_event->attr,
5277 parent_event->cpu, child_ctx,
5278 group_leader, parent_event,
5279 NULL, GFP_KERNEL);
5280 if (IS_ERR(child_event))
5281 return child_event;
5282 get_ctx(child_ctx);
5283
5284 /*
5285 * Make the child state follow the state of the parent event,
5286 * not its attr.disabled bit. We hold the parent's mutex,
5287 * so we won't race with perf_event_{en, dis}able_family.
5288 */
5289 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5290 child_event->state = PERF_EVENT_STATE_INACTIVE;
5291 else
5292 child_event->state = PERF_EVENT_STATE_OFF;
5293
5294 if (parent_event->attr.freq) {
5295 u64 sample_period = parent_event->hw.sample_period;
5296 struct hw_perf_event *hwc = &child_event->hw;
5297
5298 hwc->sample_period = sample_period;
5299 hwc->last_period = sample_period;
5300
5301 atomic64_set(&hwc->period_left, sample_period);
5302 }
5303
5304 child_event->overflow_handler = parent_event->overflow_handler;
5305
5306 /*
5307 * Link it up in the child's context:
5308 */
5309 add_event_to_ctx(child_event, child_ctx);
5310
5311 /*
5312 * Get a reference to the parent filp - we will fput it
5313 * when the child event exits. This is safe to do because
5314 * we are in the parent and we know that the filp still
5315 * exists and has a nonzero count:
5316 */
5317 atomic_long_inc(&parent_event->filp->f_count);
5318
5319 /*
5320 * Link this into the parent event's child list
5321 */
5322 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5323 mutex_lock(&parent_event->child_mutex);
5324 list_add_tail(&child_event->child_list, &parent_event->child_list);
5325 mutex_unlock(&parent_event->child_mutex);
5326
5327 return child_event;
5328}
5329
5330static int inherit_group(struct perf_event *parent_event,
5331 struct task_struct *parent,
5332 struct perf_event_context *parent_ctx,
5333 struct task_struct *child,
5334 struct perf_event_context *child_ctx)
5335{
5336 struct perf_event *leader;
5337 struct perf_event *sub;
5338 struct perf_event *child_ctr;
5339
5340 leader = inherit_event(parent_event, parent, parent_ctx,
5341 child, NULL, child_ctx);
5342 if (IS_ERR(leader))
5343 return PTR_ERR(leader);
5344 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5345 child_ctr = inherit_event(sub, parent, parent_ctx,
5346 child, leader, child_ctx);
5347 if (IS_ERR(child_ctr))
5348 return PTR_ERR(child_ctr);
5349 }
5350 return 0;
5351}
5352
5353static void sync_child_event(struct perf_event *child_event, 5737static void sync_child_event(struct perf_event *child_event,
5354 struct task_struct *child) 5738 struct task_struct *child)
5355{ 5739{
@@ -5359,12 +5743,12 @@ static void sync_child_event(struct perf_event *child_event,
5359 if (child_event->attr.inherit_stat) 5743 if (child_event->attr.inherit_stat)
5360 perf_event_read_event(child_event, child); 5744 perf_event_read_event(child_event, child);
5361 5745
5362 child_val = atomic64_read(&child_event->count); 5746 child_val = perf_event_count(child_event);
5363 5747
5364 /* 5748 /*
5365 * Add back the child's count to the parent's count: 5749 * Add back the child's count to the parent's count:
5366 */ 5750 */
5367 atomic64_add(child_val, &parent_event->count); 5751 atomic64_add(child_val, &parent_event->child_count);
5368 atomic64_add(child_event->total_time_enabled, 5752 atomic64_add(child_event->total_time_enabled,
5369 &parent_event->child_total_time_enabled); 5753 &parent_event->child_total_time_enabled);
5370 atomic64_add(child_event->total_time_running, 5754 atomic64_add(child_event->total_time_running,
@@ -5406,16 +5790,13 @@ __perf_event_exit_task(struct perf_event *child_event,
5406 } 5790 }
5407} 5791}
5408 5792
5409/* 5793static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5410 * When a child task exits, feed back event values to parent events.
5411 */
5412void perf_event_exit_task(struct task_struct *child)
5413{ 5794{
5414 struct perf_event *child_event, *tmp; 5795 struct perf_event *child_event, *tmp;
5415 struct perf_event_context *child_ctx; 5796 struct perf_event_context *child_ctx;
5416 unsigned long flags; 5797 unsigned long flags;
5417 5798
5418 if (likely(!child->perf_event_ctxp)) { 5799 if (likely(!child->perf_event_ctxp[ctxn])) {
5419 perf_event_task(child, NULL, 0); 5800 perf_event_task(child, NULL, 0);
5420 return; 5801 return;
5421 } 5802 }
@@ -5427,8 +5808,8 @@ void perf_event_exit_task(struct task_struct *child)
5427 * scheduled, so we are now safe from rescheduling changing 5808 * scheduled, so we are now safe from rescheduling changing
5428 * our context. 5809 * our context.
5429 */ 5810 */
5430 child_ctx = child->perf_event_ctxp; 5811 child_ctx = child->perf_event_ctxp[ctxn];
5431 __perf_event_task_sched_out(child_ctx); 5812 task_ctx_sched_out(child_ctx, EVENT_ALL);
5432 5813
5433 /* 5814 /*
5434 * Take the context lock here so that if find_get_context is 5815 * Take the context lock here so that if find_get_context is
@@ -5436,7 +5817,7 @@ void perf_event_exit_task(struct task_struct *child)
5436 * incremented the context's refcount before we do put_ctx below. 5817 * incremented the context's refcount before we do put_ctx below.
5437 */ 5818 */
5438 raw_spin_lock(&child_ctx->lock); 5819 raw_spin_lock(&child_ctx->lock);
5439 child->perf_event_ctxp = NULL; 5820 child->perf_event_ctxp[ctxn] = NULL;
5440 /* 5821 /*
5441 * If this context is a clone; unclone it so it can't get 5822 * If this context is a clone; unclone it so it can't get
5442 * swapped to another process while we're removing all 5823 * swapped to another process while we're removing all
@@ -5489,6 +5870,17 @@ again:
5489 put_ctx(child_ctx); 5870 put_ctx(child_ctx);
5490} 5871}
5491 5872
5873/*
5874 * When a child task exits, feed back event values to parent events.
5875 */
5876void perf_event_exit_task(struct task_struct *child)
5877{
5878 int ctxn;
5879
5880 for_each_task_context_nr(ctxn)
5881 perf_event_exit_task_context(child, ctxn);
5882}
5883
5492static void perf_free_event(struct perf_event *event, 5884static void perf_free_event(struct perf_event *event,
5493 struct perf_event_context *ctx) 5885 struct perf_event_context *ctx)
5494{ 5886{
@@ -5510,48 +5902,166 @@ static void perf_free_event(struct perf_event *event,
5510 5902
5511/* 5903/*
5512 * free an unexposed, unused context as created by inheritance by 5904 * free an unexposed, unused context as created by inheritance by
5513 * init_task below, used by fork() in case of failure. 5905 * perf_event_init_task below, used by fork() in case of failure.
5514 */ 5906 */
5515void perf_event_free_task(struct task_struct *task) 5907void perf_event_free_task(struct task_struct *task)
5516{ 5908{
5517 struct perf_event_context *ctx = task->perf_event_ctxp; 5909 struct perf_event_context *ctx;
5518 struct perf_event *event, *tmp; 5910 struct perf_event *event, *tmp;
5911 int ctxn;
5519 5912
5520 if (!ctx) 5913 for_each_task_context_nr(ctxn) {
5521 return; 5914 ctx = task->perf_event_ctxp[ctxn];
5915 if (!ctx)
5916 continue;
5522 5917
5523 mutex_lock(&ctx->mutex); 5918 mutex_lock(&ctx->mutex);
5524again: 5919again:
5525 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 5920 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
5526 perf_free_event(event, ctx); 5921 group_entry)
5922 perf_free_event(event, ctx);
5527 5923
5528 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 5924 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5529 group_entry) 5925 group_entry)
5530 perf_free_event(event, ctx); 5926 perf_free_event(event, ctx);
5531 5927
5532 if (!list_empty(&ctx->pinned_groups) || 5928 if (!list_empty(&ctx->pinned_groups) ||
5533 !list_empty(&ctx->flexible_groups)) 5929 !list_empty(&ctx->flexible_groups))
5534 goto again; 5930 goto again;
5535 5931
5536 mutex_unlock(&ctx->mutex); 5932 mutex_unlock(&ctx->mutex);
5537 5933
5538 put_ctx(ctx); 5934 put_ctx(ctx);
5935 }
5936}
5937
5938void perf_event_delayed_put(struct task_struct *task)
5939{
5940 int ctxn;
5941
5942 for_each_task_context_nr(ctxn)
5943 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
5944}
5945
5946/*
5947 * inherit an event from parent task to child task:
5948 */
5949static struct perf_event *
5950inherit_event(struct perf_event *parent_event,
5951 struct task_struct *parent,
5952 struct perf_event_context *parent_ctx,
5953 struct task_struct *child,
5954 struct perf_event *group_leader,
5955 struct perf_event_context *child_ctx)
5956{
5957 struct perf_event *child_event;
5958 unsigned long flags;
5959
5960 /*
5961 * Instead of creating recursive hierarchies of events,
5962 * we link inherited events back to the original parent,
5963 * which has a filp for sure, which we use as the reference
5964 * count:
5965 */
5966 if (parent_event->parent)
5967 parent_event = parent_event->parent;
5968
5969 child_event = perf_event_alloc(&parent_event->attr,
5970 parent_event->cpu,
5971 child,
5972 group_leader, parent_event,
5973 NULL);
5974 if (IS_ERR(child_event))
5975 return child_event;
5976 get_ctx(child_ctx);
5977
5978 /*
5979 * Make the child state follow the state of the parent event,
5980 * not its attr.disabled bit. We hold the parent's mutex,
5981 * so we won't race with perf_event_{en, dis}able_family.
5982 */
5983 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5984 child_event->state = PERF_EVENT_STATE_INACTIVE;
5985 else
5986 child_event->state = PERF_EVENT_STATE_OFF;
5987
5988 if (parent_event->attr.freq) {
5989 u64 sample_period = parent_event->hw.sample_period;
5990 struct hw_perf_event *hwc = &child_event->hw;
5991
5992 hwc->sample_period = sample_period;
5993 hwc->last_period = sample_period;
5994
5995 local64_set(&hwc->period_left, sample_period);
5996 }
5997
5998 child_event->ctx = child_ctx;
5999 child_event->overflow_handler = parent_event->overflow_handler;
6000
6001 /*
6002 * Link it up in the child's context:
6003 */
6004 raw_spin_lock_irqsave(&child_ctx->lock, flags);
6005 add_event_to_ctx(child_event, child_ctx);
6006 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6007
6008 /*
6009 * Get a reference to the parent filp - we will fput it
6010 * when the child event exits. This is safe to do because
6011 * we are in the parent and we know that the filp still
6012 * exists and has a nonzero count:
6013 */
6014 atomic_long_inc(&parent_event->filp->f_count);
6015
6016 /*
6017 * Link this into the parent event's child list
6018 */
6019 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6020 mutex_lock(&parent_event->child_mutex);
6021 list_add_tail(&child_event->child_list, &parent_event->child_list);
6022 mutex_unlock(&parent_event->child_mutex);
6023
6024 return child_event;
6025}
6026
6027static int inherit_group(struct perf_event *parent_event,
6028 struct task_struct *parent,
6029 struct perf_event_context *parent_ctx,
6030 struct task_struct *child,
6031 struct perf_event_context *child_ctx)
6032{
6033 struct perf_event *leader;
6034 struct perf_event *sub;
6035 struct perf_event *child_ctr;
6036
6037 leader = inherit_event(parent_event, parent, parent_ctx,
6038 child, NULL, child_ctx);
6039 if (IS_ERR(leader))
6040 return PTR_ERR(leader);
6041 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6042 child_ctr = inherit_event(sub, parent, parent_ctx,
6043 child, leader, child_ctx);
6044 if (IS_ERR(child_ctr))
6045 return PTR_ERR(child_ctr);
6046 }
6047 return 0;
5539} 6048}
5540 6049
5541static int 6050static int
5542inherit_task_group(struct perf_event *event, struct task_struct *parent, 6051inherit_task_group(struct perf_event *event, struct task_struct *parent,
5543 struct perf_event_context *parent_ctx, 6052 struct perf_event_context *parent_ctx,
5544 struct task_struct *child, 6053 struct task_struct *child, int ctxn,
5545 int *inherited_all) 6054 int *inherited_all)
5546{ 6055{
5547 int ret; 6056 int ret;
5548 struct perf_event_context *child_ctx = child->perf_event_ctxp; 6057 struct perf_event_context *child_ctx;
5549 6058
5550 if (!event->attr.inherit) { 6059 if (!event->attr.inherit) {
5551 *inherited_all = 0; 6060 *inherited_all = 0;
5552 return 0; 6061 return 0;
5553 } 6062 }
5554 6063
6064 child_ctx = child->perf_event_ctxp[ctxn];
5555 if (!child_ctx) { 6065 if (!child_ctx) {
5556 /* 6066 /*
5557 * This is executed from the parent task context, so 6067 * This is executed from the parent task context, so
@@ -5560,14 +6070,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5560 * child. 6070 * child.
5561 */ 6071 */
5562 6072
5563 child_ctx = kzalloc(sizeof(struct perf_event_context), 6073 child_ctx = alloc_perf_context(event->pmu, child);
5564 GFP_KERNEL);
5565 if (!child_ctx) 6074 if (!child_ctx)
5566 return -ENOMEM; 6075 return -ENOMEM;
5567 6076
5568 __perf_event_init_context(child_ctx, child); 6077 child->perf_event_ctxp[ctxn] = child_ctx;
5569 child->perf_event_ctxp = child_ctx;
5570 get_task_struct(child);
5571 } 6078 }
5572 6079
5573 ret = inherit_group(event, parent, parent_ctx, 6080 ret = inherit_group(event, parent, parent_ctx,
@@ -5579,11 +6086,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5579 return ret; 6086 return ret;
5580} 6087}
5581 6088
5582
5583/* 6089/*
5584 * Initialize the perf_event context in task_struct 6090 * Initialize the perf_event context in task_struct
5585 */ 6091 */
5586int perf_event_init_task(struct task_struct *child) 6092int perf_event_init_context(struct task_struct *child, int ctxn)
5587{ 6093{
5588 struct perf_event_context *child_ctx, *parent_ctx; 6094 struct perf_event_context *child_ctx, *parent_ctx;
5589 struct perf_event_context *cloned_ctx; 6095 struct perf_event_context *cloned_ctx;
@@ -5592,19 +6098,19 @@ int perf_event_init_task(struct task_struct *child)
5592 int inherited_all = 1; 6098 int inherited_all = 1;
5593 int ret = 0; 6099 int ret = 0;
5594 6100
5595 child->perf_event_ctxp = NULL; 6101 child->perf_event_ctxp[ctxn] = NULL;
5596 6102
5597 mutex_init(&child->perf_event_mutex); 6103 mutex_init(&child->perf_event_mutex);
5598 INIT_LIST_HEAD(&child->perf_event_list); 6104 INIT_LIST_HEAD(&child->perf_event_list);
5599 6105
5600 if (likely(!parent->perf_event_ctxp)) 6106 if (likely(!parent->perf_event_ctxp[ctxn]))
5601 return 0; 6107 return 0;
5602 6108
5603 /* 6109 /*
5604 * If the parent's context is a clone, pin it so it won't get 6110 * If the parent's context is a clone, pin it so it won't get
5605 * swapped under us. 6111 * swapped under us.
5606 */ 6112 */
5607 parent_ctx = perf_pin_task_context(parent); 6113 parent_ctx = perf_pin_task_context(parent, ctxn);
5608 6114
5609 /* 6115 /*
5610 * No need to check if parent_ctx != NULL here; since we saw 6116 * No need to check if parent_ctx != NULL here; since we saw
@@ -5624,20 +6130,20 @@ int perf_event_init_task(struct task_struct *child)
5624 * the list, not manipulating it: 6130 * the list, not manipulating it:
5625 */ 6131 */
5626 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 6132 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5627 ret = inherit_task_group(event, parent, parent_ctx, child, 6133 ret = inherit_task_group(event, parent, parent_ctx,
5628 &inherited_all); 6134 child, ctxn, &inherited_all);
5629 if (ret) 6135 if (ret)
5630 break; 6136 break;
5631 } 6137 }
5632 6138
5633 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6139 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5634 ret = inherit_task_group(event, parent, parent_ctx, child, 6140 ret = inherit_task_group(event, parent, parent_ctx,
5635 &inherited_all); 6141 child, ctxn, &inherited_all);
5636 if (ret) 6142 if (ret)
5637 break; 6143 break;
5638 } 6144 }
5639 6145
5640 child_ctx = child->perf_event_ctxp; 6146 child_ctx = child->perf_event_ctxp[ctxn];
5641 6147
5642 if (child_ctx && inherited_all) { 6148 if (child_ctx && inherited_all) {
5643 /* 6149 /*
@@ -5666,63 +6172,98 @@ int perf_event_init_task(struct task_struct *child)
5666 return ret; 6172 return ret;
5667} 6173}
5668 6174
6175/*
6176 * Initialize the perf_event context in task_struct
6177 */
6178int perf_event_init_task(struct task_struct *child)
6179{
6180 int ctxn, ret;
6181
6182 for_each_task_context_nr(ctxn) {
6183 ret = perf_event_init_context(child, ctxn);
6184 if (ret)
6185 return ret;
6186 }
6187
6188 return 0;
6189}
6190
5669static void __init perf_event_init_all_cpus(void) 6191static void __init perf_event_init_all_cpus(void)
5670{ 6192{
6193 struct swevent_htable *swhash;
5671 int cpu; 6194 int cpu;
5672 struct perf_cpu_context *cpuctx;
5673 6195
5674 for_each_possible_cpu(cpu) { 6196 for_each_possible_cpu(cpu) {
5675 cpuctx = &per_cpu(perf_cpu_context, cpu); 6197 swhash = &per_cpu(swevent_htable, cpu);
5676 mutex_init(&cpuctx->hlist_mutex); 6198 mutex_init(&swhash->hlist_mutex);
5677 __perf_event_init_context(&cpuctx->ctx, NULL); 6199 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
5678 } 6200 }
5679} 6201}
5680 6202
5681static void __cpuinit perf_event_init_cpu(int cpu) 6203static void __cpuinit perf_event_init_cpu(int cpu)
5682{ 6204{
5683 struct perf_cpu_context *cpuctx; 6205 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5684
5685 cpuctx = &per_cpu(perf_cpu_context, cpu);
5686 6206
5687 spin_lock(&perf_resource_lock); 6207 mutex_lock(&swhash->hlist_mutex);
5688 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 6208 if (swhash->hlist_refcount > 0) {
5689 spin_unlock(&perf_resource_lock);
5690
5691 mutex_lock(&cpuctx->hlist_mutex);
5692 if (cpuctx->hlist_refcount > 0) {
5693 struct swevent_hlist *hlist; 6209 struct swevent_hlist *hlist;
5694 6210
5695 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 6211 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
5696 WARN_ON_ONCE(!hlist); 6212 WARN_ON(!hlist);
5697 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 6213 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5698 } 6214 }
5699 mutex_unlock(&cpuctx->hlist_mutex); 6215 mutex_unlock(&swhash->hlist_mutex);
5700} 6216}
5701 6217
5702#ifdef CONFIG_HOTPLUG_CPU 6218#ifdef CONFIG_HOTPLUG_CPU
5703static void __perf_event_exit_cpu(void *info) 6219static void perf_pmu_rotate_stop(struct pmu *pmu)
5704{ 6220{
5705 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 6221 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
5706 struct perf_event_context *ctx = &cpuctx->ctx; 6222
6223 WARN_ON(!irqs_disabled());
6224
6225 list_del_init(&cpuctx->rotation_list);
6226}
6227
6228static void __perf_event_exit_context(void *__info)
6229{
6230 struct perf_event_context *ctx = __info;
5707 struct perf_event *event, *tmp; 6231 struct perf_event *event, *tmp;
5708 6232
6233 perf_pmu_rotate_stop(ctx->pmu);
6234
5709 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 6235 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5710 __perf_event_remove_from_context(event); 6236 __perf_event_remove_from_context(event);
5711 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 6237 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5712 __perf_event_remove_from_context(event); 6238 __perf_event_remove_from_context(event);
5713} 6239}
6240
6241static void perf_event_exit_cpu_context(int cpu)
6242{
6243 struct perf_event_context *ctx;
6244 struct pmu *pmu;
6245 int idx;
6246
6247 idx = srcu_read_lock(&pmus_srcu);
6248 list_for_each_entry_rcu(pmu, &pmus, entry) {
6249 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
6250
6251 mutex_lock(&ctx->mutex);
6252 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
6253 mutex_unlock(&ctx->mutex);
6254 }
6255 srcu_read_unlock(&pmus_srcu, idx);
6256}
6257
5714static void perf_event_exit_cpu(int cpu) 6258static void perf_event_exit_cpu(int cpu)
5715{ 6259{
5716 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 6260 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5717 struct perf_event_context *ctx = &cpuctx->ctx;
5718 6261
5719 mutex_lock(&cpuctx->hlist_mutex); 6262 mutex_lock(&swhash->hlist_mutex);
5720 swevent_hlist_release(cpuctx); 6263 swevent_hlist_release(swhash);
5721 mutex_unlock(&cpuctx->hlist_mutex); 6264 mutex_unlock(&swhash->hlist_mutex);
5722 6265
5723 mutex_lock(&ctx->mutex); 6266 perf_event_exit_cpu_context(cpu);
5724 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5725 mutex_unlock(&ctx->mutex);
5726} 6267}
5727#else 6268#else
5728static inline void perf_event_exit_cpu(int cpu) { } 6269static inline void perf_event_exit_cpu(int cpu) { }
@@ -5733,15 +6274,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5733{ 6274{
5734 unsigned int cpu = (long)hcpu; 6275 unsigned int cpu = (long)hcpu;
5735 6276
5736 switch (action) { 6277 switch (action & ~CPU_TASKS_FROZEN) {
5737 6278
5738 case CPU_UP_PREPARE: 6279 case CPU_UP_PREPARE:
5739 case CPU_UP_PREPARE_FROZEN: 6280 case CPU_DOWN_FAILED:
5740 perf_event_init_cpu(cpu); 6281 perf_event_init_cpu(cpu);
5741 break; 6282 break;
5742 6283
6284 case CPU_UP_CANCELED:
5743 case CPU_DOWN_PREPARE: 6285 case CPU_DOWN_PREPARE:
5744 case CPU_DOWN_PREPARE_FROZEN:
5745 perf_event_exit_cpu(cpu); 6286 perf_event_exit_cpu(cpu);
5746 break; 6287 break;
5747 6288
@@ -5752,118 +6293,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5752 return NOTIFY_OK; 6293 return NOTIFY_OK;
5753} 6294}
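
The switch now masks out CPU_TASKS_FROZEN so the suspend/resume variants share the non-frozen cases. The same folding pattern shown standalone; demo_cpu_notify is an illustrative name, not from this patch:

    static int demo_cpu_notify(struct notifier_block *self,
                               unsigned long action, void *hcpu)
    {
            switch (action & ~CPU_TASKS_FROZEN) {
            case CPU_UP_PREPARE:            /* also covers CPU_UP_PREPARE_FROZEN */
                    /* allocate per-cpu state here */
                    break;
            case CPU_DOWN_PREPARE:          /* also covers CPU_DOWN_PREPARE_FROZEN */
                    /* tear down per-cpu state here */
                    break;
            }
            return NOTIFY_OK;
    }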
5754 6295
5755/*
5756 * This has to have a higher priority than migration_notifier in sched.c.
5757 */
5758static struct notifier_block __cpuinitdata perf_cpu_nb = {
5759 .notifier_call = perf_cpu_notify,
5760 .priority = 20,
5761};
5762
5763void __init perf_event_init(void) 6296void __init perf_event_init(void)
5764{ 6297{
5765 perf_event_init_all_cpus(); 6298 perf_event_init_all_cpus();
5766 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 6299 init_srcu_struct(&pmus_srcu);
5767 (void *)(long)smp_processor_id()); 6300 perf_pmu_register(&perf_swevent);
5768 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 6301 perf_pmu_register(&perf_cpu_clock);
5769 (void *)(long)smp_processor_id()); 6302 perf_pmu_register(&perf_task_clock);
5770 register_cpu_notifier(&perf_cpu_nb); 6303 perf_tp_register();
5771} 6304 perf_cpu_notifier(perf_cpu_notify);
5772
5773static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5774 struct sysdev_class_attribute *attr,
5775 char *buf)
5776{
5777 return sprintf(buf, "%d\n", perf_reserved_percpu);
5778}
5779
5780static ssize_t
5781perf_set_reserve_percpu(struct sysdev_class *class,
5782 struct sysdev_class_attribute *attr,
5783 const char *buf,
5784 size_t count)
5785{
5786 struct perf_cpu_context *cpuctx;
5787 unsigned long val;
5788 int err, cpu, mpt;
5789
5790 err = strict_strtoul(buf, 10, &val);
5791 if (err)
5792 return err;
5793 if (val > perf_max_events)
5794 return -EINVAL;
5795
5796 spin_lock(&perf_resource_lock);
5797 perf_reserved_percpu = val;
5798 for_each_online_cpu(cpu) {
5799 cpuctx = &per_cpu(perf_cpu_context, cpu);
5800 raw_spin_lock_irq(&cpuctx->ctx.lock);
5801 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5802 perf_max_events - perf_reserved_percpu);
5803 cpuctx->max_pertask = mpt;
5804 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5805 }
5806 spin_unlock(&perf_resource_lock);
5807
5808 return count;
5809}
5810
5811static ssize_t perf_show_overcommit(struct sysdev_class *class,
5812 struct sysdev_class_attribute *attr,
5813 char *buf)
5814{
5815 return sprintf(buf, "%d\n", perf_overcommit);
5816}
5817
5818static ssize_t
5819perf_set_overcommit(struct sysdev_class *class,
5820 struct sysdev_class_attribute *attr,
5821 const char *buf, size_t count)
5822{
5823 unsigned long val;
5824 int err;
5825
5826 err = strict_strtoul(buf, 10, &val);
5827 if (err)
5828 return err;
5829 if (val > 1)
5830 return -EINVAL;
5831
5832 spin_lock(&perf_resource_lock);
5833 perf_overcommit = val;
5834 spin_unlock(&perf_resource_lock);
5835
5836 return count;
5837}
5838
5839static SYSDEV_CLASS_ATTR(
5840 reserve_percpu,
5841 0644,
5842 perf_show_reserve_percpu,
5843 perf_set_reserve_percpu
5844 );
5845
5846static SYSDEV_CLASS_ATTR(
5847 overcommit,
5848 0644,
5849 perf_show_overcommit,
5850 perf_set_overcommit
5851 );
5852
5853static struct attribute *perfclass_attrs[] = {
5854 &attr_reserve_percpu.attr,
5855 &attr_overcommit.attr,
5856 NULL
5857};
5858
5859static struct attribute_group perfclass_attr_group = {
5860 .attrs = perfclass_attrs,
5861 .name = "perf_events",
5862};
5863
5864static int __init perf_event_sysfs_init(void)
5865{
5866 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5867 &perfclass_attr_group);
5868} 6305}
5869device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index e9fd8c132d26..39b65b69584f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid)
122 atomic_inc(&map->nr_free); 122 atomic_inc(&map->nr_free);
123} 123}
124 124
125/*
126 * If we started walking pids at 'base', is 'a' seen before 'b'?
127 */
128static int pid_before(int base, int a, int b)
129{
130 /*
131 * This is the same as saying
132 *
133 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
134 * and that mapping orders 'a' and 'b' with respect to 'base'.
135 */
136 return (unsigned)(a - base) < (unsigned)(b - base);
137}
138
139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid.
141 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately.
143 *
144 * Since pids rollover, it is not sufficient to just pick the bigger
145 * value. We have to consider where we started counting from.
146 *
147 * 'base' is the value of pid_ns->last_pid that we observed when
148 * we started looking for a pid.
149 *
150 * 'pid' is the pid that we eventually found.
151 */
152static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
153{
154 int prev;
155 int last_write = base;
156 do {
157 prev = last_write;
158 last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
159 } while ((prev != last_write) && (pid_before(base, last_write, pid)));
160}
161
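
To see the wraparound ordering concretely, here is a small standalone check of the helper above; the pid values are made up for illustration:

    #include <assert.h>

    static int pid_before(int base, int a, int b)   /* copy of the helper above */
    {
            return (unsigned)(a - base) < (unsigned)(b - base);
    }

    int main(void)
    {
            int base = 32760;       /* last_pid observed when the walk started */
            int a = 32765;          /* allocated before the counter wrapped */
            int b = 300;            /* allocated after wrapping past pid_max */

            assert(pid_before(base, a, b));         /* a is seen first in this walk */
            assert(!pid_before(base, b, a));
            return 0;
    }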
125static int alloc_pidmap(struct pid_namespace *pid_ns) 162static int alloc_pidmap(struct pid_namespace *pid_ns)
126{ 163{
127 int i, offset, max_scan, pid, last = pid_ns->last_pid; 164 int i, offset, max_scan, pid, last = pid_ns->last_pid;
@@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
132 pid = RESERVED_PIDS; 169 pid = RESERVED_PIDS;
133 offset = pid & BITS_PER_PAGE_MASK; 170 offset = pid & BITS_PER_PAGE_MASK;
134 map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; 171 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
135 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 172 /*
173 * If last_pid points into the middle of the map->page we
174 * want to scan this bitmap block twice, the second time
175 * we start with offset == 0 (or RESERVED_PIDS).
176 */
177 max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
136 for (i = 0; i <= max_scan; ++i) { 178 for (i = 0; i <= max_scan; ++i) {
137 if (unlikely(!map->page)) { 179 if (unlikely(!map->page)) {
138 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); 180 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
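
Worked example of the new max_scan bound (values assumed for illustration): with pid_max = 32768 and BITS_PER_PAGE = 32768, DIV_ROUND_UP(32768, 32768) = 1, so a walk that starts mid-block (offset != 0) gets max_scan = 1 and scans the single bitmap block twice, the second pass starting from offset 0, while a walk that starts at offset 0 gets max_scan = 0 and a single pass.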
@@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
154 do { 196 do {
155 if (!test_and_set_bit(offset, map->page)) { 197 if (!test_and_set_bit(offset, map->page)) {
156 atomic_dec(&map->nr_free); 198 atomic_dec(&map->nr_free);
157 pid_ns->last_pid = pid; 199 set_last_pid(pid_ns, last, pid);
158 return pid; 200 return pid;
159 } 201 }
160 offset = find_next_offset(map, offset); 202 offset = find_next_offset(map, offset);
161 pid = mk_pid(pid_ns, map, offset); 203 pid = mk_pid(pid_ns, map, offset);
162 /* 204 } while (offset < BITS_PER_PAGE && pid < pid_max);
163 * find_next_offset() found a bit, the pid from it
164 * is in-bounds, and if we fell back to the last
165 * bitmap block and the final block was the same
166 * as the starting point, pid is before last_pid.
167 */
168 } while (offset < BITS_PER_PAGE && pid < pid_max &&
169 (i != max_scan || pid < last ||
170 !((last+1) & BITS_PER_PAGE_MASK)));
171 } 205 }
172 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 206 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
173 ++map; 207 ++map;
@@ -367,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 401 struct task_struct *result = NULL;
368 if (pid) { 402 if (pid) {
369 struct hlist_node *first; 403 struct hlist_node *first;
370 first = rcu_dereference_check(pid->tasks[type].first, 404 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
371 rcu_read_lock_held() || 405 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held()); 406 lockdep_tasklist_lock_is_held());
373 if (first) 407 if (first)
@@ -382,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
382 */ 416 */
383struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
384{ 418{
419 rcu_lockdep_assert(rcu_read_lock_held());
385 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 420 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
386} 421}
387 422
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index f42d3f737a33..c7a8f453919e 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -48,59 +48,49 @@
48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
49 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
50 */ 50 */
51struct pm_qos_request_list { 51enum pm_qos_type {
52 struct list_head list; 52 PM_QOS_MAX, /* return the largest value */
53 union { 53 PM_QOS_MIN /* return the smallest value */
54 s32 value;
55 s32 usec;
56 s32 kbps;
57 };
58 int pm_qos_class;
59}; 54};
60 55
61static s32 max_compare(s32 v1, s32 v2);
62static s32 min_compare(s32 v1, s32 v2);
63
64struct pm_qos_object { 56struct pm_qos_object {
65 struct pm_qos_request_list requests; 57 struct plist_head requests;
66 struct blocking_notifier_head *notifiers; 58 struct blocking_notifier_head *notifiers;
67 struct miscdevice pm_qos_power_miscdev; 59 struct miscdevice pm_qos_power_miscdev;
68 char *name; 60 char *name;
69 s32 default_value; 61 s32 default_value;
70 atomic_t target_value; 62 enum pm_qos_type type;
71 s32 (*comparitor)(s32, s32);
72}; 63};
73 64
65static DEFINE_SPINLOCK(pm_qos_lock);
66
74static struct pm_qos_object null_pm_qos; 67static struct pm_qos_object null_pm_qos;
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 68static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 69static struct pm_qos_object cpu_dma_pm_qos = {
77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, 70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
78 .notifiers = &cpu_dma_lat_notifier, 71 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency", 72 .name = "cpu_dma_latency",
80 .default_value = 2000 * USEC_PER_SEC, 73 .default_value = 2000 * USEC_PER_SEC,
81 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 74 .type = PM_QOS_MIN,
82 .comparitor = min_compare
83}; 75};
84 76
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 77static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 78static struct pm_qos_object network_lat_pm_qos = {
87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, 79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
88 .notifiers = &network_lat_notifier, 80 .notifiers = &network_lat_notifier,
89 .name = "network_latency", 81 .name = "network_latency",
90 .default_value = 2000 * USEC_PER_SEC, 82 .default_value = 2000 * USEC_PER_SEC,
91 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 83 .type = PM_QOS_MIN
92 .comparitor = min_compare
93}; 84};
94 85
95 86
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 87static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 88static struct pm_qos_object network_throughput_pm_qos = {
98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, 89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
99 .notifiers = &network_throughput_notifier, 90 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 91 .name = "network_throughput",
101 .default_value = 0, 92 .default_value = 0,
102 .target_value = ATOMIC_INIT(0), 93 .type = PM_QOS_MAX,
103 .comparitor = max_compare
104}; 94};
105 95
106 96
@@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = {
111 &network_throughput_pm_qos 101 &network_throughput_pm_qos
112}; 102};
113 103
114static DEFINE_SPINLOCK(pm_qos_lock);
115
116static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
117 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
118static int pm_qos_power_open(struct inode *inode, struct file *filp); 106static int pm_qos_power_open(struct inode *inode, struct file *filp);
@@ -122,48 +110,58 @@ static const struct file_operations pm_qos_power_fops = {
122 .write = pm_qos_power_write, 110 .write = pm_qos_power_write,
123 .open = pm_qos_power_open, 111 .open = pm_qos_power_open,
124 .release = pm_qos_power_release, 112 .release = pm_qos_power_release,
113 .llseek = noop_llseek,
125}; 114};
126 115
127/* static helper functions */ 116/* unlocked internal variant */
128static s32 max_compare(s32 v1, s32 v2) 117static inline int pm_qos_get_value(struct pm_qos_object *o)
129{ 118{
130 return max(v1, v2); 119 if (plist_head_empty(&o->requests))
131} 120 return o->default_value;
132 121
133static s32 min_compare(s32 v1, s32 v2) 122 switch (o->type) {
134{ 123 case PM_QOS_MIN:
135 return min(v1, v2); 124 return plist_last(&o->requests)->prio;
136} 125
126 case PM_QOS_MAX:
127 return plist_first(&o->requests)->prio;
137 128
129 default:
130 /* runtime check for not using enum */
131 BUG();
132 }
133}
138 134
139static void update_target(int pm_qos_class) 135static void update_target(struct pm_qos_object *o, struct plist_node *node,
136 int del, int value)
140{ 137{
141 s32 extreme_value;
142 struct pm_qos_request_list *node;
143 unsigned long flags; 138 unsigned long flags;
144 int call_notifier = 0; 139 int prev_value, curr_value;
145 140
146 spin_lock_irqsave(&pm_qos_lock, flags); 141 spin_lock_irqsave(&pm_qos_lock, flags);
147 extreme_value = pm_qos_array[pm_qos_class]->default_value; 142 prev_value = pm_qos_get_value(o);
148 list_for_each_entry(node, 143 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
149 &pm_qos_array[pm_qos_class]->requests.list, list) { 144 if (value != PM_QOS_DEFAULT_VALUE) {
150 extreme_value = pm_qos_array[pm_qos_class]->comparitor( 145 /*
151 extreme_value, node->value); 146 * to change the list, we atomically remove, reinit
152 } 147 * with new value and add, then see if the extremal
153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != 148 * changed
154 extreme_value) { 149 */
155 call_notifier = 1; 150 plist_del(node, &o->requests);
156 atomic_set(&pm_qos_array[pm_qos_class]->target_value, 151 plist_node_init(node, value);
157 extreme_value); 152 plist_add(node, &o->requests);
158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, 153 } else if (del) {
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value)); 154 plist_del(node, &o->requests);
155 } else {
156 plist_add(node, &o->requests);
160 } 157 }
158 curr_value = pm_qos_get_value(o);
161 spin_unlock_irqrestore(&pm_qos_lock, flags); 159 spin_unlock_irqrestore(&pm_qos_lock, flags);
162 160
163 if (call_notifier) 161 if (prev_value != curr_value)
164 blocking_notifier_call_chain( 162 blocking_notifier_call_chain(o->notifiers,
165 pm_qos_array[pm_qos_class]->notifiers, 163 (unsigned long)curr_value,
166 (unsigned long) extreme_value, NULL); 164 NULL);
167} 165}
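
The remove/re-init/re-add sequence above is the usual way to change a plist node's priority, since plist has no in-place reprioritize operation. A generic sketch of that idiom; replist is an illustrative name and the caller is assumed to hold the appropriate lock:

    #include <linux/plist.h>

    /* Move @node to priority @new_prio on @head. */
    static void replist(struct plist_head *head, struct plist_node *node, int new_prio)
    {
            plist_del(node, head);                  /* unlink from the old position */
            plist_node_init(node, new_prio);        /* reset with the new priority */
            plist_add(node, head);                  /* reinsert in sorted order */
    }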
168 166
169static int register_pm_qos_misc(struct pm_qos_object *qos) 167static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -196,42 +194,53 @@ static int find_pm_qos_object_by_minor(int minor)
196 */ 194 */
197int pm_qos_request(int pm_qos_class) 195int pm_qos_request(int pm_qos_class)
198{ 196{
199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 197 unsigned long flags;
198 int value;
199
200 spin_lock_irqsave(&pm_qos_lock, flags);
201 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
202 spin_unlock_irqrestore(&pm_qos_lock, flags);
203
204 return value;
200} 205}
201EXPORT_SYMBOL_GPL(pm_qos_request); 206EXPORT_SYMBOL_GPL(pm_qos_request);
202 207
208int pm_qos_request_active(struct pm_qos_request_list *req)
209{
210 return req->pm_qos_class != 0;
211}
212EXPORT_SYMBOL_GPL(pm_qos_request_active);
213
203/** 214/**
204 * pm_qos_add_request - inserts new qos request into the list 215 * pm_qos_add_request - inserts new qos request into the list
205 * @pm_qos_class: identifies which list of qos request to us 216 * @dep: pointer to a preallocated handle
217 * @pm_qos_class: identifies which list of qos request to use
206 * @value: defines the qos request 218 * @value: defines the qos request
207 * 219 *
208 * This function inserts a new entry in the pm_qos_class list of requested qos 220 * This function inserts a new entry in the pm_qos_class list of requested qos
209 * performance characteristics. It recomputes the aggregate QoS expectations 221 * performance characteristics. It recomputes the aggregate QoS expectations
210 * for the pm_qos_class of parameters, and returns the pm_qos_request list 222 * for the pm_qos_class of parameters and initializes the pm_qos_request_list
211 * element as a handle for use in updating and removal. Call needs to save 223 * handle. Caller needs to save this handle for later use in updates and
212 * this handle for later use. 224 * removal.
213 */ 225 */
214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) 226
227void pm_qos_add_request(struct pm_qos_request_list *dep,
228 int pm_qos_class, s32 value)
215{ 229{
216 struct pm_qos_request_list *dep; 230 struct pm_qos_object *o = pm_qos_array[pm_qos_class];
217 unsigned long flags; 231 int new_value;
218 232
219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); 233 if (pm_qos_request_active(dep)) {
220 if (dep) { 234 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
221 if (value == PM_QOS_DEFAULT_VALUE) 235 return;
222 dep->value = pm_qos_array[pm_qos_class]->default_value;
223 else
224 dep->value = value;
225 dep->pm_qos_class = pm_qos_class;
226
227 spin_lock_irqsave(&pm_qos_lock, flags);
228 list_add(&dep->list,
229 &pm_qos_array[pm_qos_class]->requests.list);
230 spin_unlock_irqrestore(&pm_qos_lock, flags);
231 update_target(pm_qos_class);
232 } 236 }
233 237 if (value == PM_QOS_DEFAULT_VALUE)
234 return dep; 238 new_value = o->default_value;
239 else
240 new_value = value;
241 plist_node_init(&dep->list, new_value);
242 dep->pm_qos_class = pm_qos_class;
243 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
235} 244}
236EXPORT_SYMBOL_GPL(pm_qos_add_request); 245EXPORT_SYMBOL_GPL(pm_qos_add_request);
237 246
@@ -246,27 +255,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
246 * Attempts are made to make this code callable on hot code paths. 255 * Attempts are made to make this code callable on hot code paths.
247 */ 256 */
248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, 257void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value) 258 s32 new_value)
250{ 259{
251 unsigned long flags;
252 int pending_update = 0;
253 s32 temp; 260 s32 temp;
261 struct pm_qos_object *o;
262
263 if (!pm_qos_req) /*guard against callers passing in null */
264 return;
254 265
255 if (pm_qos_req) { /*guard against callers passing in null */ 266 if (!pm_qos_request_active(pm_qos_req)) {
256 spin_lock_irqsave(&pm_qos_lock, flags); 267 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
257 if (new_value == PM_QOS_DEFAULT_VALUE) 268 return;
258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
259 else
260 temp = new_value;
261
262 if (temp != pm_qos_req->value) {
263 pending_update = 1;
264 pm_qos_req->value = temp;
265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
269 } 269 }
270
271 o = pm_qos_array[pm_qos_req->pm_qos_class];
272
273 if (new_value == PM_QOS_DEFAULT_VALUE)
274 temp = o->default_value;
275 else
276 temp = new_value;
277
278 if (temp != pm_qos_req->list.prio)
279 update_target(o, &pm_qos_req->list, 0, temp);
270} 280}
271EXPORT_SYMBOL_GPL(pm_qos_update_request); 281EXPORT_SYMBOL_GPL(pm_qos_update_request);
272 282
@@ -280,19 +290,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request);
280 */ 290 */
281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) 291void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
282{ 292{
283 unsigned long flags; 293 struct pm_qos_object *o;
284 int qos_class;
285 294
286 if (pm_qos_req == NULL) 295 if (pm_qos_req == NULL)
287 return; 296 return;
288 /* silent return to keep pcm code cleaner */ 297 /* silent return to keep pcm code cleaner */
289 298
290 qos_class = pm_qos_req->pm_qos_class; 299 if (!pm_qos_request_active(pm_qos_req)) {
291 spin_lock_irqsave(&pm_qos_lock, flags); 300 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
292 list_del(&pm_qos_req->list); 301 return;
293 kfree(pm_qos_req); 302 }
294 spin_unlock_irqrestore(&pm_qos_lock, flags); 303
295 update_target(qos_class); 304 o = pm_qos_array[pm_qos_req->pm_qos_class];
305 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
306 memset(pm_qos_req, 0, sizeof(*pm_qos_req));
296} 307}
297EXPORT_SYMBOL_GPL(pm_qos_remove_request); 308EXPORT_SYMBOL_GPL(pm_qos_remove_request);
298 309
@@ -340,8 +351,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
340 351
341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 352 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
342 if (pm_qos_class >= 0) { 353 if (pm_qos_class >= 0) {
343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class, 354 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
344 PM_QOS_DEFAULT_VALUE); 355 if (!req)
356 return -ENOMEM;
357
358 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
359 filp->private_data = req;
345 360
346 if (filp->private_data) 361 if (filp->private_data)
347 return 0; 362 return 0;
@@ -353,8 +368,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
353{ 368{
354 struct pm_qos_request_list *req; 369 struct pm_qos_request_list *req;
355 370
356 req = (struct pm_qos_request_list *)filp->private_data; 371 req = filp->private_data;
357 pm_qos_remove_request(req); 372 pm_qos_remove_request(req);
373 kfree(req);
358 374
359 return 0; 375 return 0;
360} 376}
@@ -374,14 +390,16 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
374 } else if (count == 11) { /* len('0x12345678/0') */ 390 } else if (count == 11) { /* len('0x12345678/0') */
375 if (copy_from_user(ascii_value, buf, 11)) 391 if (copy_from_user(ascii_value, buf, 11))
376 return -EFAULT; 392 return -EFAULT;
393 if (strlen(ascii_value) != 10)
394 return -EINVAL;
377 x = sscanf(ascii_value, "%x", &value); 395 x = sscanf(ascii_value, "%x", &value);
378 if (x != 1) 396 if (x != 1)
379 return -EINVAL; 397 return -EINVAL;
380 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); 398 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value);
381 } else 399 } else
382 return -EINVAL; 400 return -EINVAL;
383 401
384 pm_qos_req = (struct pm_qos_request_list *)filp->private_data; 402 pm_qos_req = filp->private_data;
385 pm_qos_update_request(pm_qos_req, value); 403 pm_qos_update_request(pm_qos_req, value);
386 404
387 return count; 405 return count;
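A minimal sketch of how a caller would use the reworked interface above, where the request handle is now allocated by the caller instead of being returned by pm_qos_add_request(); the PM_QOS_CPU_DMA_LATENCY class and the latency figures are illustrative assumptions, not part of this patch:

#include <linux/pm_qos_params.h>

/* hypothetical driver-owned handle; must stay valid while registered */
static struct pm_qos_request_list my_qos_req;

static void my_driver_start(void)
{
	/* register a 20 usec CPU/DMA latency constraint */
	pm_qos_add_request(&my_qos_req, PM_QOS_CPU_DMA_LATENCY, 20);
}

static void my_driver_busy(void)
{
	/* tighten or relax the constraint as the workload changes */
	pm_qos_update_request(&my_qos_req, 10);
}

static void my_driver_stop(void)
{
	/* drop the constraint; the core zeroes the handle on removal */
	pm_qos_remove_request(&my_qos_req);
}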
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9829646d399c..6842eeba5879 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -16,13 +16,13 @@
16 * siglock protection since other code may update expiration cache as 16 * siglock protection since other code may update expiration cache as
17 * well. 17 * well.
18 */ 18 */
19void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
20{ 20{
21 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
22 22
23 spin_lock_irq(&current->sighand->siglock); 23 spin_lock_irq(&task->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 24 set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&task->sighand->siglock);
26} 26}
27 27
28static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
232 232
233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
234{ 234{
235 struct sighand_struct *sighand; 235 struct signal_struct *sig = tsk->signal;
236 struct signal_struct *sig;
237 struct task_struct *t; 236 struct task_struct *t;
238 237
239 *times = INIT_CPUTIME; 238 times->utime = sig->utime;
239 times->stime = sig->stime;
240 times->sum_exec_runtime = sig->sum_sched_runtime;
240 241
241 rcu_read_lock(); 242 rcu_read_lock();
242 sighand = rcu_dereference(tsk->sighand); 243 /* make sure we can trust tsk->thread_group list */
243 if (!sighand) 244 if (!likely(pid_alive(tsk)))
244 goto out; 245 goto out;
245 246
246 sig = tsk->signal;
247
248 t = tsk; 247 t = tsk;
249 do { 248 do {
250 times->utime = cputime_add(times->utime, t->utime); 249 times->utime = cputime_add(times->utime, t->utime);
251 times->stime = cputime_add(times->stime, t->stime); 250 times->stime = cputime_add(times->stime, t->stime);
252 times->sum_exec_runtime += t->se.sum_exec_runtime; 251 times->sum_exec_runtime += t->se.sum_exec_runtime;
253 252 } while_each_thread(tsk, t);
254 t = next_thread(t);
255 } while (t != tsk);
256
257 times->utime = cputime_add(times->utime, sig->utime);
258 times->stime = cputime_add(times->stime, sig->stime);
259 times->sum_exec_runtime += sig->sum_sched_runtime;
260out: 253out:
261 rcu_read_unlock(); 254 rcu_read_unlock();
262} 255}
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1279{ 1272{
1280 struct signal_struct *sig; 1273 struct signal_struct *sig;
1281 1274
1282 /* tsk == current, ensure it is safe to use ->signal/sighand */
1283 if (unlikely(tsk->exit_state))
1284 return 0;
1285
1286 if (!task_cputime_zero(&tsk->cputime_expires)) { 1275 if (!task_cputime_zero(&tsk->cputime_expires)) {
1287 struct task_cputime task_sample = { 1276 struct task_cputime task_sample = {
1288 .utime = tsk->utime, 1277 .utime = tsk->utime,
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1298 if (sig->cputimer.running) { 1287 if (sig->cputimer.running) {
1299 struct task_cputime group_sample; 1288 struct task_cputime group_sample;
1300 1289
1301 thread_group_cputimer(tsk, &group_sample); 1290 spin_lock(&sig->cputimer.lock);
1291 group_sample = sig->cputimer.cputime;
1292 spin_unlock(&sig->cputimer.lock);
1293
1302 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1294 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1303 return 1; 1295 return 1;
1304 } 1296 }
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1315{ 1307{
1316 LIST_HEAD(firing); 1308 LIST_HEAD(firing);
1317 struct k_itimer *timer, *next; 1309 struct k_itimer *timer, *next;
1310 unsigned long flags;
1318 1311
1319 BUG_ON(!irqs_disabled()); 1312 BUG_ON(!irqs_disabled());
1320 1313
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1325 if (!fastpath_timer_check(tsk)) 1318 if (!fastpath_timer_check(tsk))
1326 return; 1319 return;
1327 1320
1328 spin_lock(&tsk->sighand->siglock); 1321 if (!lock_task_sighand(tsk, &flags))
1322 return;
1329 /* 1323 /*
1330 * Here we take off tsk->signal->cpu_timers[N] and 1324 * Here we take off tsk->signal->cpu_timers[N] and
1331 * tsk->cpu_timers[N] all the timers that are firing, and 1325 * tsk->cpu_timers[N] all the timers that are firing, and
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1347 * that gets the timer lock before we do will give it up and 1341 * that gets the timer lock before we do will give it up and
1348 * spin until we've taken care of that timer below. 1342 * spin until we've taken care of that timer below.
1349 */ 1343 */
1350 spin_unlock(&tsk->sighand->siglock); 1344 unlock_task_sighand(tsk, &flags);
1351 1345
1352 /* 1346 /*
1353 * Now that all the timers on our list have the firing flag, 1347 * Now that all the timers on our list have the firing flag,
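A brief sketch of the locking idiom the hunks above switch to; lock_task_sighand() fails for a task that has already dropped its sighand, so the caller can bail out instead of taking tsk->sighand->siglock on a possibly exiting task (fragment only, tsk is the target task):

	unsigned long flags;

	if (!lock_task_sighand(tsk, &flags))
		return;		/* task is exiting, no timers to run */

	/* ... collect the firing timers under siglock ... */

	unlock_task_sighand(tsk, &flags);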
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ad723420acc3..9ca4973f736d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 562
563 if (copy_to_user(created_timer_id,
564 &new_timer_id, sizeof (new_timer_id))) {
565 error = -EFAULT;
566 goto out;
567 }
568 if (timer_event_spec) { 563 if (timer_event_spec) {
569 if (copy_from_user(&event, timer_event_spec, sizeof (event))) { 564 if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
570 error = -EFAULT; 565 error = -EFAULT;
@@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
590 new_timer->sigq->info.si_tid = new_timer->it_id; 585 new_timer->sigq->info.si_tid = new_timer->it_id;
591 new_timer->sigq->info.si_code = SI_TIMER; 586 new_timer->sigq->info.si_code = SI_TIMER;
592 587
588 if (copy_to_user(created_timer_id,
589 &new_timer_id, sizeof (new_timer_id))) {
590 error = -EFAULT;
591 goto out;
592 }
593
593 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 594 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
594 if (error) 595 if (error)
595 goto out; 596 goto out;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ca6066a6952e..29bff6117abc 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -86,6 +86,7 @@ config PM_SLEEP_SMP
86 depends on SMP 86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE 87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP 88 depends on PM_SLEEP
89 select HOTPLUG
89 select HOTPLUG_CPU 90 select HOTPLUG_CPU
90 default y 91 default y
91 92
@@ -137,6 +138,8 @@ config SUSPEND_FREEZER
137config HIBERNATION 138config HIBERNATION
138 bool "Hibernation (aka 'suspend to disk')" 139 bool "Hibernation (aka 'suspend to disk')"
139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS
142 select LZO_DECOMPRESS
140 select SUSPEND_NVS if HAS_IOMEM 143 select SUSPEND_NVS if HAS_IOMEM
141 ---help--- 144 ---help---
142 Enable the suspend to disk (STD) functionality, which is usually 145 Enable the suspend to disk (STD) functionality, which is usually
@@ -242,3 +245,17 @@ config PM_OPS
242 bool 245 bool
243 depends on PM_SLEEP || PM_RUNTIME 246 depends on PM_SLEEP || PM_RUNTIME
244 default y 247 default y
248
249config PM_OPP
250 bool "Operating Performance Point (OPP) Layer library"
251 depends on PM
252 ---help---
253 SOCs have a standard set of tuples consisting of frequency and
254 voltage pairs that the device will support per voltage domain. This
255 is called Operating Performance Point or OPP. The actual definitions
256 of OPP varies over silicon within the same family of devices.
257
258 OPP layer organizes the data internally using device pointers
259 representing individual voltage domains and provides SOC
260 implementations a ready to use framework to manage OPPs.
261 For more information, read <file:Documentation/power/opp.txt>
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 97024fd40cd5..83bbc7c02df9 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..657272e91d0a 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -29,6 +29,7 @@
29#include "power.h" 29#include "power.h"
30 30
31 31
32static int nocompress = 0;
32static int noresume = 0; 33static int noresume = 0;
33static char resume_file[256] = CONFIG_PM_STD_PARTITION; 34static char resume_file[256] = CONFIG_PM_STD_PARTITION;
34dev_t swsusp_resume_device; 35dev_t swsusp_resume_device;
@@ -277,7 +278,7 @@ static int create_image(int platform_mode)
277 goto Enable_irqs; 278 goto Enable_irqs;
278 } 279 }
279 280
280 if (hibernation_test(TEST_CORE)) 281 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
281 goto Power_up; 282 goto Power_up;
282 283
283 in_suspend = 1; 284 in_suspend = 1;
@@ -288,8 +289,10 @@ static int create_image(int platform_mode)
288 error); 289 error);
289 /* Restore control flow magically appears here */ 290 /* Restore control flow magically appears here */
290 restore_processor_state(); 291 restore_processor_state();
291 if (!in_suspend) 292 if (!in_suspend) {
293 events_check_enabled = false;
292 platform_leave(platform_mode); 294 platform_leave(platform_mode);
295 }
293 296
294 Power_up: 297 Power_up:
295 sysdev_resume(); 298 sysdev_resume();
@@ -328,7 +331,7 @@ int hibernation_snapshot(int platform_mode)
328 331
329 error = platform_begin(platform_mode); 332 error = platform_begin(platform_mode);
330 if (error) 333 if (error)
331 return error; 334 goto Close;
332 335
333 /* Preallocate image memory before shutting down devices. */ 336 /* Preallocate image memory before shutting down devices. */
334 error = hibernate_preallocate_memory(); 337 error = hibernate_preallocate_memory();
@@ -511,18 +514,24 @@ int hibernation_platform_enter(void)
511 514
512 local_irq_disable(); 515 local_irq_disable();
513 sysdev_suspend(PMSG_HIBERNATE); 516 sysdev_suspend(PMSG_HIBERNATE);
517 if (!pm_check_wakeup_events()) {
518 error = -EAGAIN;
519 goto Power_up;
520 }
521
514 hibernation_ops->enter(); 522 hibernation_ops->enter();
515 /* We should never get here */ 523 /* We should never get here */
516 while (1); 524 while (1);
517 525
518 /* 526 Power_up:
519 * We don't need to reenable the nonboot CPUs or resume consoles, since 527 sysdev_resume();
520 * the system is going to be halted anyway. 528 local_irq_enable();
521 */ 529 enable_nonboot_cpus();
530
522 Platform_finish: 531 Platform_finish:
523 hibernation_ops->finish(); 532 hibernation_ops->finish();
524 533
525 dpm_suspend_noirq(PMSG_RESTORE); 534 dpm_resume_noirq(PMSG_RESTORE);
526 535
527 Resume_devices: 536 Resume_devices:
528 entering_platform_hibernation = false; 537 entering_platform_hibernation = false;
@@ -630,6 +639,8 @@ int hibernate(void)
630 639
631 if (hibernation_mode == HIBERNATION_PLATFORM) 640 if (hibernation_mode == HIBERNATION_PLATFORM)
632 flags |= SF_PLATFORM_MODE; 641 flags |= SF_PLATFORM_MODE;
642 if (nocompress)
643 flags |= SF_NOCOMPRESS_MODE;
633 pr_debug("PM: writing image.\n"); 644 pr_debug("PM: writing image.\n");
634 error = swsusp_write(flags); 645 error = swsusp_write(flags);
635 swsusp_free(); 646 swsusp_free();
@@ -697,7 +708,7 @@ static int software_resume(void)
697 goto Unlock; 708 goto Unlock;
698 } 709 }
699 710
700 pr_debug("PM: Checking image partition %s\n", resume_file); 711 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
701 712
702 /* Check if the device is there */ 713 /* Check if the device is there */
703 swsusp_resume_device = name_to_dev_t(resume_file); 714 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -722,10 +733,10 @@ static int software_resume(void)
722 } 733 }
723 734
724 Check_image: 735 Check_image:
725 pr_debug("PM: Resume from partition %d:%d\n", 736 pr_debug("PM: Hibernation image partition %d:%d present\n",
726 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); 737 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
727 738
728 pr_debug("PM: Checking hibernation image.\n"); 739 pr_debug("PM: Looking for hibernation image.\n");
729 error = swsusp_check(); 740 error = swsusp_check();
730 if (error) 741 if (error)
731 goto Unlock; 742 goto Unlock;
@@ -757,14 +768,14 @@ static int software_resume(void)
757 goto Done; 768 goto Done;
758 } 769 }
759 770
760 pr_debug("PM: Reading hibernation image.\n"); 771 pr_debug("PM: Loading hibernation image.\n");
761 772
762 error = swsusp_read(&flags); 773 error = swsusp_read(&flags);
763 swsusp_close(FMODE_READ); 774 swsusp_close(FMODE_READ);
764 if (!error) 775 if (!error)
765 hibernation_restore(flags & SF_PLATFORM_MODE); 776 hibernation_restore(flags & SF_PLATFORM_MODE);
766 777
767 printk(KERN_ERR "PM: Restore failed, recovering.\n"); 778 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
768 swsusp_free(); 779 swsusp_free();
769 thaw_processes(); 780 thaw_processes();
770 Done: 781 Done:
@@ -777,7 +788,7 @@ static int software_resume(void)
777 /* For success case, the suspend path will release the lock */ 788 /* For success case, the suspend path will release the lock */
778 Unlock: 789 Unlock:
779 mutex_unlock(&pm_mutex); 790 mutex_unlock(&pm_mutex);
780 pr_debug("PM: Resume from disk failed.\n"); 791 pr_debug("PM: Hibernation image not present or could not be loaded.\n");
781 return error; 792 return error;
782close_finish: 793close_finish:
783 swsusp_close(FMODE_READ); 794 swsusp_close(FMODE_READ);
@@ -996,6 +1007,15 @@ static int __init resume_offset_setup(char *str)
996 return 1; 1007 return 1;
997} 1008}
998 1009
1010static int __init hibernate_setup(char *str)
1011{
1012 if (!strncmp(str, "noresume", 8))
1013 noresume = 1;
1014 else if (!strncmp(str, "nocompress", 10))
1015 nocompress = 1;
1016 return 1;
1017}
1018
999static int __init noresume_setup(char *str) 1019static int __init noresume_setup(char *str)
1000{ 1020{
1001 noresume = 1; 1021 noresume = 1;
@@ -1005,3 +1025,4 @@ static int __init noresume_setup(char *str)
1005__setup("noresume", noresume_setup); 1025__setup("noresume", noresume_setup);
1006__setup("resume_offset=", resume_offset_setup); 1026__setup("resume_offset=", resume_offset_setup);
1007__setup("resume=", resume_setup); 1027__setup("resume=", resume_setup);
1028__setup("hibernate=", hibernate_setup);
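As a usage note, the handler above lets both options be passed through a single kernel command-line parameter, for example (the resume device below is only illustrative):

	hibernate=nocompress resume=/dev/sda2
	hibernate=noresume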
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..7b5db6a8561e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
204 204
205power_attr(state); 205power_attr(state);
206 206
207#ifdef CONFIG_PM_SLEEP
208/*
209 * The 'wakeup_count' attribute, along with the functions defined in
210 * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
211 * handled in a non-racy way.
212 *
213 * If a wakeup event occurs when the system is in a sleep state, it simply is
214 * woken up. In turn, if an event that would wake the system up from a sleep
215 * state occurs when it is undergoing a transition to that sleep state, the
216 * transition should be aborted. Moreover, if such an event occurs when the
217 * system is in the working state, an attempt to start a transition to the
 218 * given sleep state should fail during a certain period after the detection of
219 * the event. Using the 'state' attribute alone is not sufficient to satisfy
220 * these requirements, because a wakeup event may occur exactly when 'state'
221 * is being written to and may be delivered to user space right before it is
222 * frozen, so the event will remain only partially processed until the system is
223 * woken up by another event. In particular, it won't cause the transition to
224 * a sleep state to be aborted.
225 *
226 * This difficulty may be overcome if user space uses 'wakeup_count' before
227 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to
 230 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to.
234 */
235
236static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr,
238 char *buf)
239{
240 unsigned int val;
241
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
243}
244
245static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr,
247 const char *buf, size_t n)
248{
249 unsigned int val;
250
251 if (sscanf(buf, "%u", &val) == 1) {
252 if (pm_save_wakeup_count(val))
253 return n;
254 }
255 return -EINVAL;
256}
257
258power_attr(wakeup_count);
259#endif /* CONFIG_PM_SLEEP */
260
207#ifdef CONFIG_PM_TRACE 261#ifdef CONFIG_PM_TRACE
208int pm_trace_enabled; 262int pm_trace_enabled;
209 263
@@ -227,15 +281,34 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
227} 281}
228 282
229power_attr(pm_trace); 283power_attr(pm_trace);
284
285static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
286 struct kobj_attribute *attr,
287 char *buf)
288{
289 return show_trace_dev_match(buf, PAGE_SIZE);
290}
291
292static ssize_t
293pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
294 const char *buf, size_t n)
295{
296 return -EINVAL;
297}
298
299power_attr(pm_trace_dev_match);
300
230#endif /* CONFIG_PM_TRACE */ 301#endif /* CONFIG_PM_TRACE */
231 302
232static struct attribute * g[] = { 303static struct attribute * g[] = {
233 &state_attr.attr, 304 &state_attr.attr,
234#ifdef CONFIG_PM_TRACE 305#ifdef CONFIG_PM_TRACE
235 &pm_trace_attr.attr, 306 &pm_trace_attr.attr,
307 &pm_trace_dev_match_attr.attr,
236#endif 308#endif
237#ifdef CONFIG_PM_SLEEP 309#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr, 310 &pm_async_attr.attr,
311 &wakeup_count_attr.attr,
239#ifdef CONFIG_PM_DEBUG 312#ifdef CONFIG_PM_DEBUG
240 &pm_test_attr.attr, 313 &pm_test_attr.attr,
241#endif 314#endif
@@ -253,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
253 326
254static int __init pm_start_workqueue(void) 327static int __init pm_start_workqueue(void)
255{ 328{
256 pm_wq = create_freezeable_workqueue("pm"); 329 pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0);
257 330
258 return pm_wq ? 0 : -ENOMEM; 331 return pm_wq ? 0 : -ENOMEM;
259} 332}
@@ -266,6 +339,7 @@ static int __init pm_init(void)
266 int error = pm_start_workqueue(); 339 int error = pm_start_workqueue();
267 if (error) 340 if (error)
268 return error; 341 return error;
342 hibernate_image_size_init();
269 power_kobj = kobject_create_and_add("power", NULL); 343 power_kobj = kobject_create_and_add("power", NULL);
270 if (!power_kobj) 344 if (!power_kobj)
271 return -ENOMEM; 345 return -ENOMEM;
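A minimal user-space sketch of the handshake described in the wakeup_count comment above: read the count, finish the caller's own pre-suspend work, write the count back, and only then write to 'state'. The paths are the standard sysfs locations created by this patch; error handling is abbreviated:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	unsigned int count;
	ssize_t n;
	int fd;

	fd = open("/sys/power/wakeup_count", O_RDWR);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n <= 0)
		return 1;	/* read fails while a wakeup event is being processed */
	buf[n] = '\0';
	if (sscanf(buf, "%u", &count) != 1)
		return 1;

	/* ... carry out user space's own suspend preparations here ... */

	lseek(fd, 0, SEEK_SET);
	if (write(fd, buf, strlen(buf)) < 0)
		return 1;	/* wakeup events occurred since the read: do not suspend */
	close(fd);

	fd = open("/sys/power/state", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "mem", 3) < 0)
		return 1;	/* the transition was aborted by a late wakeup event */
	close(fd);
	return 0;
}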
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 006270fe382d..03634be55f62 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,6 +14,9 @@ struct swsusp_info {
14} __attribute__((aligned(PAGE_SIZE))); 14} __attribute__((aligned(PAGE_SIZE)));
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */
18extern void __init hibernate_image_size_init(void);
19
17#ifdef CONFIG_ARCH_HIBERNATION_HEADER 20#ifdef CONFIG_ARCH_HIBERNATION_HEADER
18/* Maximum size of architecture specific data in a hibernation header */ 21/* Maximum size of architecture specific data in a hibernation header */
19#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) 22#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
@@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info)
49extern int hibernation_snapshot(int platform_mode); 52extern int hibernation_snapshot(int platform_mode);
50extern int hibernation_restore(int platform_mode); 53extern int hibernation_restore(int platform_mode);
51extern int hibernation_platform_enter(void); 54extern int hibernation_platform_enter(void);
52#endif 55
56#else /* !CONFIG_HIBERNATION */
57
58static inline void hibernate_image_size_init(void) {}
59#endif /* !CONFIG_HIBERNATION */
53 60
54extern int pfn_is_nosave(unsigned long); 61extern int pfn_is_nosave(unsigned long);
55 62
@@ -134,6 +141,7 @@ extern int swsusp_swap_in_use(void);
134 * the image header. 141 * the image header.
135 */ 142 */
136#define SF_PLATFORM_MODE 1 143#define SF_PLATFORM_MODE 1
144#define SF_NOCOMPRESS_MODE 2
137 145
138/* kernel/power/hibernate.c */ 146/* kernel/power/hibernate.c */
139extern int swsusp_check(void); 147extern int swsusp_check(void);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index e8b337006276..d52359374e85 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy)
24 24
25static DECLARE_WORK(poweroff_work, do_poweroff); 25static DECLARE_WORK(poweroff_work, do_poweroff);
26 26
27static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key)
28{ 28{
29 /* run sysrq poweroff on boot cpu */ 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); 30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 71ae29052ab6..e50b4c1b2a0f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -15,6 +15,7 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h>
18 19
19/* 20/*
20 * Timeout for stopping processes 21 * Timeout for stopping processes
@@ -35,13 +36,19 @@ static int try_to_freeze_tasks(bool sig_only)
35 struct task_struct *g, *p; 36 struct task_struct *g, *p;
36 unsigned long end_time; 37 unsigned long end_time;
37 unsigned int todo; 38 unsigned int todo;
39 bool wq_busy = false;
38 struct timeval start, end; 40 struct timeval start, end;
39 u64 elapsed_csecs64; 41 u64 elapsed_csecs64;
40 unsigned int elapsed_csecs; 42 unsigned int elapsed_csecs;
43 bool wakeup = false;
41 44
42 do_gettimeofday(&start); 45 do_gettimeofday(&start);
43 46
44 end_time = jiffies + TIMEOUT; 47 end_time = jiffies + TIMEOUT;
48
49 if (!sig_only)
50 freeze_workqueues_begin();
51
45 while (true) { 52 while (true) {
46 todo = 0; 53 todo = 0;
47 read_lock(&tasklist_lock); 54 read_lock(&tasklist_lock);
@@ -63,9 +70,20 @@ static int try_to_freeze_tasks(bool sig_only)
63 todo++; 70 todo++;
64 } while_each_thread(g, p); 71 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 72 read_unlock(&tasklist_lock);
73
74 if (!sig_only) {
75 wq_busy = freeze_workqueues_busy();
76 todo += wq_busy;
77 }
78
66 if (!todo || time_after(jiffies, end_time)) 79 if (!todo || time_after(jiffies, end_time))
67 break; 80 break;
68 81
82 if (!pm_check_wakeup_events()) {
83 wakeup = true;
84 break;
85 }
86
69 /* 87 /*
70 * We need to retry, but first give the freezing tasks some 88 * We need to retry, but first give the freezing tasks some
 71 * time to enter the refrigerator. 89
@@ -85,13 +103,18 @@ static int try_to_freeze_tasks(bool sig_only)
85 * but it cleans up leftover PF_FREEZE requests. 103 * but it cleans up leftover PF_FREEZE requests.
86 */ 104 */
87 printk("\n"); 105 printk("\n");
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 106 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 107 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 108 wakeup ? "aborted" : "failed",
109 elapsed_csecs / 100, elapsed_csecs % 100,
110 todo - wq_busy, wq_busy);
111
112 thaw_workqueues();
113
91 read_lock(&tasklist_lock); 114 read_lock(&tasklist_lock);
92 do_each_thread(g, p) { 115 do_each_thread(g, p) {
93 task_lock(p); 116 task_lock(p);
94 if (freezing(p) && !freezer_should_skip(p)) 117 if (!wakeup && freezing(p) && !freezer_should_skip(p))
95 sched_show_task(p); 118 sched_show_task(p);
96 cancel_freezing(p); 119 cancel_freezing(p);
97 task_unlock(p); 120 task_unlock(p);
@@ -157,6 +180,7 @@ void thaw_processes(void)
157 oom_killer_enable(); 180 oom_killer_enable();
158 181
159 printk("Restarting tasks ... "); 182 printk("Restarting tasks ... ");
183 thaw_workqueues();
160 thaw_tasks(true); 184 thaw_tasks(true);
161 thaw_tasks(false); 185 thaw_tasks(false);
162 schedule(); 186 schedule();
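Worth noting for the hunks above: only workqueues created with the WQ_FREEZEABLE flag take part in the freeze_workqueues_begin()/freeze_workqueues_busy()/thaw_workqueues() handshake; a minimal sketch of such a workqueue (the name is illustrative):

	struct workqueue_struct *wq;

	wq = alloc_workqueue("mydrv", WQ_FREEZEABLE, 0);
	if (!wq)
		return -ENOMEM;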
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 25ce010e9f8b..0dac75ea4456 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * This file provides system snapshot/restore functionality for swsusp. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *);
46 * size will not exceed N bytes, but if that is impossible, it will 46 * size will not exceed N bytes, but if that is impossible, it will
47 * try to create the smallest image possible. 47 * try to create the smallest image possible.
48 */ 48 */
49unsigned long image_size = 500 * 1024 * 1024; 49unsigned long image_size;
50
51void __init hibernate_image_size_init(void)
52{
53 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
54}
50 55
51/* List of PBEs needed for restoring the pages that were allocated before 56/* List of PBEs needed for restoring the pages that were allocated before
52 * the suspend and included in the suspend image, but have also been 57 * the suspend and included in the suspend image, but have also been
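For a rough sense of the new default (figures are illustrative): on a machine with 2 GiB of RAM and 4 KiB pages, totalram_pages is about 524288, so image_size becomes ((524288 * 2) / 5) * 4096, roughly 819 MiB, i.e. two fifths of RAM, instead of the previous fixed 500 MiB.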
@@ -979,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
979 src = kmap_atomic(s_page, KM_USER0); 984 src = kmap_atomic(s_page, KM_USER0);
980 dst = kmap_atomic(d_page, KM_USER1); 985 dst = kmap_atomic(d_page, KM_USER1);
981 do_copy_page(dst, src); 986 do_copy_page(dst, src);
982 kunmap_atomic(src, KM_USER0);
983 kunmap_atomic(dst, KM_USER1); 987 kunmap_atomic(dst, KM_USER1);
988 kunmap_atomic(src, KM_USER0);
984 } else { 989 } else {
985 if (PageHighMem(d_page)) { 990 if (PageHighMem(d_page)) {
986 /* Page pointed to by src may contain some kernel 991 /* Page pointed to by src may contain some kernel
@@ -988,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
988 */ 993 */
989 safe_copy_page(buffer, s_page); 994 safe_copy_page(buffer, s_page);
990 dst = kmap_atomic(d_page, KM_USER0); 995 dst = kmap_atomic(d_page, KM_USER0);
991 memcpy(dst, buffer, PAGE_SIZE); 996 copy_page(dst, buffer);
992 kunmap_atomic(dst, KM_USER0); 997 kunmap_atomic(dst, KM_USER0);
993 } else { 998 } else {
994 safe_copy_page(page_address(d_page), s_page); 999 safe_copy_page(page_address(d_page), s_page);
@@ -1121,9 +1126,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1121 return nr_alloc; 1126 return nr_alloc;
1122} 1127}
1123 1128
1124static unsigned long preallocate_image_memory(unsigned long nr_pages) 1129static unsigned long preallocate_image_memory(unsigned long nr_pages,
1130 unsigned long avail_normal)
1125{ 1131{
1126 return preallocate_image_pages(nr_pages, GFP_IMAGE); 1132 unsigned long alloc;
1133
1134 if (avail_normal <= alloc_normal)
1135 return 0;
1136
1137 alloc = avail_normal - alloc_normal;
1138 if (nr_pages < alloc)
1139 alloc = nr_pages;
1140
1141 return preallocate_image_pages(alloc, GFP_IMAGE);
1127} 1142}
1128 1143
1129#ifdef CONFIG_HIGHMEM 1144#ifdef CONFIG_HIGHMEM
@@ -1169,15 +1184,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1169 */ 1184 */
1170static void free_unnecessary_pages(void) 1185static void free_unnecessary_pages(void)
1171{ 1186{
1172 unsigned long save_highmem, to_free_normal, to_free_highmem; 1187 unsigned long save, to_free_normal, to_free_highmem;
1173 1188
1174 to_free_normal = alloc_normal - count_data_pages(); 1189 save = count_data_pages();
1175 save_highmem = count_highmem_pages(); 1190 if (alloc_normal >= save) {
1176 if (alloc_highmem > save_highmem) { 1191 to_free_normal = alloc_normal - save;
1177 to_free_highmem = alloc_highmem - save_highmem; 1192 save = 0;
1193 } else {
1194 to_free_normal = 0;
1195 save -= alloc_normal;
1196 }
1197 save += count_highmem_pages();
1198 if (alloc_highmem >= save) {
1199 to_free_highmem = alloc_highmem - save;
1178 } else { 1200 } else {
1179 to_free_highmem = 0; 1201 to_free_highmem = 0;
1180 to_free_normal -= save_highmem - alloc_highmem; 1202 to_free_normal -= save - alloc_highmem;
1181 } 1203 }
1182 1204
1183 memory_bm_position_reset(&copy_bm); 1205 memory_bm_position_reset(&copy_bm);
@@ -1258,7 +1280,7 @@ int hibernate_preallocate_memory(void)
1258{ 1280{
1259 struct zone *zone; 1281 struct zone *zone;
1260 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1282 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1261 unsigned long alloc, save_highmem, pages_highmem; 1283 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1262 struct timeval start, stop; 1284 struct timeval start, stop;
1263 int error; 1285 int error;
1264 1286
@@ -1295,26 +1317,38 @@ int hibernate_preallocate_memory(void)
1295 else 1317 else
1296 count += zone_page_state(zone, NR_FREE_PAGES); 1318 count += zone_page_state(zone, NR_FREE_PAGES);
1297 } 1319 }
1320 avail_normal = count;
1298 count += highmem; 1321 count += highmem;
1299 count -= totalreserve_pages; 1322 count -= totalreserve_pages;
1300 1323
1301 /* Compute the maximum number of saveable pages to leave in memory. */ 1324 /* Compute the maximum number of saveable pages to leave in memory. */
1302 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1325 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1326 /* Compute the desired number of image pages specified by image_size. */
1303 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1327 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1304 if (size > max_size) 1328 if (size > max_size)
1305 size = max_size; 1329 size = max_size;
1306 /* 1330 /*
1307 * If the maximum is not less than the current number of saveable pages 1331 * If the desired number of image pages is at least as large as the
1308 * in memory, allocate page frames for the image and we're done. 1332 * current number of saveable pages in memory, allocate page frames for
1333 * the image and we're done.
1309 */ 1334 */
1310 if (size >= saveable) { 1335 if (size >= saveable) {
1311 pages = preallocate_image_highmem(save_highmem); 1336 pages = preallocate_image_highmem(save_highmem);
1312 pages += preallocate_image_memory(saveable - pages); 1337 pages += preallocate_image_memory(saveable - pages, avail_normal);
1313 goto out; 1338 goto out;
1314 } 1339 }
1315 1340
1316 /* Estimate the minimum size of the image. */ 1341 /* Estimate the minimum size of the image. */
1317 pages = minimum_image_size(saveable); 1342 pages = minimum_image_size(saveable);
1343 /*
1344 * To avoid excessive pressure on the normal zone, leave room in it to
1345 * accommodate an image of the minimum size (unless it's already too
1346 * small, in which case don't preallocate pages from it at all).
1347 */
1348 if (avail_normal > pages)
1349 avail_normal -= pages;
1350 else
1351 avail_normal = 0;
1318 if (size < pages) 1352 if (size < pages)
1319 size = min_t(unsigned long, pages, max_size); 1353 size = min_t(unsigned long, pages, max_size);
1320 1354
@@ -1335,16 +1369,34 @@ int hibernate_preallocate_memory(void)
1335 */ 1369 */
1336 pages_highmem = preallocate_image_highmem(highmem / 2); 1370 pages_highmem = preallocate_image_highmem(highmem / 2);
1337 alloc = (count - max_size) - pages_highmem; 1371 alloc = (count - max_size) - pages_highmem;
1338 pages = preallocate_image_memory(alloc); 1372 pages = preallocate_image_memory(alloc, avail_normal);
1339 if (pages < alloc) 1373 if (pages < alloc) {
1340 goto err_out; 1374 /* We have exhausted non-highmem pages, try highmem. */
1341 size = max_size - size; 1375 alloc -= pages;
1342 alloc = size; 1376 pages += pages_highmem;
1343 size = preallocate_highmem_fraction(size, highmem, count); 1377 pages_highmem = preallocate_image_highmem(alloc);
1344 pages_highmem += size; 1378 if (pages_highmem < alloc)
1345 alloc -= size; 1379 goto err_out;
1346 pages += preallocate_image_memory(alloc); 1380 pages += pages_highmem;
1347 pages += pages_highmem; 1381 /*
1382 * size is the desired number of saveable pages to leave in
1383 * memory, so try to preallocate (all memory - size) pages.
1384 */
1385 alloc = (count - pages) - size;
1386 pages += preallocate_image_highmem(alloc);
1387 } else {
1388 /*
1389 * There are approximately max_size saveable pages at this point
1390 * and we want to reduce this number down to size.
1391 */
1392 alloc = max_size - size;
1393 size = preallocate_highmem_fraction(alloc, highmem, count);
1394 pages_highmem += size;
1395 alloc -= size;
1396 size = preallocate_image_memory(alloc, avail_normal);
1397 pages_highmem += preallocate_image_highmem(alloc - size);
1398 pages += pages_highmem + size;
1399 }
1348 1400
1349 /* 1401 /*
1350 * We only need as many page frames for the image as there are saveable 1402 * We only need as many page frames for the image as there are saveable
@@ -1635,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1635 memory_bm_position_reset(&orig_bm); 1687 memory_bm_position_reset(&orig_bm);
1636 memory_bm_position_reset(&copy_bm); 1688 memory_bm_position_reset(&copy_bm);
1637 } else if (handle->cur <= nr_meta_pages) { 1689 } else if (handle->cur <= nr_meta_pages) {
1638 memset(buffer, 0, PAGE_SIZE); 1690 clear_page(buffer);
1639 pack_pfns(buffer, &orig_bm); 1691 pack_pfns(buffer, &orig_bm);
1640 } else { 1692 } else {
1641 struct page *page; 1693 struct page *page;
@@ -1649,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1649 void *kaddr; 1701 void *kaddr;
1650 1702
1651 kaddr = kmap_atomic(page, KM_USER0); 1703 kaddr = kmap_atomic(page, KM_USER0);
1652 memcpy(buffer, kaddr, PAGE_SIZE); 1704 copy_page(buffer, kaddr);
1653 kunmap_atomic(kaddr, KM_USER0); 1705 kunmap_atomic(kaddr, KM_USER0);
1654 handle->buffer = buffer; 1706 handle->buffer = buffer;
1655 } else { 1707 } else {
@@ -1932,7 +1984,7 @@ static void copy_last_highmem_page(void)
1932 void *dst; 1984 void *dst;
1933 1985
1934 dst = kmap_atomic(last_highmem_page, KM_USER0); 1986 dst = kmap_atomic(last_highmem_page, KM_USER0);
1935 memcpy(dst, buffer, PAGE_SIZE); 1987 copy_page(dst, buffer);
1936 kunmap_atomic(dst, KM_USER0); 1988 kunmap_atomic(dst, KM_USER0);
1937 last_highmem_page = NULL; 1989 last_highmem_page = NULL;
1938 } 1990 }
@@ -2218,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2218 2270
2219 kaddr1 = kmap_atomic(p1, KM_USER0); 2271 kaddr1 = kmap_atomic(p1, KM_USER0);
2220 kaddr2 = kmap_atomic(p2, KM_USER1); 2272 kaddr2 = kmap_atomic(p2, KM_USER1);
2221 memcpy(buf, kaddr1, PAGE_SIZE); 2273 copy_page(buf, kaddr1);
2222 memcpy(kaddr1, kaddr2, PAGE_SIZE); 2274 copy_page(kaddr1, kaddr2);
2223 memcpy(kaddr2, buf, PAGE_SIZE); 2275 copy_page(kaddr2, buf);
2224 kunmap_atomic(kaddr1, KM_USER0);
2225 kunmap_atomic(kaddr2, KM_USER1); 2276 kunmap_atomic(kaddr2, KM_USER1);
2277 kunmap_atomic(kaddr1, KM_USER0);
2226} 2278}
2227 2279
2228/** 2280/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f37cb7dd4402..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
136 if (suspend_ops->prepare) { 136 if (suspend_ops->prepare) {
137 error = suspend_ops->prepare(); 137 error = suspend_ops->prepare();
138 if (error) 138 if (error)
139 return error; 139 goto Platform_finish;
140 } 140 }
141 141
142 error = dpm_suspend_noirq(PMSG_SUSPEND); 142 error = dpm_suspend_noirq(PMSG_SUSPEND);
143 if (error) { 143 if (error) {
144 printk(KERN_ERR "PM: Some devices failed to power down\n"); 144 printk(KERN_ERR "PM: Some devices failed to power down\n");
145 goto Platfrom_finish; 145 goto Platform_finish;
146 } 146 }
147 147
148 if (suspend_ops->prepare_late) { 148 if (suspend_ops->prepare_late) {
149 error = suspend_ops->prepare_late(); 149 error = suspend_ops->prepare_late();
150 if (error) 150 if (error)
151 goto Power_up_devices; 151 goto Platform_wake;
152 } 152 }
153 153
154 if (suspend_test(TEST_PLATFORM)) 154 if (suspend_test(TEST_PLATFORM))
@@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
163 163
164 error = sysdev_suspend(PMSG_SUSPEND); 164 error = sysdev_suspend(PMSG_SUSPEND);
165 if (!error) { 165 if (!error) {
166 if (!suspend_test(TEST_CORE)) 166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
167 error = suspend_ops->enter(state); 167 error = suspend_ops->enter(state);
168 events_check_enabled = false;
169 }
168 sysdev_resume(); 170 sysdev_resume();
169 } 171 }
170 172
@@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
178 if (suspend_ops->wake) 180 if (suspend_ops->wake)
179 suspend_ops->wake(); 181 suspend_ops->wake();
180 182
181 Power_up_devices:
182 dpm_resume_noirq(PMSG_RESUME); 183 dpm_resume_noirq(PMSG_RESUME);
183 184
184 Platfrom_finish: 185 Platform_finish:
185 if (suspend_ops->finish) 186 if (suspend_ops->finish)
186 suspend_ops->finish(); 187 suspend_ops->finish();
187 188
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b0bb21778391..a0e4a86ccf94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -4,7 +4,7 @@
4 * This file provides functions for reading the suspend image from 4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition. 5 * and writing it to a swap partition.
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * 9 *
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
@@ -24,15 +24,17 @@
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/lzo.h>
28#include <linux/vmalloc.h>
27 29
28#include "power.h" 30#include "power.h"
29 31
30#define SWSUSP_SIG "S1SUSPEND" 32#define HIBERNATE_SIG "LINHIB0001"
31 33
32/* 34/*
33 * The swap map is a data structure used for keeping track of each page 35 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page 36 * written to a swap partition. It consists of many swap_map_page
35 * structures that contain each an array of MAP_PAGE_SIZE swap entries. 37 * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the 38 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member. 39 * help of the .next_swap member.
38 * 40 *
@@ -148,7 +150,7 @@ sector_t alloc_swapdev_block(int swap)
148 150
149/** 151/**
150 * free_all_swap_pages - free swap pages allocated for saving image data. 152 * free_all_swap_pages - free swap pages allocated for saving image data.
 151 * It also frees the extents used to register which swap entries had been 153
152 * allocated. 154 * allocated.
153 */ 155 */
154 156
@@ -193,7 +195,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 195 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 196 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 197 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 198 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
197 swsusp_header->image = handle->first_sector; 199 swsusp_header->image = handle->first_sector;
198 swsusp_header->flags = flags; 200 swsusp_header->flags = flags;
199 error = hib_bio_write_page(swsusp_resume_block, 201 error = hib_bio_write_page(swsusp_resume_block,
@@ -249,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
249 if (bio_chain) { 251 if (bio_chain) {
250 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 252 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
251 if (src) { 253 if (src) {
252 memcpy(src, buf, PAGE_SIZE); 254 copy_page(src, buf);
253 } else { 255 } else {
254 WARN_ON_ONCE(1); 256 WARN_ON_ONCE(1);
255 bio_chain = NULL; /* Go synchronous */ 257 bio_chain = NULL; /* Go synchronous */
@@ -323,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
323 error = write_page(handle->cur, handle->cur_swap, NULL); 325 error = write_page(handle->cur, handle->cur_swap, NULL);
324 if (error) 326 if (error)
325 goto out; 327 goto out;
326 memset(handle->cur, 0, PAGE_SIZE); 328 clear_page(handle->cur);
327 handle->cur_swap = offset; 329 handle->cur_swap = offset;
328 handle->k = 0; 330 handle->k = 0;
329 } 331 }
@@ -357,6 +359,18 @@ static int swap_writer_finish(struct swap_map_handle *handle,
357 return error; 359 return error;
358} 360}
359 361
362/* We need to remember how much compressed data we need to read. */
363#define LZO_HEADER sizeof(size_t)
364
365/* Number of pages/bytes we'll compress at one time. */
366#define LZO_UNC_PAGES 32
367#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE)
368
369/* Number of pages/bytes we need for compressed data (worst case). */
370#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
371 LZO_HEADER, PAGE_SIZE)
372#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
373
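A quick check of these constants, assuming 4 KiB pages, an 8-byte size_t and the usual lzo1x_worst_compress(x) = x + x/16 + 64 + 3 bound: LZO_UNC_SIZE is 32 * 4096 = 131072 bytes, the worst-case compressed size is 131072 + 8192 + 67 = 139331 bytes, adding the 8-byte header gives 139339 bytes, which rounds up to LZO_CMP_PAGES = 35, so each 32-page chunk may occupy at most 35 pages on disk.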
360/** 374/**
361 * save_image - save the suspend image data 375 * save_image - save the suspend image data
362 */ 376 */
@@ -404,6 +418,137 @@ static int save_image(struct swap_map_handle *handle,
404 return ret; 418 return ret;
405} 419}
406 420
421
422/**
423 * save_image_lzo - Save the suspend image data compressed with LZO.
424 * @handle: Swap mam handle to use for saving the image.
425 * @snapshot: Image to read data from.
426 * @nr_to_write: Number of pages to save.
427 */
428static int save_image_lzo(struct swap_map_handle *handle,
429 struct snapshot_handle *snapshot,
430 unsigned int nr_to_write)
431{
432 unsigned int m;
433 int ret = 0;
434 int nr_pages;
435 int err2;
436 struct bio *bio;
437 struct timeval start;
438 struct timeval stop;
439 size_t off, unc_len, cmp_len;
440 unsigned char *unc, *cmp, *wrk, *page;
441
442 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
443 if (!page) {
444 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
445 return -ENOMEM;
446 }
447
448 wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
449 if (!wrk) {
450 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
451 free_page((unsigned long)page);
452 return -ENOMEM;
453 }
454
455 unc = vmalloc(LZO_UNC_SIZE);
456 if (!unc) {
457 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
458 vfree(wrk);
459 free_page((unsigned long)page);
460 return -ENOMEM;
461 }
462
463 cmp = vmalloc(LZO_CMP_SIZE);
464 if (!cmp) {
465 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
466 vfree(unc);
467 vfree(wrk);
468 free_page((unsigned long)page);
469 return -ENOMEM;
470 }
471
472 printk(KERN_INFO
473 "PM: Compressing and saving image data (%u pages) ... ",
474 nr_to_write);
475 m = nr_to_write / 100;
476 if (!m)
477 m = 1;
478 nr_pages = 0;
479 bio = NULL;
480 do_gettimeofday(&start);
481 for (;;) {
482 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
483 ret = snapshot_read_next(snapshot);
484 if (ret < 0)
485 goto out_finish;
486
487 if (!ret)
488 break;
489
490 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
491
492 if (!(nr_pages % m))
493 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
494 nr_pages++;
495 }
496
497 if (!off)
498 break;
499
500 unc_len = off;
501 ret = lzo1x_1_compress(unc, unc_len,
502 cmp + LZO_HEADER, &cmp_len, wrk);
503 if (ret < 0) {
504 printk(KERN_ERR "PM: LZO compression failed\n");
505 break;
506 }
507
508 if (unlikely(!cmp_len ||
509 cmp_len > lzo1x_worst_compress(unc_len))) {
510 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
511 ret = -1;
512 break;
513 }
514
515 *(size_t *)cmp = cmp_len;
516
517 /*
518 * Given we are writing one page at a time to disk, we copy
519 * that much from the buffer, although the last bit will likely
 520 * be smaller than a full page. This is OK - we saved the length
521 * of the compressed data, so any garbage at the end will be
522 * discarded when we read it.
523 */
524 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
525 memcpy(page, cmp + off, PAGE_SIZE);
526
527 ret = swap_write_page(handle, page, &bio);
528 if (ret)
529 goto out_finish;
530 }
531 }
532
533out_finish:
534 err2 = hib_wait_on_bio_chain(&bio);
535 do_gettimeofday(&stop);
536 if (!ret)
537 ret = err2;
538 if (!ret)
539 printk(KERN_CONT "\b\b\b\bdone\n");
540 else
541 printk(KERN_CONT "\n");
542 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
543
544 vfree(cmp);
545 vfree(unc);
546 vfree(wrk);
547 free_page((unsigned long)page);
548
549 return ret;
550}
551
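To summarize the on-disk layout the function above produces (implied by the code rather than documented elsewhere in this patch): each chunk of up to LZO_UNC_PAGES snapshot pages becomes one record, a size_t length header followed by the compressed data, padded to a page boundary because swap_write_page() always writes whole pages:

/*
 *   +----------+------------------------------+----------------+
 *   | size_t   | cmp_len bytes of LZO data    | padding up to  |
 *   | cmp_len  |                              | page boundary  |
 *   +----------+------------------------------+----------------+
 *
 * load_image_lzo() reads the stored cmp_len back first and uses it
 * to ignore the padding.
 */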
407/** 552/**
408 * enough_swap - Make sure we have enough swap to save the image. 553 * enough_swap - Make sure we have enough swap to save the image.
409 * 554 *
@@ -411,12 +556,16 @@ static int save_image(struct swap_map_handle *handle,
 411 * space available from the resume partition. 556
412 */ 557 */
413 558
414static int enough_swap(unsigned int nr_pages) 559static int enough_swap(unsigned int nr_pages, unsigned int flags)
415{ 560{
416 unsigned int free_swap = count_swap_pages(root_swap, 1); 561 unsigned int free_swap = count_swap_pages(root_swap, 1);
562 unsigned int required;
417 563
418 pr_debug("PM: Free swap pages: %u\n", free_swap); 564 pr_debug("PM: Free swap pages: %u\n", free_swap);
419 return free_swap > nr_pages + PAGES_FOR_IO; 565
566 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
567 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
568 return free_swap > required;
420} 569}
421 570
422/** 571/**
@@ -443,7 +592,7 @@ int swsusp_write(unsigned int flags)
443 printk(KERN_ERR "PM: Cannot get swap writer\n"); 592 printk(KERN_ERR "PM: Cannot get swap writer\n");
444 return error; 593 return error;
445 } 594 }
446 if (!enough_swap(pages)) { 595 if (!enough_swap(pages, flags)) {
447 printk(KERN_ERR "PM: Not enough free swap\n"); 596 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC; 597 error = -ENOSPC;
449 goto out_finish; 598 goto out_finish;
@@ -458,8 +607,11 @@ int swsusp_write(unsigned int flags)
458 } 607 }
459 header = (struct swsusp_info *)data_of(snapshot); 608 header = (struct swsusp_info *)data_of(snapshot);
460 error = swap_write_page(&handle, header, NULL); 609 error = swap_write_page(&handle, header, NULL);
461 if (!error) 610 if (!error) {
462 error = save_image(&handle, &snapshot, pages - 1); 611 error = (flags & SF_NOCOMPRESS_MODE) ?
612 save_image(&handle, &snapshot, pages - 1) :
613 save_image_lzo(&handle, &snapshot, pages - 1);
614 }
463out_finish: 615out_finish:
464 error = swap_writer_finish(&handle, flags, error); 616 error = swap_writer_finish(&handle, flags, error);
465 return error; 617 return error;
@@ -590,6 +742,127 @@ static int load_image(struct swap_map_handle *handle,
590} 742}
591 743
592/** 744/**
745 * load_image_lzo - Load compressed image data and decompress them with LZO.
746 * @handle: Swap map handle to use for loading data.
747 * @snapshot: Image to copy uncompressed data into.
748 * @nr_to_read: Number of pages to load.
749 */
750static int load_image_lzo(struct swap_map_handle *handle,
751 struct snapshot_handle *snapshot,
752 unsigned int nr_to_read)
753{
754 unsigned int m;
755 int error = 0;
756 struct timeval start;
757 struct timeval stop;
758 unsigned nr_pages;
759 size_t off, unc_len, cmp_len;
760 unsigned char *unc, *cmp, *page;
761
762 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
763 if (!page) {
764 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
765 return -ENOMEM;
766 }
767
768 unc = vmalloc(LZO_UNC_SIZE);
769 if (!unc) {
770 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
771 free_page((unsigned long)page);
772 return -ENOMEM;
773 }
774
775 cmp = vmalloc(LZO_CMP_SIZE);
776 if (!cmp) {
777 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
778 vfree(unc);
779 free_page((unsigned long)page);
780 return -ENOMEM;
781 }
782
783 printk(KERN_INFO
784 "PM: Loading and decompressing image data (%u pages) ... ",
785 nr_to_read);
786 m = nr_to_read / 100;
787 if (!m)
788 m = 1;
789 nr_pages = 0;
790 do_gettimeofday(&start);
791
792 error = snapshot_write_next(snapshot);
793 if (error <= 0)
794 goto out_finish;
795
796 for (;;) {
797 error = swap_read_page(handle, page, NULL); /* sync */
798 if (error)
799 break;
800
801 cmp_len = *(size_t *)page;
802 if (unlikely(!cmp_len ||
803 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
804 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
805 error = -1;
806 break;
807 }
808
809 memcpy(cmp, page, PAGE_SIZE);
810 for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
811 error = swap_read_page(handle, page, NULL); /* sync */
812 if (error)
813 goto out_finish;
814
815 memcpy(cmp + off, page, PAGE_SIZE);
816 }
817
818 unc_len = LZO_UNC_SIZE;
819 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
820 unc, &unc_len);
821 if (error < 0) {
822 printk(KERN_ERR "PM: LZO decompression failed\n");
823 break;
824 }
825
826 if (unlikely(!unc_len ||
827 unc_len > LZO_UNC_SIZE ||
828 unc_len & (PAGE_SIZE - 1))) {
829 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
830 error = -1;
831 break;
832 }
833
834 for (off = 0; off < unc_len; off += PAGE_SIZE) {
835 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
836
837 if (!(nr_pages % m))
838 printk("\b\b\b\b%3d%%", nr_pages / m);
839 nr_pages++;
840
841 error = snapshot_write_next(snapshot);
842 if (error <= 0)
843 goto out_finish;
844 }
845 }
846
847out_finish:
848 do_gettimeofday(&stop);
849 if (!error) {
850 printk("\b\b\b\bdone\n");
851 snapshot_write_finalize(snapshot);
852 if (!snapshot_image_loaded(snapshot))
853 error = -ENODATA;
854 } else
855 printk("\n");
856 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
857
858 vfree(cmp);
859 vfree(unc);
860 free_page((unsigned long)page);
861
862 return error;
863}
864
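
On the read side each chunk is validated twice: cmp_len must be non-zero and no larger than lzo1x_worst_compress(LZO_UNC_SIZE), and the decompressed length must be a non-zero, page-aligned value no larger than LZO_UNC_SIZE. The second check as a standalone sketch (parameter names are illustrative):

    /* Sketch of the uncompressed-length sanity check used above. */
    static int unc_len_valid(size_t unc_len, size_t unc_size, size_t page_size)
    {
            return unc_len != 0 &&
                   unc_len <= unc_size &&
                   (unc_len & (page_size - 1)) == 0;   /* whole pages only */
    }
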
865/**
593 * swsusp_read - read the hibernation image. 866 * swsusp_read - read the hibernation image.
594 * @flags_p: flags passed by the "frozen" kernel in the image header should 867 * @flags_p: flags passed by the "frozen" kernel in the image header should
595 * be written into this memory location 868
@@ -612,8 +885,11 @@ int swsusp_read(unsigned int *flags_p)
612 goto end; 885 goto end;
613 if (!error) 886 if (!error)
614 error = swap_read_page(&handle, header, NULL); 887 error = swap_read_page(&handle, header, NULL);
615 if (!error) 888 if (!error) {
616 error = load_image(&handle, &snapshot, header->pages - 1); 889 error = (*flags_p & SF_NOCOMPRESS_MODE) ?
890 load_image(&handle, &snapshot, header->pages - 1) :
891 load_image_lzo(&handle, &snapshot, header->pages - 1);
892 }
617 swap_reader_finish(&handle); 893 swap_reader_finish(&handle);
618end: 894end:
619 if (!error) 895 if (!error)
@@ -634,13 +910,13 @@ int swsusp_check(void)
634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 910 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
635 if (!IS_ERR(hib_resume_bdev)) { 911 if (!IS_ERR(hib_resume_bdev)) {
636 set_blocksize(hib_resume_bdev, PAGE_SIZE); 912 set_blocksize(hib_resume_bdev, PAGE_SIZE);
637 memset(swsusp_header, 0, PAGE_SIZE); 913 clear_page(swsusp_header);
638 error = hib_bio_read_page(swsusp_resume_block, 914 error = hib_bio_read_page(swsusp_resume_block,
639 swsusp_header, NULL); 915 swsusp_header, NULL);
640 if (error) 916 if (error)
641 goto put; 917 goto put;
642 918
643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 919 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 920 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
645 /* Reset swap signature now */ 921 /* Reset swap signature now */
646 error = hib_bio_write_page(swsusp_resume_block, 922 error = hib_bio_write_page(swsusp_resume_block,
@@ -653,13 +929,13 @@ put:
653 if (error) 929 if (error)
654 blkdev_put(hib_resume_bdev, FMODE_READ); 930 blkdev_put(hib_resume_bdev, FMODE_READ);
655 else 931 else
656 pr_debug("PM: Signature found, resuming\n"); 932 pr_debug("PM: Image signature found, resuming\n");
657 } else { 933 } else {
658 error = PTR_ERR(hib_resume_bdev); 934 error = PTR_ERR(hib_resume_bdev);
659 } 935 }
660 936
661 if (error) 937 if (error)
662 pr_debug("PM: Error %d checking image file\n", error); 938 pr_debug("PM: Image not found (code %d)\n", error);
663 939
664 return error; 940 return error;
665} 941}
diff --git a/kernel/printk.c b/kernel/printk.c
index 444b770c9595..b2ebaee8c377 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,8 @@
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h>
41#include <linux/notifier.h>
40 42
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
42 44
@@ -83,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
83 * provides serialisation for access to the entire console 85 * provides serialisation for access to the entire console
84 * driver system. 86 * driver system.
85 */ 87 */
86static DECLARE_MUTEX(console_sem); 88static DEFINE_SEMAPHORE(console_sem);
87struct console *console_drivers; 89struct console *console_drivers;
88EXPORT_SYMBOL_GPL(console_drivers); 90EXPORT_SYMBOL_GPL(console_drivers);
89 91
@@ -208,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup);
208 210
209#ifdef CONFIG_BOOT_PRINTK_DELAY 211#ifdef CONFIG_BOOT_PRINTK_DELAY
210 212
211static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 213static int boot_delay; /* msecs delay after each printk during bootup */
212static unsigned long long loops_per_msec; /* based on boot_delay */ 214static unsigned long long loops_per_msec; /* based on boot_delay */
213 215
214static int __init boot_delay_setup(char *str) 216static int __init boot_delay_setup(char *str)
@@ -554,7 +556,7 @@ static void zap_locks(void)
554 /* If a crash is occurring, make sure we can't deadlock */ 556 /* If a crash is occurring, make sure we can't deadlock */
555 spin_lock_init(&logbuf_lock); 557 spin_lock_init(&logbuf_lock);
556 /* And make sure that we print immediately */ 558 /* And make sure that we print immediately */
557 init_MUTEX(&console_sem); 559 sema_init(&console_sem, 1);
558} 560}
559 561
560#if defined(CONFIG_PRINTK_TIME) 562#if defined(CONFIG_PRINTK_TIME)
@@ -645,6 +647,7 @@ static inline int can_use_console(unsigned int cpu)
645 * released but interrupts still disabled. 647 * released but interrupts still disabled.
646 */ 648 */
647static int acquire_console_semaphore_for_printk(unsigned int cpu) 649static int acquire_console_semaphore_for_printk(unsigned int cpu)
650 __releases(&logbuf_lock)
648{ 651{
649 int retval = 0; 652 int retval = 0;
650 653
@@ -985,6 +988,32 @@ void resume_console(void)
985} 988}
986 989
987/** 990/**
991 * console_cpu_notify - print deferred console messages after CPU hotplug
992 * @self: notifier struct
993 * @action: CPU hotplug event
994 * @hcpu: unused
995 *
996 * If printk() is called from a CPU that is not online yet, the messages
997 * will be spooled but will not show up on the console. This function is
998 * called when a new CPU comes online (or fails to come up), and ensures
999 * that any such output gets printed.
1000 */
1001static int __cpuinit console_cpu_notify(struct notifier_block *self,
1002 unsigned long action, void *hcpu)
1003{
1004 switch (action) {
1005 case CPU_ONLINE:
1006 case CPU_DEAD:
1007 case CPU_DYING:
1008 case CPU_DOWN_FAILED:
1009 case CPU_UP_CANCELED:
1010 acquire_console_sem();
1011 release_console_sem();
1012 }
1013 return NOTIFY_OK;
1014}
1015
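
console_cpu_notify() relies on the fact that taking and then dropping console_sem replays any output that was spooled while no CPU could print it. A minimal sketch of wiring a similar hook elsewhere; my_console_kick is a made-up name, and the registration mirrors the hotcpu_notifier() call added to printk_late_init() below:

    /* Sketch only: nudge the console when a CPU hotplug event completes. */
    static int my_console_kick(struct notifier_block *nb,
                               unsigned long action, void *hcpu)
    {
            acquire_console_sem();  /* waits until the console is free ... */
            release_console_sem();  /* ... and flushes any pending messages */
            return NOTIFY_OK;
    }

    /* from some late initcall:
     *      hotcpu_notifier(my_console_kick, 0);
     */
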
1016/**
988 * acquire_console_sem - lock the console system for exclusive use. 1017 * acquire_console_sem - lock the console system for exclusive use.
989 * 1018 *
990 * Acquires a semaphore which guarantees that the caller has 1019 * Acquires a semaphore which guarantees that the caller has
@@ -1371,7 +1400,7 @@ int unregister_console(struct console *console)
1371} 1400}
1372EXPORT_SYMBOL(unregister_console); 1401EXPORT_SYMBOL(unregister_console);
1373 1402
1374static int __init disable_boot_consoles(void) 1403static int __init printk_late_init(void)
1375{ 1404{
1376 struct console *con; 1405 struct console *con;
1377 1406
@@ -1382,9 +1411,10 @@ static int __init disable_boot_consoles(void)
1382 unregister_console(con); 1411 unregister_console(con);
1383 } 1412 }
1384 } 1413 }
1414 hotcpu_notifier(console_cpu_notify, 0);
1385 return 0; 1415 return 0;
1386} 1416}
1387late_initcall(disable_boot_consoles); 1417late_initcall(printk_late_init);
1388 1418
1389#if defined CONFIG_PRINTK 1419#if defined CONFIG_PRINTK
1390 1420
@@ -1482,7 +1512,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1482} 1512}
1483EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1513EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1484 1514
1485static const char const *kmsg_reasons[] = { 1515static const char * const kmsg_reasons[] = {
1486 [KMSG_DUMP_OOPS] = "oops", 1516 [KMSG_DUMP_OOPS] = "oops",
1487 [KMSG_DUMP_PANIC] = "panic", 1517 [KMSG_DUMP_PANIC] = "panic",
1488 [KMSG_DUMP_KEXEC] = "kexec", 1518 [KMSG_DUMP_KEXEC] = "kexec",
@@ -1520,9 +1550,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1520 chars = logged_chars; 1550 chars = logged_chars;
1521 spin_unlock_irqrestore(&logbuf_lock, flags); 1551 spin_unlock_irqrestore(&logbuf_lock, flags);
1522 1552
1523 if (logged_chars > end) { 1553 if (chars > end) {
1524 s1 = log_buf + log_buf_len - logged_chars + end; 1554 s1 = log_buf + log_buf_len - chars + end;
1525 l1 = logged_chars - end; 1555 l1 = chars - end;
1526 1556
1527 s2 = log_buf; 1557 s2 = log_buf;
1528 l2 = end; 1558 l2 = end;
@@ -1530,8 +1560,8 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1530 s1 = ""; 1560 s1 = "";
1531 l1 = 0; 1561 l1 = 0;
1532 1562
1533 s2 = log_buf + end - logged_chars; 1563 s2 = log_buf + end - chars;
1534 l2 = logged_chars; 1564 l2 = chars;
1535 } 1565 }
1536 1566
1537 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1567 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
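The kmsg_dump() fix above consistently uses the snapshotted chars value when splitting the circular log buffer into (s1, l1) and (s2, l2). The same split as a standalone helper, with buf/len/end/chars standing in for log_buf, log_buf_len, end and the snapshot of logged_chars:

    /* Sketch: last 'chars' bytes of a ring buffer that currently ends at 'end'. */
    static void ring_last_chars(const char *buf, size_t len, size_t end, size_t chars,
                                const char **s1, size_t *l1,
                                const char **s2, size_t *l2)
    {
            if (chars > end) {              /* the span wraps past the buffer end */
                    *s1 = buf + len - chars + end;
                    *l1 = chars - end;
                    *s2 = buf;
                    *l2 = end;
            } else {                        /* contiguous span ending at 'end' */
                    *s1 = "";
                    *l1 = 0;
                    *s2 = buf + end - chars;
                    *l2 = chars;
            }
    }
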
diff --git a/kernel/profile.c b/kernel/profile.c
index b22a899934cc..66f841b7fbd3 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
555static const struct file_operations proc_profile_operations = { 555static const struct file_operations proc_profile_operations = {
556 .read = read_profile, 556 .read = read_profile,
557 .write = write_profile, 557 .write = write_profile,
558 .llseek = default_llseek,
558}; 559};
559 560
560#ifdef CONFIG_SMP 561#ifdef CONFIG_SMP
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 74a3d693c196..99bbaa3e5b0d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
181 * under ptrace. 181 * under ptrace.
182 */ 182 */
183 retval = -ERESTARTNOINTR; 183 retval = -ERESTARTNOINTR;
184 if (mutex_lock_interruptible(&task->cred_guard_mutex)) 184 if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
185 goto out; 185 goto out;
186 186
187 task_lock(task); 187 task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
208unlock_tasklist: 208unlock_tasklist:
209 write_unlock_irq(&tasklist_lock); 209 write_unlock_irq(&tasklist_lock);
210unlock_creds: 210unlock_creds:
211 mutex_unlock(&task->cred_guard_mutex); 211 mutex_unlock(&task->signal->cred_guard_mutex);
212out: 212out:
213 return retval; 213 return retval;
214} 214}
@@ -324,26 +324,34 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
324} 324}
325 325
326/* 326/*
327 * Detach all tasks we were using ptrace on. 327 * Detach all tasks we were using ptrace on. Called with tasklist held
328 * for writing, and returns with it held too. But note it can release
329 * and reacquire the lock.
328 */ 330 */
329void exit_ptrace(struct task_struct *tracer) 331void exit_ptrace(struct task_struct *tracer)
332 __releases(&tasklist_lock)
333 __acquires(&tasklist_lock)
330{ 334{
331 struct task_struct *p, *n; 335 struct task_struct *p, *n;
332 LIST_HEAD(ptrace_dead); 336 LIST_HEAD(ptrace_dead);
333 337
334 write_lock_irq(&tasklist_lock); 338 if (likely(list_empty(&tracer->ptraced)))
339 return;
340
335 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 341 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
336 if (__ptrace_detach(tracer, p)) 342 if (__ptrace_detach(tracer, p))
337 list_add(&p->ptrace_entry, &ptrace_dead); 343 list_add(&p->ptrace_entry, &ptrace_dead);
338 } 344 }
339 write_unlock_irq(&tasklist_lock);
340 345
346 write_unlock_irq(&tasklist_lock);
341 BUG_ON(!list_empty(&tracer->ptraced)); 347 BUG_ON(!list_empty(&tracer->ptraced));
342 348
343 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { 349 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
344 list_del_init(&p->ptrace_entry); 350 list_del_init(&p->ptrace_entry);
345 release_task(p); 351 release_task(p);
346 } 352 }
353
354 write_lock_irq(&tasklist_lock);
347} 355}
348 356
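
The new __releases()/__acquires() annotations on exit_ptrace() document, for sparse, that the function temporarily drops a lock its caller holds. The same annotation pattern on a generic helper, sketched here with a made-up spinlock:

    /* Sketch: annotate a helper that drops and retakes a caller-held lock. */
    static DEFINE_SPINLOCK(my_lock);

    static void do_work_dropping_lock(void)
            __releases(&my_lock)
            __acquires(&my_lock)
    {
            spin_unlock(&my_lock);
            /* ... work that must not run under the lock ... */
            spin_lock(&my_lock);
    }
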
349int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 357int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
@@ -396,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
396 return copied; 404 return copied;
397} 405}
398 406
399static int ptrace_setoptions(struct task_struct *child, long data) 407static int ptrace_setoptions(struct task_struct *child, unsigned long data)
400{ 408{
401 child->ptrace &= ~PT_TRACE_MASK; 409 child->ptrace &= ~PT_TRACE_MASK;
402 410
@@ -475,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
475#define is_sysemu_singlestep(request) 0 483#define is_sysemu_singlestep(request) 0
476#endif 484#endif
477 485
478static int ptrace_resume(struct task_struct *child, long request, long data) 486static int ptrace_resume(struct task_struct *child, long request,
487 unsigned long data)
479{ 488{
480 if (!valid_signal(data)) 489 if (!valid_signal(data))
481 return -EIO; 490 return -EIO;
@@ -552,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
552#endif 561#endif
553 562
554int ptrace_request(struct task_struct *child, long request, 563int ptrace_request(struct task_struct *child, long request,
555 long addr, long data) 564 unsigned long addr, unsigned long data)
556{ 565{
557 int ret = -EIO; 566 int ret = -EIO;
558 siginfo_t siginfo; 567 siginfo_t siginfo;
568 void __user *datavp = (void __user *) data;
569 unsigned long __user *datalp = datavp;
559 570
560 switch (request) { 571 switch (request) {
561 case PTRACE_PEEKTEXT: 572 case PTRACE_PEEKTEXT:
@@ -572,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request,
572 ret = ptrace_setoptions(child, data); 583 ret = ptrace_setoptions(child, data);
573 break; 584 break;
574 case PTRACE_GETEVENTMSG: 585 case PTRACE_GETEVENTMSG:
575 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 586 ret = put_user(child->ptrace_message, datalp);
576 break; 587 break;
577 588
578 case PTRACE_GETSIGINFO: 589 case PTRACE_GETSIGINFO:
579 ret = ptrace_getsiginfo(child, &siginfo); 590 ret = ptrace_getsiginfo(child, &siginfo);
580 if (!ret) 591 if (!ret)
581 ret = copy_siginfo_to_user((siginfo_t __user *) data, 592 ret = copy_siginfo_to_user(datavp, &siginfo);
582 &siginfo);
583 break; 593 break;
584 594
585 case PTRACE_SETSIGINFO: 595 case PTRACE_SETSIGINFO:
586 if (copy_from_user(&siginfo, (siginfo_t __user *) data, 596 if (copy_from_user(&siginfo, datavp, sizeof siginfo))
587 sizeof siginfo))
588 ret = -EFAULT; 597 ret = -EFAULT;
589 else 598 else
590 ret = ptrace_setsiginfo(child, &siginfo); 599 ret = ptrace_setsiginfo(child, &siginfo);
@@ -615,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request,
615 } 624 }
616 mmput(mm); 625 mmput(mm);
617 626
618 ret = put_user(tmp, (unsigned long __user *) data); 627 ret = put_user(tmp, datalp);
619 break; 628 break;
620 } 629 }
621#endif 630#endif
@@ -644,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request,
644 case PTRACE_SETREGSET: 653 case PTRACE_SETREGSET:
645 { 654 {
646 struct iovec kiov; 655 struct iovec kiov;
647 struct iovec __user *uiov = (struct iovec __user *) data; 656 struct iovec __user *uiov = datavp;
648 657
649 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) 658 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
650 return -EFAULT; 659 return -EFAULT;
@@ -685,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
685#define arch_ptrace_attach(child) do { } while (0) 694#define arch_ptrace_attach(child) do { } while (0)
686#endif 695#endif
687 696
688SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) 697SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
698 unsigned long, data)
689{ 699{
690 struct task_struct *child; 700 struct task_struct *child;
691 long ret; 701 long ret;
@@ -726,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
726 return ret; 736 return ret;
727} 737}
728 738
729int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 739int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
740 unsigned long data)
730{ 741{
731 unsigned long tmp; 742 unsigned long tmp;
732 int copied; 743 int copied;
@@ -737,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
737 return put_user(tmp, (unsigned long __user *)data); 748 return put_user(tmp, (unsigned long __user *)data);
738} 749}
739 750
740int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) 751int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
752 unsigned long data)
741{ 753{
742 int copied; 754 int copied;
743 755
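
With data now an unsigned long, requests such as PTRACE_GETEVENTMSG still interpret it as a user-space pointer (datalp above) that the kernel fills in. A minimal user-space sketch of that request, error handling omitted:

    /* User-space sketch: read the event message after a ptrace stop. */
    #include <sys/ptrace.h>
    #include <sys/types.h>

    static unsigned long get_event_msg(pid_t child)
    {
            unsigned long msg = 0;

            ptrace(PTRACE_GETEVENTMSG, child, 0, &msg); /* kernel writes via 'data' */
            return msg;
    }
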
diff --git a/kernel/range.c b/kernel/range.c
index 74e2e6114927..471b66acabb5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -7,10 +7,6 @@
7 7
8#include <linux/range.h> 8#include <linux/range.h>
9 9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) 10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{ 11{
16 if (start >= end) 12 if (start >= end)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 72a8dc9567f5..a23a57a976d1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
74 74
75/** 75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? 76 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
77 * 77 *
78 * Check for bottom half being disabled, which covers both the 78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses 79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) 80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation. 81 * will show the situation. This is useful for debug checks in functions
82 * that require that they be called within an RCU read-side critical
83 * section.
82 * 84 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 85 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
84 */ 86 */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
86{ 88{
87 if (!debug_lockdep_rcu_enabled()) 89 if (!debug_lockdep_rcu_enabled())
88 return 1; 90 return 1;
89 return in_softirq(); 91 return in_softirq() || irqs_disabled();
90} 92}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92 94
@@ -114,3 +116,163 @@ int rcu_my_thread_group_empty(void)
114} 116}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); 117EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */ 118#endif /* #ifdef CONFIG_PROVE_RCU */
119
120#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
121static inline void debug_init_rcu_head(struct rcu_head *head)
122{
123 debug_object_init(head, &rcuhead_debug_descr);
124}
125
126static inline void debug_rcu_head_free(struct rcu_head *head)
127{
128 debug_object_free(head, &rcuhead_debug_descr);
129}
130
131/*
132 * fixup_init is called when:
133 * - an active object is initialized
134 */
135static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
136{
137 struct rcu_head *head = addr;
138
139 switch (state) {
140 case ODEBUG_STATE_ACTIVE:
141 /*
142 * Ensure that queued callbacks are all executed.
143 * If we detect that we are nested in a RCU read-side critical
144 * section, we should simply fail, otherwise we would deadlock.
145 */
146 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
147 irqs_disabled()) {
148 WARN_ON(1);
149 return 0;
150 }
151 rcu_barrier();
152 rcu_barrier_sched();
153 rcu_barrier_bh();
154 debug_object_init(head, &rcuhead_debug_descr);
155 return 1;
156 default:
157 return 0;
158 }
159}
160
161/*
162 * fixup_activate is called when:
163 * - an active object is activated
164 * - an unknown object is activated (might be a statically initialized object)
165 * Activation is performed internally by call_rcu().
166 */
167static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
168{
169 struct rcu_head *head = addr;
170
171 switch (state) {
172
173 case ODEBUG_STATE_NOTAVAILABLE:
174 /*
175 * This is not really a fixup. We just make sure that it is
176 * tracked in the object tracker.
177 */
178 debug_object_init(head, &rcuhead_debug_descr);
179 debug_object_activate(head, &rcuhead_debug_descr);
180 return 0;
181
182 case ODEBUG_STATE_ACTIVE:
183 /*
184 * Ensure that queued callbacks are all executed.
185 * If we detect that we are nested in a RCU read-side critical
186 * section, we should simply fail, otherwise we would deadlock.
187 */
188 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
189 irqs_disabled()) {
190 WARN_ON(1);
191 return 0;
192 }
193 rcu_barrier();
194 rcu_barrier_sched();
195 rcu_barrier_bh();
196 debug_object_activate(head, &rcuhead_debug_descr);
197 return 1;
198 default:
199 return 0;
200 }
201}
202
203/*
204 * fixup_free is called when:
205 * - an active object is freed
206 */
207static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
208{
209 struct rcu_head *head = addr;
210
211 switch (state) {
212 case ODEBUG_STATE_ACTIVE:
213 /*
214 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock.
217 */
218#ifndef CONFIG_PREEMPT
219 WARN_ON(1);
220 return 0;
221#else
222 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
223 irqs_disabled()) {
224 WARN_ON(1);
225 return 0;
226 }
227 rcu_barrier();
228 rcu_barrier_sched();
229 rcu_barrier_bh();
230 debug_object_free(head, &rcuhead_debug_descr);
231 return 1;
232#endif
233 default:
234 return 0;
235 }
236}
237
238/**
239 * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
240 * @head: pointer to rcu_head structure to be initialized
241 *
242 * This function informs debugobjects of a new rcu_head structure that
243 * has been allocated as an auto variable on the stack. This function
244 * is not required for rcu_head structures that are statically defined or
245 * that are dynamically allocated on the heap. This function has no
246 * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
247 */
248void init_rcu_head_on_stack(struct rcu_head *head)
249{
250 debug_object_init_on_stack(head, &rcuhead_debug_descr);
251}
252EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
253
254/**
255 * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
256 * @head: pointer to rcu_head structure to be initialized
257 *
258 * This function informs debugobjects that an on-stack rcu_head structure
259 * is about to go out of scope. As with init_rcu_head_on_stack(), this
260 * function is not required for rcu_head structures that are statically
261 * defined or that are dynamically allocated on the heap. Also as with
262 * init_rcu_head_on_stack(), this function has no effect for
263 * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
264 */
265void destroy_rcu_head_on_stack(struct rcu_head *head)
266{
267 debug_object_free(head, &rcuhead_debug_descr);
268}
269EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
270
271struct debug_obj_descr rcuhead_debug_descr = {
272 .name = "rcu_head",
273 .fixup_init = rcuhead_fixup_init,
274 .fixup_activate = rcuhead_fixup_activate,
275 .fixup_free = rcuhead_fixup_free,
276};
277EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
278#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
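
The two on-stack helpers bracket the lifetime of an rcu_head auto variable for debugobjects. The rcu_barrier()/wakeme_after_rcu pattern appearing later in this series (see rcutiny_plugin.h below) is the canonical use, condensed here as a sketch:

    /* Sketch: wait for a grace period using an on-stack rcu_head. */
    static void wait_for_gp(void)
    {
            struct rcu_synchronize rcu;

            init_rcu_head_on_stack(&rcu.head);      /* announce the stack object */
            init_completion(&rcu.completion);
            call_rcu(&rcu.head, wakeme_after_rcu);  /* fires after a grace period */
            wait_for_completion(&rcu.completion);
            destroy_rcu_head_on_stack(&rcu.head);   /* about to go out of scope */
    }
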
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 38729d3cd236..d806735342ac 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active); 59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 61
62/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
64static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp);
67
68#include "rcutiny_plugin.h"
69
62#ifdef CONFIG_NO_HZ 70#ifdef CONFIG_NO_HZ
63 71
64static long rcu_dynticks_nesting = 1; 72static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
140 rcu_sched_qs(cpu); 148 rcu_sched_qs(cpu);
141 else if (!in_softirq()) 149 else if (!in_softirq())
142 rcu_bh_qs(cpu); 150 rcu_bh_qs(cpu);
151 rcu_preempt_check_callbacks();
143} 152}
144 153
145/* 154/*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 *rcp->donetail = NULL; 171 *rcp->donetail = NULL;
163 if (rcp->curtail == rcp->donetail) 172 if (rcp->curtail == rcp->donetail)
164 rcp->curtail = &rcp->rcucblist; 173 rcp->curtail = &rcp->rcucblist;
174 rcu_preempt_remove_callbacks(rcp);
165 rcp->donetail = &rcp->rcucblist; 175 rcp->donetail = &rcp->rcucblist;
166 local_irq_restore(flags); 176 local_irq_restore(flags);
167 177
@@ -169,6 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
169 while (list) { 179 while (list) {
170 next = list->next; 180 next = list->next;
171 prefetch(next); 181 prefetch(next);
182 debug_rcu_head_unqueue(list);
172 list->func(list); 183 list->func(list);
173 list = next; 184 list = next;
174 } 185 }
@@ -181,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
181{ 192{
182 __rcu_process_callbacks(&rcu_sched_ctrlblk); 193 __rcu_process_callbacks(&rcu_sched_ctrlblk);
183 __rcu_process_callbacks(&rcu_bh_ctrlblk); 194 __rcu_process_callbacks(&rcu_bh_ctrlblk);
195 rcu_preempt_process_callbacks();
184} 196}
185 197
186/* 198/*
@@ -211,6 +223,7 @@ static void __call_rcu(struct rcu_head *head,
211{ 223{
212 unsigned long flags; 224 unsigned long flags;
213 225
226 debug_rcu_head_queue(head);
214 head->func = func; 227 head->func = func;
215 head->next = NULL; 228 head->next = NULL;
216 229
@@ -221,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
221} 234}
222 235
223/* 236/*
224 * Post an RCU callback to be invoked after the end of an RCU grace 237 * Post an RCU callback to be invoked after the end of an RCU-sched grace
225 * period. But since we have but one CPU, that would be after any 238 * period. But since we have but one CPU, that would be after any
226 * quiescent state. 239 * quiescent state.
227 */ 240 */
228void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 241void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
229{ 242{
230 __call_rcu(head, func, &rcu_sched_ctrlblk); 243 __call_rcu(head, func, &rcu_sched_ctrlblk);
231} 244}
232EXPORT_SYMBOL_GPL(call_rcu); 245EXPORT_SYMBOL_GPL(call_rcu_sched);
233 246
234/* 247/*
235 * Post an RCU bottom-half callback to be invoked after any subsequent 248 * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -241,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
241} 254}
242EXPORT_SYMBOL_GPL(call_rcu_bh); 255EXPORT_SYMBOL_GPL(call_rcu_bh);
243 256
244void rcu_barrier(void)
245{
246 struct rcu_synchronize rcu;
247
248 init_rcu_head_on_stack(&rcu.head);
249 init_completion(&rcu.completion);
250 /* Will wake me after RCU finished. */
251 call_rcu(&rcu.head, wakeme_after_rcu);
252 /* Wait for it. */
253 wait_for_completion(&rcu.completion);
254 destroy_rcu_head_on_stack(&rcu.head);
255}
256EXPORT_SYMBOL_GPL(rcu_barrier);
257
258void rcu_barrier_bh(void) 257void rcu_barrier_bh(void)
259{ 258{
260 struct rcu_synchronize rcu; 259 struct rcu_synchronize rcu;
@@ -287,5 +286,3 @@ void __init rcu_init(void)
287{ 286{
288 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
289} 288}
290
291#include "rcutiny_plugin.h"
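
The rcu_ctrlblk callback lists manipulated above are singly linked with tail pointers that address the last ->next field, so whole segments move by pointer assignment instead of list walks. A standalone sketch of that idiom (names are illustrative, not the kernel's):

    /* Sketch of the tail-pointer list idiom behind curtail/donetail/nexttail. */
    struct cb {
            struct cb *next;
    };

    struct cb_list {
            struct cb *head;
            struct cb **tail;       /* &head while empty, else &last->next */
    };

    static void cb_enqueue(struct cb_list *l, struct cb *new)
    {
            new->next = NULL;
            *l->tail = new;         /* append in O(1), no walking */
            l->tail = &new->next;
    }
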
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc742..6ceca4f745ff 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 * 19 *
20 * Copyright IBM Corporation, 2009 20 * Copyright (c) 2010 Linaro
21 * 21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#ifdef CONFIG_TINY_PREEMPT_RCU
26
27#include <linux/delay.h>
28
29/* Global control variables for preemptible RCU. */
30struct rcu_preempt_ctrlblk {
31 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
32 struct rcu_head **nexttail;
33 /* Tasks blocked in a preemptible RCU */
34 /* read-side critical section while a */
35 /* preemptible-RCU grace period is in */
36 /* progress must wait for a later grace */
37 /* period. This pointer points to the */
38 /* ->next pointer of the last task that */
39 /* must wait for a later grace period, or */
40 /* to &->rcb.rcucblist if there is no */
41 /* such task. */
42 struct list_head blkd_tasks;
43 /* Tasks blocked in RCU read-side critical */
44 /* section. Tasks are placed at the head */
45 /* of this list and age towards the tail. */
46 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */
49 /* is no such task. */
50 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */
54 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */
56 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */
60};
61
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
63 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
64 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
65 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
66 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
67};
68
69static int rcu_preempted_readers_exp(void);
70static void rcu_report_exp_done(void);
71
72/*
73 * Return true if the CPU has not yet responded to the current grace period.
74 */
75static int rcu_cpu_blocking_cur_gp(void)
76{
77 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
78}
79
80/*
81 * Check for a running RCU reader. Because there is only one CPU,
82 * there can be but one running RCU reader at a time. ;-)
83 */
84static int rcu_preempt_running_reader(void)
85{
86 return current->rcu_read_lock_nesting;
87}
88
89/*
90 * Check for preempted RCU readers blocking any grace period.
91 * If the caller needs a reliable answer, it must disable hard irqs.
92 */
93static int rcu_preempt_blocked_readers_any(void)
94{
95 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
96}
97
98/*
99 * Check for preempted RCU readers blocking the current grace period.
100 * If the caller needs a reliable answer, it must disable hard irqs.
101 */
102static int rcu_preempt_blocked_readers_cgp(void)
103{
104 return rcu_preempt_ctrlblk.gp_tasks != NULL;
105}
106
107/*
108 * Return true if another preemptible-RCU grace period is needed.
109 */
110static int rcu_preempt_needs_another_gp(void)
111{
112 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
113}
114
115/*
116 * Return true if a preemptible-RCU grace period is in progress.
117 * The caller must disable hardirqs.
118 */
119static int rcu_preempt_gp_in_progress(void)
120{
121 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
122}
123
124/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked
128 * while in an RCU read-side critical section.
129 *
130 * Unlike the other rcu_*_qs() functions, callers to this function
131 * must disable irqs in order to protect the assignment to
132 * ->rcu_read_unlock_special.
133 *
134 * Because this is a single-CPU implementation, the only way a grace
135 * period can end is if the CPU is in a quiescent state. The reason is
136 * that a blocked preemptible-RCU reader can exit its critical section
137 * only if the CPU is running it at the time. Therefore, when the
138 * last task blocking the current grace period exits its RCU read-side
139 * critical section, neither the CPU nor blocked tasks will be stopping
140 * the current grace period. (In contrast, SMP implementations
141 * might have CPUs running in RCU read-side critical sections that
142 * block later grace periods -- but this is not possible given only
143 * one CPU.)
144 */
145static void rcu_preempt_cpu_qs(void)
146{
147 /* Record both CPU and task as having responded to current GP. */
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150
151 /*
152 * If there is no GP, or if blocked readers are still blocking GP,
153 * then there is nothing more to do.
154 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
156 return;
157
158 /* Advance callbacks. */
159 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
160 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
161 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
162
163 /* If there are no blocked readers, next GP is done instantly. */
164 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ);
170}
171
172/*
173 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
174 */
175static void rcu_preempt_start_gp(void)
176{
177 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
178
179 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++;
181
182 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next;
186
187 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs();
190 }
191}
192
193/*
194 * We have entered the scheduler, and the current task might soon be
195 * context-switched away from. If this task is in an RCU read-side
196 * critical section, we will no longer be able to rely on the CPU to
197 * record that fact, so we enqueue the task on the blkd_tasks list.
198 * If the task started after the current grace period began, as recorded
199 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
200 * before the element referenced by ->gp_tasks (or at the tail if
201 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
202 * The task will dequeue itself when it exits the outermost enclosing
203 * RCU read-side critical section. Therefore, the current grace period
204 * cannot be permitted to complete until the ->gp_tasks pointer becomes
205 * NULL.
206 *
207 * Caller must disable preemption.
208 */
209void rcu_preempt_note_context_switch(void)
210{
211 struct task_struct *t = current;
212 unsigned long flags;
213
214 local_irq_save(flags); /* must exclude scheduler_tick(). */
215 if (rcu_preempt_running_reader() &&
216 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
217
218 /* Possibly blocking in an RCU read-side critical section. */
219 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
220
221 /*
222 * If this CPU has already checked in, then this task
223 * will hold up the next grace period rather than the
224 * current grace period. Queue the task accordingly.
225 * If the task is queued for the current grace period
226 * (i.e., this CPU has not yet passed through a quiescent
227 * state for the current grace period), then as long
228 * as that task remains queued, the current grace period
229 * cannot end.
230 */
231 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
232 if (rcu_cpu_blocking_cur_gp())
233 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
234 }
235
236 /*
237 * Either we were not in an RCU read-side critical section to
238 * begin with, or we have now recorded that critical section
239 * globally. Either way, we can now note a quiescent state
240 * for this CPU. Again, if we were in an RCU read-side critical
241 * section, and if that critical section was blocking the current
242 * grace period, then the fact that the task has been enqueued
243 * means that current grace period continues to be blocked.
244 */
245 rcu_preempt_cpu_qs();
246 local_irq_restore(flags);
247}
248
249/*
250 * Tiny-preemptible RCU implementation for rcu_read_lock().
251 * Just increment ->rcu_read_lock_nesting, shared state will be updated
252 * if we block.
253 */
254void __rcu_read_lock(void)
255{
256 current->rcu_read_lock_nesting++;
257 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
258}
259EXPORT_SYMBOL_GPL(__rcu_read_lock);
260
261/*
262 * Handle special cases during rcu_read_unlock(), such as needing to
263 * notify RCU core processing or task having blocked during the RCU
264 * read-side critical section.
265 */
266static void rcu_read_unlock_special(struct task_struct *t)
267{
268 int empty;
269 int empty_exp;
270 unsigned long flags;
271 struct list_head *np;
272 int special;
273
274 /*
275 * NMI handlers cannot block and cannot safely manipulate state.
276 * They therefore cannot possibly be special, so just leave.
277 */
278 if (in_nmi())
279 return;
280
281 local_irq_save(flags);
282
283 /*
284 * If RCU core is waiting for this CPU to exit critical section,
285 * let it know that we have done so.
286 */
287 special = t->rcu_read_unlock_special;
288 if (special & RCU_READ_UNLOCK_NEED_QS)
289 rcu_preempt_cpu_qs();
290
291 /* Hardware IRQ handlers cannot block. */
292 if (in_irq()) {
293 local_irq_restore(flags);
294 return;
295 }
296
297 /* Clean up if blocked during RCU read-side critical section. */
298 if (special & RCU_READ_UNLOCK_BLOCKED) {
299 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
300
301 /*
302 * Remove this task from the ->blkd_tasks list and adjust
303 * any pointers that might have been referencing it.
304 */
305 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next;
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np;
315 INIT_LIST_HEAD(&t->rcu_node_entry);
316
317 /*
318 * If this was the last task on the current list, and if
319 * we aren't waiting on the CPU, report the quiescent state
320 * and start a new grace period if needed.
321 */
322 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
323 rcu_preempt_cpu_qs();
324 rcu_preempt_start_gp();
325 }
326
327 /*
328 * If this was the last task on the expedited lists,
329 * then we need to wake up the waiting task.
330 */
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done();
333 }
334 local_irq_restore(flags);
335}
336
337/*
338 * Tiny-preemptible RCU implementation for rcu_read_unlock().
339 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
340 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
341 * invoke rcu_read_unlock_special() to clean up after a context switch
342 * in an RCU read-side critical section and other special cases.
343 */
344void __rcu_read_unlock(void)
345{
346 struct task_struct *t = current;
347
348 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
349 --t->rcu_read_lock_nesting;
350 barrier(); /* decrement before load of ->rcu_read_unlock_special */
351 if (t->rcu_read_lock_nesting == 0 &&
352 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
353 rcu_read_unlock_special(t);
354#ifdef CONFIG_PROVE_LOCKING
355 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
356#endif /* #ifdef CONFIG_PROVE_LOCKING */
357}
358EXPORT_SYMBOL_GPL(__rcu_read_unlock);
359
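
__rcu_read_lock()/__rcu_read_unlock() above only maintain the per-task nesting count; callers use the ordinary reader pattern. A minimal reader-side sketch, where gp and struct foo are illustrative names:

    /* Reader-side sketch; 'gp' and 'struct foo' are made-up names. */
    struct foo {
            int a;
    };
    static struct foo __rcu *gp;

    static int read_a(void)
    {
            struct foo *p;
            int a = -1;

            rcu_read_lock();                /* bumps ->rcu_read_lock_nesting */
            p = rcu_dereference(gp);        /* fetch the protected pointer */
            if (p)
                    a = p->a;
            rcu_read_unlock();              /* may enter rcu_read_unlock_special() */
            return a;
    }
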
360/*
361 * Check for a quiescent state from the current CPU. When a task blocks,
362 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
363 * checked elsewhere. This is called from the scheduling-clock interrupt.
364 *
365 * Caller must disable hard irqs.
366 */
367static void rcu_preempt_check_callbacks(void)
368{
369 struct task_struct *t = current;
370
371 if (rcu_preempt_gp_in_progress() &&
372 (!rcu_preempt_running_reader() ||
373 !rcu_cpu_blocking_cur_gp()))
374 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ);
378 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader())
381 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
382}
383
384/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there
390 * is no need for an explicit check.
391 */
392static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
393{
394 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
395 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
396}
397
398/*
399 * Process callbacks for preemptible RCU.
400 */
401static void rcu_preempt_process_callbacks(void)
402{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404}
405
406/*
407 * Queue a preemptible-RCU callback for invocation after a grace period.
408 */
409void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
410{
411 unsigned long flags;
412
413 debug_rcu_head_queue(head);
414 head->func = func;
415 head->next = NULL;
416
417 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next;
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags);
422}
423EXPORT_SYMBOL_GPL(call_rcu);
424
425void rcu_barrier(void)
426{
427 struct rcu_synchronize rcu;
428
429 init_rcu_head_on_stack(&rcu.head);
430 init_completion(&rcu.completion);
431 /* Will wake me after RCU finished. */
432 call_rcu(&rcu.head, wakeme_after_rcu);
433 /* Wait for it. */
434 wait_for_completion(&rcu.completion);
435 destroy_rcu_head_on_stack(&rcu.head);
436}
437EXPORT_SYMBOL_GPL(rcu_barrier);
438
439/*
440 * synchronize_rcu - wait until a grace period has elapsed.
441 *
442 * Control will return to the caller some time after a full grace
443 * period has elapsed, in other words after all currently executing RCU
444 * read-side critical sections have completed. RCU read-side critical
445 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
446 * and may be nested.
447 */
448void synchronize_rcu(void)
449{
450#ifdef CONFIG_DEBUG_LOCK_ALLOC
451 if (!rcu_scheduler_active)
452 return;
453#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
454
455 WARN_ON_ONCE(rcu_preempt_running_reader());
456 if (!rcu_preempt_blocked_readers_any())
457 return;
458
459 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
460 rcu_barrier();
461}
462EXPORT_SYMBOL_GPL(synchronize_rcu);
463
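
synchronize_rcu() above returns only after all pre-existing readers are done, so the classic update pattern publishes the new version with rcu_assign_pointer() and frees the old one afterwards. A sketch using the same illustrative gp and struct foo as the reader sketch earlier:

    /* Updater-side sketch; pairs with the reader sketch above. */
    static void update_a(struct foo *newp)
    {
            struct foo *old;

            old = rcu_dereference_protected(gp, 1); /* caller serializes updates */
            rcu_assign_pointer(gp, newp);           /* publish the new version */
            synchronize_rcu();                      /* wait out existing readers */
            kfree(old);                             /* now nobody can still see it */
    }
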
464static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
465static unsigned long sync_rcu_preempt_exp_count;
466static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
467
468/*
469 * Return non-zero if there are any tasks in RCU read-side critical
470 * sections blocking the current preemptible-RCU expedited grace period.
471 * If there is no preemptible-RCU expedited grace period currently in
472 * progress, returns zero unconditionally.
473 */
474static int rcu_preempted_readers_exp(void)
475{
476 return rcu_preempt_ctrlblk.exp_tasks != NULL;
477}
478
479/*
480 * Report the exit from RCU read-side critical section for the last task
481 * that queued itself during or before the current expedited preemptible-RCU
482 * grace period.
483 */
484static void rcu_report_exp_done(void)
485{
486 wake_up(&sync_rcu_preempt_exp_wq);
487}
488
489/*
490 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
491 * is to rely on the fact that there is but one CPU, and that it is
492 * illegal for a task to invoke synchronize_rcu_expedited() while in a
493 * preemptible-RCU read-side critical section. Therefore, any such
494 * critical sections must correspond to blocked tasks, which must therefore
495 * be on the ->blkd_tasks list. So just record the current head of the
496 * list in the ->exp_tasks pointer, and wait for all tasks including and
497 * after the task pointed to by ->exp_tasks to drain.
498 */
499void synchronize_rcu_expedited(void)
500{
501 unsigned long flags;
502 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
503 unsigned long snap;
504
505 barrier(); /* ensure prior action seen before grace period. */
506
507 WARN_ON_ONCE(rcu_preempt_running_reader());
508
509 /*
510 * Acquire lock so that there is only one preemptible RCU grace
511 * period in flight. Of course, if someone does the expedited
512 * grace period for us while we are acquiring the lock, just leave.
513 */
514 snap = sync_rcu_preempt_exp_count + 1;
515 mutex_lock(&sync_rcu_preempt_exp_mutex);
516 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
517 goto unlock_mb_ret; /* Others did our work for us. */
518
519 local_irq_save(flags);
520
521 /*
522 * All RCU readers have to already be on blkd_tasks because
523 * we cannot legally be executing in an RCU read-side critical
524 * section.
525 */
526
527 /* Snapshot current head of ->blkd_tasks list. */
528 rpcp->exp_tasks = rpcp->blkd_tasks.next;
529 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
530 rpcp->exp_tasks = NULL;
531 local_irq_restore(flags);
532
533 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp())
535 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp());
537
538 /* Clean up and exit. */
539 barrier(); /* ensure expedited GP seen before counter increment. */
540 sync_rcu_preempt_exp_count++;
541unlock_mb_ret:
542 mutex_unlock(&sync_rcu_preempt_exp_mutex);
543 barrier(); /* ensure subsequent action seen after grace period. */
544}
545EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
546
547/*
548 * Does preemptible RCU need the CPU to stay out of dynticks mode?
549 */
550int rcu_preempt_needs_cpu(void)
551{
552 if (!rcu_preempt_running_reader())
553 rcu_preempt_cpu_qs();
554 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
555}
556
557/*
558 * Check for a task exiting while in a preemptible-RCU read-side
559 * critical section, clean up if so. No need to issue warnings,
560 * as debug_check_no_locks_held() already does this if lockdep
561 * is enabled.
562 */
563void exit_rcu(void)
564{
565 struct task_struct *t = current;
566
567 if (t->rcu_read_lock_nesting == 0)
568 return;
569 t->rcu_read_lock_nesting = 1;
570 rcu_read_unlock();
571}
572
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574
575/*
576 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check.
578 */
579static void rcu_preempt_check_callbacks(void)
580{
581}
582
583/*
584 * Because preemptible RCU does not exist, it never has any callbacks
585 * to remove.
586 */
587static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
588{
589}
590
591/*
592 * Because preemptible RCU does not exist, it never has any callbacks
593 * to process.
594 */
595static void rcu_preempt_process_callbacks(void)
596{
597}
598
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600
25#ifdef CONFIG_DEBUG_LOCK_ALLOC 601#ifdef CONFIG_DEBUG_LOCK_ALLOC
26 602
27#include <linux/kernel_stat.h> 603#include <linux/kernel_stat.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6535ac8bc6a5..9d8e8fb2515f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
120}; 120};
121 121
122static LIST_HEAD(rcu_torture_freelist); 122static LIST_HEAD(rcu_torture_freelist);
123static struct rcu_torture *rcu_torture_current; 123static struct rcu_torture __rcu *rcu_torture_current;
124static long rcu_torture_current_version; 124static long rcu_torture_current_version;
125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
126static DEFINE_SPINLOCK(rcu_torture_lock); 126static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ 153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ 154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
155static int fullstop = FULLSTOP_RMMOD; 155static int fullstop = FULLSTOP_RMMOD;
156DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ 156/*
157 /* of kthreads. */ 157 * Protect fullstop transitions and spawning of kthreads.
158 */
159static DEFINE_MUTEX(fullstop_mutex);
158 160
159/* 161/*
160 * Detect and respond to a system shutdown. 162 * Detect and respond to a system shutdown.
@@ -239,8 +241,7 @@ static unsigned long
239rcu_random(struct rcu_random_state *rrsp) 241rcu_random(struct rcu_random_state *rrsp)
240{ 242{
241 if (--rrsp->rrs_count < 0) { 243 if (--rrsp->rrs_count < 0) {
242 rrsp->rrs_state += 244 rrsp->rrs_state += (unsigned long)local_clock();
243 (unsigned long)cpu_clock(raw_smp_processor_id());
244 rrsp->rrs_count = RCU_RANDOM_REFRESH; 245 rrsp->rrs_count = RCU_RANDOM_REFRESH;
245 } 246 }
246 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 247 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
@@ -304,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
304 mdelay(longdelay_ms); 305 mdelay(longdelay_ms);
305 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 306 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
306 udelay(shortdelay_us); 307 udelay(shortdelay_us);
308#ifdef CONFIG_PREEMPT
309 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
310 preempt_schedule(); /* No QS if preempt_disable() in effect */
311#endif
307} 312}
308 313
309static void rcu_torture_read_unlock(int idx) __releases(RCU) 314static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -537,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
537 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 542 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
538 if (!delay) 543 if (!delay)
539 schedule_timeout_interruptible(longdelay); 544 schedule_timeout_interruptible(longdelay);
545 else
546 rcu_read_delay(rrsp);
540} 547}
541 548
542static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) 549static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -732,7 +739,8 @@ rcu_torture_writer(void *arg)
732 continue; 739 continue;
733 rp->rtort_pipe_count = 0; 740 rp->rtort_pipe_count = 0;
734 udelay(rcu_random(&rand) & 0x3ff); 741 udelay(rcu_random(&rand) & 0x3ff);
735 old_rp = rcu_torture_current; 742 old_rp = rcu_dereference_check(rcu_torture_current,
743 current == writer_task);
736 rp->rtort_mbtest = 1; 744 rp->rtort_mbtest = 1;
737 rcu_assign_pointer(rcu_torture_current, rp); 745 rcu_assign_pointer(rcu_torture_current, rp);
738 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ 746 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
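
The __rcu annotation plus rcu_dereference_check() lets sparse and lockdep-RCU verify accesses: readers go through rcu_dereference(), while the writer justifies its read with an explicit condition (current == writer_task above). A minimal sketch of the same pattern with a lock-based condition (names illustrative):

    /* Sketch: reading an __rcu pointer outside rcu_read_lock(), justified to
     * lockdep-RCU by an explicit condition (here: holding the update lock). */
    struct foo;
    static DEFINE_SPINLOCK(my_lock);
    static struct foo __rcu *my_ptr;

    static struct foo *current_foo_locked(void)
    {
            return rcu_dereference_check(my_ptr, lockdep_is_held(&my_lock));
    }
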
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d4437345706f..ccdc04c47981 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
143module_param(qhimark, int, 0); 143module_param(qhimark, int, 0);
144module_param(qlowmark, int, 0); 144module_param(qlowmark, int, 0);
145 145
146#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
147int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
148module_param(rcu_cpu_stall_suppress, int, 0644);
149#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
150
146static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 151static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
147static int rcu_pending(int cpu); 152static int rcu_pending(int cpu);
148 153
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 455
451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 456#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
452 457
453int rcu_cpu_stall_panicking __read_mostly; 458int rcu_cpu_stall_suppress __read_mostly;
454 459
455static void record_gp_stall_check_time(struct rcu_state *rsp) 460static void record_gp_stall_check_time(struct rcu_state *rsp)
456{ 461{
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
482 rcu_print_task_stall(rnp); 487 rcu_print_task_stall(rnp);
483 raw_spin_unlock_irqrestore(&rnp->lock, flags); 488 raw_spin_unlock_irqrestore(&rnp->lock, flags);
484 489
485 /* OK, time to rat on our buddy... */ 490 /*
486 491 * OK, time to rat on our buddy...
492 * See Documentation/RCU/stallwarn.txt for info on how to debug
493 * RCU CPU stall warnings.
494 */
487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 495 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name); 496 rsp->name);
489 rcu_for_each_leaf_node(rsp, rnp) { 497 rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
512 unsigned long flags; 520 unsigned long flags;
513 struct rcu_node *rnp = rcu_get_root(rsp); 521 struct rcu_node *rnp = rcu_get_root(rsp);
514 522
523 /*
524 * OK, time to rat on ourselves...
525 * See Documentation/RCU/stallwarn.txt for info on how to debug
526 * RCU CPU stall warnings.
527 */
515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 528 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 529 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
517 trigger_all_cpu_backtrace(); 530 trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
530 long delta; 543 long delta;
531 struct rcu_node *rnp; 544 struct rcu_node *rnp;
532 545
533 if (rcu_cpu_stall_panicking) 546 if (rcu_cpu_stall_suppress)
534 return; 547 return;
535 delta = jiffies - rsp->jiffies_stall; 548 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
536 rnp = rdp->mynode; 549 rnp = rdp->mynode;
537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 550 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
538 551
539 /* We haven't checked in, so go dump stack. */ 552 /* We haven't checked in, so go dump stack. */
540 print_cpu_stall(rsp); 553 print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
548 561
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 562static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{ 563{
551 rcu_cpu_stall_panicking = 1; 564 rcu_cpu_stall_suppress = 1;
552 return NOTIFY_DONE; 565 return NOTIFY_DONE;
553} 566}
554 567
568/**
569 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
570 *
571 * Set the stall-warning timeout way off into the future, thus preventing
572 * any RCU CPU stall-warning messages from appearing in the current set of
573 * RCU grace periods.
574 *
575 * The caller must disable hard irqs.
576 */
577void rcu_cpu_stall_reset(void)
578{
579 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
580 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
581 rcu_preempt_stall_reset();
582}
583
555static struct notifier_block rcu_panic_block = { 584static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic, 585 .notifier_call = rcu_panic,
557}; 586};
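A quick illustration of why rcu_cpu_stall_reset() above works: check_cpu_stall() compares jiffies against jiffies_stall with a signed difference, so pushing jiffies_stall out by ULONG_MAX / 2 keeps that difference negative for roughly half the counter's range, far longer than any grace period. The following stand-alone sketch only reproduces that idiom (the helper name stall_due() and the sample values are made up; this is not code from the patch):

#include <limits.h>
#include <stdio.h>

/* Same idiom as check_cpu_stall(): a signed difference of unsigned
 * counters handles wraparound of jiffies.
 */
static int stall_due(unsigned long now, unsigned long jiffies_stall)
{
	long delta = now - jiffies_stall;

	return delta >= 0;
}

int main(void)
{
	unsigned long jiffies = 4294937296UL;		/* arbitrary "now" */
	unsigned long soon  = jiffies + 10 * 250;	/* ~10 s at HZ=250 */
	unsigned long reset = jiffies + ULONG_MAX / 2;	/* rcu_cpu_stall_reset() */

	printf("%d\n", stall_due(jiffies + 3000, soon));	/* 1: deadline passed */
	printf("%d\n", stall_due(jiffies + 3000, reset));	/* 0: pushed far away */
	return 0;
}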
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
571{ 600{
572} 601}
573 602
603void rcu_cpu_stall_reset(void)
604{
605}
606
574static void __init check_cpu_stall_init(void) 607static void __init check_cpu_stall_init(void)
575{ 608{
576} 609}
@@ -712,7 +745,7 @@ static void
712rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 745rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
713 __releases(rcu_get_root(rsp)->lock) 746 __releases(rcu_get_root(rsp)->lock)
714{ 747{
715 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 748 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
716 struct rcu_node *rnp = rcu_get_root(rsp); 749 struct rcu_node *rnp = rcu_get_root(rsp);
717 750
718 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 751 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
960static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
961{ 994{
962 int i; 995 int i;
963 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
964 997
965 if (rdp->nxtlist == NULL) 998 if (rdp->nxtlist == NULL)
966 return; /* irqs disabled, so comparison is stable. */ 999 return; /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
971 for (i = 0; i < RCU_NEXT_SIZE; i++) 1004 for (i = 0; i < RCU_NEXT_SIZE; i++)
972 rdp->nxttail[i] = &rdp->nxtlist; 1005 rdp->nxttail[i] = &rdp->nxtlist;
973 rsp->orphan_qlen += rdp->qlen; 1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
974 rdp->qlen = 0; 1008 rdp->qlen = 0;
975 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
976} 1010}
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
984 struct rcu_data *rdp; 1018 struct rcu_data *rdp;
985 1019
986 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
987 rdp = rsp->rda[smp_processor_id()]; 1021 rdp = this_cpu_ptr(rsp->rda);
988 if (rsp->orphan_cbs_list == NULL) { 1022 if (rsp->orphan_cbs_list == NULL) {
989 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
990 return; 1024 return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
992 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
993 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; 1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
994 rdp->qlen += rsp->orphan_qlen; 1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
995 rsp->orphan_cbs_list = NULL; 1030 rsp->orphan_cbs_list = NULL;
996 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
997 rsp->orphan_qlen = 0; 1032 rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1007 unsigned long flags; 1042 unsigned long flags;
1008 unsigned long mask; 1043 unsigned long mask;
1009 int need_report = 0; 1044 int need_report = 0;
1010 struct rcu_data *rdp = rsp->rda[cpu]; 1045 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1011 struct rcu_node *rnp; 1046 struct rcu_node *rnp;
1012 1047
1013 /* Exclude any attempts to start a new grace period. */ 1048 /* Exclude any attempts to start a new grace period. */
@@ -1112,6 +1147,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1112 while (list) { 1147 while (list) {
1113 next = list->next; 1148 next = list->next;
1114 prefetch(next); 1149 prefetch(next);
1150 debug_rcu_head_unqueue(list);
1115 list->func(list); 1151 list->func(list);
1116 list = next; 1152 list = next;
1117 if (++count >= rdp->blimit) 1153 if (++count >= rdp->blimit)
@@ -1122,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1122 1158
1123 /* Update count, and requeue any remaining callbacks. */ 1159 /* Update count, and requeue any remaining callbacks. */
1124 rdp->qlen -= count; 1160 rdp->qlen -= count;
1161 rdp->n_cbs_invoked += count;
1125 if (list != NULL) { 1162 if (list != NULL) {
1126 *tail = rdp->nxtlist; 1163 *tail = rdp->nxtlist;
1127 rdp->nxtlist = list; 1164 rdp->nxtlist = list;
@@ -1225,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1225 cpu = rnp->grplo; 1262 cpu = rnp->grplo;
1226 bit = 1; 1263 bit = 1;
1227 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 1264 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1228 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1265 if ((rnp->qsmask & bit) != 0 &&
1266 f(per_cpu_ptr(rsp->rda, cpu)))
1229 mask |= bit; 1267 mask |= bit;
1230 } 1268 }
1231 if (mask != 0) { 1269 if (mask != 0) {
@@ -1388,6 +1426,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1388 unsigned long flags; 1426 unsigned long flags;
1389 struct rcu_data *rdp; 1427 struct rcu_data *rdp;
1390 1428
1429 debug_rcu_head_queue(head);
1391 head->func = func; 1430 head->func = func;
1392 head->next = NULL; 1431 head->next = NULL;
1393 1432
@@ -1400,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1400 * a quiescent state betweentimes. 1439 * a quiescent state betweentimes.
1401 */ 1440 */
1402 local_irq_save(flags); 1441 local_irq_save(flags);
1403 rdp = rsp->rda[smp_processor_id()]; 1442 rdp = this_cpu_ptr(rsp->rda);
1404 rcu_process_gp_end(rsp, rdp); 1443 rcu_process_gp_end(rsp, rdp);
1405 check_for_new_grace_period(rsp, rdp); 1444 check_for_new_grace_period(rsp, rdp);
1406 1445
@@ -1699,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1699{ 1738{
1700 unsigned long flags; 1739 unsigned long flags;
1701 int i; 1740 int i;
1702 struct rcu_data *rdp = rsp->rda[cpu]; 1741 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1703 struct rcu_node *rnp = rcu_get_root(rsp); 1742 struct rcu_node *rnp = rcu_get_root(rsp);
1704 1743
1705 /* Set up local state, ensuring consistent view of global state. */ 1744 /* Set up local state, ensuring consistent view of global state. */
@@ -1727,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1727{ 1766{
1728 unsigned long flags; 1767 unsigned long flags;
1729 unsigned long mask; 1768 unsigned long mask;
1730 struct rcu_data *rdp = rsp->rda[cpu]; 1769 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1731 struct rcu_node *rnp = rcu_get_root(rsp); 1770 struct rcu_node *rnp = rcu_get_root(rsp);
1732 1771
1733 /* Set up local state, ensuring consistent view of global state. */ 1772 /* Set up local state, ensuring consistent view of global state. */
@@ -1863,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1863/* 1902/*
1864 * Helper function for rcu_init() that initializes one rcu_state structure. 1903 * Helper function for rcu_init() that initializes one rcu_state structure.
1865 */ 1904 */
1866static void __init rcu_init_one(struct rcu_state *rsp) 1905static void __init rcu_init_one(struct rcu_state *rsp,
1906 struct rcu_data __percpu *rda)
1867{ 1907{
1868 static char *buf[] = { "rcu_node_level_0", 1908 static char *buf[] = { "rcu_node_level_0",
1869 "rcu_node_level_1", 1909 "rcu_node_level_1",
@@ -1916,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1916 } 1956 }
1917 } 1957 }
1918 1958
1959 rsp->rda = rda;
1919 rnp = rsp->level[NUM_RCU_LVLS - 1]; 1960 rnp = rsp->level[NUM_RCU_LVLS - 1];
1920 for_each_possible_cpu(i) { 1961 for_each_possible_cpu(i) {
1921 while (i > rnp->grphi) 1962 while (i > rnp->grphi)
1922 rnp++; 1963 rnp++;
1923 rsp->rda[i]->mynode = rnp; 1964 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
1924 rcu_boot_init_percpu_data(i, rsp); 1965 rcu_boot_init_percpu_data(i, rsp);
1925 } 1966 }
1926} 1967}
1927 1968
1928/*
1929 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1930 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1931 * structure.
1932 */
1933#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1934do { \
1935 int i; \
1936 \
1937 for_each_possible_cpu(i) { \
1938 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1939 } \
1940 rcu_init_one(rsp); \
1941} while (0)
1942
1943void __init rcu_init(void) 1969void __init rcu_init(void)
1944{ 1970{
1945 int cpu; 1971 int cpu;
1946 1972
1947 rcu_bootup_announce(); 1973 rcu_bootup_announce();
1948 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1974 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1949 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1975 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1950 __rcu_init_preempt(); 1976 __rcu_init_preempt();
1951 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1977 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1952 1978
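For readers following the rsp->rda conversion in this file: the per-flavor NR_CPUS-sized array of rcu_data pointers becomes a single __percpu pointer, initialized from the DEFINE_PER_CPU variable and dereferenced through per_cpu_ptr()/this_cpu_ptr(). A minimal kernel-style sketch of that pattern (the demo_* names are invented for illustration and are not part of the patch):

#include <linux/percpu.h>
#include <linux/smp.h>

struct demo_data {
	int val;
};

/* One instance per CPU, analogous to rcu_sched_data, rcu_bh_data, ... */
static DEFINE_PER_CPU(struct demo_data, demo_data);

/* Analogous to rsp->rda after the conversion. */
static struct demo_data __percpu *demo_rda = &demo_data;

static void demo_init_all(void)
{
	int cpu;

	for_each_possible_cpu(cpu)			/* was: rsp->rda[cpu]-> */
		per_cpu_ptr(demo_rda, cpu)->val = cpu;
}

static void demo_touch_local(void)
{
	/* was: rsp->rda[smp_processor_id()]->; real callers run with
	 * preemption or interrupts disabled, as the RCU code does.
	 */
	this_cpu_ptr(demo_rda)->val++;
}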
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed0..91d4170c5c13 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
202 long qlen; /* # of queued callbacks */ 202 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check; 203 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 204 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
205 unsigned long n_force_qs_snap; 208 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */ 209 /* did other CPU force QS recently? */
207 long blimit; /* Upper limit on a processed batch */ 210 long blimit; /* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
254#define RCU_STALL_DELAY_DELTA 0 257#define RCU_STALL_DELAY_DELTA 0
255#endif 258#endif
256 259
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) 260#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
261 RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */ 262 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) 263#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
260 /* for rsp->jiffies_stall */ 264 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 265#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */ 266 /* to take at least one */
263 /* scheduling clock irq */ 267 /* scheduling clock irq */
264 /* before ratting on them. */ 268 /* before ratting on them. */
265 269
266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 270#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
271#define RCU_CPU_STALL_SUPPRESS_INIT 0
272#else
273#define RCU_CPU_STALL_SUPPRESS_INIT 1
274#endif
267 275
268#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 276#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
269#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
270 277
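A quick spot check of the new timeout arithmetic, assuming (for illustration only) HZ=250, CONFIG_RCU_CPU_STALL_TIMEOUT=60 and RCU_STALL_DELAY_DELTA=0: the first warning fires after 60 * 250 = 15000 jiffies (60 s), and rechecks happen every 3 * 15000 + 30 = 45030 jiffies (about 180 s). The same numbers, computed:

#include <stdio.h>

#define HZ				250	/* assumed for this example */
#define CONFIG_RCU_CPU_STALL_TIMEOUT	60	/* assumed Kconfig value */
#define RCU_STALL_DELAY_DELTA		0
#define RCU_SECONDS_TILL_STALL_CHECK	(CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
					 RCU_STALL_DELAY_DELTA)
#define RCU_SECONDS_TILL_STALL_RECHECK	(3 * RCU_SECONDS_TILL_STALL_CHECK + 30)

int main(void)
{
	printf("check   = %d jiffies (%d s)\n", RCU_SECONDS_TILL_STALL_CHECK,
	       RCU_SECONDS_TILL_STALL_CHECK / HZ);	/* 15000 jiffies, 60 s */
	printf("recheck = %d jiffies (~%d s)\n", RCU_SECONDS_TILL_STALL_RECHECK,
	       RCU_SECONDS_TILL_STALL_RECHECK / HZ);	/* 45030 jiffies, ~180 s */
	return 0;
}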
271/* 278/*
272 * RCU global state, including node hierarchy. This hierarchy is 279 * RCU global state, including node hierarchy. This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
283 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 290 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
284 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 291 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
285 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 292 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
286 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ 293 struct rcu_data __percpu *rda; /* pointer to percpu rcu_data. */
287 294
288 /* The following fields are guarded by the root rcu_node's lock. */ 295 /* The following fields are guarded by the root rcu_node's lock. */
289 296
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
365#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 372#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
366static void rcu_print_detail_task_stall(struct rcu_state *rsp); 373static void rcu_print_detail_task_stall(struct rcu_state *rsp);
367static void rcu_print_task_stall(struct rcu_node *rnp); 374static void rcu_print_task_stall(struct rcu_node *rnp);
375static void rcu_preempt_stall_reset(void);
368#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 376#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
369static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 377static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
370#ifdef CONFIG_HOTPLUG_CPU 378#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d9..71a4147473f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
57 printk(KERN_INFO 57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n"); 58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif 59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE 60#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif 62#endif
63#if NUM_RCU_LVL_4 != 0 63#if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
155 155
156 /* Possibly blocking in an RCU read-side critical section. */ 156 /* Possibly blocking in an RCU read-side critical section. */
157 rdp = rcu_preempt_state.rda[cpu]; 157 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
158 rnp = rdp->mynode; 158 rnp = rdp->mynode;
159 raw_spin_lock_irqsave(&rnp->lock, flags); 159 raw_spin_lock_irqsave(&rnp->lock, flags);
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
201 */ 201 */
202void __rcu_read_lock(void) 202void __rcu_read_lock(void)
203{ 203{
204 ACCESS_ONCE(current->rcu_read_lock_nesting)++; 204 current->rcu_read_lock_nesting++;
205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ 205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
206} 206}
207EXPORT_SYMBOL_GPL(__rcu_read_lock); 207EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
344 struct task_struct *t = current; 344 struct task_struct *t = current;
345 345
346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
347 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 347 --t->rcu_read_lock_nesting;
348 barrier(); /* decrement before load of ->rcu_read_unlock_special */
349 if (t->rcu_read_lock_nesting == 0 &&
348 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 350 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
349 rcu_read_unlock_special(t); 351 rcu_read_unlock_special(t);
350#ifdef CONFIG_PROVE_LOCKING 352#ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
417 } 419 }
418} 420}
419 421
422/*
423 * Suppress preemptible RCU's CPU stall warnings by pushing the
424 * time of the next stall-warning message comfortably far into the
425 * future.
426 */
427static void rcu_preempt_stall_reset(void)
428{
429 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
430}
431
420#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 432#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
421 433
422/* 434/*
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
546 * 558 *
547 * Control will return to the caller some time after a full grace 559 * Control will return to the caller some time after a full grace
548 * period has elapsed, in other words after all currently executing RCU 560 * period has elapsed, in other words after all currently executing RCU
549 * read-side critical sections have completed. RCU read-side critical 561 * read-side critical sections have completed. Note, however, that
550 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 562 * upon return from synchronize_rcu(), the caller might well be executing
551 * and may be nested. 563 * concurrently with new RCU read-side critical sections that began while
564 * synchronize_rcu() was waiting. RCU read-side critical sections are
565 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
552 */ 566 */
553void synchronize_rcu(void) 567void synchronize_rcu(void)
554{ 568{
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
771 */ 785 */
772static void __init __rcu_init_preempt(void) 786static void __init __rcu_init_preempt(void)
773{ 787{
774 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); 788 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
775} 789}
776 790
777/* 791/*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
865{ 879{
866} 880}
867 881
882/*
883 * Because preemptible RCU does not exist, there is no need to suppress
884 * its CPU stall warnings.
885 */
886static void rcu_preempt_stall_reset(void)
887{
888}
889
868#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 890#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
869 891
870/* 892/*
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void)
919} 941}
920 942
921/* 943/*
922 * In classic RCU, call_rcu() is just call_rcu_sched().
923 */
924void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
925{
926 call_rcu_sched(head, func);
927}
928EXPORT_SYMBOL_GPL(call_rcu);
929
930/*
931 * Wait for an rcu-preempt grace period, but make it happen quickly. 944 * Wait for an rcu-preempt grace period, but make it happen quickly.
932 * But because preemptable RCU does not exist, map to rcu-sched. 945 * But because preemptable RCU does not exist, map to rcu-sched.
933 */ 946 */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738e..d15430b9d122 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
64 rdp->dynticks_fqs); 64 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 65#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); 67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
68} 70}
69 71
70#define PRINT_RCU_DATA(name, func, m) \ 72#define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
119 rdp->dynticks_fqs); 121 rdp->dynticks_fqs);
120#endif /* #ifdef CONFIG_NO_HZ */ 122#endif /* #ifdef CONFIG_NO_HZ */
121 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
122 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); 124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
123} 127}
124 128
125static int show_rcudata_csv(struct seq_file *m, void *unused) 129static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
128#ifdef CONFIG_NO_HZ 132#ifdef CONFIG_NO_HZ
129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 133 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
130#endif /* #ifdef CONFIG_NO_HZ */ 134#endif /* #ifdef CONFIG_NO_HZ */
131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 135 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
132#ifdef CONFIG_TREE_PREEMPT_RCU 136#ifdef CONFIG_TREE_PREEMPT_RCU
133 seq_puts(m, "\"rcu_preempt:\"\n"); 137 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 138 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
262 struct rcu_data *rdp; 266 struct rcu_data *rdp;
263 267
264 for_each_possible_cpu(cpu) { 268 for_each_possible_cpu(cpu) {
265 rdp = rsp->rda[cpu]; 269 rdp = per_cpu_ptr(rsp->rda, cpu);
266 if (rdp->beenonline) 270 if (rdp->beenonline)
267 print_one_rcu_pending(m, rdp); 271 print_one_rcu_pending(m, rdp);
268 } 272 }
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..9fad33efd0db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource);
40 40
41static DEFINE_RWLOCK(resource_lock); 41static DEFINE_RWLOCK(resource_lock);
42 42
43/*
44 * By default, we allocate free space bottom-up. The architecture can request
45 * top-down by clearing this flag. The user can override the architecture's
46 * choice with the "resource_alloc_from_bottom" kernel boot option, but that
47 * should only be a debugging tool.
48 */
49int resource_alloc_from_bottom = 1;
50
51static __init int setup_alloc_from_bottom(char *s)
52{
53 printk(KERN_INFO
54 "resource: allocating from bottom-up; please report a bug\n");
55 resource_alloc_from_bottom = 1;
56 return 0;
57}
58early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
59
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 60static void *r_next(struct seq_file *m, void *v, loff_t *pos)
44{ 61{
45 struct resource *p = v; 62 struct resource *p = v;
@@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn)
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 374 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358} 375}
359 376
377static resource_size_t simple_align_resource(void *data,
378 const struct resource *avail,
379 resource_size_t size,
380 resource_size_t align)
381{
382 return avail->start;
383}
384
385static void resource_clip(struct resource *res, resource_size_t min,
386 resource_size_t max)
387{
388 if (res->start < min)
389 res->start = min;
390 if (res->end > max)
391 res->end = max;
392}
393
394static bool resource_contains(struct resource *res1, struct resource *res2)
395{
396 return res1->start <= res2->start && res1->end >= res2->end;
397}
398
399/*
400 * Find the resource before "child" in the sibling list of "root" children.
401 */
402static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
403{
404 struct resource *this;
405
406 for (this = root->child; this; this = this->sibling)
407 if (this->sibling == child)
408 return this;
409
410 return NULL;
411}
412
413/*
414 * Find empty slot in the resource tree given range and alignment.
415 * This version allocates from the end of the root resource first.
416 */
417static int find_resource_from_top(struct resource *root, struct resource *new,
418 resource_size_t size, resource_size_t min,
419 resource_size_t max, resource_size_t align,
420 resource_size_t (*alignf)(void *,
421 const struct resource *,
422 resource_size_t,
423 resource_size_t),
424 void *alignf_data)
425{
426 struct resource *this;
427 struct resource tmp, avail, alloc;
428
429 tmp.start = root->end;
430 tmp.end = root->end;
431
432 this = find_sibling_prev(root, NULL);
433 for (;;) {
434 if (this) {
435 if (this->end < root->end)
436 tmp.start = this->end + 1;
437 } else
438 tmp.start = root->start;
439
440 resource_clip(&tmp, min, max);
441
442 /* Check for overflow after ALIGN() */
443 avail = *new;
444 avail.start = ALIGN(tmp.start, align);
445 avail.end = tmp.end;
446 if (avail.start >= tmp.start) {
447 alloc.start = alignf(alignf_data, &avail, size, align);
448 alloc.end = alloc.start + size - 1;
449 if (resource_contains(&avail, &alloc)) {
450 new->start = alloc.start;
451 new->end = alloc.end;
452 return 0;
453 }
454 }
455
456 if (!this || this->start == root->start)
457 break;
458
459 tmp.end = this->start - 1;
460 this = find_sibling_prev(root, this);
461 }
462 return -EBUSY;
463}
464
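Both allocators now share the same acceptance test: clip the free window to [min, max], align its start, let alignf pick a start inside the window, and only accept the allocation if the aligned candidate is still contained in the window. A small user-space model of those steps (all values and helper names below are made up for illustration; this is not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct res {
	unsigned long start, end;
};

static void clip(struct res *r, unsigned long min, unsigned long max)
{
	if (r->start < min)
		r->start = min;
	if (r->end > max)
		r->end = max;
}

static bool contains(const struct res *outer, const struct res *inner)
{
	return outer->start <= inner->start && outer->end >= inner->end;
}

int main(void)
{
	struct res gap = { 0x9f000, 0xfffff };	/* free window found in the tree */
	struct res avail = gap, alloc;
	unsigned long size = 0x20000, align = 0x10000;

	clip(&avail, 0xa0000, 0xffffffff);			/* honour min/max */
	avail.start = (avail.start + align - 1) & ~(align - 1);	/* ALIGN() */
	alloc.start = avail.start;			/* simple_align_resource() */
	alloc.end = alloc.start + size - 1;
	printf("fits: %d\n", contains(&avail, &alloc));	/* prints 1 */
	return 0;
}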
360/* 465/*
361 * Find empty slot in the resource tree given range and alignment. 466 * Find empty slot in the resource tree given range and alignment.
467 * This version allocates from the beginning of the root resource first.
362 */ 468 */
363static int find_resource(struct resource *root, struct resource *new, 469static int find_resource(struct resource *root, struct resource *new,
364 resource_size_t size, resource_size_t min, 470 resource_size_t size, resource_size_t min,
@@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new,
370 void *alignf_data) 476 void *alignf_data)
371{ 477{
372 struct resource *this = root->child; 478 struct resource *this = root->child;
373 struct resource tmp = *new; 479 struct resource tmp = *new, avail, alloc;
374 480
375 tmp.start = root->start; 481 tmp.start = root->start;
376 /* 482 /*
377 * Skip past an allocated resource that starts at 0, since the assignment 483 * Skip past an allocated resource that starts at 0, since the
378 * of this->start - 1 to tmp->end below would cause an underflow. 484 * assignment of this->start - 1 to tmp->end below would cause an
485 * underflow.
379 */ 486 */
380 if (this && this->start == 0) { 487 if (this && this->start == 0) {
381 tmp.start = this->end + 1; 488 tmp.start = this->end + 1;
382 this = this->sibling; 489 this = this->sibling;
383 } 490 }
384 for(;;) { 491 for (;;) {
385 if (this) 492 if (this)
386 tmp.end = this->start - 1; 493 tmp.end = this->start - 1;
387 else 494 else
388 tmp.end = root->end; 495 tmp.end = root->end;
389 if (tmp.start < min) 496
390 tmp.start = min; 497 resource_clip(&tmp, min, max);
391 if (tmp.end > max) 498
392 tmp.end = max; 499 /* Check for overflow after ALIGN() */
393 tmp.start = ALIGN(tmp.start, align); 500 avail = *new;
394 if (alignf) 501 avail.start = ALIGN(tmp.start, align);
395 tmp.start = alignf(alignf_data, &tmp, size, align); 502 avail.end = tmp.end;
396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 503 if (avail.start >= tmp.start) {
397 new->start = tmp.start; 504 alloc.start = alignf(alignf_data, &avail, size, align);
398 new->end = tmp.start + size - 1; 505 alloc.end = alloc.start + size - 1;
399 return 0; 506 if (resource_contains(&avail, &alloc)) {
507 new->start = alloc.start;
508 new->end = alloc.end;
509 return 0;
510 }
400 } 511 }
512
401 if (!this) 513 if (!this)
402 break; 514 break;
515
403 tmp.start = this->end + 1; 516 tmp.start = this->end + 1;
404 this = this->sibling; 517 this = this->sibling;
405 } 518 }
@@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new,
428{ 541{
429 int err; 542 int err;
430 543
544 if (!alignf)
545 alignf = simple_align_resource;
546
431 write_lock(&resource_lock); 547 write_lock(&resource_lock);
432 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 548 if (resource_alloc_from_bottom)
549 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
550 else
551 err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
433 if (err >= 0 && __request_resource(root, new)) 552 if (err >= 0 && __request_resource(root, new))
434 err = -EBUSY; 553 err = -EBUSY;
435 write_unlock(&resource_lock); 554 write_unlock(&resource_lock);
@@ -453,6 +572,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
453 572
454 if (first == parent) 573 if (first == parent)
455 return first; 574 return first;
575 if (WARN_ON(first == new)) /* duplicated insertion */
576 return first;
456 577
457 if ((first->start > new->start) || (first->end < new->end)) 578 if ((first->start > new->start) || (first->end < new->end))
458 break; 579 break;
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index a56f629b057a..66cb89bc5ef1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -76,7 +76,9 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
76 } 76 }
77 77
78 if (!lockwakeup && td->bkl == 4) { 78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
79 unlock_kernel(); 80 unlock_kernel();
81#endif
80 td->bkl = 0; 82 td->bkl = 0;
81 } 83 }
82 return 0; 84 return 0;
@@ -133,14 +135,18 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
133 if (td->bkl) 135 if (td->bkl)
134 return 0; 136 return 0;
135 td->bkl = 1; 137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
136 lock_kernel(); 139 lock_kernel();
140#endif
137 td->bkl = 4; 141 td->bkl = 4;
138 return 0; 142 return 0;
139 143
140 case RTTEST_UNLOCKBKL: 144 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4) 145 if (td->bkl != 4)
142 break; 146 break;
147#ifdef CONFIG_LOCK_KERNEL
143 unlock_kernel(); 148 unlock_kernel();
149#endif
144 td->bkl = 0; 150 td->bkl = 0;
145 return 0; 151 return 0;
146 152
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..d42992bccdfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#define CREATE_TRACE_POINTS 82#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h> 83#include <trace/events/sched.h>
@@ -425,9 +426,7 @@ struct root_domain {
425 */ 426 */
426 cpumask_var_t rto_mask; 427 cpumask_var_t rto_mask;
427 atomic_t rto_count; 428 atomic_t rto_count;
428#ifdef CONFIG_SMP
429 struct cpupri cpupri; 429 struct cpupri cpupri;
430#endif
431}; 430};
432 431
433/* 432/*
@@ -436,7 +435,7 @@ struct root_domain {
436 */ 435 */
437static struct root_domain def_root_domain; 436static struct root_domain def_root_domain;
438 437
439#endif 438#endif /* CONFIG_SMP */
440 439
441/* 440/*
442 * This is the main, per-CPU runqueue data structure. 441 * This is the main, per-CPU runqueue data structure.
@@ -456,9 +455,10 @@ struct rq {
456 unsigned long nr_running; 455 unsigned long nr_running;
457 #define CPU_LOAD_IDX_MAX 5 456 #define CPU_LOAD_IDX_MAX 5
458 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 457 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
458 unsigned long last_load_update_tick;
459#ifdef CONFIG_NO_HZ 459#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp; 460 u64 nohz_stamp;
461 unsigned char in_nohz_recently; 461 unsigned char nohz_balance_kick;
462#endif 462#endif
463 unsigned int skip_clock_update; 463 unsigned int skip_clock_update;
464 464
@@ -486,11 +486,12 @@ struct rq {
486 */ 486 */
487 unsigned long nr_uninterruptible; 487 unsigned long nr_uninterruptible;
488 488
489 struct task_struct *curr, *idle; 489 struct task_struct *curr, *idle, *stop;
490 unsigned long next_balance; 490 unsigned long next_balance;
491 struct mm_struct *prev_mm; 491 struct mm_struct *prev_mm;
492 492
493 u64 clock; 493 u64 clock;
494 u64 clock_task;
494 495
495 atomic_t nr_iowait; 496 atomic_t nr_iowait;
496 497
@@ -518,6 +519,10 @@ struct rq {
518 u64 avg_idle; 519 u64 avg_idle;
519#endif 520#endif
520 521
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time;
524#endif
525
521 /* calc_load related fields */ 526 /* calc_load related fields */
522 unsigned long calc_load_update; 527 unsigned long calc_load_update;
523 long calc_load_active; 528 long calc_load_active;
@@ -641,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
641 646
642#endif /* CONFIG_CGROUP_SCHED */ 647#endif /* CONFIG_CGROUP_SCHED */
643 648
649static u64 irq_time_cpu(int cpu);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651
644inline void update_rq_clock(struct rq *rq) 652inline void update_rq_clock(struct rq *rq)
645{ 653{
646 if (!rq->skip_clock_update) 654 if (!rq->skip_clock_update) {
647 rq->clock = sched_clock_cpu(cpu_of(rq)); 655 int cpu = cpu_of(rq);
656 u64 irq_time;
657
658 rq->clock = sched_clock_cpu(cpu);
659 irq_time = irq_time_cpu(cpu);
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662
663 sched_irq_time_avg_update(rq, irq_time);
664 }
648} 665}
649 666
650/* 667/*
@@ -721,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
721 size_t cnt, loff_t *ppos) 738 size_t cnt, loff_t *ppos)
722{ 739{
723 char buf[64]; 740 char buf[64];
724 char *cmp = buf; 741 char *cmp;
725 int neg = 0; 742 int neg = 0;
726 int i; 743 int i;
727 744
@@ -732,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
732 return -EFAULT; 749 return -EFAULT;
733 750
734 buf[cnt] = 0; 751 buf[cnt] = 0;
752 cmp = strstrip(buf);
735 753
736 if (strncmp(buf, "NO_", 3) == 0) { 754 if (strncmp(buf, "NO_", 3) == 0) {
737 neg = 1; 755 neg = 1;
@@ -739,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
739 } 757 }
740 758
741 for (i = 0; sched_feat_names[i]; i++) { 759 for (i = 0; sched_feat_names[i]; i++) {
742 int len = strlen(sched_feat_names[i]); 760 if (strcmp(cmp, sched_feat_names[i]) == 0) {
743
744 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
745 if (neg) 761 if (neg)
746 sysctl_sched_features &= ~(1UL << i); 762 sysctl_sched_features &= ~(1UL << i);
747 else 763 else
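The feature parser now strips the buffer and requires an exact name match, so a trailing newline from "echo FOO > sched_features" no longer matters and a name can no longer match on a mere prefix. A user-space sketch of that parsing (strstrip_demo() is a rough stand-in for the kernel's strstrip(), and the feature names are just examples):

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static const char *feat_names[] = { "GENTLE_FAIR_SLEEPERS", "START_DEBIT", NULL };

static char *strstrip_demo(char *s)	/* rough model of strstrip() */
{
	char *end = s + strlen(s);

	while (end > s && isspace((unsigned char)end[-1]))
		*--end = '\0';
	while (*s && isspace((unsigned char)*s))
		s++;
	return s;
}

int main(void)
{
	char buf[64] = "START_DEBIT\n";		/* echo appends the newline */
	char *cmp = strstrip_demo(buf);
	int i;

	for (i = 0; feat_names[i]; i++)
		if (strcmp(cmp, feat_names[i]) == 0)
			printf("matched %s\n", feat_names[i]);
	return 0;
}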
@@ -1193,6 +1209,27 @@ static void resched_cpu(int cpu)
1193 1209
1194#ifdef CONFIG_NO_HZ 1210#ifdef CONFIG_NO_HZ
1195/* 1211/*
1212 * In the semi-idle case, use the nearest busy cpu for migrating timers
1213 * from an idle cpu. This is good for power savings.
1214 *
1215 * We don't do a similar optimization for a completely idle system, as
1216 * selecting an idle cpu would add more delay to the timers than intended
1217 * (as that cpu's timer base may not be up to date with respect to jiffies etc.).
1218 */
1219int get_nohz_timer_target(void)
1220{
1221 int cpu = smp_processor_id();
1222 int i;
1223 struct sched_domain *sd;
1224
1225 for_each_domain(cpu, sd) {
1226 for_each_cpu(i, sched_domain_span(sd))
1227 if (!idle_cpu(i))
1228 return i;
1229 }
1230 return cpu;
1231}
1232/*
1196 * When add_timer_on() enqueues a timer into the timer wheel of an 1233 * When add_timer_on() enqueues a timer into the timer wheel of an
1197 * idle CPU then this timer might expire before the next timer event 1234 * idle CPU then this timer might expire before the next timer event
1198 * which is scheduled to wake up that CPU. In case of a completely 1235 * which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1269,6 @@ void wake_up_idle_cpu(int cpu)
1232 smp_send_reschedule(cpu); 1269 smp_send_reschedule(cpu);
1233} 1270}
1234 1271
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1245#endif /* CONFIG_NO_HZ */ 1272#endif /* CONFIG_NO_HZ */
1246 1273
1247static u64 sched_avg_period(void) 1274static u64 sched_avg_period(void)
@@ -1281,6 +1308,10 @@ static void resched_task(struct task_struct *p)
1281static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1308static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1282{ 1309{
1283} 1310}
1311
1312static void sched_avg_update(struct rq *rq)
1313{
1314}
1284#endif /* CONFIG_SMP */ 1315#endif /* CONFIG_SMP */
1285 1316
1286#if BITS_PER_LONG == 32 1317#if BITS_PER_LONG == 32
@@ -1652,7 +1683,7 @@ static void update_shares(struct sched_domain *sd)
1652 if (root_task_group_empty()) 1683 if (root_task_group_empty())
1653 return; 1684 return;
1654 1685
1655 now = cpu_clock(raw_smp_processor_id()); 1686 now = local_clock();
1656 elapsed = now - sd->last_update; 1687 elapsed = now - sd->last_update;
1657 1688
1658 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1689 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1836,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1805static void calc_load_account_idle(struct rq *this_rq); 1836static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1837static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1838static int get_update_sysctl_factor(void);
1839static void update_cpu_load(struct rq *this_rq);
1808 1840
1809static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1841static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1810{ 1842{
@@ -1822,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1822 1854
1823static const struct sched_class rt_sched_class; 1855static const struct sched_class rt_sched_class;
1824 1856
1825#define sched_class_highest (&rt_sched_class) 1857#define sched_class_highest (&stop_sched_class)
1826#define for_each_class(class) \ 1858#define for_each_class(class) \
1827 for (class = sched_class_highest; class; class = class->next) 1859 for (class = sched_class_highest; class; class = class->next)
1828 1860
@@ -1840,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
1840 1872
1841static void set_load_weight(struct task_struct *p) 1873static void set_load_weight(struct task_struct *p)
1842{ 1874{
1843 if (task_has_rt_policy(p)) {
1844 p->se.load.weight = 0;
1845 p->se.load.inv_weight = WMULT_CONST;
1846 return;
1847 }
1848
1849 /* 1875 /*
1850 * SCHED_IDLE tasks get minimal weight: 1876 * SCHED_IDLE tasks get minimal weight:
1851 */ 1877 */
@@ -1899,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1899 dec_nr_running(rq); 1925 dec_nr_running(rq);
1900} 1926}
1901 1927
1928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1929
1930/*
1931 * There are no locks covering percpu hardirq/softirq time.
1932 * They are only modified in account_system_vtime(), on the corresponding CPU
1933 * with interrupts disabled, so writes are safe.
1934 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This means another CPU may read this CPU's irq time and race with
1936 * irq/account_system_vtime() on this CPU. We would then see either the old
1937 * or the new value (or a half-updated value on 32 bit), with the side effect
1938 * of accounting a slice of irq time to the wrong task when an irq is in
1939 * progress while we read rq->clock. That is a worthy compromise compared
1940 * with taking locks on each irq in account_system_time().
1941 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time);
1944
1945static DEFINE_PER_CPU(u64, irq_start_time);
1946static int sched_clock_irqtime;
1947
1948void enable_sched_clock_irqtime(void)
1949{
1950 sched_clock_irqtime = 1;
1951}
1952
1953void disable_sched_clock_irqtime(void)
1954{
1955 sched_clock_irqtime = 0;
1956}
1957
1958static u64 irq_time_cpu(int cpu)
1959{
1960 if (!sched_clock_irqtime)
1961 return 0;
1962
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964}
1965
1966void account_system_vtime(struct task_struct *curr)
1967{
1968 unsigned long flags;
1969 int cpu;
1970 u64 now, delta;
1971
1972 if (!sched_clock_irqtime)
1973 return;
1974
1975 local_irq_save(flags);
1976
1977 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu);
1979 delta = now - per_cpu(irq_start_time, cpu);
1980 per_cpu(irq_start_time, cpu) = now;
1981 /*
1982 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to the ksoftirqd thread
1984 * in that case, so as not to confuse the scheduler with a special task
1985 * that does not consume any time but still wants to run.
1986 */
1987 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta;
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta;
1991
1992 local_irq_restore(flags);
1993}
1994EXPORT_SYMBOL_GPL(account_system_vtime);
1995
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
1997{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time;
2000 rq->prev_irq_time = curr_irq_time;
2001 sched_rt_avg_update(rq, delta_irq);
2002 }
2003}
2004
2005#else
2006
2007static u64 irq_time_cpu(int cpu)
2008{
2009 return 0;
2010}
2011
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
2013
2014#endif
2015
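What the new clock_task buys: update_rq_clock() above subtracts the accumulated per-cpu irq time from the raw clock, and clock_task only ever moves forward, so later hunks (for example do_task_delta_exec()) stop charging hardirq/softirq time to whichever task happened to be interrupted. A stand-alone model of just that arithmetic (simplified and hypothetical, not the kernel code):

#include <stdint.h>
#include <stdio.h>

struct rq_model {
	uint64_t clock, clock_task;
};

static void update_rq_clock_model(struct rq_model *rq, uint64_t now,
				  uint64_t irq_time)
{
	rq->clock = now;
	/* clock_task advances by (clock delta minus irq delta), never backwards */
	if (rq->clock - irq_time > rq->clock_task)
		rq->clock_task = rq->clock - irq_time;
}

int main(void)
{
	struct rq_model rq = { 0, 0 };

	update_rq_clock_model(&rq, 1000, 0);	/* no irq time yet */
	update_rq_clock_model(&rq, 2000, 600);	/* 600 ns spent in irqs */
	printf("clock=%llu clock_task=%llu\n",
	       (unsigned long long)rq.clock,
	       (unsigned long long)rq.clock_task);	/* 2000, 1400 */
	return 0;
}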
1902#include "sched_idletask.c" 2016#include "sched_idletask.c"
1903#include "sched_fair.c" 2017#include "sched_fair.c"
1904#include "sched_rt.c" 2018#include "sched_rt.c"
2019#include "sched_stoptask.c"
1905#ifdef CONFIG_SCHED_DEBUG 2020#ifdef CONFIG_SCHED_DEBUG
1906# include "sched_debug.c" 2021# include "sched_debug.c"
1907#endif 2022#endif
1908 2023
2024void sched_set_stop_task(int cpu, struct task_struct *stop)
2025{
2026 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2027 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2028
2029 if (stop) {
2030 /*
2031 * Make it appear like a SCHED_FIFO task; it's something
2032 * userspace knows about and won't get confused about.
2033 *
2034 * Also, it will make PI more or less work without too
2035 * much confusion -- but then, stop work should not
2036 * rely on PI working anyway.
2037 */
2038 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2039
2040 stop->sched_class = &stop_sched_class;
2041 }
2042
2043 cpu_rq(cpu)->stop = stop;
2044
2045 if (old_stop) {
2046 /*
2047 * Reset it back to a normal scheduling class so that
2048 * it can die in pieces.
2049 */
2050 old_stop->sched_class = &rt_sched_class;
2051 }
2052}
2053
1909/* 2054/*
1910 * __normal_prio - return the priority that is based on the static prio 2055 * __normal_prio - return the priority that is based on the static prio
1911 */ 2056 */
@@ -1985,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1985 if (p->sched_class != &fair_sched_class) 2130 if (p->sched_class != &fair_sched_class)
1986 return 0; 2131 return 0;
1987 2132
2133 if (unlikely(p->policy == SCHED_IDLE))
2134 return 0;
2135
1988 /* 2136 /*
1989 * Buddy candidates are cache hot: 2137 * Buddy candidates are cache hot:
1990 */ 2138 */
@@ -2267,11 +2415,55 @@ static void update_avg(u64 *avg, u64 sample)
2267} 2415}
2268#endif 2416#endif
2269 2417
2270/*** 2418static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2419 bool is_sync, bool is_migrate, bool is_local,
2420 unsigned long en_flags)
2421{
2422 schedstat_inc(p, se.statistics.nr_wakeups);
2423 if (is_sync)
2424 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2425 if (is_migrate)
2426 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2427 if (is_local)
2428 schedstat_inc(p, se.statistics.nr_wakeups_local);
2429 else
2430 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2431
2432 activate_task(rq, p, en_flags);
2433}
2434
2435static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2436 int wake_flags, bool success)
2437{
2438 trace_sched_wakeup(p, success);
2439 check_preempt_curr(rq, p, wake_flags);
2440
2441 p->state = TASK_RUNNING;
2442#ifdef CONFIG_SMP
2443 if (p->sched_class->task_woken)
2444 p->sched_class->task_woken(rq, p);
2445
2446 if (unlikely(rq->idle_stamp)) {
2447 u64 delta = rq->clock - rq->idle_stamp;
2448 u64 max = 2*sysctl_sched_migration_cost;
2449
2450 if (delta > max)
2451 rq->avg_idle = max;
2452 else
2453 update_avg(&rq->avg_idle, delta);
2454 rq->idle_stamp = 0;
2455 }
2456#endif
2457 /* if a worker is waking up, notify workqueue */
2458 if ((p->flags & PF_WQ_WORKER) && success)
2459 wq_worker_waking_up(p, cpu_of(rq));
2460}
2461
2462/**
2271 * try_to_wake_up - wake up a thread 2463 * try_to_wake_up - wake up a thread
2272 * @p: the to-be-woken-up thread 2464 * @p: the thread to be awakened
2273 * @state: the mask of task states that can be woken 2465 * @state: the mask of task states that can be woken
2274 * @sync: do a synchronous wakeup? 2466 * @wake_flags: wake modifier flags (WF_*)
2275 * 2467 *
2276 * Put it on the run-queue if it's not already there. The "current" 2468 * Put it on the run-queue if it's not already there. The "current"
2277 * thread is always on the run-queue (except when the actual 2469 * thread is always on the run-queue (except when the actual
@@ -2279,7 +2471,8 @@ static void update_avg(u64 *avg, u64 sample)
2279 * the simpler "current->state = TASK_RUNNING" to mark yourself 2471 * the simpler "current->state = TASK_RUNNING" to mark yourself
2280 * runnable without the overhead of this. 2472 * runnable without the overhead of this.
2281 * 2473 *
2282 * returns failure only if the task is already active. 2474 * Returns %true if @p was woken up, %false if it was already running
2475 * or @state didn't match @p's state.
2283 */ 2476 */
2284static int try_to_wake_up(struct task_struct *p, unsigned int state, 2477static int try_to_wake_up(struct task_struct *p, unsigned int state,
2285 int wake_flags) 2478 int wake_flags)
@@ -2359,38 +2552,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359 2552
2360out_activate: 2553out_activate:
2361#endif /* CONFIG_SMP */ 2554#endif /* CONFIG_SMP */
2362 schedstat_inc(p, se.statistics.nr_wakeups); 2555 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2363 if (wake_flags & WF_SYNC) 2556 cpu == this_cpu, en_flags);
2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2365 if (orig_cpu != cpu)
2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2367 if (cpu == this_cpu)
2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2369 else
2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2371 activate_task(rq, p, en_flags);
2372 success = 1; 2557 success = 1;
2373
2374out_running: 2558out_running:
2375 trace_sched_wakeup(p, success); 2559 ttwu_post_activation(p, rq, wake_flags, success);
2376 check_preempt_curr(rq, p, wake_flags);
2377
2378 p->state = TASK_RUNNING;
2379#ifdef CONFIG_SMP
2380 if (p->sched_class->task_woken)
2381 p->sched_class->task_woken(rq, p);
2382
2383 if (unlikely(rq->idle_stamp)) {
2384 u64 delta = rq->clock - rq->idle_stamp;
2385 u64 max = 2*sysctl_sched_migration_cost;
2386
2387 if (delta > max)
2388 rq->avg_idle = max;
2389 else
2390 update_avg(&rq->avg_idle, delta);
2391 rq->idle_stamp = 0;
2392 }
2393#endif
2394out: 2560out:
2395 task_rq_unlock(rq, &flags); 2561 task_rq_unlock(rq, &flags);
2396 put_cpu(); 2562 put_cpu();
@@ -2399,6 +2565,37 @@ out:
2399} 2565}
2400 2566
2401/** 2567/**
2568 * try_to_wake_up_local - try to wake up a local task with rq lock held
2569 * @p: the thread to be awakened
2570 *
2571 * Put @p on the run-queue if it's not already there. The caller must
2572 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2573 * the current task. this_rq() stays locked over invocation.
2574 */
2575static void try_to_wake_up_local(struct task_struct *p)
2576{
2577 struct rq *rq = task_rq(p);
2578 bool success = false;
2579
2580 BUG_ON(rq != this_rq());
2581 BUG_ON(p == current);
2582 lockdep_assert_held(&rq->lock);
2583
2584 if (!(p->state & TASK_NORMAL))
2585 return;
2586
2587 if (!p->se.on_rq) {
2588 if (likely(!task_running(rq, p))) {
2589 schedstat_inc(rq, ttwu_count);
2590 schedstat_inc(rq, ttwu_local);
2591 }
2592 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2593 success = true;
2594 }
2595 ttwu_post_activation(p, rq, 0, success);
2596}
2597
2598/**
2402 * wake_up_process - Wake up a specific process 2599 * wake_up_process - Wake up a specific process
2403 * @p: The process to be woken up. 2600 * @p: The process to be woken up.
2404 * 2601 *
@@ -2785,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2785 */ 2982 */
2786 arch_start_context_switch(prev); 2983 arch_start_context_switch(prev);
2787 2984
2788 if (likely(!mm)) { 2985 if (!mm) {
2789 next->active_mm = oldmm; 2986 next->active_mm = oldmm;
2790 atomic_inc(&oldmm->mm_count); 2987 atomic_inc(&oldmm->mm_count);
2791 enter_lazy_tlb(oldmm, next); 2988 enter_lazy_tlb(oldmm, next);
2792 } else 2989 } else
2793 switch_mm(oldmm, mm, next); 2990 switch_mm(oldmm, mm, next);
2794 2991
2795 if (likely(!prev->mm)) { 2992 if (!prev->mm) {
2796 prev->active_mm = NULL; 2993 prev->active_mm = NULL;
2797 rq->prev_mm = oldmm; 2994 rq->prev_mm = oldmm;
2798 } 2995 }
@@ -3012,23 +3209,102 @@ static void calc_load_account_active(struct rq *this_rq)
3012} 3209}
3013 3210
3014/* 3211/*
3212 * The exact cpuload at various idx values, calculated at every tick would be
3213 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3214 *
3215 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3216 * on nth tick when cpu may be busy, then we have:
3217 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3218 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3219 *
3220 * decay_load_missed() below does efficient calculation of
3221 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3222 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3223 *
3224 * The calculation is approximated on a 128 point scale.
3225 * degrade_zero_ticks is the number of ticks after which load at any
3226 * particular idx is approximated to be zero.
3227 * degrade_factor is a precomputed table, a row for each load idx.
3228 * Each column corresponds to degradation factor for a power of two ticks,
3229 * based on 128 point scale.
3230 * Example:
3231 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3232 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3233 *
3234 * With these power-of-2 load factors, we can degrade the load n times
3235 * by looking at the 1 bits in n and doing as many mult/shifts, instead of
3236 * the n mult/shifts needed by the exact degradation.
3237 */
3238#define DEGRADE_SHIFT 7
3239static const unsigned char
3240 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3241static const unsigned char
3242 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3243 {0, 0, 0, 0, 0, 0, 0, 0},
3244 {64, 32, 8, 0, 0, 0, 0, 0},
3245 {96, 72, 40, 12, 1, 0, 0},
3246 {112, 98, 75, 43, 15, 1, 0},
3247 {120, 112, 98, 76, 45, 16, 2} };
3248
3249/*
3250 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3251 * would be when CPU is idle and so we just decay the old load without
3252 * adding any new load.
3253 */
3254static unsigned long
3255decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3256{
3257 int j = 0;
3258
3259 if (!missed_updates)
3260 return load;
3261
3262 if (missed_updates >= degrade_zero_ticks[idx])
3263 return 0;
3264
3265 if (idx == 1)
3266 return load >> missed_updates;
3267
3268 while (missed_updates) {
3269 if (missed_updates % 2)
3270 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3271
3272 missed_updates >>= 1;
3273 j++;
3274 }
3275 return load;
3276}
3277
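The degrade_factor table can be sanity-checked against the exact formula in the comment above. At idx 2 the per-tick factor is 3/4, so after 8 missed ticks the exact decay is (3/4)^8, roughly 12.8/128, which the table rounds down to 12/128 (row 2, column 3). A stand-alone spot check of that claim (a simplified copy for illustration that drops the missed_updates == 0 and degrade_zero_ticks short-circuits; not the kernel code itself):

#include <stdio.h>

#define DEGRADE_SHIFT	7
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0},
	{112, 98, 75, 43, 15, 1, 0},
	{120, 112, 98, 76, 45, 16, 2},
};

static unsigned long decay_missed(unsigned long load, unsigned long missed, int idx)
{
	int j = 0;

	if (idx == 1)
		return load >> missed;
	while (missed) {
		if (missed % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	unsigned long load = 1280, exact = load;
	int n;

	for (n = 0; n < 8; n++)		/* exact decay over 8 missed ticks */
		exact = exact * 3 / 4;
	printf("table: %lu  exact: %lu\n",
	       decay_missed(1280, 8, 2), exact);	/* table: 120  exact: 127 */
	return 0;
}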
3278/*
3015 * Update rq->cpu_load[] statistics. This function is usually called every 3279 * Update rq->cpu_load[] statistics. This function is usually called every
3016 * scheduler tick (TICK_NSEC). 3280 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3281 * every tick. We fix it up based on jiffies.
3017 */ 3282 */
3018static void update_cpu_load(struct rq *this_rq) 3283static void update_cpu_load(struct rq *this_rq)
3019{ 3284{
3020 unsigned long this_load = this_rq->load.weight; 3285 unsigned long this_load = this_rq->load.weight;
3286 unsigned long curr_jiffies = jiffies;
3287 unsigned long pending_updates;
3021 int i, scale; 3288 int i, scale;
3022 3289
3023 this_rq->nr_load_updates++; 3290 this_rq->nr_load_updates++;
3024 3291
3292 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3293 if (curr_jiffies == this_rq->last_load_update_tick)
3294 return;
3295
3296 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3297 this_rq->last_load_update_tick = curr_jiffies;
3298
3025 /* Update our load: */ 3299 /* Update our load: */
3026 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3300 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3301 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3027 unsigned long old_load, new_load; 3302 unsigned long old_load, new_load;
3028 3303
3029 /* scale is effectively 1 << i now, and >> i divides by scale */ 3304 /* scale is effectively 1 << i now, and >> i divides by scale */
3030 3305
3031 old_load = this_rq->cpu_load[i]; 3306 old_load = this_rq->cpu_load[i];
3307 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3032 new_load = this_load; 3308 new_load = this_load;
3033 /* 3309 /*
3034 * Round up the averaging division if load is increasing. This 3310 * Round up the averaging division if load is increasing. This
@@ -3036,10 +3312,18 @@ static void update_cpu_load(struct rq *this_rq)
3036 * example. 3312 * example.
3037 */ 3313 */
3038 if (new_load > old_load) 3314 if (new_load > old_load)
3039 new_load += scale-1; 3315 new_load += scale - 1;
3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3316
3317 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3041 } 3318 }
3042 3319
3320 sched_avg_update(this_rq);
3321}
3322
3323static void update_cpu_load_active(struct rq *this_rq)
3324{
3325 update_cpu_load(this_rq);
3326
3043 calc_load_account_active(this_rq); 3327 calc_load_account_active(this_rq);
3044} 3328}
3045 3329
@@ -3094,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3094 3378
3095 if (task_current(rq, p)) { 3379 if (task_current(rq, p)) {
3096 update_rq_clock(rq); 3380 update_rq_clock(rq);
3097 ns = rq->clock - p->se.exec_start; 3381 ns = rq->clock_task - p->se.exec_start;
3098 if ((s64)ns < 0) 3382 if ((s64)ns < 0)
3099 ns = 0; 3383 ns = 0;
3100 } 3384 }
@@ -3243,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3243 tmp = cputime_to_cputime64(cputime); 3527 tmp = cputime_to_cputime64(cputime);
3244 if (hardirq_count() - hardirq_offset) 3528 if (hardirq_count() - hardirq_offset)
3245 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3529 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3246 else if (softirq_count()) 3530 else if (in_serving_softirq())
3247 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3531 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3248 else 3532 else
3249 cpustat->system = cputime64_add(cpustat->system, tmp); 3533 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3359,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3359 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3643 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3360 3644
3361 if (total) { 3645 if (total) {
3362 u64 temp; 3646 u64 temp = rtime;
3363 3647
3364 temp = (u64)(rtime * utime); 3648 temp *= utime;
3365 do_div(temp, total); 3649 do_div(temp, total);
3366 utime = (cputime_t)temp; 3650 utime = (cputime_t)temp;
3367 } else 3651 } else
@@ -3392,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3392 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3676 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3393 3677
3394 if (total) { 3678 if (total) {
3395 u64 temp; 3679 u64 temp = rtime;
3396 3680
3397 temp = (u64)(rtime * cputime.utime); 3681 temp *= cputime.utime;
3398 do_div(temp, total); 3682 do_div(temp, total);
3399 utime = (cputime_t)temp; 3683 utime = (cputime_t)temp;
3400 } else 3684 } else
@@ -3426,11 +3710,11 @@ void scheduler_tick(void)
3426 3710
3427 raw_spin_lock(&rq->lock); 3711 raw_spin_lock(&rq->lock);
3428 update_rq_clock(rq); 3712 update_rq_clock(rq);
3429 update_cpu_load(rq); 3713 update_cpu_load_active(rq);
3430 curr->sched_class->task_tick(rq, curr, 0); 3714 curr->sched_class->task_tick(rq, curr, 0);
3431 raw_spin_unlock(&rq->lock); 3715 raw_spin_unlock(&rq->lock);
3432 3716
3433 perf_event_task_tick(curr); 3717 perf_event_task_tick();
3434 3718
3435#ifdef CONFIG_SMP 3719#ifdef CONFIG_SMP
3436 rq->idle_at_tick = idle_cpu(cpu); 3720 rq->idle_at_tick = idle_cpu(cpu);
@@ -3569,17 +3853,13 @@ pick_next_task(struct rq *rq)
3569 return p; 3853 return p;
3570 } 3854 }
3571 3855
3572 class = sched_class_highest; 3856 for_each_class(class) {
3573 for ( ; ; ) {
3574 p = class->pick_next_task(rq); 3857 p = class->pick_next_task(rq);
3575 if (p) 3858 if (p)
3576 return p; 3859 return p;
3577 /*
3578 * Will never be NULL as the idle class always
3579 * returns a non-NULL p:
3580 */
3581 class = class->next;
3582 } 3860 }
3861
3862 BUG(); /* the idle class will always have a runnable task */
3583} 3863}
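The rewritten walk relies on the scheduling classes forming a priority-ordered, NULL-terminated list that is visited highest first, with the idle class guaranteed to supply a task, so the BUG() is only a backstop. A self-contained sketch of that shape; the class names and the for_each_class definition below are illustrative stand-ins, not copied from the kernel:

#include <assert.h>
#include <stdio.h>

struct task { const char *comm; };

struct sched_class {
	const struct sched_class *next;
	struct task *(*pick_next_task)(void);
};

static struct task idle_task = { "swapper" };

static struct task *pick_rt(void)   { return NULL; }		/* no RT tasks queued */
static struct task *pick_fair(void) { return NULL; }		/* no CFS tasks queued */
static struct task *pick_idle(void) { return &idle_task; }	/* always runnable */

static const struct sched_class idle_class = { NULL, pick_idle };
static const struct sched_class fair_class = { &idle_class, pick_fair };
static const struct sched_class rt_class   = { &fair_class, pick_rt };

/* walk the classes highest priority first, like the kernel's for_each_class() */
#define for_each_class(class) \
	for (class = &rt_class; class; class = class->next)

int main(void)
{
	const struct sched_class *class;
	struct task *p;

	for_each_class(class) {
		p = class->pick_next_task();
		if (p) {
			printf("picked %s\n", p->comm);
			return 0;
		}
	}
	assert(0 && "the idle class must always have a runnable task");
	return 1;
}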
3584 3864
3585/* 3865/*
@@ -3598,7 +3878,6 @@ need_resched:
3598 rq = cpu_rq(cpu); 3878 rq = cpu_rq(cpu);
3599 rcu_note_context_switch(cpu); 3879 rcu_note_context_switch(cpu);
3600 prev = rq->curr; 3880 prev = rq->curr;
3601 switch_count = &prev->nivcsw;
3602 3881
3603 release_kernel_lock(prev); 3882 release_kernel_lock(prev);
3604need_resched_nonpreemptible: 3883need_resched_nonpreemptible:
@@ -3611,11 +3890,26 @@ need_resched_nonpreemptible:
3611 raw_spin_lock_irq(&rq->lock); 3890 raw_spin_lock_irq(&rq->lock);
3612 clear_tsk_need_resched(prev); 3891 clear_tsk_need_resched(prev);
3613 3892
3893 switch_count = &prev->nivcsw;
3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3894 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3615 if (unlikely(signal_pending_state(prev->state, prev))) 3895 if (unlikely(signal_pending_state(prev->state, prev))) {
3616 prev->state = TASK_RUNNING; 3896 prev->state = TASK_RUNNING;
3617 else 3897 } else {
3898 /*
3899 * If a worker is going to sleep, notify and
3900 * ask workqueue whether it wants to wake up a
3901 * task to maintain concurrency. If so, wake
3902 * up the task.
3903 */
3904 if (prev->flags & PF_WQ_WORKER) {
3905 struct task_struct *to_wakeup;
3906
3907 to_wakeup = wq_worker_sleeping(prev, cpu);
3908 if (to_wakeup)
3909 try_to_wake_up_local(to_wakeup);
3910 }
3618 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3911 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3912 }
3619 switch_count = &prev->nvcsw; 3913 switch_count = &prev->nvcsw;
3620 } 3914 }
3621 3915
@@ -3637,8 +3931,10 @@ need_resched_nonpreemptible:
3637 3931
3638 context_switch(rq, prev, next); /* unlocks the rq */ 3932 context_switch(rq, prev, next); /* unlocks the rq */
3639 /* 3933 /*
3640 * the context switch might have flipped the stack from under 3934 * The context switch has flipped the stack from under us
3641 * us, hence refresh the local variables. 3935 * and restored the local variables which were saved when
3936 * this task called schedule() in the past. prev == current
3937 * is still correct, but it can be moved to another cpu/rq.
3642 */ 3938 */
3643 cpu = smp_processor_id(); 3939 cpu = smp_processor_id();
3644 rq = cpu_rq(cpu); 3940 rq = cpu_rq(cpu);
@@ -3647,11 +3943,8 @@ need_resched_nonpreemptible:
3647 3943
3648 post_schedule(rq); 3944 post_schedule(rq);
3649 3945
3650 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3946 if (unlikely(reacquire_kernel_lock(prev)))
3651 prev = rq->curr;
3652 switch_count = &prev->nivcsw;
3653 goto need_resched_nonpreemptible; 3947 goto need_resched_nonpreemptible;
3654 }
3655 3948
3656 preempt_enable_no_resched(); 3949 preempt_enable_no_resched();
3657 if (need_resched()) 3950 if (need_resched())
@@ -3704,8 +3997,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3704 /* 3997 /*
3705 * Owner changed, break to re-assess state. 3998 * Owner changed, break to re-assess state.
3706 */ 3999 */
3707 if (lock->owner != owner) 4000 if (lock->owner != owner) {
4001 /*
4002 * If the lock has switched to a different owner,
4003 * we likely have heavy contention. Return 0 to quit
4004 * optimistic spinning and not contend further:
4005 */
4006 if (lock->owner)
4007 return 0;
3708 break; 4008 break;
4009 }
3709 4010
3710 /* 4011 /*
3711 * Is that owner really running on that cpu? 4012 * Is that owner really running on that cpu?
@@ -3726,7 +4027,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3726 * off of preempt_enable. Kernel preemptions off return from interrupt 4027 * off of preempt_enable. Kernel preemptions off return from interrupt
3727 * occur there and call schedule directly. 4028 * occur there and call schedule directly.
3728 */ 4029 */
3729asmlinkage void __sched preempt_schedule(void) 4030asmlinkage void __sched notrace preempt_schedule(void)
3730{ 4031{
3731 struct thread_info *ti = current_thread_info(); 4032 struct thread_info *ti = current_thread_info();
3732 4033
@@ -3738,9 +4039,9 @@ asmlinkage void __sched preempt_schedule(void)
3738 return; 4039 return;
3739 4040
3740 do { 4041 do {
3741 add_preempt_count(PREEMPT_ACTIVE); 4042 add_preempt_count_notrace(PREEMPT_ACTIVE);
3742 schedule(); 4043 schedule();
3743 sub_preempt_count(PREEMPT_ACTIVE); 4044 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3744 4045
3745 /* 4046 /*
3746 * Check again in case we missed a preemption opportunity 4047 * Check again in case we missed a preemption opportunity
@@ -4183,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4183 4484
4184 rq = task_rq_lock(p, &flags); 4485 rq = task_rq_lock(p, &flags);
4185 4486
4487 trace_sched_pi_setprio(p, prio);
4186 oldprio = p->prio; 4488 oldprio = p->prio;
4187 prev_class = p->sched_class; 4489 prev_class = p->sched_class;
4188 on_rq = p->se.on_rq; 4490 on_rq = p->se.on_rq;
@@ -4441,12 +4743,8 @@ recheck:
4441 */ 4743 */
4442 if (user && !capable(CAP_SYS_NICE)) { 4744 if (user && !capable(CAP_SYS_NICE)) {
4443 if (rt_policy(policy)) { 4745 if (rt_policy(policy)) {
4444 unsigned long rlim_rtprio; 4746 unsigned long rlim_rtprio =
4445 4747 task_rlimit(p, RLIMIT_RTPRIO);
4446 if (!lock_task_sighand(p, &flags))
4447 return -ESRCH;
4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4449 unlock_task_sighand(p, &flags);
4450 4748
4451 /* can't set/change the rt policy */ 4749 /* can't set/change the rt policy */
4452 if (policy != p->policy && !rlim_rtprio) 4750 if (policy != p->policy && !rlim_rtprio)
@@ -4474,7 +4772,7 @@ recheck:
4474 } 4772 }
4475 4773
4476 if (user) { 4774 if (user) {
4477 retval = security_task_setscheduler(p, policy, param); 4775 retval = security_task_setscheduler(p);
4478 if (retval) 4776 if (retval)
4479 return retval; 4777 return retval;
4480 } 4778 }
@@ -4490,6 +4788,15 @@ recheck:
4490 */ 4788 */
4491 rq = __task_rq_lock(p); 4789 rq = __task_rq_lock(p);
4492 4790
4791 /*
4792 * Changing the policy of the stop threads is a very bad idea
4793 */
4794 if (p == rq->stop) {
4795 __task_rq_unlock(rq);
4796 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4797 return -EINVAL;
4798 }
4799
4493#ifdef CONFIG_RT_GROUP_SCHED 4800#ifdef CONFIG_RT_GROUP_SCHED
4494 if (user) { 4801 if (user) {
4495 /* 4802 /*
@@ -4716,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4716 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5023 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4717 goto out_unlock; 5024 goto out_unlock;
4718 5025
4719 retval = security_task_setscheduler(p, 0, NULL); 5026 retval = security_task_setscheduler(p);
4720 if (retval) 5027 if (retval)
4721 goto out_unlock; 5028 goto out_unlock;
4722 5029
4723 cpuset_cpus_allowed(p, cpus_allowed); 5030 cpuset_cpus_allowed(p, cpus_allowed);
4724 cpumask_and(new_mask, in_mask, cpus_allowed); 5031 cpumask_and(new_mask, in_mask, cpus_allowed);
4725 again: 5032again:
4726 retval = set_cpus_allowed_ptr(p, new_mask); 5033 retval = set_cpus_allowed_ptr(p, new_mask);
4727 5034
4728 if (!retval) { 5035 if (!retval) {
@@ -5166,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5166 idle->se.exec_start = sched_clock(); 5473 idle->se.exec_start = sched_clock();
5167 5474
5168 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5475 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5476 /*
5477 * We're having a chicken and egg problem, even though we are
5478 * holding rq->lock, the cpu isn't yet set to this cpu so the
5479 * lockdep check in task_group() will fail.
5480 *
5481 * Similar case to sched_fork(). / Alternatively we could
5482 * use task_rq_lock() here and obtain the other rq->lock.
5483 *
5484 * Silence PROVE_RCU
5485 */
5486 rcu_read_lock();
5169 __set_task_cpu(idle, cpu); 5487 __set_task_cpu(idle, cpu);
5488 rcu_read_unlock();
5170 5489
5171 rq->curr = rq->idle = idle; 5490 rq->curr = rq->idle = idle;
5172#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5491#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -5816,20 +6135,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5816 */ 6135 */
5817static struct notifier_block __cpuinitdata migration_notifier = { 6136static struct notifier_block __cpuinitdata migration_notifier = {
5818 .notifier_call = migration_call, 6137 .notifier_call = migration_call,
5819 .priority = 10 6138 .priority = CPU_PRI_MIGRATION,
5820}; 6139};
5821 6140
6141static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6142 unsigned long action, void *hcpu)
6143{
6144 switch (action & ~CPU_TASKS_FROZEN) {
6145 case CPU_ONLINE:
6146 case CPU_DOWN_FAILED:
6147 set_cpu_active((long)hcpu, true);
6148 return NOTIFY_OK;
6149 default:
6150 return NOTIFY_DONE;
6151 }
6152}
6153
6154static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6155 unsigned long action, void *hcpu)
6156{
6157 switch (action & ~CPU_TASKS_FROZEN) {
6158 case CPU_DOWN_PREPARE:
6159 set_cpu_active((long)hcpu, false);
6160 return NOTIFY_OK;
6161 default:
6162 return NOTIFY_DONE;
6163 }
6164}
6165
5822static int __init migration_init(void) 6166static int __init migration_init(void)
5823{ 6167{
5824 void *cpu = (void *)(long)smp_processor_id(); 6168 void *cpu = (void *)(long)smp_processor_id();
5825 int err; 6169 int err;
5826 6170
5827 /* Start one for the boot CPU: */ 6171 /* Initialize migration for the boot CPU */
5828 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6172 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5829 BUG_ON(err == NOTIFY_BAD); 6173 BUG_ON(err == NOTIFY_BAD);
5830 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6174 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5831 register_cpu_notifier(&migration_notifier); 6175 register_cpu_notifier(&migration_notifier);
5832 6176
6177 /* Register cpu active notifiers */
6178 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6179 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6180
5833 return 0; 6181 return 0;
5834} 6182}
5835early_initcall(migration_init); 6183early_initcall(migration_init);
@@ -6064,23 +6412,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6064 free_rootdomain(old_rd); 6412 free_rootdomain(old_rd);
6065} 6413}
6066 6414
6067static int init_rootdomain(struct root_domain *rd, bool bootmem) 6415static int init_rootdomain(struct root_domain *rd)
6068{ 6416{
6069 gfp_t gfp = GFP_KERNEL;
6070
6071 memset(rd, 0, sizeof(*rd)); 6417 memset(rd, 0, sizeof(*rd));
6072 6418
6073 if (bootmem) 6419 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6074 gfp = GFP_NOWAIT;
6075
6076 if (!alloc_cpumask_var(&rd->span, gfp))
6077 goto out; 6420 goto out;
6078 if (!alloc_cpumask_var(&rd->online, gfp)) 6421 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6079 goto free_span; 6422 goto free_span;
6080 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6423 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6081 goto free_online; 6424 goto free_online;
6082 6425
6083 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6426 if (cpupri_init(&rd->cpupri) != 0)
6084 goto free_rto_mask; 6427 goto free_rto_mask;
6085 return 0; 6428 return 0;
6086 6429
@@ -6096,7 +6439,7 @@ out:
6096 6439
6097static void init_defrootdomain(void) 6440static void init_defrootdomain(void)
6098{ 6441{
6099 init_rootdomain(&def_root_domain, true); 6442 init_rootdomain(&def_root_domain);
6100 6443
6101 atomic_set(&def_root_domain.refcount, 1); 6444 atomic_set(&def_root_domain.refcount, 1);
6102} 6445}
@@ -6109,7 +6452,7 @@ static struct root_domain *alloc_rootdomain(void)
6109 if (!rd) 6452 if (!rd)
6110 return NULL; 6453 return NULL;
6111 6454
6112 if (init_rootdomain(rd, false) != 0) { 6455 if (init_rootdomain(rd) != 0) {
6113 kfree(rd); 6456 kfree(rd);
6114 return NULL; 6457 return NULL;
6115 } 6458 }
@@ -6319,6 +6662,7 @@ struct s_data {
6319 cpumask_var_t nodemask; 6662 cpumask_var_t nodemask;
6320 cpumask_var_t this_sibling_map; 6663 cpumask_var_t this_sibling_map;
6321 cpumask_var_t this_core_map; 6664 cpumask_var_t this_core_map;
6665 cpumask_var_t this_book_map;
6322 cpumask_var_t send_covered; 6666 cpumask_var_t send_covered;
6323 cpumask_var_t tmpmask; 6667 cpumask_var_t tmpmask;
6324 struct sched_group **sched_group_nodes; 6668 struct sched_group **sched_group_nodes;
@@ -6330,6 +6674,7 @@ enum s_alloc {
6330 sa_rootdomain, 6674 sa_rootdomain,
6331 sa_tmpmask, 6675 sa_tmpmask,
6332 sa_send_covered, 6676 sa_send_covered,
6677 sa_this_book_map,
6333 sa_this_core_map, 6678 sa_this_core_map,
6334 sa_this_sibling_map, 6679 sa_this_sibling_map,
6335 sa_nodemask, 6680 sa_nodemask,
@@ -6365,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6365#ifdef CONFIG_SCHED_MC 6710#ifdef CONFIG_SCHED_MC
6366static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6711static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6367static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6712static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6368#endif /* CONFIG_SCHED_MC */
6369 6713
6370#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6371static int 6714static int
6372cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6715cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6373 struct sched_group **sg, struct cpumask *mask) 6716 struct sched_group **sg, struct cpumask *mask)
6374{ 6717{
6375 int group; 6718 int group;
6376 6719#ifdef CONFIG_SCHED_SMT
6377 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6720 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6378 group = cpumask_first(mask); 6721 group = cpumask_first(mask);
6722#else
6723 group = cpu;
6724#endif
6379 if (sg) 6725 if (sg)
6380 *sg = &per_cpu(sched_group_core, group).sg; 6726 *sg = &per_cpu(sched_group_core, group).sg;
6381 return group; 6727 return group;
6382} 6728}
6383#elif defined(CONFIG_SCHED_MC) 6729#endif /* CONFIG_SCHED_MC */
6730
6731/*
6732 * book sched-domains:
6733 */
6734#ifdef CONFIG_SCHED_BOOK
6735static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6736static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6737
6384static int 6738static int
6385cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6739cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6386 struct sched_group **sg, struct cpumask *unused) 6740 struct sched_group **sg, struct cpumask *mask)
6387{ 6741{
6742 int group = cpu;
6743#ifdef CONFIG_SCHED_MC
6744 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6745 group = cpumask_first(mask);
6746#elif defined(CONFIG_SCHED_SMT)
6747 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6748 group = cpumask_first(mask);
6749#endif
6388 if (sg) 6750 if (sg)
6389 *sg = &per_cpu(sched_group_core, cpu).sg; 6751 *sg = &per_cpu(sched_group_book, group).sg;
6390 return cpu; 6752 return group;
6391} 6753}
6392#endif 6754#endif /* CONFIG_SCHED_BOOK */
6393 6755
6394static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6756static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6395static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6757static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6399,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6399 struct sched_group **sg, struct cpumask *mask) 6761 struct sched_group **sg, struct cpumask *mask)
6400{ 6762{
6401 int group; 6763 int group;
6402#ifdef CONFIG_SCHED_MC 6764#ifdef CONFIG_SCHED_BOOK
6765 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6766 group = cpumask_first(mask);
6767#elif defined(CONFIG_SCHED_MC)
6403 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6768 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6404 group = cpumask_first(mask); 6769 group = cpumask_first(mask);
6405#elif defined(CONFIG_SCHED_SMT) 6770#elif defined(CONFIG_SCHED_SMT)
@@ -6660,6 +7025,9 @@ SD_INIT_FUNC(CPU)
6660#ifdef CONFIG_SCHED_MC 7025#ifdef CONFIG_SCHED_MC
6661 SD_INIT_FUNC(MC) 7026 SD_INIT_FUNC(MC)
6662#endif 7027#endif
7028#ifdef CONFIG_SCHED_BOOK
7029 SD_INIT_FUNC(BOOK)
7030#endif
6663 7031
6664static int default_relax_domain_level = -1; 7032static int default_relax_domain_level = -1;
6665 7033
@@ -6709,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6709 free_cpumask_var(d->tmpmask); /* fall through */ 7077 free_cpumask_var(d->tmpmask); /* fall through */
6710 case sa_send_covered: 7078 case sa_send_covered:
6711 free_cpumask_var(d->send_covered); /* fall through */ 7079 free_cpumask_var(d->send_covered); /* fall through */
7080 case sa_this_book_map:
7081 free_cpumask_var(d->this_book_map); /* fall through */
6712 case sa_this_core_map: 7082 case sa_this_core_map:
6713 free_cpumask_var(d->this_core_map); /* fall through */ 7083 free_cpumask_var(d->this_core_map); /* fall through */
6714 case sa_this_sibling_map: 7084 case sa_this_sibling_map:
@@ -6755,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6755 return sa_nodemask; 7125 return sa_nodemask;
6756 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7126 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6757 return sa_this_sibling_map; 7127 return sa_this_sibling_map;
6758 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7128 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
6759 return sa_this_core_map; 7129 return sa_this_core_map;
7130 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7131 return sa_this_book_map;
6760 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7132 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6761 return sa_send_covered; 7133 return sa_send_covered;
6762 d->rd = alloc_rootdomain(); 7134 d->rd = alloc_rootdomain();
@@ -6814,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
6814 return sd; 7186 return sd;
6815} 7187}
6816 7188
7189static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7190 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7191 struct sched_domain *parent, int i)
7192{
7193 struct sched_domain *sd = parent;
7194#ifdef CONFIG_SCHED_BOOK
7195 sd = &per_cpu(book_domains, i).sd;
7196 SD_INIT(sd, BOOK);
7197 set_domain_attribute(sd, attr);
7198 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7199 sd->parent = parent;
7200 parent->child = sd;
7201 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7202#endif
7203 return sd;
7204}
7205
6817static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7206static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
6818 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7207 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6819 struct sched_domain *parent, int i) 7208 struct sched_domain *parent, int i)
@@ -6871,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
6871 d->send_covered, d->tmpmask); 7260 d->send_covered, d->tmpmask);
6872 break; 7261 break;
6873#endif 7262#endif
7263#ifdef CONFIG_SCHED_BOOK
7264 case SD_LV_BOOK: /* set up book groups */
7265 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7266 if (cpu == cpumask_first(d->this_book_map))
7267 init_sched_build_groups(d->this_book_map, cpu_map,
7268 &cpu_to_book_group,
7269 d->send_covered, d->tmpmask);
7270 break;
7271#endif
6874 case SD_LV_CPU: /* set up physical groups */ 7272 case SD_LV_CPU: /* set up physical groups */
6875 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7273 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
6876 if (!cpumask_empty(d->nodemask)) 7274 if (!cpumask_empty(d->nodemask))
@@ -6918,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
6918 7316
6919 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7317 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
6920 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7318 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7319 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
6921 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7320 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
6922 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7321 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
6923 } 7322 }
6924 7323
6925 for_each_cpu(i, cpu_map) { 7324 for_each_cpu(i, cpu_map) {
6926 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7325 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7326 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
6927 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7327 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
6928 } 7328 }
6929 7329
@@ -6954,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
6954 init_sched_groups_power(i, sd); 7354 init_sched_groups_power(i, sd);
6955 } 7355 }
6956#endif 7356#endif
7357#ifdef CONFIG_SCHED_BOOK
7358 for_each_cpu(i, cpu_map) {
7359 sd = &per_cpu(book_domains, i).sd;
7360 init_sched_groups_power(i, sd);
7361 }
7362#endif
6957 7363
6958 for_each_cpu(i, cpu_map) { 7364 for_each_cpu(i, cpu_map) {
6959 sd = &per_cpu(phys_domains, i).sd; 7365 sd = &per_cpu(phys_domains, i).sd;
@@ -6979,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
6979 sd = &per_cpu(cpu_domains, i).sd; 7385 sd = &per_cpu(cpu_domains, i).sd;
6980#elif defined(CONFIG_SCHED_MC) 7386#elif defined(CONFIG_SCHED_MC)
6981 sd = &per_cpu(core_domains, i).sd; 7387 sd = &per_cpu(core_domains, i).sd;
7388#elif defined(CONFIG_SCHED_BOOK)
7389 sd = &per_cpu(book_domains, i).sd;
6982#else 7390#else
6983 sd = &per_cpu(phys_domains, i).sd; 7391 sd = &per_cpu(phys_domains, i).sd;
6984#endif 7392#endif
@@ -7288,29 +7696,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7288} 7696}
7289#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7697#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7290 7698
7291#ifndef CONFIG_CPUSETS
7292/* 7699/*
7293 * Add online and remove offline CPUs from the scheduler domains. 7700 * Update cpusets according to cpu_active mask. If cpusets are
7294 * When cpusets are enabled they take over this function. 7701 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7702 * around partition_sched_domains().
7295 */ 7703 */
7296static int update_sched_domains(struct notifier_block *nfb, 7704static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7297 unsigned long action, void *hcpu) 7705 void *hcpu)
7298{ 7706{
7299 switch (action) { 7707 switch (action & ~CPU_TASKS_FROZEN) {
7300 case CPU_ONLINE: 7708 case CPU_ONLINE:
7301 case CPU_ONLINE_FROZEN:
7302 case CPU_DOWN_PREPARE:
7303 case CPU_DOWN_PREPARE_FROZEN:
7304 case CPU_DOWN_FAILED: 7709 case CPU_DOWN_FAILED:
7305 case CPU_DOWN_FAILED_FROZEN: 7710 cpuset_update_active_cpus();
7306 partition_sched_domains(1, NULL, NULL);
7307 return NOTIFY_OK; 7711 return NOTIFY_OK;
7712 default:
7713 return NOTIFY_DONE;
7714 }
7715}
7308 7716
7717static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7718 void *hcpu)
7719{
7720 switch (action & ~CPU_TASKS_FROZEN) {
7721 case CPU_DOWN_PREPARE:
7722 cpuset_update_active_cpus();
7723 return NOTIFY_OK;
7309 default: 7724 default:
7310 return NOTIFY_DONE; 7725 return NOTIFY_DONE;
7311 } 7726 }
7312} 7727}
7313#endif
7314 7728
7315static int update_runtime(struct notifier_block *nfb, 7729static int update_runtime(struct notifier_block *nfb,
7316 unsigned long action, void *hcpu) 7730 unsigned long action, void *hcpu)
@@ -7356,10 +7770,8 @@ void __init sched_init_smp(void)
7356 mutex_unlock(&sched_domains_mutex); 7770 mutex_unlock(&sched_domains_mutex);
7357 put_online_cpus(); 7771 put_online_cpus();
7358 7772
7359#ifndef CONFIG_CPUSETS 7773 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7360 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7774 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7361 hotcpu_notifier(update_sched_domains, 0);
7362#endif
7363 7775
7364 /* RT runtime code needs to handle some hotplug events */ 7776 /* RT runtime code needs to handle some hotplug events */
7365 hotcpu_notifier(update_runtime, 0); 7777 hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +8016,9 @@ void __init sched_init(void)
7604 8016
7605 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8017 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7606 rq->cpu_load[j] = 0; 8018 rq->cpu_load[j] = 0;
8019
8020 rq->last_load_update_tick = jiffies;
8021
7607#ifdef CONFIG_SMP 8022#ifdef CONFIG_SMP
7608 rq->sd = NULL; 8023 rq->sd = NULL;
7609 rq->rd = NULL; 8024 rq->rd = NULL;
@@ -7617,6 +8032,10 @@ void __init sched_init(void)
7617 rq->idle_stamp = 0; 8032 rq->idle_stamp = 0;
7618 rq->avg_idle = 2*sysctl_sched_migration_cost; 8033 rq->avg_idle = 2*sysctl_sched_migration_cost;
7619 rq_attach_root(rq, &def_root_domain); 8034 rq_attach_root(rq, &def_root_domain);
8035#ifdef CONFIG_NO_HZ
8036 rq->nohz_balance_kick = 0;
8037 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8038#endif
7620#endif 8039#endif
7621 init_rq_hrtick(rq); 8040 init_rq_hrtick(rq);
7622 atomic_set(&rq->nr_iowait, 0); 8041 atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +8080,11 @@ void __init sched_init(void)
7661 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8080 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7662#ifdef CONFIG_SMP 8081#ifdef CONFIG_SMP
7663#ifdef CONFIG_NO_HZ 8082#ifdef CONFIG_NO_HZ
7664 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 8083 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7665 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 8084 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8085 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8086 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8087 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7666#endif 8088#endif
7667 /* May be allocated at isolcpus cmdline parse time */ 8089 /* May be allocated at isolcpus cmdline parse time */
7668 if (cpu_isolated_map == NULL) 8090 if (cpu_isolated_map == NULL)
@@ -7869,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7869 8291
7870 return 1; 8292 return 1;
7871 8293
7872 err_free_rq: 8294err_free_rq:
7873 kfree(cfs_rq); 8295 kfree(cfs_rq);
7874 err: 8296err:
7875 return 0; 8297 return 0;
7876} 8298}
7877 8299
@@ -7959,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
7959 8381
7960 return 1; 8382 return 1;
7961 8383
7962 err_free_rq: 8384err_free_rq:
7963 kfree(rt_rq); 8385 kfree(rt_rq);
7964 err: 8386err:
7965 return 0; 8387 return 0;
7966} 8388}
7967 8389
@@ -8319,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8319 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8741 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8320 } 8742 }
8321 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8743 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8322 unlock: 8744unlock:
8323 read_unlock(&tasklist_lock); 8745 read_unlock(&tasklist_lock);
8324 mutex_unlock(&rt_constraints_mutex); 8746 mutex_unlock(&rt_constraints_mutex);
8325 8747
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 906a0f718cb3..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
10 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 *
14 * - gtod 14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
42 * - GTOD (clock monotonic)
15 * - sched_clock() 43 * - sched_clock()
16 * - explicit idle events 44 * - explicit idle events
17 * 45 *
18 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
19 * making it monotonic and keeping it within an expected window. 47 * deltas are filtered to provide monotonicity and to keep it within an
48 * expected window.
20 * 49 *
21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
22 * that is otherwise invisible (TSC gets stopped). 51 * that is otherwise invisible (TSC gets stopped).
23 * 52 *
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 53 *
25 * consistent between cpus (never more than 2 jiffies difference). 54 * Notes:
55 *
56 * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59 * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
26 */ 62 */
27#include <linux/spinlock.h> 63#include <linux/spinlock.h>
28#include <linux/hardirq.h> 64#include <linux/hardirq.h>
@@ -170,6 +206,11 @@ again:
170 return val; 206 return val;
171} 207}
172 208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
173u64 sched_clock_cpu(int cpu) 214u64 sched_clock_cpu(int cpu)
174{ 215{
175 struct sched_clock_data *scd; 216 struct sched_clock_data *scd;
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
237} 278}
238EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
239 280
240unsigned long long cpu_clock(int cpu) 281/*
282 * As outlined at the top, provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !! #
289 * ####################################################################
290 */
291u64 cpu_clock(int cpu)
241{ 292{
242 unsigned long long clock; 293 u64 clock;
243 unsigned long flags; 294 unsigned long flags;
244 295
245 local_irq_save(flags); 296 local_irq_save(flags);
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
249 return clock; 300 return clock;
250} 301}
251 302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
305 * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312 u64 clock;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 clock = sched_clock_cpu(smp_processor_id());
317 local_irq_restore(flags);
318
319 return clock;
320}
321
252#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
253 323
254void sched_clock_init(void) 324void sched_clock_init(void)
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
264 return sched_clock(); 334 return sched_clock();
265} 335}
266 336
267 337u64 cpu_clock(int cpu)
268unsigned long long cpu_clock(int cpu)
269{ 338{
270 return sched_clock_cpu(cpu); 339 return sched_clock_cpu(cpu);
271} 340}
272 341
342u64 local_clock(void)
343{
344 return sched_clock_cpu(0);
345}
346
273#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
274 348
275EXPORT_SYMBOL_GPL(cpu_clock); 349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
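A usage sketch for the interfaces documented above, in kernel context and assuming the local_clock()/cpu_clock() declarations exported through <linux/sched.h> by this series: deltas taken on one CPU are safe to subtract, while cross-CPU comparisons can go backwards, as the warning says.

#include <linux/kernel.h>
#include <linux/sched.h>

static void time_section(void)
{
	u64 t0, t1;

	t0 = local_clock();		/* nanoseconds, usable from any context */
	/* ... the work being measured ... */
	t1 = local_clock();

	pr_info("section took %llu ns\n", (unsigned long long)(t1 - t0));
}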
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6871cb3fc83..2722dc1b4138 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
166 * 166 *
167 * Returns: -ENOMEM if memory fails. 167 * Returns: -ENOMEM if memory fails.
168 */ 168 */
169int cpupri_init(struct cpupri *cp, bool bootmem) 169int cpupri_init(struct cpupri *cp)
170{ 170{
171 gfp_t gfp = GFP_KERNEL;
172 int i; 171 int i;
173 172
174 if (bootmem)
175 gfp = GFP_NOWAIT;
176
177 memset(cp, 0, sizeof(*cp)); 173 memset(cp, 0, sizeof(*cp));
178 174
179 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
181 177
182 raw_spin_lock_init(&vec->lock); 178 raw_spin_lock_init(&vec->lock);
183 vec->count = 0; 179 vec->count = 0;
184 if (!zalloc_cpumask_var(&vec->mask, gfp)) 180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
185 goto cleanup; 181 goto cleanup;
186 } 182 }
187 183
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 7cb5bb6b95be..9fc7d386fea4 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -27,7 +27,7 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, struct cpumask *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
32#else 32#else
33#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 35565395d00d..2e1b0d17dd9b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
332 PN(sysctl_sched_latency); 332 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 333 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 334 PN(sysctl_sched_wakeup_granularity);
335 PN(sysctl_sched_child_runs_first); 335 P(sysctl_sched_child_runs_first);
336 P(sysctl_sched_features); 336 P(sysctl_sched_features);
337#undef PN 337#undef PN
338#undef P 338#undef P
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..933f3d1b62ea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
25 25
26/* 26/*
27 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
28 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) 28 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
29 * 29 *
30 * NOTE: this latency value is not the same as the concept of 30 * NOTE: this latency value is not the same as the concept of
31 * 'timeslice length' - timeslices in CFS are of variable length 31 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 2000000ULL; 57unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 3; 63static unsigned int sched_nr_latency = 8;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
519static void update_curr(struct cfs_rq *cfs_rq) 519static void update_curr(struct cfs_rq *cfs_rq)
520{ 520{
521 struct sched_entity *curr = cfs_rq->curr; 521 struct sched_entity *curr = cfs_rq->curr;
522 u64 now = rq_of(cfs_rq)->clock; 522 u64 now = rq_of(cfs_rq)->clock_task;
523 unsigned long delta_exec; 523 unsigned long delta_exec;
524 524
525 if (unlikely(!curr)) 525 if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602 /* 602 /*
603 * We are starting a new run period: 603 * We are starting a new run period:
604 */ 604 */
605 se->exec_start = rq_of(cfs_rq)->clock; 605 se->exec_start = rq_of(cfs_rq)->clock_task;
606} 606}
607 607
608/************************************************** 608/**************************************************
@@ -1313,7 +1313,7 @@ static struct sched_group *
1313find_idlest_group(struct sched_domain *sd, struct task_struct *p, 1313find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1314 int this_cpu, int load_idx) 1314 int this_cpu, int load_idx)
1315{ 1315{
1316 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1316 struct sched_group *idlest = NULL, *group = sd->groups;
1317 unsigned long min_load = ULONG_MAX, this_load = 0; 1317 unsigned long min_load = ULONG_MAX, this_load = 0;
1318 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1318 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1319 1319
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1348 1348
1349 if (local_group) { 1349 if (local_group) {
1350 this_load = avg_load; 1350 this_load = avg_load;
1351 this = group;
1352 } else if (avg_load < min_load) { 1351 } else if (avg_load < min_load) {
1353 min_load = avg_load; 1352 min_load = avg_load;
1354 idlest = group; 1353 idlest = group;
@@ -1765,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1765 set_task_cpu(p, this_cpu); 1764 set_task_cpu(p, this_cpu);
1766 activate_task(this_rq, p, 0); 1765 activate_task(this_rq, p, 0);
1767 check_preempt_curr(this_rq, p, 0); 1766 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1768} 1771}
1769 1772
1770/* 1773/*
@@ -1799,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1799 * 2) too many balance attempts have failed. 1802 * 2) too many balance attempts have failed.
1800 */ 1803 */
1801 1804
1802 tsk_cache_hot = task_hot(p, rq->clock, sd); 1805 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1803 if (!tsk_cache_hot || 1806 if (!tsk_cache_hot ||
1804 sd->nr_balance_failed > sd->cache_nice_tries) { 1807 sd->nr_balance_failed > sd->cache_nice_tries) {
1805#ifdef CONFIG_SCHEDSTATS 1808#ifdef CONFIG_SCHEDSTATS
@@ -2031,12 +2034,14 @@ struct sd_lb_stats {
2031 unsigned long this_load; 2034 unsigned long this_load;
2032 unsigned long this_load_per_task; 2035 unsigned long this_load_per_task;
2033 unsigned long this_nr_running; 2036 unsigned long this_nr_running;
2037 unsigned long this_has_capacity;
2034 2038
2035 /* Statistics of the busiest group */ 2039 /* Statistics of the busiest group */
2036 unsigned long max_load; 2040 unsigned long max_load;
2037 unsigned long busiest_load_per_task; 2041 unsigned long busiest_load_per_task;
2038 unsigned long busiest_nr_running; 2042 unsigned long busiest_nr_running;
2039 unsigned long busiest_group_capacity; 2043 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity;
2040 2045
2041 int group_imb; /* Is there imbalance in this sd */ 2046 int group_imb; /* Is there imbalance in this sd */
2042#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2059,6 +2064,7 @@ struct sg_lb_stats {
2059 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2060 unsigned long group_capacity; 2065 unsigned long group_capacity;
2061 int group_imb; /* Is there an imbalance in the group ? */ 2066 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */
2062}; 2068};
2063 2069
2064/** 2070/**
@@ -2268,10 +2274,14 @@ unsigned long scale_rt_power(int cpu)
2268 struct rq *rq = cpu_rq(cpu); 2274 struct rq *rq = cpu_rq(cpu);
2269 u64 total, available; 2275 u64 total, available;
2270 2276
2271 sched_avg_update(rq);
2272
2273 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2277 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2274 available = total - rq->rt_avg; 2278
2279 if (unlikely(total < rq->rt_avg)) {
2280 /* Ensures that power won't end up being negative */
2281 available = 0;
2282 } else {
2283 available = total - rq->rt_avg;
2284 }
2275 2285
2276 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2286 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2277 total = SCHED_LOAD_SCALE; 2287 total = SCHED_LOAD_SCALE;
@@ -2287,13 +2297,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2287 unsigned long power = SCHED_LOAD_SCALE; 2297 unsigned long power = SCHED_LOAD_SCALE;
2288 struct sched_group *sdg = sd->groups; 2298 struct sched_group *sdg = sd->groups;
2289 2299
2290 if (sched_feat(ARCH_POWER))
2291 power *= arch_scale_freq_power(sd, cpu);
2292 else
2293 power *= default_scale_freq_power(sd, cpu);
2294
2295 power >>= SCHED_LOAD_SHIFT;
2296
2297 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2300 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2298 if (sched_feat(ARCH_POWER)) 2301 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_smt_power(sd, cpu); 2302 power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2306,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2303 power >>= SCHED_LOAD_SHIFT; 2306 power >>= SCHED_LOAD_SHIFT;
2304 } 2307 }
2305 2308
2309 sdg->cpu_power_orig = power;
2310
2311 if (sched_feat(ARCH_POWER))
2312 power *= arch_scale_freq_power(sd, cpu);
2313 else
2314 power *= default_scale_freq_power(sd, cpu);
2315
2316 power >>= SCHED_LOAD_SHIFT;
2317
2306 power *= scale_rt_power(cpu); 2318 power *= scale_rt_power(cpu);
2307 power >>= SCHED_LOAD_SHIFT; 2319 power >>= SCHED_LOAD_SHIFT;
2308 2320
@@ -2335,6 +2347,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2335 sdg->cpu_power = power; 2347 sdg->cpu_power = power;
2336} 2348}
2337 2349
2350/*
2351 * Try and fix up capacity for tiny siblings; this is needed when
2352 * things like SD_ASYM_PACKING need f_b_g to select another sibling
2353 * which on its own isn't powerful enough.
2354 *
2355 * See update_sd_pick_busiest() and check_asym_packing().
2356 */
2357static inline int
2358fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2359{
2360 /*
2361 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2362 */
2363 if (sd->level != SD_LV_SIBLING)
2364 return 0;
2365
2366 /*
2367 * If ~90% of the cpu_power is still there, we're good.
2368 */
2369 if (group->cpu_power * 32 > group->cpu_power_orig * 29)
2370 return 1;
2371
2372 return 0;
2373}
2374
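The 32/29 comparison above is a fixed-point way of asking whether at least 29/32 (about 90.6%) of the group's original cpu_power survived scaling. A tiny standalone check with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long orig = 1024;	/* cpu_power_orig of the sibling group */
	unsigned long now  = 930;	/* cpu_power left after rt/freq scaling */

	/* 29/32 == 0.90625, so this only passes above ~90.6% of orig */
	if (now * 32 > orig * 29)
		printf("capacity fixed up to 1\n");
	else
		printf("capacity stays 0\n");
	return 0;
}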
2338/** 2375/**
2339 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 2376 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2340 * @sd: The sched_domain whose statistics are to be updated. 2377 * @sd: The sched_domain whose statistics are to be updated.
@@ -2354,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2354 int local_group, const struct cpumask *cpus, 2391 int local_group, const struct cpumask *cpus,
2355 int *balance, struct sg_lb_stats *sgs) 2392 int *balance, struct sg_lb_stats *sgs)
2356{ 2393{
2357 unsigned long load, max_cpu_load, min_cpu_load; 2394 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2358 int i; 2395 int i;
2359 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2396 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2360 unsigned long avg_load_per_task = 0; 2397 unsigned long avg_load_per_task = 0;
@@ -2365,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2365 /* Tally up the load of all CPUs in the group */ 2402 /* Tally up the load of all CPUs in the group */
2366 max_cpu_load = 0; 2403 max_cpu_load = 0;
2367 min_cpu_load = ~0UL; 2404 min_cpu_load = ~0UL;
2405 max_nr_running = 0;
2368 2406
2369 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2407 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2370 struct rq *rq = cpu_rq(i); 2408 struct rq *rq = cpu_rq(i);
@@ -2382,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2382 load = target_load(i, load_idx); 2420 load = target_load(i, load_idx);
2383 } else { 2421 } else {
2384 load = source_load(i, load_idx); 2422 load = source_load(i, load_idx);
2385 if (load > max_cpu_load) 2423 if (load > max_cpu_load) {
2386 max_cpu_load = load; 2424 max_cpu_load = load;
2425 max_nr_running = rq->nr_running;
2426 }
2387 if (min_cpu_load > load) 2427 if (min_cpu_load > load)
2388 min_cpu_load = load; 2428 min_cpu_load = load;
2389 } 2429 }
@@ -2400,14 +2440,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2400 * domains. In the newly idle case, we will allow all the cpu's 2440 * domains. In the newly idle case, we will allow all the cpu's
2401 * to do the newly idle load balance. 2441 * to do the newly idle load balance.
2402 */ 2442 */
2403 if (idle != CPU_NEWLY_IDLE && local_group && 2443 if (idle != CPU_NEWLY_IDLE && local_group) {
2404 balance_cpu != this_cpu) { 2444 if (balance_cpu != this_cpu) {
2405 *balance = 0; 2445 *balance = 0;
2406 return; 2446 return;
2447 }
2448 update_group_power(sd, this_cpu);
2407 } 2449 }
2408 2450
2409 update_group_power(sd, this_cpu);
2410
2411 /* Adjust by relative CPU power of the group */ 2451 /* Adjust by relative CPU power of the group */
2412 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2452 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2413 2453
@@ -2423,11 +2463,58 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2423 if (sgs->sum_nr_running) 2463 if (sgs->sum_nr_running)
2424 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2464 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2425 2465
2426 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 2466 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
2427 sgs->group_imb = 1; 2467 sgs->group_imb = 1;
2428 2468
2429 sgs->group_capacity = 2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2430 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2470 if (!sgs->group_capacity)
2471 sgs->group_capacity = fix_small_capacity(sd, group);
2472
2473 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1;
2475}
2476
2477/**
2478 * update_sd_pick_busiest - return 1 on busiest group
2479 * @sd: sched_domain whose statistics are to be checked
2480 * @sds: sched_domain statistics
2481 * @sg: sched_group candidate to be checked for being the busiest
2482 * @sgs: sched_group statistics
2483 * @this_cpu: the current cpu
2484 *
2485 * Determine if @sg is a busier group than the previously selected
2486 * busiest group.
2487 */
2488static bool update_sd_pick_busiest(struct sched_domain *sd,
2489 struct sd_lb_stats *sds,
2490 struct sched_group *sg,
2491 struct sg_lb_stats *sgs,
2492 int this_cpu)
2493{
2494 if (sgs->avg_load <= sds->max_load)
2495 return false;
2496
2497 if (sgs->sum_nr_running > sgs->group_capacity)
2498 return true;
2499
2500 if (sgs->group_imb)
2501 return true;
2502
2503 /*
2504 * ASYM_PACKING needs to move all the work to the lowest
2505 * numbered CPUs in the group, therefore mark all groups
2506 * higher than ourself as busy.
2507 */
2508 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2509 this_cpu < group_first_cpu(sg)) {
2510 if (!sds->busiest)
2511 return true;
2512
2513 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2514 return true;
2515 }
2516
2517 return false;
2431} 2518}
2432 2519
2433/** 2520/**
@@ -2435,7 +2522,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2435 * @sd: sched_domain whose statistics are to be updated. 2522 * @sd: sched_domain whose statistics are to be updated.
2436 * @this_cpu: Cpu for which load balance is currently performed. 2523 * @this_cpu: Cpu for which load balance is currently performed.
2437 * @idle: Idle status of this_cpu 2524 * @idle: Idle status of this_cpu
2438 * @sd_idle: Idle status of the sched_domain containing group. 2525 * @sd_idle: Idle status of the sched_domain containing sg.
2439 * @cpus: Set of cpus considered for load balancing. 2526 * @cpus: Set of cpus considered for load balancing.
2440 * @balance: Should we balance. 2527 * @balance: Should we balance.
2441 * @sds: variable to hold the statistics for this sched_domain. 2528 * @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2533,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2446 struct sd_lb_stats *sds) 2533 struct sd_lb_stats *sds)
2447{ 2534{
2448 struct sched_domain *child = sd->child; 2535 struct sched_domain *child = sd->child;
2449 struct sched_group *group = sd->groups; 2536 struct sched_group *sg = sd->groups;
2450 struct sg_lb_stats sgs; 2537 struct sg_lb_stats sgs;
2451 int load_idx, prefer_sibling = 0; 2538 int load_idx, prefer_sibling = 0;
2452 2539
@@ -2459,45 +2546,100 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2459 do { 2546 do {
2460 int local_group; 2547 int local_group;
2461 2548
2462 local_group = cpumask_test_cpu(this_cpu, 2549 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2463 sched_group_cpus(group));
2464 memset(&sgs, 0, sizeof(sgs)); 2550 memset(&sgs, 0, sizeof(sgs));
2465 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 2551 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
2466 local_group, cpus, balance, &sgs); 2552 local_group, cpus, balance, &sgs);
2467 2553
2468 if (local_group && !(*balance)) 2554 if (local_group && !(*balance))
2469 return; 2555 return;
2470 2556
2471 sds->total_load += sgs.group_load; 2557 sds->total_load += sgs.group_load;
2472 sds->total_pwr += group->cpu_power; 2558 sds->total_pwr += sg->cpu_power;
2473 2559
2474 /* 2560 /*
2475 * In case the child domain prefers tasks go to siblings 2561 * In case the child domain prefers tasks go to siblings
2476 * first, lower the group capacity to one so that we'll try 2562 * first, lower the sg capacity to one so that we'll try
2477 * and move all the excess tasks away. 2563 * and move all the excess tasks away. We lower the capacity
2564 * of a group only if the local group has the capacity to fit
2565 * these excess tasks, i.e. nr_running < group_capacity. The
2566 * extra check prevents the case where you always pull from the
2567 * heaviest group when it is already under-utilized (possible
2568 * with a large weight task outweighs the tasks on the system).
2478 */ 2569 */
2479 if (prefer_sibling) 2570 if (prefer_sibling && !local_group && sds->this_has_capacity)
2480 sgs.group_capacity = min(sgs.group_capacity, 1UL); 2571 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2481 2572
2482 if (local_group) { 2573 if (local_group) {
2483 sds->this_load = sgs.avg_load; 2574 sds->this_load = sgs.avg_load;
2484 sds->this = group; 2575 sds->this = sg;
2485 sds->this_nr_running = sgs.sum_nr_running; 2576 sds->this_nr_running = sgs.sum_nr_running;
2486 sds->this_load_per_task = sgs.sum_weighted_load; 2577 sds->this_load_per_task = sgs.sum_weighted_load;
2487 } else if (sgs.avg_load > sds->max_load && 2578 sds->this_has_capacity = sgs.group_has_capacity;
2488 (sgs.sum_nr_running > sgs.group_capacity || 2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2489 sgs.group_imb)) {
2490 sds->max_load = sgs.avg_load; 2580 sds->max_load = sgs.avg_load;
2491 sds->busiest = group; 2581 sds->busiest = sg;
2492 sds->busiest_nr_running = sgs.sum_nr_running; 2582 sds->busiest_nr_running = sgs.sum_nr_running;
2493 sds->busiest_group_capacity = sgs.group_capacity; 2583 sds->busiest_group_capacity = sgs.group_capacity;
2494 sds->busiest_load_per_task = sgs.sum_weighted_load; 2584 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity;
2495 sds->group_imb = sgs.group_imb; 2586 sds->group_imb = sgs.group_imb;
2496 } 2587 }
2497 2588
2498 update_sd_power_savings_stats(group, sds, local_group, &sgs); 2589 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2499 group = group->next; 2590 sg = sg->next;
2500 } while (group != sd->groups); 2591 } while (sg != sd->groups);
2592}
2593
2594int __weak arch_sd_sibling_asym_packing(void)
2595{
2596 return 0*SD_ASYM_PACKING;
2597}
2598
2599/**
2600 * check_asym_packing - Check to see if the group is packed into the
 2601 * sched domain.
2602 *
 2603 * This is primarily intended to be used at the sibling level. Some
2604 * cores like POWER7 prefer to use lower numbered SMT threads. In the
2605 * case of POWER7, it can move to lower SMT modes only when higher
2606 * threads are idle. When in lower SMT modes, the threads will
 2607 * perform better since they share fewer core resources. Hence when we
2608 * have idle threads, we want them to be the higher ones.
2609 *
2610 * This packing function is run on idle threads. It checks to see if
2611 * the busiest CPU in this domain (core in the P7 case) has a higher
 2612 * CPU number than the CPU the packing function is being run on. Here we are
 2613 * assuming a lower CPU number will be equivalent to a lower SMT thread
2614 * number.
2615 *
2616 * Returns 1 when packing is required and a task should be moved to
2617 * this CPU. The amount of the imbalance is returned in *imbalance.
2618 *
2619 * @sd: The sched_domain whose packing is to be checked.
2620 * @sds: Statistics of the sched_domain which is to be packed
2621 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 2622 * @imbalance: returns amount of imbalance due to packing.
2623 */
2624static int check_asym_packing(struct sched_domain *sd,
2625 struct sd_lb_stats *sds,
2626 int this_cpu, unsigned long *imbalance)
2627{
2628 int busiest_cpu;
2629
2630 if (!(sd->flags & SD_ASYM_PACKING))
2631 return 0;
2632
2633 if (!sds->busiest)
2634 return 0;
2635
2636 busiest_cpu = group_first_cpu(sds->busiest);
2637 if (this_cpu > busiest_cpu)
2638 return 0;
2639
2640 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2641 SCHED_LOAD_SCALE);
2642 return 1;
2501} 2643}
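
check_asym_packing() above only fires when this CPU is numbered lower than the first CPU of the busiest group, and it sizes the imbalance as max_load scaled by the group's cpu_power. Below is a minimal user-space sketch of just that arithmetic; SCHED_LOAD_SCALE is assumed to be 1024, the SD_ASYM_PACKING flag and NULL checks are omitted, and the struct and names are illustrative rather than the kernel's own.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

struct busiest_stats {
	unsigned long max_load;   /* avg load of the busiest group */
	unsigned long cpu_power;  /* capacity of the busiest group */
	int first_cpu;            /* lowest CPU number in that group */
};

/* Return 1 (and the imbalance) when a lower-numbered idle CPU should pull. */
static int asym_pack_check(int this_cpu, const struct busiest_stats *b,
			   unsigned long *imbalance)
{
	if (this_cpu > b->first_cpu)
		return 0;	/* only lower-numbered CPUs pack */

	*imbalance = DIV_ROUND_CLOSEST(b->max_load * b->cpu_power,
				       SCHED_LOAD_SCALE);
	return 1;
}

int main(void)
{
	struct busiest_stats b = { .max_load = 1536, .cpu_power = 1024,
				   .first_cpu = 3 };
	unsigned long imb;

	if (asym_pack_check(0, &b, &imb))
		printf("pack: move ~%lu load units to CPU0\n", imb);
	return 0;
}
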
2502 2644
2503/** 2645/**
@@ -2637,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2637 return fix_small_imbalance(sds, this_cpu, imbalance); 2779 return fix_small_imbalance(sds, this_cpu, imbalance);
2638 2780
2639} 2781}
2782
2640/******* find_busiest_group() helpers end here *********************/ 2783/******* find_busiest_group() helpers end here *********************/
2641 2784
2642/** 2785/**
@@ -2688,13 +2831,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 2688 * 4) This group is more busy than the avg busyness at this 2831 * 4) This group is more busy than the avg busyness at this
2689 * sched_domain. 2832 * sched_domain.
2690 * 5) The imbalance is within the specified limit. 2833 * 5) The imbalance is within the specified limit.
2834 *
2835 * Note: when doing newidle balance, if the local group has excess
2836 * capacity (i.e. nr_running < group_capacity) and the busiest group
2837 * does not have any capacity, we force a load balance to pull tasks
2838 * to the local group. In this case, we skip past checks 3, 4 and 5.
2691 */ 2839 */
2692 if (!(*balance)) 2840 if (!(*balance))
2693 goto ret; 2841 goto ret;
2694 2842
2843 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
2844 check_asym_packing(sd, &sds, this_cpu, imbalance))
2845 return sds.busiest;
2846
2695 if (!sds.busiest || sds.busiest_nr_running == 0) 2847 if (!sds.busiest || sds.busiest_nr_running == 0)
2696 goto out_balanced; 2848 goto out_balanced;
2697 2849
2850 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
2851 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
2852 !sds.busiest_has_capacity)
2853 goto force_balance;
2854
2698 if (sds.this_load >= sds.max_load) 2855 if (sds.this_load >= sds.max_load)
2699 goto out_balanced; 2856 goto out_balanced;
2700 2857
@@ -2706,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2706 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2707 goto out_balanced; 2864 goto out_balanced;
2708 2865
2866force_balance:
2709 /* Looks like there is an imbalance. Compute it */ 2867 /* Looks like there is an imbalance. Compute it */
2710 calculate_imbalance(&sds, this_cpu, imbalance); 2868 calculate_imbalance(&sds, this_cpu, imbalance);
2711 return sds.busiest; 2869 return sds.busiest;
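
The force_balance path above lets a newly-idle CPU pull even when the usual load thresholds would report "balanced", provided the local group still has capacity and the busiest group has none. A compact user-space model of that ordering follows; the *balance and asym-packing checks are left out, and the struct is only a stand-in for sd_lb_stats.

#include <stdio.h>

enum { CPU_NOT_IDLE, CPU_IDLE, CPU_NEWLY_IDLE };

struct lb_stats {
	unsigned long this_load, max_load;
	unsigned int busiest_nr_running;
	int this_has_capacity, busiest_has_capacity, have_busiest;
};

/* Return 1 if we should pull from the busiest group. */
static int should_balance(const struct lb_stats *s, int idle,
			  unsigned int imbalance_pct)
{
	if (!s->have_busiest || !s->busiest_nr_running)
		return 0;

	/* newidle balance trumps the thresholds when the local group
	 * still has room and the busiest group has none. */
	if (idle == CPU_NEWLY_IDLE && s->this_has_capacity &&
	    !s->busiest_has_capacity)
		return 1;

	if (s->this_load >= s->max_load)
		return 0;

	/* the busiest group must exceed us by imbalance_pct/100 */
	if (100 * s->max_load <= imbalance_pct * s->this_load)
		return 0;

	return 1;
}

int main(void)
{
	struct lb_stats s = { .this_load = 800, .max_load = 900,
			      .busiest_nr_running = 3, .this_has_capacity = 1,
			      .busiest_has_capacity = 0, .have_busiest = 1 };

	/* pulls despite 900 being under the 125% threshold of 800 */
	printf("balance: %d\n", should_balance(&s, CPU_NEWLY_IDLE, 125));
	return 0;
}
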
@@ -2726,8 +2884,9 @@ ret:
2726 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2884 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2727 */ 2885 */
2728static struct rq * 2886static struct rq *
2729find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2887find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2730 unsigned long imbalance, const struct cpumask *cpus) 2888 enum cpu_idle_type idle, unsigned long imbalance,
2889 const struct cpumask *cpus)
2731{ 2890{
2732 struct rq *busiest = NULL, *rq; 2891 struct rq *busiest = NULL, *rq;
2733 unsigned long max_load = 0; 2892 unsigned long max_load = 0;
@@ -2738,6 +2897,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2738 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 2897 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2739 unsigned long wl; 2898 unsigned long wl;
2740 2899
2900 if (!capacity)
2901 capacity = fix_small_capacity(sd, group);
2902
2741 if (!cpumask_test_cpu(i, cpus)) 2903 if (!cpumask_test_cpu(i, cpus))
2742 continue; 2904 continue;
2743 2905
@@ -2777,9 +2939,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2777/* Working cpumask for load_balance and load_balance_newidle. */ 2939/* Working cpumask for load_balance and load_balance_newidle. */
2778static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 2940static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2779 2941
2780static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) 2942static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2943 int busiest_cpu, int this_cpu)
2781{ 2944{
2782 if (idle == CPU_NEWLY_IDLE) { 2945 if (idle == CPU_NEWLY_IDLE) {
2946
2947 /*
2948 * ASYM_PACKING needs to force migrate tasks from busy but
2949 * higher numbered CPUs in order to pack all tasks in the
2950 * lowest numbered CPUs.
2951 */
2952 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
2953 return 1;
2954
2783 /* 2955 /*
2784 * The only task running in a non-idle cpu can be moved to this 2956 * The only task running in a non-idle cpu can be moved to this
2785 * cpu in an attempt to completely freeup the other CPU 2957 * cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +3026,7 @@ redo:
2854 goto out_balanced; 3026 goto out_balanced;
2855 } 3027 }
2856 3028
2857 busiest = find_busiest_queue(group, idle, imbalance, cpus); 3029 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
2858 if (!busiest) { 3030 if (!busiest) {
2859 schedstat_inc(sd, lb_nobusyq[idle]); 3031 schedstat_inc(sd, lb_nobusyq[idle]);
2860 goto out_balanced; 3032 goto out_balanced;
@@ -2896,9 +3068,17 @@ redo:
2896 3068
2897 if (!ld_moved) { 3069 if (!ld_moved) {
2898 schedstat_inc(sd, lb_failed[idle]); 3070 schedstat_inc(sd, lb_failed[idle]);
2899 sd->nr_balance_failed++; 3071 /*
3072 * Increment the failure counter only on periodic balance.
3073 * We do not want newidle balance, which can be very
 3074 * frequent, to pollute the failure counter, causing
3075 * excessive cache_hot migrations and active balances.
3076 */
3077 if (idle != CPU_NEWLY_IDLE)
3078 sd->nr_balance_failed++;
2900 3079
2901 if (need_active_balance(sd, sd_idle, idle)) { 3080 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3081 this_cpu)) {
2902 raw_spin_lock_irqsave(&busiest->lock, flags); 3082 raw_spin_lock_irqsave(&busiest->lock, flags);
2903 3083
2904 /* don't kick the active_load_balance_cpu_stop, 3084 /* don't kick the active_load_balance_cpu_stop,
@@ -3017,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3017 interval = msecs_to_jiffies(sd->balance_interval); 3197 interval = msecs_to_jiffies(sd->balance_interval);
3018 if (time_after(next_balance, sd->last_balance + interval)) 3198 if (time_after(next_balance, sd->last_balance + interval))
3019 next_balance = sd->last_balance + interval; 3199 next_balance = sd->last_balance + interval;
3020 if (pulled_task) { 3200 if (pulled_task)
3021 this_rq->idle_stamp = 0;
3022 break; 3201 break;
3023 }
3024 } 3202 }
3025 3203
3026 raw_spin_lock(&this_rq->lock); 3204 raw_spin_lock(&this_rq->lock);
@@ -3093,13 +3271,40 @@ out_unlock:
3093} 3271}
3094 3272
3095#ifdef CONFIG_NO_HZ 3273#ifdef CONFIG_NO_HZ
3274
3275static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3276
3277static void trigger_sched_softirq(void *data)
3278{
3279 raise_softirq_irqoff(SCHED_SOFTIRQ);
3280}
3281
3282static inline void init_sched_softirq_csd(struct call_single_data *csd)
3283{
3284 csd->func = trigger_sched_softirq;
3285 csd->info = NULL;
3286 csd->flags = 0;
3287 csd->priv = 0;
3288}
3289
3290/*
3291 * idle load balancing details
3292 * - One of the idle CPUs nominates itself as idle load_balancer, while
3293 * entering idle.
3294 * - This idle load balancer CPU will also go into tickless mode when
 3295 * it is idle, just like all other idle CPUs.
 3296 * - When one of the busy CPUs notices that idle rebalancing may be
 3297 * needed, it will kick the idle load balancer, which then does idle
3298 * load balancing for all the idle CPUs.
3299 */
3096static struct { 3300static struct {
3097 atomic_t load_balancer; 3301 atomic_t load_balancer;
3098 cpumask_var_t cpu_mask; 3302 atomic_t first_pick_cpu;
3099 cpumask_var_t ilb_grp_nohz_mask; 3303 atomic_t second_pick_cpu;
3100} nohz ____cacheline_aligned = { 3304 cpumask_var_t idle_cpus_mask;
3101 .load_balancer = ATOMIC_INIT(-1), 3305 cpumask_var_t grp_idle_mask;
3102}; 3306 unsigned long next_balance; /* in jiffy units */
3307} nohz ____cacheline_aligned;
3103 3308
3104int get_nohz_load_balancer(void) 3309int get_nohz_load_balancer(void)
3105{ 3310{
@@ -3153,17 +3358,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3153 */ 3358 */
3154static inline int is_semi_idle_group(struct sched_group *ilb_group) 3359static inline int is_semi_idle_group(struct sched_group *ilb_group)
3155{ 3360{
3156 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 3361 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3157 sched_group_cpus(ilb_group)); 3362 sched_group_cpus(ilb_group));
3158 3363
3159 /* 3364 /*
 3160 * A sched_group is semi-idle when it has at least one busy cpu 3365 * A sched_group is semi-idle when it has at least one busy cpu
 3161 * and at least one idle cpu. 3366 * and at least one idle cpu.
3162 */ 3367 */
3163 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 3368 if (cpumask_empty(nohz.grp_idle_mask))
3164 return 0; 3369 return 0;
3165 3370
3166 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 3371 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3167 return 0; 3372 return 0;
3168 3373
3169 return 1; 3374 return 1;
@@ -3196,7 +3401,7 @@ static int find_new_ilb(int cpu)
3196 * Optimize for the case when we have no idle CPUs or only one 3401 * Optimize for the case when we have no idle CPUs or only one
3197 * idle CPU. Don't walk the sched_domain hierarchy in such cases 3402 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3198 */ 3403 */
3199 if (cpumask_weight(nohz.cpu_mask) < 2) 3404 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3200 goto out_done; 3405 goto out_done;
3201 3406
3202 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3407 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3409,7 @@ static int find_new_ilb(int cpu)
3204 3409
3205 do { 3410 do {
3206 if (is_semi_idle_group(ilb_group)) 3411 if (is_semi_idle_group(ilb_group))
3207 return cpumask_first(nohz.ilb_grp_nohz_mask); 3412 return cpumask_first(nohz.grp_idle_mask);
3208 3413
3209 ilb_group = ilb_group->next; 3414 ilb_group = ilb_group->next;
3210 3415
@@ -3212,98 +3417,116 @@ static int find_new_ilb(int cpu)
3212 } 3417 }
3213 3418
3214out_done: 3419out_done:
3215 return cpumask_first(nohz.cpu_mask); 3420 return nr_cpu_ids;
3216} 3421}
3217#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3422#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3218static inline int find_new_ilb(int call_cpu) 3423static inline int find_new_ilb(int call_cpu)
3219{ 3424{
3220 return cpumask_first(nohz.cpu_mask); 3425 return nr_cpu_ids;
3221} 3426}
3222#endif 3427#endif
3223 3428
3224/* 3429/*
3430 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 3431 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
3432 * CPU (if there is one).
3433 */
3434static void nohz_balancer_kick(int cpu)
3435{
3436 int ilb_cpu;
3437
3438 nohz.next_balance++;
3439
3440 ilb_cpu = get_nohz_load_balancer();
3441
3442 if (ilb_cpu >= nr_cpu_ids) {
3443 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3444 if (ilb_cpu >= nr_cpu_ids)
3445 return;
3446 }
3447
3448 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3449 struct call_single_data *cp;
3450
3451 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3452 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3453 __smp_call_function_single(ilb_cpu, cp, 0);
3454 }
3455 return;
3456}
3457
3458/*
3225 * This routine will try to nominate the ilb (idle load balancing) 3459 * This routine will try to nominate the ilb (idle load balancing)
3226 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 3460 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3227 * load balancing on behalf of all those cpus. If all the cpus in the system 3461 * load balancing on behalf of all those cpus.
3228 * go into this tickless mode, then there will be no ilb owner (as there is
3229 * no need for one) and all the cpus will sleep till the next wakeup event
3230 * arrives...
3231 *
3232 * For the ilb owner, tick is not stopped. And this tick will be used
3233 * for idle load balancing. ilb owner will still be part of
3234 * nohz.cpu_mask..
3235 * 3462 *
3236 * While stopping the tick, this cpu will become the ilb owner if there 3463 * When the ilb owner becomes busy, we will not have new ilb owner until some
3237 * is no other owner. And will be the owner till that cpu becomes busy 3464 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3238 * or if all cpus in the system stop their ticks at which point 3465 * idle load balancing by kicking one of the idle CPUs.
3239 * there is no need for ilb owner.
3240 * 3466 *
 3241 * When the ilb owner becomes busy, it nominates another owner, during the 3467 * Ticks are stopped for the ilb owner as well, with a busy CPU kicking this
 3242 * next busy scheduler_tick() 3468 * ilb owner CPU in the future (when there is a need for idle load balancing on
3469 * behalf of all idle CPUs).
3243 */ 3470 */
3244int select_nohz_load_balancer(int stop_tick) 3471void select_nohz_load_balancer(int stop_tick)
3245{ 3472{
3246 int cpu = smp_processor_id(); 3473 int cpu = smp_processor_id();
3247 3474
3248 if (stop_tick) { 3475 if (stop_tick) {
3249 cpu_rq(cpu)->in_nohz_recently = 1;
3250
3251 if (!cpu_active(cpu)) { 3476 if (!cpu_active(cpu)) {
3252 if (atomic_read(&nohz.load_balancer) != cpu) 3477 if (atomic_read(&nohz.load_balancer) != cpu)
3253 return 0; 3478 return;
3254 3479
3255 /* 3480 /*
3256 * If we are going offline and still the leader, 3481 * If we are going offline and still the leader,
3257 * give up! 3482 * give up!
3258 */ 3483 */
3259 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3484 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3485 nr_cpu_ids) != cpu)
3260 BUG(); 3486 BUG();
3261 3487
3262 return 0; 3488 return;
3263 } 3489 }
3264 3490
3265 cpumask_set_cpu(cpu, nohz.cpu_mask); 3491 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3266 3492
3267 /* time for ilb owner also to sleep */ 3493 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3268 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { 3494 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3269 if (atomic_read(&nohz.load_balancer) == cpu) 3495 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3270 atomic_set(&nohz.load_balancer, -1); 3496 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3271 return 0;
3272 }
3273 3497
3274 if (atomic_read(&nohz.load_balancer) == -1) { 3498 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3275 /* make me the ilb owner */
3276 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3277 return 1;
3278 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3279 int new_ilb; 3499 int new_ilb;
3280 3500
3281 if (!(sched_smt_power_savings || 3501 /* make me the ilb owner */
3282 sched_mc_power_savings)) 3502 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3283 return 1; 3503 cpu) != nr_cpu_ids)
3504 return;
3505
3284 /* 3506 /*
3285 * Check to see if there is a more power-efficient 3507 * Check to see if there is a more power-efficient
3286 * ilb. 3508 * ilb.
3287 */ 3509 */
3288 new_ilb = find_new_ilb(cpu); 3510 new_ilb = find_new_ilb(cpu);
3289 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 3511 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3290 atomic_set(&nohz.load_balancer, -1); 3512 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3291 resched_cpu(new_ilb); 3513 resched_cpu(new_ilb);
3292 return 0; 3514 return;
3293 } 3515 }
3294 return 1; 3516 return;
3295 } 3517 }
3296 } else { 3518 } else {
3297 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 3519 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3298 return 0; 3520 return;
3299 3521
3300 cpumask_clear_cpu(cpu, nohz.cpu_mask); 3522 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3301 3523
3302 if (atomic_read(&nohz.load_balancer) == cpu) 3524 if (atomic_read(&nohz.load_balancer) == cpu)
3303 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3525 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3526 nr_cpu_ids) != cpu)
3304 BUG(); 3527 BUG();
3305 } 3528 }
3306 return 0; 3529 return;
3307} 3530}
3308#endif 3531#endif
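
select_nohz_load_balancer() now uses nr_cpu_ids rather than -1 as the "no owner" value and hands the role off with atomic_cmpxchg(). Below is a stand-alone sketch of that handoff using C11 atomics; NR_CPU_IDS is an arbitrary stand-in for the kernel's nr_cpu_ids and the functions are illustrative, not the kernel's.

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPU_IDS 8			/* sentinel meaning "no ilb owner" */

static atomic_int load_balancer = NR_CPU_IDS;

/* Called by a CPU that is stopping its tick: try to become the owner. */
static void nominate_ilb(int cpu)
{
	int none = NR_CPU_IDS;

	/* only one idle CPU wins; the rest just stay in the idle mask */
	atomic_compare_exchange_strong(&load_balancer, &none, cpu);
}

/* Called when the owner goes busy or offline: give the role up. */
static void resign_ilb(int cpu)
{
	int me = cpu;

	atomic_compare_exchange_strong(&load_balancer, &me, NR_CPU_IDS);
}

int main(void)
{
	nominate_ilb(2);
	nominate_ilb(5);	/* loses the race, CPU2 stays owner */
	printf("ilb owner: %d\n", atomic_load(&load_balancer));
	resign_ilb(2);
	printf("ilb owner: %d (none)\n", atomic_load(&load_balancer));
	return 0;
}
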
3309 3532
@@ -3385,11 +3608,102 @@ out:
3385 rq->next_balance = next_balance; 3608 rq->next_balance = next_balance;
3386} 3609}
3387 3610
3611#ifdef CONFIG_NO_HZ
3388/* 3612/*
3389 * run_rebalance_domains is triggered when needed from the scheduler tick. 3613 * In CONFIG_NO_HZ case, the idle balance kickee will do the
3390 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3391 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3614 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3392 */ 3615 */
3616static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3617{
3618 struct rq *this_rq = cpu_rq(this_cpu);
3619 struct rq *rq;
3620 int balance_cpu;
3621
3622 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3623 return;
3624
3625 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3626 if (balance_cpu == this_cpu)
3627 continue;
3628
3629 /*
3630 * If this cpu gets work to do, stop the load balancing
3631 * work being done for other cpus. Next load
3632 * balancing owner will pick it up.
3633 */
3634 if (need_resched()) {
3635 this_rq->nohz_balance_kick = 0;
3636 break;
3637 }
3638
3639 raw_spin_lock_irq(&this_rq->lock);
3640 update_rq_clock(this_rq);
3641 update_cpu_load(this_rq);
3642 raw_spin_unlock_irq(&this_rq->lock);
3643
3644 rebalance_domains(balance_cpu, CPU_IDLE);
3645
3646 rq = cpu_rq(balance_cpu);
3647 if (time_after(this_rq->next_balance, rq->next_balance))
3648 this_rq->next_balance = rq->next_balance;
3649 }
3650 nohz.next_balance = this_rq->next_balance;
3651 this_rq->nohz_balance_kick = 0;
3652}
3653
3654/*
3655 * Current heuristic for kicking the idle load balancer
 3656 * - first_pick_cpu is one of the busy CPUs. It will kick the
 3657 * idle load balancer when it has more than one process active. This
3658 * eliminates the need for idle load balancing altogether when we have
3659 * only one running process in the system (common case).
 3660 * - If there is more than one busy CPU, the idle load balancer may have
3661 * to run for active_load_balance to happen (i.e., two busy CPUs are
3662 * SMT or core siblings and can run better if they move to different
3663 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
 3664 * which will kick the idle load balancer as soon as it has any load.
3665 */
3666static inline int nohz_kick_needed(struct rq *rq, int cpu)
3667{
3668 unsigned long now = jiffies;
3669 int ret;
3670 int first_pick_cpu, second_pick_cpu;
3671
3672 if (time_before(now, nohz.next_balance))
3673 return 0;
3674
3675 if (rq->idle_at_tick)
3676 return 0;
3677
3678 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3679 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3680
3681 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3682 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3683 return 0;
3684
3685 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3686 if (ret == nr_cpu_ids || ret == cpu) {
3687 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3688 if (rq->nr_running > 1)
3689 return 1;
3690 } else {
3691 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3692 if (ret == nr_cpu_ids || ret == cpu) {
3693 if (rq->nr_running)
3694 return 1;
3695 }
3696 }
3697 return 0;
3698}
3699#else
3700static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3701#endif
3702
3703/*
3704 * run_rebalance_domains is triggered when needed from the scheduler tick.
 3705 * Also triggered for nohz idle balancing (with nohz_balance_kick set).
3706 */
3393static void run_rebalance_domains(struct softirq_action *h) 3707static void run_rebalance_domains(struct softirq_action *h)
3394{ 3708{
3395 int this_cpu = smp_processor_id(); 3709 int this_cpu = smp_processor_id();
@@ -3399,37 +3713,12 @@ static void run_rebalance_domains(struct softirq_action *h)
3399 3713
3400 rebalance_domains(this_cpu, idle); 3714 rebalance_domains(this_cpu, idle);
3401 3715
3402#ifdef CONFIG_NO_HZ
3403 /* 3716 /*
3404 * If this cpu is the owner for idle load balancing, then do the 3717 * If this cpu has a pending nohz_balance_kick, then do the
3405 * balancing on behalf of the other idle cpus whose ticks are 3718 * balancing on behalf of the other idle cpus whose ticks are
3406 * stopped. 3719 * stopped.
3407 */ 3720 */
3408 if (this_rq->idle_at_tick && 3721 nohz_idle_balance(this_cpu, idle);
3409 atomic_read(&nohz.load_balancer) == this_cpu) {
3410 struct rq *rq;
3411 int balance_cpu;
3412
3413 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3414 if (balance_cpu == this_cpu)
3415 continue;
3416
3417 /*
3418 * If this cpu gets work to do, stop the load balancing
3419 * work being done for other cpus. Next load
3420 * balancing owner will pick it up.
3421 */
3422 if (need_resched())
3423 break;
3424
3425 rebalance_domains(balance_cpu, CPU_IDLE);
3426
3427 rq = cpu_rq(balance_cpu);
3428 if (time_after(this_rq->next_balance, rq->next_balance))
3429 this_rq->next_balance = rq->next_balance;
3430 }
3431 }
3432#endif
3433} 3722}
3434 3723
3435static inline int on_null_domain(int cpu) 3724static inline int on_null_domain(int cpu)
@@ -3439,57 +3728,17 @@ static inline int on_null_domain(int cpu)
3439 3728
3440/* 3729/*
3441 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3730 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3442 *
3443 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3444 * idle load balancing owner or decide to stop the periodic load balancing,
3445 * if the whole system is idle.
3446 */ 3731 */
3447static inline void trigger_load_balance(struct rq *rq, int cpu) 3732static inline void trigger_load_balance(struct rq *rq, int cpu)
3448{ 3733{
3449#ifdef CONFIG_NO_HZ
3450 /*
3451 * If we were in the nohz mode recently and busy at the current
3452 * scheduler tick, then check if we need to nominate new idle
3453 * load balancer.
3454 */
3455 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3456 rq->in_nohz_recently = 0;
3457
3458 if (atomic_read(&nohz.load_balancer) == cpu) {
3459 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3460 atomic_set(&nohz.load_balancer, -1);
3461 }
3462
3463 if (atomic_read(&nohz.load_balancer) == -1) {
3464 int ilb = find_new_ilb(cpu);
3465
3466 if (ilb < nr_cpu_ids)
3467 resched_cpu(ilb);
3468 }
3469 }
3470
3471 /*
3472 * If this cpu is idle and doing idle load balancing for all the
3473 * cpus with ticks stopped, is it time for that to stop?
3474 */
3475 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3476 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3477 resched_cpu(cpu);
3478 return;
3479 }
3480
3481 /*
3482 * If this cpu is idle and the idle load balancing is done by
3483 * someone else, then no need raise the SCHED_SOFTIRQ
3484 */
3485 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3486 cpumask_test_cpu(cpu, nohz.cpu_mask))
3487 return;
3488#endif
3489 /* Don't need to rebalance while attached to NULL domain */ 3734 /* Don't need to rebalance while attached to NULL domain */
3490 if (time_after_eq(jiffies, rq->next_balance) && 3735 if (time_after_eq(jiffies, rq->next_balance) &&
3491 likely(!on_null_domain(cpu))) 3736 likely(!on_null_domain(cpu)))
3492 raise_softirq(SCHED_SOFTIRQ); 3737 raise_softirq(SCHED_SOFTIRQ);
3738#ifdef CONFIG_NO_HZ
3739 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
3740 nohz_balancer_kick(cpu);
3741#endif
3493} 3742}
3494 3743
3495static void rq_online_fair(struct rq *rq) 3744static void rq_online_fair(struct rq *rq)
@@ -3542,8 +3791,13 @@ static void task_fork_fair(struct task_struct *p)
3542 3791
3543 raw_spin_lock_irqsave(&rq->lock, flags); 3792 raw_spin_lock_irqsave(&rq->lock, flags);
3544 3793
3545 if (unlikely(task_cpu(p) != this_cpu)) 3794 update_rq_clock(rq);
3795
3796 if (unlikely(task_cpu(p) != this_cpu)) {
3797 rcu_read_lock();
3546 __set_task_cpu(p, this_cpu); 3798 __set_task_cpu(p, this_cpu);
3799 rcu_read_unlock();
3800 }
3547 3801
3548 update_curr(cfs_rq); 3802 update_curr(cfs_rq);
3549 3803
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..185f920ec1a2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
61 * release the lock. Decreases scheduling overhead. 61 * release the lock. Decreases scheduling overhead.
62 */ 62 */
63SCHED_FEAT(OWNER_SPIN, 1) 63SCHED_FEAT(OWNER_SPIN, 1)
64
65/*
66 * Decrement CPU power based on irq activity
67 */
68SCHED_FEAT(NONIRQ_POWER, 1)
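
NONIRQ_POWER appears alongside the rq->clock_task changes in kernel/sched_rt.c below: time a CPU spends in interrupt handlers is discounted from the capacity it advertises to the load balancer. The calculation below is only a rough illustration of that idea, not the kernel's actual scaling code; SCHED_LOAD_SCALE of 1024 is assumed.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL		/* assumed nominal capacity */

static unsigned long scale_power_by_irq(unsigned long power,
					unsigned long long wall_ns,
					unsigned long long irq_ns)
{
	if (!wall_ns || irq_ns >= wall_ns)
		return 1;	/* never report zero capacity */

	return power * (wall_ns - irq_ns) / wall_ns;
}

int main(void)
{
	/* 10ms window with 2ms of irq time: ~80% of nominal power left */
	printf("power: %lu\n",
	       scale_power_by_irq(SCHED_LOAD_SCALE, 10000000ULL, 2000000ULL));
	return 0;
}
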
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8afb953e31c6..bea7d79f7e9c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
609 if (!task_has_rt_policy(curr)) 609 if (!task_has_rt_policy(curr))
610 return; 610 return;
611 611
612 delta_exec = rq->clock - curr->se.exec_start; 612 delta_exec = rq->clock_task - curr->se.exec_start;
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
620 620
621 curr->se.exec_start = rq->clock; 621 curr->se.exec_start = rq->clock_task;
622 cpuacct_charge(curr, delta_exec); 622 cpuacct_charge(curr, delta_exec);
623 623
624 sched_rt_avg_update(rq, delta_exec); 624 sched_rt_avg_update(rq, delta_exec);
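
The update_curr_rt() change above charges runtime against rq->clock_task rather than rq->clock, i.e. against a clock that does not advance while the CPU handles interrupts. A tiny model of that accounting, with made-up numbers and simplified stand-in structs:

#include <stdio.h>

struct rq_clk {
	unsigned long long clock;	/* wall clock, ns */
	unsigned long long clock_task;	/* wall clock minus irq time, ns */
};

struct se_clk {
	unsigned long long exec_start;
	unsigned long long sum_exec_runtime;
};

static void update_curr(struct rq_clk *rq, struct se_clk *se)
{
	/* charging against clock_task keeps irq time off the task's bill */
	unsigned long long delta = rq->clock_task - se->exec_start;

	se->sum_exec_runtime += delta;
	se->exec_start = rq->clock_task;
}

int main(void)
{
	struct rq_clk rq = { .clock = 3000000, .clock_task = 2600000 };
	struct se_clk se = { .exec_start = 2000000 };

	update_curr(&rq, &se);	/* 600us charged to the task */
	printf("runtime: %lluns\n", se.sum_exec_runtime);
	return 0;
}
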
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
960 * runqueue. Otherwise simply start this RT task 960 * runqueue. Otherwise simply start this RT task
961 * on its current runqueue. 961 * on its current runqueue.
962 * 962 *
963 * We want to avoid overloading runqueues. Even if 963 * We want to avoid overloading runqueues. If the woken
 964 * the RT task is of higher priority than the current RT task. 964 * task is of higher priority, then it will stay on this CPU
965 * RT tasks behave differently than other tasks. If 965 * and the lower prio task should be moved to another CPU.
966 * one gets preempted, we try to push it off to another queue. 966 * Even though this will probably make the lower prio task
 967 * So trying to keep a preempting RT task on the same 967 * lose its cache, we do not want to bounce a higher priority task
968 * cache hot CPU will force the running RT task to 968 * around just because it gave up its CPU, perhaps for a
969 * a cold CPU. So we waste all the cache for the lower 969 * lock?
970 * RT task in hopes of saving some of a RT task 970 *
971 * that is just being woken and probably will have 971 * For equal prio tasks, we just let the scheduler sort it out.
972 * cold cache anyway.
973 */ 972 */
974 if (unlikely(rt_task(rq->curr)) && 973 if (unlikely(rt_task(rq->curr)) &&
974 (rq->curr->rt.nr_cpus_allowed < 2 ||
975 rq->curr->prio < p->prio) &&
975 (p->rt.nr_cpus_allowed > 1)) { 976 (p->rt.nr_cpus_allowed > 1)) {
976 int cpu = find_lowest_rq(p); 977 int cpu = find_lowest_rq(p);
977 978
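
The new wakeup test above only pushes the woken task away when the currently running RT task either cannot migrate (affinity covers fewer than 2 CPUs) or outranks it. A stand-alone sketch of that predicate, with simplified types and the kernel convention that a lower ->prio value means higher priority:

#include <stdbool.h>
#include <stdio.h>

struct rt_task {
	int prio;		/* lower value = more important */
	int nr_cpus_allowed;
	bool is_rt;
};

/* Should the woken task p be placed on another CPU instead of curr's? */
static bool push_woken_task(const struct rt_task *curr,
			    const struct rt_task *p)
{
	return curr->is_rt &&
	       (curr->nr_cpus_allowed < 2 || curr->prio < p->prio) &&
	       p->nr_cpus_allowed > 1;
}

int main(void)
{
	struct rt_task curr = { .prio = 10, .nr_cpus_allowed = 4, .is_rt = true };
	struct rt_task woken = { .prio = 20, .nr_cpus_allowed = 4 };

	/* curr is higher priority, so the woken task goes elsewhere */
	printf("push: %d\n", push_woken_task(&curr, &woken));
	return 0;
}
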
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1074 } while (rt_rq); 1075 } while (rt_rq);
1075 1076
1076 p = rt_task_of(rt_se); 1077 p = rt_task_of(rt_se);
1077 p->se.exec_start = rq->clock; 1078 p->se.exec_start = rq->clock_task;
1078 1079
1079 return p; 1080 return p;
1080} 1081}
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1139 for_each_leaf_rt_rq(rt_rq, rq) { 1140 for_each_leaf_rt_rq(rt_rq, rq) {
1140 array = &rt_rq->active; 1141 array = &rt_rq->active;
1141 idx = sched_find_first_bit(array->bitmap); 1142 idx = sched_find_first_bit(array->bitmap);
1142 next_idx: 1143next_idx:
1143 if (idx >= MAX_RT_PRIO) 1144 if (idx >= MAX_RT_PRIO)
1144 continue; 1145 continue;
1145 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
1315 if (!next_task) 1316 if (!next_task)
1316 return 0; 1317 return 0;
1317 1318
1318 retry: 1319retry:
1319 if (unlikely(next_task == rq->curr)) { 1320 if (unlikely(next_task == rq->curr)) {
1320 WARN_ON(1); 1321 WARN_ON(1);
1321 return 0; 1322 return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
1463 * but possible) 1464 * but possible)
1464 */ 1465 */
1465 } 1466 }
1466 skip: 1467skip:
1467 double_unlock_balance(this_rq, src_rq); 1468 double_unlock_balance(this_rq, src_rq);
1468 } 1469 }
1469 1470
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1491 if (!task_running(rq, p) && 1492 if (!task_running(rq, p) &&
1492 !test_tsk_need_resched(rq->curr) && 1493 !test_tsk_need_resched(rq->curr) &&
1493 has_pushable_tasks(rq) && 1494 has_pushable_tasks(rq) &&
1494 p->rt.nr_cpus_allowed > 1) 1495 p->rt.nr_cpus_allowed > 1 &&
1496 rt_task(rq->curr) &&
1497 (rq->curr->rt.nr_cpus_allowed < 2 ||
1498 rq->curr->prio < p->prio))
1495 push_rt_tasks(rq); 1499 push_rt_tasks(rq);
1496} 1500}
1497 1501
@@ -1663,9 +1667,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1663{ 1667{
1664 unsigned long soft, hard; 1668 unsigned long soft, hard;
1665 1669
1666 if (!p->signal)
1667 return;
1668
1669 /* max may change after cur was read, this will be fixed next tick */ 1670 /* max may change after cur was read, this will be fixed next tick */
1670 soft = task_rlimit(p, RLIMIT_RTTIME); 1671 soft = task_rlimit(p, RLIMIT_RTTIME);
1671 hard = task_rlimit_max(p, RLIMIT_RTTIME); 1672 hard = task_rlimit_max(p, RLIMIT_RTTIME);
@@ -1712,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1712{ 1713{
1713 struct task_struct *p = rq->curr; 1714 struct task_struct *p = rq->curr;
1714 1715
1715 p->se.exec_start = rq->clock; 1716 p->se.exec_start = rq->clock_task;
1716 1717
1717 /* The running task is never eligible for pushing */ 1718 /* The running task is never eligible for pushing */
1718 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 32d2bd4061b0..25c2f962f6fc 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
295static inline void account_group_user_time(struct task_struct *tsk, 295static inline void account_group_user_time(struct task_struct *tsk,
296 cputime_t cputime) 296 cputime_t cputime)
297{ 297{
298 struct thread_group_cputimer *cputimer; 298 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
299
300 /* tsk == current, ensure it is safe to use ->signal */
301 if (unlikely(tsk->exit_state))
302 return;
303
304 cputimer = &tsk->signal->cputimer;
305 299
306 if (!cputimer->running) 300 if (!cputimer->running)
307 return; 301 return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
325static inline void account_group_system_time(struct task_struct *tsk, 319static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime) 320 cputime_t cputime)
327{ 321{
328 struct thread_group_cputimer *cputimer; 322 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
329
330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state))
332 return;
333
334 cputimer = &tsk->signal->cputimer;
335 323
336 if (!cputimer->running) 324 if (!cputimer->running)
337 return; 325 return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
355static inline void account_group_exec_runtime(struct task_struct *tsk, 343static inline void account_group_exec_runtime(struct task_struct *tsk,
356 unsigned long long ns) 344 unsigned long long ns)
357{ 345{
358 struct thread_group_cputimer *cputimer; 346 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
359 struct signal_struct *sig;
360
361 sig = tsk->signal;
362 /* see __exit_signal()->task_rq_unlock_wait() */
363 barrier();
364 if (unlikely(!sig))
365 return;
366
367 cputimer = &sig->cputimer;
368 347
369 if (!cputimer->running) 348 if (!cputimer->running)
370 return; 349 return;
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 000000000000..45bddc0c1048
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
1/*
2 * stop-task scheduling class.
3 *
 4 * The stop task is the highest priority task in the system; it preempts
5 * everything and will be preempted by nothing.
6 *
7 * See kernel/stop_machine.c
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p,
13 int sd_flag, int flags)
14{
 15 return task_cpu(p); /* stop tasks never migrate */
16}
17#endif /* CONFIG_SMP */
18
19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{
22 resched_task(rq->curr); /* we preempt everything */
23}
24
25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{
27 struct task_struct *stop = rq->stop;
28
29 if (stop && stop->state == TASK_RUNNING)
30 return stop;
31
32 return NULL;
33}
34
35static void
36enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
37{
38}
39
40static void
41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
42{
43}
44
45static void yield_task_stop(struct rq *rq)
46{
 47 BUG(); /* the stop task should never yield, it's pointless. */
48}
49
50static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
51{
52}
53
54static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_stop(struct rq *rq)
59{
60}
61
62static void switched_to_stop(struct rq *rq, struct task_struct *p,
63 int running)
64{
 65 BUG(); /* it's impossible to change to this class */
66}
67
68static void prio_changed_stop(struct rq *rq, struct task_struct *p,
69 int oldprio, int running)
70{
 71 BUG(); /* how!? what priority? */
72}
73
74static unsigned int
75get_rr_interval_stop(struct rq *rq, struct task_struct *task)
76{
77 return 0;
78}
79
80/*
81 * Simple, special scheduling class for the per-CPU stop tasks:
82 */
83static const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class,
85
86 .enqueue_task = enqueue_task_stop,
87 .dequeue_task = dequeue_task_stop,
88 .yield_task = yield_task_stop,
89
90 .check_preempt_curr = check_preempt_curr_stop,
91
92 .pick_next_task = pick_next_task_stop,
93 .put_prev_task = put_prev_task_stop,
94
95#ifdef CONFIG_SMP
96 .select_task_rq = select_task_rq_stop,
97#endif
98
99 .set_curr_task = set_curr_task_stop,
100 .task_tick = task_tick_stop,
101
102 .get_rr_interval = get_rr_interval_stop,
103
104 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108};
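
The stop class above slots in ahead of rt_sched_class via its .next pointer, so the per-CPU stop task wins every pick. Below is a user-space model of that highest-first class walk; the structs, task names, and helpers are stand-ins for illustration, not the kernel's definitions.

#include <stdio.h>
#include <stddef.h>

struct task { const char *name; };

struct sched_class {
	const struct sched_class *next;
	struct task *(*pick_next_task)(void);
};

static struct task stop_task = { "migration/0" };
static struct task rt_task   = { "rtprio50" };

/* a real class returns NULL when it has nothing runnable */
static struct task *pick_stop(void) { return &stop_task; }
static struct task *pick_rt(void)   { return &rt_task; }

static const struct sched_class rt_class   = { NULL, pick_rt };
static const struct sched_class stop_class = { &rt_class, pick_stop };

static struct task *pick_next_task(const struct sched_class *top)
{
	const struct sched_class *class;

	for (class = top; class; class = class->next) {
		struct task *p = class->pick_next_task();
		if (p)
			return p;
	}
	return NULL;
}

int main(void)
{
	printf("next: %s\n", pick_next_task(&stop_class)->name);
	return 0;
}
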
diff --git a/kernel/signal.c b/kernel/signal.c
index 906ae5a1779c..4e3cff10fdce 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info)
637 637
638/* 638/*
639 * Bad permissions for sending the signal 639 * Bad permissions for sending the signal
640 * - the caller must hold at least the RCU read lock 640 * - the caller must hold the RCU read lock
641 */ 641 */
642static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 643 struct task_struct *t)
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p)
1105 return count; 1105 return count;
1106} 1106}
1107 1107
1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1109 unsigned long *flags)
1109{ 1110{
1110 struct sighand_struct *sighand; 1111 struct sighand_struct *sighand;
1111 1112
@@ -1127,11 +1128,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1127 1128
1128/* 1129/*
1129 * send signal info to all the members of a group 1130 * send signal info to all the members of a group
1130 * - the caller must hold the RCU read lock at least
1131 */ 1131 */
1132int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1132int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1133{ 1133{
1134 int ret = check_kill_permission(sig, info, p); 1134 int ret;
1135
1136 rcu_read_lock();
1137 ret = check_kill_permission(sig, info, p);
1138 rcu_read_unlock();
1135 1139
1136 if (!ret && sig) 1140 if (!ret && sig)
1137 ret = do_send_sig_info(sig, info, p, true); 1141 ret = do_send_sig_info(sig, info, p, true);
@@ -1614,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk)
1614 * is gone, we keep current->exit_code unless clear_code. 1618 * is gone, we keep current->exit_code unless clear_code.
1615 */ 1619 */
1616static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1620static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1621 __releases(&current->sighand->siglock)
1622 __acquires(&current->sighand->siglock)
1617{ 1623{
1618 if (arch_ptrace_stop_needed(exit_code, info)) { 1624 if (arch_ptrace_stop_needed(exit_code, info)) {
1619 /* 1625 /*
@@ -2212,6 +2218,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2212#ifdef __ARCH_SI_TRAPNO 2218#ifdef __ARCH_SI_TRAPNO
2213 err |= __put_user(from->si_trapno, &to->si_trapno); 2219 err |= __put_user(from->si_trapno, &to->si_trapno);
2214#endif 2220#endif
2221#ifdef BUS_MCEERR_AO
2222 /*
2223 * Other callers might not initialize the si_lsb field,
 2224 * so check explicitly for the right codes here.
2225 */
2226 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2227 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2228#endif
2215 break; 2229 break;
2216 case __SI_CHLD: 2230 case __SI_CHLD:
2217 err |= __put_user(from->si_pid, &to->si_pid); 2231 err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
deleted file mode 100644
index e45c43645298..000000000000
--- a/kernel/slow-work-debugfs.c
+++ /dev/null
@@ -1,227 +0,0 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
deleted file mode 100644
index 7d3f4fa9ef4f..000000000000
--- a/kernel/slow-work.c
+++ /dev/null
@@ -1,1068 +0,0 @@
1/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 *
11 * See Documentation/slow-work.txt
12 */
13
14#include <linux/module.h>
15#include <linux/slow-work.h>
16#include <linux/kthread.h>
17#include <linux/freezer.h>
18#include <linux/wait.h>
19#include <linux/debugfs.h>
20#include "slow-work.h"
21
22static void slow_work_cull_timeout(unsigned long);
23static void slow_work_oom_timeout(unsigned long);
24
25#ifdef CONFIG_SYSCTL
26static int slow_work_min_threads_sysctl(struct ctl_table *, int,
27 void __user *, size_t *, loff_t *);
28
29static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
30 void __user *, size_t *, loff_t *);
31#endif
32
33/*
34 * The pool of threads has at least min threads in it as long as someone is
35 * using the facility, and may have as many as max.
36 *
37 * A portion of the pool may be processing very slow operations.
38 */
39static unsigned slow_work_min_threads = 2;
40static unsigned slow_work_max_threads = 4;
41static unsigned vslow_work_proportion = 50; /* % of threads that may process
42 * very slow work */
43
44#ifdef CONFIG_SYSCTL
45static const int slow_work_min_min_threads = 2;
46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
47static const int slow_work_min_vslow = 1;
48static const int slow_work_max_vslow = 99;
49
50ctl_table slow_work_sysctls[] = {
51 {
52 .procname = "min-threads",
53 .data = &slow_work_min_threads,
54 .maxlen = sizeof(unsigned),
55 .mode = 0644,
56 .proc_handler = slow_work_min_threads_sysctl,
57 .extra1 = (void *) &slow_work_min_min_threads,
58 .extra2 = &slow_work_max_threads,
59 },
60 {
61 .procname = "max-threads",
62 .data = &slow_work_max_threads,
63 .maxlen = sizeof(unsigned),
64 .mode = 0644,
65 .proc_handler = slow_work_max_threads_sysctl,
66 .extra1 = &slow_work_min_threads,
67 .extra2 = (void *) &slow_work_max_max_threads,
68 },
69 {
70 .procname = "vslow-percentage",
71 .data = &vslow_work_proportion,
72 .maxlen = sizeof(unsigned),
73 .mode = 0644,
74 .proc_handler = proc_dointvec_minmax,
75 .extra1 = (void *) &slow_work_min_vslow,
76 .extra2 = (void *) &slow_work_max_vslow,
77 },
78 {}
79};
80#endif
81
82/*
83 * The active state of the thread pool
84 */
85static atomic_t slow_work_thread_count;
86static atomic_t vslow_work_executing_count;
87
88static bool slow_work_may_not_start_new_thread;
89static bool slow_work_cull; /* cull a thread due to lack of activity */
90static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
91static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
92static struct slow_work slow_work_new_thread; /* new thread starter */
93
94/*
95 * slow work ID allocation (use slow_work_queue_lock)
96 */
97static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
98
99/*
100 * Unregistration tracking to prevent put_ref() from disappearing during module
101 * unload
102 */
103#ifdef CONFIG_MODULES
104static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
105static struct module *slow_work_unreg_module;
106static struct slow_work *slow_work_unreg_work_item;
107static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
108static DEFINE_MUTEX(slow_work_unreg_sync_lock);
109
110static void slow_work_set_thread_processing(int id, struct slow_work *work)
111{
112 if (work)
113 slow_work_thread_processing[id] = work->owner;
114}
115static void slow_work_done_thread_processing(int id, struct slow_work *work)
116{
117 struct module *module = slow_work_thread_processing[id];
118
119 slow_work_thread_processing[id] = NULL;
120 smp_mb();
121 if (slow_work_unreg_work_item == work ||
122 slow_work_unreg_module == module)
123 wake_up_all(&slow_work_unreg_wq);
124}
125static void slow_work_clear_thread_processing(int id)
126{
127 slow_work_thread_processing[id] = NULL;
128}
129#else
130static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
131static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
132static void slow_work_clear_thread_processing(int id) {}
133#endif
134
135/*
136 * Data for tracking currently executing items for indication through /proc
137 */
138#ifdef CONFIG_SLOW_WORK_DEBUG
139struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
140pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
141DEFINE_RWLOCK(slow_work_execs_lock);
142#endif
143
144/*
145 * The queues of work items and the lock governing access to them. These are
146 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
147 * as the number of threads bears no relation to the number of CPUs.
148 *
149 * There are two queues of work items: one for slow work items, and one for
150 * very slow work items.
151 */
152LIST_HEAD(slow_work_queue);
153LIST_HEAD(vslow_work_queue);
154DEFINE_SPINLOCK(slow_work_queue_lock);
155
156/*
157 * The following are two wait queues that get pinged when a work item is placed
158 * on an empty queue. These allow work items that are hogging a thread by
159 * sleeping in a way that could be deferred to yield their thread and enqueue
160 * themselves.
161 */
162static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
163static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
164
165/*
166 * The thread controls. A variable used to signal to the threads that they
167 * should exit when the queue is empty, a waitqueue used by the threads to wait
168 * for signals, and a completion set by the last thread to exit.
169 */
170static bool slow_work_threads_should_exit;
171static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
172static DECLARE_COMPLETION(slow_work_last_thread_exited);
173
174/*
175 * The number of users of the thread pool and its lock. Whilst this is zero we
176 * have no threads hanging around, and when this reaches zero, we wait for all
177 * active or queued work items to complete and kill all the threads we do have.
178 */
179static int slow_work_user_count;
180static DEFINE_MUTEX(slow_work_user_lock);
181
182static inline int slow_work_get_ref(struct slow_work *work)
183{
184 if (work->ops->get_ref)
185 return work->ops->get_ref(work);
186
187 return 0;
188}
189
190static inline void slow_work_put_ref(struct slow_work *work)
191{
192 if (work->ops->put_ref)
193 work->ops->put_ref(work);
194}
195
196/*
197 * Calculate the maximum number of active threads in the pool that are
198 * permitted to process very slow work items.
199 *
200 * The answer is rounded up to at least 1, but may not equal or exceed the
201 * maximum number of the threads in the pool. This means we always have at
202 * least one thread that can process slow work items, and we always have at
203 * least one thread that won't get tied up doing so.
204 */
205static unsigned slow_work_calc_vsmax(void)
206{
207 unsigned vsmax;
208
209 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
210 vsmax /= 100;
211 vsmax = max(vsmax, 1U);
212 return min(vsmax, slow_work_max_threads - 1);
213}
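
The vsmax calculation in the (now removed) slow-work pool clamps the very-slow share between 1 and max_threads - 1, so at least one thread always stays free of very slow items. The same arithmetic in isolation, with example values:

#include <stdio.h>

static unsigned calc_vsmax(unsigned threads, unsigned pct, unsigned max_threads)
{
	unsigned vsmax = threads * pct / 100;

	if (vsmax < 1)
		vsmax = 1;
	if (vsmax > max_threads - 1)
		vsmax = max_threads - 1;
	return vsmax;
}

int main(void)
{
	printf("vsmax: %u\n", calc_vsmax(4, 50, 4));	/* prints 2 */
	return 0;
}
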
214
215/*
216 * Attempt to execute stuff queued on a slow thread. Return true if we managed
217 * it, false if there was nothing to do.
218 */
219static noinline bool slow_work_execute(int id)
220{
221 struct slow_work *work = NULL;
222 unsigned vsmax;
223 bool very_slow;
224
225 vsmax = slow_work_calc_vsmax();
226
227 /* see if we can schedule a new thread to be started if we're not
228 * keeping up with the work */
229 if (!waitqueue_active(&slow_work_thread_wq) &&
230 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
231 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
232 !slow_work_may_not_start_new_thread)
233 slow_work_enqueue(&slow_work_new_thread);
234
235 /* find something to execute */
236 spin_lock_irq(&slow_work_queue_lock);
237 if (!list_empty(&vslow_work_queue) &&
238 atomic_read(&vslow_work_executing_count) < vsmax) {
239 work = list_entry(vslow_work_queue.next,
240 struct slow_work, link);
241 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
242 BUG();
243 list_del_init(&work->link);
244 atomic_inc(&vslow_work_executing_count);
245 very_slow = true;
246 } else if (!list_empty(&slow_work_queue)) {
247 work = list_entry(slow_work_queue.next,
248 struct slow_work, link);
249 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
250 BUG();
251 list_del_init(&work->link);
252 very_slow = false;
253 } else {
254 very_slow = false; /* avoid the compiler warning */
255 }
256
257 slow_work_set_thread_processing(id, work);
258 if (work) {
259 slow_work_mark_time(work);
260 slow_work_begin_exec(id, work);
261 }
262
263 spin_unlock_irq(&slow_work_queue_lock);
264
265 if (!work)
266 return false;
267
268 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
269 BUG();
270
271 /* don't execute if the work is in the process of being cancelled */
272 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
273 work->ops->execute(work);
274
275 if (very_slow)
276 atomic_dec(&vslow_work_executing_count);
277 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
278
279 /* wake up anyone waiting for this work to be complete */
280 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
281
282 slow_work_end_exec(id, work);
283
284 /* if someone tried to enqueue the item whilst we were executing it,
285 * then it'll be left unenqueued to avoid multiple threads trying to
286 * execute it simultaneously
287 *
288 * there is, however, a race between us testing the pending flag and
289 * getting the spinlock, and between the enqueuer setting the pending
290 * flag and getting the spinlock, so we use a deferral bit to tell us
291 * if the enqueuer got there first
292 */
293 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
294 spin_lock_irq(&slow_work_queue_lock);
295
296 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
297 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
298 goto auto_requeue;
299
300 spin_unlock_irq(&slow_work_queue_lock);
301 }
302
303 /* sort out the race between module unloading and put_ref() */
304 slow_work_put_ref(work);
305 slow_work_done_thread_processing(id, work);
306
307 return true;
308
309auto_requeue:
310 /* we must complete the enqueue operation
311 * - we transfer our ref on the item back to the appropriate queue
312 * - don't wake another thread up as we're awake already
313 */
314 slow_work_mark_time(work);
315 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
316 list_add_tail(&work->link, &vslow_work_queue);
317 else
318 list_add_tail(&work->link, &slow_work_queue);
319 spin_unlock_irq(&slow_work_queue_lock);
320 slow_work_clear_thread_processing(id);
321 return true;
322}
323
324/**
325 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
326 * @work: The work item under execution that wants to sleep
327 * @_timeout: Scheduler sleep timeout
328 *
329 * Allow a requeueable work item to sleep on a slow-work processor thread until
330 * that thread is needed to do some other work or the sleep is interrupted by
331 * some other event.
332 *
333 * The caller must set up a wake up event before calling this and must have set
334 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
335 * condition before calling this function as no test is made here.
336 *
337 * False is returned if there is nothing on the queue; true is returned if the
338 * work item should be requeued
339 */
340bool slow_work_sleep_till_thread_needed(struct slow_work *work,
341 signed long *_timeout)
342{
343 wait_queue_head_t *wfo_wq;
344 struct list_head *queue;
345
346 DEFINE_WAIT(wait);
347
348 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
349 wfo_wq = &vslow_work_queue_waits_for_occupation;
350 queue = &vslow_work_queue;
351 } else {
352 wfo_wq = &slow_work_queue_waits_for_occupation;
353 queue = &slow_work_queue;
354 }
355
356 if (!list_empty(queue))
357 return true;
358
359 add_wait_queue_exclusive(wfo_wq, &wait);
360 if (list_empty(queue))
361 *_timeout = schedule_timeout(*_timeout);
362 finish_wait(wfo_wq, &wait);
363
364 return !list_empty(queue);
365}
366EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
367
368/**
369 * slow_work_enqueue - Schedule a slow work item for processing
370 * @work: The work item to queue
371 *
372 * Schedule a slow work item for processing. If the item is already undergoing
373 * execution, this guarantees not to re-enter the execution routine until the
374 * first execution finishes.
375 *
376 * The item is pinned by this function as it retains a reference to it, managed
377 * through the item operations. The item is unpinned once it has been
378 * executed.
379 *
380 * An item may hog the thread that is running it for a relatively large amount
381 * of time, sufficient, for example, to perform several lookup, mkdir, create
382 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
383 *
384 * Conversely, if a number of items are awaiting processing, it may take some
385 * time before any given item is given attention. The number of threads in the
386 * pool may be increased to deal with demand, but only up to a limit.
387 *
388 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
389 * the very slow queue, from which only a portion of the threads will be
390 * allowed to pick items to execute. This ensures that very slow items won't
391 * overly block ones that are just ordinarily slow.
392 *
393 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if an attempt is made
394 * to queue cancelled work)
395 */
396int slow_work_enqueue(struct slow_work *work)
397{
398 wait_queue_head_t *wfo_wq;
399 struct list_head *queue;
400 unsigned long flags;
401 int ret;
402
403 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
404 return -ECANCELED;
405
406 BUG_ON(slow_work_user_count <= 0);
407 BUG_ON(!work);
408 BUG_ON(!work->ops);
409
410 /* when honouring an enqueue request, we only promise that we will run
411 * the work function in the future; we do not promise to run it once
412 * per enqueue request
413 *
414 * we use the PENDING bit to merge together repeat requests without
415 * having to disable IRQs and take the spinlock, whilst still
416 * maintaining our promise
417 */
418 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
419 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
420 wfo_wq = &vslow_work_queue_waits_for_occupation;
421 queue = &vslow_work_queue;
422 } else {
423 wfo_wq = &slow_work_queue_waits_for_occupation;
424 queue = &slow_work_queue;
425 }
426
427 spin_lock_irqsave(&slow_work_queue_lock, flags);
428
429 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
430 goto cancelled;
431
432 /* we promise that we will not attempt to execute the work
433 * function in more than one thread simultaneously
434 *
435 * this, however, leaves us with a problem if we're asked to
436 * enqueue the work whilst someone is executing the work
437 * function as simply queueing the work immediately means that
438 * another thread may try executing it whilst it is already
439 * under execution
440 *
441 * to deal with this, we set the ENQ_DEFERRED bit instead of
442 * enqueueing, and the thread currently executing the work
443 * function will enqueue the work item when the work function
444 * returns and it has cleared the EXECUTING bit
445 */
446 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
447 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
448 } else {
449 ret = slow_work_get_ref(work);
450 if (ret < 0)
451 goto failed;
452 slow_work_mark_time(work);
453 list_add_tail(&work->link, queue);
454 wake_up(&slow_work_thread_wq);
455
456 /* if someone who could be requeued is sleeping on a
457 * thread, then ask them to yield their thread */
458 if (work->link.prev == queue)
459 wake_up(wfo_wq);
460 }
461
462 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
463 }
464 return 0;
465
466cancelled:
467 ret = -ECANCELED;
468failed:
469 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
470 return ret;
471}
472EXPORT_SYMBOL(slow_work_enqueue);
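For readers skimming the removal, a minimal caller sketch of the API documented above may help. It is not taken from this patch: the myfs_* names are invented, and the slow_work_ops layout (owner, get_ref, put_ref, execute) and slow_work_init() are assumed from the facility's own header and documentation.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slow-work.h>

/* hypothetical caller-side object with an embedded work item */
struct myfs_object {
	struct slow_work	work;
	/* ... caller state ... */
};

static int myfs_get_ref(struct slow_work *work)
{
	/* pin the containing object here; return 0 on success */
	return 0;
}

static void myfs_put_ref(struct slow_work *work)
{
	/* drop the reference taken in myfs_get_ref() */
}

static void myfs_execute(struct slow_work *work)
{
	struct myfs_object *obj = container_of(work, struct myfs_object, work);

	/* long-running, sleepable processing of obj goes here */
	pr_debug("slow work on %p\n", obj);
}

static const struct slow_work_ops myfs_slow_work_ops = {
	.owner		= THIS_MODULE,
	.get_ref	= myfs_get_ref,
	.put_ref	= myfs_put_ref,
	.execute	= myfs_execute,
};

static int myfs_kick(struct myfs_object *obj)
{
	slow_work_init(&obj->work, &myfs_slow_work_ops);

	/* repeat calls made while the item is still PENDING are merged,
	 * as the comment in slow_work_enqueue() explains */
	return slow_work_enqueue(&obj->work);
}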
473
474static int slow_work_wait(void *word)
475{
476 schedule();
477 return 0;
478}
479
480/**
481 * slow_work_cancel - Cancel a slow work item
482 * @work: The work item to cancel
483 *
484 * This function will cancel a previously enqueued work item. If we cannot
485 * cancel the work item, it is guaranteed to have run when this function
486 * returns.
487 */
488void slow_work_cancel(struct slow_work *work)
489{
490 bool wait = true, put = false;
491
492 set_bit(SLOW_WORK_CANCELLING, &work->flags);
493 smp_mb();
494
495 /* if the work item is a delayed work item with an active timer, we
496 * need to wait for the timer to finish _before_ getting the spinlock,
497 * lest we deadlock against the timer routine
498 *
499 * the timer routine will leave DELAYED set if it notices the
500 * CANCELLING flag in time
501 */
502 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
503 struct delayed_slow_work *dwork =
504 container_of(work, struct delayed_slow_work, work);
505 del_timer_sync(&dwork->timer);
506 }
507
508 spin_lock_irq(&slow_work_queue_lock);
509
510 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
511 /* the timer routine aborted or never happened, so we are left
512 * holding the timer's reference on the item and should just
513 * drop the pending flag and wait for any ongoing execution to
514 * finish */
515 struct delayed_slow_work *dwork =
516 container_of(work, struct delayed_slow_work, work);
517
518 BUG_ON(timer_pending(&dwork->timer));
519 BUG_ON(!list_empty(&work->link));
520
521 clear_bit(SLOW_WORK_DELAYED, &work->flags);
522 put = true;
523 clear_bit(SLOW_WORK_PENDING, &work->flags);
524
525 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
526 !list_empty(&work->link)) {
527 /* the link in the pending queue holds a reference on the item
528 * that we will need to release */
529 list_del_init(&work->link);
530 wait = false;
531 put = true;
532 clear_bit(SLOW_WORK_PENDING, &work->flags);
533
534 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
535 /* the executor is holding our only reference on the item, so
536 * we merely need to wait for it to finish executing */
537 clear_bit(SLOW_WORK_PENDING, &work->flags);
538 }
539
540 spin_unlock_irq(&slow_work_queue_lock);
541
542 /* the EXECUTING flag is set by the executor whilst the spinlock is set
543 * and before the item is dequeued - so assuming the above doesn't
544 * actually dequeue it, simply waiting for the EXECUTING flag to be
545 * released here should be sufficient */
546 if (wait)
547 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
548 TASK_UNINTERRUPTIBLE);
549
550 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
551 if (put)
552 slow_work_put_ref(work);
553}
554EXPORT_SYMBOL(slow_work_cancel);
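Continuing the hypothetical myfs sketch above, teardown is a single call that captures the contract in the comment block: the item is dequeued if still pending, otherwise the caller is held until the EXECUTING bit clears.

static void myfs_stop_work(struct myfs_object *obj)
{
	/* dequeue if still pending, otherwise wait for execution to end */
	slow_work_cancel(&obj->work);
}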
555
556/*
557 * Handle expiry of the delay timer, indicating that a delayed slow work item
558 * should now be queued if not cancelled
559 */
560static void delayed_slow_work_timer(unsigned long data)
561{
562 wait_queue_head_t *wfo_wq;
563 struct list_head *queue;
564 struct slow_work *work = (struct slow_work *) data;
565 unsigned long flags;
566 bool queued = false, put = false, first = false;
567
568 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
569 wfo_wq = &vslow_work_queue_waits_for_occupation;
570 queue = &vslow_work_queue;
571 } else {
572 wfo_wq = &slow_work_queue_waits_for_occupation;
573 queue = &slow_work_queue;
574 }
575
576 spin_lock_irqsave(&slow_work_queue_lock, flags);
577 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
578 clear_bit(SLOW_WORK_DELAYED, &work->flags);
579
580 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
581 /* we discard the reference the timer was holding in
582 * favour of the one the executor holds */
583 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
584 put = true;
585 } else {
586 slow_work_mark_time(work);
587 list_add_tail(&work->link, queue);
588 queued = true;
589 if (work->link.prev == queue)
590 first = true;
591 }
592 }
593
594 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
595 if (put)
596 slow_work_put_ref(work);
597 if (first)
598 wake_up(wfo_wq);
599 if (queued)
600 wake_up(&slow_work_thread_wq);
601}
602
603/**
604 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
605 * @dwork: The delayed work item to queue
606 * @delay: When to start executing the work, in jiffies from now
607 *
608 * This is similar to slow_work_enqueue(), but it adds a delay before the work
609 * is actually queued for processing.
610 *
611 * The item can have delayed processing requested on it whilst it is being
612 * executed. The delay will begin immediately, and if it expires before the
613 * item finishes executing, the item will be placed back on the queue when it
614 * has finished executing.
615 */
616int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
617 unsigned long delay)
618{
619 struct slow_work *work = &dwork->work;
620 unsigned long flags;
621 int ret;
622
623 if (delay == 0)
624 return slow_work_enqueue(&dwork->work);
625
626 BUG_ON(slow_work_user_count <= 0);
627 BUG_ON(!work);
628 BUG_ON(!work->ops);
629
630 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
631 return -ECANCELED;
632
633 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
634 spin_lock_irqsave(&slow_work_queue_lock, flags);
635
636 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
637 goto cancelled;
638
639 /* the timer holds a reference whilst it is pending */
640 ret = slow_work_get_ref(work);
641 if (ret < 0)
642 goto cant_get_ref;
643
644 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
645 BUG();
646 dwork->timer.expires = jiffies + delay;
647 dwork->timer.data = (unsigned long) work;
648 dwork->timer.function = delayed_slow_work_timer;
649 add_timer(&dwork->timer);
650
651 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
652 }
653
654 return 0;
655
656cancelled:
657 ret = -ECANCELED;
658cant_get_ref:
659 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
660 return ret;
661}
662EXPORT_SYMBOL(delayed_slow_work_enqueue);
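A similarly hypothetical sketch for the delayed variant. delayed_slow_work_init() and the myfs_gc_ops table are assumptions standing in for a real caller's one-time setup; HZ gives a delay of roughly one second.

/* assumed to have been set up once, e.g. in module init, with
 * delayed_slow_work_init(&myfs_gc_work, &myfs_gc_ops) */
static struct delayed_slow_work myfs_gc_work;

static void myfs_schedule_gc(void)
{
	/* fire the ops' execute callback about a second from now;
	 * a zero delay would fall straight through to slow_work_enqueue() */
	delayed_slow_work_enqueue(&myfs_gc_work, HZ);
}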
663
664/*
665 * Schedule a cull of the thread pool at some time in the near future
666 */
667static void slow_work_schedule_cull(void)
668{
669 mod_timer(&slow_work_cull_timer,
670 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
671}
672
673/*
674 * Worker thread culling algorithm
675 */
676static bool slow_work_cull_thread(void)
677{
678 unsigned long flags;
679 bool do_cull = false;
680
681 spin_lock_irqsave(&slow_work_queue_lock, flags);
682
683 if (slow_work_cull) {
684 slow_work_cull = false;
685
686 if (list_empty(&slow_work_queue) &&
687 list_empty(&vslow_work_queue) &&
688 atomic_read(&slow_work_thread_count) >
689 slow_work_min_threads) {
690 slow_work_schedule_cull();
691 do_cull = true;
692 }
693 }
694
695 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
696 return do_cull;
697}
698
699/*
700 * Determine if there is slow work available for dispatch
701 */
702static inline bool slow_work_available(int vsmax)
703{
704 return !list_empty(&slow_work_queue) ||
705 (!list_empty(&vslow_work_queue) &&
706 atomic_read(&vslow_work_executing_count) < vsmax);
707}
708
709/*
710 * Worker thread dispatcher
711 */
712static int slow_work_thread(void *_data)
713{
714 int vsmax, id;
715
716 DEFINE_WAIT(wait);
717
718 set_freezable();
719 set_user_nice(current, -5);
720
721 /* allocate ourselves an ID */
722 spin_lock_irq(&slow_work_queue_lock);
723 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
724 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
725 __set_bit(id, slow_work_ids);
726 slow_work_set_thread_pid(id, current->pid);
727 spin_unlock_irq(&slow_work_queue_lock);
728
729 sprintf(current->comm, "kslowd%03u", id);
730
731 for (;;) {
732 vsmax = vslow_work_proportion;
733 vsmax *= atomic_read(&slow_work_thread_count);
734 vsmax /= 100;
735
736 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
737 TASK_INTERRUPTIBLE);
738 if (!freezing(current) &&
739 !slow_work_threads_should_exit &&
740 !slow_work_available(vsmax) &&
741 !slow_work_cull)
742 schedule();
743 finish_wait(&slow_work_thread_wq, &wait);
744
745 try_to_freeze();
746
747 vsmax = vslow_work_proportion;
748 vsmax *= atomic_read(&slow_work_thread_count);
749 vsmax /= 100;
750
751 if (slow_work_available(vsmax) && slow_work_execute(id)) {
752 cond_resched();
753 if (list_empty(&slow_work_queue) &&
754 list_empty(&vslow_work_queue) &&
755 atomic_read(&slow_work_thread_count) >
756 slow_work_min_threads)
757 slow_work_schedule_cull();
758 continue;
759 }
760
761 if (slow_work_threads_should_exit)
762 break;
763
764 if (slow_work_cull && slow_work_cull_thread())
765 break;
766 }
767
768 spin_lock_irq(&slow_work_queue_lock);
769 slow_work_set_thread_pid(id, 0);
770 __clear_bit(id, slow_work_ids);
771 spin_unlock_irq(&slow_work_queue_lock);
772
773 if (atomic_dec_and_test(&slow_work_thread_count))
774 complete_and_exit(&slow_work_last_thread_exited, 0);
775 return 0;
776}
777
778/*
779 * Handle thread cull timer expiration
780 */
781static void slow_work_cull_timeout(unsigned long data)
782{
783 slow_work_cull = true;
784 wake_up(&slow_work_thread_wq);
785}
786
787/*
788 * Start a new slow work thread
789 */
790static void slow_work_new_thread_execute(struct slow_work *work)
791{
792 struct task_struct *p;
793
794 if (slow_work_threads_should_exit)
795 return;
796
797 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
798 return;
799
800 if (!mutex_trylock(&slow_work_user_lock))
801 return;
802
803 slow_work_may_not_start_new_thread = true;
804 atomic_inc(&slow_work_thread_count);
805 p = kthread_run(slow_work_thread, NULL, "kslowd");
806 if (IS_ERR(p)) {
807 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
808 if (atomic_dec_and_test(&slow_work_thread_count))
809 BUG(); /* we're running on a slow work thread... */
810 mod_timer(&slow_work_oom_timer,
811 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
812 } else {
813 /* ratelimit the starting of new threads */
814 mod_timer(&slow_work_oom_timer, jiffies + 1);
815 }
816
817 mutex_unlock(&slow_work_user_lock);
818}
819
820static const struct slow_work_ops slow_work_new_thread_ops = {
821 .owner = THIS_MODULE,
822 .execute = slow_work_new_thread_execute,
823#ifdef CONFIG_SLOW_WORK_DEBUG
824 .desc = slow_work_new_thread_desc,
825#endif
826};
827
828/*
829 * post-OOM new thread start suppression expiration
830 */
831static void slow_work_oom_timeout(unsigned long data)
832{
833 slow_work_may_not_start_new_thread = false;
834}
835
836#ifdef CONFIG_SYSCTL
837/*
838 * Handle adjustment of the minimum number of threads
839 */
840static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
841 void __user *buffer,
842 size_t *lenp, loff_t *ppos)
843{
844 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
845 int n;
846
847 if (ret == 0) {
848 mutex_lock(&slow_work_user_lock);
849 if (slow_work_user_count > 0) {
850 /* see if we need to start or stop threads */
851 n = atomic_read(&slow_work_thread_count) -
852 slow_work_min_threads;
853
854 if (n < 0 && !slow_work_may_not_start_new_thread)
855 slow_work_enqueue(&slow_work_new_thread);
856 else if (n > 0)
857 slow_work_schedule_cull();
858 }
859 mutex_unlock(&slow_work_user_lock);
860 }
861
862 return ret;
863}
864
865/*
866 * Handle adjustment of the maximum number of threads
867 */
868static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
869 void __user *buffer,
870 size_t *lenp, loff_t *ppos)
871{
872 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
873 int n;
874
875 if (ret == 0) {
876 mutex_lock(&slow_work_user_lock);
877 if (slow_work_user_count > 0) {
878 /* see if we need to stop threads */
879 n = slow_work_max_threads -
880 atomic_read(&slow_work_thread_count);
881
882 if (n < 0)
883 slow_work_schedule_cull();
884 }
885 mutex_unlock(&slow_work_user_lock);
886 }
887
888 return ret;
889}
890#endif /* CONFIG_SYSCTL */
891
892/**
893 * slow_work_register_user - Register a user of the facility
894 * @module: The module about to make use of the facility
895 *
896 * Register a user of the facility, starting up the initial threads if there
897 * aren't any other users at this point. This will return 0 if successful, or
898 * an error if not.
899 */
900int slow_work_register_user(struct module *module)
901{
902 struct task_struct *p;
903 int loop;
904
905 mutex_lock(&slow_work_user_lock);
906
907 if (slow_work_user_count == 0) {
908 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
909 init_completion(&slow_work_last_thread_exited);
910
911 slow_work_threads_should_exit = false;
912 slow_work_init(&slow_work_new_thread,
913 &slow_work_new_thread_ops);
914 slow_work_may_not_start_new_thread = false;
915 slow_work_cull = false;
916
917 /* start the minimum number of threads */
918 for (loop = 0; loop < slow_work_min_threads; loop++) {
919 atomic_inc(&slow_work_thread_count);
920 p = kthread_run(slow_work_thread, NULL, "kslowd");
921 if (IS_ERR(p))
922 goto error;
923 }
924 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
925 }
926
927 slow_work_user_count++;
928 mutex_unlock(&slow_work_user_lock);
929 return 0;
930
931error:
932 if (atomic_dec_and_test(&slow_work_thread_count))
933 complete(&slow_work_last_thread_exited);
934 if (loop > 0) {
935 printk(KERN_ERR "Slow work thread pool:"
936 " Aborting startup on ENOMEM\n");
937 slow_work_threads_should_exit = true;
938 wake_up_all(&slow_work_thread_wq);
939 wait_for_completion(&slow_work_last_thread_exited);
940 printk(KERN_ERR "Slow work thread pool: Aborted\n");
941 }
942 mutex_unlock(&slow_work_user_lock);
943 return PTR_ERR(p);
944}
945EXPORT_SYMBOL(slow_work_register_user);
946
947/*
948 * wait for all outstanding items from the calling module to complete
949 * - note that more items may be queued whilst we're waiting
950 */
951static void slow_work_wait_for_items(struct module *module)
952{
953#ifdef CONFIG_MODULES
954 DECLARE_WAITQUEUE(myself, current);
955 struct slow_work *work;
956 int loop;
957
958 mutex_lock(&slow_work_unreg_sync_lock);
959 add_wait_queue(&slow_work_unreg_wq, &myself);
960
961 for (;;) {
962 spin_lock_irq(&slow_work_queue_lock);
963
964 /* first of all, we wait for the last queued item in each list
965 * to be processed */
966 list_for_each_entry_reverse(work, &vslow_work_queue, link) {
967 if (work->owner == module) {
968 set_current_state(TASK_UNINTERRUPTIBLE);
969 slow_work_unreg_work_item = work;
970 goto do_wait;
971 }
972 }
973 list_for_each_entry_reverse(work, &slow_work_queue, link) {
974 if (work->owner == module) {
975 set_current_state(TASK_UNINTERRUPTIBLE);
976 slow_work_unreg_work_item = work;
977 goto do_wait;
978 }
979 }
980
981 /* then we wait for the items being processed to finish */
982 slow_work_unreg_module = module;
983 smp_mb();
984 for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
985 if (slow_work_thread_processing[loop] == module)
986 goto do_wait;
987 }
988 spin_unlock_irq(&slow_work_queue_lock);
989 break; /* okay, we're done */
990
991 do_wait:
992 spin_unlock_irq(&slow_work_queue_lock);
993 schedule();
994 slow_work_unreg_work_item = NULL;
995 slow_work_unreg_module = NULL;
996 }
997
998 remove_wait_queue(&slow_work_unreg_wq, &myself);
999 mutex_unlock(&slow_work_unreg_sync_lock);
1000#endif /* CONFIG_MODULES */
1001}
1002
1003/**
1004 * slow_work_unregister_user - Unregister a user of the facility
1005 * @module: The module whose items should be cleared
1006 *
1007 * Unregister a user of the facility, killing all the threads if this was the
1008 * last one.
1009 *
1010 * This waits for all the work items belonging to the nominated module to go
1011 * away before proceeding.
1012 */
1013void slow_work_unregister_user(struct module *module)
1014{
1015 /* first of all, wait for all outstanding items from the calling module
1016 * to complete */
1017 if (module)
1018 slow_work_wait_for_items(module);
1019
1020 /* then we can actually go about shutting down the facility if need
1021 * be */
1022 mutex_lock(&slow_work_user_lock);
1023
1024 BUG_ON(slow_work_user_count <= 0);
1025
1026 slow_work_user_count--;
1027 if (slow_work_user_count == 0) {
1028 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
1029 slow_work_threads_should_exit = true;
1030 del_timer_sync(&slow_work_cull_timer);
1031 del_timer_sync(&slow_work_oom_timer);
1032 wake_up_all(&slow_work_thread_wq);
1033 wait_for_completion(&slow_work_last_thread_exited);
1034 printk(KERN_NOTICE "Slow work thread pool:"
1035 " Shut down complete\n");
1036 }
1037
1038 mutex_unlock(&slow_work_user_lock);
1039}
1040EXPORT_SYMBOL(slow_work_unregister_user);
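Putting the two registration calls together, a user of the (now removed) facility would bracket its lifetime roughly as below; the module_init/module_exit pairing is ordinary kernel idiom rather than anything this file mandates.

static int __init myfs_init(void)
{
	int ret;

	/* starts the initial kslowd threads if we are the first user */
	ret = slow_work_register_user(THIS_MODULE);
	if (ret < 0)
		return ret;
	/* ... rest of module initialisation ... */
	return 0;
}

static void __exit myfs_exit(void)
{
	/* waits for this module's outstanding items, then, if we were the
	 * last user, shuts the thread pool down */
	slow_work_unregister_user(THIS_MODULE);
}

module_init(myfs_init);
module_exit(myfs_exit);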
1041
1042/*
1043 * Initialise the slow work facility
1044 */
1045static int __init init_slow_work(void)
1046{
1047 unsigned nr_cpus = num_possible_cpus();
1048
1049 if (slow_work_max_threads < nr_cpus)
1050 slow_work_max_threads = nr_cpus;
1051#ifdef CONFIG_SYSCTL
1052 if (slow_work_max_max_threads < nr_cpus * 2)
1053 slow_work_max_max_threads = nr_cpus * 2;
1054#endif
1055#ifdef CONFIG_SLOW_WORK_DEBUG
1056 {
1057 struct dentry *dbdir;
1058
1059 dbdir = debugfs_create_dir("slow_work", NULL);
1060 if (dbdir && !IS_ERR(dbdir))
1061 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1062 NULL, &slow_work_runqueue_fops);
1063 }
1064#endif
1065 return 0;
1066}
1067
1068subsys_initcall(init_slow_work);
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
deleted file mode 100644
index a29ebd1ef41d..000000000000
--- a/kernel/slow-work.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid;
48#endif
49}
50
51static inline void slow_work_mark_time(struct slow_work *work)
52{
53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME;
55#endif
56}
57
58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{
60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work;
62#endif
63}
64
65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{
67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock);
71#endif
72}
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c970c715d3..12ed8b013e2d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
267 * 267 *
268 * Returns 0 on success, else a negative status code. 268 * Returns 0 on success, else a negative status code.
269 */ 269 */
270int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
271 int wait) 271 int wait)
272{ 272{
273 struct call_single_data d = { 273 struct call_single_data d = {
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single);
336 * 3) any other online cpu in @mask 336 * 3) any other online cpu in @mask
337 */ 337 */
338int smp_call_function_any(const struct cpumask *mask, 338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait) 339 smp_call_func_t func, void *info, int wait)
340{ 340{
341 unsigned int cpu; 341 unsigned int cpu;
342 const struct cpumask *nodemask; 342 const struct cpumask *nodemask;
@@ -365,9 +365,10 @@ call:
365EXPORT_SYMBOL_GPL(smp_call_function_any); 365EXPORT_SYMBOL_GPL(smp_call_function_any);
366 366
367/** 367/**
368 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on a specific CPU
369 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
370 * @data: Pre-allocated and setup data structure 370 * @data: Pre-allocated and setup data structure
371 * @wait: If true, wait until function has completed on specified CPU.
371 * 372 *
372 * Like smp_call_function_single(), but allow caller to pass in a 373 * Like smp_call_function_single(), but allow caller to pass in a
373 * pre-allocated data structure. Useful for embedding @data inside 374 * pre-allocated data structure. Useful for embedding @data inside
@@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
376void __smp_call_function_single(int cpu, struct call_single_data *data, 377void __smp_call_function_single(int cpu, struct call_single_data *data,
377 int wait) 378 int wait)
378{ 379{
379 csd_lock(data); 380 unsigned int this_cpu;
381 unsigned long flags;
380 382
383 this_cpu = get_cpu();
381 /* 384 /*
382 * Can deadlock when called with interrupts disabled. 385 * Can deadlock when called with interrupts disabled.
383 * We allow cpu's that are not yet online though, as no one else can 386 * We allow cpu's that are not yet online though, as no one else can
@@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
387 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() 390 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
388 && !oops_in_progress); 391 && !oops_in_progress);
389 392
390 generic_exec_single(cpu, data, wait); 393 if (cpu == this_cpu) {
394 local_irq_save(flags);
395 data->func(data->info);
396 local_irq_restore(flags);
397 } else {
398 csd_lock(data);
399 generic_exec_single(cpu, data, wait);
400 }
401 put_cpu();
391} 402}
392 403
393/** 404/**
@@ -405,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
405 * must be disabled when calling this function. 416 * must be disabled when calling this function.
406 */ 417 */
407void smp_call_function_many(const struct cpumask *mask, 418void smp_call_function_many(const struct cpumask *mask,
408 void (*func)(void *), void *info, bool wait) 419 smp_call_func_t func, void *info, bool wait)
409{ 420{
410 struct call_function_data *data; 421 struct call_function_data *data;
411 unsigned long flags; 422 unsigned long flags;
@@ -489,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many);
489 * You must not call this function with disabled interrupts or from a 500 * You must not call this function with disabled interrupts or from a
490 * hardware interrupt handler or from a bottom half handler. 501 * hardware interrupt handler or from a bottom half handler.
491 */ 502 */
492int smp_call_function(void (*func)(void *), void *info, int wait) 503int smp_call_function(smp_call_func_t func, void *info, int wait)
493{ 504{
494 preempt_disable(); 505 preempt_disable();
495 smp_call_function_many(cpu_online_mask, func, info, wait); 506 smp_call_function_many(cpu_online_mask, func, info, wait);
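The smp.c hunks above mostly switch the open-coded function-pointer parameters over to the smp_call_func_t typedef and teach __smp_call_function_single() to run the callback locally when the target is the current CPU. As a reminder of the calling convention, here is a small, purely illustrative caller (the CPU number and callback are made up):

static void record_cpu(void *info)
{
	/* runs on the target CPU, in IPI context with interrupts off */
	*(unsigned int *)info = smp_processor_id();
}

static void example_cross_call(void)
{
	unsigned int where = 0;

	/* run record_cpu() on CPU 1 and wait for it to complete */
	if (smp_call_function_single(1, record_cpu, &where, 1) == 0)
		pr_info("callback ran on cpu %u\n", where);
}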
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..18f4be0d5fe0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -67,7 +67,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
67 * to the pending events, so lets the scheduler to balance 67 * to the pending events, so lets the scheduler to balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
77} 77}
78 78
79/* 79/*
80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled.
87 */
88
89/*
80 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
81 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
82 */ 92 */
83#ifdef CONFIG_TRACE_IRQFLAGS 93#ifdef CONFIG_TRACE_IRQFLAGS
84static void __local_bh_disable(unsigned long ip) 94static void __local_bh_disable(unsigned long ip, unsigned int cnt)
85{ 95{
86 unsigned long flags; 96 unsigned long flags;
87 97
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
95 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
96 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
97 */ 107 */
98 preempt_count() += SOFTIRQ_OFFSET; 108 preempt_count() += cnt;
99 /* 109 /*
100 * Were softirqs turned off above: 110 * Were softirqs turned off above:
101 */ 111 */
102 if (softirq_count() == SOFTIRQ_OFFSET) 112 if (softirq_count() == cnt)
103 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
104 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
105 115
106 if (preempt_count() == SOFTIRQ_OFFSET) 116 if (preempt_count() == cnt)
107 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
108} 118}
109#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
110static inline void __local_bh_disable(unsigned long ip) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
111{ 121{
112 add_preempt_count(SOFTIRQ_OFFSET); 122 add_preempt_count(cnt);
113 barrier(); 123 barrier();
114} 124}
115#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
116 126
117void local_bh_disable(void) 127void local_bh_disable(void)
118{ 128{
119 __local_bh_disable((unsigned long)__builtin_return_address(0)); 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET);
120} 131}
121 132
122EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
123 134
135static void __local_bh_enable(unsigned int cnt)
136{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled());
139
140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt);
143}
144
124/* 145/*
125 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
126 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
128 */ 149 */
129void _local_bh_enable(void) 150void _local_bh_enable(void)
130{ 151{
131 WARN_ON_ONCE(in_irq()); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
132 WARN_ON_ONCE(!irqs_disabled());
133
134 if (softirq_count() == SOFTIRQ_OFFSET)
135 trace_softirqs_on((unsigned long)__builtin_return_address(0));
136 sub_preempt_count(SOFTIRQ_OFFSET);
137} 153}
138 154
139EXPORT_SYMBOL(_local_bh_enable); 155EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
147 /* 163 /*
148 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
149 */ 165 */
150 if (softirq_count() == SOFTIRQ_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
151 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
152 /* 168 /*
153 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
154 * softirq processing: 170 * softirq processing:
155 */ 171 */
156 sub_preempt_count(SOFTIRQ_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
157 173
158 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
159 do_softirq(); 175 do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
198 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
199 account_system_vtime(current); 215 account_system_vtime(current);
200 216
201 __local_bh_disable((unsigned long)__builtin_return_address(0)); 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET);
202 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
203 220
204 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
@@ -212,18 +229,20 @@ restart:
212 229
213 do { 230 do {
214 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec;
215 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
217 234
218 trace_softirq_entry(h, softirq_vec); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236
237 trace_softirq_entry(vec_nr);
219 h->action(h); 238 h->action(h);
220 trace_softirq_exit(h, softirq_vec); 239 trace_softirq_exit(vec_nr);
221 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
222 printk(KERN_ERR "huh, entered softirq %td %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
223 "with preempt_count %08x," 242 "with preempt_count %08x,"
224 " exited with %08x?\n", h - softirq_vec, 243 " exited with %08x?\n", vec_nr,
225 softirq_to_name[h - softirq_vec], 244 softirq_to_name[vec_nr], h->action,
226 h->action, prev_count, preempt_count()); 245 prev_count, preempt_count());
227 preempt_count() = prev_count; 246 preempt_count() = prev_count;
228 } 247 }
229 248
@@ -245,7 +264,7 @@ restart:
245 lockdep_softirq_exit(); 264 lockdep_softirq_exit();
246 265
247 account_system_vtime(current); 266 account_system_vtime(current);
248 _local_bh_enable(); 267 __local_bh_enable(SOFTIRQ_OFFSET);
249} 268}
250 269
251#ifndef __ARCH_HAS_DO_SOFTIRQ 270#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +298,16 @@ void irq_enter(void)
279 298
280 rcu_irq_enter(); 299 rcu_irq_enter();
281 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (idle_cpu(cpu) && !in_interrupt()) {
282 __irq_enter(); 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt.
304 */
305 local_bh_disable();
283 tick_check_idle(cpu); 306 tick_check_idle(cpu);
284 } else 307 _local_bh_enable();
285 __irq_enter(); 308 }
309
310 __irq_enter();
286} 311}
287 312
288#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +721,7 @@ static int run_ksoftirqd(void * __bind_cpu)
696{ 721{
697 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
698 723
724 current->flags |= PF_KSOFTIRQD;
699 while (!kthread_should_stop()) { 725 while (!kthread_should_stop()) {
700 preempt_disable(); 726 preempt_disable();
701 if (!local_softirq_pending()) { 727 if (!local_softirq_pending()) {
@@ -886,17 +912,14 @@ int __init __weak early_irq_init(void)
886 return 0; 912 return 0;
887} 913}
888 914
915#ifdef CONFIG_GENERIC_HARDIRQS
889int __init __weak arch_probe_nr_irqs(void) 916int __init __weak arch_probe_nr_irqs(void)
890{ 917{
891 return 0; 918 return NR_IRQS_LEGACY;
892} 919}
893 920
894int __init __weak arch_early_irq_init(void) 921int __init __weak arch_early_irq_init(void)
895{ 922{
896 return 0; 923 return 0;
897} 924}
898 925#endif
899int __weak arch_init_chip_data(struct irq_desc *desc, int node)
900{
901 return 0;
902}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
deleted file mode 100644
index 4b493f67dcb5..000000000000
--- a/kernel/softlockup.c
+++ /dev/null
@@ -1,293 +0,0 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
5 *
6 * this code detects soft lockups: incidents where the kernel does not
7 * reschedule on a CPU for 10 seconds or more.
8 */
9#include <linux/mm.h>
10#include <linux/cpu.h>
11#include <linux/nmi.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/freezer.h>
15#include <linux/kthread.h>
16#include <linux/lockdep.h>
17#include <linux/notifier.h>
18#include <linux/module.h>
19#include <linux/sysctl.h>
20
21#include <asm/irq_regs.h>
22
23static DEFINE_SPINLOCK(print_lock);
24
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
29
30static int __read_mostly did_panic;
31int __read_mostly softlockup_thresh = 60;
32
33/*
34 * Should we panic (and reboot, if panic_timeout= is set) when a
35 * soft-lockup occurs:
36 */
37unsigned int __read_mostly softlockup_panic =
38 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
39
40static int __init softlockup_panic_setup(char *str)
41{
42 softlockup_panic = simple_strtoul(str, NULL, 0);
43
44 return 1;
45}
46__setup("softlockup_panic=", softlockup_panic_setup);
47
48static int
49softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
50{
51 did_panic = 1;
52
53 return NOTIFY_DONE;
54}
55
56static struct notifier_block panic_block = {
57 .notifier_call = softlock_panic,
58};
59
60/*
61 * Returns seconds, approximately. We don't need nanosecond
62 * resolution, and we don't need to waste time with a big divide when
63 * 2^30ns == 1.074s.
64 */
65static unsigned long get_timestamp(int this_cpu)
66{
67 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
68}
69
70static void __touch_softlockup_watchdog(void)
71{
72 int this_cpu = raw_smp_processor_id();
73
74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
75}
76
77void touch_softlockup_watchdog(void)
78{
79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
80}
81EXPORT_SYMBOL(touch_softlockup_watchdog);
82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
89void touch_all_softlockup_watchdogs(void)
90{
91 int cpu;
92
93 /* Cause each CPU to re-update its timestamp rather than complain */
94 for_each_online_cpu(cpu)
95 per_cpu(softlockup_touch_ts, cpu) = 0;
96}
97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
98
99int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
100 void __user *buffer,
101 size_t *lenp, loff_t *ppos)
102{
103 touch_all_softlockup_watchdogs();
104 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
105}
106
107/*
108 * This callback runs from the timer interrupt, and checks
109 * whether the watchdog thread has hung or not:
110 */
111void softlockup_tick(void)
112{
113 int this_cpu = smp_processor_id();
114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
115 unsigned long print_ts;
116 struct pt_regs *regs = get_irq_regs();
117 unsigned long now;
118
119 /* Is detection switched off? */
120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
121 /* Be sure we don't false trigger if switched back on */
122 if (touch_ts)
123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
124 return;
125 }
126
127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
136 __touch_softlockup_watchdog();
137 return;
138 }
139
140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
141
142 /* report at most once a second */
143 if (print_ts == touch_ts || did_panic)
144 return;
145
146 /* do not print during early bootup: */
147 if (unlikely(system_state != SYSTEM_RUNNING)) {
148 __touch_softlockup_watchdog();
149 return;
150 }
151
152 now = get_timestamp(this_cpu);
153
154 /*
155 * Wake up the high-prio watchdog task twice per
156 * threshold timespan.
157 */
158 if (time_after(now - softlockup_thresh/2, touch_ts))
159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
160
161 /* Warn about unreasonable delays: */
162 if (time_before_eq(now - softlockup_thresh, touch_ts))
163 return;
164
165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
166
167 spin_lock(&print_lock);
168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
169 this_cpu, now - touch_ts,
170 current->comm, task_pid_nr(current));
171 print_modules();
172 print_irqtrace_events(current);
173 if (regs)
174 show_regs(regs);
175 else
176 dump_stack();
177 spin_unlock(&print_lock);
178
179 if (softlockup_panic)
180 panic("softlockup: hung tasks");
181}
182
183/*
184 * The watchdog thread - runs every second and touches the timestamp.
185 */
186static int watchdog(void *__bind_cpu)
187{
188 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
189
190 sched_setscheduler(current, SCHED_FIFO, &param);
191
192 /* initialize timestamp */
193 __touch_softlockup_watchdog();
194
195 set_current_state(TASK_INTERRUPTIBLE);
196 /*
197 * Run briefly once per second to reset the softlockup timestamp.
198 * If this gets delayed for more than 60 seconds then the
199 * debug-printout triggers in softlockup_tick().
200 */
201 while (!kthread_should_stop()) {
202 __touch_softlockup_watchdog();
203 schedule();
204
205 if (kthread_should_stop())
206 break;
207
208 set_current_state(TASK_INTERRUPTIBLE);
209 }
210 __set_current_state(TASK_RUNNING);
211
212 return 0;
213}
214
215/*
216 * Create/destroy watchdog threads as CPUs come and go:
217 */
218static int __cpuinit
219cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
220{
221 int hotcpu = (unsigned long)hcpu;
222 struct task_struct *p;
223
224 switch (action) {
225 case CPU_UP_PREPARE:
226 case CPU_UP_PREPARE_FROZEN:
227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
229 if (IS_ERR(p)) {
230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
231 return NOTIFY_BAD;
232 }
233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
234 per_cpu(softlockup_watchdog, hotcpu) = p;
235 kthread_bind(p, hotcpu);
236 break;
237 case CPU_ONLINE:
238 case CPU_ONLINE_FROZEN:
239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
240 break;
241#ifdef CONFIG_HOTPLUG_CPU
242 case CPU_UP_CANCELED:
243 case CPU_UP_CANCELED_FROZEN:
244 if (!per_cpu(softlockup_watchdog, hotcpu))
245 break;
246 /* Unbind so it can run. Fall thru. */
247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
248 cpumask_any(cpu_online_mask));
249 case CPU_DEAD:
250 case CPU_DEAD_FROZEN:
251 p = per_cpu(softlockup_watchdog, hotcpu);
252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
253 kthread_stop(p);
254 break;
255#endif /* CONFIG_HOTPLUG_CPU */
256 }
257 return NOTIFY_OK;
258}
259
260static struct notifier_block __cpuinitdata cpu_nfb = {
261 .notifier_call = cpu_callback
262};
263
264static int __initdata nosoftlockup;
265
266static int __init nosoftlockup_setup(char *str)
267{
268 nosoftlockup = 1;
269 return 1;
270}
271__setup("nosoftlockup", nosoftlockup_setup);
272
273static int __init spawn_softlockup_task(void)
274{
275 void *cpu = (void *)(long)smp_processor_id();
276 int err;
277
278 if (nosoftlockup)
279 return 0;
280
281 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
282 if (err == NOTIFY_BAD) {
283 BUG();
284 return 1;
285 }
286 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
287 register_cpu_notifier(&cpu_nfb);
288
289 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
290
291 return 0;
292}
293early_initcall(spawn_softlockup_task);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd509..c71e07500536 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
46int __init_srcu_struct(struct srcu_struct *sp, const char *name, 46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key) 47 struct lock_class_key *key)
48{ 48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */ 49 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp)); 50 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0); 51 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp); 52 return init_srcu_struct_fields(sp);
55} 53}
56EXPORT_SYMBOL_GPL(__init_srcu_struct); 54EXPORT_SYMBOL_GPL(__init_srcu_struct);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 70f8d90331e9..2df820b03beb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,9 +35,9 @@ struct cpu_stop_done {
35/* the actual stopper, one per every possible cpu, enabled on online cpus */ 35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper { 36struct cpu_stopper {
37 spinlock_t lock; 37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */
38 struct list_head works; /* list of pending works */ 39 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */ 40 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
@@ -262,7 +262,7 @@ repeat:
262 cpu_stop_fn_t fn = work->fn; 262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg; 263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done; 264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN]; 265 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
266 266
267 __set_current_state(TASK_RUNNING); 267 __set_current_state(TASK_RUNNING);
268 268
@@ -287,11 +287,12 @@ repeat:
287 goto repeat; 287 goto repeat;
288} 288}
289 289
290extern void sched_set_stop_task(int cpu, struct task_struct *stop);
291
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 292/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 293static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu) 294 unsigned long action, void *hcpu)
293{ 295{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu; 296 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 297 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p; 298 struct task_struct *p;
@@ -303,14 +304,14 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu); 305 cpu);
305 if (IS_ERR(p)) 306 if (IS_ERR(p))
306 return NOTIFY_BAD; 307 return notifier_from_errno(PTR_ERR(p));
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p);
309 stopper->thread = p; 311 stopper->thread = p;
310 break; 312 break;
311 313
312 case CPU_ONLINE: 314 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */ 315 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread); 316 wake_up_process(stopper->thread);
316 /* mark enabled */ 317 /* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
325 { 326 {
326 struct cpu_stop_work *work; 327 struct cpu_stop_work *work;
327 328
329 sched_set_stop_task(cpu, NULL);
328 /* kill the stopper */ 330 /* kill the stopper */
329 kthread_stop(stopper->thread); 331 kthread_stop(stopper->thread);
330 /* drain remaining works */ 332 /* drain remaining works */
@@ -370,7 +372,7 @@ static int __init cpu_stop_init(void)
370 /* start one for the boot cpu */ 372 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, 373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu); 374 bcpu);
373 BUG_ON(err == NOTIFY_BAD); 375 BUG_ON(err != NOTIFY_OK);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier); 377 register_cpu_notifier(&cpu_stop_cpu_notifier);
376 378
diff --git a/kernel/sys.c b/kernel/sys.c
index e83ddbbaf89d..7f5a0cd296a9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
931 pgid = pid; 931 pgid = pid;
932 if (pgid < 0) 932 if (pgid < 0)
933 return -EINVAL; 933 return -EINVAL;
934 rcu_read_lock();
934 935
935 /* From this point forward we keep holding onto the tasklist lock 936 /* From this point forward we keep holding onto the tasklist lock
936 * so that our parent does not change from under us. -DaveM 937 * so that our parent does not change from under us. -DaveM
@@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
984out: 985out:
985 /* All paths lead to here, thus we are safe. -DaveM */ 986 /* All paths lead to here, thus we are safe. -DaveM */
986 write_unlock_irq(&tasklist_lock); 987 write_unlock_irq(&tasklist_lock);
988 rcu_read_unlock();
987 return err; 989 return err;
988} 990}
989 991
@@ -1236,15 +1238,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1236 1238
1237SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1239SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1238{ 1240{
1239 if (resource >= RLIM_NLIMITS) 1241 struct rlimit value;
1240 return -EINVAL; 1242 int ret;
1241 else { 1243
1242 struct rlimit value; 1244 ret = do_prlimit(current, resource, NULL, &value);
1243 task_lock(current->group_leader); 1245 if (!ret)
1244 value = current->signal->rlim[resource]; 1246 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1245 task_unlock(current->group_leader); 1247
1246 return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1248 return ret;
1247 }
1248} 1249}
1249 1250
1250#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1251#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
@@ -1272,44 +1273,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1272 1273
1273#endif 1274#endif
1274 1275
1275SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1276static inline bool rlim64_is_infinity(__u64 rlim64)
1276{ 1277{
1277 struct rlimit new_rlim, *old_rlim; 1278#if BITS_PER_LONG < 64
1278 int retval; 1279 return rlim64 >= ULONG_MAX;
1280#else
1281 return rlim64 == RLIM64_INFINITY;
1282#endif
1283}
1284
1285static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
1286{
1287 if (rlim->rlim_cur == RLIM_INFINITY)
1288 rlim64->rlim_cur = RLIM64_INFINITY;
1289 else
1290 rlim64->rlim_cur = rlim->rlim_cur;
1291 if (rlim->rlim_max == RLIM_INFINITY)
1292 rlim64->rlim_max = RLIM64_INFINITY;
1293 else
1294 rlim64->rlim_max = rlim->rlim_max;
1295}
1296
1297static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
1298{
1299 if (rlim64_is_infinity(rlim64->rlim_cur))
1300 rlim->rlim_cur = RLIM_INFINITY;
1301 else
1302 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
1303 if (rlim64_is_infinity(rlim64->rlim_max))
1304 rlim->rlim_max = RLIM_INFINITY;
1305 else
1306 rlim->rlim_max = (unsigned long)rlim64->rlim_max;
1307}
1308
1309/* make sure you are allowed to change @tsk limits before calling this */
1310int do_prlimit(struct task_struct *tsk, unsigned int resource,
1311 struct rlimit *new_rlim, struct rlimit *old_rlim)
1312{
1313 struct rlimit *rlim;
1314 int retval = 0;
1279 1315
1280 if (resource >= RLIM_NLIMITS) 1316 if (resource >= RLIM_NLIMITS)
1281 return -EINVAL; 1317 return -EINVAL;
1282 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1318 if (new_rlim) {
1283 return -EFAULT; 1319 if (new_rlim->rlim_cur > new_rlim->rlim_max)
1284 if (new_rlim.rlim_cur > new_rlim.rlim_max) 1320 return -EINVAL;
1285 return -EINVAL; 1321 if (resource == RLIMIT_NOFILE &&
1286 old_rlim = current->signal->rlim + resource; 1322 new_rlim->rlim_max > sysctl_nr_open)
1287 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1323 return -EPERM;
1288 !capable(CAP_SYS_RESOURCE))
1289 return -EPERM;
1290 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1291 return -EPERM;
1292
1293 retval = security_task_setrlimit(resource, &new_rlim);
1294 if (retval)
1295 return retval;
1296
1297 if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
1298 /*
1299 * The caller is asking for an immediate RLIMIT_CPU
1300 * expiry. But we use the zero value to mean "it was
1301 * never set". So let's cheat and make it one second
1302 * instead
1303 */
1304 new_rlim.rlim_cur = 1;
1305 } 1324 }
1306 1325
1307 task_lock(current->group_leader); 1326 /* protect tsk->signal and tsk->sighand from disappearing */
1308 *old_rlim = new_rlim; 1327 read_lock(&tasklist_lock);
1309 task_unlock(current->group_leader); 1328 if (!tsk->sighand) {
1310 1329 retval = -ESRCH;
1311 if (resource != RLIMIT_CPU)
1312 goto out; 1330 goto out;
1331 }
1332
1333 rlim = tsk->signal->rlim + resource;
1334 task_lock(tsk->group_leader);
1335 if (new_rlim) {
1336 if (new_rlim->rlim_max > rlim->rlim_max &&
1337 !capable(CAP_SYS_RESOURCE))
1338 retval = -EPERM;
1339 if (!retval)
1340 retval = security_task_setrlimit(tsk->group_leader,
1341 resource, new_rlim);
1342 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1343 /*
1344 * The caller is asking for an immediate RLIMIT_CPU
1345 * expiry. But we use the zero value to mean "it was
1346 * never set". So let's cheat and make it one second
1347 * instead
1348 */
1349 new_rlim->rlim_cur = 1;
1350 }
1351 }
1352 if (!retval) {
1353 if (old_rlim)
1354 *old_rlim = *rlim;
1355 if (new_rlim)
1356 *rlim = *new_rlim;
1357 }
1358 task_unlock(tsk->group_leader);
1313 1359
1314 /* 1360 /*
1315 * RLIMIT_CPU handling. Note that the kernel fails to return an error 1361 * RLIMIT_CPU handling. Note that the kernel fails to return an error
@@ -1317,14 +1363,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1317 * very long-standing error, and fixing it now risks breakage of 1363 * very long-standing error, and fixing it now risks breakage of
1318 * applications, so we live with it 1364 * applications, so we live with it
1319 */ 1365 */
1320 if (new_rlim.rlim_cur == RLIM_INFINITY) 1366 if (!retval && new_rlim && resource == RLIMIT_CPU &&
1321 goto out; 1367 new_rlim->rlim_cur != RLIM_INFINITY)
1322 1368 update_rlimit_cpu(tsk, new_rlim->rlim_cur);
1323 update_rlimit_cpu(new_rlim.rlim_cur);
1324out: 1369out:
1370 read_unlock(&tasklist_lock);
1371 return retval;
1372}
1373
1374/* rcu lock must be held */
1375static int check_prlimit_permission(struct task_struct *task)
1376{
1377 const struct cred *cred = current_cred(), *tcred;
1378
1379 tcred = __task_cred(task);
1380 if ((cred->uid != tcred->euid ||
1381 cred->uid != tcred->suid ||
1382 cred->uid != tcred->uid ||
1383 cred->gid != tcred->egid ||
1384 cred->gid != tcred->sgid ||
1385 cred->gid != tcred->gid) &&
1386 !capable(CAP_SYS_RESOURCE)) {
1387 return -EPERM;
1388 }
1389
1325 return 0; 1390 return 0;
1326} 1391}
1327 1392
1393SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1394 const struct rlimit64 __user *, new_rlim,
1395 struct rlimit64 __user *, old_rlim)
1396{
1397 struct rlimit64 old64, new64;
1398 struct rlimit old, new;
1399 struct task_struct *tsk;
1400 int ret;
1401
1402 if (new_rlim) {
1403 if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1404 return -EFAULT;
1405 rlim64_to_rlim(&new64, &new);
1406 }
1407
1408 rcu_read_lock();
1409 tsk = pid ? find_task_by_vpid(pid) : current;
1410 if (!tsk) {
1411 rcu_read_unlock();
1412 return -ESRCH;
1413 }
1414 ret = check_prlimit_permission(tsk);
1415 if (ret) {
1416 rcu_read_unlock();
1417 return ret;
1418 }
1419 get_task_struct(tsk);
1420 rcu_read_unlock();
1421
1422 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
1423 old_rlim ? &old : NULL);
1424
1425 if (!ret && old_rlim) {
1426 rlim_to_rlim64(&old, &old64);
1427 if (copy_to_user(old_rlim, &old64, sizeof(old64)))
1428 ret = -EFAULT;
1429 }
1430
1431 put_task_struct(tsk);
1432 return ret;
1433}
1434
1435SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1436{
1437 struct rlimit new_rlim;
1438
1439 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1440 return -EFAULT;
1441 return do_prlimit(current, resource, &new_rlim, NULL);
1442}
1443
1328/* 1444/*
1329 * It would make sense to put struct rusage in the task_struct, 1445 * It would make sense to put struct rusage in the task_struct,
1330 * except that would make the task_struct be *really big*. After 1446 * except that would make the task_struct be *really big*. After
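
The kernel/sys.c hunks above replace the old setrlimit-only path with do_prlimit(), which can read and write another task's limits under tasklist_lock, and expose it through the new prlimit64 syscall. Below is a minimal userspace sketch of calling it; this is illustrative only and assumes headers that already define SYS_prlimit64 (a glibc prlimit() wrapper came later), a reachable target pid of 1234, and a local struct whose layout mirrors the kernel's struct rlimit64 (two 64-bit fields). Passing NULL for the new-limit pointer turns the call into a pure query.

    #include <stdio.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <sys/resource.h>

    /* same layout as the kernel's struct rlimit64 */
    struct rlimit64_sketch {
            uint64_t rlim_cur;
            uint64_t rlim_max;
    };

    int main(void)
    {
            struct rlimit64_sketch newlim = { 4096, 8192 };  /* soft, hard */
            struct rlimit64_sketch oldlim;
            pid_t target = 1234;                             /* illustrative pid */

            /* set RLIMIT_NOFILE on the target and read back its old value */
            if (syscall(SYS_prlimit64, target, RLIMIT_NOFILE, &newlim, &oldlim)) {
                    perror("prlimit64");
                    return 1;
            }
            printf("previous soft=%llu hard=%llu\n",
                   (unsigned long long)oldlim.rlim_cur,
                   (unsigned long long)oldlim.rlim_max);
            return 0;
    }
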
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ffe..c782fe9924c7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -50,6 +50,7 @@ cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg); 51cond_syscall(sys_recvmmsg);
52cond_syscall(compat_sys_recvmsg); 52cond_syscall(compat_sys_recvmsg);
53cond_syscall(compat_sys_recv);
53cond_syscall(compat_sys_recvfrom); 54cond_syscall(compat_sys_recvfrom);
54cond_syscall(compat_sys_recvmmsg); 55cond_syscall(compat_sys_recvmmsg);
55cond_syscall(sys_socketcall); 56cond_syscall(sys_socketcall);
@@ -181,3 +182,7 @@ cond_syscall(sys_eventfd2);
181 182
182/* performance counters: */ 183/* performance counters: */
183cond_syscall(sys_perf_event_open); 184cond_syscall(sys_perf_event_open);
185
186/* fanotify! */
187cond_syscall(sys_fanotify_init);
188cond_syscall(sys_fanotify_mark);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d24f761f4876..c33a1edb799f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,16 +44,17 @@
44#include <linux/times.h> 44#include <linux/times.h>
45#include <linux/limits.h> 45#include <linux/limits.h>
46#include <linux/dcache.h> 46#include <linux/dcache.h>
47#include <linux/dnotify.h>
47#include <linux/syscalls.h> 48#include <linux/syscalls.h>
48#include <linux/vmstat.h> 49#include <linux/vmstat.h>
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/acpi.h> 51#include <linux/acpi.h>
51#include <linux/reboot.h> 52#include <linux/reboot.h>
52#include <linux/ftrace.h> 53#include <linux/ftrace.h>
53#include <linux/slow-work.h>
54#include <linux/perf_event.h> 54#include <linux/perf_event.h>
55#include <linux/kprobes.h> 55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
57 58
58#include <asm/uaccess.h> 59#include <asm/uaccess.h>
59#include <asm/processor.h> 60#include <asm/processor.h>
@@ -76,15 +77,16 @@
76#include <scsi/sg.h> 77#include <scsi/sg.h>
77#endif 78#endif
78 79
80#ifdef CONFIG_LOCKUP_DETECTOR
81#include <linux/nmi.h>
82#endif
83
79 84
80#if defined(CONFIG_SYSCTL) 85#if defined(CONFIG_SYSCTL)
81 86
82/* External variables not in a header file. */ 87/* External variables not in a header file. */
83extern int sysctl_overcommit_memory; 88extern int sysctl_overcommit_memory;
84extern int sysctl_overcommit_ratio; 89extern int sysctl_overcommit_ratio;
85extern int sysctl_panic_on_oom;
86extern int sysctl_oom_kill_allocating_task;
87extern int sysctl_oom_dump_tasks;
88extern int max_threads; 90extern int max_threads;
89extern int core_uses_pid; 91extern int core_uses_pid;
90extern int suid_dumpable; 92extern int suid_dumpable;
@@ -106,7 +108,7 @@ extern int blk_iopoll_enabled;
106#endif 108#endif
107 109
108/* Constants used for minimum and maximum */ 110/* Constants used for minimum and maximum */
109#ifdef CONFIG_DETECT_SOFTLOCKUP 111#ifdef CONFIG_LOCKUP_DETECTOR
110static int sixty = 60; 112static int sixty = 60;
111static int neg_one = -1; 113static int neg_one = -1;
112#endif 114#endif
@@ -130,6 +132,9 @@ static int min_percpu_pagelist_fract = 8;
130 132
131static int ngroups_max = NGROUPS_MAX; 133static int ngroups_max = NGROUPS_MAX;
132 134
135#ifdef CONFIG_INOTIFY_USER
136#include <linux/inotify.h>
137#endif
133#ifdef CONFIG_SPARC 138#ifdef CONFIG_SPARC
134#include <asm/system.h> 139#include <asm/system.h>
135#endif 140#endif
@@ -156,8 +161,6 @@ extern int no_unaligned_warning;
156extern int unaligned_dump_stack; 161extern int unaligned_dump_stack;
157#endif 162#endif
158 163
159extern struct ratelimit_state printk_ratelimit_state;
160
161#ifdef CONFIG_PROC_SYSCTL 164#ifdef CONFIG_PROC_SYSCTL
162static int proc_do_cad_pid(struct ctl_table *table, int write, 165static int proc_do_cad_pid(struct ctl_table *table, int write,
163 void __user *buffer, size_t *lenp, loff_t *ppos); 166 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -206,9 +209,6 @@ static struct ctl_table fs_table[];
206static struct ctl_table debug_table[]; 209static struct ctl_table debug_table[];
207static struct ctl_table dev_table[]; 210static struct ctl_table dev_table[];
208extern struct ctl_table random_table[]; 211extern struct ctl_table random_table[];
209#ifdef CONFIG_INOTIFY_USER
210extern struct ctl_table inotify_table[];
211#endif
212#ifdef CONFIG_EPOLL 212#ifdef CONFIG_EPOLL
213extern struct ctl_table epoll_table[]; 213extern struct ctl_table epoll_table[];
214#endif 214#endif
@@ -562,7 +562,7 @@ static struct ctl_table kern_table[] = {
562 .extra2 = &one, 562 .extra2 = &one,
563 }, 563 },
564#endif 564#endif
565#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 565#ifdef CONFIG_HOTPLUG
566 { 566 {
567 .procname = "hotplug", 567 .procname = "hotplug",
568 .data = &uevent_helper, 568 .data = &uevent_helper,
@@ -710,7 +710,34 @@ static struct ctl_table kern_table[] = {
710 .mode = 0444, 710 .mode = 0444,
711 .proc_handler = proc_dointvec, 711 .proc_handler = proc_dointvec,
712 }, 712 },
713#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 713#if defined(CONFIG_LOCKUP_DETECTOR)
714 {
715 .procname = "watchdog",
716 .data = &watchdog_enabled,
717 .maxlen = sizeof (int),
718 .mode = 0644,
719 .proc_handler = proc_dowatchdog_enabled,
720 },
721 {
722 .procname = "watchdog_thresh",
723 .data = &softlockup_thresh,
724 .maxlen = sizeof(int),
725 .mode = 0644,
726 .proc_handler = proc_dowatchdog_thresh,
727 .extra1 = &neg_one,
728 .extra2 = &sixty,
729 },
730 {
731 .procname = "softlockup_panic",
732 .data = &softlockup_panic,
733 .maxlen = sizeof(int),
734 .mode = 0644,
735 .proc_handler = proc_dointvec_minmax,
736 .extra1 = &zero,
737 .extra2 = &one,
738 },
739#endif
740#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
714 { 741 {
715 .procname = "unknown_nmi_panic", 742 .procname = "unknown_nmi_panic",
716 .data = &unknown_nmi_panic, 743 .data = &unknown_nmi_panic,
@@ -813,26 +840,6 @@ static struct ctl_table kern_table[] = {
813 .proc_handler = proc_dointvec, 840 .proc_handler = proc_dointvec,
814 }, 841 },
815#endif 842#endif
816#ifdef CONFIG_DETECT_SOFTLOCKUP
817 {
818 .procname = "softlockup_panic",
819 .data = &softlockup_panic,
820 .maxlen = sizeof(int),
821 .mode = 0644,
822 .proc_handler = proc_dointvec_minmax,
823 .extra1 = &zero,
824 .extra2 = &one,
825 },
826 {
827 .procname = "softlockup_thresh",
828 .data = &softlockup_thresh,
829 .maxlen = sizeof(int),
830 .mode = 0644,
831 .proc_handler = proc_dosoftlockup_thresh,
832 .extra1 = &neg_one,
833 .extra2 = &sixty,
834 },
835#endif
836#ifdef CONFIG_DETECT_HUNG_TASK 843#ifdef CONFIG_DETECT_HUNG_TASK
837 { 844 {
838 .procname = "hung_task_panic", 845 .procname = "hung_task_panic",
@@ -906,13 +913,6 @@ static struct ctl_table kern_table[] = {
906 .proc_handler = proc_dointvec, 913 .proc_handler = proc_dointvec,
907 }, 914 },
908#endif 915#endif
909#ifdef CONFIG_SLOW_WORK
910 {
911 .procname = "slow-work",
912 .mode = 0555,
913 .child = slow_work_sysctls,
914 },
915#endif
916#ifdef CONFIG_PERF_EVENTS 916#ifdef CONFIG_PERF_EVENTS
917 { 917 {
918 .procname = "perf_event_paranoid", 918 .procname = "perf_event_paranoid",
@@ -1338,28 +1338,28 @@ static struct ctl_table fs_table[] = {
1338 .data = &inodes_stat, 1338 .data = &inodes_stat,
1339 .maxlen = 2*sizeof(int), 1339 .maxlen = 2*sizeof(int),
1340 .mode = 0444, 1340 .mode = 0444,
1341 .proc_handler = proc_dointvec, 1341 .proc_handler = proc_nr_inodes,
1342 }, 1342 },
1343 { 1343 {
1344 .procname = "inode-state", 1344 .procname = "inode-state",
1345 .data = &inodes_stat, 1345 .data = &inodes_stat,
1346 .maxlen = 7*sizeof(int), 1346 .maxlen = 7*sizeof(int),
1347 .mode = 0444, 1347 .mode = 0444,
1348 .proc_handler = proc_dointvec, 1348 .proc_handler = proc_nr_inodes,
1349 }, 1349 },
1350 { 1350 {
1351 .procname = "file-nr", 1351 .procname = "file-nr",
1352 .data = &files_stat, 1352 .data = &files_stat,
1353 .maxlen = 3*sizeof(int), 1353 .maxlen = sizeof(files_stat),
1354 .mode = 0444, 1354 .mode = 0444,
1355 .proc_handler = proc_nr_files, 1355 .proc_handler = proc_nr_files,
1356 }, 1356 },
1357 { 1357 {
1358 .procname = "file-max", 1358 .procname = "file-max",
1359 .data = &files_stat.max_files, 1359 .data = &files_stat.max_files,
1360 .maxlen = sizeof(int), 1360 .maxlen = sizeof(files_stat.max_files),
1361 .mode = 0644, 1361 .mode = 0644,
1362 .proc_handler = proc_dointvec, 1362 .proc_handler = proc_doulongvec_minmax,
1363 }, 1363 },
1364 { 1364 {
1365 .procname = "nr_open", 1365 .procname = "nr_open",
@@ -1375,7 +1375,7 @@ static struct ctl_table fs_table[] = {
1375 .data = &dentry_stat, 1375 .data = &dentry_stat,
1376 .maxlen = 6*sizeof(int), 1376 .maxlen = 6*sizeof(int),
1377 .mode = 0444, 1377 .mode = 0444,
1378 .proc_handler = proc_dointvec, 1378 .proc_handler = proc_nr_dentry,
1379 }, 1379 },
1380 { 1380 {
1381 .procname = "overflowuid", 1381 .procname = "overflowuid",
@@ -1711,10 +1711,7 @@ static __init int sysctl_init(void)
1711{ 1711{
1712 sysctl_set_parent(NULL, root_table); 1712 sysctl_set_parent(NULL, root_table);
1713#ifdef CONFIG_SYSCTL_SYSCALL_CHECK 1713#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1714 { 1714 sysctl_check_table(current->nsproxy, root_table);
1715 int err;
1716 err = sysctl_check_table(current->nsproxy, root_table);
1717 }
1718#endif 1715#endif
1719 return 0; 1716 return 0;
1720} 1717}
@@ -2486,7 +2483,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2486 kbuf[left] = 0; 2483 kbuf[left] = 0;
2487 } 2484 }
2488 2485
2489 for (; left && vleft--; i++, min++, max++, first=0) { 2486 for (; left && vleft--; i++, first = 0) {
2490 unsigned long val; 2487 unsigned long val;
2491 2488
2492 if (write) { 2489 if (write) {
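
In the fs_table hunk above, file-max switches from an int handler to proc_doulongvec_minmax with maxlen sized to the unsigned long it actually exports, so large values are no longer truncated to int. A minimal sketch of wiring up a tunable the same way from a module follows; every name here (the "example" directory, the variable, the bounds) is hypothetical, and the extra1/extra2 pointers must be unsigned long because that is what proc_doulongvec_minmax compares against.

    #include <linux/module.h>
    #include <linux/sysctl.h>

    static unsigned long example_max = 1024;         /* hypothetical tunable */
    static unsigned long example_lo;                 /* lower bound: 0 */
    static unsigned long example_hi = 1 << 20;       /* upper bound */

    static struct ctl_table example_table[] = {
            {
                    .procname     = "example-max",
                    .data         = &example_max,
                    .maxlen       = sizeof(example_max),  /* unsigned long, not int */
                    .mode         = 0644,
                    .proc_handler = proc_doulongvec_minmax,
                    .extra1       = &example_lo,
                    .extra2       = &example_hi,
            },
            { }
    };

    static struct ctl_table example_dir[] = {
            {
                    .procname = "example",
                    .mode     = 0555,
                    .child    = example_table,
            },
            { }
    };

    static struct ctl_table_header *example_header;

    static int __init example_sysctl_init(void)
    {
            example_header = register_sysctl_table(example_dir);
            return example_header ? 0 : -ENOMEM;
    }

    static void __exit example_sysctl_exit(void)
    {
            unregister_sysctl_table(example_header);
    }

    module_init(example_sysctl_init);
    module_exit(example_sysctl_exit);
    MODULE_LICENSE("GPL");
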
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 04cdcf72c827..10b90d8a03c4 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
143 if (!table->maxlen) 143 if (!table->maxlen)
144 set_fail(&fail, table, "No maxlen"); 144 set_fail(&fail, table, "No maxlen");
145 } 145 }
146 if ((table->proc_handler == proc_doulongvec_minmax) ||
147 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
148 if (table->maxlen > sizeof (unsigned long)) {
149 if (!table->extra1)
150 set_fail(&fail, table, "No min");
151 if (!table->extra2)
152 set_fail(&fail, table, "No max");
153 }
154 }
155#ifdef CONFIG_PROC_SYSCTL 146#ifdef CONFIG_PROC_SYSCTL
156 if (table->procname && !table->proc_handler) 147 if (table->procname && !table->proc_handler)
157 set_fail(&fail, table, "No proc_handler"); 148 set_fail(&fail, table, "No proc_handler");
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..c8231fb15708 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 175 up_write(&listeners->sem);
176} 176}
177 177
178static int fill_pid(pid_t pid, struct task_struct *tsk, 178static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct taskstats *stats)
180{ 179{
181 int rc = 0;
182
183 if (!tsk) {
184 rcu_read_lock();
185 tsk = find_task_by_vpid(pid);
186 if (tsk)
187 get_task_struct(tsk);
188 rcu_read_unlock();
189 if (!tsk)
190 return -ESRCH;
191 } else
192 get_task_struct(tsk);
193
194 memset(stats, 0, sizeof(*stats)); 180 memset(stats, 0, sizeof(*stats));
195 /* 181 /*
196 * Each accounting subsystem adds calls to its functions to 182 * Each accounting subsystem adds calls to its functions to
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
209 195
210 /* fill in extended acct fields */ 196 /* fill in extended acct fields */
211 xacct_add_tsk(stats, tsk); 197 xacct_add_tsk(stats, tsk);
198}
212 199
213 /* Define err: label here if needed */ 200static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
214 put_task_struct(tsk); 201{
215 return rc; 202 struct task_struct *tsk;
216 203
204 rcu_read_lock();
205 tsk = find_task_by_vpid(pid);
206 if (tsk)
207 get_task_struct(tsk);
208 rcu_read_unlock();
209 if (!tsk)
210 return -ESRCH;
211 fill_stats(tsk, stats);
212 put_task_struct(tsk);
213 return 0;
217} 214}
218 215
219static int fill_tgid(pid_t tgid, struct task_struct *first, 216static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
220 struct taskstats *stats)
221{ 217{
222 struct task_struct *tsk; 218 struct task_struct *tsk, *first;
223 unsigned long flags; 219 unsigned long flags;
224 int rc = -ESRCH; 220 int rc = -ESRCH;
225 221
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 * leaders who are already counted with the dead tasks 224 * leaders who are already counted with the dead tasks
229 */ 225 */
230 rcu_read_lock(); 226 rcu_read_lock();
231 if (!first) 227 first = find_task_by_vpid(tgid);
232 first = find_task_by_vpid(tgid);
233 228
234 if (!first || !lock_task_sighand(first, &flags)) 229 if (!first || !lock_task_sighand(first, &flags))
235 goto out; 230 goto out;
@@ -268,7 +263,6 @@ out:
268 return rc; 263 return rc;
269} 264}
270 265
271
272static void fill_tgid_exit(struct task_struct *tsk) 266static void fill_tgid_exit(struct task_struct *tsk)
273{ 267{
274 unsigned long flags; 268 unsigned long flags;
@@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
360 struct nlattr *na, *ret; 354 struct nlattr *na, *ret;
361 int aggr; 355 int aggr;
362 356
357 /* If we don't pad, we end up with alignment on a 4 byte boundary.
358 * This causes lots of runtime warnings on systems requiring 8 byte
359 * alignment */
360 u32 pids[2] = { pid, 0 };
361 int pid_size = ALIGN(sizeof(pid), sizeof(long));
362
363 aggr = (type == TASKSTATS_TYPE_PID) 363 aggr = (type == TASKSTATS_TYPE_PID)
364 ? TASKSTATS_TYPE_AGGR_PID 364 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 365 : TASKSTATS_TYPE_AGGR_TGID;
@@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
367 na = nla_nest_start(skb, aggr); 367 na = nla_nest_start(skb, aggr);
368 if (!na) 368 if (!na)
369 goto err; 369 goto err;
370 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 370 if (nla_put(skb, type, pid_size, pids) < 0)
371 goto err; 371 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373 if (!ret) 373 if (!ret)
@@ -424,39 +424,46 @@ err:
424 return rc; 424 return rc;
425} 425}
426 426
427static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 427static int cmd_attr_register_cpumask(struct genl_info *info)
428{ 428{
429 int rc;
430 struct sk_buff *rep_skb;
431 struct taskstats *stats;
432 size_t size;
433 cpumask_var_t mask; 429 cpumask_var_t mask;
430 int rc;
434 431
435 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 432 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
436 return -ENOMEM; 433 return -ENOMEM;
437
438 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 434 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
439 if (rc < 0) 435 if (rc < 0)
440 goto free_return_rc; 436 goto out;
441 if (rc == 0) { 437 rc = add_del_listener(info->snd_pid, mask, REGISTER);
442 rc = add_del_listener(info->snd_pid, mask, REGISTER); 438out:
443 goto free_return_rc; 439 free_cpumask_var(mask);
444 } 440 return rc;
441}
442
443static int cmd_attr_deregister_cpumask(struct genl_info *info)
444{
445 cpumask_var_t mask;
446 int rc;
445 447
448 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
449 return -ENOMEM;
446 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 450 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
447 if (rc < 0) 451 if (rc < 0)
448 goto free_return_rc; 452 goto out;
449 if (rc == 0) { 453 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
450 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 454out:
451free_return_rc:
452 free_cpumask_var(mask);
453 return rc;
454 }
455 free_cpumask_var(mask); 455 free_cpumask_var(mask);
456 return rc;
457}
458
459static int cmd_attr_pid(struct genl_info *info)
460{
461 struct taskstats *stats;
462 struct sk_buff *rep_skb;
463 size_t size;
464 u32 pid;
465 int rc;
456 466
457 /*
458 * Size includes space for nested attributes
459 */
460 size = nla_total_size(sizeof(u32)) + 467 size = nla_total_size(sizeof(u32)) +
461 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
462 469
@@ -465,33 +472,64 @@ free_return_rc:
465 return rc; 472 return rc;
466 473
467 rc = -EINVAL; 474 rc = -EINVAL;
468 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 475 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
469 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 476 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
470 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 477 if (!stats)
471 if (!stats) 478 goto err;
472 goto err; 479
473 480 rc = fill_stats_for_pid(pid, stats);
474 rc = fill_pid(pid, NULL, stats); 481 if (rc < 0)
475 if (rc < 0) 482 goto err;
476 goto err; 483 return send_reply(rep_skb, info);
477 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 484err:
478 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 485 nlmsg_free(rep_skb);
479 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 486 return rc;
480 if (!stats) 487}
481 goto err; 488
482 489static int cmd_attr_tgid(struct genl_info *info)
483 rc = fill_tgid(tgid, NULL, stats); 490{
484 if (rc < 0) 491 struct taskstats *stats;
485 goto err; 492 struct sk_buff *rep_skb;
486 } else 493 size_t size;
494 u32 tgid;
495 int rc;
496
497 size = nla_total_size(sizeof(u32)) +
498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
499
500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
501 if (rc < 0)
502 return rc;
503
504 rc = -EINVAL;
505 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
506 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
507 if (!stats)
487 goto err; 508 goto err;
488 509
510 rc = fill_stats_for_tgid(tgid, stats);
511 if (rc < 0)
512 goto err;
489 return send_reply(rep_skb, info); 513 return send_reply(rep_skb, info);
490err: 514err:
491 nlmsg_free(rep_skb); 515 nlmsg_free(rep_skb);
492 return rc; 516 return rc;
493} 517}
494 518
519static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
520{
521 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
522 return cmd_attr_register_cpumask(info);
523 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
524 return cmd_attr_deregister_cpumask(info);
525 else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
526 return cmd_attr_pid(info);
527 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
528 return cmd_attr_tgid(info);
529 else
530 return -EINVAL;
531}
532
495static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 533static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
496{ 534{
497 struct signal_struct *sig = tsk->signal; 535 struct signal_struct *sig = tsk->signal;
@@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
555 if (!stats) 593 if (!stats)
556 goto err; 594 goto err;
557 595
558 rc = fill_pid(-1, tsk, stats); 596 fill_stats(tsk, stats);
559 if (rc < 0)
560 goto err;
561 597
562 /* 598 /*
563 * Doesn't matter if tsk is the leader or the last group member leaving 599 * Doesn't matter if tsk is the leader or the last group member leaving
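
The mk_reply() change above stops nla_put()ing a bare 4-byte pid and instead pads the attribute to a long-sized boundary, so the struct taskstats attribute that follows lands 8-byte aligned on 64-bit machines. A small userspace sketch of the same rounding, with a hand-rolled equivalent of the kernel's ALIGN() macro and an arbitrary pid, just to show where the extra 4 bytes come from:

    #include <stdio.h>
    #include <stdint.h>

    /* same rounding the kernel's ALIGN() macro performs (power-of-2 alignment) */
    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((size_t)(a) - 1))

    int main(void)
    {
            uint32_t pid = 4242;                    /* illustrative pid */
            uint32_t pids[2] = { pid, 0 };          /* second word is padding */
            size_t pid_size = ALIGN_UP(sizeof(pid), sizeof(long));

            /* on a 64-bit box this prints 8, so the stats attribute that
             * follows starts on an 8-byte boundary instead of a 4-byte one */
            printf("attribute payload length = %zu bytes (buffer holds %zu)\n",
                   pid_size, sizeof(pids));
            return 0;
    }
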
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 4f104515a19b..f8b11a283171 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
115 int ret; 115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2}; 116 struct kprobe *kps[2] = {&kp, &kp2};
117 117
 118 kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 118 /* addr and flags should be cleared for reusing kprobe. */
119 kp.addr = NULL;
120 kp.flags = 0;
119 ret = register_kprobes(kps, 2); 121 ret = register_kprobes(kps, 2);
120 if (ret < 0) { 122 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: " 123 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
210 int ret; 212 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2}; 213 struct jprobe *jps[2] = {&jp, &jp2};
212 214
 213 jp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 215 /* addr and flags should be cleared for reusing kprobe. */
216 jp.kp.addr = NULL;
217 jp.kp.flags = 0;
214 ret = register_jprobes(jps, 2); 218 ret = register_jprobes(jps, 2);
215 if (ret < 0) { 219 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: " 220 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
323 int ret; 327 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2}; 328 struct kretprobe *rps[2] = {&rp, &rp2};
325 329
 326 rp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 330 /* addr and flags should be cleared for reusing kprobe. */
331 rp.kp.addr = NULL;
332 rp.kp.flags = 0;
327 ret = register_kretprobes(rps, 2); 333 ret = register_kretprobes(rps, 2);
328 if (ret < 0) { 334 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: " 335 printk(KERN_ERR "Kprobe smoke test failed: "
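
The smoke-test hunks above now clear both kp.addr and kp.flags before a kprobe object is registered a second time. Any code that recycles a static struct kprobe has to do the same, since registration resolves symbol_name into addr and unregistration leaves internal flags behind. A minimal sketch, with an illustrative probe target and error handling trimmed to the essentials:

    #include <linux/module.h>
    #include <linux/kprobes.h>

    static int example_pre(struct kprobe *p, struct pt_regs *regs)
    {
            return 0;       /* let the probed instruction run normally */
    }

    static struct kprobe example_kp = {
            .symbol_name = "do_fork",       /* illustrative target */
            .pre_handler = example_pre,
    };

    static int __init example_kp_init(void)
    {
            int ret = register_kprobe(&example_kp);

            if (ret)
                    return ret;
            unregister_kprobe(&example_kp);

            /* reuse the same struct: clear what the first round left behind */
            example_kp.addr = NULL;
            example_kp.flags = 0;
            return register_kprobe(&example_kp);
    }

    static void __exit example_kp_exit(void)
    {
            unregister_kprobe(&example_kp);
    }

    module_init(example_kp_init);
    module_exit(example_kp_exit);
    MODULE_LICENSE("GPL");
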
diff --git a/kernel/time.c b/kernel/time.c
index 848b1c2ab09a..ba9b338d1835 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
300} 300}
301EXPORT_SYMBOL(timespec_trunc); 301EXPORT_SYMBOL(timespec_trunc);
302 302
303#ifndef CONFIG_GENERIC_TIME
304/*
305 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
306 * and therefore only yields usec accuracy
307 */
308void getnstimeofday(struct timespec *tv)
309{
310 struct timeval x;
311
312 do_gettimeofday(&x);
313 tv->tv_sec = x.tv_sec;
314 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
315}
316EXPORT_SYMBOL_GPL(getnstimeofday);
317#endif
318
319/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 303/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
320 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 304 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
321 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. 305 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 95ed42951e0a..f06a8a365648 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -6,7 +6,7 @@ config TICK_ONESHOT
6 6
7config NO_HZ 7config NO_HZ
8 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
9 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 9 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
10 select TICK_ONESHOT 10 select TICK_ONESHOT
11 help 11 help
12 This option enables a tickless system: timer interrupts will 12 This option enables a tickless system: timer interrupts will
@@ -15,7 +15,7 @@ config NO_HZ
15 15
16config HIGH_RES_TIMERS 16config HIGH_RES_TIMERS
17 bool "High Resolution Timer Support" 17 bool "High Resolution Timer Support"
18 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 18 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
19 select TICK_ONESHOT 19 select TICK_ONESHOT
20 help 20 help
21 This option enables high resolution timer support. If your 21 This option enables high resolution timer support. If your
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f08e99c1d561..c18d7efa1b4b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
531 return max_nsecs - (max_nsecs >> 5); 531 return max_nsecs - (max_nsecs >> 5);
532} 532}
533 533
534#ifdef CONFIG_GENERIC_TIME 534#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
535 535
536/** 536/**
537 * clocksource_select - Select the best clocksource available 537 * clocksource_select - Select the best clocksource available
@@ -577,7 +577,7 @@ static void clocksource_select(void)
577 } 577 }
578} 578}
579 579
580#else /* CONFIG_GENERIC_TIME */ 580#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
581 581
582static inline void clocksource_select(void) { } 582static inline void clocksource_select(void) { }
583 583
@@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs)
639#define MAX_UPDATE_LENGTH 5 /* Seconds */ 639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640 640
641/** 641/**
 642 * __clocksource_register_scale - Used to install new clocksources 642 * __clocksource_updatefreq_scale - Used to update clocksource with new freq
643 * @t: clocksource to be registered 643 * @t: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz 644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale 645 * @freq: clocksource frequency (cycles per second) divided by scale
646 * 646 *
647 * Returns -EBUSY if registration fails, zero otherwise. 647 * This should only be called from the clocksource->enable() method.
648 * 648 *
649 * This *SHOULD NOT* be called directly! Please use the 649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz helper functions. 650 * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
651 */ 651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 652void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653{ 653{
654
655 /* 654 /*
656 * Ideally we want to use some of the limits used in 655 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed 656 * clocksource_max_deferment, to provide a more informed
@@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
662 NSEC_PER_SEC/scale, 661 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale); 662 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 663 cs->max_idle_ns = clocksource_max_deferment(cs);
664}
665EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
666
667/**
668 * __clocksource_register_scale - Used to install new clocksources
669 * @t: clocksource to be registered
670 * @scale: Scale factor multiplied against freq to get clocksource hz
671 * @freq: clocksource frequency (cycles per second) divided by scale
672 *
673 * Returns -EBUSY if registration fails, zero otherwise.
674 *
675 * This *SHOULD NOT* be called directly! Please use the
676 * clocksource_register_hz() or clocksource_register_khz helper functions.
677 */
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{
680
 681 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq);
665 683
 684 /* Add clocksource to the clocksource list */
666 mutex_lock(&clocksource_mutex); 685 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs); 686 clocksource_enqueue(cs);
668 clocksource_select(); 687 clocksource_select();
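
The clocksource.c hunk splits the mult/shift setup out of registration: __clocksource_updatefreq_scale() (normally reached through the clocksource_updatefreq_hz()/clocksource_updatefreq_khz() wrappers it mentions) recomputes the conversion factors from a new frequency, while clocksource_register_hz() still performs the initial enqueue and selection. A hedged driver-side sketch of how the two might be used together; the MMIO counter, its mapping, and the 32 MHz rate are made up:

    #include <linux/clocksource.h>
    #include <linux/io.h>

    static void __iomem *example_counter;   /* mapped elsewhere; hypothetical */

    static cycle_t example_read(struct clocksource *cs)
    {
            return (cycle_t)readl(example_counter);
    }

    static int example_enable(struct clocksource *cs)
    {
            /* the rate may have changed while the device was off: recompute
             * mult/shift/max_idle_ns without re-registering the clocksource */
            clocksource_updatefreq_hz(cs, 32000000);        /* 32 MHz, illustrative */
            return 0;
    }

    static struct clocksource example_cs = {
            .name   = "example-timer",
            .rating = 200,
            .read   = example_read,
            .mask   = CLOCKSOURCE_MASK(32),
            .enable = example_enable,
            .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
    };

    static int __init example_cs_init(void)
    {
            /* initial registration computes mult/shift from the same rate */
            return clocksource_register_hz(&example_cs, 32000000);
    }
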
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c63116863a80..d2321891538f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -149,10 +149,18 @@ static void ntp_update_offset(long offset)
149 time_reftime = get_seconds(); 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = ntp_update_offset_fll(offset64, secs);
153 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
154 153
155 freq_adj += ntp_update_offset_fll(offset64, secs); 154 /*
155 * Clamp update interval to reduce PLL gain with low
156 * sampling rate (e.g. intermittent network connection)
157 * to avoid instability.
158 */
159 if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
160 secs = 1 << (SHIFT_PLL + 1 + time_constant);
161
162 freq_adj += (offset64 * secs) <<
163 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
156 164
157 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); 165 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
158 166
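
Rewritten this way, ntp_update_offset() computes the FLL term first, clamps the sample interval, and only then adds the PLL term. Reading the shifts back into ordinary arithmetic (a sketch using the kernel's own symbol names, with the usual SHIFT_PLL = 4 and NTP_SCALE_SHIFT = 32), the PLL contribution and the new clamp amount to roughly:

    \Delta f_{\mathrm{PLL}} = \mathrm{offset}\cdot\mathrm{secs}\cdot
        2^{\,\mathrm{NTP\_SCALE\_SHIFT} - 2(\mathrm{SHIFT\_PLL} + 2 + \mathrm{time\_constant})},
    \qquad
    \mathrm{secs} \le 2^{\,\mathrm{SHIFT\_PLL} + 1 + \mathrm{time\_constant}}

so an unusually long gap between samples (an intermittent network link, say) can no longer inflate the PLL gain and destabilize the loop.
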
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b3bafd5fc66d..48b2761b5668 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
188 /* 188 /*
189 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
190 * periodic mode. We read dev->next_event first and add to it 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event() 191 * when the event already expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really 192 * sets dev->next_event only when the event is really
193 * programmed to the device. 193 * programmed to the device.
194 */ 194 */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..3e216e01bbd1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
325 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
326 326
327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { 328 arch_needs_cpu(cpu)) {
329 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
330 delta_jiffies = 1; 330 delta_jiffies = 1;
331 } else { 331 } else {
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
405 * the scheduler tick in nohz_restart_sched_tick. 405 * the scheduler tick in nohz_restart_sched_tick.
406 */ 406 */
407 if (!ts->tick_stopped) { 407 if (!ts->tick_stopped) {
408 if (select_nohz_load_balancer(1)) { 408 select_nohz_load_balancer(1);
409 /*
410 * sched tick not stopped!
411 */
412 cpumask_clear_cpu(cpu, nohz_cpu_mask);
413 goto out;
414 }
415 409
416 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
417 ts->tick_stopped = 1; 411 ts->tick_stopped = 1;
@@ -780,7 +774,6 @@ void tick_setup_sched_timer(void)
780{ 774{
781 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 775 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
782 ktime_t now = ktime_get(); 776 ktime_t now = ktime_get();
783 u64 offset;
784 777
785 /* 778 /*
786 * Emulate tick processing via per-CPU hrtimers: 779 * Emulate tick processing via per-CPU hrtimers:
@@ -790,10 +783,6 @@ void tick_setup_sched_timer(void)
790 783
791 /* Get the next period (per cpu) */ 784 /* Get the next period (per cpu) */
792 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 785 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
793 offset = ktime_to_ns(tick_period) >> 1;
794 do_div(offset, num_possible_cpus());
795 offset *= smp_processor_id();
796 hrtimer_add_expires_ns(&ts->sched_timer, offset);
797 786
798 for (;;) { 787 for (;;) {
799 hrtimer_forward(&ts->sched_timer, now, tick_period); 788 hrtimer_forward(&ts->sched_timer, now, tick_period);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index caf8d4d4f5c8..49010d822f72 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
153 * - wall_to_monotonic is no longer the boot time, getboottime must be 153 * - wall_to_monotonic is no longer the boot time, getboottime must be
154 * used instead. 154 * used instead.
155 */ 155 */
156struct timespec xtime __attribute__ ((aligned (16))); 156static struct timespec xtime __attribute__ ((aligned (16)));
157struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 157static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
158static struct timespec total_sleep_time; 158static struct timespec total_sleep_time;
159 159
160/* 160/*
@@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond)
170{ 170{
171 xtime.tv_sec += leapsecond; 171 xtime.tv_sec += leapsecond;
172 wall_to_monotonic.tv_sec -= leapsecond; 172 wall_to_monotonic.tv_sec -= leapsecond;
173 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 173 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
174 timekeeper.mult);
174} 175}
175 176
176#ifdef CONFIG_GENERIC_TIME
177
178/** 177/**
179 * timekeeping_forward_now - update clock to the current time 178 * timekeeping_forward_now - update clock to the current time
180 * 179 *
@@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv)
328 timekeeper.ntp_error = 0; 327 timekeeper.ntp_error = 0;
329 ntp_clear(); 328 ntp_clear();
330 329
331 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 330 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
331 timekeeper.mult);
332 332
333 write_sequnlock_irqrestore(&xtime_lock, flags); 333 write_sequnlock_irqrestore(&xtime_lock, flags);
334 334
@@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock)
376 tick_clock_notify(); 376 tick_clock_notify();
377} 377}
378 378
379#else /* GENERIC_TIME */
380
381static inline void timekeeping_forward_now(void) { }
382
383/**
384 * ktime_get - get the monotonic time in ktime_t format
385 *
386 * returns the time in ktime_t format
387 */
388ktime_t ktime_get(void)
389{
390 struct timespec now;
391
392 ktime_get_ts(&now);
393
394 return timespec_to_ktime(now);
395}
396EXPORT_SYMBOL_GPL(ktime_get);
397
398/**
399 * ktime_get_ts - get the monotonic clock in timespec format
400 * @ts: pointer to timespec variable
401 *
402 * The function calculates the monotonic clock from the realtime
403 * clock and the wall_to_monotonic offset and stores the result
404 * in normalized timespec format in the variable pointed to by @ts.
405 */
406void ktime_get_ts(struct timespec *ts)
407{
408 struct timespec tomono;
409 unsigned long seq;
410
411 do {
412 seq = read_seqbegin(&xtime_lock);
413 getnstimeofday(ts);
414 tomono = wall_to_monotonic;
415
416 } while (read_seqretry(&xtime_lock, seq));
417
418 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
419 ts->tv_nsec + tomono.tv_nsec);
420}
421EXPORT_SYMBOL_GPL(ktime_get_ts);
422
423#endif /* !GENERIC_TIME */
424
425/** 379/**
426 * ktime_get_real - get the real (wall-) time in ktime_t format 380 * ktime_get_real - get the real (wall-) time in ktime_t format
427 * 381 *
@@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev)
579 533
580 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 534 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
581 ts = timespec_sub(ts, timekeeping_suspend_time); 535 ts = timespec_sub(ts, timekeeping_suspend_time);
582 xtime = timespec_add_safe(xtime, ts); 536 xtime = timespec_add(xtime, ts);
583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 537 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
584 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 538 total_sleep_time = timespec_add(total_sleep_time, ts);
585 } 539 }
586 /* re-base the last cycle value */ 540 /* re-base the last cycle value */
587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 541 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -736,6 +690,7 @@ static void timekeeping_adjust(s64 offset)
736static cycle_t logarithmic_accumulation(cycle_t offset, int shift) 690static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
737{ 691{
738 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 692 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
693 u64 raw_nsecs;
739 694
740 /* If the offset is smaller then a shifted interval, do nothing */ 695 /* If the offset is smaller then a shifted interval, do nothing */
741 if (offset < timekeeper.cycle_interval<<shift) 696 if (offset < timekeeper.cycle_interval<<shift)
@@ -752,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
752 second_overflow(); 707 second_overflow();
753 } 708 }
754 709
755 /* Accumulate into raw time */ 710 /* Accumulate raw time */
756 raw_time.tv_nsec += timekeeper.raw_interval << shift;; 711 raw_nsecs = timekeeper.raw_interval << shift;
757 while (raw_time.tv_nsec >= NSEC_PER_SEC) { 712 raw_nsecs += raw_time.tv_nsec;
758 raw_time.tv_nsec -= NSEC_PER_SEC; 713 if (raw_nsecs >= NSEC_PER_SEC) {
759 raw_time.tv_sec++; 714 u64 raw_secs = raw_nsecs;
715 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
716 raw_time.tv_sec += raw_secs;
760 } 717 }
718 raw_time.tv_nsec = raw_nsecs;
761 719
762 /* Accumulate error between NTP and clock interval */ 720 /* Accumulate error between NTP and clock interval */
763 timekeeper.ntp_error += tick_length << shift; 721 timekeeper.ntp_error += tick_length << shift;
@@ -784,10 +742,11 @@ void update_wall_time(void)
784 return; 742 return;
785 743
786 clock = timekeeper.clock; 744 clock = timekeeper.clock;
787#ifdef CONFIG_GENERIC_TIME 745
788 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 746#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
789#else
790 offset = timekeeper.cycle_interval; 747 offset = timekeeper.cycle_interval;
748#else
749 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
791#endif 750#endif
792 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 751 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
793 752
@@ -856,7 +815,8 @@ void update_wall_time(void)
856 } 815 }
857 816
858 /* check to see if there is a new clocksource to use */ 817 /* check to see if there is a new clocksource to use */
859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 818 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
819 timekeeper.mult);
860} 820}
861 821
862/** 822/**
@@ -887,7 +847,7 @@ EXPORT_SYMBOL_GPL(getboottime);
887 */ 847 */
888void monotonic_to_bootbased(struct timespec *ts) 848void monotonic_to_bootbased(struct timespec *ts)
889{ 849{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 850 *ts = timespec_add(*ts, total_sleep_time);
891} 851}
892EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 852EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
893 853
@@ -902,6 +862,11 @@ struct timespec __current_kernel_time(void)
902 return xtime; 862 return xtime;
903} 863}
904 864
865struct timespec __get_wall_to_monotonic(void)
866{
867 return wall_to_monotonic;
868}
869
905struct timespec current_kernel_time(void) 870struct timespec current_kernel_time(void)
906{ 871{
907 struct timespec now; 872 struct timespec now;
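
logarithmic_accumulation() now gathers the shifted raw interval into a 64-bit nanosecond count and folds whole seconds out of it with do_div(), which divides a u64 in place and hands back the 32-bit remainder, instead of looping one second at a time. The same idiom, pulled out into a standalone helper as a sketch (the function name is invented):

    #include <linux/types.h>
    #include <linux/time.h>
    #include <asm/div64.h>

    /* split a 64-bit nanosecond count into whole seconds plus leftover
     * nanoseconds, the way logarithmic_accumulation() now does for raw_time */
    static void example_split_ns(u64 nsecs, u64 *secs, u32 *rem_ns)
    {
            u64 tmp = nsecs;

            /* do_div() turns tmp into the quotient and returns the remainder,
             * so the 64-by-32 division stays cheap on 32-bit machines */
            *rem_ns = do_div(tmp, NSEC_PER_SEC);
            *secs = tmp;
    }
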
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8d4e18..68a9ae7679b7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43 43
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/* 91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of 92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB for 93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * the new flag to indicate whether the timer is deferrable 94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
95 */ 100 */
96#define TBASE_DEFERRABLE_FLAG (0x1) 101#define TBASE_DEFERRABLE_FLAG (0x1)
97 102
@@ -321,6 +326,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
321 326
322/** 327/**
323 * set_timer_slack - set the allowed slack for a timer 328 * set_timer_slack - set the allowed slack for a timer
329 * @timer: the timer to be modified
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding 330 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 * 331 *
326 * Set the amount of time, in jiffies, that a certain timer has 332 * Set the amount of time, in jiffies, that a certain timer has
@@ -577,6 +583,19 @@ static void __init_timer(struct timer_list *timer,
577 lockdep_init_map(&timer->lockdep_map, name, key, 0); 583 lockdep_init_map(&timer->lockdep_map, name, key, 0);
578} 584}
579 585
586void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
587 const char *name,
588 struct lock_class_key *key,
589 void (*function)(unsigned long),
590 unsigned long data)
591{
592 timer->function = function;
593 timer->data = data;
594 init_timer_on_stack_key(timer, name, key);
595 timer_set_deferrable(timer);
596}
597EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
598
580/** 599/**
581 * init_timer_key - initialize a timer 600 * init_timer_key - initialize a timer
582 * @timer: the timer to be initialized 601 * @timer: the timer to be initialized
@@ -679,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
679 cpu = smp_processor_id(); 698 cpu = smp_processor_id();
680 699
681#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 700#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
682 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { 701 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
683 int preferred_cpu = get_nohz_load_balancer(); 702 cpu = get_nohz_timer_target();
684
685 if (preferred_cpu >= 0)
686 cpu = preferred_cpu;
687 }
688#endif 703#endif
689 new_base = per_cpu(tvec_bases, cpu); 704 new_base = per_cpu(tvec_bases, cpu);
690 705
@@ -1264,7 +1279,10 @@ void update_process_times(int user_tick)
1264 run_local_timers(); 1279 run_local_timers();
1265 rcu_check_callbacks(cpu, user_tick); 1280 rcu_check_callbacks(cpu, user_tick);
1266 printk_tick(); 1281 printk_tick();
1267 perf_event_do_pending(); 1282#ifdef CONFIG_IRQ_WORK
1283 if (in_irq())
1284 irq_work_run();
1285#endif
1268 scheduler_tick(); 1286 scheduler_tick();
1269 run_posix_cpu_timers(p); 1287 run_posix_cpu_timers(p);
1270} 1288}
@@ -1289,7 +1307,6 @@ void run_local_timers(void)
1289{ 1307{
1290 hrtimer_run_queues(); 1308 hrtimer_run_queues();
1291 raise_softirq(TIMER_SOFTIRQ); 1309 raise_softirq(TIMER_SOFTIRQ);
1292 softlockup_tick();
1293} 1310}
1294 1311
1295/* 1312/*
@@ -1750,3 +1767,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
1750} 1767}
1751 1768
1752EXPORT_SYMBOL(msleep_interruptible); 1769EXPORT_SYMBOL(msleep_interruptible);
1770
1771static int __sched do_usleep_range(unsigned long min, unsigned long max)
1772{
1773 ktime_t kmin;
1774 unsigned long delta;
1775
1776 kmin = ktime_set(0, min * NSEC_PER_USEC);
1777 delta = (max - min) * NSEC_PER_USEC;
1778 return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
1779}
1780
1781/**
 1782 * usleep_range - Drop-in replacement for udelay where wakeup is flexible
1783 * @min: Minimum time in usecs to sleep
1784 * @max: Maximum time in usecs to sleep
1785 */
1786void usleep_range(unsigned long min, unsigned long max)
1787{
1788 __set_current_state(TASK_UNINTERRUPTIBLE);
1789 do_usleep_range(min, max);
1790}
1791EXPORT_SYMBOL(usleep_range);
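
The new usleep_range() gives the timer code a window instead of a point deadline, so its hrtimer expiry can be coalesced with neighbouring wakeups; it is meant for non-atomic contexts where udelay() would just spin. A hedged sketch of a typical register-poll user built on this API; the status register, ready bit, retry count and 100-200 microsecond window are all illustrative:

    #include <linux/delay.h>
    #include <linux/errno.h>
    #include <linux/io.h>

    #define EXAMPLE_STATUS_READY    0x1     /* hypothetical status bit */

    static int example_wait_ready(void __iomem *status_reg)
    {
            int tries;

            for (tries = 0; tries < 50; tries++) {
                    if (readl(status_reg) & EXAMPLE_STATUS_READY)
                            return 0;
                    /* sleeps at least 100us, may be stretched to 200us so the
                     * wakeup can share an hrtimer with other sleepers */
                    usleep_range(100, 200);
            }
            return -ETIMEDOUT;
    }
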
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b1797c4545b..e04b8bcdef88 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_C_RECORDMCOUNT
53 bool
54 help
55 C version of recordmcount available?
56
52config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
53 bool 58 bool
54 59
@@ -121,7 +126,7 @@ if FTRACE
121config FUNCTION_TRACER 126config FUNCTION_TRACER
122 bool "Kernel Function Tracer" 127 bool "Kernel Function Tracer"
123 depends on HAVE_FUNCTION_TRACER 128 depends on HAVE_FUNCTION_TRACER
124 select FRAME_POINTER 129 select FRAME_POINTER if (!ARM_UNWIND)
125 select KALLSYMS 130 select KALLSYMS
126 select GENERIC_TRACER 131 select GENERIC_TRACER
127 select CONTEXT_SWITCH_TRACER 132 select CONTEXT_SWITCH_TRACER
@@ -153,7 +158,7 @@ config IRQSOFF_TRACER
153 bool "Interrupts-off Latency Tracer" 158 bool "Interrupts-off Latency Tracer"
154 default n 159 default n
155 depends on TRACE_IRQFLAGS_SUPPORT 160 depends on TRACE_IRQFLAGS_SUPPORT
156 depends on GENERIC_TIME 161 depends on !ARCH_USES_GETTIMEOFFSET
157 select TRACE_IRQFLAGS 162 select TRACE_IRQFLAGS
158 select GENERIC_TRACER 163 select GENERIC_TRACER
159 select TRACER_MAX_TRACE 164 select TRACER_MAX_TRACE
@@ -175,7 +180,7 @@ config IRQSOFF_TRACER
175config PREEMPT_TRACER 180config PREEMPT_TRACER
176 bool "Preemption-off Latency Tracer" 181 bool "Preemption-off Latency Tracer"
177 default n 182 default n
178 depends on GENERIC_TIME 183 depends on !ARCH_USES_GETTIMEOFFSET
179 depends on PREEMPT 184 depends on PREEMPT
180 select GENERIC_TRACER 185 select GENERIC_TRACER
181 select TRACER_MAX_TRACE 186 select TRACER_MAX_TRACE
@@ -194,15 +199,6 @@ config PREEMPT_TRACER
194 enabled. This option and the irqs-off timing option can be 199 enabled. This option and the irqs-off timing option can be
195 used together or separately.) 200 used together or separately.)
196 201
197config SYSPROF_TRACER
198 bool "Sysprof Tracer"
199 depends on X86
200 select GENERIC_TRACER
201 select CONTEXT_SWITCH_TRACER
202 help
203 This tracer provides the trace needed by the 'Sysprof' userspace
204 tool.
205
206config SCHED_TRACER 202config SCHED_TRACER
207 bool "Scheduling Latency Tracer" 203 bool "Scheduling Latency Tracer"
208 select GENERIC_TRACER 204 select GENERIC_TRACER
@@ -229,23 +225,6 @@ config FTRACE_SYSCALLS
229 help 225 help
230 Basic tracer to catch the syscall entry and exit events. 226 Basic tracer to catch the syscall entry and exit events.
231 227
232config BOOT_TRACER
233 bool "Trace boot initcalls"
234 select GENERIC_TRACER
235 select CONTEXT_SWITCH_TRACER
236 help
237 This tracer helps developers to optimize boot times: it records
238 the timings of the initcalls and traces key events and the identity
239 of tasks that can cause boot delays, such as context-switches.
240
241 Its aim is to be parsed by the scripts/bootgraph.pl tool to
242 produce pretty graphics about boot inefficiencies, giving a visual
243 representation of the delays during initcalls - but the raw
244 /debug/tracing/trace text output is readable too.
245
246 You must pass in initcall_debug and ftrace=initcall to the kernel
247 command line to enable this on bootup.
248
249config TRACE_BRANCH_PROFILING 228config TRACE_BRANCH_PROFILING
250 bool 229 bool
251 select GENERIC_TRACER 230 select GENERIC_TRACER
@@ -325,28 +304,6 @@ config BRANCH_TRACER
325 304
326 Say N if unsure. 305 Say N if unsure.
327 306
328config KSYM_TRACER
329 bool "Trace read and write access on kernel memory locations"
330 depends on HAVE_HW_BREAKPOINT
331 select TRACING
332 help
333 This tracer helps find read and write operations on any given kernel
334 symbol i.e. /proc/kallsyms.
335
336config PROFILE_KSYM_TRACER
337 bool "Profile all kernel memory accesses on 'watched' variables"
338 depends on KSYM_TRACER
339 help
340 This tracer profiles kernel accesses on variables watched through the
341 ksym tracer ftrace plugin. Depending upon the hardware, all read
342 and write operations on kernel variables can be monitored for
343 accesses.
344
345 The results will be displayed in:
346 /debugfs/tracing/profile_ksym
347
348 Say N if unsure.
349
350config STACK_TRACER 307config STACK_TRACER
351 bool "Trace max stack" 308 bool "Trace max stack"
352 depends on HAVE_FUNCTION_TRACER 309 depends on HAVE_FUNCTION_TRACER
@@ -371,37 +328,6 @@ config STACK_TRACER
371 328
372 Say N if unsure. 329 Say N if unsure.
373 330
374config KMEMTRACE
375 bool "Trace SLAB allocations"
376 select GENERIC_TRACER
377 help
378 kmemtrace provides tracing for slab allocator functions, such as
379 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
380 data is then fed to the userspace application in order to analyse
381 allocation hotspots, internal fragmentation and so on, making it
382 possible to see how well an allocator performs, as well as debug
383 and profile kernel code.
384
385 This requires an userspace application to use. See
386 Documentation/trace/kmemtrace.txt for more information.
387
388 Saying Y will make the kernel somewhat larger and slower. However,
389 if you disable kmemtrace at run-time or boot-time, the performance
390 impact is minimal (depending on the arch the kernel is built for).
391
392 If unsure, say N.
393
394config WORKQUEUE_TRACER
395 bool "Trace workqueues"
396 select GENERIC_TRACER
397 help
398 The workqueue tracer provides some statistical information
399 about each cpu workqueue thread such as the number of the
400 works inserted and executed since their creation. It can help
401 to evaluate the amount of work each of them has to perform.
402 For example it can help a developer to decide whether he should
403 choose a per-cpu workqueue instead of a singlethreaded one.
404
405config BLK_DEV_IO_TRACE 331config BLK_DEV_IO_TRACE
406 bool "Support for tracing block IO actions" 332 bool "Support for tracing block IO actions"
407 depends on SYSFS 333 depends on SYSFS
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ffb1a5b0550e..53f338190b26 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o
30obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
31obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
33obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
34obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o 33obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
35obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 34obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
36obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 35obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
@@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
38obj-$(CONFIG_NOP_TRACER) += trace_nop.o 37obj-$(CONFIG_NOP_TRACER) += trace_nop.o
39obj-$(CONFIG_STACK_TRACER) += trace_stack.o 38obj-$(CONFIG_STACK_TRACER) += trace_stack.o
40obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 42obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 43obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
47ifeq ($(CONFIG_BLOCK),y) 44ifeq ($(CONFIG_BLOCK),y)
@@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
55endif 52endif
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
58obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
59obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_EVENT_TRACING) += power-traces.o
56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif
60 59
61libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 638711c17504..bc251ed66724 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/smp_lock.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/uaccess.h> 27#include <linux/uaccess.h>
29 28
@@ -169,9 +168,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
170 BLK_TC_ACT(BLK_TC_WRITE) }; 169 BLK_TC_ACT(BLK_TC_WRITE) };
171 170
171#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
172#define BLK_TC_RAHEAD BLK_TC_AHEAD
173
172/* The ilog2() calls fall out because they're constant */ 174/* The ilog2() calls fall out because they're constant */
173#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ 175#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
174 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) 176 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
175 177
176/* 178/*
177 * The worker for the various blk_add_trace*() types. Fills out a 179 * The worker for the various blk_add_trace*() types. Fills out a
@@ -194,9 +196,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
194 return; 196 return;
195 197
196 what |= ddir_act[rw & WRITE]; 198 what |= ddir_act[rw & WRITE];
197 what |= MASK_TC_BIT(rw, BARRIER); 199 what |= MASK_TC_BIT(rw, HARDBARRIER);
198 what |= MASK_TC_BIT(rw, SYNCIO); 200 what |= MASK_TC_BIT(rw, SYNC);
199 what |= MASK_TC_BIT(rw, AHEAD); 201 what |= MASK_TC_BIT(rw, RAHEAD);
200 what |= MASK_TC_BIT(rw, META); 202 what |= MASK_TC_BIT(rw, META);
201 what |= MASK_TC_BIT(rw, DISCARD); 203 what |= MASK_TC_BIT(rw, DISCARD);
202 204
@@ -323,6 +325,7 @@ static const struct file_operations blk_dropped_fops = {
323 .owner = THIS_MODULE, 325 .owner = THIS_MODULE,
324 .open = blk_dropped_open, 326 .open = blk_dropped_open,
325 .read = blk_dropped_read, 327 .read = blk_dropped_read,
328 .llseek = default_llseek,
326}; 329};
327 330
328static int blk_msg_open(struct inode *inode, struct file *filp) 331static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -362,6 +365,7 @@ static const struct file_operations blk_msg_fops = {
362 .owner = THIS_MODULE, 365 .owner = THIS_MODULE,
363 .open = blk_msg_open, 366 .open = blk_msg_open,
364 .write = blk_msg_write, 367 .write = blk_msg_write,
368 .llseek = noop_llseek,
365}; 369};
366 370
367/* 371/*
@@ -549,6 +553,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
549} 553}
550EXPORT_SYMBOL_GPL(blk_trace_setup); 554EXPORT_SYMBOL_GPL(blk_trace_setup);
551 555
556#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
557static int compat_blk_trace_setup(struct request_queue *q, char *name,
558 dev_t dev, struct block_device *bdev,
559 char __user *arg)
560{
561 struct blk_user_trace_setup buts;
562 struct compat_blk_user_trace_setup cbuts;
563 int ret;
564
565 if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
566 return -EFAULT;
567
568 buts = (struct blk_user_trace_setup) {
569 .act_mask = cbuts.act_mask,
570 .buf_size = cbuts.buf_size,
571 .buf_nr = cbuts.buf_nr,
572 .start_lba = cbuts.start_lba,
573 .end_lba = cbuts.end_lba,
574 .pid = cbuts.pid,
575 };
576 memcpy(&buts.name, &cbuts.name, 32);
577
578 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
579 if (ret)
580 return ret;
581
582 if (copy_to_user(arg, &buts.name, 32)) {
583 blk_trace_remove(q);
584 return -EFAULT;
585 }
586
587 return 0;
588}
589#endif
590
552int blk_trace_startstop(struct request_queue *q, int start) 591int blk_trace_startstop(struct request_queue *q, int start)
553{ 592{
554 int ret; 593 int ret;
@@ -608,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
608 bdevname(bdev, b); 647 bdevname(bdev, b);
609 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); 648 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
610 break; 649 break;
650#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
651 case BLKTRACESETUP32:
652 bdevname(bdev, b);
653 ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
654 break;
655#endif
611 case BLKTRACESTART: 656 case BLKTRACESTART:
612 start = 1; 657 start = 1;
613 case BLKTRACESTOP: 658 case BLKTRACESTOP:
@@ -661,10 +706,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
661 if (likely(!bt)) 706 if (likely(!bt))
662 return; 707 return;
663 708
664 if (blk_discard_rq(rq)) 709 if (rq->cmd_flags & REQ_DISCARD)
665 rw |= (1 << BIO_RW_DISCARD); 710 rw |= REQ_DISCARD;
711
712 if (rq->cmd_flags & REQ_SECURE)
713 rw |= REQ_SECURE;
666 714
667 if (blk_pc_request(rq)) { 715 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
668 what |= BLK_TC_ACT(BLK_TC_PC); 716 what |= BLK_TC_ACT(BLK_TC_PC);
669 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 717 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
670 what, rq->errors, rq->cmd_len, rq->cmd); 718 what, rq->errors, rq->cmd_len, rq->cmd);
@@ -925,7 +973,7 @@ void blk_add_driver_data(struct request_queue *q,
925 if (likely(!bt)) 973 if (likely(!bt))
926 return; 974 return;
927 975
928 if (blk_pc_request(rq)) 976 if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
929 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 977 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
930 BLK_TA_DRV_DATA, rq->errors, len, data); 978 BLK_TA_DRV_DATA, rq->errors, len, data);
931 else 979 else
@@ -1603,10 +1651,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1603 struct block_device *bdev; 1651 struct block_device *bdev;
1604 ssize_t ret = -ENXIO; 1652 ssize_t ret = -ENXIO;
1605 1653
1606 lock_kernel();
1607 bdev = bdget(part_devt(p)); 1654 bdev = bdget(part_devt(p));
1608 if (bdev == NULL) 1655 if (bdev == NULL)
1609 goto out_unlock_kernel; 1656 goto out;
1610 1657
1611 q = blk_trace_get_queue(bdev); 1658 q = blk_trace_get_queue(bdev);
1612 if (q == NULL) 1659 if (q == NULL)
@@ -1634,8 +1681,7 @@ out_unlock_bdev:
1634 mutex_unlock(&bdev->bd_mutex); 1681 mutex_unlock(&bdev->bd_mutex);
1635out_bdput: 1682out_bdput:
1636 bdput(bdev); 1683 bdput(bdev);
1637out_unlock_kernel: 1684out:
1638 unlock_kernel();
1639 return ret; 1685 return ret;
1640} 1686}
1641 1687
@@ -1665,11 +1711,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1665 1711
1666 ret = -ENXIO; 1712 ret = -ENXIO;
1667 1713
1668 lock_kernel();
1669 p = dev_to_part(dev); 1714 p = dev_to_part(dev);
1670 bdev = bdget(part_devt(p)); 1715 bdev = bdget(part_devt(p));
1671 if (bdev == NULL) 1716 if (bdev == NULL)
1672 goto out_unlock_kernel; 1717 goto out;
1673 1718
1674 q = blk_trace_get_queue(bdev); 1719 q = blk_trace_get_queue(bdev);
1675 if (q == NULL) 1720 if (q == NULL)
@@ -1704,8 +1749,6 @@ out_unlock_bdev:
1704 mutex_unlock(&bdev->bd_mutex); 1749 mutex_unlock(&bdev->bd_mutex);
1705out_bdput: 1750out_bdput:
1706 bdput(bdev); 1751 bdput(bdev);
1707out_unlock_kernel:
1708 unlock_kernel();
1709out: 1752out:
1710 return ret ? ret : count; 1753 return ret ? ret : count;
1711} 1754}
@@ -1730,7 +1773,7 @@ void blk_dump_cmd(char *buf, struct request *rq)
1730 int len = rq->cmd_len; 1773 int len = rq->cmd_len;
1731 unsigned char *cmd = rq->cmd; 1774 unsigned char *cmd = rq->cmd;
1732 1775
1733 if (!blk_pc_request(rq)) { 1776 if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
1734 buf[0] = '\0'; 1777 buf[0] = '\0';
1735 return; 1778 return;
1736 } 1779 }
@@ -1755,21 +1798,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1755 1798
1756 if (rw & WRITE) 1799 if (rw & WRITE)
1757 rwbs[i++] = 'W'; 1800 rwbs[i++] = 'W';
1758 else if (rw & 1 << BIO_RW_DISCARD) 1801 else if (rw & REQ_DISCARD)
1759 rwbs[i++] = 'D'; 1802 rwbs[i++] = 'D';
1760 else if (bytes) 1803 else if (bytes)
1761 rwbs[i++] = 'R'; 1804 rwbs[i++] = 'R';
1762 else 1805 else
1763 rwbs[i++] = 'N'; 1806 rwbs[i++] = 'N';
1764 1807
1765 if (rw & 1 << BIO_RW_AHEAD) 1808 if (rw & REQ_RAHEAD)
1766 rwbs[i++] = 'A'; 1809 rwbs[i++] = 'A';
1767 if (rw & 1 << BIO_RW_BARRIER) 1810 if (rw & REQ_HARDBARRIER)
1768 rwbs[i++] = 'B'; 1811 rwbs[i++] = 'B';
1769 if (rw & 1 << BIO_RW_SYNCIO) 1812 if (rw & REQ_SYNC)
1770 rwbs[i++] = 'S'; 1813 rwbs[i++] = 'S';
1771 if (rw & 1 << BIO_RW_META) 1814 if (rw & REQ_META)
1772 rwbs[i++] = 'M'; 1815 rwbs[i++] = 'M';
1816 if (rw & REQ_SECURE)
1817 rwbs[i++] = 'E';
1773 1818
1774 rwbs[i] = '\0'; 1819 rwbs[i] = '\0';
1775} 1820}
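
blk_fill_rwbs() now decodes the REQ_* flags directly and gains an 'E' for secure discard requests. The mapping is positional: one primary letter (W, D, R or N), then optional modifiers in a fixed order. A small stand-alone sketch of the same decode, with placeholder flag values rather than the real REQ_* constants:

#include <stdio.h>

/* Placeholder flag bits; the real values live in <linux/blk_types.h>. */
#define F_WRITE    (1u << 0)
#define F_DISCARD  (1u << 1)
#define F_RAHEAD   (1u << 2)
#define F_BARRIER  (1u << 3)
#define F_SYNC     (1u << 4)
#define F_META     (1u << 5)
#define F_SECURE   (1u << 6)

static void fill_rwbs(char *rwbs, unsigned int rw, int bytes)
{
	int i = 0;

	if (rw & F_WRITE)
		rwbs[i++] = 'W';
	else if (rw & F_DISCARD)
		rwbs[i++] = 'D';
	else if (bytes)
		rwbs[i++] = 'R';
	else
		rwbs[i++] = 'N';

	if (rw & F_RAHEAD)  rwbs[i++] = 'A';
	if (rw & F_BARRIER) rwbs[i++] = 'B';
	if (rw & F_SYNC)    rwbs[i++] = 'S';
	if (rw & F_META)    rwbs[i++] = 'M';
	if (rw & F_SECURE)  rwbs[i++] = 'E';
	rwbs[i] = '\0';
}

int main(void)
{
	char rwbs[8];

	fill_rwbs(rwbs, F_WRITE | F_SYNC, 4096);      /* prints "WS" */
	printf("%s\n", rwbs);
	fill_rwbs(rwbs, F_DISCARD | F_SECURE, 4096);  /* prints "DE" */
	printf("%s\n", rwbs);
	return 0;
}
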
@@ -1779,8 +1824,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1779 int rw = rq->cmd_flags & 0x03; 1824 int rw = rq->cmd_flags & 0x03;
1780 int bytes; 1825 int bytes;
1781 1826
1782 if (blk_discard_rq(rq)) 1827 if (rq->cmd_flags & REQ_DISCARD)
1783 rw |= (1 << BIO_RW_DISCARD); 1828 rw |= REQ_DISCARD;
1829
1830 if (rq->cmd_flags & REQ_SECURE)
1831 rw |= REQ_SECURE;
1784 1832
1785 bytes = blk_rq_bytes(rq); 1833 bytes = blk_rq_bytes(rq);
1786 1834
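
The compat handler is needed because struct blk_user_trace_setup carries 64-bit members, and the i386 ABI aligns a u64 to 4 bytes where x86_64 aligns it to 8, so a 32-bit blktrace binary hands BLKTRACESETUP a structure with different padding and size than the native one. A quick host-side demonstration of that layout difference on a 64-bit build, emulating the 32-bit alignment with the same aligned(4) trick the kernel's compat_u64 typedef uses; the struct definitions below are illustrative stand-ins, not the kernel's:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* 64-bit member with the 4-byte alignment it has in the i386 ABI. */
typedef uint64_t u64_align4 __attribute__((aligned(4)));

struct native_setup {            /* layout seen by 64-bit callers */
	uint16_t act_mask;
	uint32_t buf_size;
	uint32_t buf_nr;
	uint64_t start_lba;
	uint64_t end_lba;
	uint32_t pid;
	char     name[32];
};

struct compat_setup {            /* layout a 32-bit caller passes in */
	uint16_t   act_mask;
	uint32_t   buf_size;
	uint32_t   buf_nr;
	u64_align4 start_lba;
	u64_align4 end_lba;
	uint32_t   pid;
	char       name[32];
};

int main(void)
{
	/* On x86-64 this prints different offsets and sizes (16 vs 12 for
	 * start_lba), which is why the fields are copied one by one. */
	printf("native: sizeof=%zu start_lba@%zu\n",
	       sizeof(struct native_setup), offsetof(struct native_setup, start_lba));
	printf("compat: sizeof=%zu start_lba@%zu\n",
	       sizeof(struct compat_setup), offsetof(struct compat_setup, start_lba));
	return 0;
}
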
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6d2cb14f9449..f3dadae83883 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v)
381{ 381{
382 struct ftrace_profile *rec = v; 382 struct ftrace_profile *rec = v;
383 char str[KSYM_SYMBOL_LEN]; 383 char str[KSYM_SYMBOL_LEN];
384 int ret = 0;
384#ifdef CONFIG_FUNCTION_GRAPH_TRACER 385#ifdef CONFIG_FUNCTION_GRAPH_TRACER
385 static DEFINE_MUTEX(mutex);
386 static struct trace_seq s; 386 static struct trace_seq s;
387 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev; 388 unsigned long long stddev;
389#endif 389#endif
390 mutex_lock(&ftrace_profile_lock);
391
392 /* we raced with function_profile_reset() */
393 if (unlikely(rec->counter == 0)) {
394 ret = -EBUSY;
395 goto out;
396 }
390 397
391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 398 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
392 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 399 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
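
function_stat_show() now holds ftrace_profile_lock across the whole record and treats a zeroed counter as a race with function_profile_reset(), returning -EBUSY instead of printing stale numbers or dividing by a zero counter in the avg/stddev math; the per-call static mutex around the trace_seq goes away because the profile lock already serializes readers. A reduced user-space sketch of that check-under-lock shape, using a pthread mutex and illustrative names:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t profile_lock = PTHREAD_MUTEX_INITIALIZER;

struct record {
	unsigned long counter;
	unsigned long long time;
};

/* Reader: show one record, but bail out if a concurrent reset
 * already zeroed it (counter == 0 marks "reset happened"). */
static int record_show(const struct record *rec)
{
	int ret = 0;

	pthread_mutex_lock(&profile_lock);
	if (rec->counter == 0) {
		ret = -EBUSY;          /* raced with record_reset() */
		goto out;
	}
	printf("count=%lu avg=%llu\n", rec->counter, rec->time / rec->counter);
out:
	pthread_mutex_unlock(&profile_lock);
	return ret;
}

/* Writer: reset under the same lock so readers never see half-cleared state. */
static void record_reset(struct record *rec)
{
	pthread_mutex_lock(&profile_lock);
	rec->counter = 0;
	rec->time = 0;
	pthread_mutex_unlock(&profile_lock);
}

int main(void)
{
	struct record rec = { .counter = 3, .time = 300 };

	record_show(&rec);     /* prints count=3 avg=100 */
	record_reset(&rec);
	record_show(&rec);     /* returns -EBUSY, prints nothing */
	return 0;
}
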
@@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v)
408 do_div(stddev, (rec->counter - 1) * 1000); 415 do_div(stddev, (rec->counter - 1) * 1000);
409 } 416 }
410 417
411 mutex_lock(&mutex);
412 trace_seq_init(&s); 418 trace_seq_init(&s);
413 trace_print_graph_duration(rec->time, &s); 419 trace_print_graph_duration(rec->time, &s);
414 trace_seq_puts(&s, " "); 420 trace_seq_puts(&s, " ");
@@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v)
416 trace_seq_puts(&s, " "); 422 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s); 423 trace_print_graph_duration(stddev, &s);
418 trace_print_seq(m, &s); 424 trace_print_seq(m, &s);
419 mutex_unlock(&mutex);
420#endif 425#endif
421 seq_putc(m, '\n'); 426 seq_putc(m, '\n');
427out:
428 mutex_unlock(&ftrace_profile_lock);
422 429
423 return 0; 430 return ret;
424} 431}
425 432
426static void ftrace_profile_reset(struct ftrace_profile_stat *stat) 433static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -793,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = {
793 .open = tracing_open_generic, 800 .open = tracing_open_generic,
794 .read = ftrace_profile_read, 801 .read = ftrace_profile_read,
795 .write = ftrace_profile_write, 802 .write = ftrace_profile_write,
803 .llseek = default_llseek,
796}; 804};
797 805
798/* used to initialize the real stat files */ 806/* used to initialize the real stat files */
@@ -877,10 +885,8 @@ enum {
877 FTRACE_ENABLE_CALLS = (1 << 0), 885 FTRACE_ENABLE_CALLS = (1 << 0),
878 FTRACE_DISABLE_CALLS = (1 << 1), 886 FTRACE_DISABLE_CALLS = (1 << 1),
879 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 887 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
880 FTRACE_ENABLE_MCOUNT = (1 << 3), 888 FTRACE_START_FUNC_RET = (1 << 3),
881 FTRACE_DISABLE_MCOUNT = (1 << 4), 889 FTRACE_STOP_FUNC_RET = (1 << 4),
882 FTRACE_START_FUNC_RET = (1 << 5),
883 FTRACE_STOP_FUNC_RET = (1 << 6),
884}; 890};
885 891
886static int ftrace_filtered; 892static int ftrace_filtered;
@@ -1219,8 +1225,6 @@ static void ftrace_shutdown(int command)
1219 1225
1220static void ftrace_startup_sysctl(void) 1226static void ftrace_startup_sysctl(void)
1221{ 1227{
1222 int command = FTRACE_ENABLE_MCOUNT;
1223
1224 if (unlikely(ftrace_disabled)) 1228 if (unlikely(ftrace_disabled))
1225 return; 1229 return;
1226 1230
@@ -1228,23 +1232,17 @@ static void ftrace_startup_sysctl(void)
1228 saved_ftrace_func = NULL; 1232 saved_ftrace_func = NULL;
1229 /* ftrace_start_up is true if we want ftrace running */ 1233 /* ftrace_start_up is true if we want ftrace running */
1230 if (ftrace_start_up) 1234 if (ftrace_start_up)
1231 command |= FTRACE_ENABLE_CALLS; 1235 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1232
1233 ftrace_run_update_code(command);
1234} 1236}
1235 1237
1236static void ftrace_shutdown_sysctl(void) 1238static void ftrace_shutdown_sysctl(void)
1237{ 1239{
1238 int command = FTRACE_DISABLE_MCOUNT;
1239
1240 if (unlikely(ftrace_disabled)) 1240 if (unlikely(ftrace_disabled))
1241 return; 1241 return;
1242 1242
1243 /* ftrace_start_up is true if ftrace is running */ 1243 /* ftrace_start_up is true if ftrace is running */
1244 if (ftrace_start_up) 1244 if (ftrace_start_up)
1245 command |= FTRACE_DISABLE_CALLS; 1245 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
1246
1247 ftrace_run_update_code(command);
1248} 1246}
1249 1247
1250static cycle_t ftrace_update_time; 1248static cycle_t ftrace_update_time;
@@ -1361,24 +1359,29 @@ enum {
1361#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1359#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1362 1360
1363struct ftrace_iterator { 1361struct ftrace_iterator {
1364 struct ftrace_page *pg; 1362 loff_t pos;
1365 int hidx; 1363 loff_t func_pos;
1366 int idx; 1364 struct ftrace_page *pg;
1367 unsigned flags; 1365 struct dyn_ftrace *func;
1368 struct trace_parser parser; 1366 struct ftrace_func_probe *probe;
1367 struct trace_parser parser;
1368 int hidx;
1369 int idx;
1370 unsigned flags;
1369}; 1371};
1370 1372
1371static void * 1373static void *
1372t_hash_next(struct seq_file *m, void *v, loff_t *pos) 1374t_hash_next(struct seq_file *m, loff_t *pos)
1373{ 1375{
1374 struct ftrace_iterator *iter = m->private; 1376 struct ftrace_iterator *iter = m->private;
1375 struct hlist_node *hnd = v; 1377 struct hlist_node *hnd = NULL;
1376 struct hlist_head *hhd; 1378 struct hlist_head *hhd;
1377 1379
1378 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
1379
1380 (*pos)++; 1380 (*pos)++;
1381 iter->pos = *pos;
1381 1382
1383 if (iter->probe)
1384 hnd = &iter->probe->node;
1382 retry: 1385 retry:
1383 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 1386 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
1384 return NULL; 1387 return NULL;
@@ -1401,7 +1404,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
1401 } 1404 }
1402 } 1405 }
1403 1406
1404 return hnd; 1407 if (WARN_ON_ONCE(!hnd))
1408 return NULL;
1409
1410 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
1411
1412 return iter;
1405} 1413}
1406 1414
1407static void *t_hash_start(struct seq_file *m, loff_t *pos) 1415static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1410,26 +1418,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1410 void *p = NULL; 1418 void *p = NULL;
1411 loff_t l; 1419 loff_t l;
1412 1420
1413 if (!(iter->flags & FTRACE_ITER_HASH)) 1421 if (iter->func_pos > *pos)
1414 *pos = 0; 1422 return NULL;
1415
1416 iter->flags |= FTRACE_ITER_HASH;
1417 1423
1418 iter->hidx = 0; 1424 iter->hidx = 0;
1419 for (l = 0; l <= *pos; ) { 1425 for (l = 0; l <= (*pos - iter->func_pos); ) {
1420 p = t_hash_next(m, p, &l); 1426 p = t_hash_next(m, &l);
1421 if (!p) 1427 if (!p)
1422 break; 1428 break;
1423 } 1429 }
1424 return p; 1430 if (!p)
1431 return NULL;
1432
1433 /* Only set this if we have an item */
1434 iter->flags |= FTRACE_ITER_HASH;
1435
1436 return iter;
1425} 1437}
1426 1438
1427static int t_hash_show(struct seq_file *m, void *v) 1439static int
1440t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
1428{ 1441{
1429 struct ftrace_func_probe *rec; 1442 struct ftrace_func_probe *rec;
1430 struct hlist_node *hnd = v;
1431 1443
1432 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1444 rec = iter->probe;
1445 if (WARN_ON_ONCE(!rec))
1446 return -EIO;
1433 1447
1434 if (rec->ops->print) 1448 if (rec->ops->print)
1435 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1449 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1450,12 +1464,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1450 struct dyn_ftrace *rec = NULL; 1464 struct dyn_ftrace *rec = NULL;
1451 1465
1452 if (iter->flags & FTRACE_ITER_HASH) 1466 if (iter->flags & FTRACE_ITER_HASH)
1453 return t_hash_next(m, v, pos); 1467 return t_hash_next(m, pos);
1454 1468
1455 (*pos)++; 1469 (*pos)++;
1470 iter->pos = *pos;
1456 1471
1457 if (iter->flags & FTRACE_ITER_PRINTALL) 1472 if (iter->flags & FTRACE_ITER_PRINTALL)
1458 return NULL; 1473 return t_hash_start(m, pos);
1459 1474
1460 retry: 1475 retry:
1461 if (iter->idx >= iter->pg->index) { 1476 if (iter->idx >= iter->pg->index) {
@@ -1484,7 +1499,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1484 } 1499 }
1485 } 1500 }
1486 1501
1487 return rec; 1502 if (!rec)
1503 return t_hash_start(m, pos);
1504
1505 iter->func_pos = *pos;
1506 iter->func = rec;
1507
1508 return iter;
1509}
1510
1511static void reset_iter_read(struct ftrace_iterator *iter)
1512{
1513 iter->pos = 0;
1514 iter->func_pos = 0;
1515 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
1488} 1516}
1489 1517
1490static void *t_start(struct seq_file *m, loff_t *pos) 1518static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1495,6 +1523,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1495 1523
1496 mutex_lock(&ftrace_lock); 1524 mutex_lock(&ftrace_lock);
1497 /* 1525 /*
1526 * If an lseek was done, then reset and start from beginning.
1527 */
1528 if (*pos < iter->pos)
1529 reset_iter_read(iter);
1530
1531 /*
1498 * For set_ftrace_filter reading, if we have the filter 1532 * For set_ftrace_filter reading, if we have the filter
1499 * off, we can short cut and just print out that all 1533 * off, we can short cut and just print out that all
1500 * functions are enabled. 1534 * functions are enabled.
@@ -1503,12 +1537,19 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1503 if (*pos > 0) 1537 if (*pos > 0)
1504 return t_hash_start(m, pos); 1538 return t_hash_start(m, pos);
1505 iter->flags |= FTRACE_ITER_PRINTALL; 1539 iter->flags |= FTRACE_ITER_PRINTALL;
1540 /* reset in case of seek/pread */
1541 iter->flags &= ~FTRACE_ITER_HASH;
1506 return iter; 1542 return iter;
1507 } 1543 }
1508 1544
1509 if (iter->flags & FTRACE_ITER_HASH) 1545 if (iter->flags & FTRACE_ITER_HASH)
1510 return t_hash_start(m, pos); 1546 return t_hash_start(m, pos);
1511 1547
1548 /*
1549 * Unfortunately, we need to restart at ftrace_pages_start
 1550 * every time we let go of the ftrace_lock. This is because

1551 * those pointers can change without the lock.
1552 */
1512 iter->pg = ftrace_pages_start; 1553 iter->pg = ftrace_pages_start;
1513 iter->idx = 0; 1554 iter->idx = 0;
1514 for (l = 0; l <= *pos; ) { 1555 for (l = 0; l <= *pos; ) {
@@ -1517,10 +1558,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1517 break; 1558 break;
1518 } 1559 }
1519 1560
1520 if (!p && iter->flags & FTRACE_ITER_FILTER) 1561 if (!p) {
1521 return t_hash_start(m, pos); 1562 if (iter->flags & FTRACE_ITER_FILTER)
1563 return t_hash_start(m, pos);
1522 1564
1523 return p; 1565 return NULL;
1566 }
1567
1568 return iter;
1524} 1569}
1525 1570
1526static void t_stop(struct seq_file *m, void *p) 1571static void t_stop(struct seq_file *m, void *p)
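
Together these hunks make the set_ftrace_filter seq_file safe against lseek() and pread(): the iterator records the position it has handed out (iter->pos, iter->func_pos), t_start() resets its state when asked for an earlier offset, and t_next() chains from the function list into the probe hash instead of returning raw hlist nodes. The core idea, position caching with a rewind on backwards movement, in a small stand-alone form (types and names are illustrative, not the kernel's):

#include <stdio.h>

struct demo_iter {
	long pos;        /* last position handed out */
	int  idx;        /* progress through the backing array */
};

static const int items[] = { 10, 20, 30, 40 };
#define N_ITEMS (int)(sizeof(items) / sizeof(items[0]))

static void iter_reset(struct demo_iter *it)
{
	it->pos = 0;
	it->idx = 0;
}

/* start(): like t_start(), rewind when the requested position moved
 * backwards, then walk forward to the requested offset. */
static const int *iter_start(struct demo_iter *it, long pos)
{
	if (pos < it->pos)
		iter_reset(it);

	while (it->pos < pos && it->idx < N_ITEMS) {
		it->idx++;
		it->pos++;
	}
	return it->idx < N_ITEMS ? &items[it->idx] : NULL;
}

/* next(): advance one element, remembering the new position. */
static const int *iter_next(struct demo_iter *it, long *pos)
{
	(*pos)++;
	it->pos = *pos;
	it->idx++;
	return it->idx < N_ITEMS ? &items[it->idx] : NULL;
}

int main(void)
{
	struct demo_iter it = { 0, 0 };
	long pos = 2;
	const int *p;

	for (p = iter_start(&it, pos); p; p = iter_next(&it, &pos))
		printf("%d ", *p);              /* 30 40 */
	printf("\n");

	pos = 0;                                /* a pread() from offset 0 */
	for (p = iter_start(&it, pos); p; p = iter_next(&it, &pos))
		printf("%d ", *p);              /* 10 20 30 40 */
	printf("\n");
	return 0;
}
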
@@ -1531,16 +1576,18 @@ static void t_stop(struct seq_file *m, void *p)
1531static int t_show(struct seq_file *m, void *v) 1576static int t_show(struct seq_file *m, void *v)
1532{ 1577{
1533 struct ftrace_iterator *iter = m->private; 1578 struct ftrace_iterator *iter = m->private;
1534 struct dyn_ftrace *rec = v; 1579 struct dyn_ftrace *rec;
1535 1580
1536 if (iter->flags & FTRACE_ITER_HASH) 1581 if (iter->flags & FTRACE_ITER_HASH)
1537 return t_hash_show(m, v); 1582 return t_hash_show(m, iter);
1538 1583
1539 if (iter->flags & FTRACE_ITER_PRINTALL) { 1584 if (iter->flags & FTRACE_ITER_PRINTALL) {
1540 seq_printf(m, "#### all functions enabled ####\n"); 1585 seq_printf(m, "#### all functions enabled ####\n");
1541 return 0; 1586 return 0;
1542 } 1587 }
1543 1588
1589 rec = iter->func;
1590
1544 if (!rec) 1591 if (!rec)
1545 return 0; 1592 return 0;
1546 1593
@@ -1592,8 +1639,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
1592 1639
1593 ret = ftrace_avail_open(inode, file); 1640 ret = ftrace_avail_open(inode, file);
1594 if (!ret) { 1641 if (!ret) {
1595 m = (struct seq_file *)file->private_data; 1642 m = file->private_data;
1596 iter = (struct ftrace_iterator *)m->private; 1643 iter = m->private;
1597 iter->flags = FTRACE_ITER_FAILURES; 1644 iter->flags = FTRACE_ITER_FAILURES;
1598 } 1645 }
1599 1646
@@ -1883,7 +1930,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1883 struct hlist_head *hhd; 1930 struct hlist_head *hhd;
1884 struct hlist_node *n; 1931 struct hlist_node *n;
1885 unsigned long key; 1932 unsigned long key;
1886 int resched;
1887 1933
1888 key = hash_long(ip, FTRACE_HASH_BITS); 1934 key = hash_long(ip, FTRACE_HASH_BITS);
1889 1935
@@ -1897,12 +1943,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1897 * period. This syncs the hash iteration and freeing of items 1943 * period. This syncs the hash iteration and freeing of items
1898 * on the hash. rcu_read_lock is too dangerous here. 1944 * on the hash. rcu_read_lock is too dangerous here.
1899 */ 1945 */
1900 resched = ftrace_preempt_disable(); 1946 preempt_disable_notrace();
1901 hlist_for_each_entry_rcu(entry, n, hhd, node) { 1947 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1902 if (entry->ip == ip) 1948 if (entry->ip == ip)
1903 entry->ops->func(ip, parent_ip, &entry->data); 1949 entry->ops->func(ip, parent_ip, &entry->data);
1904 } 1950 }
1905 ftrace_preempt_enable(resched); 1951 preempt_enable_notrace();
1906} 1952}
1907 1953
1908static struct ftrace_ops trace_probe_ops __read_mostly = 1954static struct ftrace_ops trace_probe_ops __read_mostly =
@@ -2624,6 +2670,7 @@ static const struct file_operations ftrace_graph_fops = {
2624 .read = seq_read, 2670 .read = seq_read,
2625 .write = ftrace_graph_write, 2671 .write = ftrace_graph_write,
2626 .release = ftrace_graph_release, 2672 .release = ftrace_graph_release,
2673 .llseek = seq_lseek,
2627}; 2674};
2628#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2675#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2629 2676
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index bbfc1bb1660b..000000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,529 +0,0 @@
1/*
2 * Memory allocator tracing
3 *
4 * Copyright (C) 2008 Eduard - Gabriel Munteanu
5 * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */
8
9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
11#include <linux/debugfs.h>
12#include <linux/dcache.h>
13#include <linux/fs.h>
14
15#include <linux/kmemtrace.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20/* Select an alternative, minimalistic output than the original one */
21#define TRACE_KMEM_OPT_MINIMAL 0x1
22
23static struct tracer_opt kmem_opts[] = {
24 /* Default disable the minimalistic output */
25 { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
26 { }
27};
28
29static struct tracer_flags kmem_tracer_flags = {
30 .val = 0,
31 .opts = kmem_opts
32};
33
34static struct trace_array *kmemtrace_array;
35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
46 struct trace_array *tr = kmemtrace_array;
47 struct kmemtrace_alloc_entry *entry;
48 struct ring_buffer_event *event;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
51 if (!event)
52 return;
53
54 entry = ring_buffer_event_data(event);
55 tracing_generic_entry_update(&entry->ent, 0, 0);
56
57 entry->ent.type = TRACE_KMEM_ALLOC;
58 entry->type_id = type_id;
59 entry->call_site = call_site;
60 entry->ptr = ptr;
61 entry->bytes_req = bytes_req;
62 entry->bytes_alloc = bytes_alloc;
63 entry->gfp_flags = gfp_flags;
64 entry->node = node;
65
66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
68
69 trace_wake_up();
70}
71
72static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
73 unsigned long call_site,
74 const void *ptr)
75{
76 struct ftrace_event_call *call = &event_kmem_free;
77 struct trace_array *tr = kmemtrace_array;
78 struct kmemtrace_free_entry *entry;
79 struct ring_buffer_event *event;
80
81 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
82 if (!event)
83 return;
84 entry = ring_buffer_event_data(event);
85 tracing_generic_entry_update(&entry->ent, 0, 0);
86
87 entry->ent.type = TRACE_KMEM_FREE;
88 entry->type_id = type_id;
89 entry->call_site = call_site;
90 entry->ptr = ptr;
91
92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
94
95 trace_wake_up();
96}
97
98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
100 const void *ptr,
101 size_t bytes_req,
102 size_t bytes_alloc,
103 gfp_t gfp_flags)
104{
105 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
106 bytes_req, bytes_alloc, gfp_flags, -1);
107}
108
109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
111 const void *ptr,
112 size_t bytes_req,
113 size_t bytes_alloc,
114 gfp_t gfp_flags)
115{
116 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
117 bytes_req, bytes_alloc, gfp_flags, -1);
118}
119
120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
122 const void *ptr,
123 size_t bytes_req,
124 size_t bytes_alloc,
125 gfp_t gfp_flags,
126 int node)
127{
128 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
129 bytes_req, bytes_alloc, gfp_flags, node);
130}
131
132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
134 const void *ptr,
135 size_t bytes_req,
136 size_t bytes_alloc,
137 gfp_t gfp_flags,
138 int node)
139{
140 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
141 bytes_req, bytes_alloc, gfp_flags, node);
142}
143
144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
146{
147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
148}
149
150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
152{
153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
154}
155
156static int kmemtrace_start_probes(void)
157{
158 int err;
159
160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
161 if (err)
162 return err;
163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
164 if (err)
165 return err;
166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
167 if (err)
168 return err;
169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
170 if (err)
171 return err;
172 err = register_trace_kfree(kmemtrace_kfree, NULL);
173 if (err)
174 return err;
175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
176
177 return err;
178}
179
180static void kmemtrace_stop_probes(void)
181{
182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
186 unregister_trace_kfree(kmemtrace_kfree, NULL);
187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
188}
189
190static int kmem_trace_init(struct trace_array *tr)
191{
192 kmemtrace_array = tr;
193
194 tracing_reset_online_cpus(tr);
195
196 kmemtrace_start_probes();
197
198 return 0;
199}
200
201static void kmem_trace_reset(struct trace_array *tr)
202{
203 kmemtrace_stop_probes();
204}
205
206static void kmemtrace_headers(struct seq_file *s)
207{
208 /* Don't need headers for the original kmemtrace output */
209 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
210 return;
211
212 seq_printf(s, "#\n");
213 seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
214 " POINTER NODE CALLER\n");
215 seq_printf(s, "# FREE | | | | "
216 " | | | |\n");
217 seq_printf(s, "# |\n\n");
218}
219
220/*
221 * The following functions give the original output from kmemtrace,
222 * plus the origin CPU, since reordering occurs in-kernel now.
223 */
224
225#define KMEMTRACE_USER_ALLOC 0
226#define KMEMTRACE_USER_FREE 1
227
228struct kmemtrace_user_event {
229 u8 event_id;
230 u8 type_id;
231 u16 event_size;
232 u32 cpu;
233 u64 timestamp;
234 unsigned long call_site;
235 unsigned long ptr;
236};
237
238struct kmemtrace_user_event_alloc {
239 size_t bytes_req;
240 size_t bytes_alloc;
241 unsigned gfp_flags;
242 int node;
243};
244
245static enum print_line_t
246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
248{
249 struct trace_seq *s = &iter->seq;
250 struct kmemtrace_alloc_entry *entry;
251 int ret;
252
253 trace_assign_type(entry, iter->ent);
254
255 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
256 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
257 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
258 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
259 (unsigned long)entry->gfp_flags, entry->node);
260
261 if (!ret)
262 return TRACE_TYPE_PARTIAL_LINE;
263 return TRACE_TYPE_HANDLED;
264}
265
266static enum print_line_t
267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
269{
270 struct trace_seq *s = &iter->seq;
271 struct kmemtrace_free_entry *entry;
272 int ret;
273
274 trace_assign_type(entry, iter->ent);
275
276 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
277 entry->type_id, (void *)entry->call_site,
278 (unsigned long)entry->ptr);
279
280 if (!ret)
281 return TRACE_TYPE_PARTIAL_LINE;
282 return TRACE_TYPE_HANDLED;
283}
284
285static enum print_line_t
286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
288{
289 struct trace_seq *s = &iter->seq;
290 struct kmemtrace_alloc_entry *entry;
291 struct kmemtrace_user_event *ev;
292 struct kmemtrace_user_event_alloc *ev_alloc;
293
294 trace_assign_type(entry, iter->ent);
295
296 ev = trace_seq_reserve(s, sizeof(*ev));
297 if (!ev)
298 return TRACE_TYPE_PARTIAL_LINE;
299
300 ev->event_id = KMEMTRACE_USER_ALLOC;
301 ev->type_id = entry->type_id;
302 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
303 ev->cpu = iter->cpu;
304 ev->timestamp = iter->ts;
305 ev->call_site = entry->call_site;
306 ev->ptr = (unsigned long)entry->ptr;
307
308 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
309 if (!ev_alloc)
310 return TRACE_TYPE_PARTIAL_LINE;
311
312 ev_alloc->bytes_req = entry->bytes_req;
313 ev_alloc->bytes_alloc = entry->bytes_alloc;
314 ev_alloc->gfp_flags = entry->gfp_flags;
315 ev_alloc->node = entry->node;
316
317 return TRACE_TYPE_HANDLED;
318}
319
320static enum print_line_t
321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
323{
324 struct trace_seq *s = &iter->seq;
325 struct kmemtrace_free_entry *entry;
326 struct kmemtrace_user_event *ev;
327
328 trace_assign_type(entry, iter->ent);
329
330 ev = trace_seq_reserve(s, sizeof(*ev));
331 if (!ev)
332 return TRACE_TYPE_PARTIAL_LINE;
333
334 ev->event_id = KMEMTRACE_USER_FREE;
335 ev->type_id = entry->type_id;
336 ev->event_size = sizeof(*ev);
337 ev->cpu = iter->cpu;
338 ev->timestamp = iter->ts;
339 ev->call_site = entry->call_site;
340 ev->ptr = (unsigned long)entry->ptr;
341
342 return TRACE_TYPE_HANDLED;
343}
344
345/* The two other following provide a more minimalistic output */
346static enum print_line_t
347kmemtrace_print_alloc_compress(struct trace_iterator *iter)
348{
349 struct kmemtrace_alloc_entry *entry;
350 struct trace_seq *s = &iter->seq;
351 int ret;
352
353 trace_assign_type(entry, iter->ent);
354
355 /* Alloc entry */
356 ret = trace_seq_printf(s, " + ");
357 if (!ret)
358 return TRACE_TYPE_PARTIAL_LINE;
359
360 /* Type */
361 switch (entry->type_id) {
362 case KMEMTRACE_TYPE_KMALLOC:
363 ret = trace_seq_printf(s, "K ");
364 break;
365 case KMEMTRACE_TYPE_CACHE:
366 ret = trace_seq_printf(s, "C ");
367 break;
368 case KMEMTRACE_TYPE_PAGES:
369 ret = trace_seq_printf(s, "P ");
370 break;
371 default:
372 ret = trace_seq_printf(s, "? ");
373 }
374
375 if (!ret)
376 return TRACE_TYPE_PARTIAL_LINE;
377
378 /* Requested */
379 ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
380 if (!ret)
381 return TRACE_TYPE_PARTIAL_LINE;
382
383 /* Allocated */
384 ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
385 if (!ret)
386 return TRACE_TYPE_PARTIAL_LINE;
387
388 /* Flags
389 * TODO: would be better to see the name of the GFP flag names
390 */
391 ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
392 if (!ret)
393 return TRACE_TYPE_PARTIAL_LINE;
394
395 /* Pointer to allocated */
396 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
397 if (!ret)
398 return TRACE_TYPE_PARTIAL_LINE;
399
400 /* Node and call site*/
401 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
402 (void *)entry->call_site);
403 if (!ret)
404 return TRACE_TYPE_PARTIAL_LINE;
405
406 return TRACE_TYPE_HANDLED;
407}
408
409static enum print_line_t
410kmemtrace_print_free_compress(struct trace_iterator *iter)
411{
412 struct kmemtrace_free_entry *entry;
413 struct trace_seq *s = &iter->seq;
414 int ret;
415
416 trace_assign_type(entry, iter->ent);
417
418 /* Free entry */
419 ret = trace_seq_printf(s, " - ");
420 if (!ret)
421 return TRACE_TYPE_PARTIAL_LINE;
422
423 /* Type */
424 switch (entry->type_id) {
425 case KMEMTRACE_TYPE_KMALLOC:
426 ret = trace_seq_printf(s, "K ");
427 break;
428 case KMEMTRACE_TYPE_CACHE:
429 ret = trace_seq_printf(s, "C ");
430 break;
431 case KMEMTRACE_TYPE_PAGES:
432 ret = trace_seq_printf(s, "P ");
433 break;
434 default:
435 ret = trace_seq_printf(s, "? ");
436 }
437
438 if (!ret)
439 return TRACE_TYPE_PARTIAL_LINE;
440
441 /* Skip requested/allocated/flags */
442 ret = trace_seq_printf(s, " ");
443 if (!ret)
444 return TRACE_TYPE_PARTIAL_LINE;
445
446 /* Pointer to allocated */
447 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
448 if (!ret)
449 return TRACE_TYPE_PARTIAL_LINE;
450
451 /* Skip node and print call site*/
452 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
453 if (!ret)
454 return TRACE_TYPE_PARTIAL_LINE;
455
456 return TRACE_TYPE_HANDLED;
457}
458
459static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
460{
461 struct trace_entry *entry = iter->ent;
462
463 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
464 return TRACE_TYPE_UNHANDLED;
465
466 switch (entry->type) {
467 case TRACE_KMEM_ALLOC:
468 return kmemtrace_print_alloc_compress(iter);
469 case TRACE_KMEM_FREE:
470 return kmemtrace_print_free_compress(iter);
471 default:
472 return TRACE_TYPE_UNHANDLED;
473 }
474}
475
476static struct trace_event_functions kmem_trace_alloc_funcs = {
477 .trace = kmemtrace_print_alloc,
478 .binary = kmemtrace_print_alloc_user,
479};
480
481static struct trace_event kmem_trace_alloc = {
482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
487 .trace = kmemtrace_print_free,
488 .binary = kmemtrace_print_free_user,
489};
490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
496static struct tracer kmem_tracer __read_mostly = {
497 .name = "kmemtrace",
498 .init = kmem_trace_init,
499 .reset = kmem_trace_reset,
500 .print_line = kmemtrace_print_line,
501 .print_header = kmemtrace_headers,
502 .flags = &kmem_tracer_flags
503};
504
505void kmemtrace_init(void)
506{
507 /* earliest opportunity to start kmem tracing */
508}
509
510static int __init init_kmem_tracer(void)
511{
512 if (!register_ftrace_event(&kmem_trace_alloc)) {
513 pr_warning("Warning: could not register kmem events\n");
514 return 1;
515 }
516
517 if (!register_ftrace_event(&kmem_trace_free)) {
518 pr_warning("Warning: could not register kmem events\n");
519 return 1;
520 }
521
522 if (register_tracer(&kmem_tracer) != 0) {
523 pr_warning("Warning: could not register the kmem tracer\n");
524 return 1;
525 }
526
527 return 0;
528}
529device_initcall(init_kmem_tracer);
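
With the kmemtrace tracer removed, the allocator tracepoints it hooked (kmalloc, kmem_cache_alloc, kfree and friends) remain exported as ordinary trace events, so the data stays reachable without a dedicated tracer. One detail worth noting in the deleted kmemtrace_start_probes(): it returned on the first registration failure while the probes registered earlier stayed attached; the usual idiom is to unwind the ones already in place. A stand-alone sketch of that rollback pattern, with hypothetical probe names and register/unregister stubs:

#include <stdio.h>

/* Hypothetical probe table: each entry knows how to attach and detach. */
struct probe {
	const char *name;
	int  (*reg)(void);
	void (*unreg)(void);
};

static int ok_reg(void)      { return 0; }
static int bad_reg(void)     { return -1; }   /* simulate a failure */
static void noop_unreg(void) { }

static struct probe probes[] = {
	{ "kmalloc",          ok_reg,  noop_unreg },
	{ "kmem_cache_alloc", ok_reg,  noop_unreg },
	{ "kfree",            bad_reg, noop_unreg },
};
#define N_PROBES (int)(sizeof(probes) / sizeof(probes[0]))

/* Register every probe; on failure, detach the ones already attached
 * so the caller is left in a clean state. */
static int start_probes(void)
{
	int i, err = 0;

	for (i = 0; i < N_PROBES; i++) {
		err = probes[i].reg();
		if (err)
			goto unwind;
	}
	return 0;

unwind:
	fprintf(stderr, "failed to register %s\n", probes[i].name);
	while (--i >= 0)
		probes[i].unreg();
	return err;
}

int main(void)
{
	return start_probes() ? 1 : 0;
}
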
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1da7b6ea8b85..9ed509a015d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
224 RB_LEN_TIME_STAMP = 16, 224 RB_LEN_TIME_STAMP = 16,
225}; 225};
226 226
227#define skip_time_extend(event) \
228 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
229
227static inline int rb_null_event(struct ring_buffer_event *event) 230static inline int rb_null_event(struct ring_buffer_event *event)
228{ 231{
229 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 232 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
248 return length + RB_EVNT_HDR_SIZE; 251 return length + RB_EVNT_HDR_SIZE;
249} 252}
250 253
251/* inline for ring buffer fast paths */ 254/*
252static unsigned 255 * Return the length of the given event. Will return
256 * the length of the time extend if the event is a
257 * time extend.
258 */
259static inline unsigned
253rb_event_length(struct ring_buffer_event *event) 260rb_event_length(struct ring_buffer_event *event)
254{ 261{
255 switch (event->type_len) { 262 switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
274 return 0; 281 return 0;
275} 282}
276 283
284/*
285 * Return total length of time extend and data,
286 * or just the event length for all other events.
287 */
288static inline unsigned
289rb_event_ts_length(struct ring_buffer_event *event)
290{
291 unsigned len = 0;
292
293 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
294 /* time extends include the data event after it */
295 len = RB_LEN_TIME_EXTEND;
296 event = skip_time_extend(event);
297 }
298 return len + rb_event_length(event);
299}
300
277/** 301/**
278 * ring_buffer_event_length - return the length of the event 302 * ring_buffer_event_length - return the length of the event
279 * @event: the event to get the length of 303 * @event: the event to get the length of
304 *
305 * Returns the size of the data load of a data event.
306 * If the event is something other than a data event, it
307 * returns the size of the event itself. With the exception
308 * of a TIME EXTEND, where it still returns the size of the
309 * data load of the data event after it.
280 */ 310 */
281unsigned ring_buffer_event_length(struct ring_buffer_event *event) 311unsigned ring_buffer_event_length(struct ring_buffer_event *event)
282{ 312{
283 unsigned length = rb_event_length(event); 313 unsigned length;
314
315 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
316 event = skip_time_extend(event);
317
318 length = rb_event_length(event);
284 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 319 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
285 return length; 320 return length;
286 length -= RB_EVNT_HDR_SIZE; 321 length -= RB_EVNT_HDR_SIZE;
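
After this change a time extend is no longer a free-standing record: it is written directly in front of its data event inside the same reservation. The helpers therefore answer different questions: rb_event_length() reports just the record it is handed, rb_event_ts_length() reports a time extend plus the data event stored behind it, and ring_buffer_event_length() steps over the extend before sizing the payload. A toy model of that bookkeeping, with invented record sizes rather than the real RB_LEN_* constants:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Toy event stream: an optional fixed-size time-extend record may sit
 * directly in front of a data event. */
enum { EV_DATA, EV_TIME_EXTEND };

struct ev {
	int    type;
	size_t len;                 /* length of this record only */
};

#define TIME_EXTEND_LEN 8       /* invented size, for illustration */

static const struct ev *skip_time_extend(const struct ev *e)
{
	return e + 1;               /* the data event is stored right behind it */
}

static size_t ev_length(const struct ev *e)
{
	return e->type == EV_TIME_EXTEND ? TIME_EXTEND_LEN : e->len;
}

/* Counterpart of rb_event_ts_length(): extend + the data event behind it. */
static size_t ev_ts_length(const struct ev *e)
{
	size_t len = 0;

	if (e->type == EV_TIME_EXTEND) {
		len = TIME_EXTEND_LEN;
		e = skip_time_extend(e);
	}
	return len + ev_length(e);
}

int main(void)
{
	struct ev stream[2] = {
		{ EV_TIME_EXTEND, TIME_EXTEND_LEN },
		{ EV_DATA,        24 },
	};

	assert(ev_length(&stream[0]) == 8);      /* the extend alone        */
	assert(ev_ts_length(&stream[0]) == 32);  /* extend + its data event */
	assert(ev_ts_length(&stream[1]) == 24);  /* plain data event        */
	printf("ok\n");
	return 0;
}
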
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
294static void * 329static void *
295rb_event_data(struct ring_buffer_event *event) 330rb_event_data(struct ring_buffer_event *event)
296{ 331{
332 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
333 event = skip_time_extend(event);
297 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 334 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
298 /* If length is in len field, then array[0] has the data */ 335 /* If length is in len field, then array[0] has the data */
299 if (event->type_len) 336 if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
404/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 441/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 442#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 443
407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
409
410int ring_buffer_print_page_header(struct trace_seq *s) 444int ring_buffer_print_page_header(struct trace_seq *s)
411{ 445{
412 struct buffer_data_page field; 446 struct buffer_data_page field;
@@ -443,6 +477,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
443 */ 477 */
444struct ring_buffer_per_cpu { 478struct ring_buffer_per_cpu {
445 int cpu; 479 int cpu;
480 atomic_t record_disabled;
446 struct ring_buffer *buffer; 481 struct ring_buffer *buffer;
447 spinlock_t reader_lock; /* serialize readers */ 482 spinlock_t reader_lock; /* serialize readers */
448 arch_spinlock_t lock; 483 arch_spinlock_t lock;
@@ -462,7 +497,6 @@ struct ring_buffer_per_cpu {
462 unsigned long read; 497 unsigned long read;
463 u64 write_stamp; 498 u64 write_stamp;
464 u64 read_stamp; 499 u64 read_stamp;
465 atomic_t record_disabled;
466}; 500};
467 501
468struct ring_buffer { 502struct ring_buffer {
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1546 iter->head = 0; 1580 iter->head = 0;
1547} 1581}
1548 1582
1583/* Slow path, do not inline */
1584static noinline struct ring_buffer_event *
1585rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1586{
1587 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1588
1589 /* Not the first event on the page? */
1590 if (rb_event_index(event)) {
1591 event->time_delta = delta & TS_MASK;
1592 event->array[0] = delta >> TS_SHIFT;
1593 } else {
1594 /* nope, just zero it */
1595 event->time_delta = 0;
1596 event->array[0] = 0;
1597 }
1598
1599 return skip_time_extend(event);
1600}
1601
1549/** 1602/**
1550 * ring_buffer_update_event - update event type and data 1603 * ring_buffer_update_event - update event type and data
 1551 * @event: the event to update 1604 * @event: the event to update
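
rb_add_time_stamp() splits the oversized delta across the two fields of the extend record: the low TS_SHIFT bits land in time_delta and the remainder in array[0]; the rb_update_write_stamp() hunk later in this patch reassembles it as (array[0] << TS_SHIFT) + time_delta. A worked round trip of that packing; the 27-bit width is an assumption about TS_SHIFT, and the struct below is a stand-in, not the real event layout:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_TS_SHIFT 27                        /* assumed value of TS_SHIFT */
#define DEMO_TS_MASK  ((1ULL << DEMO_TS_SHIFT) - 1)

struct demo_extend {
	uint32_t time_delta;                    /* low TS_SHIFT bits */
	uint32_t array0;                        /* remaining high bits */
};

static void pack_delta(struct demo_extend *e, uint64_t delta)
{
	e->time_delta = delta & DEMO_TS_MASK;
	e->array0     = delta >> DEMO_TS_SHIFT;
}

static uint64_t unpack_delta(const struct demo_extend *e)
{
	return ((uint64_t)e->array0 << DEMO_TS_SHIFT) + e->time_delta;
}

int main(void)
{
	struct demo_extend e;
	uint64_t delta = (1ULL << 40) + 12345;  /* too big for 27 bits */

	pack_delta(&e, delta);
	assert(unpack_delta(&e) == delta);
	printf("delta=%llu low=%u high=%u\n", (unsigned long long)delta,
	       (unsigned)e.time_delta, (unsigned)e.array0);
	return 0;
}
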
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1558 * data field. 1611 * data field.
1559 */ 1612 */
1560static void 1613static void
1561rb_update_event(struct ring_buffer_event *event, 1614rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1562 unsigned type, unsigned length) 1615 struct ring_buffer_event *event, unsigned length,
1616 int add_timestamp, u64 delta)
1563{ 1617{
1564 event->type_len = type; 1618 /* Only a commit updates the timestamp */
1565 1619 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1566 switch (type) { 1620 delta = 0;
1567
1568 case RINGBUF_TYPE_PADDING:
1569 case RINGBUF_TYPE_TIME_EXTEND:
1570 case RINGBUF_TYPE_TIME_STAMP:
1571 break;
1572 1621
1573 case 0: 1622 /*
1574 length -= RB_EVNT_HDR_SIZE; 1623 * If we need to add a timestamp, then we
 1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) 1624 * add it to the start of the reserved space.
1576 event->array[0] = length; 1625 */
1577 else 1626 if (unlikely(add_timestamp)) {
1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1627 event = rb_add_time_stamp(event, delta);
1579 break; 1628 length -= RB_LEN_TIME_EXTEND;
1580 default: 1629 delta = 0;
1581 BUG();
1582 } 1630 }
1631
1632 event->time_delta = delta;
1633 length -= RB_EVNT_HDR_SIZE;
1634 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1635 event->type_len = 0;
1636 event->array[0] = length;
1637 } else
1638 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1583} 1639}
1584 1640
1585/* 1641/*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1823 local_sub(length, &tail_page->write); 1879 local_sub(length, &tail_page->write);
1824} 1880}
1825 1881
1826static struct ring_buffer_event * 1882/*
1883 * This is the slow path, force gcc not to inline it.
1884 */
1885static noinline struct ring_buffer_event *
1827rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1886rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1828 unsigned long length, unsigned long tail, 1887 unsigned long length, unsigned long tail,
1829 struct buffer_page *tail_page, u64 *ts) 1888 struct buffer_page *tail_page, u64 ts)
1830{ 1889{
1831 struct buffer_page *commit_page = cpu_buffer->commit_page; 1890 struct buffer_page *commit_page = cpu_buffer->commit_page;
1832 struct ring_buffer *buffer = cpu_buffer->buffer; 1891 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1909 * Nested commits always have zero deltas, so 1968 * Nested commits always have zero deltas, so
1910 * just reread the time stamp 1969 * just reread the time stamp
1911 */ 1970 */
1912 *ts = rb_time_stamp(buffer); 1971 ts = rb_time_stamp(buffer);
1913 next_page->page->time_stamp = *ts; 1972 next_page->page->time_stamp = ts;
1914 } 1973 }
1915 1974
1916 out_again: 1975 out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1929 1988
1930static struct ring_buffer_event * 1989static struct ring_buffer_event *
1931__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1990__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1932 unsigned type, unsigned long length, u64 *ts) 1991 unsigned long length, u64 ts,
1992 u64 delta, int add_timestamp)
1933{ 1993{
1934 struct buffer_page *tail_page; 1994 struct buffer_page *tail_page;
1935 struct ring_buffer_event *event; 1995 struct ring_buffer_event *event;
1936 unsigned long tail, write; 1996 unsigned long tail, write;
1937 1997
1998 /*
1999 * If the time delta since the last event is too big to
2000 * hold in the time field of the event, then we append a
2001 * TIME EXTEND event ahead of the data event.
2002 */
2003 if (unlikely(add_timestamp))
2004 length += RB_LEN_TIME_EXTEND;
2005
1938 tail_page = cpu_buffer->tail_page; 2006 tail_page = cpu_buffer->tail_page;
1939 write = local_add_return(length, &tail_page->write); 2007 write = local_add_return(length, &tail_page->write);
1940 2008
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1943 tail = write - length; 2011 tail = write - length;
1944 2012
1945 /* See if we shot pass the end of this buffer page */ 2013 /* See if we shot pass the end of this buffer page */
1946 if (write > BUF_PAGE_SIZE) 2014 if (unlikely(write > BUF_PAGE_SIZE))
1947 return rb_move_tail(cpu_buffer, length, tail, 2015 return rb_move_tail(cpu_buffer, length, tail,
1948 tail_page, ts); 2016 tail_page, ts);
1949 2017
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1951 2019
1952 event = __rb_page_index(tail_page, tail); 2020 event = __rb_page_index(tail_page, tail);
1953 kmemcheck_annotate_bitfield(event, bitfield); 2021 kmemcheck_annotate_bitfield(event, bitfield);
1954 rb_update_event(event, type, length); 2022 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1955 2023
1956 /* The passed in type is zero for DATA */ 2024 local_inc(&tail_page->entries);
1957 if (likely(!type))
1958 local_inc(&tail_page->entries);
1959 2025
1960 /* 2026 /*
1961 * If this is the first commit on the page, then update 2027 * If this is the first commit on the page, then update
1962 * its timestamp. 2028 * its timestamp.
1963 */ 2029 */
1964 if (!tail) 2030 if (!tail)
1965 tail_page->page->time_stamp = *ts; 2031 tail_page->page->time_stamp = ts;
1966 2032
1967 return event; 2033 return event;
1968} 2034}
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1977 unsigned long addr; 2043 unsigned long addr;
1978 2044
1979 new_index = rb_event_index(event); 2045 new_index = rb_event_index(event);
1980 old_index = new_index + rb_event_length(event); 2046 old_index = new_index + rb_event_ts_length(event);
1981 addr = (unsigned long)event; 2047 addr = (unsigned long)event;
1982 addr &= PAGE_MASK; 2048 addr &= PAGE_MASK;
1983 2049
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2003 return 0; 2069 return 0;
2004} 2070}
2005 2071
2006static int
2007rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2008 u64 *ts, u64 *delta)
2009{
2010 struct ring_buffer_event *event;
2011 int ret;
2012
2013 WARN_ONCE(*delta > (1ULL << 59),
2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2015 (unsigned long long)*delta,
2016 (unsigned long long)*ts,
2017 (unsigned long long)cpu_buffer->write_stamp);
2018
2019 /*
2020 * The delta is too big, we to add a
2021 * new timestamp.
2022 */
2023 event = __rb_reserve_next(cpu_buffer,
2024 RINGBUF_TYPE_TIME_EXTEND,
2025 RB_LEN_TIME_EXTEND,
2026 ts);
2027 if (!event)
2028 return -EBUSY;
2029
2030 if (PTR_ERR(event) == -EAGAIN)
2031 return -EAGAIN;
2032
2033 /* Only a commited time event can update the write stamp */
2034 if (rb_event_is_commit(cpu_buffer, event)) {
2035 /*
2036 * If this is the first on the page, then it was
2037 * updated with the page itself. Try to discard it
2038 * and if we can't just make it zero.
2039 */
2040 if (rb_event_index(event)) {
2041 event->time_delta = *delta & TS_MASK;
2042 event->array[0] = *delta >> TS_SHIFT;
2043 } else {
2044 /* try to discard, since we do not need this */
2045 if (!rb_try_to_discard(cpu_buffer, event)) {
2046 /* nope, just zero it */
2047 event->time_delta = 0;
2048 event->array[0] = 0;
2049 }
2050 }
2051 cpu_buffer->write_stamp = *ts;
2052 /* let the caller know this was the commit */
2053 ret = 1;
2054 } else {
2055 /* Try to discard the event */
2056 if (!rb_try_to_discard(cpu_buffer, event)) {
2057 /* Darn, this is just wasted space */
2058 event->time_delta = 0;
2059 event->array[0] = 0;
2060 }
2061 ret = 0;
2062 }
2063
2064 *delta = 0;
2065
2066 return ret;
2067}
2068
2069static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2072static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2070{ 2073{
2071 local_inc(&cpu_buffer->committing); 2074 local_inc(&cpu_buffer->committing);
2072 local_inc(&cpu_buffer->commits); 2075 local_inc(&cpu_buffer->commits);
2073} 2076}
2074 2077
2075static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2078static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2076{ 2079{
2077 unsigned long commits; 2080 unsigned long commits;
2078 2081
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2110 unsigned long length) 2113 unsigned long length)
2111{ 2114{
2112 struct ring_buffer_event *event; 2115 struct ring_buffer_event *event;
2113 u64 ts, delta = 0; 2116 u64 ts, delta;
2114 int commit = 0;
2115 int nr_loops = 0; 2117 int nr_loops = 0;
2118 int add_timestamp;
2119 u64 diff;
2116 2120
2117 rb_start_commit(cpu_buffer); 2121 rb_start_commit(cpu_buffer);
2118 2122
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2133 2137
2134 length = rb_calculate_event_length(length); 2138 length = rb_calculate_event_length(length);
2135 again: 2139 again:
2140 add_timestamp = 0;
2141 delta = 0;
2142
2136 /* 2143 /*
2137 * We allow for interrupts to reenter here and do a trace. 2144 * We allow for interrupts to reenter here and do a trace.
2138 * If one does, it will cause this original code to loop 2145 * If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2146 goto out_fail; 2153 goto out_fail;
2147 2154
2148 ts = rb_time_stamp(cpu_buffer->buffer); 2155 ts = rb_time_stamp(cpu_buffer->buffer);
2156 diff = ts - cpu_buffer->write_stamp;
2149 2157
2150 /* 2158 /* make sure this diff is calculated here */
2151 * Only the first commit can update the timestamp. 2159 barrier();
2152 * Yes there is a race here. If an interrupt comes in
2153 * just after the conditional and it traces too, then it
2154 * will also check the deltas. More than one timestamp may
2155 * also be made. But only the entry that did the actual
2156 * commit will be something other than zero.
2157 */
2158 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2159 rb_page_write(cpu_buffer->tail_page) ==
2160 rb_commit_index(cpu_buffer))) {
2161 u64 diff;
2162
2163 diff = ts - cpu_buffer->write_stamp;
2164
2165 /* make sure this diff is calculated here */
2166 barrier();
2167
2168 /* Did the write stamp get updated already? */
2169 if (unlikely(ts < cpu_buffer->write_stamp))
2170 goto get_event;
2171 2160
2161 /* Did the write stamp get updated already? */
2162 if (likely(ts >= cpu_buffer->write_stamp)) {
2172 delta = diff; 2163 delta = diff;
2173 if (unlikely(test_time_stamp(delta))) { 2164 if (unlikely(test_time_stamp(delta))) {
2174 2165 WARN_ONCE(delta > (1ULL << 59),
2175 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2176 if (commit == -EBUSY) 2167 (unsigned long long)delta,
2177 goto out_fail; 2168 (unsigned long long)ts,
2178 2169 (unsigned long long)cpu_buffer->write_stamp);
2179 if (commit == -EAGAIN) 2170 add_timestamp = 1;
2180 goto again;
2181
2182 RB_WARN_ON(cpu_buffer, commit < 0);
2183 } 2171 }
2184 } 2172 }
2185 2173
2186 get_event: 2174 event = __rb_reserve_next(cpu_buffer, length, ts,
2187 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2175 delta, add_timestamp);
2188 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2176 if (unlikely(PTR_ERR(event) == -EAGAIN))
2189 goto again; 2177 goto again;
2190 2178
2191 if (!event) 2179 if (!event)
2192 goto out_fail; 2180 goto out_fail;
2193 2181
2194 if (!rb_event_is_commit(cpu_buffer, event))
2195 delta = 0;
2196
2197 event->time_delta = delta;
2198
2199 return event; 2182 return event;
2200 2183
2201 out_fail: 2184 out_fail:
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2207 2190
2208#define TRACE_RECURSIVE_DEPTH 16 2191#define TRACE_RECURSIVE_DEPTH 16
2209 2192
2210static int trace_recursive_lock(void) 2193/* Keep this code out of the fast path cache */
2194static noinline void trace_recursive_fail(void)
2211{ 2195{
2212 current->trace_recursion++;
2213
2214 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2215 return 0;
2216
2217 /* Disable all tracing before we do anything else */ 2196 /* Disable all tracing before we do anything else */
2218 tracing_off_permanent(); 2197 tracing_off_permanent();
2219 2198
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
2225 in_nmi()); 2204 in_nmi());
2226 2205
2227 WARN_ON_ONCE(1); 2206 WARN_ON_ONCE(1);
2207}
2208
2209static inline int trace_recursive_lock(void)
2210{
2211 current->trace_recursion++;
2212
2213 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2214 return 0;
2215
2216 trace_recursive_fail();
2217
2228 return -1; 2218 return -1;
2229} 2219}
2230 2220
2231static void trace_recursive_unlock(void) 2221static inline void trace_recursive_unlock(void)
2232{ 2222{
2233 WARN_ON_ONCE(!current->trace_recursion); 2223 WARN_ON_ONCE(!current->trace_recursion);
2234 2224
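
Splitting trace_recursive_lock() this way keeps the hot path to a counter increment and a bounds check that the compiler can inline, while the verbose reporting sits in a noinline function that stays out of the fast path's cache lines. The same split in a generic, user-space form (GCC/Clang attribute syntax; the depth limit and names are illustrative):

#include <stdio.h>

#define MAX_DEPTH 16

static __thread int recursion;     /* per-thread, like current->trace_recursion */

/* Cold path: never inlined, so the fast path stays tiny. */
static __attribute__((noinline)) void recursion_fail(void)
{
	fprintf(stderr, "recursion limit (%d) hit\n", MAX_DEPTH);
}

/* Hot path: cheap check, calls the cold helper only when things go wrong. */
static inline int recursion_lock(void)
{
	if (++recursion < MAX_DEPTH)
		return 0;
	recursion_fail();
	return -1;
}

static inline void recursion_unlock(void)
{
	recursion--;
}

int main(void)
{
	if (!recursion_lock()) {
		puts("in the guarded section");
		recursion_unlock();
	}
	return 0;
}
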
@@ -2242,8 +2232,6 @@ static void trace_recursive_unlock(void)
2242 2232
2243#endif 2233#endif
2244 2234
2245static DEFINE_PER_CPU(int, rb_need_resched);
2246
2247/** 2235/**
2248 * ring_buffer_lock_reserve - reserve a part of the buffer 2236 * ring_buffer_lock_reserve - reserve a part of the buffer
2249 * @buffer: the ring buffer to reserve from 2237 * @buffer: the ring buffer to reserve from
@@ -2264,13 +2252,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2264{ 2252{
2265 struct ring_buffer_per_cpu *cpu_buffer; 2253 struct ring_buffer_per_cpu *cpu_buffer;
2266 struct ring_buffer_event *event; 2254 struct ring_buffer_event *event;
2267 int cpu, resched; 2255 int cpu;
2268 2256
2269 if (ring_buffer_flags != RB_BUFFERS_ON) 2257 if (ring_buffer_flags != RB_BUFFERS_ON)
2270 return NULL; 2258 return NULL;
2271 2259
2272 /* If we are tracing schedule, we don't want to recurse */ 2260 /* If we are tracing schedule, we don't want to recurse */
2273 resched = ftrace_preempt_disable(); 2261 preempt_disable_notrace();
2274 2262
2275 if (atomic_read(&buffer->record_disabled)) 2263 if (atomic_read(&buffer->record_disabled))
2276 goto out_nocheck; 2264 goto out_nocheck;
@@ -2295,21 +2283,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2295 if (!event) 2283 if (!event)
2296 goto out; 2284 goto out;
2297 2285
2298 /*
2299 * Need to store resched state on this cpu.
2300 * Only the first needs to.
2301 */
2302
2303 if (preempt_count() == 1)
2304 per_cpu(rb_need_resched, cpu) = resched;
2305
2306 return event; 2286 return event;
2307 2287
2308 out: 2288 out:
2309 trace_recursive_unlock(); 2289 trace_recursive_unlock();
2310 2290
2311 out_nocheck: 2291 out_nocheck:
2312 ftrace_preempt_enable(resched); 2292 preempt_enable_notrace();
2313 return NULL; 2293 return NULL;
2314} 2294}
2315EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2295EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2318,12 +2298,28 @@ static void
2318rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2298rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2319 struct ring_buffer_event *event) 2299 struct ring_buffer_event *event)
2320{ 2300{
2301 u64 delta;
2302
2321 /* 2303 /*
2322 * The event first in the commit queue updates the 2304 * The event first in the commit queue updates the
2323 * time stamp. 2305 * time stamp.
2324 */ 2306 */
2325 if (rb_event_is_commit(cpu_buffer, event)) 2307 if (rb_event_is_commit(cpu_buffer, event)) {
2326 cpu_buffer->write_stamp += event->time_delta; 2308 /*
2309 * A commit event that is first on a page
2310 * updates the write timestamp with the page stamp
2311 */
2312 if (!rb_event_index(event))
2313 cpu_buffer->write_stamp =
2314 cpu_buffer->commit_page->page->time_stamp;
2315 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2316 delta = event->array[0];
2317 delta <<= TS_SHIFT;
2318 delta += event->time_delta;
2319 cpu_buffer->write_stamp += delta;
2320 } else
2321 cpu_buffer->write_stamp += event->time_delta;
2322 }
2327} 2323}
2328 2324
2329static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2325static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2355,13 +2351,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2355 2351
2356 trace_recursive_unlock(); 2352 trace_recursive_unlock();
2357 2353
2358 /* 2354 preempt_enable_notrace();
2359 * Only the last preempt count needs to restore preemption.
2360 */
2361 if (preempt_count() == 1)
2362 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2363 else
2364 preempt_enable_no_resched_notrace();
2365 2355
2366 return 0; 2356 return 0;
2367} 2357}
@@ -2369,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2369 2359
2370static inline void rb_event_discard(struct ring_buffer_event *event) 2360static inline void rb_event_discard(struct ring_buffer_event *event)
2371{ 2361{
2362 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2363 event = skip_time_extend(event);
2364
2372 /* array[0] holds the actual length for the discarded event */ 2365 /* array[0] holds the actual length for the discarded event */
2373 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2366 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2374 event->type_len = RINGBUF_TYPE_PADDING; 2367 event->type_len = RINGBUF_TYPE_PADDING;
@@ -2469,13 +2462,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
2469 2462
2470 trace_recursive_unlock(); 2463 trace_recursive_unlock();
2471 2464
2472 /* 2465 preempt_enable_notrace();
2473 * Only the last preempt count needs to restore preemption.
2474 */
2475 if (preempt_count() == 1)
2476 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2477 else
2478 preempt_enable_no_resched_notrace();
2479 2466
2480} 2467}
2481EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); 2468EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2501,12 +2488,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
2501 struct ring_buffer_event *event; 2488 struct ring_buffer_event *event;
2502 void *body; 2489 void *body;
2503 int ret = -EBUSY; 2490 int ret = -EBUSY;
2504 int cpu, resched; 2491 int cpu;
2505 2492
2506 if (ring_buffer_flags != RB_BUFFERS_ON) 2493 if (ring_buffer_flags != RB_BUFFERS_ON)
2507 return -EBUSY; 2494 return -EBUSY;
2508 2495
2509 resched = ftrace_preempt_disable(); 2496 preempt_disable_notrace();
2510 2497
2511 if (atomic_read(&buffer->record_disabled)) 2498 if (atomic_read(&buffer->record_disabled))
2512 goto out; 2499 goto out;
@@ -2536,7 +2523,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
2536 2523
2537 ret = 0; 2524 ret = 0;
2538 out: 2525 out:
2539 ftrace_preempt_enable(resched); 2526 preempt_enable_notrace();
2540 2527
2541 return ret; 2528 return ret;
2542} 2529}
@@ -2628,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2628} 2615}
2629EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); 2616EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2630 2617
2618/*
2619 * The total entries in the ring buffer is the running counter
2620 * of entries entered into the ring buffer, minus the sum of
2621 * the entries read from the ring buffer and the number of
2622 * entries that were overwritten.
2623 */
2624static inline unsigned long
2625rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2626{
2627 return local_read(&cpu_buffer->entries) -
2628 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
2629}
2630
2631/** 2631/**
2632 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2632 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2633 * @buffer: The ring buffer 2633 * @buffer: The ring buffer
@@ -2636,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2636unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 2636unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2637{ 2637{
2638 struct ring_buffer_per_cpu *cpu_buffer; 2638 struct ring_buffer_per_cpu *cpu_buffer;
2639 unsigned long ret;
2640 2639
2641 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2640 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2642 return 0; 2641 return 0;
2643 2642
2644 cpu_buffer = buffer->buffers[cpu]; 2643 cpu_buffer = buffer->buffers[cpu];
2645 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2646 - cpu_buffer->read;
2647 2644
2648 return ret; 2645 return rb_num_of_entries(cpu_buffer);
2649} 2646}
2650EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2647EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2651 2648
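
rb_num_of_entries() centralizes the accounting that ring_buffer_entries_cpu() and ring_buffer_entries() previously open-coded: live entries are everything written minus everything already read or lost to overwrite. The same arithmetic with plain counters standing in for the local_t fields:

#include <stdio.h>

/* Illustrative counters only; the kernel uses local_t for entries/overrun. */
struct cpu_buf_stats {
        unsigned long entries;   /* total events ever written  */
        unsigned long overrun;   /* events overwritten (lost)  */
        unsigned long read;      /* events already consumed    */
};

static unsigned long num_entries(const struct cpu_buf_stats *s)
{
        return s->entries - (s->overrun + s->read);
}

int main(void)
{
        struct cpu_buf_stats s = { .entries = 1000, .overrun = 40, .read = 900 };

        printf("entries still in the buffer: %lu\n", num_entries(&s)); /* 60 */
        return 0;
}
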
@@ -2706,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2706 /* if you care about this being correct, lock the buffer */ 2703 /* if you care about this being correct, lock the buffer */
2707 for_each_buffer_cpu(buffer, cpu) { 2704 for_each_buffer_cpu(buffer, cpu) {
2708 cpu_buffer = buffer->buffers[cpu]; 2705 cpu_buffer = buffer->buffers[cpu];
2709 entries += (local_read(&cpu_buffer->entries) - 2706 entries += rb_num_of_entries(cpu_buffer);
2710 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2711 } 2707 }
2712 2708
2713 return entries; 2709 return entries;
@@ -3007,13 +3003,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
3007 3003
3008static void rb_advance_iter(struct ring_buffer_iter *iter) 3004static void rb_advance_iter(struct ring_buffer_iter *iter)
3009{ 3005{
3010 struct ring_buffer *buffer;
3011 struct ring_buffer_per_cpu *cpu_buffer; 3006 struct ring_buffer_per_cpu *cpu_buffer;
3012 struct ring_buffer_event *event; 3007 struct ring_buffer_event *event;
3013 unsigned length; 3008 unsigned length;
3014 3009
3015 cpu_buffer = iter->cpu_buffer; 3010 cpu_buffer = iter->cpu_buffer;
3016 buffer = cpu_buffer->buffer;
3017 3011
3018 /* 3012 /*
3019 * Check if we are at the end of the buffer. 3013 * Check if we are at the end of the buffer.
@@ -3064,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3064 3058
3065 again: 3059 again:
3066 /* 3060 /*
3067 * We repeat when a timestamp is encountered. It is possible 3061 * We repeat when a time extend is encountered.
3068 * to get multiple timestamps from an interrupt entering just 3062 * Since the time extend is always attached to a data event,
3069 * as one timestamp is about to be written, or from discarded 3063 * we should never loop more than once.
3070 * commits. The most that we can have is the number on a single page. 3064 * (We never hit the following condition more than twice).
3071 */ 3065 */
3072 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3066 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3073 return NULL; 3067 return NULL;
3074 3068
3075 reader = rb_get_reader_page(cpu_buffer); 3069 reader = rb_get_reader_page(cpu_buffer);
@@ -3145,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3145 return NULL; 3139 return NULL;
3146 3140
3147 /* 3141 /*
3148 * We repeat when a timestamp is encountered. 3142 * We repeat when a time extend is encountered.
3149 * We can get multiple timestamps by nested interrupts or also 3143 * Since the time extend is always attached to a data event,
3150 * if filtering is on (discarding commits). Since discarding 3144 * we should never loop more than once.
3151 * commits can be frequent we can get a lot of timestamps. 3145 * (We never hit the following condition more than twice).
3152 * But we limit them by not adding timestamps if they begin
3153 * at the start of a page.
3154 */ 3146 */
3155 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3147 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3156 return NULL; 3148 return NULL;
3157 3149
3158 if (rb_per_cpu_empty(cpu_buffer)) 3150 if (rb_per_cpu_empty(cpu_buffer))
@@ -3850,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3850 if (len > (commit - read)) 3842 if (len > (commit - read))
3851 len = (commit - read); 3843 len = (commit - read);
3852 3844
3853 size = rb_event_length(event); 3845 /* Always keep the time extend and data together */
3846 size = rb_event_ts_length(event);
3854 3847
3855 if (len < size) 3848 if (len < size)
3856 goto out_unlock; 3849 goto out_unlock;
@@ -3868,8 +3861,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3868 rpos = reader->read; 3861 rpos = reader->read;
3869 pos += size; 3862 pos += size;
3870 3863
3864 if (rpos >= commit)
3865 break;
3866
3871 event = rb_reader_event(cpu_buffer); 3867 event = rb_reader_event(cpu_buffer);
3872 size = rb_event_length(event); 3868 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event);
3873 } while (len > size); 3870 } while (len > size);
3874 3871
3875 /* update bpage */ 3872 /* update bpage */
@@ -3986,6 +3983,7 @@ static const struct file_operations rb_simple_fops = {
3986 .open = tracing_open_generic, 3983 .open = tracing_open_generic,
3987 .read = rb_simple_read, 3984 .read = rb_simple_read,
3988 .write = rb_simple_write, 3985 .write = rb_simple_write,
3986 .llseek = default_llseek,
3989}; 3987};
3990 3988
3991 3989
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 086d36316805..82d9b8106cd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void)
101 preempt_enable(); 101 preempt_enable();
102} 102}
103 103
104static cpumask_var_t __read_mostly tracing_buffer_mask; 104cpumask_var_t __read_mostly tracing_buffer_mask;
105
106#define for_each_tracing_cpu(cpu) \
107 for_each_cpu(cpu, tracing_buffer_mask)
108 105
109/* 106/*
110 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 107 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344/* trace_flags holds trace_options default values */ 341/* trace_flags holds trace_options default values */
345unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 342unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
346 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 343 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
347 TRACE_ITER_GRAPH_TIME; 344 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
348 345
349static int trace_stop_count; 346static int trace_stop_count;
350static DEFINE_SPINLOCK(tracing_start_lock); 347static DEFINE_SPINLOCK(tracing_start_lock);
@@ -428,6 +425,7 @@ static const char *trace_options[] = {
428 "latency-format", 425 "latency-format",
429 "sleep-time", 426 "sleep-time",
430 "graph-time", 427 "graph-time",
428 "record-cmd",
431 NULL 429 NULL
432}; 430};
433 431
@@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
659 return; 657 return;
660 658
661 WARN_ON_ONCE(!irqs_disabled()); 659 WARN_ON_ONCE(!irqs_disabled());
660 if (!current_trace->use_max_tr) {
661 WARN_ON_ONCE(1);
662 return;
663 }
662 arch_spin_lock(&ftrace_max_lock); 664 arch_spin_lock(&ftrace_max_lock);
663 665
664 tr->buffer = max_tr.buffer; 666 tr->buffer = max_tr.buffer;
@@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
685 return; 687 return;
686 688
687 WARN_ON_ONCE(!irqs_disabled()); 689 WARN_ON_ONCE(!irqs_disabled());
690 if (!current_trace->use_max_tr) {
691 WARN_ON_ONCE(1);
692 return;
693 }
694
688 arch_spin_lock(&ftrace_max_lock); 695 arch_spin_lock(&ftrace_max_lock);
689 696
690 ftrace_disable_cpu(); 697 ftrace_disable_cpu();
@@ -729,18 +736,11 @@ __acquires(kernel_lock)
729 return -1; 736 return -1;
730 } 737 }
731 738
732 if (strlen(type->name) > MAX_TRACER_SIZE) { 739 if (strlen(type->name) >= MAX_TRACER_SIZE) {
733 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); 740 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
734 return -1; 741 return -1;
735 } 742 }
736 743
737 /*
738 * When this gets called we hold the BKL which means that
739 * preemption is disabled. Various trace selftests however
740 * need to disable and enable preemption for successful tests.
741 * So we drop the BKL here and grab it after the tests again.
742 */
743 unlock_kernel();
744 mutex_lock(&trace_types_lock); 744 mutex_lock(&trace_types_lock);
745 745
746 tracing_selftest_running = true; 746 tracing_selftest_running = true;
@@ -822,7 +822,6 @@ __acquires(kernel_lock)
822#endif 822#endif
823 823
824 out_unlock: 824 out_unlock:
825 lock_kernel();
826 return ret; 825 return ret;
827} 826}
828 827
@@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1331 1330
1332#endif /* CONFIG_STACKTRACE */ 1331#endif /* CONFIG_STACKTRACE */
1333 1332
1334static void
1335ftrace_trace_special(void *__tr,
1336 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1337 int pc)
1338{
1339 struct ftrace_event_call *call = &event_special;
1340 struct ring_buffer_event *event;
1341 struct trace_array *tr = __tr;
1342 struct ring_buffer *buffer = tr->buffer;
1343 struct special_entry *entry;
1344
1345 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1346 sizeof(*entry), 0, pc);
1347 if (!event)
1348 return;
1349 entry = ring_buffer_event_data(event);
1350 entry->arg1 = arg1;
1351 entry->arg2 = arg2;
1352 entry->arg3 = arg3;
1353
1354 if (!filter_check_discard(call, entry, buffer, event))
1355 trace_buffer_unlock_commit(buffer, event, 0, pc);
1356}
1357
1358void
1359__trace_special(void *__tr, void *__data,
1360 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1361{
1362 ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
1363}
1364
1365void
1366ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1367{
1368 struct trace_array *tr = &global_trace;
1369 struct trace_array_cpu *data;
1370 unsigned long flags;
1371 int cpu;
1372 int pc;
1373
1374 if (tracing_disabled)
1375 return;
1376
1377 pc = preempt_count();
1378 local_irq_save(flags);
1379 cpu = raw_smp_processor_id();
1380 data = tr->data[cpu];
1381
1382 if (likely(atomic_inc_return(&data->disabled) == 1))
1383 ftrace_trace_special(tr, arg1, arg2, arg3, pc);
1384
1385 atomic_dec(&data->disabled);
1386 local_irq_restore(flags);
1387}
1388
1389/** 1333/**
1390 * trace_vbprintk - write binary msg to tracing buffer 1334 * trace_vbprintk - write binary msg to tracing buffer
1391 * 1335 *
@@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1404 struct bprint_entry *entry; 1348 struct bprint_entry *entry;
1405 unsigned long flags; 1349 unsigned long flags;
1406 int disable; 1350 int disable;
1407 int resched;
1408 int cpu, len = 0, size, pc; 1351 int cpu, len = 0, size, pc;
1409 1352
1410 if (unlikely(tracing_selftest_running || tracing_disabled)) 1353 if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1414 pause_graph_tracing(); 1357 pause_graph_tracing();
1415 1358
1416 pc = preempt_count(); 1359 pc = preempt_count();
1417 resched = ftrace_preempt_disable(); 1360 preempt_disable_notrace();
1418 cpu = raw_smp_processor_id(); 1361 cpu = raw_smp_processor_id();
1419 data = tr->data[cpu]; 1362 data = tr->data[cpu];
1420 1363
@@ -1452,7 +1395,7 @@ out_unlock:
1452 1395
1453out: 1396out:
1454 atomic_dec_return(&data->disabled); 1397 atomic_dec_return(&data->disabled);
1455 ftrace_preempt_enable(resched); 1398 preempt_enable_notrace();
1456 unpause_graph_tracing(); 1399 unpause_graph_tracing();
1457 1400
1458 return len; 1401 return len;
@@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1539} 1482}
1540EXPORT_SYMBOL_GPL(trace_vprintk); 1483EXPORT_SYMBOL_GPL(trace_vprintk);
1541 1484
1542enum trace_file_type {
1543 TRACE_FILE_LAT_FMT = 1,
1544 TRACE_FILE_ANNOTATE = 2,
1545};
1546
1547static void trace_iterator_increment(struct trace_iterator *iter) 1485static void trace_iterator_increment(struct trace_iterator *iter)
1548{ 1486{
1549 /* Don't allow ftrace to trace into the ring buffers */ 1487 /* Don't allow ftrace to trace into the ring buffers */
@@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1641} 1579}
1642 1580
1643/* Find the next real entry, and increment the iterator to the next entry */ 1581/* Find the next real entry, and increment the iterator to the next entry */
1644static void *find_next_entry_inc(struct trace_iterator *iter) 1582void *trace_find_next_entry_inc(struct trace_iterator *iter)
1645{ 1583{
1646 iter->ent = __find_next_entry(iter, &iter->cpu, 1584 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts); 1585 &iter->lost_events, &iter->ts);
@@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1676 return NULL; 1614 return NULL;
1677 1615
1678 if (iter->idx < 0) 1616 if (iter->idx < 0)
1679 ent = find_next_entry_inc(iter); 1617 ent = trace_find_next_entry_inc(iter);
1680 else 1618 else
1681 ent = iter; 1619 ent = iter;
1682 1620
1683 while (ent && iter->idx < i) 1621 while (ent && iter->idx < i)
1684 ent = find_next_entry_inc(iter); 1622 ent = trace_find_next_entry_inc(iter);
1685 1623
1686 iter->pos = *pos; 1624 iter->pos = *pos;
1687 1625
1688 return ent; 1626 return ent;
1689} 1627}
1690 1628
1691static void tracing_iter_reset(struct trace_iterator *iter, int cpu) 1629void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1692{ 1630{
1693 struct trace_array *tr = iter->tr; 1631 struct trace_array *tr = iter->tr;
1694 struct ring_buffer_event *event; 1632 struct ring_buffer_event *event;
@@ -2049,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter)
2049} 1987}
2050 1988
2051/* Called with trace_event_read_lock() held. */ 1989/* Called with trace_event_read_lock() held. */
2052static enum print_line_t print_trace_line(struct trace_iterator *iter) 1990enum print_line_t print_trace_line(struct trace_iterator *iter)
2053{ 1991{
2054 enum print_line_t ret; 1992 enum print_line_t ret;
2055 1993
@@ -2258,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2258 2196
2259static int tracing_release(struct inode *inode, struct file *file) 2197static int tracing_release(struct inode *inode, struct file *file)
2260{ 2198{
2261 struct seq_file *m = (struct seq_file *)file->private_data; 2199 struct seq_file *m = file->private_data;
2262 struct trace_iterator *iter; 2200 struct trace_iterator *iter;
2263 int cpu; 2201 int cpu;
2264 2202
@@ -2394,6 +2332,7 @@ static const struct file_operations show_traces_fops = {
2394 .open = show_traces_open, 2332 .open = show_traces_open,
2395 .read = seq_read, 2333 .read = seq_read,
2396 .release = seq_release, 2334 .release = seq_release,
2335 .llseek = seq_lseek,
2397}; 2336};
2398 2337
2399/* 2338/*
@@ -2487,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = {
2487 .open = tracing_open_generic, 2426 .open = tracing_open_generic,
2488 .read = tracing_cpumask_read, 2427 .read = tracing_cpumask_read,
2489 .write = tracing_cpumask_write, 2428 .write = tracing_cpumask_write,
2429 .llseek = generic_file_llseek,
2490}; 2430};
2491 2431
2492static int tracing_trace_options_show(struct seq_file *m, void *v) 2432static int tracing_trace_options_show(struct seq_file *m, void *v)
@@ -2562,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2562 trace_flags |= mask; 2502 trace_flags |= mask;
2563 else 2503 else
2564 trace_flags &= ~mask; 2504 trace_flags &= ~mask;
2505
2506 if (mask == TRACE_ITER_RECORD_CMD)
2507 trace_event_enable_cmd_record(enabled);
2565} 2508}
2566 2509
2567static ssize_t 2510static ssize_t
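
set_tracer_flags() is a plain bit toggle with one side effect hooked onto the record-cmd bit. A minimal model of masking the flag word and reacting to that particular bit (the flag value matches the enum added in the trace.h hunk further down; the hook itself is mocked):

#include <stdio.h>
#include <stdbool.h>

#define TRACE_ITER_RECORD_CMD 0x100000UL   /* value from the trace.h hunk */

static unsigned long trace_flags;

static void enable_cmd_record(bool enable)  /* stand-in for the real hook */
{
        printf("cmdline recording %s\n", enable ? "on" : "off");
}

static void set_tracer_flags(unsigned long mask, int enabled)
{
        if (enabled)
                trace_flags |= mask;
        else
                trace_flags &= ~mask;

        if (mask == TRACE_ITER_RECORD_CMD)
                enable_cmd_record(enabled);
}

int main(void)
{
        set_tracer_flags(TRACE_ITER_RECORD_CMD, 1);
        set_tracer_flags(TRACE_ITER_RECORD_CMD, 0);
        printf("flags now: %#lx\n", trace_flags);
        return 0;
}
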
@@ -2653,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
2653static const struct file_operations tracing_readme_fops = { 2596static const struct file_operations tracing_readme_fops = {
2654 .open = tracing_open_generic, 2597 .open = tracing_open_generic,
2655 .read = tracing_readme_read, 2598 .read = tracing_readme_read,
2599 .llseek = generic_file_llseek,
2656}; 2600};
2657 2601
2658static ssize_t 2602static ssize_t
@@ -2703,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2703static const struct file_operations tracing_saved_cmdlines_fops = { 2647static const struct file_operations tracing_saved_cmdlines_fops = {
2704 .open = tracing_open_generic, 2648 .open = tracing_open_generic,
2705 .read = tracing_saved_cmdlines_read, 2649 .read = tracing_saved_cmdlines_read,
2650 .llseek = generic_file_llseek,
2706}; 2651};
2707 2652
2708static ssize_t 2653static ssize_t
@@ -2798,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
2798 if (ret < 0) 2743 if (ret < 0)
2799 return ret; 2744 return ret;
2800 2745
2746 if (!current_trace->use_max_tr)
2747 goto out;
2748
2801 ret = ring_buffer_resize(max_tr.buffer, size); 2749 ret = ring_buffer_resize(max_tr.buffer, size);
2802 if (ret < 0) { 2750 if (ret < 0) {
2803 int r; 2751 int r;
@@ -2825,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
2825 return ret; 2773 return ret;
2826 } 2774 }
2827 2775
2776 max_tr.entries = size;
2777 out:
2828 global_trace.entries = size; 2778 global_trace.entries = size;
2829 2779
2830 return ret; 2780 return ret;
2831} 2781}
2832 2782
2783
2833/** 2784/**
2834 * tracing_update_buffers - used by tracing facility to expand ring buffers 2785 * tracing_update_buffers - used by tracing facility to expand ring buffers
2835 * 2786 *
@@ -2890,12 +2841,26 @@ static int tracing_set_tracer(const char *buf)
2890 trace_branch_disable(); 2841 trace_branch_disable();
2891 if (current_trace && current_trace->reset) 2842 if (current_trace && current_trace->reset)
2892 current_trace->reset(tr); 2843 current_trace->reset(tr);
2893 2844 if (current_trace && current_trace->use_max_tr) {
2845 /*
2846 * We don't free the ring buffer. instead, resize it because
2847 * The max_tr ring buffer has some state (e.g. ring->clock) and
2848 * we want preserve it.
2849 */
2850 ring_buffer_resize(max_tr.buffer, 1);
2851 max_tr.entries = 1;
2852 }
2894 destroy_trace_option_files(topts); 2853 destroy_trace_option_files(topts);
2895 2854
2896 current_trace = t; 2855 current_trace = t;
2897 2856
2898 topts = create_trace_option_files(current_trace); 2857 topts = create_trace_option_files(current_trace);
2858 if (current_trace->use_max_tr) {
2859 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
2860 if (ret < 0)
2861 goto out;
2862 max_tr.entries = global_trace.entries;
2863 }
2899 2864
2900 if (t->init) { 2865 if (t->init) {
2901 ret = tracer_init(t, tr); 2866 ret = tracer_init(t, tr);
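
The use_max_tr logic keeps the snapshot buffer allocated but shrunk to a single entry while no latency tracer needs it, and resizes it back to match the live buffer when one does. A compact model of that switch, with simplified buffer objects in place of the real ring buffers:

#include <stdio.h>
#include <stdbool.h>

struct fake_buffer { unsigned long entries; };

static struct fake_buffer live_buf = { .entries = 4096 };
static struct fake_buffer max_buf  = { .entries = 1 };

static int resize(struct fake_buffer *b, unsigned long entries)
{
        b->entries = entries;   /* the real code calls ring_buffer_resize() */
        return 0;
}

/* Mimics the switch: old tracer may have used max_tr, new one may or may not. */
static void switch_tracer(bool old_uses_max, bool new_uses_max)
{
        if (old_uses_max)
                resize(&max_buf, 1);                 /* keep it allocated, just tiny */
        if (new_uses_max)
                resize(&max_buf, live_buf.entries);  /* mirror the live buffer size  */
}

int main(void)
{
        switch_tracer(true, false);
        printf("after switch to non-latency tracer: max=%lu\n", max_buf.entries);
        switch_tracer(false, true);
        printf("after switch to latency tracer:     max=%lu\n", max_buf.entries);
        return 0;
}
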
@@ -3032,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3032 if (iter->trace->pipe_open) 2997 if (iter->trace->pipe_open)
3033 iter->trace->pipe_open(iter); 2998 iter->trace->pipe_open(iter);
3034 2999
3000 nonseekable_open(inode, filp);
3035out: 3001out:
3036 mutex_unlock(&trace_types_lock); 3002 mutex_unlock(&trace_types_lock);
3037 return ret; 3003 return ret;
@@ -3211,7 +3177,7 @@ waitagain:
3211 3177
3212 trace_event_read_lock(); 3178 trace_event_read_lock();
3213 trace_access_lock(iter->cpu_file); 3179 trace_access_lock(iter->cpu_file);
3214 while (find_next_entry_inc(iter) != NULL) { 3180 while (trace_find_next_entry_inc(iter) != NULL) {
3215 enum print_line_t ret; 3181 enum print_line_t ret;
3216 int len = iter->seq.len; 3182 int len = iter->seq.len;
3217 3183
@@ -3294,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3294 if (ret != TRACE_TYPE_NO_CONSUME) 3260 if (ret != TRACE_TYPE_NO_CONSUME)
3295 trace_consume(iter); 3261 trace_consume(iter);
3296 rem -= count; 3262 rem -= count;
3297 if (!find_next_entry_inc(iter)) { 3263 if (!trace_find_next_entry_inc(iter)) {
3298 rem = 0; 3264 rem = 0;
3299 iter->ent = NULL; 3265 iter->ent = NULL;
3300 break; 3266 break;
@@ -3350,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3350 if (ret <= 0) 3316 if (ret <= 0)
3351 goto out_err; 3317 goto out_err;
3352 3318
3353 if (!iter->ent && !find_next_entry_inc(iter)) { 3319 if (!iter->ent && !trace_find_next_entry_inc(iter)) {
3354 ret = -EFAULT; 3320 ret = -EFAULT;
3355 goto out_err; 3321 goto out_err;
3356 } 3322 }
@@ -3477,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3477 } 3443 }
3478 3444
3479 tracing_start(); 3445 tracing_start();
3480 max_tr.entries = global_trace.entries;
3481 mutex_unlock(&trace_types_lock); 3446 mutex_unlock(&trace_types_lock);
3482 3447
3483 return cnt; 3448 return cnt;
@@ -3498,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3498 size_t cnt, loff_t *fpos) 3463 size_t cnt, loff_t *fpos)
3499{ 3464{
3500 char *buf; 3465 char *buf;
3466 size_t written;
3501 3467
3502 if (tracing_disabled) 3468 if (tracing_disabled)
3503 return -EINVAL; 3469 return -EINVAL;
@@ -3519,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3519 } else 3485 } else
3520 buf[cnt] = '\0'; 3486 buf[cnt] = '\0';
3521 3487
3522 cnt = mark_printk("%s", buf); 3488 written = mark_printk("%s", buf);
3523 kfree(buf); 3489 kfree(buf);
3524 *fpos += cnt; 3490 *fpos += written;
3525 3491
3526 return cnt; 3492 /* don't tell userspace we wrote more - it might confuse them */
3493 if (written > cnt)
3494 written = cnt;
3495
3496 return written;
3527} 3497}
3528 3498
3529static int tracing_clock_show(struct seq_file *m, void *v) 3499static int tracing_clock_show(struct seq_file *m, void *v)
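
The rewritten tail of tracing_mark_write() exists because a write() handler must never report more bytes than the caller handed in, even if the internal formatting consumed more. A small illustration of clamping the reported count (the mark_printk() stand-in here just pretends to add a prefix):

#include <stdio.h>
#include <string.h>

/* Stand-in for mark_printk(): returns how many bytes the backend consumed. */
static size_t fake_mark_printk(const char *buf)
{
        return strlen(buf) + 16;   /* pretend the backend added a prefix */
}

static size_t mark_write(const char *ubuf, size_t cnt)
{
        size_t written = fake_mark_printk(ubuf);

        /* don't tell userspace we wrote more - it might confuse them */
        if (written > cnt)
                written = cnt;
        return written;
}

int main(void)
{
        const char *msg = "hello ring buffer";

        printf("user asked for %zu, we report %zu\n",
               strlen(msg), mark_write(msg, strlen(msg)));
        return 0;
}
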
@@ -3590,18 +3560,21 @@ static const struct file_operations tracing_max_lat_fops = {
3590 .open = tracing_open_generic, 3560 .open = tracing_open_generic,
3591 .read = tracing_max_lat_read, 3561 .read = tracing_max_lat_read,
3592 .write = tracing_max_lat_write, 3562 .write = tracing_max_lat_write,
3563 .llseek = generic_file_llseek,
3593}; 3564};
3594 3565
3595static const struct file_operations tracing_ctrl_fops = { 3566static const struct file_operations tracing_ctrl_fops = {
3596 .open = tracing_open_generic, 3567 .open = tracing_open_generic,
3597 .read = tracing_ctrl_read, 3568 .read = tracing_ctrl_read,
3598 .write = tracing_ctrl_write, 3569 .write = tracing_ctrl_write,
3570 .llseek = generic_file_llseek,
3599}; 3571};
3600 3572
3601static const struct file_operations set_tracer_fops = { 3573static const struct file_operations set_tracer_fops = {
3602 .open = tracing_open_generic, 3574 .open = tracing_open_generic,
3603 .read = tracing_set_trace_read, 3575 .read = tracing_set_trace_read,
3604 .write = tracing_set_trace_write, 3576 .write = tracing_set_trace_write,
3577 .llseek = generic_file_llseek,
3605}; 3578};
3606 3579
3607static const struct file_operations tracing_pipe_fops = { 3580static const struct file_operations tracing_pipe_fops = {
@@ -3610,17 +3583,20 @@ static const struct file_operations tracing_pipe_fops = {
3610 .read = tracing_read_pipe, 3583 .read = tracing_read_pipe,
3611 .splice_read = tracing_splice_read_pipe, 3584 .splice_read = tracing_splice_read_pipe,
3612 .release = tracing_release_pipe, 3585 .release = tracing_release_pipe,
3586 .llseek = no_llseek,
3613}; 3587};
3614 3588
3615static const struct file_operations tracing_entries_fops = { 3589static const struct file_operations tracing_entries_fops = {
3616 .open = tracing_open_generic, 3590 .open = tracing_open_generic,
3617 .read = tracing_entries_read, 3591 .read = tracing_entries_read,
3618 .write = tracing_entries_write, 3592 .write = tracing_entries_write,
3593 .llseek = generic_file_llseek,
3619}; 3594};
3620 3595
3621static const struct file_operations tracing_mark_fops = { 3596static const struct file_operations tracing_mark_fops = {
3622 .open = tracing_open_generic, 3597 .open = tracing_open_generic,
3623 .write = tracing_mark_write, 3598 .write = tracing_mark_write,
3599 .llseek = generic_file_llseek,
3624}; 3600};
3625 3601
3626static const struct file_operations trace_clock_fops = { 3602static const struct file_operations trace_clock_fops = {
@@ -3926,6 +3902,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3926static const struct file_operations tracing_stats_fops = { 3902static const struct file_operations tracing_stats_fops = {
3927 .open = tracing_open_generic, 3903 .open = tracing_open_generic,
3928 .read = tracing_stats_read, 3904 .read = tracing_stats_read,
3905 .llseek = generic_file_llseek,
3929}; 3906};
3930 3907
3931#ifdef CONFIG_DYNAMIC_FTRACE 3908#ifdef CONFIG_DYNAMIC_FTRACE
@@ -3962,6 +3939,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
3962static const struct file_operations tracing_dyn_info_fops = { 3939static const struct file_operations tracing_dyn_info_fops = {
3963 .open = tracing_open_generic, 3940 .open = tracing_open_generic,
3964 .read = tracing_read_dyn_info, 3941 .read = tracing_read_dyn_info,
3942 .llseek = generic_file_llseek,
3965}; 3943};
3966#endif 3944#endif
3967 3945
@@ -4018,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu)
4018{ 3996{
4019 struct dentry *d_percpu = tracing_dentry_percpu(); 3997 struct dentry *d_percpu = tracing_dentry_percpu();
4020 struct dentry *d_cpu; 3998 struct dentry *d_cpu;
4021 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 3999 char cpu_dir[30]; /* 30 characters should be more than enough */
4022 char cpu_dir[7];
4023 4000
4024 if (cpu > 999 || cpu < 0) 4001 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4025 return;
4026
4027 sprintf(cpu_dir, "cpu%ld", cpu);
4028 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4002 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4029 if (!d_cpu) { 4003 if (!d_cpu) {
4030 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 4004 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
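
The old code sized cpu_dir for at most three digits and rejected larger CPU numbers; the replacement over-sizes the buffer and relies on snprintf() truncating safely. A minimal demonstration of the safer pattern (directory creation is mocked with a printf):

#include <stdio.h>

static void make_cpu_dir(long cpu)
{
        char cpu_dir[30];   /* 30 characters should be more than enough */

        /* snprintf() can never overrun the buffer, unlike sprintf(). */
        snprintf(cpu_dir, sizeof(cpu_dir), "cpu%ld", cpu);
        printf("would create debugfs dir '%s'\n", cpu_dir);
}

int main(void)
{
        make_cpu_dir(3);
        make_cpu_dir(123456);   /* no longer rejected, still safe */
        return 0;
}
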
@@ -4115,6 +4089,7 @@ static const struct file_operations trace_options_fops = {
4115 .open = tracing_open_generic, 4089 .open = tracing_open_generic,
4116 .read = trace_options_read, 4090 .read = trace_options_read,
4117 .write = trace_options_write, 4091 .write = trace_options_write,
4092 .llseek = generic_file_llseek,
4118}; 4093};
4119 4094
4120static ssize_t 4095static ssize_t
@@ -4166,6 +4141,7 @@ static const struct file_operations trace_options_core_fops = {
4166 .open = tracing_open_generic, 4141 .open = tracing_open_generic,
4167 .read = trace_options_core_read, 4142 .read = trace_options_core_read,
4168 .write = trace_options_core_write, 4143 .write = trace_options_core_write,
4144 .llseek = generic_file_llseek,
4169}; 4145};
4170 4146
4171struct dentry *trace_create_file(const char *name, 4147struct dentry *trace_create_file(const char *name,
@@ -4355,9 +4331,6 @@ static __init int tracer_init_debugfs(void)
4355 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4331 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4356 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4332 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4357#endif 4333#endif
4358#ifdef CONFIG_SYSPROF_TRACER
4359 init_tracer_sysprof_debugfs(d_tracer);
4360#endif
4361 4334
4362 create_trace_options_dir(); 4335 create_trace_options_dir();
4363 4336
@@ -4414,7 +4387,7 @@ static struct notifier_block trace_die_notifier = {
4414 */ 4387 */
4415#define KERN_TRACE KERN_EMERG 4388#define KERN_TRACE KERN_EMERG
4416 4389
4417static void 4390void
4418trace_printk_seq(struct trace_seq *s) 4391trace_printk_seq(struct trace_seq *s)
4419{ 4392{
4420 /* Probably should print a warning here. */ 4393 /* Probably should print a warning here. */
@@ -4429,6 +4402,13 @@ trace_printk_seq(struct trace_seq *s)
4429 trace_seq_init(s); 4402 trace_seq_init(s);
4430} 4403}
4431 4404
4405void trace_init_global_iter(struct trace_iterator *iter)
4406{
4407 iter->tr = &global_trace;
4408 iter->trace = current_trace;
4409 iter->cpu_file = TRACE_PIPE_ALL_CPU;
4410}
4411
4432static void 4412static void
4433__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) 4413__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4434{ 4414{
@@ -4454,8 +4434,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4454 if (disable_tracing) 4434 if (disable_tracing)
4455 ftrace_kill(); 4435 ftrace_kill();
4456 4436
4437 trace_init_global_iter(&iter);
4438
4457 for_each_tracing_cpu(cpu) { 4439 for_each_tracing_cpu(cpu) {
4458 atomic_inc(&global_trace.data[cpu]->disabled); 4440 atomic_inc(&iter.tr->data[cpu]->disabled);
4459 } 4441 }
4460 4442
4461 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 4443 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4504,7 +4486,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4504 iter.iter_flags |= TRACE_FILE_LAT_FMT; 4486 iter.iter_flags |= TRACE_FILE_LAT_FMT;
4505 iter.pos = -1; 4487 iter.pos = -1;
4506 4488
4507 if (find_next_entry_inc(&iter) != NULL) { 4489 if (trace_find_next_entry_inc(&iter) != NULL) {
4508 int ret; 4490 int ret;
4509 4491
4510 ret = print_trace_line(&iter); 4492 ret = print_trace_line(&iter);
@@ -4526,7 +4508,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4526 trace_flags |= old_userobj; 4508 trace_flags |= old_userobj;
4527 4509
4528 for_each_tracing_cpu(cpu) { 4510 for_each_tracing_cpu(cpu) {
4529 atomic_dec(&global_trace.data[cpu]->disabled); 4511 atomic_dec(&iter.tr->data[cpu]->disabled);
4530 } 4512 }
4531 tracing_on(); 4513 tracing_on();
4532 } 4514 }
@@ -4575,16 +4557,14 @@ __init static int tracer_alloc_buffers(void)
4575 4557
4576 4558
4577#ifdef CONFIG_TRACER_MAX_TRACE 4559#ifdef CONFIG_TRACER_MAX_TRACE
4578 max_tr.buffer = ring_buffer_alloc(ring_buf_size, 4560 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
4579 TRACE_BUFFER_FLAGS);
4580 if (!max_tr.buffer) { 4561 if (!max_tr.buffer) {
4581 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4562 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4582 WARN_ON(1); 4563 WARN_ON(1);
4583 ring_buffer_free(global_trace.buffer); 4564 ring_buffer_free(global_trace.buffer);
4584 goto out_free_cpumask; 4565 goto out_free_cpumask;
4585 } 4566 }
4586 max_tr.entries = ring_buffer_size(max_tr.buffer); 4567 max_tr.entries = 1;
4587 WARN_ON(max_tr.entries != global_trace.entries);
4588#endif 4568#endif
4589 4569
4590 /* Allocate the first page for all buffers */ 4570 /* Allocate the first page for all buffers */
@@ -4597,9 +4577,6 @@ __init static int tracer_alloc_buffers(void)
4597 4577
4598 register_tracer(&nop_trace); 4578 register_tracer(&nop_trace);
4599 current_trace = &nop_trace; 4579 current_trace = &nop_trace;
4600#ifdef CONFIG_BOOT_TRACER
4601 register_tracer(&boot_tracer);
4602#endif
4603 /* All seems OK, enable tracing */ 4580 /* All seems OK, enable tracing */
4604 tracing_disabled = 0; 4581 tracing_disabled = 0;
4605 4582
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..9021f8c0c0c3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,10 +9,7 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h> 10#include <linux/tracepoint.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h>
13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h> 12#include <linux/hw_breakpoint.h>
15
16#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
17#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
18 15
@@ -25,30 +22,17 @@ enum trace_type {
25 TRACE_STACK, 22 TRACE_STACK,
26 TRACE_PRINT, 23 TRACE_PRINT,
27 TRACE_BPRINT, 24 TRACE_BPRINT,
28 TRACE_SPECIAL,
29 TRACE_MMIO_RW, 25 TRACE_MMIO_RW,
30 TRACE_MMIO_MAP, 26 TRACE_MMIO_MAP,
31 TRACE_BRANCH, 27 TRACE_BRANCH,
32 TRACE_BOOT_CALL,
33 TRACE_BOOT_RET,
34 TRACE_GRAPH_RET, 28 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 29 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 30 TRACE_USER_STACK,
37 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE,
39 TRACE_BLK, 31 TRACE_BLK,
40 TRACE_KSYM,
41 32
42 __TRACE_LAST_TYPE, 33 __TRACE_LAST_TYPE,
43}; 34};
44 35
45enum kmemtrace_type_id {
46 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
47 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
48 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
49};
50
51extern struct tracer boot_tracer;
52 36
53#undef __field 37#undef __field
54#define __field(type, item) type item; 38#define __field(type, item) type item;
@@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void);
204 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 188 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
205 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 189 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
206 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 190 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
207 IF_ASSIGN(var, ent, struct special_entry, 0); \
208 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 191 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
209 TRACE_MMIO_RW); \ 192 TRACE_MMIO_RW); \
210 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 193 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
211 TRACE_MMIO_MAP); \ 194 TRACE_MMIO_MAP); \
212 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
213 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
214 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ 195 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
215 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ 196 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
216 TRACE_GRAPH_ENT); \ 197 TRACE_GRAPH_ENT); \
217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 198 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
218 TRACE_GRAPH_RET); \ 199 TRACE_GRAPH_RET); \
219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
220 TRACE_KMEM_ALLOC); \
221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
222 TRACE_KMEM_FREE); \
223 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
224 __ftrace_bad_type(); \ 200 __ftrace_bad_type(); \
225 } while (0) 201 } while (0)
226 202
@@ -298,6 +274,7 @@ struct tracer {
298 struct tracer *next; 274 struct tracer *next;
299 int print_max; 275 int print_max;
300 struct tracer_flags *flags; 276 struct tracer_flags *flags;
277 int use_max_tr;
301}; 278};
302 279
303 280
@@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name,
318 const struct file_operations *fops); 295 const struct file_operations *fops);
319 296
320struct dentry *tracing_init_dentry(void); 297struct dentry *tracing_init_dentry(void);
321void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
322 298
323struct ring_buffer_event; 299struct ring_buffer_event;
324 300
@@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
338struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 314struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
339 int *ent_cpu, u64 *ent_ts); 315 int *ent_cpu, u64 *ent_ts);
340 316
317int trace_empty(struct trace_iterator *iter);
318
319void *trace_find_next_entry_inc(struct trace_iterator *iter);
320
321void trace_init_global_iter(struct trace_iterator *iter);
322
323void tracing_iter_reset(struct trace_iterator *iter, int cpu);
324
341void default_wait_pipe(struct trace_iterator *iter); 325void default_wait_pipe(struct trace_iterator *iter);
342void poll_wait_pipe(struct trace_iterator *iter); 326void poll_wait_pipe(struct trace_iterator *iter);
343 327
@@ -355,15 +339,14 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
355 struct task_struct *wakee, 339 struct task_struct *wakee,
356 struct task_struct *cur, 340 struct task_struct *cur,
357 unsigned long flags, int pc); 341 unsigned long flags, int pc);
358void trace_special(struct trace_array *tr,
359 struct trace_array_cpu *data,
360 unsigned long arg1,
361 unsigned long arg2,
362 unsigned long arg3, int pc);
363void trace_function(struct trace_array *tr, 342void trace_function(struct trace_array *tr,
364 unsigned long ip, 343 unsigned long ip,
365 unsigned long parent_ip, 344 unsigned long parent_ip,
366 unsigned long flags, int pc); 345 unsigned long flags, int pc);
346void trace_graph_function(struct trace_array *tr,
347 unsigned long ip,
348 unsigned long parent_ip,
349 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m); 350void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 351void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter); 352int trace_empty(struct trace_iterator *iter);
@@ -380,8 +363,15 @@ void tracing_start_sched_switch_record(void);
380int register_tracer(struct tracer *type); 363int register_tracer(struct tracer *type);
381void unregister_tracer(struct tracer *type); 364void unregister_tracer(struct tracer *type);
382int is_tracing_stopped(void); 365int is_tracing_stopped(void);
366enum trace_file_type {
367 TRACE_FILE_LAT_FMT = 1,
368 TRACE_FILE_ANNOTATE = 2,
369};
370
371extern cpumask_var_t __read_mostly tracing_buffer_mask;
383 372
384extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); 373#define for_each_tracing_cpu(cpu) \
374 for_each_cpu(cpu, tracing_buffer_mask)
385 375
386extern unsigned long nsecs_to_usecs(unsigned long nsecs); 376extern unsigned long nsecs_to_usecs(unsigned long nsecs);
387 377
@@ -452,12 +442,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
452 struct trace_array *tr); 442 struct trace_array *tr);
453extern int trace_selftest_startup_sched_switch(struct tracer *trace, 443extern int trace_selftest_startup_sched_switch(struct tracer *trace,
454 struct trace_array *tr); 444 struct trace_array *tr);
455extern int trace_selftest_startup_sysprof(struct tracer *trace,
456 struct trace_array *tr);
457extern int trace_selftest_startup_branch(struct tracer *trace, 445extern int trace_selftest_startup_branch(struct tracer *trace,
458 struct trace_array *tr); 446 struct trace_array *tr);
459extern int trace_selftest_startup_ksym(struct tracer *trace,
460 struct trace_array *tr);
461#endif /* CONFIG_FTRACE_STARTUP_TEST */ 447#endif /* CONFIG_FTRACE_STARTUP_TEST */
462 448
463extern void *head_page(struct trace_array_cpu *data); 449extern void *head_page(struct trace_array_cpu *data);
@@ -471,6 +457,8 @@ trace_array_vprintk(struct trace_array *tr,
471 unsigned long ip, const char *fmt, va_list args); 457 unsigned long ip, const char *fmt, va_list args);
472int trace_array_printk(struct trace_array *tr, 458int trace_array_printk(struct trace_array *tr,
473 unsigned long ip, const char *fmt, ...); 459 unsigned long ip, const char *fmt, ...);
460void trace_printk_seq(struct trace_seq *s);
461enum print_line_t print_trace_line(struct trace_iterator *iter);
474 462
475extern unsigned long trace_flags; 463extern unsigned long trace_flags;
476 464
@@ -617,6 +605,7 @@ enum trace_iterator_flags {
617 TRACE_ITER_LATENCY_FMT = 0x20000, 605 TRACE_ITER_LATENCY_FMT = 0x20000,
618 TRACE_ITER_SLEEP_TIME = 0x40000, 606 TRACE_ITER_SLEEP_TIME = 0x40000,
619 TRACE_ITER_GRAPH_TIME = 0x80000, 607 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000,
620}; 609};
621 610
622/* 611/*
@@ -628,54 +617,6 @@ enum trace_iterator_flags {
628 617
629extern struct tracer nop_trace; 618extern struct tracer nop_trace;
630 619
631/**
632 * ftrace_preempt_disable - disable preemption scheduler safe
633 *
634 * When tracing can happen inside the scheduler, there exists
635 * cases that the tracing might happen before the need_resched
636 * flag is checked. If this happens and the tracer calls
637 * preempt_enable (after a disable), a schedule might take place
638 * causing an infinite recursion.
639 *
640 * To prevent this, we read the need_resched flag before
641 * disabling preemption. When we want to enable preemption we
642 * check the flag, if it is set, then we call preempt_enable_no_resched.
643 * Otherwise, we call preempt_enable.
644 *
645 * The rational for doing the above is that if need_resched is set
646 * and we have yet to reschedule, we are either in an atomic location
647 * (where we do not need to check for scheduling) or we are inside
648 * the scheduler and do not want to resched.
649 */
650static inline int ftrace_preempt_disable(void)
651{
652 int resched;
653
654 resched = need_resched();
655 preempt_disable_notrace();
656
657 return resched;
658}
659
660/**
661 * ftrace_preempt_enable - enable preemption scheduler safe
662 * @resched: the return value from ftrace_preempt_disable
663 *
664 * This is a scheduler safe way to enable preemption and not miss
665 * any preemption checks. The disabled saved the state of preemption.
666 * If resched is set, then we are either inside an atomic or
667 * are inside the scheduler (we would have already scheduled
668 * otherwise). In this case, we do not want to call normal
669 * preempt_enable, but preempt_enable_no_resched instead.
670 */
671static inline void ftrace_preempt_enable(int resched)
672{
673 if (resched)
674 preempt_enable_no_resched_notrace();
675 else
676 preempt_enable_notrace();
677}
678
679#ifdef CONFIG_BRANCH_TRACER 620#ifdef CONFIG_BRANCH_TRACER
680extern int enable_branch_tracing(struct trace_array *tr); 621extern int enable_branch_tracing(struct trace_array *tr);
681extern void disable_branch_tracing(void); 622extern void disable_branch_tracing(void);
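
The ftrace_preempt_disable()/ftrace_preempt_enable() pair deleted above made every caller carry the saved need_resched state through its fast path; after this series each pair collapses to plain preempt_disable_notrace()/preempt_enable_notrace(). A userspace caricature of the old calling pattern next to the new one (the preempt primitives are mocked; only the control flow is the point):

#include <stdio.h>
#include <stdbool.h>

/* Mocked primitives - stand-ins for the kernel's notrace preempt API. */
static bool need_resched_flag;
static bool need_resched(void)                       { return need_resched_flag; }
static void preempt_disable_notrace(void)            { puts("preempt off"); }
static void preempt_enable_notrace(void)             { puts("preempt on"); }
static void preempt_enable_no_resched_notrace(void)  { puts("preempt on, no resched"); }

/* Old pattern: the caller threads the saved need_resched state around. */
static int old_disable(void)
{
        int resched = need_resched();

        preempt_disable_notrace();
        return resched;
}

static void old_enable(int resched)
{
        if (resched)
                preempt_enable_no_resched_notrace();
        else
                preempt_enable_notrace();
}

int main(void)
{
        /* old style: extra state carried through every trace path */
        int resched = old_disable();
        old_enable(resched);

        /* new style after the patch: no saved state at all */
        preempt_disable_notrace();
        preempt_enable_notrace();
        return 0;
}
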
@@ -766,6 +707,8 @@ struct filter_pred {
766 int pop_n; 707 int pop_n;
767}; 708};
768 709
710extern struct list_head ftrace_common_fields;
711
769extern enum regex_type 712extern enum regex_type
770filter_parse_regex(char *buff, int len, char **search, int *not); 713filter_parse_regex(char *buff, int len, char **search, int *not);
771extern void print_event_filter(struct ftrace_event_call *call, 714extern void print_event_filter(struct ftrace_event_call *call,
@@ -795,6 +738,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
795 return 0; 738 return 0;
796} 739}
797 740
741extern void trace_event_enable_cmd_record(bool enable);
742
798extern struct mutex event_mutex; 743extern struct mutex event_mutex;
799extern struct list_head ftrace_events; 744extern struct list_head ftrace_events;
800 745
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
deleted file mode 100644
index c21d5f3956ad..000000000000
--- a/kernel/trace/trace_boot.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/*
2 * ring buffer based initcalls tracer
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8#include <linux/init.h>
9#include <linux/debugfs.h>
10#include <linux/ftrace.h>
11#include <linux/kallsyms.h>
12#include <linux/time.h>
13
14#include "trace.h"
15#include "trace_output.h"
16
17static struct trace_array *boot_trace;
18static bool pre_initcalls_finished;
19
20/* Tells the boot tracer that the pre_smp_initcalls are finished.
21 * So we are ready .
22 * It doesn't enable sched events tracing however.
23 * You have to call enable_boot_trace to do so.
24 */
25void start_boot_trace(void)
26{
27 pre_initcalls_finished = true;
28}
29
30void enable_boot_trace(void)
31{
32 if (boot_trace && pre_initcalls_finished)
33 tracing_start_sched_switch_record();
34}
35
36void disable_boot_trace(void)
37{
38 if (boot_trace && pre_initcalls_finished)
39 tracing_stop_sched_switch_record();
40}
41
42static int boot_trace_init(struct trace_array *tr)
43{
44 boot_trace = tr;
45
46 if (!tr)
47 return 0;
48
49 tracing_reset_online_cpus(tr);
50
51 tracing_sched_switch_assign_trace(tr);
52 return 0;
53}
54
55static enum print_line_t
56initcall_call_print_line(struct trace_iterator *iter)
57{
58 struct trace_entry *entry = iter->ent;
59 struct trace_seq *s = &iter->seq;
60 struct trace_boot_call *field;
61 struct boot_trace_call *call;
62 u64 ts;
63 unsigned long nsec_rem;
64 int ret;
65
66 trace_assign_type(field, entry);
67 call = &field->boot_call;
68 ts = iter->ts;
69 nsec_rem = do_div(ts, NSEC_PER_SEC);
70
71 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
72 (unsigned long)ts, nsec_rem, call->func, call->caller);
73
74 if (!ret)
75 return TRACE_TYPE_PARTIAL_LINE;
76 else
77 return TRACE_TYPE_HANDLED;
78}
79
80static enum print_line_t
81initcall_ret_print_line(struct trace_iterator *iter)
82{
83 struct trace_entry *entry = iter->ent;
84 struct trace_seq *s = &iter->seq;
85 struct trace_boot_ret *field;
86 struct boot_trace_ret *init_ret;
87 u64 ts;
88 unsigned long nsec_rem;
89 int ret;
90
91 trace_assign_type(field, entry);
92 init_ret = &field->boot_ret;
93 ts = iter->ts;
94 nsec_rem = do_div(ts, NSEC_PER_SEC);
95
96 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
97 "returned %d after %llu msecs\n",
98 (unsigned long) ts,
99 nsec_rem,
100 init_ret->func, init_ret->result, init_ret->duration);
101
102 if (!ret)
103 return TRACE_TYPE_PARTIAL_LINE;
104 else
105 return TRACE_TYPE_HANDLED;
106}
107
108static enum print_line_t initcall_print_line(struct trace_iterator *iter)
109{
110 struct trace_entry *entry = iter->ent;
111
112 switch (entry->type) {
113 case TRACE_BOOT_CALL:
114 return initcall_call_print_line(iter);
115 case TRACE_BOOT_RET:
116 return initcall_ret_print_line(iter);
117 default:
118 return TRACE_TYPE_UNHANDLED;
119 }
120}
121
122struct tracer boot_tracer __read_mostly =
123{
124 .name = "initcall",
125 .init = boot_trace_init,
126 .reset = tracing_reset_online_cpus,
127 .print_line = initcall_print_line,
128};
129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{
132 struct ftrace_event_call *call = &event_boot_call;
133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace;
137
138 if (!tr || !pre_initcalls_finished)
139 return;
140
141 /* Get its name now since this function could
142 * disappear because it is in the .init section.
143 */
144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable();
146
147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
149 sizeof(*entry), 0, 0);
150 if (!event)
151 goto out;
152 entry = ring_buffer_event_data(event);
153 entry->boot_call = *bt;
154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
156 out:
157 preempt_enable();
158}
159
160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
161{
162 struct ftrace_event_call *call = &event_boot_ret;
163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
165 struct trace_boot_ret *entry;
166 struct trace_array *tr = boot_trace;
167
168 if (!tr || !pre_initcalls_finished)
169 return;
170
171 sprint_symbol(bt->func, (unsigned long)fn);
172 preempt_disable();
173
174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
176 sizeof(*entry), 0, 0);
177 if (!event)
178 goto out;
179 entry = ring_buffer_event_data(event);
180 entry->boot_ret = *bt;
181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
183 out:
184 preempt_enable();
185}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..685a67d55db0 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -32,16 +32,15 @@
32u64 notrace trace_clock_local(void) 32u64 notrace trace_clock_local(void)
33{ 33{
34 u64 clock; 34 u64 clock;
35 int resched;
36 35
37 /* 36 /*
38 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
39 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
40 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
41 */ 40 */
42 resched = ftrace_preempt_disable(); 41 preempt_disable_notrace();
43 clock = sched_clock(); 42 clock = sched_clock();
44 ftrace_preempt_enable(resched); 43 preempt_enable_notrace();
45 44
46 return clock; 45 return clock;
47} 46}
@@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void)
56 */ 55 */
57u64 notrace trace_clock(void) 56u64 notrace trace_clock(void)
58{ 57{
59 return cpu_clock(raw_smp_processor_id()); 58 return local_clock();
60} 59}
61 60
62 61
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index dc008c1240da..e3dfecaf13e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
151); 151);
152 152
153/* 153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry: 154 * Stack-trace entry:
172 */ 155 */
173 156
@@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
271 __entry->map_id, __entry->opcode) 254 __entry->map_id, __entry->opcode)
272); 255);
273 256
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301 257
302#define TRACE_FUNC_SIZE 30 258#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20 259#define TRACE_FILE_SIZE 20
@@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 274 __entry->func, __entry->file, __entry->correct)
319); 275);
320 276
321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
322
323 TRACE_KMEM_ALLOC,
324
325 F_STRUCT(
326 __field( enum kmemtrace_type_id, type_id )
327 __field( unsigned long, call_site )
328 __field( const void *, ptr )
329 __field( size_t, bytes_req )
330 __field( size_t, bytes_alloc )
331 __field( gfp_t, gfp_flags )
332 __field( int, node )
333 ),
334
335 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
336 " flags:%x node:%d",
337 __entry->type_id, __entry->call_site, __entry->ptr,
338 __entry->bytes_req, __entry->bytes_alloc,
339 __entry->gfp_flags, __entry->node)
340);
341
342FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
343
344 TRACE_KMEM_FREE,
345
346 F_STRUCT(
347 __field( enum kmemtrace_type_id, type_id )
348 __field( unsigned long, call_site )
349 __field( const void *, ptr )
350 ),
351
352 F_printk("type:%u call_site:%lx ptr:%p",
353 __entry->type_id, __entry->call_site, __entry->ptr)
354);
355
356FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
357
358 TRACE_KSYM,
359
360 F_STRUCT(
361 __field( unsigned long, ip )
362 __field( unsigned char, type )
363 __array( char , cmd, TASK_COMM_LEN )
364 __field( unsigned long, addr )
365 ),
366
367 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
368 (void *)__entry->ip, (unsigned int)__entry->type,
369 (void *)__entry->addr, __entry->cmd)
370);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 8a2b73f7c068..39c059ca670e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,9 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); 12static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
13
14static char *perf_trace_buf[4];
15 13
16/* 14/*
17 * Force it to be aligned to unsigned long to avoid misaligned accesses 15 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -26,7 +24,7 @@ static int total_ref_count;
26static int perf_trace_event_init(struct ftrace_event_call *tp_event, 24static int perf_trace_event_init(struct ftrace_event_call *tp_event,
27 struct perf_event *p_event) 25 struct perf_event *p_event)
28{ 26{
29 struct hlist_head *list; 27 struct hlist_head __percpu *list;
30 int ret = -ENOMEM; 28 int ret = -ENOMEM;
31 int cpu; 29 int cpu;
32 30
@@ -44,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
44 tp_event->perf_events = list; 42 tp_event->perf_events = list;
45 43
46 if (!total_ref_count) { 44 if (!total_ref_count) {
47 char *buf; 45 char __percpu *buf;
48 int i; 46 int i;
49 47
50 for (i = 0; i < 4; i++) { 48 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
51 buf = (char *)alloc_percpu(perf_trace_t); 49 buf = (char __percpu *)alloc_percpu(perf_trace_t);
52 if (!buf) 50 if (!buf)
53 goto fail; 51 goto fail;
54 52
@@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
56 } 54 }
57 } 55 }
58 56
59 if (tp_event->class->reg) 57 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret) 58 if (ret)
67 goto fail; 59 goto fail;
68 60
@@ -73,7 +65,7 @@ fail:
73 if (!total_ref_count) { 65 if (!total_ref_count) {
74 int i; 66 int i;
75 67
76 for (i = 0; i < 4; i++) { 68 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
77 free_percpu(perf_trace_buf[i]); 69 free_percpu(perf_trace_buf[i]);
78 perf_trace_buf[i] = NULL; 70 perf_trace_buf[i] = NULL;
79 } 71 }
@@ -96,11 +88,11 @@ int perf_trace_init(struct perf_event *p_event)
96 mutex_lock(&event_mutex); 88 mutex_lock(&event_mutex);
97 list_for_each_entry(tp_event, &ftrace_events, list) { 89 list_for_each_entry(tp_event, &ftrace_events, list) {
98 if (tp_event->event.type == event_id && 90 if (tp_event->event.type == event_id &&
99 tp_event->class && 91 tp_event->class && tp_event->class->reg &&
100 (tp_event->class->perf_probe ||
101 tp_event->class->reg) &&
102 try_module_get(tp_event->mod)) { 92 try_module_get(tp_event->mod)) {
103 ret = perf_trace_event_init(tp_event, p_event); 93 ret = perf_trace_event_init(tp_event, p_event);
94 if (ret)
95 module_put(tp_event->mod);
104 break; 96 break;
105 } 97 }
106 } 98 }
@@ -109,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
109 return ret; 101 return ret;
110} 102}
111 103
112int perf_trace_enable(struct perf_event *p_event) 104int perf_trace_add(struct perf_event *p_event, int flags)
113{ 105{
114 struct ftrace_event_call *tp_event = p_event->tp_event; 106 struct ftrace_event_call *tp_event = p_event->tp_event;
107 struct hlist_head __percpu *pcpu_list;
115 struct hlist_head *list; 108 struct hlist_head *list;
116 109
117 list = tp_event->perf_events; 110 pcpu_list = tp_event->perf_events;
118 if (WARN_ON_ONCE(!list)) 111 if (WARN_ON_ONCE(!pcpu_list))
119 return -EINVAL; 112 return -EINVAL;
120 113
121 list = this_cpu_ptr(list); 114 if (!(flags & PERF_EF_START))
115 p_event->hw.state = PERF_HES_STOPPED;
116
117 list = this_cpu_ptr(pcpu_list);
122 hlist_add_head_rcu(&p_event->hlist_entry, list); 118 hlist_add_head_rcu(&p_event->hlist_entry, list);
123 119
124 return 0; 120 return 0;
125} 121}
126 122
127void perf_trace_disable(struct perf_event *p_event) 123void perf_trace_del(struct perf_event *p_event, int flags)
128{ 124{
129 hlist_del_rcu(&p_event->hlist_entry); 125 hlist_del_rcu(&p_event->hlist_entry);
130} 126}
@@ -138,29 +134,25 @@ void perf_trace_destroy(struct perf_event *p_event)
138 if (--tp_event->perf_refcount > 0) 134 if (--tp_event->perf_refcount > 0)
139 goto out; 135 goto out;
140 136
141 if (tp_event->class->reg) 137 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
142 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
143 else
144 tracepoint_probe_unregister(tp_event->name,
145 tp_event->class->perf_probe,
146 tp_event);
147 138
148 /* 139 /*
149 * Ensure our callback won't be called anymore. See 140 * Ensure our callback won't be called anymore. The buffers
150 * tracepoint_probe_unregister() and __DO_TRACE(). 141 * will be freed after that.
151 */ 142 */
152 synchronize_sched(); 143 tracepoint_synchronize_unregister();
153 144
154 free_percpu(tp_event->perf_events); 145 free_percpu(tp_event->perf_events);
155 tp_event->perf_events = NULL; 146 tp_event->perf_events = NULL;
156 147
157 if (!--total_ref_count) { 148 if (!--total_ref_count) {
158 for (i = 0; i < 4; i++) { 149 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
159 free_percpu(perf_trace_buf[i]); 150 free_percpu(perf_trace_buf[i]);
160 perf_trace_buf[i] = NULL; 151 perf_trace_buf[i] = NULL;
161 } 152 }
162 } 153 }
163out: 154out:
155 module_put(tp_event->mod);
164 mutex_unlock(&event_mutex); 156 mutex_unlock(&event_mutex);
165} 157}
166 158
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53cffc0b0801..0725eeab1937 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -28,6 +28,7 @@
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields);
31 32
32struct list_head * 33struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call) 34trace_get_fields(struct ftrace_event_call *event_call)
@@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call)
37 return event_call->class->get_fields(event_call); 38 return event_call->class->get_fields(event_call);
38} 39}
39 40
40int trace_define_field(struct ftrace_event_call *call, const char *type, 41static int __trace_define_field(struct list_head *head, const char *type,
41 const char *name, int offset, int size, int is_signed, 42 const char *name, int offset, int size,
42 int filter_type) 43 int is_signed, int filter_type)
43{ 44{
44 struct ftrace_event_field *field; 45 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
49 46
50 field = kzalloc(sizeof(*field), GFP_KERNEL); 47 field = kzalloc(sizeof(*field), GFP_KERNEL);
51 if (!field) 48 if (!field)
@@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
68 field->size = size; 65 field->size = size;
69 field->is_signed = is_signed; 66 field->is_signed = is_signed;
70 67
71 head = trace_get_fields(call);
72 list_add(&field->link, head); 68 list_add(&field->link, head);
73 69
74 return 0; 70 return 0;
@@ -80,17 +76,32 @@ err:
80 76
81 return -ENOMEM; 77 return -ENOMEM;
82} 78}
79
80int trace_define_field(struct ftrace_event_call *call, const char *type,
81 const char *name, int offset, int size, int is_signed,
82 int filter_type)
83{
84 struct list_head *head;
85
86 if (WARN_ON(!call->class))
87 return 0;
88
89 head = trace_get_fields(call);
90 return __trace_define_field(head, type, name, offset, size,
91 is_signed, filter_type);
92}
83EXPORT_SYMBOL_GPL(trace_define_field); 93EXPORT_SYMBOL_GPL(trace_define_field);
84 94
85#define __common_field(type, item) \ 95#define __common_field(type, item) \
86 ret = trace_define_field(call, #type, "common_" #item, \ 96 ret = __trace_define_field(&ftrace_common_fields, #type, \
87 offsetof(typeof(ent), item), \ 97 "common_" #item, \
88 sizeof(ent.item), \ 98 offsetof(typeof(ent), item), \
89 is_signed_type(type), FILTER_OTHER); \ 99 sizeof(ent.item), \
100 is_signed_type(type), FILTER_OTHER); \
90 if (ret) \ 101 if (ret) \
91 return ret; 102 return ret;
92 103
93static int trace_define_common_fields(struct ftrace_event_call *call) 104static int trace_define_common_fields(void)
94{ 105{
95 int ret; 106 int ret;
96 struct trace_entry ent; 107 struct trace_entry ent;
@@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call)
130} 141}
131EXPORT_SYMBOL_GPL(trace_event_raw_init); 142EXPORT_SYMBOL_GPL(trace_event_raw_init);
132 143
144int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
145{
146 switch (type) {
147 case TRACE_REG_REGISTER:
148 return tracepoint_probe_register(call->name,
149 call->class->probe,
150 call);
151 case TRACE_REG_UNREGISTER:
152 tracepoint_probe_unregister(call->name,
153 call->class->probe,
154 call);
155 return 0;
156
157#ifdef CONFIG_PERF_EVENTS
158 case TRACE_REG_PERF_REGISTER:
159 return tracepoint_probe_register(call->name,
160 call->class->perf_probe,
161 call);
162 case TRACE_REG_PERF_UNREGISTER:
163 tracepoint_probe_unregister(call->name,
164 call->class->perf_probe,
165 call);
166 return 0;
167#endif
168 }
169 return 0;
170}
171EXPORT_SYMBOL_GPL(ftrace_event_reg);
172
173void trace_event_enable_cmd_record(bool enable)
174{
175 struct ftrace_event_call *call;
176
177 mutex_lock(&event_mutex);
178 list_for_each_entry(call, &ftrace_events, list) {
179 if (!(call->flags & TRACE_EVENT_FL_ENABLED))
180 continue;
181
182 if (enable) {
183 tracing_start_cmdline_record();
184 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
185 } else {
186 tracing_stop_cmdline_record();
187 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
188 }
189 }
190 mutex_unlock(&event_mutex);
191}
192
133static int ftrace_event_enable_disable(struct ftrace_event_call *call, 193static int ftrace_event_enable_disable(struct ftrace_event_call *call,
134 int enable) 194 int enable)
135{ 195{
@@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
139 case 0: 199 case 0:
140 if (call->flags & TRACE_EVENT_FL_ENABLED) { 200 if (call->flags & TRACE_EVENT_FL_ENABLED) {
141 call->flags &= ~TRACE_EVENT_FL_ENABLED; 201 call->flags &= ~TRACE_EVENT_FL_ENABLED;
142 tracing_stop_cmdline_record(); 202 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
143 if (call->class->reg) 203 tracing_stop_cmdline_record();
144 call->class->reg(call, TRACE_REG_UNREGISTER); 204 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
145 else 205 }
146 tracepoint_probe_unregister(call->name, 206 call->class->reg(call, TRACE_REG_UNREGISTER);
147 call->class->probe,
148 call);
149 } 207 }
150 break; 208 break;
151 case 1: 209 case 1:
152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 210 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
153 tracing_start_cmdline_record(); 211 if (trace_flags & TRACE_ITER_RECORD_CMD) {
154 if (call->class->reg) 212 tracing_start_cmdline_record();
155 ret = call->class->reg(call, TRACE_REG_REGISTER); 213 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
156 else 214 }
157 ret = tracepoint_probe_register(call->name, 215 ret = call->class->reg(call, TRACE_REG_REGISTER);
158 call->class->probe,
159 call);
160 if (ret) { 216 if (ret) {
161 tracing_stop_cmdline_record(); 217 tracing_stop_cmdline_record();
162 pr_info("event trace: Could not enable event " 218 pr_info("event trace: Could not enable event "
@@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
194 mutex_lock(&event_mutex); 250 mutex_lock(&event_mutex);
195 list_for_each_entry(call, &ftrace_events, list) { 251 list_for_each_entry(call, &ftrace_events, list) {
196 252
197 if (!call->name || !call->class || 253 if (!call->name || !call->class || !call->class->reg)
198 (!call->class->probe && !call->class->reg))
199 continue; 254 continue;
200 255
201 if (match && 256 if (match &&
@@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
321 * The ftrace subsystem is for showing formats only. 376 * The ftrace subsystem is for showing formats only.
322 * They can not be enabled or disabled via the event files. 377 * They can not be enabled or disabled via the event files.
323 */ 378 */
324 if (call->class && (call->class->probe || call->class->reg)) 379 if (call->class && call->class->reg)
325 return call; 380 return call;
326 } 381 }
327 382
@@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
474 529
475 mutex_lock(&event_mutex); 530 mutex_lock(&event_mutex);
476 list_for_each_entry(call, &ftrace_events, list) { 531 list_for_each_entry(call, &ftrace_events, list) {
477 if (!call->name || !call->class || 532 if (!call->name || !call->class || !call->class->reg)
478 (!call->class->probe && !call->class->reg))
479 continue; 533 continue;
480 534
481 if (system && strcmp(call->class->system, system) != 0) 535 if (system && strcmp(call->class->system, system) != 0)
@@ -544,85 +598,146 @@ out:
544 return ret; 598 return ret;
545} 599}
546 600
547static ssize_t 601enum {
548event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 602 FORMAT_HEADER = 1,
549 loff_t *ppos) 603 FORMAT_FIELD_SEPERATOR = 2,
604 FORMAT_PRINTFMT = 3,
605};
606
607static void *f_next(struct seq_file *m, void *v, loff_t *pos)
550{ 608{
551 struct ftrace_event_call *call = filp->private_data; 609 struct ftrace_event_call *call = m->private;
552 struct ftrace_event_field *field; 610 struct ftrace_event_field *field;
553 struct list_head *head; 611 struct list_head *common_head = &ftrace_common_fields;
554 struct trace_seq *s; 612 struct list_head *head = trace_get_fields(call);
555 int common_field_count = 5;
556 char *buf;
557 int r = 0;
558 613
559 if (*ppos) 614 (*pos)++;
560 return 0;
561 615
562 s = kmalloc(sizeof(*s), GFP_KERNEL); 616 switch ((unsigned long)v) {
563 if (!s) 617 case FORMAT_HEADER:
564 return -ENOMEM; 618 if (unlikely(list_empty(common_head)))
619 return NULL;
565 620
566 trace_seq_init(s); 621 field = list_entry(common_head->prev,
622 struct ftrace_event_field, link);
623 return field;
567 624
568 trace_seq_printf(s, "name: %s\n", call->name); 625 case FORMAT_FIELD_SEPERATOR:
569 trace_seq_printf(s, "ID: %d\n", call->event.type); 626 if (unlikely(list_empty(head)))
570 trace_seq_printf(s, "format:\n"); 627 return NULL;
571 628
572 head = trace_get_fields(call); 629 field = list_entry(head->prev, struct ftrace_event_field, link);
573 list_for_each_entry_reverse(field, head, link) { 630 return field;
574 /*
575 * Smartly shows the array type(except dynamic array).
576 * Normal:
577 * field:TYPE VAR
578 * If TYPE := TYPE[LEN], it is shown:
579 * field:TYPE VAR[LEN]
580 */
581 const char *array_descriptor = strchr(field->type, '[');
582 631
583 if (!strncmp(field->type, "__data_loc", 10)) 632 case FORMAT_PRINTFMT:
584 array_descriptor = NULL; 633 /* all done */
634 return NULL;
635 }
585 636
586 if (!array_descriptor) { 637 field = v;
587 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" 638 if (field->link.prev == common_head)
588 "\tsize:%u;\tsigned:%d;\n", 639 return (void *)FORMAT_FIELD_SEPERATOR;
589 field->type, field->name, field->offset, 640 else if (field->link.prev == head)
590 field->size, !!field->is_signed); 641 return (void *)FORMAT_PRINTFMT;
591 } else {
592 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
593 "\tsize:%u;\tsigned:%d;\n",
594 (int)(array_descriptor - field->type),
595 field->type, field->name,
596 array_descriptor, field->offset,
597 field->size, !!field->is_signed);
598 }
599 642
600 if (--common_field_count == 0) 643 field = list_entry(field->link.prev, struct ftrace_event_field, link);
601 r = trace_seq_printf(s, "\n");
602 644
603 if (!r) 645 return field;
604 break; 646}
605 }
606 647
607 if (r) 648static void *f_start(struct seq_file *m, loff_t *pos)
608 r = trace_seq_printf(s, "\nprint fmt: %s\n", 649{
609 call->print_fmt); 650 loff_t l = 0;
651 void *p;
610 652
611 if (!r) { 653 /* Start by showing the header */
612 /* 654 if (!*pos)
613 * ug! The format output is bigger than a PAGE!! 655 return (void *)FORMAT_HEADER;
614 */ 656
615 buf = "FORMAT TOO BIG\n"; 657 p = (void *)FORMAT_HEADER;
616 r = simple_read_from_buffer(ubuf, cnt, ppos, 658 do {
617 buf, strlen(buf)); 659 p = f_next(m, p, &l);
618 goto out; 660 } while (p && l < *pos);
661
662 return p;
663}
664
665static int f_show(struct seq_file *m, void *v)
666{
667 struct ftrace_event_call *call = m->private;
668 struct ftrace_event_field *field;
669 const char *array_descriptor;
670
671 switch ((unsigned long)v) {
672 case FORMAT_HEADER:
673 seq_printf(m, "name: %s\n", call->name);
674 seq_printf(m, "ID: %d\n", call->event.type);
675 seq_printf(m, "format:\n");
676 return 0;
677
678 case FORMAT_FIELD_SEPERATOR:
679 seq_putc(m, '\n');
680 return 0;
681
682 case FORMAT_PRINTFMT:
683 seq_printf(m, "\nprint fmt: %s\n",
684 call->print_fmt);
685 return 0;
619 } 686 }
620 687
621 r = simple_read_from_buffer(ubuf, cnt, ppos, 688 field = v;
622 s->buffer, s->len); 689
623 out: 690 /*
624 kfree(s); 691 * Smartly shows the array type(except dynamic array).
625 return r; 692 * Normal:
693 * field:TYPE VAR
694 * If TYPE := TYPE[LEN], it is shown:
695 * field:TYPE VAR[LEN]
696 */
697 array_descriptor = strchr(field->type, '[');
698
699 if (!strncmp(field->type, "__data_loc", 10))
700 array_descriptor = NULL;
701
702 if (!array_descriptor)
703 seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
704 field->type, field->name, field->offset,
705 field->size, !!field->is_signed);
706 else
707 seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
708 (int)(array_descriptor - field->type),
709 field->type, field->name,
710 array_descriptor, field->offset,
711 field->size, !!field->is_signed);
712
713 return 0;
714}
715
716static void f_stop(struct seq_file *m, void *p)
717{
718}
719
720static const struct seq_operations trace_format_seq_ops = {
721 .start = f_start,
722 .next = f_next,
723 .stop = f_stop,
724 .show = f_show,
725};
726
727static int trace_format_open(struct inode *inode, struct file *file)
728{
729 struct ftrace_event_call *call = inode->i_private;
730 struct seq_file *m;
731 int ret;
732
733 ret = seq_open(file, &trace_format_seq_ops);
734 if (ret < 0)
735 return ret;
736
737 m = file->private_data;
738 m->private = call;
739
740 return 0;
626} 741}
627 742
628static ssize_t 743static ssize_t
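
The format-file rework above drops the single trace_seq buffer (and its "FORMAT TOO BIG" fallback once output exceeded a page) in favour of a seq_file iterator that emits the header, the common fields, the event fields and the print format as separate records. A minimal, self-contained sketch of the same start/next/stop/show shape for an arbitrary read-only file; the my_* names and the fixed item array are stand-ins for the real field lists:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/fs.h>

/* Iterate a small fixed array; real code walks the common and event field lists. */
static const char *const my_items[] = { "header", "field_a", "field_b" };

static void *my_start(struct seq_file *m, loff_t *pos)
{
	return *pos < ARRAY_SIZE(my_items) ? (void *)&my_items[*pos] : NULL;
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return my_start(m, pos);
}

static void my_stop(struct seq_file *m, void *v)
{
}

static int my_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", *(const char *const *)v);
	return 0;
}

static const struct seq_operations my_seq_ops = {
	.start	= my_start,
	.next	= my_next,
	.stop	= my_stop,
	.show	= my_show,
};

static int my_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &my_seq_ops);
}

static const struct file_operations my_fops = {
	.owner		= THIS_MODULE,
	.open		= my_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

Because seq_read() resumes the iterator across successive read() calls, arbitrarily long output no longer has to fit in a single page-sized buffer.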
@@ -817,39 +932,47 @@ static const struct file_operations ftrace_enable_fops = {
817 .open = tracing_open_generic, 932 .open = tracing_open_generic,
818 .read = event_enable_read, 933 .read = event_enable_read,
819 .write = event_enable_write, 934 .write = event_enable_write,
935 .llseek = default_llseek,
820}; 936};
821 937
822static const struct file_operations ftrace_event_format_fops = { 938static const struct file_operations ftrace_event_format_fops = {
823 .open = tracing_open_generic, 939 .open = trace_format_open,
824 .read = event_format_read, 940 .read = seq_read,
941 .llseek = seq_lseek,
942 .release = seq_release,
825}; 943};
826 944
827static const struct file_operations ftrace_event_id_fops = { 945static const struct file_operations ftrace_event_id_fops = {
828 .open = tracing_open_generic, 946 .open = tracing_open_generic,
829 .read = event_id_read, 947 .read = event_id_read,
948 .llseek = default_llseek,
830}; 949};
831 950
832static const struct file_operations ftrace_event_filter_fops = { 951static const struct file_operations ftrace_event_filter_fops = {
833 .open = tracing_open_generic, 952 .open = tracing_open_generic,
834 .read = event_filter_read, 953 .read = event_filter_read,
835 .write = event_filter_write, 954 .write = event_filter_write,
955 .llseek = default_llseek,
836}; 956};
837 957
838static const struct file_operations ftrace_subsystem_filter_fops = { 958static const struct file_operations ftrace_subsystem_filter_fops = {
839 .open = tracing_open_generic, 959 .open = tracing_open_generic,
840 .read = subsystem_filter_read, 960 .read = subsystem_filter_read,
841 .write = subsystem_filter_write, 961 .write = subsystem_filter_write,
962 .llseek = default_llseek,
842}; 963};
843 964
844static const struct file_operations ftrace_system_enable_fops = { 965static const struct file_operations ftrace_system_enable_fops = {
845 .open = tracing_open_generic, 966 .open = tracing_open_generic,
846 .read = system_enable_read, 967 .read = system_enable_read,
847 .write = system_enable_write, 968 .write = system_enable_write,
969 .llseek = default_llseek,
848}; 970};
849 971
850static const struct file_operations ftrace_show_header_fops = { 972static const struct file_operations ftrace_show_header_fops = {
851 .open = tracing_open_generic, 973 .open = tracing_open_generic,
852 .read = show_header, 974 .read = show_header,
975 .llseek = default_llseek,
853}; 976};
854 977
855static struct dentry *event_trace_events_dir(void) 978static struct dentry *event_trace_events_dir(void)
@@ -963,35 +1086,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
963 return -1; 1086 return -1;
964 } 1087 }
965 1088
966 if (call->class->probe || call->class->reg) 1089 if (call->class->reg)
967 trace_create_file("enable", 0644, call->dir, call, 1090 trace_create_file("enable", 0644, call->dir, call,
968 enable); 1091 enable);
969 1092
970#ifdef CONFIG_PERF_EVENTS 1093#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg)) 1094 if (call->event.type && call->class->reg)
972 trace_create_file("id", 0444, call->dir, call, 1095 trace_create_file("id", 0444, call->dir, call,
973 id); 1096 id);
974#endif 1097#endif
975 1098
976 if (call->class->define_fields) { 1099 /*
977 /* 1100 * Other events may have the same class. Only update
978 * Other events may have the same class. Only update 1101 * the fields if they are not already defined.
979 * the fields if they are not already defined. 1102 */
980 */ 1103 head = trace_get_fields(call);
981 head = trace_get_fields(call); 1104 if (list_empty(head)) {
982 if (list_empty(head)) { 1105 ret = call->class->define_fields(call);
983 ret = trace_define_common_fields(call); 1106 if (ret < 0) {
984 if (!ret) 1107 pr_warning("Could not initialize trace point"
985 ret = call->class->define_fields(call); 1108 " events/%s\n", call->name);
986 if (ret < 0) { 1109 return ret;
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
991 } 1110 }
992 trace_create_file("filter", 0644, call->dir, call,
993 filter);
994 } 1111 }
1112 trace_create_file("filter", 0644, call->dir, call,
1113 filter);
995 1114
996 trace_create_file("format", 0444, call->dir, call, 1115 trace_create_file("format", 0444, call->dir, call,
997 format); 1116 format);
@@ -999,11 +1118,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
999 return 0; 1118 return 0;
1000} 1119}
1001 1120
1002static int __trace_add_event_call(struct ftrace_event_call *call) 1121static int
1122__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1123 const struct file_operations *id,
1124 const struct file_operations *enable,
1125 const struct file_operations *filter,
1126 const struct file_operations *format)
1003{ 1127{
1004 struct dentry *d_events; 1128 struct dentry *d_events;
1005 int ret; 1129 int ret;
1006 1130
1131 /* The linker may leave blanks */
1007 if (!call->name) 1132 if (!call->name)
1008 return -EINVAL; 1133 return -EINVAL;
1009 1134
@@ -1011,8 +1136,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1011 ret = call->class->raw_init(call); 1136 ret = call->class->raw_init(call);
1012 if (ret < 0) { 1137 if (ret < 0) {
1013 if (ret != -ENOSYS) 1138 if (ret != -ENOSYS)
1014 pr_warning("Could not initialize trace " 1139 pr_warning("Could not initialize trace events/%s\n",
1015 "events/%s\n", call->name); 1140 call->name);
1016 return ret; 1141 return ret;
1017 } 1142 }
1018 } 1143 }
@@ -1021,11 +1146,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1021 if (!d_events) 1146 if (!d_events)
1022 return -ENOENT; 1147 return -ENOENT;
1023 1148
1024 ret = event_create_dir(call, d_events, &ftrace_event_id_fops, 1149 ret = event_create_dir(call, d_events, id, enable, filter, format);
1025 &ftrace_enable_fops, &ftrace_event_filter_fops,
1026 &ftrace_event_format_fops);
1027 if (!ret) 1150 if (!ret)
1028 list_add(&call->list, &ftrace_events); 1151 list_add(&call->list, &ftrace_events);
1152 call->mod = mod;
1029 1153
1030 return ret; 1154 return ret;
1031} 1155}
@@ -1035,7 +1159,10 @@ int trace_add_event_call(struct ftrace_event_call *call)
1035{ 1159{
1036 int ret; 1160 int ret;
1037 mutex_lock(&event_mutex); 1161 mutex_lock(&event_mutex);
1038 ret = __trace_add_event_call(call); 1162 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1163 &ftrace_enable_fops,
1164 &ftrace_event_filter_fops,
1165 &ftrace_event_format_fops);
1039 mutex_unlock(&event_mutex); 1166 mutex_unlock(&event_mutex);
1040 return ret; 1167 return ret;
1041} 1168}
@@ -1152,8 +1279,6 @@ static void trace_module_add_events(struct module *mod)
1152{ 1279{
1153 struct ftrace_module_file_ops *file_ops = NULL; 1280 struct ftrace_module_file_ops *file_ops = NULL;
1154 struct ftrace_event_call *call, *start, *end; 1281 struct ftrace_event_call *call, *start, *end;
1155 struct dentry *d_events;
1156 int ret;
1157 1282
1158 start = mod->trace_events; 1283 start = mod->trace_events;
1159 end = mod->trace_events + mod->num_trace_events; 1284 end = mod->trace_events + mod->num_trace_events;
@@ -1161,38 +1286,14 @@ static void trace_module_add_events(struct module *mod)
1161 if (start == end) 1286 if (start == end)
1162 return; 1287 return;
1163 1288
1164 d_events = event_trace_events_dir(); 1289 file_ops = trace_create_file_ops(mod);
1165 if (!d_events) 1290 if (!file_ops)
1166 return; 1291 return;
1167 1292
1168 for_each_event(call, start, end) { 1293 for_each_event(call, start, end) {
1169 /* The linker may leave blanks */ 1294 __trace_add_event_call(call, mod,
1170 if (!call->name)
1171 continue;
1172 if (call->class->raw_init) {
1173 ret = call->class->raw_init(call);
1174 if (ret < 0) {
1175 if (ret != -ENOSYS)
1176 pr_warning("Could not initialize trace "
1177 "point events/%s\n", call->name);
1178 continue;
1179 }
1180 }
1181 /*
1182 * This module has events, create file ops for this module
1183 * if not already done.
1184 */
1185 if (!file_ops) {
1186 file_ops = trace_create_file_ops(mod);
1187 if (!file_ops)
1188 return;
1189 }
1190 call->mod = mod;
1191 ret = event_create_dir(call, d_events,
1192 &file_ops->id, &file_ops->enable, 1295 &file_ops->id, &file_ops->enable,
1193 &file_ops->filter, &file_ops->format); 1296 &file_ops->filter, &file_ops->format);
1194 if (!ret)
1195 list_add(&call->list, &ftrace_events);
1196 } 1297 }
1197} 1298}
1198 1299
@@ -1319,25 +1420,14 @@ static __init int event_trace_init(void)
1319 trace_create_file("enable", 0644, d_events, 1420 trace_create_file("enable", 0644, d_events,
1320 NULL, &ftrace_system_enable_fops); 1421 NULL, &ftrace_system_enable_fops);
1321 1422
1423 if (trace_define_common_fields())
1424 pr_warning("tracing: Failed to allocate common fields");
1425
1322 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1426 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1323 /* The linker may leave blanks */ 1427 __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1324 if (!call->name)
1325 continue;
1326 if (call->class->raw_init) {
1327 ret = call->class->raw_init(call);
1328 if (ret < 0) {
1329 if (ret != -ENOSYS)
1330 pr_warning("Could not initialize trace "
1331 "point events/%s\n", call->name);
1332 continue;
1333 }
1334 }
1335 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1336 &ftrace_enable_fops, 1428 &ftrace_enable_fops,
1337 &ftrace_event_filter_fops, 1429 &ftrace_event_filter_fops,
1338 &ftrace_event_format_fops); 1430 &ftrace_event_format_fops);
1339 if (!ret)
1340 list_add(&call->list, &ftrace_events);
1341 } 1431 }
1342 1432
1343 while (true) { 1433 while (true) {
@@ -1524,12 +1614,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1524 struct ftrace_entry *entry; 1614 struct ftrace_entry *entry;
1525 unsigned long flags; 1615 unsigned long flags;
1526 long disabled; 1616 long disabled;
1527 int resched;
1528 int cpu; 1617 int cpu;
1529 int pc; 1618 int pc;
1530 1619
1531 pc = preempt_count(); 1620 pc = preempt_count();
1532 resched = ftrace_preempt_disable(); 1621 preempt_disable_notrace();
1533 cpu = raw_smp_processor_id(); 1622 cpu = raw_smp_processor_id();
1534 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); 1623 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1535 1624
@@ -1551,7 +1640,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1551 1640
1552 out: 1641 out:
1553 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1642 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1554 ftrace_preempt_enable(resched); 1643 preempt_enable_notrace();
1555} 1644}
1556 1645
1557static struct ftrace_ops trace_ops __initdata = 1646static struct ftrace_ops trace_ops __initdata =
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 57bb1bb32999..36d40104b17f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system,
497} 497}
498 498
499static struct ftrace_event_field * 499static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500__find_event_field(struct list_head *head, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
504 503
505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) { 504 list_for_each_entry(field, head, link) {
507 if (!strcmp(field->name, name)) 505 if (!strcmp(field->name, name))
508 return field; 506 return field;
@@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name)
511 return NULL; 509 return NULL;
512} 510}
513 511
512static struct ftrace_event_field *
513find_event_field(struct ftrace_event_call *call, char *name)
514{
515 struct ftrace_event_field *field;
516 struct list_head *head;
517
518 field = __find_event_field(&ftrace_common_fields, name);
519 if (field)
520 return field;
521
522 head = trace_get_fields(call);
523 return __find_event_field(head, name);
524}
525
514static void filter_free_pred(struct filter_pred *pred) 526static void filter_free_pred(struct filter_pred *pred)
515{ 527{
516 if (!pred) 528 if (!pred)
@@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system)
627 int err; 639 int err;
628 640
629 list_for_each_entry(call, &ftrace_events, list) { 641 list_for_each_entry(call, &ftrace_events, list) {
630 if (!call->class || !call->class->define_fields)
631 continue;
632
633 if (strcmp(call->class->system, system->name) != 0) 642 if (strcmp(call->class->system, system->name) != 0)
634 continue; 643 continue;
635 644
@@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
646 struct ftrace_event_call *call; 655 struct ftrace_event_call *call;
647 656
648 list_for_each_entry(call, &ftrace_events, list) { 657 list_for_each_entry(call, &ftrace_events, list) {
649 if (!call->class || !call->class->define_fields)
650 continue;
651
652 if (strcmp(call->class->system, system->name) != 0) 658 if (strcmp(call->class->system, system->name) != 0)
653 continue; 659 continue;
654 660
@@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system,
1251 list_for_each_entry(call, &ftrace_events, list) { 1257 list_for_each_entry(call, &ftrace_events, list) {
1252 struct event_filter *filter = call->filter; 1258 struct event_filter *filter = call->filter;
1253 1259
1254 if (!call->class || !call->class->define_fields)
1255 continue;
1256
1257 if (strcmp(call->class->system, system->name) != 0) 1260 if (strcmp(call->class->system, system->name) != 0)
1258 continue; 1261 continue;
1259 1262
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 8536e2a65969..4ba44deaac25 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
125 125
126#include "trace_entries.h" 126#include "trace_entries.h"
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{
130 INIT_LIST_HEAD(&call->class->fields);
131 return 0;
132}
133
134#undef __entry 128#undef __entry
135#define __entry REC 129#define __entry REC
136 130
@@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
158struct ftrace_event_class event_class_ftrace_##call = { \ 152struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \ 153 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \ 154 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \ 155 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
162}; \ 156}; \
163 \ 157 \
164struct ftrace_event_call __used \ 158struct ftrace_event_call __used \
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3f3776b0cd6..16aee4d44e8f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
54 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
55 unsigned long flags; 55 unsigned long flags;
56 long disabled; 56 long disabled;
57 int cpu, resched; 57 int cpu;
58 int pc; 58 int pc;
59 59
60 if (unlikely(!ftrace_function_enabled)) 60 if (unlikely(!ftrace_function_enabled))
61 return; 61 return;
62 62
63 pc = preempt_count(); 63 pc = preempt_count();
64 resched = ftrace_preempt_disable(); 64 preempt_disable_notrace();
65 local_save_flags(flags); 65 local_save_flags(flags);
66 cpu = raw_smp_processor_id(); 66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu]; 67 data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
71 trace_function(tr, ip, parent_ip, flags, pc); 71 trace_function(tr, ip, parent_ip, flags, pc);
72 72
73 atomic_dec(&data->disabled); 73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched); 74 preempt_enable_notrace();
75} 75}
76 76
77static void 77static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 79f4bac99a94..76b05980225c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs;
20
18struct fgraph_cpu_data { 21struct fgraph_cpu_data {
19 pid_t last_pid; 22 pid_t last_pid;
20 int depth; 23 int depth;
24 int depth_irq;
21 int ignore; 25 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; 26 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
23}; 27};
24 28
25struct fgraph_data { 29struct fgraph_data {
26 struct fgraph_cpu_data *cpu_data; 30 struct fgraph_cpu_data __percpu *cpu_data;
27 31
28 /* Place to preserve last processed entry. */ 32 /* Place to preserve last processed entry. */
29 struct ftrace_graph_ent_entry ent; 33 struct ftrace_graph_ent_entry ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
41#define TRACE_GRAPH_PRINT_PROC 0x8 45#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 46#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
44 49
45static struct tracer_opt trace_opts[] = { 50static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 51 /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
55 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, 60 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
56 /* Display absolute time of an entry */ 61 /* Display absolute time of an entry */
57 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 62 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
63 /* Display interrupts */
64 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 { } /* Empty entry */ 65 { } /* Empty entry */
59}; 66};
60 67
61static struct tracer_flags tracer_flags = { 68static struct tracer_flags tracer_flags = {
62 /* Don't display overruns and proc by default */ 69 /* Don't display overruns and proc by default */
63 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 70 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
64 TRACE_GRAPH_PRINT_DURATION, 71 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
65 .opts = trace_opts 72 .opts = trace_opts
66}; 73};
67 74
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
204 return 1; 211 return 1;
205} 212}
206 213
214static inline int ftrace_graph_ignore_irqs(void)
215{
216 if (!ftrace_graph_skip_irqs)
217 return 0;
218
219 return in_irq();
220}
221
207int trace_graph_entry(struct ftrace_graph_ent *trace) 222int trace_graph_entry(struct ftrace_graph_ent *trace)
208{ 223{
209 struct trace_array *tr = graph_array; 224 struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
218 return 0; 233 return 0;
219 234
220 /* trace it when it is-nested-in or is a function enabled. */ 235 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func))) 236 if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
237 ftrace_graph_ignore_irqs())
222 return 0; 238 return 0;
223 239
224 local_irq_save(flags); 240 local_irq_save(flags);
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 262 return trace_graph_entry(trace);
247} 263}
248 264
265static void
266__trace_graph_function(struct trace_array *tr,
267 unsigned long ip, unsigned long flags, int pc)
268{
269 u64 time = trace_clock_local();
270 struct ftrace_graph_ent ent = {
271 .func = ip,
272 .depth = 0,
273 };
274 struct ftrace_graph_ret ret = {
275 .func = ip,
276 .depth = 0,
277 .calltime = time,
278 .rettime = time,
279 };
280
281 __trace_graph_entry(tr, &ent, flags, pc);
282 __trace_graph_return(tr, &ret, flags, pc);
283}
284
285void
286trace_graph_function(struct trace_array *tr,
287 unsigned long ip, unsigned long parent_ip,
288 unsigned long flags, int pc)
289{
290 __trace_graph_function(tr, ip, flags, pc);
291}
292
249void __trace_graph_return(struct trace_array *tr, 293void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 294 struct ftrace_graph_ret *trace,
251 unsigned long flags, 295 unsigned long flags,
@@ -507,7 +551,15 @@ get_return_for_leaf(struct trace_iterator *iter,
507 * if the output fails. 551 * if the output fails.
508 */ 552 */
509 data->ent = *curr; 553 data->ent = *curr;
510 data->ret = *next; 554 /*
555 * If the next event is not a return type, then
556 * we only care about what type it is. Otherwise we can
557 * safely copy the entire event.
558 */
559 if (next->ent.type == TRACE_GRAPH_RET)
560 data->ret = *next;
561 else
562 data->ret.ent.type = next->ent.type;
511 } 563 }
512 } 564 }
513 565
@@ -641,7 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
641 693
642 /* Print nsecs (we don't want to exceed 7 numbers) */ 694 /* Print nsecs (we don't want to exceed 7 numbers) */
643 if (len < 7) { 695 if (len < 7) {
644 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); 696 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
697
698 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
645 ret = trace_seq_printf(s, ".%s", nsecs_str); 699 ret = trace_seq_printf(s, ".%s", nsecs_str);
646 if (!ret) 700 if (!ret)
647 return TRACE_TYPE_PARTIAL_LINE; 701 return TRACE_TYPE_PARTIAL_LINE;
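
The duration hunk above guards the nanosecond formatting: the old call always asked snprintf() for 8 - len bytes, which can exceed the small nsecs_str buffer when len is small, so the size is now clamped to sizeof(nsecs_str) with min_t(). A standalone illustration of the same clamp, with an arbitrary buffer size and values:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/*
 * Illustrative only: print up to three digits of a nanosecond remainder
 * without ever asking snprintf() for more room than the buffer has.
 * Buffer size and values here are arbitrary.
 */
int main(void)
{
	char nsecs_str[5];
	unsigned long nsecs_rem = 123;
	size_t len = 0;		/* digits already used by the integer part */

	/* Clamp the requested size to what nsecs_str can actually hold. */
	size_t slen = MIN(sizeof(nsecs_str), 8UL - len);

	snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
	printf(".%s\n", nsecs_str);
	return 0;
}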
@@ -846,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
846 return 0; 900 return 0;
847} 901}
848 902
903/*
904 * Entry check for irq code
905 *
906 * returns 1 if
907 * - we are inside irq code
 908 * - we just entered irq code
909 *
 910 * returns 0 if
911 * - funcgraph-interrupts option is set
912 * - we are not inside irq code
913 */
914static int
915check_irq_entry(struct trace_iterator *iter, u32 flags,
916 unsigned long addr, int depth)
917{
918 int cpu = iter->cpu;
919 int *depth_irq;
920 struct fgraph_data *data = iter->private;
921
922 /*
923 * If we are either displaying irqs, or we got called as
924 * a graph event and private data does not exist,
925 * then we bypass the irq check.
926 */
927 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
928 (!data))
929 return 0;
930
931 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
932
933 /*
934 * We are inside the irq code
935 */
936 if (*depth_irq >= 0)
937 return 1;
938
939 if ((addr < (unsigned long)__irqentry_text_start) ||
940 (addr >= (unsigned long)__irqentry_text_end))
941 return 0;
942
943 /*
944 * We are entering irq code.
945 */
946 *depth_irq = depth;
947 return 1;
948}
949
950/*
951 * Return check for irq code
952 *
953 * returns 1 if
954 * - we are inside irq code
955 * - we just left irq code
956 *
957 * returns 0 if
958 * - funcgraph-interrupts option is set
959 * - we are not inside irq code
960 */
961static int
962check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
963{
964 int cpu = iter->cpu;
965 int *depth_irq;
966 struct fgraph_data *data = iter->private;
967
968 /*
969 * If we are either displaying irqs, or we got called as
970 * a graph event and private data does not exist,
971 * then we bypass the irq check.
972 */
973 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
974 (!data))
975 return 0;
976
977 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
978
979 /*
980 * We are not inside the irq code.
981 */
982 if (*depth_irq == -1)
983 return 0;
984
985 /*
986 * We are inside the irq code, and this is returning entry.
987 * Let's not trace it and clear the entry depth, since
988 * we are out of irq code.
989 *
990 * This condition ensures that we 'leave the irq code' once
991 * we are out of the entry depth. Thus protecting us from
992 * the RETURN entry loss.
993 */
994 if (*depth_irq >= depth) {
995 *depth_irq = -1;
996 return 1;
997 }
998
999 /*
1000 * We are inside the irq code, and this is not the entry.
1001 */
1002 return 1;
1003}
1004
849static enum print_line_t 1005static enum print_line_t
850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 1006print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
851 struct trace_iterator *iter, u32 flags) 1007 struct trace_iterator *iter, u32 flags)
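
check_irq_entry() and check_irq_return() above latch, per CPU, the call depth at which the graph tracer first hit the irq entry text, so every nested call is skipped until the matching return unwinds past that depth. A standalone sketch of the same depth-latch logic; the address range and all names are made up for illustration:

#include <stdbool.h>
#include <stdio.h>

/* -1 means "not currently inside irq code"; otherwise the entry depth. */
static int depth_irq = -1;

/* Hypothetical predicate: does this address live in irq entry code? */
static bool addr_is_irq_entry(unsigned long addr)
{
	return addr >= 0x1000 && addr < 0x2000;	/* made-up range */
}

static bool skip_on_entry(unsigned long addr, int depth)
{
	if (depth_irq >= 0)		/* already inside irq code */
		return true;
	if (!addr_is_irq_entry(addr))
		return false;
	depth_irq = depth;		/* latch the depth where irq code began */
	return true;
}

static bool skip_on_return(int depth)
{
	if (depth_irq == -1)		/* not inside irq code */
		return false;
	if (depth_irq >= depth)		/* returning out of the irq entry point */
		depth_irq = -1;
	return true;
}

int main(void)
{
	/* Enter normal code, then irq code at depth 3, then unwind. */
	printf("%d\n", skip_on_entry(0x0500, 2));	/* 0: keep tracing */
	printf("%d\n", skip_on_entry(0x1234, 3));	/* 1: latched, skip */
	printf("%d\n", skip_on_entry(0x0600, 4));	/* 1: nested, skip */
	printf("%d\n", skip_on_return(4));		/* 1: still inside irq */
	printf("%d\n", skip_on_return(3));		/* 1: leaving, unlatch */
	printf("%d\n", skip_on_return(2));		/* 0: back to normal */
	return 0;
}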
@@ -856,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
856 static enum print_line_t ret; 1012 static enum print_line_t ret;
857 int cpu = iter->cpu; 1013 int cpu = iter->cpu;
858 1014
1015 if (check_irq_entry(iter, flags, call->func, call->depth))
1016 return TRACE_TYPE_HANDLED;
1017
859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1018 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
860 return TRACE_TYPE_PARTIAL_LINE; 1019 return TRACE_TYPE_PARTIAL_LINE;
861 1020
@@ -893,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
893 int ret; 1052 int ret;
894 int i; 1053 int i;
895 1054
1055 if (check_irq_return(iter, flags, trace->depth))
1056 return TRACE_TYPE_HANDLED;
1057
896 if (data) { 1058 if (data) {
897 struct fgraph_cpu_data *cpu_data; 1059 struct fgraph_cpu_data *cpu_data;
898 int cpu = iter->cpu; 1060 int cpu = iter->cpu;
@@ -1045,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1045 1207
1046 1208
1047enum print_line_t 1209enum print_line_t
1048print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1049{ 1211{
1050 struct ftrace_graph_ent_entry *field; 1212 struct ftrace_graph_ent_entry *field;
1051 struct fgraph_data *data = iter->private; 1213 struct fgraph_data *data = iter->private;
@@ -1108,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1108static enum print_line_t 1270static enum print_line_t
1109print_graph_function(struct trace_iterator *iter) 1271print_graph_function(struct trace_iterator *iter)
1110{ 1272{
1111 return print_graph_function_flags(iter, tracer_flags.val); 1273 return __print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1112} 1285}
1113 1286
1114static enum print_line_t 1287static enum print_line_t
@@ -1140,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1140 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1141} 1314}
1142 1315
1143void print_graph_headers_flags(struct seq_file *s, u32 flags) 1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1144{ 1317{
1145 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1318 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1146 1319
@@ -1181,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
1181 print_graph_headers_flags(s, tracer_flags.val); 1354 print_graph_headers_flags(s, tracer_flags.val);
1182} 1355}
1183 1356
1357void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{
1359 struct trace_iterator *iter = s->private;
1360
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter))
1364 return;
1365
1366 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION;
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370
1371 __print_graph_headers_flags(s, flags);
1372}
1373
1184void graph_trace_open(struct trace_iterator *iter) 1374void graph_trace_open(struct trace_iterator *iter)
1185{ 1375{
1186 /* pid and depth on the last trace processed */ 1376 /* pid and depth on the last trace processed */
@@ -1201,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter)
1201 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 1391 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1202 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 1392 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1203 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); 1393 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1394 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
1395
1204 *pid = -1; 1396 *pid = -1;
1205 *depth = 0; 1397 *depth = 0;
1206 *ignore = 0; 1398 *ignore = 0;
1399 *depth_irq = -1;
1207 } 1400 }
1208 1401
1209 iter->private = data; 1402 iter->private = data;
@@ -1226,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter)
1226 } 1419 }
1227} 1420}
1228 1421
1422static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
1423{
1424 if (bit == TRACE_GRAPH_PRINT_IRQS)
1425 ftrace_graph_skip_irqs = !set;
1426
1427 return 0;
1428}
1429
1229static struct trace_event_functions graph_functions = { 1430static struct trace_event_functions graph_functions = {
1230 .trace = print_graph_function_event, 1431 .trace = print_graph_function_event,
1231}; 1432};
@@ -1252,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = {
1252 .print_line = print_graph_function, 1453 .print_line = print_graph_function,
1253 .print_header = print_graph_headers, 1454 .print_header = print_graph_headers,
1254 .flags = &tracer_flags, 1455 .flags = &tracer_flags,
1456 .set_flag = func_graph_set_flag,
1255#ifdef CONFIG_FTRACE_SELFTEST 1457#ifdef CONFIG_FTRACE_SELFTEST
1256 .selftest = trace_selftest_startup_function_graph, 1458 .selftest = trace_selftest_startup_function_graph,
1257#endif 1459#endif
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6fd486e0cef4..5cf8c602b880 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence;
87 87
88#ifdef CONFIG_FUNCTION_TRACER 88#ifdef CONFIG_FUNCTION_TRACER
89/* 89/*
90 * irqsoff uses its own tracer function to keep the overhead down: 90 * Prologue for the preempt and irqs off function tracers.
91 *
92 * Returns 1 if it is OK to continue, and data->disabled is
93 * incremented.
94 * 0 if the trace is to be ignored, and data->disabled
95 * is kept the same.
96 *
97 * Note, this function is also used outside this ifdef but
98 * inside the #ifdef of the function graph tracer below.
99 * This is OK, since the function graph tracer is
100 * dependent on the function tracer.
91 */ 101 */
92static void 102static int func_prolog_dec(struct trace_array *tr,
93irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) 103 struct trace_array_cpu **data,
104 unsigned long *flags)
94{ 105{
95 struct trace_array *tr = irqsoff_trace;
96 struct trace_array_cpu *data;
97 unsigned long flags;
98 long disabled; 106 long disabled;
99 int cpu; 107 int cpu;
100 108
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
106 */ 114 */
107 cpu = raw_smp_processor_id(); 115 cpu = raw_smp_processor_id();
108 if (likely(!per_cpu(tracing_cpu, cpu))) 116 if (likely(!per_cpu(tracing_cpu, cpu)))
109 return; 117 return 0;
110 118
111 local_save_flags(flags); 119 local_save_flags(*flags);
112 /* slight chance to get a false positive on tracing_cpu */ 120 /* slight chance to get a false positive on tracing_cpu */
113 if (!irqs_disabled_flags(flags)) 121 if (!irqs_disabled_flags(*flags))
114 return; 122 return 0;
115 123
116 data = tr->data[cpu]; 124 *data = tr->data[cpu];
117 disabled = atomic_inc_return(&data->disabled); 125 disabled = atomic_inc_return(&(*data)->disabled);
118 126
119 if (likely(disabled == 1)) 127 if (likely(disabled == 1))
120 trace_function(tr, ip, parent_ip, flags, preempt_count()); 128 return 1;
129
130 atomic_dec(&(*data)->disabled);
131
132 return 0;
133}
134
135/*
136 * irqsoff uses its own tracer function to keep the overhead down:
137 */
138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
140{
141 struct trace_array *tr = irqsoff_trace;
142 struct trace_array_cpu *data;
143 unsigned long flags;
144
145 if (!func_prolog_dec(tr, &data, &flags))
146 return;
147
148 trace_function(tr, ip, parent_ip, flags, preempt_count());
121 149
122 atomic_dec(&data->disabled); 150 atomic_dec(&data->disabled);
123} 151}
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
155 struct trace_array *tr = irqsoff_trace; 183 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data; 184 struct trace_array_cpu *data;
157 unsigned long flags; 185 unsigned long flags;
158 long disabled;
159 int ret; 186 int ret;
160 int cpu;
161 int pc; 187 int pc;
162 188
163 cpu = raw_smp_processor_id(); 189 if (!func_prolog_dec(tr, &data, &flags))
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0; 190 return 0;
166 191
167 local_save_flags(flags); 192 pc = preempt_count();
168 /* slight chance to get a false positive on tracing_cpu */ 193 ret = __trace_graph_entry(tr, trace, flags, pc);
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled); 194 atomic_dec(&data->disabled);
195
182 return ret; 196 return ret;
183} 197}
184 198
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
187 struct trace_array *tr = irqsoff_trace; 201 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data; 202 struct trace_array_cpu *data;
189 unsigned long flags; 203 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc; 204 int pc;
193 205
194 cpu = raw_smp_processor_id(); 206 if (!func_prolog_dec(tr, &data, &flags))
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return; 207 return;
197 208
198 local_save_flags(flags); 209 pc = preempt_count();
199 /* slight chance to get a false positive on tracing_cpu */ 210 __trace_graph_return(tr, trace, flags, pc);
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled); 211 atomic_dec(&data->disabled);
212} 212}
213 213
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
229 229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{ 231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /* 232 /*
240 * In graph mode call the graph tracer output function, 233 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler 234 * otherwise go with the TRACE_FN event handler
242 */ 235 */
243 if (is_graph()) 236 if (is_graph())
244 return print_graph_function_flags(iter, flags); 237 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
245 238
246 return TRACE_TYPE_UNHANDLED; 239 return TRACE_TYPE_UNHANDLED;
247} 240}
248 241
249static void irqsoff_print_header(struct seq_file *s) 242static void irqsoff_print_header(struct seq_file *s)
250{ 243{
251 if (is_graph()) { 244 if (is_graph())
252 struct trace_iterator *iter = s->private; 245 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
253 u32 flags = GRAPH_TRACER_FLAGS; 246 else
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s); 247 trace_default_header(s);
268} 248}
269 249
270static void 250static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr, 251__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip, 252 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc) 253 unsigned long flags, int pc)
294{ 254{
295 if (!is_graph()) 255 if (is_graph())
256 trace_graph_function(tr, ip, parent_ip, flags, pc);
257 else
296 trace_function(tr, ip, parent_ip, flags, pc); 258 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301} 259}
302 260
303#else 261#else
@@ -649,6 +607,7 @@ static struct tracer irqsoff_tracer __read_mostly =
649#endif 607#endif
650 .open = irqsoff_trace_open, 608 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close, 609 .close = irqsoff_trace_close,
610 .use_max_tr = 1,
652}; 611};
653# define register_irqsoff(trace) register_tracer(&trace) 612# define register_irqsoff(trace) register_tracer(&trace)
654#else 613#else
@@ -681,6 +640,7 @@ static struct tracer preemptoff_tracer __read_mostly =
681#endif 640#endif
682 .open = irqsoff_trace_open, 641 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close, 642 .close = irqsoff_trace_close,
643 .use_max_tr = 1,
684}; 644};
685# define register_preemptoff(trace) register_tracer(&trace) 645# define register_preemptoff(trace) register_tracer(&trace)
686#else 646#else
@@ -715,6 +675,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
715#endif 675#endif
716 .open = irqsoff_trace_open, 676 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close, 677 .close = irqsoff_trace_close,
678 .use_max_tr = 1,
718}; 679};
719 680
720# define register_preemptirqsoff(trace) register_tracer(&trace) 681# define register_preemptirqsoff(trace) register_tracer(&trace)
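
The trace_irqsoff.c hunks above pull the shared prologue of the function and graph callbacks into func_prolog_dec(), which either returns 1 with data->disabled held (the caller decrements it when done) or 0 without touching it. A hedged, userspace sketch of just that increment-or-back-off contract, leaving out the tracing_cpu and irqs-off checks:

#include <stdatomic.h>
#include <stdio.h>

struct cpu_data {
	atomic_int disabled;
};

static struct cpu_data d;	/* stands in for one CPU's trace data */

/*
 * Returns 1 with d.disabled incremented (caller must decrement later),
 * or 0 with the counter left untouched -- the same contract as
 * func_prolog_dec() above, minus the per-CPU and irqs-off checks.
 */
static int prolog_dec(struct cpu_data *data)
{
	int disabled = atomic_fetch_add(&data->disabled, 1) + 1;

	if (disabled == 1)
		return 1;		/* first entry: go ahead and trace */

	atomic_fetch_sub(&data->disabled, 1);
	return 0;			/* re-entered: undo and bail out */
}

static void tracer_callback(void)
{
	if (!prolog_dec(&d))
		return;

	puts("trace this call");

	atomic_fetch_sub(&d.disabled, 1);	/* matching decrement */
}

int main(void)
{
	tracer_callback();
	return 0;
}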
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000000..3c5c5dfea0b3
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,135 @@
1/*
2 * kdb helper for dumping the ftrace buffer
3 *
4 * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
5 *
6 * ftrace_dump_buf based on ftrace_dump:
7 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
8 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
9 *
10 */
11#include <linux/init.h>
12#include <linux/kgdb.h>
13#include <linux/kdb.h>
14#include <linux/ftrace.h>
15
16#include "trace.h"
17#include "trace_output.h"
18
19static void ftrace_dump_buf(int skip_lines, long cpu_file)
20{
21 /* use static because iter can be a bit big for the stack */
22 static struct trace_iterator iter;
23 unsigned int old_userobj;
24 int cnt = 0, cpu;
25
26 trace_init_global_iter(&iter);
27
28 for_each_tracing_cpu(cpu) {
29 atomic_inc(&iter.tr->data[cpu]->disabled);
30 }
31
32 old_userobj = trace_flags;
33
34 /* don't look at user memory in panic mode */
35 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
36
37 kdb_printf("Dumping ftrace buffer:\n");
38
39 /* reset all but tr, trace, and overruns */
40 memset(&iter.seq, 0,
41 sizeof(struct trace_iterator) -
42 offsetof(struct trace_iterator, seq));
43 iter.iter_flags |= TRACE_FILE_LAT_FMT;
44 iter.pos = -1;
45
46 if (cpu_file == TRACE_PIPE_ALL_CPU) {
47 for_each_tracing_cpu(cpu) {
48 iter.buffer_iter[cpu] =
49 ring_buffer_read_prepare(iter.tr->buffer, cpu);
50 ring_buffer_read_start(iter.buffer_iter[cpu]);
51 tracing_iter_reset(&iter, cpu);
52 }
53 } else {
54 iter.cpu_file = cpu_file;
55 iter.buffer_iter[cpu_file] =
56 ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file);
59 }
60 if (!trace_empty(&iter))
61 trace_find_next_entry_inc(&iter);
62 while (!trace_empty(&iter)) {
63 if (!cnt)
64 kdb_printf("---------------------------------\n");
65 cnt++;
66
67 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
68 print_trace_line(&iter);
69 if (!skip_lines)
70 trace_printk_seq(&iter.seq);
71 else
72 skip_lines--;
73 if (KDB_FLAG(CMD_INTERRUPT))
74 goto out;
75 }
76
77 if (!cnt)
78 kdb_printf(" (ftrace buffer empty)\n");
79 else
80 kdb_printf("---------------------------------\n");
81
82out:
83 trace_flags = old_userobj;
84
85 for_each_tracing_cpu(cpu) {
86 atomic_dec(&iter.tr->data[cpu]->disabled);
87 }
88
89 for_each_tracing_cpu(cpu)
90 if (iter.buffer_iter[cpu])
91 ring_buffer_read_finish(iter.buffer_iter[cpu]);
92}
93
94/*
95 * kdb_ftdump - Dump the ftrace log buffer
96 */
97static int kdb_ftdump(int argc, const char **argv)
98{
99 int skip_lines = 0;
100 long cpu_file;
101 char *cp;
102
103 if (argc > 2)
104 return KDB_ARGCOUNT;
105
106 if (argc) {
107 skip_lines = simple_strtol(argv[1], &cp, 0);
108 if (*cp)
109 skip_lines = 0;
110 }
111
112 if (argc == 2) {
113 cpu_file = simple_strtol(argv[2], &cp, 0);
114 if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
115 !cpu_online(cpu_file))
116 return KDB_BADINT;
117 } else {
118 cpu_file = TRACE_PIPE_ALL_CPU;
119 }
120
121 kdb_trap_printk++;
122 ftrace_dump_buf(skip_lines, cpu_file);
123 kdb_trap_printk--;
124
125 return 0;
126}
127
128static __init int kdb_ftrace_register(void)
129{
130 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
131 "Dump ftrace log", 0, KDB_REPEAT_NONE);
132 return 0;
133}
134
135late_initcall(kdb_ftrace_register);
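As a usage sketch for the command registered above (illustrative only, not part of the patch), the kdb prompt accepts up to two optional arguments matching the "[skip_#lines] [cpu]" usage string:

	ftdump          - dump the ftrace buffer for every tracing CPU
	ftdump 25       - same, but skip the first 25 lines of output
	ftdump 0 2      - dump only CPU 2 (the CPU must be online and < NR_CPUS)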
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f52b5f50299d..2dec9bcde8b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,6 +30,7 @@
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h>
33#include <asm/bitsperlong.h> 34#include <asm/bitsperlong.h>
34 35
35#include "trace.h" 36#include "trace.h"
@@ -38,6 +39,7 @@
38#define MAX_TRACE_ARGS 128 39#define MAX_TRACE_ARGS 128
39#define MAX_ARGSTR_LEN 63 40#define MAX_ARGSTR_LEN 63
40#define MAX_EVENT_NAME_LEN 64 41#define MAX_EVENT_NAME_LEN 64
42#define MAX_STRING_SIZE PATH_MAX
41#define KPROBE_EVENT_SYSTEM "kprobes" 43#define KPROBE_EVENT_SYSTEM "kprobes"
42 44
43/* Reserved field names */ 45/* Reserved field names */
@@ -58,14 +60,16 @@ const char *reserved_field_names[] = {
58}; 60};
59 61
60/* Printing function type */ 62/* Printing function type */
61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); 63typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
64 void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type 65#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type 66#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64 67
65/* Printing in basic type function template */ 68/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ 69#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 70static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\ 71 const char *name, \
72 void *data, void *ent)\
69{ \ 73{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ 74 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \ 75} \
@@ -80,6 +84,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) 84DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) 85DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82 86
87/* data_rloc: data relative location, compatible with u32 */
88#define make_data_rloc(len, roffs) \
89 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
90#define get_rloc_len(dl) ((u32)(dl) >> 16)
91#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
92
93static inline void *get_rloc_data(u32 *dl)
94{
95 return (u8 *)dl + get_rloc_offs(*dl);
96}
97
98/* For data_loc conversion */
99static inline void *get_loc_data(u32 *dl, void *ent)
100{
101 return (u8 *)ent + get_rloc_offs(*dl);
102}
103
104/*
105 * Convert data_rloc to data_loc:
106 * data_rloc stores the offset from data_rloc itself, but data_loc
107 * stores the offset from event entry.
108 */
109#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
110
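For illustration only (a standalone fragment, not part of the patch, assuming the macros defined just above), the data_rloc packing works out as follows:

	u32 dl   = make_data_rloc(64, 12); /* (64 << 16) | 12              */
	int len  = get_rloc_len(dl);       /* 64: maximum/actual length    */
	int offs = get_rloc_offs(dl);      /* 12: offset from &dl itself   */

	/*
	 * convert_rloc_to_loc(dl, offs) later adds the entry-relative
	 * offset of the u32 slot, turning the rloc into the data_loc
	 * ("__data_loc") form that the ftrace event format expects.
	 */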
111/* For defining macros, define string/string_size types */
112typedef u32 string;
113typedef u32 string_size;
114
115/* Print type function for string type */
116static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
117 const char *name,
118 void *data, void *ent)
119{
120 int len = *(u32 *)data >> 16;
121
122 if (!len)
123 return trace_seq_printf(s, " %s=(fault)", name);
124 else
125 return trace_seq_printf(s, " %s=\"%s\"", name,
126 (const char *)get_loc_data(data, ent));
127}
128static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
129
83/* Data fetch function type */ 130/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); 131typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85 132
@@ -94,32 +141,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm,
94 return fprm->fn(regs, fprm->data, dest); 141 return fprm->fn(regs, fprm->data, dest);
95} 142}
96 143
97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type 144#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
98/* 145/*
99 * Define macro for basic types - we don't need to define s* types, because 146 * Define macro for basic types - we don't need to define s* types, because
100 * we have to care only about bitwidth at recording time. 147 * we have to care only about bitwidth at recording time.
101 */ 148 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \ 149#define DEFINE_BASIC_FETCH_FUNCS(method) \
103DEFINE_FETCH_##kind(u8) \ 150DEFINE_FETCH_##method(u8) \
104DEFINE_FETCH_##kind(u16) \ 151DEFINE_FETCH_##method(u16) \
105DEFINE_FETCH_##kind(u32) \ 152DEFINE_FETCH_##method(u32) \
106DEFINE_FETCH_##kind(u64) 153DEFINE_FETCH_##method(u64)
107 154
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ 155#define CHECK_FETCH_FUNCS(method, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \ 156 (((FETCH_FUNC_NAME(method, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \ 157 (FETCH_FUNC_NAME(method, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \ 158 (FETCH_FUNC_NAME(method, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn)) 159 (FETCH_FUNC_NAME(method, u64) == fn) || \
160 (FETCH_FUNC_NAME(method, string) == fn) || \
161 (FETCH_FUNC_NAME(method, string_size) == fn)) \
162 && (fn != NULL))
113 163
114/* Data fetch function templates */ 164/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \ 165#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 166static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \ 167 void *offset, void *dest) \
118{ \ 168{ \
119 *(type *)dest = (type)regs_get_register(regs, \ 169 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \ 170 (unsigned int)((unsigned long)offset)); \
121} 171}
122DEFINE_BASIC_FETCH_FUNCS(reg) 172DEFINE_BASIC_FETCH_FUNCS(reg)
173/* No string on the register */
174#define fetch_reg_string NULL
175#define fetch_reg_string_size NULL
123 176
124#define DEFINE_FETCH_stack(type) \ 177#define DEFINE_FETCH_stack(type) \
125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 178static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
@@ -129,6 +182,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
129 (unsigned int)((unsigned long)offset)); \ 182 (unsigned int)((unsigned long)offset)); \
130} 183}
131DEFINE_BASIC_FETCH_FUNCS(stack) 184DEFINE_BASIC_FETCH_FUNCS(stack)
185/* No string on the stack entry */
186#define fetch_stack_string NULL
187#define fetch_stack_string_size NULL
132 188
133#define DEFINE_FETCH_retval(type) \ 189#define DEFINE_FETCH_retval(type) \
134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ 190static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
@@ -137,6 +193,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
137 *(type *)dest = (type)regs_return_value(regs); \ 193 *(type *)dest = (type)regs_return_value(regs); \
138} 194}
139DEFINE_BASIC_FETCH_FUNCS(retval) 195DEFINE_BASIC_FETCH_FUNCS(retval)
196/* No string on the retval */
197#define fetch_retval_string NULL
198#define fetch_retval_string_size NULL
140 199
141#define DEFINE_FETCH_memory(type) \ 200#define DEFINE_FETCH_memory(type) \
142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 201static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
@@ -149,6 +208,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
149 *(type *)dest = retval; \ 208 *(type *)dest = retval; \
150} 209}
151DEFINE_BASIC_FETCH_FUNCS(memory) 210DEFINE_BASIC_FETCH_FUNCS(memory)
211/*
212 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
213 * length and relative data location.
214 */
215static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
216 void *addr, void *dest)
217{
218 long ret;
219 int maxlen = get_rloc_len(*(u32 *)dest);
220 u8 *dst = get_rloc_data(dest);
221 u8 *src = addr;
222 mm_segment_t old_fs = get_fs();
223 if (!maxlen)
224 return;
225 /*
226 * Try to get string again, since the string can be changed while
227 * probing.
228 */
229 set_fs(KERNEL_DS);
230 pagefault_disable();
231 do
232 ret = __copy_from_user_inatomic(dst++, src++, 1);
233 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
234 dst[-1] = '\0';
235 pagefault_enable();
236 set_fs(old_fs);
237
238 if (ret < 0) { /* Failed to fetch string */
239 ((u8 *)get_rloc_data(dest))[0] = '\0';
240 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
241 } else
242 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
243 get_rloc_offs(*(u32 *)dest));
244}
 245/* Return the length of the string, including the terminating null byte */

246static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
247 void *addr, void *dest)
248{
249 int ret, len = 0;
250 u8 c;
251 mm_segment_t old_fs = get_fs();
252
253 set_fs(KERNEL_DS);
254 pagefault_disable();
255 do {
256 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
257 len++;
258 } while (c && ret == 0 && len < MAX_STRING_SIZE);
259 pagefault_enable();
260 set_fs(old_fs);
261
262 if (ret < 0) /* Failed to check the length */
263 *(u32 *)dest = 0;
264 else
265 *(u32 *)dest = len;
266}
152 267
153/* Memory fetching by symbol */ 268/* Memory fetching by symbol */
154struct symbol_cache { 269struct symbol_cache {
@@ -203,6 +318,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
203 *(type *)dest = 0; \ 318 *(type *)dest = 0; \
204} 319}
205DEFINE_BASIC_FETCH_FUNCS(symbol) 320DEFINE_BASIC_FETCH_FUNCS(symbol)
321DEFINE_FETCH_symbol(string)
322DEFINE_FETCH_symbol(string_size)
206 323
207/* Dereference memory access function */ 324/* Dereference memory access function */
208struct deref_fetch_param { 325struct deref_fetch_param {
@@ -224,12 +341,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
224 *(type *)dest = 0; \ 341 *(type *)dest = 0; \
225} 342}
226DEFINE_BASIC_FETCH_FUNCS(deref) 343DEFINE_BASIC_FETCH_FUNCS(deref)
344DEFINE_FETCH_deref(string)
345DEFINE_FETCH_deref(string_size)
227 346
228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 347static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
229{ 348{
230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) 349 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
231 free_deref_fetch_param(data->orig.data); 350 free_deref_fetch_param(data->orig.data);
232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) 351 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
233 free_symbol_cache(data->orig.data); 352 free_symbol_cache(data->orig.data);
234 kfree(data); 353 kfree(data);
235} 354}
@@ -240,23 +359,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) 359#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) 360#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242 361
243#define ASSIGN_FETCH_FUNC(kind, type) \ 362/* Fetch types */
244 .kind = FETCH_FUNC_NAME(kind, type) 363enum {
245 364 FETCH_MTD_reg = 0,
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ 365 FETCH_MTD_stack,
247 {.name = #ptype, \ 366 FETCH_MTD_retval,
248 .size = sizeof(ftype), \ 367 FETCH_MTD_memory,
249 .is_signed = sign, \ 368 FETCH_MTD_symbol,
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \ 369 FETCH_MTD_deref,
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \ 370 FETCH_MTD_END,
252ASSIGN_FETCH_FUNC(reg, ftype), \ 371};
253ASSIGN_FETCH_FUNC(stack, ftype), \ 372
254ASSIGN_FETCH_FUNC(retval, ftype), \ 373#define ASSIGN_FETCH_FUNC(method, type) \
255ASSIGN_FETCH_FUNC(memory, ftype), \ 374 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
256ASSIGN_FETCH_FUNC(symbol, ftype), \ 375
257ASSIGN_FETCH_FUNC(deref, ftype), \ 376#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
377 {.name = _name, \
378 .size = _size, \
379 .is_signed = sign, \
380 .print = PRINT_TYPE_FUNC_NAME(ptype), \
381 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
382 .fmttype = _fmttype, \
383 .fetch = { \
384ASSIGN_FETCH_FUNC(reg, ftype), \
385ASSIGN_FETCH_FUNC(stack, ftype), \
386ASSIGN_FETCH_FUNC(retval, ftype), \
387ASSIGN_FETCH_FUNC(memory, ftype), \
388ASSIGN_FETCH_FUNC(symbol, ftype), \
389ASSIGN_FETCH_FUNC(deref, ftype), \
390 } \
258 } 391 }
259 392
393#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
394 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
395
396#define FETCH_TYPE_STRING 0
397#define FETCH_TYPE_STRSIZE 1
398
260/* Fetch type information table */ 399/* Fetch type information table */
261static const struct fetch_type { 400static const struct fetch_type {
262 const char *name; /* Name of type */ 401 const char *name; /* Name of type */
@@ -264,14 +403,16 @@ static const struct fetch_type {
264 int is_signed; /* Signed flag */ 403 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */ 404 print_type_func_t print; /* Print functions */
266 const char *fmt; /* Fromat string */ 405 const char *fmt; /* Fromat string */
406 const char *fmttype; /* Name in format file */
267 /* Fetch functions */ 407 /* Fetch functions */
268 fetch_func_t reg; 408 fetch_func_t fetch[FETCH_MTD_END];
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = { 409} fetch_type_table[] = {
410 /* Special types */
411 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
412 sizeof(u32), 1, "__data_loc char[]"),
413 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
414 string_size, sizeof(u32), 0, "u32"),
415 /* Basic types */
275 ASSIGN_FETCH_TYPE(u8, u8, 0), 416 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0), 417 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0), 418 ASSIGN_FETCH_TYPE(u32, u32, 0),
@@ -302,12 +443,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs,
302 *(unsigned long *)dest = kernel_stack_pointer(regs); 443 *(unsigned long *)dest = kernel_stack_pointer(regs);
303} 444}
304 445
446static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
447 fetch_func_t orig_fn)
448{
449 int i;
450
451 if (type != &fetch_type_table[FETCH_TYPE_STRING])
452 return NULL; /* Only string type needs size function */
453 for (i = 0; i < FETCH_MTD_END; i++)
454 if (type->fetch[i] == orig_fn)
455 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
456
457 WARN_ON(1); /* This should not happen */
458 return NULL;
459}
460
305/** 461/**
306 * Kprobe event core functions 462 * Kprobe event core functions
307 */ 463 */
308 464
309struct probe_arg { 465struct probe_arg {
310 struct fetch_param fetch; 466 struct fetch_param fetch;
467 struct fetch_param fetch_size;
311 unsigned int offset; /* Offset from argument entry */ 468 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */ 469 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */ 470 const char *comm; /* Command of this argument */
@@ -356,8 +513,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
356static int kretprobe_dispatcher(struct kretprobe_instance *ri, 513static int kretprobe_dispatcher(struct kretprobe_instance *ri,
357 struct pt_regs *regs); 514 struct pt_regs *regs);
358 515
359/* Check the name is good for event/group */ 516/* Check the name is good for event/group/fields */
360static int check_event_name(const char *name) 517static int is_good_name(const char *name)
361{ 518{
362 if (!isalpha(*name) && *name != '_') 519 if (!isalpha(*name) && *name != '_')
363 return 0; 520 return 0;
@@ -399,7 +556,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
399 else 556 else
400 tp->rp.kp.pre_handler = kprobe_dispatcher; 557 tp->rp.kp.pre_handler = kprobe_dispatcher;
401 558
402 if (!event || !check_event_name(event)) { 559 if (!event || !is_good_name(event)) {
403 ret = -EINVAL; 560 ret = -EINVAL;
404 goto error; 561 goto error;
405 } 562 }
@@ -409,7 +566,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
409 if (!tp->call.name) 566 if (!tp->call.name)
410 goto error; 567 goto error;
411 568
412 if (!group || !check_event_name(group)) { 569 if (!group || !is_good_name(group)) {
413 ret = -EINVAL; 570 ret = -EINVAL;
414 goto error; 571 goto error;
415 } 572 }
@@ -429,9 +586,9 @@ error:
429 586
430static void free_probe_arg(struct probe_arg *arg) 587static void free_probe_arg(struct probe_arg *arg)
431{ 588{
432 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) 589 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
433 free_deref_fetch_param(arg->fetch.data); 590 free_deref_fetch_param(arg->fetch.data);
434 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) 591 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
435 free_symbol_cache(arg->fetch.data); 592 free_symbol_cache(arg->fetch.data);
436 kfree(arg->name); 593 kfree(arg->name);
437 kfree(arg->comm); 594 kfree(arg->comm);
@@ -490,7 +647,7 @@ static int register_trace_probe(struct trace_probe *tp)
490 } 647 }
491 ret = register_probe_event(tp); 648 ret = register_probe_event(tp);
492 if (ret) { 649 if (ret) {
493 pr_warning("Faild to register probe event(%d)\n", ret); 650 pr_warning("Failed to register probe event(%d)\n", ret);
494 goto end; 651 goto end;
495 } 652 }
496 653
@@ -548,7 +705,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
548 705
549 if (strcmp(arg, "retval") == 0) { 706 if (strcmp(arg, "retval") == 0) {
550 if (is_return) 707 if (is_return)
551 f->fn = t->retval; 708 f->fn = t->fetch[FETCH_MTD_retval];
552 else 709 else
553 ret = -EINVAL; 710 ret = -EINVAL;
554 } else if (strncmp(arg, "stack", 5) == 0) { 711 } else if (strncmp(arg, "stack", 5) == 0) {
@@ -562,7 +719,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
562 if (ret || param > PARAM_MAX_STACK) 719 if (ret || param > PARAM_MAX_STACK)
563 ret = -EINVAL; 720 ret = -EINVAL;
564 else { 721 else {
565 f->fn = t->stack; 722 f->fn = t->fetch[FETCH_MTD_stack];
566 f->data = (void *)param; 723 f->data = (void *)param;
567 } 724 }
568 } else 725 } else
@@ -588,7 +745,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
588 case '%': /* named register */ 745 case '%': /* named register */
589 ret = regs_query_register_offset(arg + 1); 746 ret = regs_query_register_offset(arg + 1);
590 if (ret >= 0) { 747 if (ret >= 0) {
591 f->fn = t->reg; 748 f->fn = t->fetch[FETCH_MTD_reg];
592 f->data = (void *)(unsigned long)ret; 749 f->data = (void *)(unsigned long)ret;
593 ret = 0; 750 ret = 0;
594 } 751 }
@@ -598,7 +755,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
598 ret = strict_strtoul(arg + 1, 0, &param); 755 ret = strict_strtoul(arg + 1, 0, &param);
599 if (ret) 756 if (ret)
600 break; 757 break;
601 f->fn = t->memory; 758 f->fn = t->fetch[FETCH_MTD_memory];
602 f->data = (void *)param; 759 f->data = (void *)param;
603 } else { 760 } else {
604 ret = split_symbol_offset(arg + 1, &offset); 761 ret = split_symbol_offset(arg + 1, &offset);
@@ -606,7 +763,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
606 break; 763 break;
607 f->data = alloc_symbol_cache(arg + 1, offset); 764 f->data = alloc_symbol_cache(arg + 1, offset);
608 if (f->data) 765 if (f->data)
609 f->fn = t->symbol; 766 f->fn = t->fetch[FETCH_MTD_symbol];
610 } 767 }
611 break; 768 break;
612 case '+': /* deref memory */ 769 case '+': /* deref memory */
@@ -636,14 +793,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
636 if (ret) 793 if (ret)
637 kfree(dprm); 794 kfree(dprm);
638 else { 795 else {
639 f->fn = t->deref; 796 f->fn = t->fetch[FETCH_MTD_deref];
640 f->data = (void *)dprm; 797 f->data = (void *)dprm;
641 } 798 }
642 } 799 }
643 break; 800 break;
644 } 801 }
 645 if (!ret && !f->fn) 802 if (!ret && !f->fn) { /* Parsed, but no fetch method found */
803 pr_info("%s type has no corresponding fetch method.\n",
804 t->name);
646 ret = -EINVAL; 805 ret = -EINVAL;
806 }
647 return ret; 807 return ret;
648} 808}
649 809
@@ -652,6 +812,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
652 struct probe_arg *parg, int is_return) 812 struct probe_arg *parg, int is_return)
653{ 813{
654 const char *t; 814 const char *t;
815 int ret;
655 816
656 if (strlen(arg) > MAX_ARGSTR_LEN) { 817 if (strlen(arg) > MAX_ARGSTR_LEN) {
657 pr_info("Argument is too long.: %s\n", arg); 818 pr_info("Argument is too long.: %s\n", arg);
@@ -674,7 +835,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
674 } 835 }
675 parg->offset = tp->size; 836 parg->offset = tp->size;
676 tp->size += parg->type->size; 837 tp->size += parg->type->size;
677 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 838 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
839 if (ret >= 0) {
840 parg->fetch_size.fn = get_fetch_size_function(parg->type,
841 parg->fetch.fn);
842 parg->fetch_size.data = parg->fetch.data;
843 }
844 return ret;
678} 845}
679 846
680/* Return 1 if name is reserved or already used by another argument */ 847/* Return 1 if name is reserved or already used by another argument */
@@ -715,7 +882,7 @@ static int create_trace_probe(int argc, char **argv)
715 int i, ret = 0; 882 int i, ret = 0;
716 int is_return = 0, is_delete = 0; 883 int is_return = 0, is_delete = 0;
717 char *symbol = NULL, *event = NULL, *group = NULL; 884 char *symbol = NULL, *event = NULL, *group = NULL;
718 char *arg, *tmp; 885 char *arg;
719 unsigned long offset = 0; 886 unsigned long offset = 0;
720 void *addr = NULL; 887 void *addr = NULL;
721 char buf[MAX_EVENT_NAME_LEN]; 888 char buf[MAX_EVENT_NAME_LEN];
@@ -757,14 +924,17 @@ static int create_trace_probe(int argc, char **argv)
757 pr_info("Delete command needs an event name.\n"); 924 pr_info("Delete command needs an event name.\n");
758 return -EINVAL; 925 return -EINVAL;
759 } 926 }
927 mutex_lock(&probe_lock);
760 tp = find_probe_event(event, group); 928 tp = find_probe_event(event, group);
761 if (!tp) { 929 if (!tp) {
930 mutex_unlock(&probe_lock);
762 pr_info("Event %s/%s doesn't exist.\n", group, event); 931 pr_info("Event %s/%s doesn't exist.\n", group, event);
763 return -ENOENT; 932 return -ENOENT;
764 } 933 }
765 /* delete an event */ 934 /* delete an event */
766 unregister_trace_probe(tp); 935 unregister_trace_probe(tp);
767 free_trace_probe(tp); 936 free_trace_probe(tp);
937 mutex_unlock(&probe_lock);
768 return 0; 938 return 0;
769 } 939 }
770 940
@@ -821,26 +991,36 @@ static int create_trace_probe(int argc, char **argv)
821 /* parse arguments */ 991 /* parse arguments */
822 ret = 0; 992 ret = 0;
823 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 993 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
994 /* Increment count for freeing args in error case */
995 tp->nr_args++;
996
824 /* Parse argument name */ 997 /* Parse argument name */
825 arg = strchr(argv[i], '='); 998 arg = strchr(argv[i], '=');
826 if (arg) 999 if (arg) {
827 *arg++ = '\0'; 1000 *arg++ = '\0';
828 else 1001 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
1002 } else {
829 arg = argv[i]; 1003 arg = argv[i];
1004 /* If argument name is omitted, set "argN" */
1005 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
1006 tp->args[i].name = kstrdup(buf, GFP_KERNEL);
1007 }
830 1008
831 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
832 if (!tp->args[i].name) { 1009 if (!tp->args[i].name) {
833 pr_info("Failed to allocate argument%d name '%s'.\n", 1010 pr_info("Failed to allocate argument[%d] name.\n", i);
834 i, argv[i]);
835 ret = -ENOMEM; 1011 ret = -ENOMEM;
836 goto error; 1012 goto error;
837 } 1013 }
838 tmp = strchr(tp->args[i].name, ':'); 1014
839 if (tmp) 1015 if (!is_good_name(tp->args[i].name)) {
840 *tmp = '_'; /* convert : to _ */ 1016 pr_info("Invalid argument[%d] name: %s\n",
1017 i, tp->args[i].name);
1018 ret = -EINVAL;
1019 goto error;
1020 }
841 1021
842 if (conflict_field_name(tp->args[i].name, tp->args, i)) { 1022 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
843 pr_info("Argument%d name '%s' conflicts with " 1023 pr_info("Argument[%d] name '%s' conflicts with "
844 "another field.\n", i, argv[i]); 1024 "another field.\n", i, argv[i]);
845 ret = -EINVAL; 1025 ret = -EINVAL;
846 goto error; 1026 goto error;
@@ -849,12 +1029,9 @@ static int create_trace_probe(int argc, char **argv)
849 /* Parse fetch argument */ 1029 /* Parse fetch argument */
850 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); 1030 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
851 if (ret) { 1031 if (ret) {
852 pr_info("Parse error at argument%d. (%d)\n", i, ret); 1032 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
853 kfree(tp->args[i].name);
854 goto error; 1033 goto error;
855 } 1034 }
856
857 tp->nr_args++;
858 } 1035 }
859 1036
860 ret = register_trace_probe(tp); 1037 ret = register_trace_probe(tp);
@@ -1043,6 +1220,54 @@ static const struct file_operations kprobe_profile_ops = {
1043 .release = seq_release, 1220 .release = seq_release,
1044}; 1221};
1045 1222
 1223/* Sum up total data length for dynamic arrays (strings) */
1224static __kprobes int __get_data_size(struct trace_probe *tp,
1225 struct pt_regs *regs)
1226{
1227 int i, ret = 0;
1228 u32 len;
1229
1230 for (i = 0; i < tp->nr_args; i++)
1231 if (unlikely(tp->args[i].fetch_size.fn)) {
1232 call_fetch(&tp->args[i].fetch_size, regs, &len);
1233 ret += len;
1234 }
1235
1236 return ret;
1237}
1238
1239/* Store the value of each argument */
1240static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
1241 struct pt_regs *regs,
1242 u8 *data, int maxlen)
1243{
1244 int i;
1245 u32 end = tp->size;
1246 u32 *dl; /* Data (relative) location */
1247
1248 for (i = 0; i < tp->nr_args; i++) {
1249 if (unlikely(tp->args[i].fetch_size.fn)) {
1250 /*
1251 * First, we set the relative location and
1252 * maximum data length to *dl
1253 */
1254 dl = (u32 *)(data + tp->args[i].offset);
1255 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
1256 /* Then try to fetch string or dynamic array data */
1257 call_fetch(&tp->args[i].fetch, regs, dl);
1258 /* Reduce maximum length */
1259 end += get_rloc_len(*dl);
1260 maxlen -= get_rloc_len(*dl);
1261 /* Trick here, convert data_rloc to data_loc */
1262 *dl = convert_rloc_to_loc(*dl,
1263 ent_size + tp->args[i].offset);
1264 } else
1265 /* Just fetching data normally */
1266 call_fetch(&tp->args[i].fetch, regs,
1267 data + tp->args[i].offset);
1268 }
1269}
1270
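To make the two-pass scheme above concrete, a rough sketch (not from the patch; names and sizes are illustrative) of a recorded entry for a probe with one fixed-size argument and one string argument:

	/*
	 *  +---------------------+---------------------+----------------+
	 *  | entry header        | fixed argument area | dynamic area   |
	 *  | (ip / func, ret_ip) | arg0: u32 value     | "some text\0"  |
	 *  |                     | arg1: u32 data_loc  |                |
	 *  +---------------------+---------------------+----------------+
	 *
	 * __get_data_size() sums the lengths reported by the string_size
	 * fetch functions, so the ring-buffer event is reserved with
	 * sizeof(*entry) + tp->size + dsize bytes.  store_trace_args()
	 * then fills the fixed slots and appends the string bytes,
	 * rewriting each data_rloc into a data_loc whose offset is
	 * measured from the start of the entry.
	 */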
1046/* Kprobe handler */ 1271/* Kprobe handler */
1047static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1272static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1048{ 1273{
@@ -1050,8 +1275,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1050 struct kprobe_trace_entry_head *entry; 1275 struct kprobe_trace_entry_head *entry;
1051 struct ring_buffer_event *event; 1276 struct ring_buffer_event *event;
1052 struct ring_buffer *buffer; 1277 struct ring_buffer *buffer;
1053 u8 *data; 1278 int size, dsize, pc;
1054 int size, i, pc;
1055 unsigned long irq_flags; 1279 unsigned long irq_flags;
1056 struct ftrace_event_call *call = &tp->call; 1280 struct ftrace_event_call *call = &tp->call;
1057 1281
@@ -1060,7 +1284,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1060 local_save_flags(irq_flags); 1284 local_save_flags(irq_flags);
1061 pc = preempt_count(); 1285 pc = preempt_count();
1062 1286
1063 size = sizeof(*entry) + tp->size; 1287 dsize = __get_data_size(tp, regs);
1288 size = sizeof(*entry) + tp->size + dsize;
1064 1289
1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1290 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1066 size, irq_flags, pc); 1291 size, irq_flags, pc);
@@ -1069,9 +1294,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1069 1294
1070 entry = ring_buffer_event_data(event); 1295 entry = ring_buffer_event_data(event);
1071 entry->ip = (unsigned long)kp->addr; 1296 entry->ip = (unsigned long)kp->addr;
1072 data = (u8 *)&entry[1]; 1297 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1073 for (i = 0; i < tp->nr_args; i++)
1074 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1075 1298
1076 if (!filter_current_check_discard(buffer, call, entry, event)) 1299 if (!filter_current_check_discard(buffer, call, entry, event))
1077 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1300 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1085,15 +1308,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1085 struct kretprobe_trace_entry_head *entry; 1308 struct kretprobe_trace_entry_head *entry;
1086 struct ring_buffer_event *event; 1309 struct ring_buffer_event *event;
1087 struct ring_buffer *buffer; 1310 struct ring_buffer *buffer;
1088 u8 *data; 1311 int size, pc, dsize;
1089 int size, i, pc;
1090 unsigned long irq_flags; 1312 unsigned long irq_flags;
1091 struct ftrace_event_call *call = &tp->call; 1313 struct ftrace_event_call *call = &tp->call;
1092 1314
1093 local_save_flags(irq_flags); 1315 local_save_flags(irq_flags);
1094 pc = preempt_count(); 1316 pc = preempt_count();
1095 1317
1096 size = sizeof(*entry) + tp->size; 1318 dsize = __get_data_size(tp, regs);
1319 size = sizeof(*entry) + tp->size + dsize;
1097 1320
1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1321 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1099 size, irq_flags, pc); 1322 size, irq_flags, pc);
@@ -1103,9 +1326,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1103 entry = ring_buffer_event_data(event); 1326 entry = ring_buffer_event_data(event);
1104 entry->func = (unsigned long)tp->rp.kp.addr; 1327 entry->func = (unsigned long)tp->rp.kp.addr;
1105 entry->ret_ip = (unsigned long)ri->ret_addr; 1328 entry->ret_ip = (unsigned long)ri->ret_addr;
1106 data = (u8 *)&entry[1]; 1329 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1107 for (i = 0; i < tp->nr_args; i++)
1108 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1109 1330
1110 if (!filter_current_check_discard(buffer, call, entry, event)) 1331 if (!filter_current_check_discard(buffer, call, entry, event))
1111 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1332 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1137,7 +1358,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1137 data = (u8 *)&field[1]; 1358 data = (u8 *)&field[1];
1138 for (i = 0; i < tp->nr_args; i++) 1359 for (i = 0; i < tp->nr_args; i++)
1139 if (!tp->args[i].type->print(s, tp->args[i].name, 1360 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset)) 1361 data + tp->args[i].offset, field))
1141 goto partial; 1362 goto partial;
1142 1363
1143 if (!trace_seq_puts(s, "\n")) 1364 if (!trace_seq_puts(s, "\n"))
@@ -1179,7 +1400,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1179 data = (u8 *)&field[1]; 1400 data = (u8 *)&field[1];
1180 for (i = 0; i < tp->nr_args; i++) 1401 for (i = 0; i < tp->nr_args; i++)
1181 if (!tp->args[i].type->print(s, tp->args[i].name, 1402 if (!tp->args[i].type->print(s, tp->args[i].name,
1182 data + tp->args[i].offset)) 1403 data + tp->args[i].offset, field))
1183 goto partial; 1404 goto partial;
1184 1405
1185 if (!trace_seq_puts(s, "\n")) 1406 if (!trace_seq_puts(s, "\n"))
@@ -1214,11 +1435,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1214 } 1435 }
1215} 1436}
1216 1437
1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1218{
1219 return 0;
1220}
1221
1222#undef DEFINE_FIELD 1438#undef DEFINE_FIELD
1223#define DEFINE_FIELD(type, item, name, is_signed) \ 1439#define DEFINE_FIELD(type, item, name, is_signed) \
1224 do { \ 1440 do { \
@@ -1239,7 +1455,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1239 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1455 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1240 /* Set argument names as fields */ 1456 /* Set argument names as fields */
1241 for (i = 0; i < tp->nr_args; i++) { 1457 for (i = 0; i < tp->nr_args; i++) {
1242 ret = trace_define_field(event_call, tp->args[i].type->name, 1458 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1243 tp->args[i].name, 1459 tp->args[i].name,
1244 sizeof(field) + tp->args[i].offset, 1460 sizeof(field) + tp->args[i].offset,
1245 tp->args[i].type->size, 1461 tp->args[i].type->size,
@@ -1261,7 +1477,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1261 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1477 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1262 /* Set argument names as fields */ 1478 /* Set argument names as fields */
1263 for (i = 0; i < tp->nr_args; i++) { 1479 for (i = 0; i < tp->nr_args; i++) {
1264 ret = trace_define_field(event_call, tp->args[i].type->name, 1480 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1265 tp->args[i].name, 1481 tp->args[i].name,
1266 sizeof(field) + tp->args[i].offset, 1482 sizeof(field) + tp->args[i].offset,
1267 tp->args[i].type->size, 1483 tp->args[i].type->size,
@@ -1301,8 +1517,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1301 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1517 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1302 1518
1303 for (i = 0; i < tp->nr_args; i++) { 1519 for (i = 0; i < tp->nr_args; i++) {
1304 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", 1520 if (strcmp(tp->args[i].type->name, "string") == 0)
1305 tp->args[i].name); 1521 pos += snprintf(buf + pos, LEN_OR_ZERO,
1522 ", __get_str(%s)",
1523 tp->args[i].name);
1524 else
1525 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1526 tp->args[i].name);
1306 } 1527 }
1307 1528
1308#undef LEN_OR_ZERO 1529#undef LEN_OR_ZERO
@@ -1339,11 +1560,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1339 struct ftrace_event_call *call = &tp->call; 1560 struct ftrace_event_call *call = &tp->call;
1340 struct kprobe_trace_entry_head *entry; 1561 struct kprobe_trace_entry_head *entry;
1341 struct hlist_head *head; 1562 struct hlist_head *head;
1342 u8 *data; 1563 int size, __size, dsize;
1343 int size, __size, i;
1344 int rctx; 1564 int rctx;
1345 1565
1346 __size = sizeof(*entry) + tp->size; 1566 dsize = __get_data_size(tp, regs);
1567 __size = sizeof(*entry) + tp->size + dsize;
1347 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1568 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1348 size -= sizeof(u32); 1569 size -= sizeof(u32);
1349 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1570 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1355,9 +1576,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1355 return; 1576 return;
1356 1577
1357 entry->ip = (unsigned long)kp->addr; 1578 entry->ip = (unsigned long)kp->addr;
1358 data = (u8 *)&entry[1]; 1579 memset(&entry[1], 0, dsize);
1359 for (i = 0; i < tp->nr_args; i++) 1580 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1361 1581
1362 head = this_cpu_ptr(call->perf_events); 1582 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1583 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
@@ -1371,11 +1591,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1371 struct ftrace_event_call *call = &tp->call; 1591 struct ftrace_event_call *call = &tp->call;
1372 struct kretprobe_trace_entry_head *entry; 1592 struct kretprobe_trace_entry_head *entry;
1373 struct hlist_head *head; 1593 struct hlist_head *head;
1374 u8 *data; 1594 int size, __size, dsize;
1375 int size, __size, i;
1376 int rctx; 1595 int rctx;
1377 1596
1378 __size = sizeof(*entry) + tp->size; 1597 dsize = __get_data_size(tp, regs);
1598 __size = sizeof(*entry) + tp->size + dsize;
1379 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1599 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1380 size -= sizeof(u32); 1600 size -= sizeof(u32);
1381 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1601 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1388,9 +1608,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1388 1608
1389 entry->func = (unsigned long)tp->rp.kp.addr; 1609 entry->func = (unsigned long)tp->rp.kp.addr;
1390 entry->ret_ip = (unsigned long)ri->ret_addr; 1610 entry->ret_ip = (unsigned long)ri->ret_addr;
1391 data = (u8 *)&entry[1]; 1611 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1392 for (i = 0; i < tp->nr_args; i++)
1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1394 1612
1395 head = this_cpu_ptr(call->perf_events); 1613 head = this_cpu_ptr(call->perf_events);
1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1614 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
@@ -1486,15 +1704,12 @@ static int register_probe_event(struct trace_probe *tp)
1486 int ret; 1704 int ret;
1487 1705
1488 /* Initialize ftrace_event_call */ 1706 /* Initialize ftrace_event_call */
1707 INIT_LIST_HEAD(&call->class->fields);
1489 if (probe_is_return(tp)) { 1708 if (probe_is_return(tp)) {
1490 INIT_LIST_HEAD(&call->class->fields);
1491 call->event.funcs = &kretprobe_funcs; 1709 call->event.funcs = &kretprobe_funcs;
1492 call->class->raw_init = probe_event_raw_init;
1493 call->class->define_fields = kretprobe_event_define_fields; 1710 call->class->define_fields = kretprobe_event_define_fields;
1494 } else { 1711 } else {
1495 INIT_LIST_HEAD(&call->class->fields);
1496 call->event.funcs = &kprobe_funcs; 1712 call->event.funcs = &kprobe_funcs;
1497 call->class->raw_init = probe_event_raw_init;
1498 call->class->define_fields = kprobe_event_define_fields; 1713 call->class->define_fields = kprobe_event_define_fields;
1499 } 1714 }
1500 if (set_print_fmt(tp) < 0) 1715 if (set_print_fmt(tp) < 0)
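A hedged usage sketch of the new string type (the probed symbol, register, and debugfs mount point below are illustrative assumptions, not taken from the patch):

	echo 'p:myprobe do_sys_open path=+0(%si):string' \
		>> /sys/kernel/debug/tracing/kprobe_events
	cat /sys/kernel/debug/tracing/trace

With a ':string' suffix the argument is fetched through the string fetch methods added above and printed as path="..." by PRINT_TYPE_FUNC_NAME(string), instead of being recorded as a raw integer.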
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
deleted file mode 100644
index 8eaf00749b65..000000000000
--- a/kernel/trace/trace_ksym.c
+++ /dev/null
@@ -1,508 +0,0 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/fs.h>
28
29#include "trace_output.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35#include <asm/atomic.h>
36
37#define KSYM_TRACER_OP_LEN 3 /* rw- */
38
39struct trace_ksym {
40 struct perf_event **ksym_hbp;
41 struct perf_event_attr attr;
42#ifdef CONFIG_PROFILE_KSYM_TRACER
43 atomic64_t counter;
44#endif
45 struct hlist_node ksym_hlist;
46};
47
48static struct trace_array *ksym_trace_array;
49
50static unsigned int ksym_tracing_enabled;
51
52static HLIST_HEAD(ksym_filter_head);
53
54static DEFINE_MUTEX(ksym_tracer_mutex);
55
56#ifdef CONFIG_PROFILE_KSYM_TRACER
57
58#define MAX_UL_INT 0xffffffff
59
60void ksym_collect_stats(unsigned long hbp_hit_addr)
61{
62 struct hlist_node *node;
63 struct trace_ksym *entry;
64
65 rcu_read_lock();
66 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
67 if (entry->attr.bp_addr == hbp_hit_addr) {
68 atomic64_inc(&entry->counter);
69 break;
70 }
71 }
72 rcu_read_unlock();
73}
74#endif /* CONFIG_PROFILE_KSYM_TRACER */
75
76void ksym_hbp_handler(struct perf_event *hbp, int nmi,
77 struct perf_sample_data *data,
78 struct pt_regs *regs)
79{
80 struct ring_buffer_event *event;
81 struct ksym_trace_entry *entry;
82 struct ring_buffer *buffer;
83 int pc;
84
85 if (!ksym_tracing_enabled)
86 return;
87
88 buffer = ksym_trace_array->buffer;
89
90 pc = preempt_count();
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
93 sizeof(*entry), 0, pc);
94 if (!event)
95 return;
96
97 entry = ring_buffer_event_data(event);
98 entry->ip = instruction_pointer(regs);
99 entry->type = hw_breakpoint_type(hbp);
100 entry->addr = hw_breakpoint_addr(hbp);
101 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
102
103#ifdef CONFIG_PROFILE_KSYM_TRACER
104 ksym_collect_stats(hw_breakpoint_addr(hbp));
105#endif /* CONFIG_PROFILE_KSYM_TRACER */
106
107 trace_buffer_unlock_commit(buffer, event, 0, pc);
108}
109
110/* Valid access types are represented as
111 *
112 * rw- : Set Read/Write Access Breakpoint
113 * -w- : Set Write Access Breakpoint
114 * --- : Clear Breakpoints
115 * --x : Set Execution Break points (Not available yet)
116 *
117 */
118static int ksym_trace_get_access_type(char *str)
119{
120 int access = 0;
121
122 if (str[0] == 'r')
123 access |= HW_BREAKPOINT_R;
124
125 if (str[1] == 'w')
126 access |= HW_BREAKPOINT_W;
127
128 if (str[2] == 'x')
129 access |= HW_BREAKPOINT_X;
130
131 switch (access) {
132 case HW_BREAKPOINT_R:
133 case HW_BREAKPOINT_W:
134 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
135 return access;
136 default:
137 return -EINVAL;
138 }
139}
140
141/*
142 * There can be several possible malformed requests and we attempt to capture
143 * all of them. We enumerate some of the rules
144 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
145 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
146 * <module>:<ksym_name>:<op>.
147 * 2. No delimiter symbol ':' in the input string
148 * 3. Spurious operator symbols or symbols not in their respective positions
149 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
150 * 5. Kernel symbol not a part of /proc/kallsyms
151 * 6. Duplicate requests
152 */
153static int parse_ksym_trace_str(char *input_string, char **ksymname,
154 unsigned long *addr)
155{
156 int ret;
157
158 *ksymname = strsep(&input_string, ":");
159 *addr = kallsyms_lookup_name(*ksymname);
160
161 /* Check for malformed request: (2), (1) and (5) */
162 if ((!input_string) ||
163 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
164 (*addr == 0))
165 return -EINVAL;;
166
167 ret = ksym_trace_get_access_type(input_string);
168
169 return ret;
170}
171
172int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
173{
174 struct trace_ksym *entry;
175 int ret = -ENOMEM;
176
177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
178 if (!entry)
179 return -ENOMEM;
180
181 hw_breakpoint_init(&entry->attr);
182
183 entry->attr.bp_type = op;
184 entry->attr.bp_addr = addr;
185 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
186
187 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
188 ksym_hbp_handler);
189
190 if (IS_ERR(entry->ksym_hbp)) {
191 ret = PTR_ERR(entry->ksym_hbp);
192 if (ret == -ENOSPC) {
193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
199 goto err;
200 }
201
202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
203
204 return 0;
205
206err:
207 kfree(entry);
208
209 return ret;
210}
211
212static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
213 size_t count, loff_t *ppos)
214{
215 struct trace_ksym *entry;
216 struct hlist_node *node;
217 struct trace_seq *s;
218 ssize_t cnt = 0;
219 int ret;
220
221 s = kmalloc(sizeof(*s), GFP_KERNEL);
222 if (!s)
223 return -ENOMEM;
224 trace_seq_init(s);
225
226 mutex_lock(&ksym_tracer_mutex);
227
228 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
229 ret = trace_seq_printf(s, "%pS:",
230 (void *)(unsigned long)entry->attr.bp_addr);
231 if (entry->attr.bp_type == HW_BREAKPOINT_R)
232 ret = trace_seq_puts(s, "r--\n");
233 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
234 ret = trace_seq_puts(s, "-w-\n");
235 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
236 ret = trace_seq_puts(s, "rw-\n");
237 WARN_ON_ONCE(!ret);
238 }
239
240 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
241
242 mutex_unlock(&ksym_tracer_mutex);
243
244 kfree(s);
245
246 return cnt;
247}
248
249static void __ksym_trace_reset(void)
250{
251 struct trace_ksym *entry;
252 struct hlist_node *node, *node1;
253
254 mutex_lock(&ksym_tracer_mutex);
255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
256 ksym_hlist) {
257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
258 hlist_del_rcu(&(entry->ksym_hlist));
259 synchronize_rcu();
260 kfree(entry);
261 }
262 mutex_unlock(&ksym_tracer_mutex);
263}
264
265static ssize_t ksym_trace_filter_write(struct file *file,
266 const char __user *buffer,
267 size_t count, loff_t *ppos)
268{
269 struct trace_ksym *entry;
270 struct hlist_node *node;
271 char *buf, *input_string, *ksymname = NULL;
272 unsigned long ksym_addr = 0;
273 int ret, op, changed = 0;
274
275 buf = kzalloc(count + 1, GFP_KERNEL);
276 if (!buf)
277 return -ENOMEM;
278
279 ret = -EFAULT;
280 if (copy_from_user(buf, buffer, count))
281 goto out;
282
283 buf[count] = '\0';
284 input_string = strstrip(buf);
285
286 /*
287 * Clear all breakpoints if:
288 * 1: echo > ksym_trace_filter
289 * 2: echo 0 > ksym_trace_filter
290 * 3: echo "*:---" > ksym_trace_filter
291 */
292 if (!input_string[0] || !strcmp(input_string, "0") ||
293 !strcmp(input_string, "*:---")) {
294 __ksym_trace_reset();
295 ret = 0;
296 goto out;
297 }
298
299 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
300 if (ret < 0)
301 goto out;
302
303 mutex_lock(&ksym_tracer_mutex);
304
305 ret = -EINVAL;
306 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
307 if (entry->attr.bp_addr == ksym_addr) {
308 /* Check for malformed request: (6) */
309 if (entry->attr.bp_type != op)
310 changed = 1;
311 else
312 goto out_unlock;
313 break;
314 }
315 }
316 if (changed) {
317 unregister_wide_hw_breakpoint(entry->ksym_hbp);
318 entry->attr.bp_type = op;
319 ret = 0;
320 if (op > 0) {
321 entry->ksym_hbp =
322 register_wide_hw_breakpoint(&entry->attr,
323 ksym_hbp_handler);
324 if (IS_ERR(entry->ksym_hbp))
325 ret = PTR_ERR(entry->ksym_hbp);
326 else
327 goto out_unlock;
328 }
329 /* Error or "symbol:---" case: drop it */
330 hlist_del_rcu(&(entry->ksym_hlist));
331 synchronize_rcu();
332 kfree(entry);
333 goto out_unlock;
334 } else {
335 /* Check for malformed request: (4) */
336 if (op)
337 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
338 }
339out_unlock:
340 mutex_unlock(&ksym_tracer_mutex);
341out:
342 kfree(buf);
343 return !ret ? count : ret;
344}
345
346static const struct file_operations ksym_tracing_fops = {
347 .open = tracing_open_generic,
348 .read = ksym_trace_filter_read,
349 .write = ksym_trace_filter_write,
350};
351
352static void ksym_trace_reset(struct trace_array *tr)
353{
354 ksym_tracing_enabled = 0;
355 __ksym_trace_reset();
356}
357
358static int ksym_trace_init(struct trace_array *tr)
359{
360 int cpu, ret = 0;
361
362 for_each_online_cpu(cpu)
363 tracing_reset(tr, cpu);
364 ksym_tracing_enabled = 1;
365 ksym_trace_array = tr;
366
367 return ret;
368}
369
370static void ksym_trace_print_header(struct seq_file *m)
371{
372 seq_puts(m,
373 "# TASK-PID CPU# Symbol "
374 "Type Function\n");
375 seq_puts(m,
376 "# | | | "
377 " | |\n");
378}
379
380static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
381{
382 struct trace_entry *entry = iter->ent;
383 struct trace_seq *s = &iter->seq;
384 struct ksym_trace_entry *field;
385 char str[KSYM_SYMBOL_LEN];
386 int ret;
387
388 if (entry->type != TRACE_KSYM)
389 return TRACE_TYPE_UNHANDLED;
390
391 trace_assign_type(field, entry);
392
393 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
394 entry->pid, iter->cpu, (char *)field->addr);
395 if (!ret)
396 return TRACE_TYPE_PARTIAL_LINE;
397
398 switch (field->type) {
399 case HW_BREAKPOINT_R:
400 ret = trace_seq_printf(s, " R ");
401 break;
402 case HW_BREAKPOINT_W:
403 ret = trace_seq_printf(s, " W ");
404 break;
405 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
406 ret = trace_seq_printf(s, " RW ");
407 break;
408 default:
409 return TRACE_TYPE_PARTIAL_LINE;
410 }
411
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 sprint_symbol(str, field->ip);
416 ret = trace_seq_printf(s, "%s\n", str);
417 if (!ret)
418 return TRACE_TYPE_PARTIAL_LINE;
419
420 return TRACE_TYPE_HANDLED;
421}
422
423struct tracer ksym_tracer __read_mostly =
424{
425 .name = "ksym_tracer",
426 .init = ksym_trace_init,
427 .reset = ksym_trace_reset,
428#ifdef CONFIG_FTRACE_SELFTEST
429 .selftest = trace_selftest_startup_ksym,
430#endif
431 .print_header = ksym_trace_print_header,
432 .print_line = ksym_trace_output
433};
434
435#ifdef CONFIG_PROFILE_KSYM_TRACER
436static int ksym_profile_show(struct seq_file *m, void *v)
437{
438 struct hlist_node *node;
439 struct trace_ksym *entry;
440 int access_type = 0;
441 char fn_name[KSYM_NAME_LEN];
442
443 seq_puts(m, " Access Type ");
444 seq_puts(m, " Symbol Counter\n");
445 seq_puts(m, " ----------- ");
446 seq_puts(m, " ------ -------\n");
447
448 rcu_read_lock();
449 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
450
451 access_type = entry->attr.bp_type;
452
453 switch (access_type) {
454 case HW_BREAKPOINT_R:
455 seq_puts(m, " R ");
456 break;
457 case HW_BREAKPOINT_W:
458 seq_puts(m, " W ");
459 break;
460 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
461 seq_puts(m, " RW ");
462 break;
463 default:
464 seq_puts(m, " NA ");
465 }
466
467 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
468 seq_printf(m, " %-36s", fn_name);
469 else
470 seq_printf(m, " %-36s", "<NA>");
471 seq_printf(m, " %15llu\n",
472 (unsigned long long)atomic64_read(&entry->counter));
473 }
474 rcu_read_unlock();
475
476 return 0;
477}
478
479static int ksym_profile_open(struct inode *node, struct file *file)
480{
481 return single_open(file, ksym_profile_show, NULL);
482}
483
484static const struct file_operations ksym_profile_fops = {
485 .open = ksym_profile_open,
486 .read = seq_read,
487 .llseek = seq_lseek,
488 .release = single_release,
489};
490#endif /* CONFIG_PROFILE_KSYM_TRACER */
491
492__init static int init_ksym_trace(void)
493{
494 struct dentry *d_tracer;
495
496 d_tracer = tracing_init_dentry();
497
498 trace_create_file("ksym_trace_filter", 0644, d_tracer,
499 NULL, &ksym_tracing_fops);
500
501#ifdef CONFIG_PROFILE_KSYM_TRACER
502 trace_create_file("ksym_profile", 0444, d_tracer,
503 NULL, &ksym_profile_fops);
504#endif
505
506 return register_tracer(&ksym_tracer);
507}
508device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57c1b4596470..02272baa2206 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,9 +16,6 @@
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18 18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 20
24static int next_event_type = __TRACE_LAST_TYPE + 1; 21static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = {
1069 .funcs = &trace_wake_funcs, 1066 .funcs = &trace_wake_funcs,
1070}; 1067};
1071 1068
1072/* TRACE_SPECIAL */
1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1074 int flags, struct trace_event *event)
1075{
1076 struct special_entry *field;
1077
1078 trace_assign_type(field, iter->ent);
1079
1080 if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
1081 field->arg1,
1082 field->arg2,
1083 field->arg3))
1084 return TRACE_TYPE_PARTIAL_LINE;
1085
1086 return TRACE_TYPE_HANDLED;
1087}
1088
1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1090 int flags, struct trace_event *event)
1091{
1092 struct special_entry *field;
1093 struct trace_seq *s = &iter->seq;
1094
1095 trace_assign_type(field, iter->ent);
1096
1097 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
1098 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
1099 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
1100
1101 return TRACE_TYPE_HANDLED;
1102}
1103
1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1105 int flags, struct trace_event *event)
1106{
1107 struct special_entry *field;
1108 struct trace_seq *s = &iter->seq;
1109
1110 trace_assign_type(field, iter->ent);
1111
1112 SEQ_PUT_FIELD_RET(s, field->arg1);
1113 SEQ_PUT_FIELD_RET(s, field->arg2);
1114 SEQ_PUT_FIELD_RET(s, field->arg3);
1115
1116 return TRACE_TYPE_HANDLED;
1117}
1118
1119static struct trace_event_functions trace_special_funcs = {
1120 .trace = trace_special_print,
1121 .raw = trace_special_print,
1122 .hex = trace_special_hex,
1123 .binary = trace_special_bin,
1124};
1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1131/* TRACE_STACK */ 1069/* TRACE_STACK */
1132 1070
1133static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1071static enum print_line_t trace_stack_print(struct trace_iterator *iter,
@@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1161 1099
1162static struct trace_event_functions trace_stack_funcs = { 1100static struct trace_event_functions trace_stack_funcs = {
1163 .trace = trace_stack_print, 1101 .trace = trace_stack_print,
1164 .raw = trace_special_print,
1165 .hex = trace_special_hex,
1166 .binary = trace_special_bin,
1167}; 1102};
1168 1103
1169static struct trace_event trace_stack_event = { 1104static struct trace_event trace_stack_event = {
@@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1194 1129
1195static struct trace_event_functions trace_user_stack_funcs = { 1130static struct trace_event_functions trace_user_stack_funcs = {
1196 .trace = trace_user_stack_print, 1131 .trace = trace_user_stack_print,
1197 .raw = trace_special_print,
1198 .hex = trace_special_hex,
1199 .binary = trace_special_bin,
1200}; 1132};
1201 1133
1202static struct trace_event trace_user_stack_event = { 1134static struct trace_event trace_user_stack_event = {
@@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = {
1314 &trace_fn_event, 1246 &trace_fn_event,
1315 &trace_ctx_event, 1247 &trace_ctx_event,
1316 &trace_wake_event, 1248 &trace_wake_event,
1317 &trace_special_event,
1318 &trace_stack_event, 1249 &trace_stack_event,
1319 &trace_user_stack_event, 1250 &trace_user_stack_event,
1320 &trace_bprint_event, 1251 &trace_bprint_event,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0e73bc2ef8c5..7319559ed59f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,50 +31,99 @@ static int wakeup_rt;
31static arch_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void wakeup_reset(struct trace_array *tr);
34static void __wakeup_reset(struct trace_array *tr); 35static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
35 38
36static int save_lat_flag; 39static int save_lat_flag;
37 40
41#define TRACE_DISPLAY_GRAPH 1
42
43static struct tracer_opt trace_opts[] = {
44#ifdef CONFIG_FUNCTION_GRAPH_TRACER
45 /* display latency trace as call graph */
46 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
47#endif
48 { } /* Empty entry */
49};
50
51static struct tracer_flags tracer_flags = {
52 .val = 0,
53 .opts = trace_opts,
54};
55
56#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
57
38#ifdef CONFIG_FUNCTION_TRACER 58#ifdef CONFIG_FUNCTION_TRACER
59
39/* 60/*
40 * irqsoff uses its own tracer function to keep the overhead down: 61 * Prologue for the wakeup function tracers.
62 *
 63 * Returns 1 if it is OK to continue; in that case preemption
 64 * has been disabled and data->disabled has been incremented.
 65 * Returns 0 if the trace is to be ignored; in that case
 66 * preemption is left enabled and data->disabled is
 67 * kept the same.
68 *
69 * Note, this function is also used outside this ifdef but
70 * inside the #ifdef of the function graph tracer below.
71 * This is OK, since the function graph tracer is
72 * dependent on the function tracer.
41 */ 73 */
42static void 74static int
43wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) 75func_prolog_preempt_disable(struct trace_array *tr,
76 struct trace_array_cpu **data,
77 int *pc)
44{ 78{
45 struct trace_array *tr = wakeup_trace;
46 struct trace_array_cpu *data;
47 unsigned long flags;
48 long disabled; 79 long disabled;
49 int resched;
50 int cpu; 80 int cpu;
51 int pc;
52 81
53 if (likely(!wakeup_task)) 82 if (likely(!wakeup_task))
54 return; 83 return 0;
55 84
56 pc = preempt_count(); 85 *pc = preempt_count();
57 resched = ftrace_preempt_disable(); 86 preempt_disable_notrace();
58 87
59 cpu = raw_smp_processor_id(); 88 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu) 89 if (cpu != wakeup_current_cpu)
61 goto out_enable; 90 goto out_enable;
62 91
63 data = tr->data[cpu]; 92 *data = tr->data[cpu];
64 disabled = atomic_inc_return(&data->disabled); 93 disabled = atomic_inc_return(&(*data)->disabled);
65 if (unlikely(disabled != 1)) 94 if (unlikely(disabled != 1))
66 goto out; 95 goto out;
67 96
68 local_irq_save(flags); 97 return 1;
69 98
70 trace_function(tr, ip, parent_ip, flags, pc); 99out:
100 atomic_dec(&(*data)->disabled);
101
102out_enable:
103 preempt_enable_notrace();
104 return 0;
105}
71 106
107/*
108 * wakeup uses its own tracer function to keep the overhead down:
109 */
110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
112{
113 struct trace_array *tr = wakeup_trace;
114 struct trace_array_cpu *data;
115 unsigned long flags;
116 int pc;
117
118 if (!func_prolog_preempt_disable(tr, &data, &pc))
119 return;
120
121 local_irq_save(flags);
122 trace_function(tr, ip, parent_ip, flags, pc);
72 local_irq_restore(flags); 123 local_irq_restore(flags);
73 124
74 out:
75 atomic_dec(&data->disabled); 125 atomic_dec(&data->disabled);
76 out_enable: 126 preempt_enable_notrace();
77 ftrace_preempt_enable(resched);
78} 127}
79 128
80static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
@@ -83,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
83}; 132};
84#endif /* CONFIG_FUNCTION_TRACER */ 133#endif /* CONFIG_FUNCTION_TRACER */
85 134
135static int start_func_tracer(int graph)
136{
137 int ret;
138
139 if (!graph)
140 ret = register_ftrace_function(&trace_ops);
141 else
142 ret = register_ftrace_graph(&wakeup_graph_return,
143 &wakeup_graph_entry);
144
145 if (!ret && tracing_is_enabled())
146 tracer_enabled = 1;
147 else
148 tracer_enabled = 0;
149
150 return ret;
151}
152
153static void stop_func_tracer(int graph)
154{
155 tracer_enabled = 0;
156
157 if (!graph)
158 unregister_ftrace_function(&trace_ops);
159 else
160 unregister_ftrace_graph();
161}
162
163#ifdef CONFIG_FUNCTION_GRAPH_TRACER
164static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
165{
166
167 if (!(bit & TRACE_DISPLAY_GRAPH))
168 return -EINVAL;
169
170 if (!(is_graph() ^ set))
171 return 0;
172
173 stop_func_tracer(!set);
174
175 wakeup_reset(wakeup_trace);
176 tracing_max_latency = 0;
177
178 return start_func_tracer(set);
179}
180
181static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
182{
183 struct trace_array *tr = wakeup_trace;
184 struct trace_array_cpu *data;
185 unsigned long flags;
186 int pc, ret = 0;
187
188 if (!func_prolog_preempt_disable(tr, &data, &pc))
189 return 0;
190
191 local_save_flags(flags);
192 ret = __trace_graph_entry(tr, trace, flags, pc);
193 atomic_dec(&data->disabled);
194 preempt_enable_notrace();
195
196 return ret;
197}
198
199static void wakeup_graph_return(struct ftrace_graph_ret *trace)
200{
201 struct trace_array *tr = wakeup_trace;
202 struct trace_array_cpu *data;
203 unsigned long flags;
204 int pc;
205
206 if (!func_prolog_preempt_disable(tr, &data, &pc))
207 return;
208
209 local_save_flags(flags);
210 __trace_graph_return(tr, trace, flags, pc);
211 atomic_dec(&data->disabled);
212
213 preempt_enable_notrace();
214 return;
215}
216
217static void wakeup_trace_open(struct trace_iterator *iter)
218{
219 if (is_graph())
220 graph_trace_open(iter);
221}
222
223static void wakeup_trace_close(struct trace_iterator *iter)
224{
225 if (iter->private)
226 graph_trace_close(iter);
227}
228
229#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
230
231static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
232{
233 /*
234 * In graph mode call the graph tracer output function,
235 * otherwise go with the TRACE_FN event handler
236 */
237 if (is_graph())
238 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
239
240 return TRACE_TYPE_UNHANDLED;
241}
242
243static void wakeup_print_header(struct seq_file *s)
244{
245 if (is_graph())
246 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
247 else
248 trace_default_header(s);
249}
250
251static void
252__trace_function(struct trace_array *tr,
253 unsigned long ip, unsigned long parent_ip,
254 unsigned long flags, int pc)
255{
256 if (is_graph())
257 trace_graph_function(tr, ip, parent_ip, flags, pc);
258 else
259 trace_function(tr, ip, parent_ip, flags, pc);
260}
261#else
262#define __trace_function trace_function
263
264static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
265{
266 return -EINVAL;
267}
268
269static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
270{
271 return -1;
272}
273
274static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
275{
276 return TRACE_TYPE_UNHANDLED;
277}
278
279static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
280static void wakeup_print_header(struct seq_file *s) { }
281static void wakeup_trace_open(struct trace_iterator *iter) { }
282static void wakeup_trace_close(struct trace_iterator *iter) { }
283#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
284
86/* 285/*
87 * Should this new latency be reported/recorded? 286 * Should this new latency be reported/recorded?
88 */ 287 */
@@ -153,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
153 /* The task we are waiting for is waking up */ 352 /* The task we are waiting for is waking up */
154 data = wakeup_trace->data[wakeup_cpu]; 353 data = wakeup_trace->data[wakeup_cpu];
155 354
156 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 355 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
157 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 356 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
158 357
159 T0 = data->preempt_timestamp; 358 T0 = data->preempt_timestamp;
@@ -253,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
 253 * is not called by an assembly function (whereas schedule is) 452
254 * it should be safe to use it here. 453 * it should be safe to use it here.
255 */ 454 */
256 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 455 __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
257 456
258out_locked: 457out_locked:
259 arch_spin_unlock(&wakeup_lock); 458 arch_spin_unlock(&wakeup_lock);
@@ -304,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
304 */ 503 */
305 smp_wmb(); 504 smp_wmb();
306 505
307 register_ftrace_function(&trace_ops); 506 if (start_func_tracer(is_graph()))
308 507 printk(KERN_ERR "failed to start wakeup tracer\n");
309 if (tracing_is_enabled())
310 tracer_enabled = 1;
311 else
312 tracer_enabled = 0;
313 508
314 return; 509 return;
315fail_deprobe_wake_new: 510fail_deprobe_wake_new:
@@ -321,7 +516,7 @@ fail_deprobe:
321static void stop_wakeup_tracer(struct trace_array *tr) 516static void stop_wakeup_tracer(struct trace_array *tr)
322{ 517{
323 tracer_enabled = 0; 518 tracer_enabled = 0;
324 unregister_ftrace_function(&trace_ops); 519 stop_func_tracer(is_graph());
325 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 520 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
326 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 521 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
327 unregister_trace_sched_wakeup(probe_wakeup, NULL); 522 unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -380,9 +575,16 @@ static struct tracer wakeup_tracer __read_mostly =
380 .start = wakeup_tracer_start, 575 .start = wakeup_tracer_start,
381 .stop = wakeup_tracer_stop, 576 .stop = wakeup_tracer_stop,
382 .print_max = 1, 577 .print_max = 1,
578 .print_header = wakeup_print_header,
579 .print_line = wakeup_print_line,
580 .flags = &tracer_flags,
581 .set_flag = wakeup_set_flag,
383#ifdef CONFIG_FTRACE_SELFTEST 582#ifdef CONFIG_FTRACE_SELFTEST
384 .selftest = trace_selftest_startup_wakeup, 583 .selftest = trace_selftest_startup_wakeup,
385#endif 584#endif
585 .open = wakeup_trace_open,
586 .close = wakeup_trace_close,
587 .use_max_tr = 1,
386}; 588};
387 589
388static struct tracer wakeup_rt_tracer __read_mostly = 590static struct tracer wakeup_rt_tracer __read_mostly =
@@ -394,9 +596,16 @@ static struct tracer wakeup_rt_tracer __read_mostly =
394 .stop = wakeup_tracer_stop, 596 .stop = wakeup_tracer_stop,
395 .wait_pipe = poll_wait_pipe, 597 .wait_pipe = poll_wait_pipe,
396 .print_max = 1, 598 .print_max = 1,
599 .print_header = wakeup_print_header,
600 .print_line = wakeup_print_line,
601 .flags = &tracer_flags,
602 .set_flag = wakeup_set_flag,
397#ifdef CONFIG_FTRACE_SELFTEST 603#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 604 .selftest = trace_selftest_startup_wakeup,
399#endif 605#endif
606 .open = wakeup_trace_open,
607 .close = wakeup_trace_close,
608 .use_max_tr = 1,
400}; 609};
401 610
402__init static int init_wakeup_tracer(void) 611__init static int init_wakeup_tracer(void)
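The trace_sched_wakeup.c hunk above folds the duplicated "may I trace right now?" checks of the wakeup function tracer and the two graph callbacks into one prologue, func_prolog_preempt_disable(). Below is a minimal user-space sketch of that shape only; the struct, the prolog()/tracer_call()/graph_entry() names and the plain int standing in for the atomic disabled counter are illustrative, not kernel APIs.

/*
 * Minimal user-space sketch (toy types, not kernel code): a shared
 * prologue decides whether tracing may proceed; each callback bails
 * out early on failure and drops the "disabled" count when done.
 */
#include <stdio.h>

struct fake_cpu_data {
	int disabled;			/* models trace_array_cpu->disabled */
};

static struct fake_cpu_data cpu_data;
static int wakeup_task_set = 1;		/* models "wakeup_task != NULL" */

/* Returns 1 if the caller may trace; 0 means skip and touch nothing. */
static int prolog(struct fake_cpu_data **data)
{
	if (!wakeup_task_set)
		return 0;

	*data = &cpu_data;
	if (++(*data)->disabled != 1) {	/* already inside a trace callback */
		(*data)->disabled--;
		return 0;
	}
	return 1;
}

static void tracer_call(void)		/* models wakeup_tracer_call() */
{
	struct fake_cpu_data *data;

	if (!prolog(&data))
		return;
	printf("trace_function()\n");	/* the real tracing work */
	data->disabled--;		/* mirrors atomic_dec() in the hunk */
}

static int graph_entry(void)		/* models wakeup_graph_entry() */
{
	struct fake_cpu_data *data;

	if (!prolog(&data))
		return 0;
	printf("__trace_graph_entry()\n");
	data->disabled--;
	return 1;
}

int main(void)
{
	tracer_call();
	return graph_entry() ? 0 : 1;
}

The point of the refactor is that every caller takes the same early-exit path, and the bookkeeping (preemption off, disabled count held) lives in exactly one place.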
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 250e7f9bd2f0..155a415b3209 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_WAKE: 13 case TRACE_WAKE:
14 case TRACE_STACK: 14 case TRACE_STACK:
15 case TRACE_PRINT: 15 case TRACE_PRINT:
16 case TRACE_SPECIAL:
17 case TRACE_BRANCH: 16 case TRACE_BRANCH:
18 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
19 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
20 case TRACE_KSYM:
21 return 1; 19 return 1;
22 } 20 }
23 return 0; 21 return 0;
@@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
691} 689}
692#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 690#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
693 691
694#ifdef CONFIG_SYSPROF_TRACER
695int
696trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
697{
698 unsigned long count;
699 int ret;
700
701 /* start the tracing */
702 ret = tracer_init(trace, tr);
703 if (ret) {
704 warn_failed_init_tracer(trace, ret);
705 return ret;
706 }
707
708 /* Sleep for a 1/10 of a second */
709 msleep(100);
710 /* stop the tracing. */
711 tracing_stop();
712 /* check the trace buffer */
713 ret = trace_test_buffer(tr, &count);
714 trace->reset(tr);
715 tracing_start();
716
717 if (!ret && !count) {
718 printk(KERN_CONT ".. no entries found ..");
719 ret = -1;
720 }
721
722 return ret;
723}
724#endif /* CONFIG_SYSPROF_TRACER */
725
726#ifdef CONFIG_BRANCH_TRACER 692#ifdef CONFIG_BRANCH_TRACER
727int 693int
728trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) 694trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
@@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
755} 721}
756#endif /* CONFIG_BRANCH_TRACER */ 722#endif /* CONFIG_BRANCH_TRACER */
757 723
758#ifdef CONFIG_KSYM_TRACER
759static int ksym_selftest_dummy;
760
761int
762trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
763{
764 unsigned long count;
765 int ret;
766
767 /* start the tracing */
768 ret = tracer_init(trace, tr);
769 if (ret) {
770 warn_failed_init_tracer(trace, ret);
771 return ret;
772 }
773
774 ksym_selftest_dummy = 0;
775 /* Register the read-write tracing request */
776
777 ret = process_new_ksym_entry("ksym_selftest_dummy",
778 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
779 (unsigned long)(&ksym_selftest_dummy));
780
781 if (ret < 0) {
782 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
783 goto ret_path;
784 }
785 /* Perform a read and a write operation over the dummy variable to
786 * trigger the tracer
787 */
788 if (ksym_selftest_dummy == 0)
789 ksym_selftest_dummy++;
790
791 /* stop the tracing. */
792 tracing_stop();
793 /* check the trace buffer */
794 ret = trace_test_buffer(tr, &count);
795 trace->reset(tr);
796 tracing_start();
797
798 /* read & write operations - one each is performed on the dummy variable
799 * triggering two entries in the trace buffer
800 */
801 if (!ret && count != 2) {
802 printk(KERN_CONT "Ksym tracer startup test failed");
803 ret = -1;
804 }
805
806ret_path:
807 return ret;
808}
809#endif /* CONFIG_KSYM_TRACER */
810
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f4bc9b27de5f..4c5dead0c239 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -110,12 +110,12 @@ static inline void check_stack(void)
110static void 110static void
111stack_trace_call(unsigned long ip, unsigned long parent_ip) 111stack_trace_call(unsigned long ip, unsigned long parent_ip)
112{ 112{
113 int cpu, resched; 113 int cpu;
114 114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return; 116 return;
117 117
118 resched = ftrace_preempt_disable(); 118 preempt_disable_notrace();
119 119
120 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
121 /* no atomic needed, we only modify this variable by this cpu */ 121 /* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
127 out: 127 out:
128 per_cpu(trace_active, cpu)--; 128 per_cpu(trace_active, cpu)--;
129 /* prevent recursion in schedule */ 129 /* prevent recursion in schedule */
130 ftrace_preempt_enable(resched); 130 preempt_enable_notrace();
131} 131}
132 132
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
@@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = {
195 .open = tracing_open_generic, 195 .open = tracing_open_generic,
196 .read = stack_max_size_read, 196 .read = stack_max_size_read,
197 .write = stack_max_size_write, 197 .write = stack_max_size_write,
198 .llseek = default_llseek,
198}; 199};
199 200
200static void * 201static void *
@@ -249,7 +250,7 @@ static int trace_lookup_stack(struct seq_file *m, long i)
249{ 250{
250 unsigned long addr = stack_dump_trace[i]; 251 unsigned long addr = stack_dump_trace[i];
251 252
252 return seq_printf(m, "%pF\n", (void *)addr); 253 return seq_printf(m, "%pS\n", (void *)addr);
253} 254}
254 255
255static void print_disabled(struct seq_file *m) 256static void print_disabled(struct seq_file *m)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 34e35804304b..bac752f0cfb5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
26static struct list_head * 29static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call) 30syscall_get_enter_fields(struct ftrace_event_call *call)
28{ 31{
@@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34static struct list_head * 37static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call) 38syscall_get_exit_fields(struct ftrace_event_call *call)
36{ 39{
37 struct syscall_metadata *entry = call->data; 40 return &syscall_exit_fields;
38
39 return &entry->exit_fields;
40} 41}
41 42
42struct trace_event_functions enter_syscall_print_funcs = { 43struct trace_event_functions enter_syscall_print_funcs = {
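The trace_syscalls.c hunk replaces the per-metadata exit_fields list with one shared static list, on the grounds (per the added comment) that every syscall exit event has the same fields. A rough user-space sketch of that "one shared descriptor" shape follows; the types are toys, and "nr, ret" is an assumed field set used only for illustration.

/*
 * Toy model of "one shared field list for every syscall exit event".
 * Only the shape (every event resolves to the same static descriptor)
 * mirrors the hunk.
 */
#include <stdio.h>

struct field_list {
	const char *fields;
};

/* Shared by all exit events, as the hunk's static LIST_HEAD is. */
static struct field_list syscall_exit_fields = { "nr, ret" };

struct event {
	const char *name;
};

static struct field_list *get_exit_fields(struct event *ev)
{
	(void)ev;			/* every exit event maps here */
	return &syscall_exit_fields;
}

int main(void)
{
	struct event open_exit = { "sys_exit_open" };
	struct event read_exit = { "sys_exit_read" };

	printf("%s -> %s\n", open_exit.name,
	       get_exit_fields(&open_exit)->fields);
	printf("shared descriptor: %s\n",
	       get_exit_fields(&open_exit) == get_exit_fields(&read_exit) ?
	       "yes" : "no");
	return 0;
}

Returning a pointer to the single shared list lets the field description be shared rather than duplicated per syscall.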
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
deleted file mode 100644
index a7974a552ca9..000000000000
--- a/kernel/trace/trace_sysprof.c
+++ /dev/null
@@ -1,329 +0,0 @@
1/*
2 * trace stack traces
3 *
4 * Copyright (C) 2004-2008, Soeren Sandmann
5 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
6 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/kallsyms.h>
9#include <linux/debugfs.h>
10#include <linux/hrtimer.h>
11#include <linux/uaccess.h>
12#include <linux/ftrace.h>
13#include <linux/module.h>
14#include <linux/irq.h>
15#include <linux/fs.h>
16
17#include <asm/stacktrace.h>
18
19#include "trace.h"
20
21static struct trace_array *sysprof_trace;
22static int __read_mostly tracer_enabled;
23
24/*
25 * 1 msec sample interval by default:
26 */
27static unsigned long sample_period = 1000000;
28static const unsigned int sample_max_depth = 512;
29
30static DEFINE_MUTEX(sample_timer_lock);
31/*
32 * Per CPU hrtimers that do the profiling:
33 */
34static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
35
36struct stack_frame {
37 const void __user *next_fp;
38 unsigned long return_address;
39};
40
41static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
42{
43 int ret;
44
45 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
46 return 0;
47
48 ret = 1;
49 pagefault_disable();
50 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
51 ret = 0;
52 pagefault_enable();
53
54 return ret;
55}
56
57struct backtrace_info {
58 struct trace_array_cpu *data;
59 struct trace_array *tr;
60 int pos;
61};
62
63static void
64backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
65{
66 /* Ignore warnings */
67}
68
69static void backtrace_warning(void *data, char *msg)
70{
71 /* Ignore warnings */
72}
73
74static int backtrace_stack(void *data, char *name)
75{
76 /* Don't bother with IRQ stacks for now */
77 return -1;
78}
79
80static void backtrace_address(void *data, unsigned long addr, int reliable)
81{
82 struct backtrace_info *info = data;
83
84 if (info->pos < sample_max_depth && reliable) {
85 __trace_special(info->tr, info->data, 1, addr, 0);
86
87 info->pos++;
88 }
89}
90
91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack,
95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
97};
98
99static int
100trace_kernel(struct pt_regs *regs, struct trace_array *tr,
101 struct trace_array_cpu *data)
102{
103 struct backtrace_info info;
104 unsigned long bp;
105 char *stack;
106
107 info.tr = tr;
108 info.data = data;
109 info.pos = 1;
110
111 __trace_special(info.tr, info.data, 1, regs->ip, 0);
112
113 stack = ((char *)regs + sizeof(struct pt_regs));
114#ifdef CONFIG_FRAME_POINTER
115 bp = regs->bp;
116#else
117 bp = 0;
118#endif
119
120 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
121
122 return info.pos;
123}
124
125static void timer_notify(struct pt_regs *regs, int cpu)
126{
127 struct trace_array_cpu *data;
128 struct stack_frame frame;
129 struct trace_array *tr;
130 const void __user *fp;
131 int is_user;
132 int i;
133
134 if (!regs)
135 return;
136
137 tr = sysprof_trace;
138 data = tr->data[cpu];
139 is_user = user_mode(regs);
140
141 if (!current || current->pid == 0)
142 return;
143
144 if (is_user && current->state != TASK_RUNNING)
145 return;
146
147 __trace_special(tr, data, 0, 0, current->pid);
148
149 if (!is_user)
150 i = trace_kernel(regs, tr, data);
151 else
152 i = 0;
153
154 /*
155 * Trace user stack if we are not a kernel thread
156 */
157 if (current->mm && i < sample_max_depth) {
158 regs = (struct pt_regs *)current->thread.sp0 - 1;
159
160 fp = (void __user *)regs->bp;
161
162 __trace_special(tr, data, 2, regs->ip, 0);
163
164 while (i < sample_max_depth) {
165 frame.next_fp = NULL;
166 frame.return_address = 0;
167 if (!copy_stack_frame(fp, &frame))
168 break;
169 if ((unsigned long)fp < regs->sp)
170 break;
171
172 __trace_special(tr, data, 2, frame.return_address,
173 (unsigned long)fp);
174 fp = frame.next_fp;
175
176 i++;
177 }
178
179 }
180
181 /*
182 * Special trace entry if we overflow the max depth:
183 */
184 if (i == sample_max_depth)
185 __trace_special(tr, data, -1, -1, -1);
186
187 __trace_special(tr, data, 3, current->pid, i);
188}
189
190static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
191{
192 /* trace here */
193 timer_notify(get_irq_regs(), smp_processor_id());
194
195 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
196
197 return HRTIMER_RESTART;
198}
199
200static void start_stack_timer(void *unused)
201{
202 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
203
204 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 hrtimer->function = stack_trace_timer_fn;
206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
208 HRTIMER_MODE_REL_PINNED);
209}
210
211static void start_stack_timers(void)
212{
213 on_each_cpu(start_stack_timer, NULL, 1);
214}
215
216static void stop_stack_timer(int cpu)
217{
218 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
219
220 hrtimer_cancel(hrtimer);
221}
222
223static void stop_stack_timers(void)
224{
225 int cpu;
226
227 for_each_online_cpu(cpu)
228 stop_stack_timer(cpu);
229}
230
231static void stop_stack_trace(struct trace_array *tr)
232{
233 mutex_lock(&sample_timer_lock);
234 stop_stack_timers();
235 tracer_enabled = 0;
236 mutex_unlock(&sample_timer_lock);
237}
238
239static int stack_trace_init(struct trace_array *tr)
240{
241 sysprof_trace = tr;
242
243 tracing_start_cmdline_record();
244
245 mutex_lock(&sample_timer_lock);
246 start_stack_timers();
247 tracer_enabled = 1;
248 mutex_unlock(&sample_timer_lock);
249 return 0;
250}
251
252static void stack_trace_reset(struct trace_array *tr)
253{
254 tracing_stop_cmdline_record();
255 stop_stack_trace(tr);
256}
257
258static struct tracer stack_trace __read_mostly =
259{
260 .name = "sysprof",
261 .init = stack_trace_init,
262 .reset = stack_trace_reset,
263#ifdef CONFIG_FTRACE_SELFTEST
264 .selftest = trace_selftest_startup_sysprof,
265#endif
266};
267
268__init static int init_stack_trace(void)
269{
270 return register_tracer(&stack_trace);
271}
272device_initcall(init_stack_trace);
273
274#define MAX_LONG_DIGITS 22
275
276static ssize_t
277sysprof_sample_read(struct file *filp, char __user *ubuf,
278 size_t cnt, loff_t *ppos)
279{
280 char buf[MAX_LONG_DIGITS];
281 int r;
282
283 r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
284
285 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
286}
287
288static ssize_t
289sysprof_sample_write(struct file *filp, const char __user *ubuf,
290 size_t cnt, loff_t *ppos)
291{
292 char buf[MAX_LONG_DIGITS];
293 unsigned long val;
294
295 if (cnt > MAX_LONG_DIGITS-1)
296 cnt = MAX_LONG_DIGITS-1;
297
298 if (copy_from_user(&buf, ubuf, cnt))
299 return -EFAULT;
300
301 buf[cnt] = 0;
302
303 val = simple_strtoul(buf, NULL, 10);
304 /*
305 * Enforce a minimum sample period of 100 usecs:
306 */
307 if (val < 100)
308 val = 100;
309
310 mutex_lock(&sample_timer_lock);
311 stop_stack_timers();
312 sample_period = val * 1000;
313 start_stack_timers();
314 mutex_unlock(&sample_timer_lock);
315
316 return cnt;
317}
318
319static const struct file_operations sysprof_sample_fops = {
320 .read = sysprof_sample_read,
321 .write = sysprof_sample_write,
322};
323
324void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
325{
326
327 trace_create_file("sysprof_sample_period", 0644,
328 d_tracer, NULL, &sysprof_sample_fops);
329}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
263{ 263{
264 int ret, cpu; 264 int ret, cpu;
265 265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); 271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
267 if (ret) 272 if (ret)
268 goto out; 273 goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
279 if (ret) 284 if (ret)
280 goto no_creation; 285 goto no_creation;
281 286
282 for_each_possible_cpu(cpu) {
283 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
284 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
285 }
286
287 return 0; 287 return 0;
288 288
289no_creation: 289no_creation:
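The trace_workqueue.c hunk moves the per-CPU spinlock and list initialization ahead of the tracepoint registrations. A small user-space sketch of why that ordering matters, assuming a toy callback registry in place of register_trace_workqueue_insertion() and friends:

/*
 * Toy model (not kernel code): per-CPU state is initialized before the
 * probe is published, so an event that fires right after registration
 * already finds valid state.
 */
#include <stdio.h>

#define NR_CPUS 4

struct cpu_stat {
	int initialized;
	int count;
};

static struct cpu_stat stats[NR_CPUS];
static void (*probe)(int cpu);		/* stands in for the tracepoint */

static void insertion_probe(int cpu)
{
	if (!stats[cpu].initialized) {
		fprintf(stderr, "probe ran against uninitialized state\n");
		return;
	}
	stats[cpu].count++;
}

static void register_probe(void (*fn)(int))
{
	probe = fn;
	probe(0);			/* simulate an immediate event */
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)	/* init first, as the hunk now does */
		stats[cpu].initialized = 1;

	register_probe(insertion_probe);	/* then publish the callback */

	printf("cpu0 saw %d insertion(s)\n", stats[0].count);
	return 0;
}

Once a probe is published it may fire immediately, so any state it touches has to be ready beforehand.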
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea25..e95ee7f31d43 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h>
28 29
29extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint __start___tracepoints[];
30extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint __stop___tracepoints[];
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry,
263 * is used. 264 * is used.
264 */ 265 */
265 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
266 elem->state = active; 267 if (!elem->state && active) {
268 jump_label_enable(&elem->state);
269 elem->state = active;
270 } else if (elem->state && !active) {
271 jump_label_disable(&elem->state);
272 elem->state = active;
273 }
267} 274}
268 275
269/* 276/*
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem)
277 if (elem->unregfunc && elem->state) 284 if (elem->unregfunc && elem->state)
278 elem->unregfunc(); 285 elem->unregfunc();
279 286
280 elem->state = 0; 287 if (elem->state) {
288 jump_label_disable(&elem->state);
289 elem->state = 0;
290 }
281 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
282} 292}
283 293
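The tracepoint.c hunk only flips the jump label when a tracepoint actually changes state, instead of unconditionally rewriting elem->state. A user-space sketch of that transition-only pattern; enable_patch()/disable_patch() are stand-ins for jump_label_enable()/jump_label_disable(), and struct tp is a toy.

/*
 * Toy model: the "patch" operation only happens on a real off->on or
 * on->off transition.
 */
#include <stdio.h>

static void enable_patch(int *key)  { printf("patch in,  key=%p\n", (void *)key); }
static void disable_patch(int *key) { printf("patch out, key=%p\n", (void *)key); }

struct tp {
	int state;
};

static void set_tp(struct tp *tp, int active)
{
	if (!tp->state && active) {
		enable_patch(&tp->state);	/* off -> on */
		tp->state = active;
	} else if (tp->state && !active) {
		disable_patch(&tp->state);	/* on -> off */
		tp->state = active;
	}
	/* same state: nothing to patch */
}

int main(void)
{
	struct tp tp = { 0 };

	set_tp(&tp, 1);		/* patches once */
	set_tp(&tp, 1);		/* no-op */
	set_tp(&tp, 0);		/* unpatches */
	return 0;
}

Guarding on the old state keeps repeated enable/disable requests from patching the same site twice.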
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
63 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
65 rcu_read_unlock(); 65 rcu_read_unlock();
66 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
68 stats->ac_utimescaled = 68 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
69 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; 69 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
70 stats->ac_stimescaled =
71 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
72 stats->ac_minflt = tsk->min_flt; 70 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 71 stats->ac_majflt = tsk->maj_flt;
74 72
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..2c7d8d5914b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
91 * upon function exit. 91 * upon function exit.
92 */ 92 */
93static void free_user(struct user_struct *up, unsigned long flags) 93static void free_user(struct user_struct *up, unsigned long flags)
94 __releases(&uidhash_lock)
94{ 95{
95 uid_hash_remove(up); 96 uid_hash_remove(up);
96 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b2d70d38dff4..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/highuid.h>
12#include <linux/cred.h> 13#include <linux/cred.h>
13 14
14/* 15/*
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
82 schedule_work(&ns->destroyer); 83 schedule_work(&ns->destroyer);
83} 84}
84EXPORT_SYMBOL(free_user_ns); 85EXPORT_SYMBOL(free_user_ns);
86
87uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
88{
89 struct user_namespace *tmp;
90
91 if (likely(to == cred->user->user_ns))
92 return uid;
93
94
95 /* Is cred->user the creator of the target user_ns
 96 * or the creator of one of its parents?
97 */
98 for ( tmp = to; tmp != &init_user_ns;
99 tmp = tmp->creator->user_ns ) {
100 if (cred->user == tmp->creator) {
101 return (uid_t)0;
102 }
103 }
104
105 /* No useful relationship so no mapping */
106 return overflowuid;
107}
108
109gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
110{
111 struct user_namespace *tmp;
112
113 if (likely(to == cred->user->user_ns))
114 return gid;
115
116 /* Is cred->user the creator of the target user_ns
 117 * or the creator of one of its parents?
118 */
119 for ( tmp = to; tmp != &init_user_ns;
120 tmp = tmp->creator->user_ns ) {
121 if (cred->user == tmp->creator) {
122 return (gid_t)0;
123 }
124 }
125
126 /* No useful relationship so no mapping */
127 return overflowgid;
128}
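The user_namespace.c hunk adds user_ns_map_uid()/user_ns_map_gid(), which walk from the target namespace up through each creator until init_user_ns. The sketch below models only that walk with toy structs; the creator_ns field and the OVERFLOWUID constant stand in for the kernel's creator->user_ns chain and overflowuid, and are assumptions for illustration.

/*
 * Toy model of the creator-chain walk; not the kernel's definitions.
 */
#include <stdio.h>

#define OVERFLOWUID 65534u

struct user;

struct user_namespace {
	struct user *creator;			/* who created this ns */
	struct user_namespace *creator_ns;	/* models creator->user_ns */
};

struct user {
	struct user_namespace *user_ns;		/* ns this user lives in */
};

static struct user_namespace init_user_ns;	/* the root of every chain */

static unsigned int map_uid(struct user_namespace *to,
			    struct user *caller, unsigned int uid)
{
	struct user_namespace *tmp;

	if (to == caller->user_ns)		/* same namespace: identity */
		return uid;

	/* Did the caller create 'to' or one of its ancestors? */
	for (tmp = to; tmp != &init_user_ns; tmp = tmp->creator_ns)
		if (caller == tmp->creator)
			return 0;

	return OVERFLOWUID;			/* unrelated: overflow uid */
}

int main(void)
{
	struct user root  = { .user_ns = &init_user_ns };
	struct user other = { .user_ns = &init_user_ns };
	struct user_namespace child = {
		.creator = &root, .creator_ns = &init_user_ns
	};
	struct user alice = { .user_ns = &child };

	printf("creator maps to   %u\n", map_uid(&child, &root, 1000));
	printf("same ns maps to   %u\n", map_uid(&child, &alice, 1000));
	printf("unrelated maps to %u\n", map_uid(&child, &other, 1000));
	return 0;
}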
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d825f35..b0310eb6cc1e 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95/* 95/**
96 * finish_wait - clean up after waiting in a queue 96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 97 * @q: waitqueue waited on
98 * @wait: wait descriptor 98 * @wait: wait descriptor
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
127} 127}
128EXPORT_SYMBOL(finish_wait); 128EXPORT_SYMBOL(finish_wait);
129 129
130/* 130/**
131 * abort_exclusive_wait - abort exclusive waiting in a queue 131 * abort_exclusive_wait - abort exclusive waiting in a queue
132 * @q: waitqueue waited on 132 * @q: waitqueue waited on
133 * @wait: wait descriptor 133 * @wait: wait descriptor
134 * @state: runstate of the waiter to be woken 134 * @mode: runstate of the waiter to be woken
135 * @key: key to identify a wait bit queue or %NULL 135 * @key: key to identify a wait bit queue or %NULL
136 * 136 *
137 * Sets current thread back to running state and removes 137 * Sets current thread back to running state and removes
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 000000000000..bafba687a6d8
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,566 @@
1/*
2 * Detect hard and soft lockups on a system
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
 6 * this code detects hard lockups: incidents where on a CPU
7 * the kernel does not respond to anything except NMI.
8 *
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well.
13 */
14
15#include <linux/mm.h>
16#include <linux/cpu.h>
17#include <linux/nmi.h>
18#include <linux/init.h>
19#include <linux/delay.h>
20#include <linux/freezer.h>
21#include <linux/kthread.h>
22#include <linux/lockdep.h>
23#include <linux/notifier.h>
24#include <linux/module.h>
25#include <linux/sysctl.h>
26
27#include <asm/irq_regs.h>
28#include <linux/perf_event.h>
29
30int watchdog_enabled;
31int __read_mostly softlockup_thresh = 60;
32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
36static DEFINE_PER_CPU(bool, softlockup_touch_sync);
37static DEFINE_PER_CPU(bool, soft_watchdog_warn);
38#ifdef CONFIG_HARDLOCKUP_DETECTOR
39static DEFINE_PER_CPU(bool, hard_watchdog_warn);
40static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif
45
46static int __initdata no_watchdog;
47
48
49/* boot commands */
50/*
51 * Should we panic when a soft-lockup or hard-lockup occurs:
52 */
53#ifdef CONFIG_HARDLOCKUP_DETECTOR
54static int hardlockup_panic;
55
56static int __init hardlockup_panic_setup(char *str)
57{
58 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1;
60 return 1;
61}
62__setup("nmi_watchdog=", hardlockup_panic_setup);
63#endif
64
65unsigned int __read_mostly softlockup_panic =
66 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
67
68static int __init softlockup_panic_setup(char *str)
69{
70 softlockup_panic = simple_strtoul(str, NULL, 0);
71
72 return 1;
73}
74__setup("softlockup_panic=", softlockup_panic_setup);
75
76static int __init nowatchdog_setup(char *str)
77{
78 no_watchdog = 1;
79 return 1;
80}
81__setup("nowatchdog", nowatchdog_setup);
82
83/* deprecated */
84static int __init nosoftlockup_setup(char *str)
85{
86 no_watchdog = 1;
87 return 1;
88}
89__setup("nosoftlockup", nosoftlockup_setup);
90/* */
91
92
93/*
94 * Returns seconds, approximately. We don't need nanosecond
95 * resolution, and we don't need to waste time with a big divide when
96 * 2^30ns == 1.074s.
97 */
98static unsigned long get_timestamp(int this_cpu)
99{
100 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
101}
102
103static unsigned long get_sample_period(void)
104{
105 /*
106 * convert softlockup_thresh from seconds to ns
107 * the divide by 5 is to give hrtimer 5 chances to
108 * increment before the hardlockup detector generates
109 * a warning
110 */
111 return softlockup_thresh / 5 * NSEC_PER_SEC;
112}
113
114/* Commands for resetting the watchdog */
115static void __touch_watchdog(void)
116{
117 int this_cpu = smp_processor_id();
118
119 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
120}
121
122void touch_softlockup_watchdog(void)
123{
124 __raw_get_cpu_var(watchdog_touch_ts) = 0;
125}
126EXPORT_SYMBOL(touch_softlockup_watchdog);
127
128void touch_all_softlockup_watchdogs(void)
129{
130 int cpu;
131
132 /*
133 * this is done lockless
134 * do we care if a 0 races with a timestamp?
135 * all it means is the softlock check starts one cycle later
136 */
137 for_each_online_cpu(cpu)
138 per_cpu(watchdog_touch_ts, cpu) = 0;
139}
140
141#ifdef CONFIG_HARDLOCKUP_DETECTOR
142void touch_nmi_watchdog(void)
143{
144 if (watchdog_enabled) {
145 unsigned cpu;
146
147 for_each_present_cpu(cpu) {
148 if (per_cpu(watchdog_nmi_touch, cpu) != true)
149 per_cpu(watchdog_nmi_touch, cpu) = true;
150 }
151 }
152 touch_softlockup_watchdog();
153}
154EXPORT_SYMBOL(touch_nmi_watchdog);
155
156#endif
157
158void touch_softlockup_watchdog_sync(void)
159{
160 __raw_get_cpu_var(softlockup_touch_sync) = true;
161 __raw_get_cpu_var(watchdog_touch_ts) = 0;
162}
163
164#ifdef CONFIG_HARDLOCKUP_DETECTOR
165/* watchdog detector functions */
166static int is_hardlockup(void)
167{
168 unsigned long hrint = __get_cpu_var(hrtimer_interrupts);
169
170 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
171 return 1;
172
173 __get_cpu_var(hrtimer_interrupts_saved) = hrint;
174 return 0;
175}
176#endif
177
178static int is_softlockup(unsigned long touch_ts)
179{
180 unsigned long now = get_timestamp(smp_processor_id());
181
182 /* Warn about unreasonable delays: */
183 if (time_after(now, touch_ts + softlockup_thresh))
184 return now - touch_ts;
185
186 return 0;
187}
188
189#ifdef CONFIG_HARDLOCKUP_DETECTOR
190static struct perf_event_attr wd_hw_attr = {
191 .type = PERF_TYPE_HARDWARE,
192 .config = PERF_COUNT_HW_CPU_CYCLES,
193 .size = sizeof(struct perf_event_attr),
194 .pinned = 1,
195 .disabled = 1,
196};
197
198/* Callback function for perf event subsystem */
199static void watchdog_overflow_callback(struct perf_event *event, int nmi,
200 struct perf_sample_data *data,
201 struct pt_regs *regs)
202{
203 /* Ensure the watchdog never gets throttled */
204 event->hw.interrupts = 0;
205
206 if (__get_cpu_var(watchdog_nmi_touch) == true) {
207 __get_cpu_var(watchdog_nmi_touch) = false;
208 return;
209 }
210
211 /* check for a hardlockup
212 * This is done by making sure our timer interrupt
213 * is incrementing. The timer interrupt should have
214 * fired multiple times before we overflow'd. If it hasn't
215 * then this is a good indication the cpu is stuck
216 */
217 if (is_hardlockup()) {
218 int this_cpu = smp_processor_id();
219
220 /* only print hardlockups once */
221 if (__get_cpu_var(hard_watchdog_warn) == true)
222 return;
223
224 if (hardlockup_panic)
225 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
226 else
227 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
228
229 __get_cpu_var(hard_watchdog_warn) = true;
230 return;
231 }
232
233 __get_cpu_var(hard_watchdog_warn) = false;
234 return;
235}
236static void watchdog_interrupt_count(void)
237{
238 __get_cpu_var(hrtimer_interrupts)++;
239}
240#else
241static inline void watchdog_interrupt_count(void) { return; }
242#endif /* CONFIG_HARDLOCKUP_DETECTOR */
243
244/* watchdog kicker functions */
245static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
246{
247 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
248 struct pt_regs *regs = get_irq_regs();
249 int duration;
250
251 /* kick the hardlockup detector */
252 watchdog_interrupt_count();
253
254 /* kick the softlockup detector */
255 wake_up_process(__get_cpu_var(softlockup_watchdog));
256
257 /* .. and repeat */
258 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
259
260 if (touch_ts == 0) {
261 if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
262 /*
263 * If the time stamp was touched atomically
264 * make sure the scheduler tick is up to date.
265 */
266 __get_cpu_var(softlockup_touch_sync) = false;
267 sched_clock_tick();
268 }
269 __touch_watchdog();
270 return HRTIMER_RESTART;
271 }
272
273 /* check for a softlockup
274 * This is done by making sure a high priority task is
275 * being scheduled. The task touches the watchdog to
276 * indicate it is getting cpu time. If it hasn't then
277 * this is a good indication some task is hogging the cpu
278 */
279 duration = is_softlockup(touch_ts);
280 if (unlikely(duration)) {
281 /* only warn once */
282 if (__get_cpu_var(soft_watchdog_warn) == true)
283 return HRTIMER_RESTART;
284
285 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
286 smp_processor_id(), duration,
287 current->comm, task_pid_nr(current));
288 print_modules();
289 print_irqtrace_events(current);
290 if (regs)
291 show_regs(regs);
292 else
293 dump_stack();
294
295 if (softlockup_panic)
296 panic("softlockup: hung tasks");
297 __get_cpu_var(soft_watchdog_warn) = true;
298 } else
299 __get_cpu_var(soft_watchdog_warn) = false;
300
301 return HRTIMER_RESTART;
302}
303
304
305/*
306 * The watchdog thread - touches the timestamp.
307 */
308static int watchdog(void *unused)
309{
310 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
312
313 sched_setscheduler(current, SCHED_FIFO, &param);
314
315 /* initialize timestamp */
316 __touch_watchdog();
317
318 /* kick off the timer for the hardlockup detector */
319 /* done here because hrtimer_start can only pin to smp_processor_id() */
320 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
321 HRTIMER_MODE_REL_PINNED);
322
323 set_current_state(TASK_INTERRUPTIBLE);
324 /*
325 * Run briefly once per second to reset the softlockup timestamp.
326 * If this gets delayed for more than 60 seconds then the
327 * debug-printout triggers in watchdog_timer_fn().
328 */
329 while (!kthread_should_stop()) {
330 __touch_watchdog();
331 schedule();
332
333 if (kthread_should_stop())
334 break;
335
336 set_current_state(TASK_INTERRUPTIBLE);
337 }
338 __set_current_state(TASK_RUNNING);
339
340 return 0;
341}
342
343
344#ifdef CONFIG_HARDLOCKUP_DETECTOR
345static int watchdog_nmi_enable(int cpu)
346{
347 struct perf_event_attr *wd_attr;
348 struct perf_event *event = per_cpu(watchdog_ev, cpu);
349
350 /* is it already setup and enabled? */
351 if (event && event->state > PERF_EVENT_STATE_OFF)
352 goto out;
353
354 /* it is setup but not enabled */
355 if (event != NULL)
356 goto out_enable;
357
358 /* Try to register using hardware perf events */
359 wd_attr = &wd_hw_attr;
360 wd_attr->sample_period = hw_nmi_get_sample_period();
361 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
362 if (!IS_ERR(event)) {
363 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
364 goto out_save;
365 }
366
367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
368 return PTR_ERR(event);
369
370 /* success path */
371out_save:
372 per_cpu(watchdog_ev, cpu) = event;
373out_enable:
374 perf_event_enable(per_cpu(watchdog_ev, cpu));
375out:
376 return 0;
377}
378
379static void watchdog_nmi_disable(int cpu)
380{
381 struct perf_event *event = per_cpu(watchdog_ev, cpu);
382
383 if (event) {
384 perf_event_disable(event);
385 per_cpu(watchdog_ev, cpu) = NULL;
386
387 /* should be in cleanup, but blocks oprofile */
388 perf_event_release_kernel(event);
389 }
390 return;
391}
392#else
393static int watchdog_nmi_enable(int cpu) { return 0; }
394static void watchdog_nmi_disable(int cpu) { return; }
395#endif /* CONFIG_HARDLOCKUP_DETECTOR */
396
397/* prepare/enable/disable routines */
398static int watchdog_prepare_cpu(int cpu)
399{
400 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
401
402 WARN_ON(per_cpu(softlockup_watchdog, cpu));
403 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
404 hrtimer->function = watchdog_timer_fn;
405
406 return 0;
407}
408
409static int watchdog_enable(int cpu)
410{
411 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
412 int err;
413
414 /* enable the perf event */
415 err = watchdog_nmi_enable(cpu);
416 if (err)
417 return err;
418
419 /* create the watchdog thread */
420 if (!p) {
421 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
422 if (IS_ERR(p)) {
423 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
424 return PTR_ERR(p);
425 }
426 kthread_bind(p, cpu);
427 per_cpu(watchdog_touch_ts, cpu) = 0;
428 per_cpu(softlockup_watchdog, cpu) = p;
429 wake_up_process(p);
430 }
431
432 /* if any cpu succeeds, watchdog is considered enabled for the system */
433 watchdog_enabled = 1;
434
435 return 0;
436}
437
438static void watchdog_disable(int cpu)
439{
440 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
441 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
442
443 /*
444 * cancel the timer first to stop incrementing the stats
445 * and waking up the kthread
446 */
447 hrtimer_cancel(hrtimer);
448
449 /* disable the perf event */
450 watchdog_nmi_disable(cpu);
451
452 /* stop the watchdog thread */
453 if (p) {
454 per_cpu(softlockup_watchdog, cpu) = NULL;
455 kthread_stop(p);
456 }
457}
458
459static void watchdog_enable_all_cpus(void)
460{
461 int cpu;
462 int result = 0;
463
464 for_each_online_cpu(cpu)
465 result += watchdog_enable(cpu);
466
467 if (result)
468 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
469
470}
471
472static void watchdog_disable_all_cpus(void)
473{
474 int cpu;
475
476 if (no_watchdog)
477 return;
478
479 for_each_online_cpu(cpu)
480 watchdog_disable(cpu);
481
482 /* if all watchdogs are disabled, then they are disabled for the system */
483 watchdog_enabled = 0;
484}
485
486
487/* sysctl functions */
488#ifdef CONFIG_SYSCTL
489/*
490 * proc handler for /proc/sys/kernel/nmi_watchdog
491 */
492
493int proc_dowatchdog_enabled(struct ctl_table *table, int write,
494 void __user *buffer, size_t *length, loff_t *ppos)
495{
496 proc_dointvec(table, write, buffer, length, ppos);
497
498 if (watchdog_enabled)
499 watchdog_enable_all_cpus();
500 else
501 watchdog_disable_all_cpus();
502 return 0;
503}
504
505int proc_dowatchdog_thresh(struct ctl_table *table, int write,
506 void __user *buffer,
507 size_t *lenp, loff_t *ppos)
508{
509 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
510}
511#endif /* CONFIG_SYSCTL */
512
513
514/*
515 * Create/destroy watchdog threads as CPUs come and go:
516 */
517static int __cpuinit
518cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
519{
520 int hotcpu = (unsigned long)hcpu;
521 int err = 0;
522
523 switch (action) {
524 case CPU_UP_PREPARE:
525 case CPU_UP_PREPARE_FROZEN:
526 err = watchdog_prepare_cpu(hotcpu);
527 break;
528 case CPU_ONLINE:
529 case CPU_ONLINE_FROZEN:
530 err = watchdog_enable(hotcpu);
531 break;
532#ifdef CONFIG_HOTPLUG_CPU
533 case CPU_UP_CANCELED:
534 case CPU_UP_CANCELED_FROZEN:
535 watchdog_disable(hotcpu);
536 break;
537 case CPU_DEAD:
538 case CPU_DEAD_FROZEN:
539 watchdog_disable(hotcpu);
540 break;
541#endif /* CONFIG_HOTPLUG_CPU */
542 }
543 return notifier_from_errno(err);
544}
545
546static struct notifier_block __cpuinitdata cpu_nfb = {
547 .notifier_call = cpu_callback
548};
549
550static int __init spawn_watchdog_task(void)
551{
552 void *cpu = (void *)(long)smp_processor_id();
553 int err;
554
555 if (no_watchdog)
556 return 0;
557
558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
559 WARN_ON(notifier_to_errno(err));
560
561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
562 register_cpu_notifier(&cpu_nfb);
563
564 return 0;
565}
566early_initcall(spawn_watchdog_task);
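The new kernel/watchdog.c detects soft lockups by comparing a per-CPU "touch" timestamp, refreshed by the watchdog thread, against the current time in the hrtimer callback. Below is a minimal user-space model of just that comparison, using time() in place of the kernel's cpu_clock() >> 30 second counter; all names here are illustrative.

/*
 * Toy model, user space only: the threshold matches the default in the
 * new file, the clock is plain wall time.
 */
#include <stdio.h>
#include <time.h>

static unsigned long softlockup_thresh = 60;	/* seconds */
static unsigned long watchdog_touch_ts;

static void touch_watchdog(void)		/* what the watchdog thread does */
{
	watchdog_touch_ts = (unsigned long)time(NULL);
}

/* Returns 0, or how many seconds the "CPU" appears stuck. */
static unsigned long is_softlockup(unsigned long touch_ts)
{
	unsigned long now = (unsigned long)time(NULL);

	if (now > touch_ts + softlockup_thresh)
		return now - touch_ts;
	return 0;
}

int main(void)
{
	unsigned long stale, duration;

	touch_watchdog();

	/* Pretend the watchdog thread was starved for 75 seconds. */
	stale = watchdog_touch_ts - 75;
	duration = is_softlockup(stale);

	if (duration)
		printf("BUG: soft lockup - stuck for %lus\n", duration);
	return 0;
}

In the kernel the same idea is split across __touch_watchdog(), is_softlockup() and watchdog_timer_fn(), while the hard-lockup side instead checks that hrtimer_interrupts keeps advancing between NMI samples.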
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 327d2deb4451..90db1bd1a978 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1,19 +1,26 @@
1/* 1/*
2 * linux/kernel/workqueue.c 2 * kernel/workqueue.c - generic async execution with shared worker pool
3 * 3 *
4 * Generic mechanism for defining kernel helper threads for running 4 * Copyright (C) 2002 Ingo Molnar
5 * arbitrary tasks in process context.
6 * 5 *
7 * Started by Ingo Molnar, Copyright (C) 2002 6 * Derived from the taskqueue/keventd code by:
7 * David Woodhouse <dwmw2@infradead.org>
8 * Andrew Morton
9 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
10 * Theodore Ts'o <tytso@mit.edu>
8 * 11 *
9 * Derived from the taskqueue/keventd code by: 12 * Made to use alloc_percpu by Christoph Lameter.
10 * 13 *
11 * David Woodhouse <dwmw2@infradead.org> 14 * Copyright (C) 2010 SUSE Linux Products GmbH
12 * Andrew Morton 15 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu>
15 * 16 *
 16 * Made to use alloc_percpu by Christoph Lameter. 17 * This is the generic async execution mechanism. Work items are
18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and
20 * one extra for works which are better served by workers which are
21 * not bound to any specific CPU.
22 *
23 * Please read Documentation/workqueue.txt for details.
17 */ 24 */
18 25
19#include <linux/module.h> 26#include <linux/module.h>
@@ -33,41 +40,276 @@
33#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 42#include <linux/lockdep.h>
36#define CREATE_TRACE_POINTS 43#include <linux/idr.h>
37#include <trace/events/workqueue.h> 44
45#include "workqueue_sched.h"
46
47enum {
48 /* global_cwq flags */
49 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
50 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
51 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
52 GCWQ_FREEZING = 1 << 3, /* freeze in progress */
53 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
54
55 /* worker flags */
56 WORKER_STARTED = 1 << 0, /* started */
57 WORKER_DIE = 1 << 1, /* die die die */
58 WORKER_IDLE = 1 << 2, /* is idle */
59 WORKER_PREP = 1 << 3, /* preparing to run works */
60 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
61 WORKER_REBIND = 1 << 5, /* mom is home, come back */
62 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
63 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
64
65 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
66 WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
67
68 /* gcwq->trustee_state */
69 TRUSTEE_START = 0, /* start */
70 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
71 TRUSTEE_BUTCHER = 2, /* butcher workers */
72 TRUSTEE_RELEASE = 3, /* release workers */
73 TRUSTEE_DONE = 4, /* trustee is done */
74
75 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
76 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
77 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
78
79 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
80 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
81
82 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */
83 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
 84 CREATE_COOLDOWN = HZ, /* time to breathe after fail */
85 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
86
87 /*
88 * Rescue workers are used only on emergencies and shared by
89 * all cpus. Give -20.
90 */
91 RESCUER_NICE_LEVEL = -20,
92};
38 93
39/* 94/*
40 * The per-CPU workqueue (if single thread, we always use the first 95 * Structure fields follow one of the following exclusion rules.
41 * possible cpu). 96 *
97 * I: Modifiable by initialization/destruction paths and read-only for
98 * everyone else.
99 *
100 * P: Preemption protected. Disabling preemption is enough and should
101 * only be modified and accessed from the local cpu.
102 *
103 * L: gcwq->lock protected. Access with gcwq->lock held.
104 *
105 * X: During normal operation, modification requires gcwq->lock and
106 * should be done only from local cpu. Either disabling preemption
107 * on local cpu or grabbing gcwq->lock is enough for read access.
108 * If GCWQ_DISASSOCIATED is set, it's identical to L.
109 *
110 * F: wq->flush_mutex protected.
111 *
112 * W: workqueue_lock protected.
42 */ 113 */
43struct cpu_workqueue_struct {
44 114
45 spinlock_t lock; 115struct global_cwq;
46 116
47 struct list_head worklist; 117/*
48 wait_queue_head_t more_work; 118 * The poor guys doing the actual heavy lifting. All on-duty workers
49 struct work_struct *current_work; 119 * are either serving the manager role, on idle list or on busy hash.
120 */
121struct worker {
122 /* on idle list while idle, on busy hash table while busy */
123 union {
124 struct list_head entry; /* L: while idle */
125 struct hlist_node hentry; /* L: while busy */
126 };
50 127
51 struct workqueue_struct *wq; 128 struct work_struct *current_work; /* L: work being processed */
52 struct task_struct *thread; 129 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
53} ____cacheline_aligned; 130 struct list_head scheduled; /* L: scheduled works */
131 struct task_struct *task; /* I: worker task */
132 struct global_cwq *gcwq; /* I: the associated gcwq */
133 /* 64 bytes boundary on 64bit, 32 on 32bit */
134 unsigned long last_active; /* L: last active timestamp */
135 unsigned int flags; /* X: flags */
136 int id; /* I: worker id */
137 struct work_struct rebind_work; /* L: rebind worker to cpu */
138};
139
140/*
141 * Global per-cpu workqueue. There's one and only one for each cpu
142 * and all works are queued and processed here regardless of their
143 * target workqueues.
144 */
145struct global_cwq {
146 spinlock_t lock; /* the gcwq lock */
147 struct list_head worklist; /* L: list of pending works */
148 unsigned int cpu; /* I: the associated cpu */
149 unsigned int flags; /* L: GCWQ_* flags */
150
151 int nr_workers; /* L: total number of workers */
152 int nr_idle; /* L: currently idle ones */
153
154 /* workers are chained either in the idle_list or busy_hash */
155 struct list_head idle_list; /* X: list of idle workers */
156 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
157 /* L: hash of busy workers */
158
159 struct timer_list idle_timer; /* L: worker idle timeout */
160 struct timer_list mayday_timer; /* L: SOS timer for dworkers */
161
162 struct ida worker_ida; /* L: for worker IDs */
163
164 struct task_struct *trustee; /* L: for gcwq shutdown */
165 unsigned int trustee_state; /* L: trustee state */
166 wait_queue_head_t trustee_wait; /* trustee wait */
167 struct worker *first_idle; /* L: first idle worker */
168} ____cacheline_aligned_in_smp;
169
170/*
171 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
172 * work_struct->data are used for flags and thus cwqs need to be
173 * aligned to 1 << WORK_STRUCT_FLAG_BITS, i.e. a power of two.
174 */
175struct cpu_workqueue_struct {
176 struct global_cwq *gcwq; /* I: the associated gcwq */
177 struct workqueue_struct *wq; /* I: the owning workqueue */
178 int work_color; /* L: current color */
179 int flush_color; /* L: flushing color */
180 int nr_in_flight[WORK_NR_COLORS];
181 /* L: nr of in_flight works */
182 int nr_active; /* L: nr of active works */
183 int max_active; /* L: max active works */
184 struct list_head delayed_works; /* L: delayed works */
185};
186
187/*
188 * Structure used to wait for workqueue flush.
189 */
190struct wq_flusher {
191 struct list_head list; /* F: list of flushers */
192 int flush_color; /* F: flush color waiting for */
193 struct completion done; /* flush completion */
194};
195
196/*
197 * All cpumasks are assumed to be always set on UP and thus can't be
198 * used to determine whether there's something to be done.
199 */
200#ifdef CONFIG_SMP
201typedef cpumask_var_t mayday_mask_t;
202#define mayday_test_and_set_cpu(cpu, mask) \
203 cpumask_test_and_set_cpu((cpu), (mask))
204#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
205#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
206#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
207#define free_mayday_mask(mask) free_cpumask_var((mask))
208#else
209typedef unsigned long mayday_mask_t;
210#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
211#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
212#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
213#define alloc_mayday_mask(maskp, gfp) true
214#define free_mayday_mask(mask) do { } while (0)
215#endif
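
The UP variants above lean on the comma operator: for_each_mayday_cpu() expands to an if whose condition first assigns 0 to cpu and then evaluates mask, so the body runs at most once and only when the single mayday bit is set. A minimal user-space sketch of that pattern (the names below are illustrative, not kernel API):

    #include <stdio.h>

    typedef unsigned long mayday_mask_t;            /* UP: one bit is enough */

    /* mirrors the UP definition above, renamed to avoid claiming kernel API */
    #define up_for_each_mayday_cpu(cpu, mask)       if ((cpu) = 0, (mask))

    int main(void)
    {
            mayday_mask_t mask = 1;                 /* the only cpu asked for help */
            unsigned int cpu;

            up_for_each_mayday_cpu(cpu, mask)
                    printf("rescue cpu %u\n", cpu); /* runs once, cpu == 0 */

            mask = 0;
            up_for_each_mayday_cpu(cpu, mask)
                    printf("never printed\n");      /* mask clear: body skipped */

            return 0;
    }
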
54 216
55/* 217/*
56 * The externally visible workqueue abstraction is an array of 218 * The externally visible workqueue abstraction is an array of
57 * per-CPU workqueues: 219 * per-CPU workqueues:
58 */ 220 */
59struct workqueue_struct { 221struct workqueue_struct {
60 struct cpu_workqueue_struct *cpu_wq; 222 unsigned int flags; /* I: WQ_* flags */
61 struct list_head list; 223 union {
62 const char *name; 224 struct cpu_workqueue_struct __percpu *pcpu;
63 int singlethread; 225 struct cpu_workqueue_struct *single;
64 int freezeable; /* Freeze threads during suspend */ 226 unsigned long v;
65 int rt; 227 } cpu_wq; /* I: cwq's */
228 struct list_head list; /* W: list of all workqueues */
229
230 struct mutex flush_mutex; /* protects wq flushing */
231 int work_color; /* F: current work color */
232 int flush_color; /* F: current flush color */
233 atomic_t nr_cwqs_to_flush; /* flush in progress */
234 struct wq_flusher *first_flusher; /* F: first flusher */
235 struct list_head flusher_queue; /* F: flush waiters */
236 struct list_head flusher_overflow; /* F: flush overflow list */
237
238 mayday_mask_t mayday_mask; /* cpus requesting rescue */
239 struct worker *rescuer; /* I: rescue worker */
240
241 int saved_max_active; /* W: saved cwq max_active */
242 const char *name; /* I: workqueue name */
66#ifdef CONFIG_LOCKDEP 243#ifdef CONFIG_LOCKDEP
67 struct lockdep_map lockdep_map; 244 struct lockdep_map lockdep_map;
68#endif 245#endif
69}; 246};
70 247
248struct workqueue_struct *system_wq __read_mostly;
249struct workqueue_struct *system_long_wq __read_mostly;
250struct workqueue_struct *system_nrt_wq __read_mostly;
251struct workqueue_struct *system_unbound_wq __read_mostly;
252EXPORT_SYMBOL_GPL(system_wq);
253EXPORT_SYMBOL_GPL(system_long_wq);
254EXPORT_SYMBOL_GPL(system_nrt_wq);
255EXPORT_SYMBOL_GPL(system_unbound_wq);
256
257#define CREATE_TRACE_POINTS
258#include <trace/events/workqueue.h>
259
260#define for_each_busy_worker(worker, i, pos, gcwq) \
261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
262 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
263
264static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
265 unsigned int sw)
266{
267 if (cpu < nr_cpu_ids) {
268 if (sw & 1) {
269 cpu = cpumask_next(cpu, mask);
270 if (cpu < nr_cpu_ids)
271 return cpu;
272 }
273 if (sw & 2)
274 return WORK_CPU_UNBOUND;
275 }
276 return WORK_CPU_NONE;
277}
278
279static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
280 struct workqueue_struct *wq)
281{
282 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
283}
284
285/*
286 * CPU iterators
287 *
288 * An extra gcwq is defined for an invalid cpu number
289 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
290 * specific CPU. The following iterators are similar to
291 * for_each_*_cpu() iterators but also considers the unbound gcwq.
292 *
293 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
294 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
295 * for_each_cwq_cpu() : possible CPUs for bound workqueues,
296 * WORK_CPU_UNBOUND for unbound workqueues
297 */
298#define for_each_gcwq_cpu(cpu) \
299 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
300 (cpu) < WORK_CPU_NONE; \
301 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
302
303#define for_each_online_gcwq_cpu(cpu) \
304 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
305 (cpu) < WORK_CPU_NONE; \
306 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
307
308#define for_each_cwq_cpu(cpu, wq) \
309 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
310 (cpu) < WORK_CPU_NONE; \
311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
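
As a rough usage sketch of these iterators (the helper name is hypothetical; the real initialization code in this file follows the same loop shape), a walk over every gcwq including the unbound one looks like:

    /* hypothetical helper -- shown only to illustrate the iterator */
    static void __init example_init_gcwqs(void)
    {
            unsigned int cpu;

            for_each_gcwq_cpu(cpu) {
                    struct global_cwq *gcwq = get_gcwq(cpu);

                    /* the final iteration visits WORK_CPU_UNBOUND */
                    spin_lock_init(&gcwq->lock);
                    INIT_LIST_HEAD(&gcwq->worklist);
            }
    }
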
312
71#ifdef CONFIG_DEBUG_OBJECTS_WORK 313#ifdef CONFIG_DEBUG_OBJECTS_WORK
72 314
73static struct debug_obj_descr work_debug_descr; 315static struct debug_obj_descr work_debug_descr;
@@ -107,7 +349,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
107 * statically initialized. We just make sure that it 349 * statically initialized. We just make sure that it
108 * is tracked in the object tracker. 350 * is tracked in the object tracker.
109 */ 351 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { 352 if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr); 353 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr); 354 debug_object_activate(work, &work_debug_descr);
113 return 0; 355 return 0;
@@ -181,94 +423,586 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
181/* Serializes the accesses to the list of workqueues. */ 423/* Serializes the accesses to the list of workqueues. */
182static DEFINE_SPINLOCK(workqueue_lock); 424static DEFINE_SPINLOCK(workqueue_lock);
183static LIST_HEAD(workqueues); 425static LIST_HEAD(workqueues);
426static bool workqueue_freezing; /* W: have wqs started freezing? */
184 427
185static int singlethread_cpu __read_mostly;
186static const struct cpumask *cpu_singlethread_map __read_mostly;
187/* 428/*
188 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 429 * The almighty global cpu workqueues. nr_running is the only field
189 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 430 * which is expected to be used frequently by other cpus via
190 * which comes in between can't use for_each_online_cpu(). We could 431 * try_to_wake_up(). Put it in a separate cacheline.
191 * use cpu_possible_map, the cpumask below is more a documentation
192 * than optimization.
193 */ 432 */
194static cpumask_var_t cpu_populated_map __read_mostly; 433static DEFINE_PER_CPU(struct global_cwq, global_cwq);
434static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
195 435
196/* If it's single threaded, it isn't in the list of workqueues. */ 436/*
197static inline int is_wq_single_threaded(struct workqueue_struct *wq) 437 * Global cpu workqueue and nr_running counter for unbound gcwq. The
438 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
439 * workers have WORKER_UNBOUND set.
440 */
441static struct global_cwq unbound_global_cwq;
442static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */
443
444static int worker_thread(void *__worker);
445
446static struct global_cwq *get_gcwq(unsigned int cpu)
447{
448 if (cpu != WORK_CPU_UNBOUND)
449 return &per_cpu(global_cwq, cpu);
450 else
451 return &unbound_global_cwq;
452}
453
454static atomic_t *get_gcwq_nr_running(unsigned int cpu)
455{
456 if (cpu != WORK_CPU_UNBOUND)
457 return &per_cpu(gcwq_nr_running, cpu);
458 else
459 return &unbound_gcwq_nr_running;
460}
461
462static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
463 struct workqueue_struct *wq)
464{
465 if (!(wq->flags & WQ_UNBOUND)) {
466 if (likely(cpu < nr_cpu_ids)) {
467#ifdef CONFIG_SMP
468 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
469#else
470 return wq->cpu_wq.single;
471#endif
472 }
473 } else if (likely(cpu == WORK_CPU_UNBOUND))
474 return wq->cpu_wq.single;
475 return NULL;
476}
477
478static unsigned int work_color_to_flags(int color)
198{ 479{
199 return wq->singlethread; 480 return color << WORK_STRUCT_COLOR_SHIFT;
200} 481}
201 482
202static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) 483static int get_work_color(struct work_struct *work)
203{ 484{
204 return is_wq_single_threaded(wq) 485 return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
205 ? cpu_singlethread_map : cpu_populated_map; 486 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
206} 487}
207 488
208static 489static int work_next_color(int color)
209struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
210{ 490{
211 if (unlikely(is_wq_single_threaded(wq))) 491 return (color + 1) % WORK_NR_COLORS;
212 cpu = singlethread_cpu;
213 return per_cpu_ptr(wq->cpu_wq, cpu);
214} 492}
215 493
216/* 494/*
217 * Set the workqueue on which a work item is to be run 495 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
218 * - Must *only* be called if the pending flag is set 496 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
497 * cleared and the work data contains the cpu number it was last on.
498 *
499 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
500 * cwq, cpu or clear work->data. These functions should only be
501 * called while the work is owned - ie. while the PENDING bit is set.
502 *
503 * get_work_[g]cwq() can be used to obtain the gcwq or cwq
504 * corresponding to a work. gcwq is available once the work has been
505 * queued anywhere after initialization. cwq is available only from
506 * queueing until execution starts.
219 */ 507 */
220static inline void set_wq_data(struct work_struct *work, 508static inline void set_work_data(struct work_struct *work, unsigned long data,
221 struct cpu_workqueue_struct *cwq) 509 unsigned long flags)
222{ 510{
223 unsigned long new;
224
225 BUG_ON(!work_pending(work)); 511 BUG_ON(!work_pending(work));
512 atomic_long_set(&work->data, data | flags | work_static(work));
513}
514
515static void set_work_cwq(struct work_struct *work,
516 struct cpu_workqueue_struct *cwq,
517 unsigned long extra_flags)
518{
519 set_work_data(work, (unsigned long)cwq,
520 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
521}
522
523static void set_work_cpu(struct work_struct *work, unsigned int cpu)
524{
525 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
526}
527
528static void clear_work_data(struct work_struct *work)
529{
530 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
531}
532
533static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
534{
535 unsigned long data = atomic_long_read(&work->data);
536
537 if (data & WORK_STRUCT_CWQ)
538 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
539 else
540 return NULL;
541}
542
543static struct global_cwq *get_work_gcwq(struct work_struct *work)
544{
545 unsigned long data = atomic_long_read(&work->data);
546 unsigned int cpu;
547
548 if (data & WORK_STRUCT_CWQ)
549 return ((struct cpu_workqueue_struct *)
550 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
551
552 cpu = data >> WORK_STRUCT_FLAG_BITS;
553 if (cpu == WORK_CPU_NONE)
554 return NULL;
555
556 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
557 return get_gcwq(cpu);
558}
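
To make the encoding above concrete, here is an illustrative round trip through the helpers. It assumes the caller owns the PENDING bit, as the comment block requires; the function itself is hypothetical and only demonstrates what each helper leaves in work->data:

    /* hypothetical illustration -- caller must own the PENDING bit */
    static void example_work_data_round_trip(struct work_struct *work,
                                             struct cpu_workqueue_struct *cwq)
    {
            /* queueing: data = cwq pointer | WORK_STRUCT_CWQ | WORK_STRUCT_PENDING */
            set_work_cwq(work, cwq, 0);
            WARN_ON(get_work_cwq(work) != cwq);
            WARN_ON(get_work_gcwq(work) != cwq->gcwq);

            /* execution start: data = cpu << WORK_STRUCT_FLAG_BITS | PENDING */
            set_work_cpu(work, cwq->gcwq->cpu);
            WARN_ON(get_work_cwq(work) != NULL);            /* cwq is gone */
            WARN_ON(get_work_gcwq(work) != cwq->gcwq);      /* gcwq still findable */
    }
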
559
560/*
561 * Policy functions. These define the policies on how the global
562 * worker pool is managed. Unless noted otherwise, these functions
563 * assume that they're being called with gcwq->lock held.
564 */
565
566static bool __need_more_worker(struct global_cwq *gcwq)
567{
568 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
569 gcwq->flags & GCWQ_HIGHPRI_PENDING;
570}
571
572/*
573 * Need to wake up a worker? Called from anything but currently
574 * running workers.
575 */
576static bool need_more_worker(struct global_cwq *gcwq)
577{
578 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
579}
580
581/* Can I start working? Called from busy but !running workers. */
582static bool may_start_working(struct global_cwq *gcwq)
583{
584 return gcwq->nr_idle;
585}
586
587/* Do I need to keep working? Called from currently running workers. */
588static bool keep_working(struct global_cwq *gcwq)
589{
590 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
591
592 return !list_empty(&gcwq->worklist) &&
593 (atomic_read(nr_running) <= 1 ||
594 gcwq->flags & GCWQ_HIGHPRI_PENDING);
595}
596
597/* Do we need a new worker? Called from manager. */
598static bool need_to_create_worker(struct global_cwq *gcwq)
599{
600 return need_more_worker(gcwq) && !may_start_working(gcwq);
601}
602
603/* Do I need to be the manager? */
604static bool need_to_manage_workers(struct global_cwq *gcwq)
605{
606 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
607}
226 608
227 new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); 609/* Do we have too many workers and should some go away? */
228 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); 610static bool too_many_workers(struct global_cwq *gcwq)
229 atomic_long_set(&work->data, new); 611{
612 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
613 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
614 int nr_busy = gcwq->nr_workers - nr_idle;
615
616 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
230} 617}
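
A quick worked check of the ratio above, ignoring the managing-worker adjustment for brevity and assuming MAX_IDLE_WORKERS_RATIO is 4 as in the enum earlier in the file (treat that value as an assumption of this note):

    #include <assert.h>
    #include <stdbool.h>

    #define MAX_IDLE_WORKERS_RATIO  4       /* assumed value, see the enum above */

    static bool example_too_many(int nr_workers, int nr_idle)
    {
            int nr_busy = nr_workers - nr_idle;

            return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
    }

    int main(void)
    {
            assert(!example_too_many(10, 2));   /* only the two "free" idle workers */
            assert(!example_too_many(15, 4));   /* 2 extra idle vs 11 busy: 8 < 11 */
            assert(example_too_many(10, 4));    /* 2 extra idle vs 6 busy: 8 >= 6 */
            return 0;
    }
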
231 618
232/* 619/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. 620 * Wake up functions.
621 */
622
623/* Return the first worker. Safe with preemption disabled */
624static struct worker *first_worker(struct global_cwq *gcwq)
625{
626 if (unlikely(list_empty(&gcwq->idle_list)))
627 return NULL;
628
629 return list_first_entry(&gcwq->idle_list, struct worker, entry);
630}
631
632/**
633 * wake_up_worker - wake up an idle worker
634 * @gcwq: gcwq to wake worker for
635 *
636 * Wake up the first idle worker of @gcwq.
637 *
638 * CONTEXT:
639 * spin_lock_irq(gcwq->lock).
640 */
641static void wake_up_worker(struct global_cwq *gcwq)
642{
643 struct worker *worker = first_worker(gcwq);
644
645 if (likely(worker))
646 wake_up_process(worker->task);
647}
648
649/**
650 * wq_worker_waking_up - a worker is waking up
651 * @task: task waking up
652 * @cpu: CPU @task is waking up to
653 *
654 * This function is called during try_to_wake_up() when a worker is
655 * being awoken.
656 *
657 * CONTEXT:
658 * spin_lock_irq(rq->lock)
659 */
660void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
661{
662 struct worker *worker = kthread_data(task);
663
664 if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
665 atomic_inc(get_gcwq_nr_running(cpu));
666}
667
668/**
669 * wq_worker_sleeping - a worker is going to sleep
670 * @task: task going to sleep
671 * @cpu: CPU in question, must be the current CPU number
672 *
673 * This function is called during schedule() when a busy worker is
674 * going to sleep. Worker on the same cpu can be woken up by
675 * returning pointer to its task.
676 *
677 * CONTEXT:
678 * spin_lock_irq(rq->lock)
679 *
680 * RETURNS:
681 * Worker task on @cpu to wake up, %NULL if none.
682 */
683struct task_struct *wq_worker_sleeping(struct task_struct *task,
684 unsigned int cpu)
685{
686 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
687 struct global_cwq *gcwq = get_gcwq(cpu);
688 atomic_t *nr_running = get_gcwq_nr_running(cpu);
689
690 if (unlikely(worker->flags & WORKER_NOT_RUNNING))
691 return NULL;
692
693 /* this can only happen on the local cpu */
694 BUG_ON(cpu != raw_smp_processor_id());
695
696 /*
697 * The counterpart of the following dec_and_test, implied mb,
698 * worklist not empty test sequence is in insert_work().
699 * Please read comment there.
700 *
701 * NOT_RUNNING is clear. This means that trustee is not in
702 * charge and we're running on the local cpu w/ rq lock held
703 * and preemption disabled, which in turn means that no one else
704 * could be manipulating idle_list, so dereferencing idle_list
705 * without gcwq lock is safe.
706 */
707 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
708 to_wakeup = first_worker(gcwq);
709 return to_wakeup ? to_wakeup->task : NULL;
710}
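
These two functions are the scheduler-facing half of concurrency management. Roughly, and only as a sketch (the authoritative wiring lives in kernel/sched.c and may differ in detail; try_to_wake_up_local() is assumed here to be the rq-lock-held wakeup primitive added alongside these hooks), schedule() is expected to notice a blocking PF_WQ_WORKER task and hand the cpu to another worker:

    /* sketch of the expected caller inside schedule(), not verbatim */
    if (prev->flags & PF_WQ_WORKER) {
            struct task_struct *to_wakeup;

            to_wakeup = wq_worker_sleeping(prev, cpu);
            if (to_wakeup)
                    try_to_wake_up_local(to_wakeup);   /* keep the cpu busy */
    }
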
711
712/**
713 * worker_set_flags - set worker flags and adjust nr_running accordingly
714 * @worker: self
715 * @flags: flags to set
716 * @wakeup: wakeup an idle worker if necessary
717 *
718 * Set @flags in @worker->flags and adjust nr_running accordingly. If
719 * nr_running becomes zero and @wakeup is %true, an idle worker is
720 * woken up.
721 *
722 * CONTEXT:
723 * spin_lock_irq(gcwq->lock)
234 */ 724 */
235static inline void clear_wq_data(struct work_struct *work) 725static inline void worker_set_flags(struct worker *worker, unsigned int flags,
726 bool wakeup)
236{ 727{
237 unsigned long flags = *work_data_bits(work) & 728 struct global_cwq *gcwq = worker->gcwq;
238 (1UL << WORK_STRUCT_STATIC); 729
239 atomic_long_set(&work->data, flags); 730 WARN_ON_ONCE(worker->task != current);
731
732 /*
733 * If transitioning into NOT_RUNNING, adjust nr_running and
734 * wake up an idle worker as necessary if requested by
735 * @wakeup.
736 */
737 if ((flags & WORKER_NOT_RUNNING) &&
738 !(worker->flags & WORKER_NOT_RUNNING)) {
739 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
740
741 if (wakeup) {
742 if (atomic_dec_and_test(nr_running) &&
743 !list_empty(&gcwq->worklist))
744 wake_up_worker(gcwq);
745 } else
746 atomic_dec(nr_running);
747 }
748
749 worker->flags |= flags;
240} 750}
241 751
242static inline 752/**
243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 753 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
754 * @worker: self
755 * @flags: flags to clear
756 *
757 * Clear @flags in @worker->flags and adjust nr_running accordingly.
758 *
759 * CONTEXT:
760 * spin_lock_irq(gcwq->lock)
761 */
762static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
244{ 763{
245 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 764 struct global_cwq *gcwq = worker->gcwq;
765 unsigned int oflags = worker->flags;
766
767 WARN_ON_ONCE(worker->task != current);
768
769 worker->flags &= ~flags;
770
771 /* if transitioning out of NOT_RUNNING, increment nr_running */
772 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
773 if (!(worker->flags & WORKER_NOT_RUNNING))
774 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
246} 775}
247 776
777/**
778 * busy_worker_head - return the busy hash head for a work
779 * @gcwq: gcwq of interest
780 * @work: work to be hashed
781 *
782 * Return hash head of @gcwq for @work.
783 *
784 * CONTEXT:
785 * spin_lock_irq(gcwq->lock).
786 *
787 * RETURNS:
788 * Pointer to the hash head.
789 */
790static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
791 struct work_struct *work)
792{
793 const int base_shift = ilog2(sizeof(struct work_struct));
794 unsigned long v = (unsigned long)work;
795
796 /* simple shift and fold hash, do we need something better? */
797 v >>= base_shift;
798 v += v >> BUSY_WORKER_HASH_ORDER;
799 v &= BUSY_WORKER_HASH_MASK;
800
801 return &gcwq->busy_hash[v];
802}
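
A small stand-alone rendition of the shift-and-fold hash, useful for convincing yourself that consecutively allocated work_structs land in different buckets. BUSY_WORKER_HASH_ORDER is assumed here to be 6, matching the enum earlier in the file; treat both constants and the 64-byte spacing as assumptions:

    #include <stdio.h>

    #define BUSY_WORKER_HASH_ORDER  6       /* assumed, see the enum above */
    #define BUSY_WORKER_HASH_SIZE   (1 << BUSY_WORKER_HASH_ORDER)
    #define BUSY_WORKER_HASH_MASK   (BUSY_WORKER_HASH_SIZE - 1)

    static unsigned long busy_hash(unsigned long work_addr, int base_shift)
    {
            unsigned long v = work_addr >> base_shift;

            v += v >> BUSY_WORKER_HASH_ORDER;
            return v & BUSY_WORKER_HASH_MASK;
    }

    int main(void)
    {
            /* pretend work_structs sit 64 bytes apart, so base_shift == 6 */
            for (unsigned long a = 0x1000; a < 0x1000 + 8 * 64; a += 64)
                    printf("%#lx -> bucket %lu\n", a, busy_hash(a, 6));
            return 0;
    }
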
803
804/**
805 * __find_worker_executing_work - find worker which is executing a work
806 * @gcwq: gcwq of interest
807 * @bwh: hash head as returned by busy_worker_head()
808 * @work: work to find worker for
809 *
810 * Find a worker which is executing @work on @gcwq. @bwh should be
811 * the hash head obtained by calling busy_worker_head() with the same
812 * work.
813 *
814 * CONTEXT:
815 * spin_lock_irq(gcwq->lock).
816 *
817 * RETURNS:
818 * Pointer to worker which is executing @work if found, NULL
819 * otherwise.
820 */
821static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
822 struct hlist_head *bwh,
823 struct work_struct *work)
824{
825 struct worker *worker;
826 struct hlist_node *tmp;
827
828 hlist_for_each_entry(worker, tmp, bwh, hentry)
829 if (worker->current_work == work)
830 return worker;
831 return NULL;
832}
833
834/**
835 * find_worker_executing_work - find worker which is executing a work
836 * @gcwq: gcwq of interest
837 * @work: work to find worker for
838 *
839 * Find a worker which is executing @work on @gcwq. This function is
840 * identical to __find_worker_executing_work() except that this
841 * function calculates @bwh itself.
842 *
843 * CONTEXT:
844 * spin_lock_irq(gcwq->lock).
845 *
846 * RETURNS:
847 * Pointer to worker which is executing @work if found, NULL
848 * otherwise.
849 */
850static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
851 struct work_struct *work)
852{
853 return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
854 work);
855}
856
857/**
858 * gcwq_determine_ins_pos - find insertion position
859 * @gcwq: gcwq of interest
860 * @cwq: cwq a work is being queued for
861 *
862 * A work for @cwq is about to be queued on @gcwq, determine insertion
863 * position for the work. If @cwq is for HIGHPRI wq, the work is
864 * queued at the head of the queue but in FIFO order with respect to
865 * other HIGHPRI works; otherwise, at the end of the queue. This
866 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
867 * there are HIGHPRI works pending.
868 *
869 * CONTEXT:
870 * spin_lock_irq(gcwq->lock).
871 *
872 * RETURNS:
873 * Pointer to insertion position.
874 */
875static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
876 struct cpu_workqueue_struct *cwq)
877{
878 struct work_struct *twork;
879
880 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
881 return &gcwq->worklist;
882
883 list_for_each_entry(twork, &gcwq->worklist, entry) {
884 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
885
886 if (!(tcwq->wq->flags & WQ_HIGHPRI))
887 break;
888 }
889
890 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
891 return &twork->entry;
892}
893
894/**
895 * insert_work - insert a work into gcwq
896 * @cwq: cwq @work belongs to
897 * @work: work to insert
898 * @head: insertion point
899 * @extra_flags: extra WORK_STRUCT_* flags to set
900 *
901 * Insert @work which belongs to @cwq into @gcwq after @head.
902 * @extra_flags is or'd to work_struct flags.
903 *
904 * CONTEXT:
905 * spin_lock_irq(gcwq->lock).
906 */
248static void insert_work(struct cpu_workqueue_struct *cwq, 907static void insert_work(struct cpu_workqueue_struct *cwq,
249 struct work_struct *work, struct list_head *head) 908 struct work_struct *work, struct list_head *head,
909 unsigned int extra_flags)
250{ 910{
251 trace_workqueue_insertion(cwq->thread, work); 911 struct global_cwq *gcwq = cwq->gcwq;
912
913 /* we own @work, set data and link */
914 set_work_cwq(work, cwq, extra_flags);
252 915
253 set_wq_data(work, cwq);
254 /* 916 /*
255 * Ensure that we get the right work->data if we see the 917 * Ensure that we get the right work->data if we see the
256 * result of list_add() below, see try_to_grab_pending(). 918 * result of list_add() below, see try_to_grab_pending().
257 */ 919 */
258 smp_wmb(); 920 smp_wmb();
921
259 list_add_tail(&work->entry, head); 922 list_add_tail(&work->entry, head);
260 wake_up(&cwq->more_work); 923
924 /*
925 * Ensure either wq_worker_sleeping() sees the above
926 * list_add_tail() or we see zero nr_running to avoid workers
927 * lying around lazily while there are works to be processed.
928 */
929 smp_mb();
930
931 if (__need_more_worker(gcwq))
932 wake_up_worker(gcwq);
261} 933}
262 934
263static void __queue_work(struct cpu_workqueue_struct *cwq, 935static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
264 struct work_struct *work) 936 struct work_struct *work)
265{ 937{
938 struct global_cwq *gcwq;
939 struct cpu_workqueue_struct *cwq;
940 struct list_head *worklist;
941 unsigned int work_flags;
266 unsigned long flags; 942 unsigned long flags;
267 943
268 debug_work_activate(work); 944 debug_work_activate(work);
269 spin_lock_irqsave(&cwq->lock, flags); 945
270 insert_work(cwq, work, &cwq->worklist); 946 if (WARN_ON_ONCE(wq->flags & WQ_DYING))
271 spin_unlock_irqrestore(&cwq->lock, flags); 947 return;
948
949 /* determine gcwq to use */
950 if (!(wq->flags & WQ_UNBOUND)) {
951 struct global_cwq *last_gcwq;
952
953 if (unlikely(cpu == WORK_CPU_UNBOUND))
954 cpu = raw_smp_processor_id();
955
956 /*
957 * It's multi cpu. If @wq is non-reentrant and @work
958 * was previously on a different cpu, it might still
959 * be running there, in which case the work needs to
960 * be queued on that cpu to guarantee non-reentrance.
961 */
962 gcwq = get_gcwq(cpu);
963 if (wq->flags & WQ_NON_REENTRANT &&
964 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
965 struct worker *worker;
966
967 spin_lock_irqsave(&last_gcwq->lock, flags);
968
969 worker = find_worker_executing_work(last_gcwq, work);
970
971 if (worker && worker->current_cwq->wq == wq)
972 gcwq = last_gcwq;
973 else {
974 /* meh... not running there, queue here */
975 spin_unlock_irqrestore(&last_gcwq->lock, flags);
976 spin_lock_irqsave(&gcwq->lock, flags);
977 }
978 } else
979 spin_lock_irqsave(&gcwq->lock, flags);
980 } else {
981 gcwq = get_gcwq(WORK_CPU_UNBOUND);
982 spin_lock_irqsave(&gcwq->lock, flags);
983 }
984
985 /* gcwq determined, get cwq and queue */
986 cwq = get_cwq(gcwq->cpu, wq);
987 trace_workqueue_queue_work(cpu, cwq, work);
988
989 BUG_ON(!list_empty(&work->entry));
990
991 cwq->nr_in_flight[cwq->work_color]++;
992 work_flags = work_color_to_flags(cwq->work_color);
993
994 if (likely(cwq->nr_active < cwq->max_active)) {
995 trace_workqueue_activate_work(work);
996 cwq->nr_active++;
997 worklist = gcwq_determine_ins_pos(gcwq, cwq);
998 } else {
999 work_flags |= WORK_STRUCT_DELAYED;
1000 worklist = &cwq->delayed_works;
1001 }
1002
1003 insert_work(cwq, work, worklist, work_flags);
1004
1005 spin_unlock_irqrestore(&gcwq->lock, flags);
272} 1006}
273 1007
274/** 1008/**
@@ -308,9 +1042,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
308{ 1042{
309 int ret = 0; 1043 int ret = 0;
310 1044
311 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1045 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
312 BUG_ON(!list_empty(&work->entry)); 1046 __queue_work(cpu, wq, work);
313 __queue_work(wq_per_cpu(wq, cpu), work);
314 ret = 1; 1047 ret = 1;
315 } 1048 }
316 return ret; 1049 return ret;
@@ -320,10 +1053,9 @@ EXPORT_SYMBOL_GPL(queue_work_on);
320static void delayed_work_timer_fn(unsigned long __data) 1053static void delayed_work_timer_fn(unsigned long __data)
321{ 1054{
322 struct delayed_work *dwork = (struct delayed_work *)__data; 1055 struct delayed_work *dwork = (struct delayed_work *)__data;
323 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); 1056 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
324 struct workqueue_struct *wq = cwq->wq;
325 1057
326 __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); 1058 __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
327} 1059}
328 1060
329/** 1061/**
@@ -360,14 +1092,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
360 struct timer_list *timer = &dwork->timer; 1092 struct timer_list *timer = &dwork->timer;
361 struct work_struct *work = &dwork->work; 1093 struct work_struct *work = &dwork->work;
362 1094
363 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1095 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1096 unsigned int lcpu;
1097
364 BUG_ON(timer_pending(timer)); 1098 BUG_ON(timer_pending(timer));
365 BUG_ON(!list_empty(&work->entry)); 1099 BUG_ON(!list_empty(&work->entry));
366 1100
367 timer_stats_timer_set_start_info(&dwork->timer); 1101 timer_stats_timer_set_start_info(&dwork->timer);
368 1102
369 /* This stores cwq for the moment, for the timer_fn */ 1103 /*
370 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); 1104 * This stores cwq for the moment, for the timer_fn.
1105 * Note that the work's gcwq is preserved to allow
1106 * reentrance detection for delayed works.
1107 */
1108 if (!(wq->flags & WQ_UNBOUND)) {
1109 struct global_cwq *gcwq = get_work_gcwq(work);
1110
1111 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1112 lcpu = gcwq->cpu;
1113 else
1114 lcpu = raw_smp_processor_id();
1115 } else
1116 lcpu = WORK_CPU_UNBOUND;
1117
1118 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1119
371 timer->expires = jiffies + delay; 1120 timer->expires = jiffies + delay;
372 timer->data = (unsigned long)dwork; 1121 timer->data = (unsigned long)dwork;
373 timer->function = delayed_work_timer_fn; 1122 timer->function = delayed_work_timer_fn;
@@ -382,80 +1131,889 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
382} 1131}
383EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1132EXPORT_SYMBOL_GPL(queue_delayed_work_on);
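
For callers, the queueing entry points keep their familiar shape. A minimal, hypothetical user (all driver-side names below are invented) still looks like this:

    #include <linux/workqueue.h>

    static void my_work_fn(struct work_struct *work)
    {
            /* runs in process context on whichever gcwq worker picks it up */
    }

    static DECLARE_WORK(my_work, my_work_fn);
    static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);

    static void example_kick(void)
    {
            schedule_work(&my_work);                           /* goes to system_wq */
            queue_delayed_work(system_long_wq, &my_dwork, HZ); /* ~1s from now */
    }
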
384 1133
385static void run_workqueue(struct cpu_workqueue_struct *cwq) 1134/**
1135 * worker_enter_idle - enter idle state
1136 * @worker: worker which is entering idle state
1137 *
1138 * @worker is entering idle state. Update stats and idle timer if
1139 * necessary.
1140 *
1141 * LOCKING:
1142 * spin_lock_irq(gcwq->lock).
1143 */
1144static void worker_enter_idle(struct worker *worker)
386{ 1145{
387 spin_lock_irq(&cwq->lock); 1146 struct global_cwq *gcwq = worker->gcwq;
388 while (!list_empty(&cwq->worklist)) { 1147
389 struct work_struct *work = list_entry(cwq->worklist.next, 1148 BUG_ON(worker->flags & WORKER_IDLE);
390 struct work_struct, entry); 1149 BUG_ON(!list_empty(&worker->entry) &&
391 work_func_t f = work->func; 1150 (worker->hentry.next || worker->hentry.pprev));
392#ifdef CONFIG_LOCKDEP 1151
1152 /* can't use worker_set_flags(), also called from start_worker() */
1153 worker->flags |= WORKER_IDLE;
1154 gcwq->nr_idle++;
1155 worker->last_active = jiffies;
1156
1157 /* idle_list is LIFO */
1158 list_add(&worker->entry, &gcwq->idle_list);
1159
1160 if (likely(!(worker->flags & WORKER_ROGUE))) {
1161 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1162 mod_timer(&gcwq->idle_timer,
1163 jiffies + IDLE_WORKER_TIMEOUT);
1164 } else
1165 wake_up_all(&gcwq->trustee_wait);
1166
1167 /* sanity check nr_running */
1168 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1169 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1170}
1171
1172/**
1173 * worker_leave_idle - leave idle state
1174 * @worker: worker which is leaving idle state
1175 *
1176 * @worker is leaving idle state. Update stats.
1177 *
1178 * LOCKING:
1179 * spin_lock_irq(gcwq->lock).
1180 */
1181static void worker_leave_idle(struct worker *worker)
1182{
1183 struct global_cwq *gcwq = worker->gcwq;
1184
1185 BUG_ON(!(worker->flags & WORKER_IDLE));
1186 worker_clr_flags(worker, WORKER_IDLE);
1187 gcwq->nr_idle--;
1188 list_del_init(&worker->entry);
1189}
1190
1191/**
1192 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1193 * @worker: self
1194 *
1195 * Works which are scheduled while the cpu is online must at least be
1196 * scheduled to a worker which is bound to the cpu so that if they are
1197 * flushed from cpu callbacks while cpu is going down, they are
1198 * guaranteed to execute on the cpu.
1199 *
1200 * This function is to be used by rogue workers and rescuers to bind
1201 * themselves to the target cpu and may race with cpu going down or
1202 * coming online. kthread_bind() can't be used because it may put the
1203 * worker on an already dead cpu and set_cpus_allowed_ptr() can't be used
1204 * verbatim as it's best effort and blocking and gcwq may be
1205 * [dis]associated in the meantime.
1206 *
1207 * This function tries set_cpus_allowed() and locks gcwq and verifies
1208 * the binding against GCWQ_DISASSOCIATED which is set during
1209 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1210 * idle state or fetches works without dropping lock, it can guarantee
1211 * the scheduling requirement described in the first paragraph.
1212 *
1213 * CONTEXT:
1214 * Might sleep. Called without any lock but returns with gcwq->lock
1215 * held.
1216 *
1217 * RETURNS:
1218 * %true if the associated gcwq is online (@worker is successfully
1219 * bound), %false if offline.
1220 */
1221static bool worker_maybe_bind_and_lock(struct worker *worker)
1222__acquires(&gcwq->lock)
1223{
1224 struct global_cwq *gcwq = worker->gcwq;
1225 struct task_struct *task = worker->task;
1226
1227 while (true) {
393 /* 1228 /*
394 * It is permissible to free the struct work_struct 1229 * The following call may fail, succeed or succeed
395 * from inside the function that is called from it, 1230 * without actually migrating the task to the cpu if
396 * this we need to take into account for lockdep too. 1231 * it races with cpu hotunplug operation. Verify
397 * To avoid bogus "held lock freed" warnings as well 1232 * against GCWQ_DISASSOCIATED.
398 * as problems when looking into work->lockdep_map,
399 * make a copy and use that here.
400 */ 1233 */
401 struct lockdep_map lockdep_map = work->lockdep_map; 1234 if (!(gcwq->flags & GCWQ_DISASSOCIATED))
402#endif 1235 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
403 trace_workqueue_execution(cwq->thread, work); 1236
404 debug_work_deactivate(work); 1237 spin_lock_irq(&gcwq->lock);
405 cwq->current_work = work; 1238 if (gcwq->flags & GCWQ_DISASSOCIATED)
406 list_del_init(cwq->worklist.next); 1239 return false;
407 spin_unlock_irq(&cwq->lock); 1240 if (task_cpu(task) == gcwq->cpu &&
408 1241 cpumask_equal(&current->cpus_allowed,
409 BUG_ON(get_wq_data(work) != cwq); 1242 get_cpu_mask(gcwq->cpu)))
410 work_clear_pending(work); 1243 return true;
411 lock_map_acquire(&cwq->wq->lockdep_map); 1244 spin_unlock_irq(&gcwq->lock);
412 lock_map_acquire(&lockdep_map); 1245
413 f(work); 1246 /* CPU has come up in between, retry migration */
414 lock_map_release(&lockdep_map); 1247 cpu_relax();
415 lock_map_release(&cwq->wq->lockdep_map); 1248 }
416 1249}
417 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 1250
418 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 1251/*
419 "%s/0x%08x/%d\n", 1252 * Function for worker->rebind_work used to rebind rogue busy workers
420 current->comm, preempt_count(), 1253 * to the associated cpu which is coming back online. This is
421 task_pid_nr(current)); 1254 * scheduled by cpu up but can race with other cpu hotplug operations
422 printk(KERN_ERR " last function: "); 1255 * and may be executed twice without intervening cpu down.
423 print_symbol("%s\n", (unsigned long)f); 1256 */
424 debug_show_held_locks(current); 1257static void worker_rebind_fn(struct work_struct *work)
425 dump_stack(); 1258{
1259 struct worker *worker = container_of(work, struct worker, rebind_work);
1260 struct global_cwq *gcwq = worker->gcwq;
1261
1262 if (worker_maybe_bind_and_lock(worker))
1263 worker_clr_flags(worker, WORKER_REBIND);
1264
1265 spin_unlock_irq(&gcwq->lock);
1266}
1267
1268static struct worker *alloc_worker(void)
1269{
1270 struct worker *worker;
1271
1272 worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1273 if (worker) {
1274 INIT_LIST_HEAD(&worker->entry);
1275 INIT_LIST_HEAD(&worker->scheduled);
1276 INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1277 /* on creation a worker is in !idle && prep state */
1278 worker->flags = WORKER_PREP;
1279 }
1280 return worker;
1281}
1282
1283/**
1284 * create_worker - create a new workqueue worker
1285 * @gcwq: gcwq the new worker will belong to
1286 * @bind: whether to set affinity to @cpu or not
1287 *
1288 * Create a new worker which is bound to @gcwq. The returned worker
1289 * can be started by calling start_worker() or destroyed using
1290 * destroy_worker().
1291 *
1292 * CONTEXT:
1293 * Might sleep. Does GFP_KERNEL allocations.
1294 *
1295 * RETURNS:
1296 * Pointer to the newly created worker.
1297 */
1298static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1299{
1300 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1301 struct worker *worker = NULL;
1302 int id = -1;
1303
1304 spin_lock_irq(&gcwq->lock);
1305 while (ida_get_new(&gcwq->worker_ida, &id)) {
1306 spin_unlock_irq(&gcwq->lock);
1307 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1308 goto fail;
1309 spin_lock_irq(&gcwq->lock);
1310 }
1311 spin_unlock_irq(&gcwq->lock);
1312
1313 worker = alloc_worker();
1314 if (!worker)
1315 goto fail;
1316
1317 worker->gcwq = gcwq;
1318 worker->id = id;
1319
1320 if (!on_unbound_cpu)
1321 worker->task = kthread_create(worker_thread, worker,
1322 "kworker/%u:%d", gcwq->cpu, id);
1323 else
1324 worker->task = kthread_create(worker_thread, worker,
1325 "kworker/u:%d", id);
1326 if (IS_ERR(worker->task))
1327 goto fail;
1328
1329 /*
1330 * A rogue worker will become a regular one if CPU comes
1331 * online later on. Make sure every worker has
1332 * PF_THREAD_BOUND set.
1333 */
1334 if (bind && !on_unbound_cpu)
1335 kthread_bind(worker->task, gcwq->cpu);
1336 else {
1337 worker->task->flags |= PF_THREAD_BOUND;
1338 if (on_unbound_cpu)
1339 worker->flags |= WORKER_UNBOUND;
1340 }
1341
1342 return worker;
1343fail:
1344 if (id >= 0) {
1345 spin_lock_irq(&gcwq->lock);
1346 ida_remove(&gcwq->worker_ida, id);
1347 spin_unlock_irq(&gcwq->lock);
1348 }
1349 kfree(worker);
1350 return NULL;
1351}
1352
1353/**
1354 * start_worker - start a newly created worker
1355 * @worker: worker to start
1356 *
1357 * Make the gcwq aware of @worker and start it.
1358 *
1359 * CONTEXT:
1360 * spin_lock_irq(gcwq->lock).
1361 */
1362static void start_worker(struct worker *worker)
1363{
1364 worker->flags |= WORKER_STARTED;
1365 worker->gcwq->nr_workers++;
1366 worker_enter_idle(worker);
1367 wake_up_process(worker->task);
1368}
1369
1370/**
1371 * destroy_worker - destroy a workqueue worker
1372 * @worker: worker to be destroyed
1373 *
1374 * Destroy @worker and adjust @gcwq stats accordingly.
1375 *
1376 * CONTEXT:
1377 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1378 */
1379static void destroy_worker(struct worker *worker)
1380{
1381 struct global_cwq *gcwq = worker->gcwq;
1382 int id = worker->id;
1383
1384 /* sanity check frenzy */
1385 BUG_ON(worker->current_work);
1386 BUG_ON(!list_empty(&worker->scheduled));
1387
1388 if (worker->flags & WORKER_STARTED)
1389 gcwq->nr_workers--;
1390 if (worker->flags & WORKER_IDLE)
1391 gcwq->nr_idle--;
1392
1393 list_del_init(&worker->entry);
1394 worker->flags |= WORKER_DIE;
1395
1396 spin_unlock_irq(&gcwq->lock);
1397
1398 kthread_stop(worker->task);
1399 kfree(worker);
1400
1401 spin_lock_irq(&gcwq->lock);
1402 ida_remove(&gcwq->worker_ida, id);
1403}
1404
1405static void idle_worker_timeout(unsigned long __gcwq)
1406{
1407 struct global_cwq *gcwq = (void *)__gcwq;
1408
1409 spin_lock_irq(&gcwq->lock);
1410
1411 if (too_many_workers(gcwq)) {
1412 struct worker *worker;
1413 unsigned long expires;
1414
1415 /* idle_list is kept in LIFO order, check the last one */
1416 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1417 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1418
1419 if (time_before(jiffies, expires))
1420 mod_timer(&gcwq->idle_timer, expires);
1421 else {
1422 /* it's been idle for too long, wake up manager */
1423 gcwq->flags |= GCWQ_MANAGE_WORKERS;
1424 wake_up_worker(gcwq);
426 } 1425 }
1426 }
1427
1428 spin_unlock_irq(&gcwq->lock);
1429}
1430
1431static bool send_mayday(struct work_struct *work)
1432{
1433 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1434 struct workqueue_struct *wq = cwq->wq;
1435 unsigned int cpu;
1436
1437 if (!(wq->flags & WQ_RESCUER))
1438 return false;
1439
1440 /* mayday mayday mayday */
1441 cpu = cwq->gcwq->cpu;
1442 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1443 if (cpu == WORK_CPU_UNBOUND)
1444 cpu = 0;
1445 if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1446 wake_up_process(wq->rescuer->task);
1447 return true;
1448}
1449
1450static void gcwq_mayday_timeout(unsigned long __gcwq)
1451{
1452 struct global_cwq *gcwq = (void *)__gcwq;
1453 struct work_struct *work;
1454
1455 spin_lock_irq(&gcwq->lock);
427 1456
428 spin_lock_irq(&cwq->lock); 1457 if (need_to_create_worker(gcwq)) {
429 cwq->current_work = NULL; 1458 /*
1459 * We've been trying to create a new worker but
1460 * haven't been successful. We might be hitting an
1461 * allocation deadlock. Send distress signals to
1462 * rescuers.
1463 */
1464 list_for_each_entry(work, &gcwq->worklist, entry)
1465 send_mayday(work);
430 } 1466 }
431 spin_unlock_irq(&cwq->lock); 1467
1468 spin_unlock_irq(&gcwq->lock);
1469
1470 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
432} 1471}
433 1472
434static int worker_thread(void *__cwq) 1473/**
1474 * maybe_create_worker - create a new worker if necessary
1475 * @gcwq: gcwq to create a new worker for
1476 *
1477 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
1478 * have at least one idle worker on return from this function. If
1479 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1480 * sent to all rescuers with works scheduled on @gcwq to resolve
1481 * possible allocation deadlock.
1482 *
1483 * On return, need_to_create_worker() is guaranteed to be false and
1484 * may_start_working() true.
1485 *
1486 * LOCKING:
1487 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1488 * multiple times. Does GFP_KERNEL allocations. Called only from
1489 * manager.
1490 *
1491 * RETURNS:
1492 * false if no action was taken and gcwq->lock stayed locked, true
1493 * otherwise.
1494 */
1495static bool maybe_create_worker(struct global_cwq *gcwq)
1496__releases(&gcwq->lock)
1497__acquires(&gcwq->lock)
435{ 1498{
436 struct cpu_workqueue_struct *cwq = __cwq; 1499 if (!need_to_create_worker(gcwq))
437 DEFINE_WAIT(wait); 1500 return false;
1501restart:
1502 spin_unlock_irq(&gcwq->lock);
1503
1504 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1505 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1506
1507 while (true) {
1508 struct worker *worker;
1509
1510 worker = create_worker(gcwq, true);
1511 if (worker) {
1512 del_timer_sync(&gcwq->mayday_timer);
1513 spin_lock_irq(&gcwq->lock);
1514 start_worker(worker);
1515 BUG_ON(need_to_create_worker(gcwq));
1516 return true;
1517 }
1518
1519 if (!need_to_create_worker(gcwq))
1520 break;
1521
1522 __set_current_state(TASK_INTERRUPTIBLE);
1523 schedule_timeout(CREATE_COOLDOWN);
1524
1525 if (!need_to_create_worker(gcwq))
1526 break;
1527 }
1528
1529 del_timer_sync(&gcwq->mayday_timer);
1530 spin_lock_irq(&gcwq->lock);
1531 if (need_to_create_worker(gcwq))
1532 goto restart;
1533 return true;
1534}
438 1535
439 if (cwq->wq->freezeable) 1536/**
440 set_freezable(); 1537 * maybe_destroy_worker - destroy workers which have been idle for a while
1538 * @gcwq: gcwq to destroy workers for
1539 *
1540 * Destroy @gcwq workers which have been idle for longer than
1541 * IDLE_WORKER_TIMEOUT.
1542 *
1543 * LOCKING:
1544 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1545 * multiple times. Called only from manager.
1546 *
1547 * RETURNS:
1548 * false if no action was taken and gcwq->lock stayed locked, true
1549 * otherwise.
1550 */
1551static bool maybe_destroy_workers(struct global_cwq *gcwq)
1552{
1553 bool ret = false;
441 1554
442 for (;;) { 1555 while (too_many_workers(gcwq)) {
443 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 1556 struct worker *worker;
444 if (!freezing(current) && 1557 unsigned long expires;
445 !kthread_should_stop() &&
446 list_empty(&cwq->worklist))
447 schedule();
448 finish_wait(&cwq->more_work, &wait);
449 1558
450 try_to_freeze(); 1559 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1560 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
451 1561
452 if (kthread_should_stop()) 1562 if (time_before(jiffies, expires)) {
1563 mod_timer(&gcwq->idle_timer, expires);
453 break; 1564 break;
1565 }
454 1566
455 run_workqueue(cwq); 1567 destroy_worker(worker);
1568 ret = true;
456 } 1569 }
457 1570
458 return 0; 1571 return ret;
1572}
1573
1574/**
1575 * manage_workers - manage worker pool
1576 * @worker: self
1577 *
1578 * Assume the manager role and manage gcwq worker pool @worker belongs
1579 * to. At any given time, there can be only zero or one manager per
1580 * gcwq. The exclusion is handled automatically by this function.
1581 *
1582 * The caller can safely start processing works on false return. On
1583 * true return, it's guaranteed that need_to_create_worker() is false
1584 * and may_start_working() is true.
1585 *
1586 * CONTEXT:
1587 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1588 * multiple times. Does GFP_KERNEL allocations.
1589 *
1590 * RETURNS:
1591 * false if no action was taken and gcwq->lock stayed locked, true if
1592 * some action was taken.
1593 */
1594static bool manage_workers(struct worker *worker)
1595{
1596 struct global_cwq *gcwq = worker->gcwq;
1597 bool ret = false;
1598
1599 if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1600 return ret;
1601
1602 gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1603 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1604
1605 /*
1606 * Destroy and then create so that may_start_working() is true
1607 * on return.
1608 */
1609 ret |= maybe_destroy_workers(gcwq);
1610 ret |= maybe_create_worker(gcwq);
1611
1612 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1613
1614 /*
1615 * The trustee might be waiting to take over the manager
1616 * position, tell it we're done.
1617 */
1618 if (unlikely(gcwq->trustee))
1619 wake_up_all(&gcwq->trustee_wait);
1620
1621 return ret;
1622}
1623
1624/**
1625 * move_linked_works - move linked works to a list
1626 * @work: start of series of works to be scheduled
1627 * @head: target list to append @work to
1628 * @nextp: out parameter for nested worklist walking
1629 *
1630 * Schedule linked works starting from @work to @head. Work series to
1631 * be scheduled starts at @work and includes any consecutive work with
1632 * WORK_STRUCT_LINKED set in its predecessor.
1633 *
1634 * If @nextp is not NULL, it's updated to point to the next work of
1635 * the last scheduled work. This allows move_linked_works() to be
1636 * nested inside outer list_for_each_entry_safe().
1637 *
1638 * CONTEXT:
1639 * spin_lock_irq(gcwq->lock).
1640 */
1641static void move_linked_works(struct work_struct *work, struct list_head *head,
1642 struct work_struct **nextp)
1643{
1644 struct work_struct *n;
1645
1646 /*
1647 * Linked worklist will always end before the end of the list,
1648 * use NULL for list head.
1649 */
1650 list_for_each_entry_safe_from(work, n, NULL, entry) {
1651 list_move_tail(&work->entry, head);
1652 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1653 break;
1654 }
1655
1656 /*
1657 * If we're already inside safe list traversal and have moved
1658 * multiple works to the scheduled queue, the next position
1659 * needs to be updated.
1660 */
1661 if (nextp)
1662 *nextp = n;
1663}
1664
1665static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1666{
1667 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1668 struct work_struct, entry);
1669 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1670
1671 trace_workqueue_activate_work(work);
1672 move_linked_works(work, pos, NULL);
1673 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1674 cwq->nr_active++;
1675}
1676
1677/**
1678 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1679 * @cwq: cwq of interest
1680 * @color: color of work which left the queue
1681 * @delayed: for a delayed work
1682 *
1683 * A work either has completed or is removed from pending queue,
1684 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1685 *
1686 * CONTEXT:
1687 * spin_lock_irq(gcwq->lock).
1688 */
1689static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1690 bool delayed)
1691{
1692 /* ignore uncolored works */
1693 if (color == WORK_NO_COLOR)
1694 return;
1695
1696 cwq->nr_in_flight[color]--;
1697
1698 if (!delayed) {
1699 cwq->nr_active--;
1700 if (!list_empty(&cwq->delayed_works)) {
1701 /* one down, submit a delayed one */
1702 if (cwq->nr_active < cwq->max_active)
1703 cwq_activate_first_delayed(cwq);
1704 }
1705 }
1706
1707 /* is flush in progress and are we at the flushing tip? */
1708 if (likely(cwq->flush_color != color))
1709 return;
1710
1711 /* are there still in-flight works? */
1712 if (cwq->nr_in_flight[color])
1713 return;
1714
1715 /* this cwq is done, clear flush_color */
1716 cwq->flush_color = -1;
1717
1718 /*
1719 * If this was the last cwq, wake up the first flusher. It
1720 * will handle the rest.
1721 */
1722 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1723 complete(&cwq->wq->first_flusher->done);
1724}
1725
1726/**
1727 * process_one_work - process single work
1728 * @worker: self
1729 * @work: work to process
1730 *
1731 * Process @work. This function contains all the logic necessary to
1732 * process a single work including synchronization against and
1733 * interaction with other workers on the same cpu, queueing and
1734 * flushing. As long as context requirement is met, any worker can
1735 * call this function to process a work.
1736 *
1737 * CONTEXT:
1738 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1739 */
1740static void process_one_work(struct worker *worker, struct work_struct *work)
1741__releases(&gcwq->lock)
1742__acquires(&gcwq->lock)
1743{
1744 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1745 struct global_cwq *gcwq = cwq->gcwq;
1746 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1747 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1748 work_func_t f = work->func;
1749 int work_color;
1750 struct worker *collision;
1751#ifdef CONFIG_LOCKDEP
1752 /*
1753 * It is permissible to free the struct work_struct from
1754 * inside the function that is called from it, this we need to
1755 * take into account for lockdep too. To avoid bogus "held
1756 * lock freed" warnings as well as problems when looking into
1757 * work->lockdep_map, make a copy and use that here.
1758 */
1759 struct lockdep_map lockdep_map = work->lockdep_map;
1760#endif
1761 /*
1762 * A single work shouldn't be executed concurrently by
1763 * multiple workers on a single cpu. Check whether anyone is
1764 * already processing the work. If so, defer the work to the
1765 * currently executing one.
1766 */
1767 collision = __find_worker_executing_work(gcwq, bwh, work);
1768 if (unlikely(collision)) {
1769 move_linked_works(work, &collision->scheduled, NULL);
1770 return;
1771 }
1772
1773 /* claim and process */
1774 debug_work_deactivate(work);
1775 hlist_add_head(&worker->hentry, bwh);
1776 worker->current_work = work;
1777 worker->current_cwq = cwq;
1778 work_color = get_work_color(work);
1779
1780 /* record the current cpu number in the work data and dequeue */
1781 set_work_cpu(work, gcwq->cpu);
1782 list_del_init(&work->entry);
1783
1784 /*
1785 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1786 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1787 */
1788 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1789 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1790 struct work_struct, entry);
1791
1792 if (!list_empty(&gcwq->worklist) &&
1793 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1794 wake_up_worker(gcwq);
1795 else
1796 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1797 }
1798
1799 /*
1800 * CPU intensive works don't participate in concurrency
1801 * management. They're the scheduler's responsibility.
1802 */
1803 if (unlikely(cpu_intensive))
1804 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1805
1806 spin_unlock_irq(&gcwq->lock);
1807
1808 work_clear_pending(work);
1809 lock_map_acquire(&cwq->wq->lockdep_map);
1810 lock_map_acquire(&lockdep_map);
1811 trace_workqueue_execute_start(work);
1812 f(work);
1813 /*
1814 * While we must be careful to not use "work" after this, the trace
1815 * point will only record its address.
1816 */
1817 trace_workqueue_execute_end(work);
1818 lock_map_release(&lockdep_map);
1819 lock_map_release(&cwq->wq->lockdep_map);
1820
1821 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1822 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1823 "%s/0x%08x/%d\n",
1824 current->comm, preempt_count(), task_pid_nr(current));
1825 printk(KERN_ERR " last function: ");
1826 print_symbol("%s\n", (unsigned long)f);
1827 debug_show_held_locks(current);
1828 dump_stack();
1829 }
1830
1831 spin_lock_irq(&gcwq->lock);
1832
1833 /* clear cpu intensive status */
1834 if (unlikely(cpu_intensive))
1835 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1836
1837 /* we're done with it, release */
1838 hlist_del_init(&worker->hentry);
1839 worker->current_work = NULL;
1840 worker->current_cwq = NULL;
1841 cwq_dec_nr_in_flight(cwq, work_color, false);
1842}
1843
1844/**
1845 * process_scheduled_works - process scheduled works
1846 * @worker: self
1847 *
1848 * Process all scheduled works. Please note that the scheduled list
1849 * may change while processing a work, so this function repeatedly
1850 * fetches a work from the top and executes it.
1851 *
1852 * CONTEXT:
1853 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1854 * multiple times.
1855 */
1856static void process_scheduled_works(struct worker *worker)
1857{
1858 while (!list_empty(&worker->scheduled)) {
1859 struct work_struct *work = list_first_entry(&worker->scheduled,
1860 struct work_struct, entry);
1861 process_one_work(worker, work);
1862 }
1863}
1864
1865/**
1866 * worker_thread - the worker thread function
1867 * @__worker: self
1868 *
1869 * The gcwq worker thread function. There's a single dynamic pool of
1870 * these per each cpu. These workers process all works regardless of
1871 * their specific target workqueue. The only exception is works which
1872 * belong to workqueues with a rescuer which will be explained in
1873 * rescuer_thread().
1874 */
1875static int worker_thread(void *__worker)
1876{
1877 struct worker *worker = __worker;
1878 struct global_cwq *gcwq = worker->gcwq;
1879
1880 /* tell the scheduler that this is a workqueue worker */
1881 worker->task->flags |= PF_WQ_WORKER;
1882woke_up:
1883 spin_lock_irq(&gcwq->lock);
1884
1885 /* DIE can be set only while we're idle, checking here is enough */
1886 if (worker->flags & WORKER_DIE) {
1887 spin_unlock_irq(&gcwq->lock);
1888 worker->task->flags &= ~PF_WQ_WORKER;
1889 return 0;
1890 }
1891
1892 worker_leave_idle(worker);
1893recheck:
1894 /* no more worker necessary? */
1895 if (!need_more_worker(gcwq))
1896 goto sleep;
1897
1898 /* do we need to manage? */
1899 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
1900 goto recheck;
1901
1902 /*
1903 * ->scheduled list can only be filled while a worker is
1904 * preparing to process a work or actually processing it.
1905 * Make sure nobody diddled with it while I was sleeping.
1906 */
1907 BUG_ON(!list_empty(&worker->scheduled));
1908
1909 /*
1910 * When control reaches this point, we're guaranteed to have
1911 * at least one idle worker or that someone else has already
1912 * assumed the manager role.
1913 */
1914 worker_clr_flags(worker, WORKER_PREP);
1915
1916 do {
1917 struct work_struct *work =
1918 list_first_entry(&gcwq->worklist,
1919 struct work_struct, entry);
1920
1921 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1922 /* optimization path, not strictly necessary */
1923 process_one_work(worker, work);
1924 if (unlikely(!list_empty(&worker->scheduled)))
1925 process_scheduled_works(worker);
1926 } else {
1927 move_linked_works(work, &worker->scheduled, NULL);
1928 process_scheduled_works(worker);
1929 }
1930 } while (keep_working(gcwq));
1931
1932 worker_set_flags(worker, WORKER_PREP, false);
1933sleep:
1934 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
1935 goto recheck;
1936
1937 /*
1938 * gcwq->lock is held and there's no work to process and no
1939 * need to manage, sleep. Workers are woken up only while
1940 * holding gcwq->lock or from local cpu, so setting the
1941 * current state before releasing gcwq->lock is enough to
1942 * prevent losing any event.
1943 */
1944 worker_enter_idle(worker);
1945 __set_current_state(TASK_INTERRUPTIBLE);
1946 spin_unlock_irq(&gcwq->lock);
1947 schedule();
1948 goto woke_up;
1949}
1950
1951/**
1952 * rescuer_thread - the rescuer thread function
1953 * @__wq: the associated workqueue
1954 *
1955 * Workqueue rescuer thread function. There's one rescuer for each
1956 * workqueue which has WQ_RESCUER set.
1957 *
1958 * Regular work processing on a gcwq may block trying to create a new
1959 * worker, which uses a GFP_KERNEL allocation that has a slight chance of
1960 * developing into a deadlock if some works currently on the same queue
1961 * need to be processed to satisfy the GFP_KERNEL allocation. This is
1962 * the problem the rescuer solves.
1963 *
1964 * When such a condition is possible, the gcwq summons the rescuers of all
1965 * workqueues which have works queued on the gcwq and lets them process
1966 * those works so that forward progress can be guaranteed.
1967 *
1968 * This should happen rarely.
1969 */
1970static int rescuer_thread(void *__wq)
1971{
1972 struct workqueue_struct *wq = __wq;
1973 struct worker *rescuer = wq->rescuer;
1974 struct list_head *scheduled = &rescuer->scheduled;
1975 bool is_unbound = wq->flags & WQ_UNBOUND;
1976 unsigned int cpu;
1977
1978 set_user_nice(current, RESCUER_NICE_LEVEL);
1979repeat:
1980 set_current_state(TASK_INTERRUPTIBLE);
1981
1982 if (kthread_should_stop())
1983 return 0;
1984
1985 /*
1986 * See whether any cpu is asking for help. Unbound
1987 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
1988 */
1989 for_each_mayday_cpu(cpu, wq->mayday_mask) {
1990 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
1991 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
1992 struct global_cwq *gcwq = cwq->gcwq;
1993 struct work_struct *work, *n;
1994
1995 __set_current_state(TASK_RUNNING);
1996 mayday_clear_cpu(cpu, wq->mayday_mask);
1997
1998 /* migrate to the target cpu if possible */
1999 rescuer->gcwq = gcwq;
2000 worker_maybe_bind_and_lock(rescuer);
2001
2002 /*
2003 * Slurp in all works issued via this workqueue and
2004 * process'em.
2005 */
2006 BUG_ON(!list_empty(&rescuer->scheduled));
2007 list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2008 if (get_work_cwq(work) == cwq)
2009 move_linked_works(work, scheduled, &n);
2010
2011 process_scheduled_works(rescuer);
2012 spin_unlock_irq(&gcwq->lock);
2013 }
2014
2015 schedule();
2016 goto repeat;
459} 2017}
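A brief usage sketch (illustrative only, not part of this patch): a hypothetical driver whose work items sit in the memory-reclaim path would create its workqueue with WQ_MEM_RECLAIM, which this patch maps to WQ_RESCUER, so the rescuer described above exists. All foo_* names below are invented.

/* Sketch: a reclaim-safe workqueue for a hypothetical block driver. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct foo_dev {
        struct workqueue_struct *wb_wq; /* gets a rescuer via WQ_MEM_RECLAIM */
        struct work_struct wb_work;
};

static void foo_writeback_fn(struct work_struct *work)
{
        struct foo_dev *dev = container_of(work, struct foo_dev, wb_work);

        /* ... write back dirty data for @dev ... */
        (void)dev;
}

static int foo_dev_init(struct foo_dev *dev)
{
        /*
         * With WQ_MEM_RECLAIM a rescuer thread is guaranteed, so works
         * queued below can make progress even when new workers can't be
         * created under memory pressure.
         */
        dev->wb_wq = alloc_workqueue("foo_writeback", WQ_MEM_RECLAIM, 1);
        if (!dev->wb_wq)
                return -ENOMEM;
        INIT_WORK(&dev->wb_work, foo_writeback_fn);
        return 0;
}

Works queued on wb_wq are exactly the ones the loop above would move onto the rescuer's scheduled list if the mayday path fires.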
460 2018
461struct wq_barrier { 2019struct wq_barrier {
@@ -469,44 +2027,137 @@ static void wq_barrier_func(struct work_struct *work)
469 complete(&barr->done); 2027 complete(&barr->done);
470} 2028}
471 2029
2030/**
2031 * insert_wq_barrier - insert a barrier work
2032 * @cwq: cwq to insert barrier into
2033 * @barr: wq_barrier to insert
2034 * @target: target work to attach @barr to
2035 * @worker: worker currently executing @target, NULL if @target is not executing
2036 *
2037 * @barr is linked to @target such that @barr is completed only after
2038 * @target finishes execution. Please note that the ordering
2039 * guarantee is observed only with respect to @target and on the local
2040 * cpu.
2041 *
2042 * Currently, a queued barrier can't be canceled. This is because
2043 * try_to_grab_pending() can't determine whether the work to be
2044 * grabbed is at the head of the queue and thus can't clear the LINKED
2045 * flag of the previous work, while there must be a valid next work
2046 * after a work with the LINKED flag set.
2047 *
2048 * Note that when @worker is non-NULL, @target may be modified
2049 * underneath us, so we can't reliably determine cwq from @target.
2050 *
2051 * CONTEXT:
2052 * spin_lock_irq(gcwq->lock).
2053 */
472static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2054static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
473 struct wq_barrier *barr, struct list_head *head) 2055 struct wq_barrier *barr,
2056 struct work_struct *target, struct worker *worker)
474{ 2057{
2058 struct list_head *head;
2059 unsigned int linked = 0;
2060
475 /* 2061 /*
476 * debugobject calls are safe here even with cwq->lock locked 2062 * debugobject calls are safe here even with gcwq->lock locked
477 * as we know for sure that this will not trigger any of the 2063 * as we know for sure that this will not trigger any of the
478 * checks and call back into the fixup functions where we 2064 * checks and call back into the fixup functions where we
479 * might deadlock. 2065 * might deadlock.
480 */ 2066 */
481 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2067 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
482 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 2068 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
483
484 init_completion(&barr->done); 2069 init_completion(&barr->done);
485 2070
2071 /*
2072 * If @target is currently being executed, schedule the
2073 * barrier to the worker; otherwise, put it after @target.
2074 */
2075 if (worker)
2076 head = worker->scheduled.next;
2077 else {
2078 unsigned long *bits = work_data_bits(target);
2079
2080 head = target->entry.next;
2081 /* there can already be other linked works, inherit and set */
2082 linked = *bits & WORK_STRUCT_LINKED;
2083 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2084 }
2085
486 debug_work_activate(&barr->work); 2086 debug_work_activate(&barr->work);
487 insert_work(cwq, &barr->work, head); 2087 insert_work(cwq, &barr->work, head,
2088 work_color_to_flags(WORK_NO_COLOR) | linked);
488} 2089}
489 2090
490static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 2091/**
2092 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2093 * @wq: workqueue being flushed
2094 * @flush_color: new flush color, < 0 for no-op
2095 * @work_color: new work color, < 0 for no-op
2096 *
2097 * Prepare cwqs for workqueue flushing.
2098 *
2099 * If @flush_color is non-negative, flush_color on all cwqs should be
2100 * -1. If no cwq has in-flight commands at the specified color, all
2101 * cwq->flush_color's stay at -1 and %false is returned. If any cwq
2102 * has in flight commands, its cwq->flush_color is set to
2103 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2104 * wakeup logic is armed and %true is returned.
2105 *
2106 * The caller should have initialized @wq->first_flusher prior to
2107 * calling this function with non-negative @flush_color. If
2108 * @flush_color is negative, no flush color update is done and %false
2109 * is returned.
2110 *
2111 * If @work_color is non-negative, all cwqs should have the same
2112 * work_color which is previous to @work_color and all will be
2113 * advanced to @work_color.
2114 *
2115 * CONTEXT:
2116 * mutex_lock(wq->flush_mutex).
2117 *
2118 * RETURNS:
2119 * %true if @flush_color >= 0 and there's something to flush. %false
2120 * otherwise.
2121 */
2122static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2123 int flush_color, int work_color)
491{ 2124{
492 int active = 0; 2125 bool wait = false;
493 struct wq_barrier barr; 2126 unsigned int cpu;
494
495 WARN_ON(cwq->thread == current);
496 2127
497 spin_lock_irq(&cwq->lock); 2128 if (flush_color >= 0) {
498 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { 2129 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
499 insert_wq_barrier(cwq, &barr, &cwq->worklist); 2130 atomic_set(&wq->nr_cwqs_to_flush, 1);
500 active = 1;
501 } 2131 }
502 spin_unlock_irq(&cwq->lock);
503 2132
504 if (active) { 2133 for_each_cwq_cpu(cpu, wq) {
505 wait_for_completion(&barr.done); 2134 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
506 destroy_work_on_stack(&barr.work); 2135 struct global_cwq *gcwq = cwq->gcwq;
2136
2137 spin_lock_irq(&gcwq->lock);
2138
2139 if (flush_color >= 0) {
2140 BUG_ON(cwq->flush_color != -1);
2141
2142 if (cwq->nr_in_flight[flush_color]) {
2143 cwq->flush_color = flush_color;
2144 atomic_inc(&wq->nr_cwqs_to_flush);
2145 wait = true;
2146 }
2147 }
2148
2149 if (work_color >= 0) {
2150 BUG_ON(work_color != work_next_color(cwq->work_color));
2151 cwq->work_color = work_color;
2152 }
2153
2154 spin_unlock_irq(&gcwq->lock);
507 } 2155 }
508 2156
509 return active; 2157 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2158 complete(&wq->first_flusher->done);
2159
2160 return wait;
510} 2161}
511 2162
512/** 2163/**
@@ -518,158 +2169,340 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
518 * 2169 *
519 * We sleep until all works which were queued on entry have been handled, 2170 * We sleep until all works which were queued on entry have been handled,
520 * but we are not livelocked by new incoming ones. 2171 * but we are not livelocked by new incoming ones.
521 *
522 * This function used to run the workqueues itself. Now we just wait for the
523 * helper threads to do it.
524 */ 2172 */
525void flush_workqueue(struct workqueue_struct *wq) 2173void flush_workqueue(struct workqueue_struct *wq)
526{ 2174{
527 const struct cpumask *cpu_map = wq_cpu_map(wq); 2175 struct wq_flusher this_flusher = {
528 int cpu; 2176 .list = LIST_HEAD_INIT(this_flusher.list),
2177 .flush_color = -1,
2178 .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2179 };
2180 int next_color;
529 2181
530 might_sleep();
531 lock_map_acquire(&wq->lockdep_map); 2182 lock_map_acquire(&wq->lockdep_map);
532 lock_map_release(&wq->lockdep_map); 2183 lock_map_release(&wq->lockdep_map);
533 for_each_cpu(cpu, cpu_map) 2184
534 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 2185 mutex_lock(&wq->flush_mutex);
2186
2187 /*
2188 * Start-to-wait phase
2189 */
2190 next_color = work_next_color(wq->work_color);
2191
2192 if (next_color != wq->flush_color) {
2193 /*
2194 * Color space is not full. The current work_color
2195 * becomes our flush_color and work_color is advanced
2196 * by one.
2197 */
2198 BUG_ON(!list_empty(&wq->flusher_overflow));
2199 this_flusher.flush_color = wq->work_color;
2200 wq->work_color = next_color;
2201
2202 if (!wq->first_flusher) {
2203 /* no flush in progress, become the first flusher */
2204 BUG_ON(wq->flush_color != this_flusher.flush_color);
2205
2206 wq->first_flusher = &this_flusher;
2207
2208 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2209 wq->work_color)) {
2210 /* nothing to flush, done */
2211 wq->flush_color = next_color;
2212 wq->first_flusher = NULL;
2213 goto out_unlock;
2214 }
2215 } else {
2216 /* wait in queue */
2217 BUG_ON(wq->flush_color == this_flusher.flush_color);
2218 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2219 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2220 }
2221 } else {
2222 /*
2223 * Oops, color space is full, wait on overflow queue.
2224 * The next flush completion will assign us
2225 * flush_color and transfer to flusher_queue.
2226 */
2227 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2228 }
2229
2230 mutex_unlock(&wq->flush_mutex);
2231
2232 wait_for_completion(&this_flusher.done);
2233
2234 /*
2235 * Wake-up-and-cascade phase
2236 *
2237 * First flushers are responsible for cascading flushes and
2238 * handling overflow. Non-first flushers can simply return.
2239 */
2240 if (wq->first_flusher != &this_flusher)
2241 return;
2242
2243 mutex_lock(&wq->flush_mutex);
2244
2245 /* we might have raced, check again with mutex held */
2246 if (wq->first_flusher != &this_flusher)
2247 goto out_unlock;
2248
2249 wq->first_flusher = NULL;
2250
2251 BUG_ON(!list_empty(&this_flusher.list));
2252 BUG_ON(wq->flush_color != this_flusher.flush_color);
2253
2254 while (true) {
2255 struct wq_flusher *next, *tmp;
2256
2257 /* complete all the flushers sharing the current flush color */
2258 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2259 if (next->flush_color != wq->flush_color)
2260 break;
2261 list_del_init(&next->list);
2262 complete(&next->done);
2263 }
2264
2265 BUG_ON(!list_empty(&wq->flusher_overflow) &&
2266 wq->flush_color != work_next_color(wq->work_color));
2267
2268 /* this flush_color is finished, advance by one */
2269 wq->flush_color = work_next_color(wq->flush_color);
2270
2271 /* one color has been freed, handle overflow queue */
2272 if (!list_empty(&wq->flusher_overflow)) {
2273 /*
2274 * Assign the same color to all overflowed
2275 * flushers, advance work_color and append to
2276 * flusher_queue. This is the start-to-wait
2277 * phase for these overflowed flushers.
2278 */
2279 list_for_each_entry(tmp, &wq->flusher_overflow, list)
2280 tmp->flush_color = wq->work_color;
2281
2282 wq->work_color = work_next_color(wq->work_color);
2283
2284 list_splice_tail_init(&wq->flusher_overflow,
2285 &wq->flusher_queue);
2286 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2287 }
2288
2289 if (list_empty(&wq->flusher_queue)) {
2290 BUG_ON(wq->flush_color != wq->work_color);
2291 break;
2292 }
2293
2294 /*
2295 * Need to flush more colors. Make the next flusher
2296 * the new first flusher and arm cwqs.
2297 */
2298 BUG_ON(wq->flush_color == wq->work_color);
2299 BUG_ON(wq->flush_color != next->flush_color);
2300
2301 list_del_init(&next->list);
2302 wq->first_flusher = next;
2303
2304 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2305 break;
2306
2307 /*
2308 * Meh... this color is already done, clear first
2309 * flusher and repeat cascading.
2310 */
2311 wq->first_flusher = NULL;
2312 }
2313
2314out_unlock:
2315 mutex_unlock(&wq->flush_mutex);
535} 2316}
536EXPORT_SYMBOL_GPL(flush_workqueue); 2317EXPORT_SYMBOL_GPL(flush_workqueue);
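A minimal sketch of the caller side (hypothetical foo_wq, assumed to have been created elsewhere): every work queued before the call belongs to the flush color being drained, so it has finished by the time flush_workqueue() returns.

#include <linux/workqueue.h>

static struct workqueue_struct *foo_wq;     /* assumed: created elsewhere */

static void foo_shutdown(void)
{
        /*
         * Waits for all works queued on foo_wq up to this point; works
         * queued afterwards are not waited for, so there is no livelock.
         */
        flush_workqueue(foo_wq);
        /* now safe to release data those works may have referenced */
}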
537 2318
538/** 2319static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
539 * flush_work - block until a work_struct's callback has terminated 2320 bool wait_executing)
540 * @work: the work which is to be flushed
541 *
542 * Returns false if @work has already terminated.
543 *
544 * It is expected that, prior to calling flush_work(), the caller has
545 * arranged for the work to not be requeued, otherwise it doesn't make
546 * sense to use this function.
547 */
548int flush_work(struct work_struct *work)
549{ 2321{
2322 struct worker *worker = NULL;
2323 struct global_cwq *gcwq;
550 struct cpu_workqueue_struct *cwq; 2324 struct cpu_workqueue_struct *cwq;
551 struct list_head *prev;
552 struct wq_barrier barr;
553 2325
554 might_sleep(); 2326 might_sleep();
555 cwq = get_wq_data(work); 2327 gcwq = get_work_gcwq(work);
556 if (!cwq) 2328 if (!gcwq)
557 return 0; 2329 return false;
558
559 lock_map_acquire(&cwq->wq->lockdep_map);
560 lock_map_release(&cwq->wq->lockdep_map);
561 2330
562 prev = NULL; 2331 spin_lock_irq(&gcwq->lock);
563 spin_lock_irq(&cwq->lock);
564 if (!list_empty(&work->entry)) { 2332 if (!list_empty(&work->entry)) {
565 /* 2333 /*
566 * See the comment near try_to_grab_pending()->smp_rmb(). 2334 * See the comment near try_to_grab_pending()->smp_rmb().
567 * If it was re-queued under us we are not going to wait. 2335 * If it was re-queued to a different gcwq under us, we
2336 * are not going to wait.
568 */ 2337 */
569 smp_rmb(); 2338 smp_rmb();
570 if (unlikely(cwq != get_wq_data(work))) 2339 cwq = get_work_cwq(work);
571 goto out; 2340 if (unlikely(!cwq || gcwq != cwq->gcwq))
572 prev = &work->entry; 2341 goto already_gone;
573 } else { 2342 } else if (wait_executing) {
574 if (cwq->current_work != work) 2343 worker = find_worker_executing_work(gcwq, work);
575 goto out; 2344 if (!worker)
576 prev = &cwq->worklist; 2345 goto already_gone;
577 } 2346 cwq = worker->current_cwq;
578 insert_wq_barrier(cwq, &barr, prev->next); 2347 } else
579out: 2348 goto already_gone;
580 spin_unlock_irq(&cwq->lock); 2349
581 if (!prev) 2350 insert_wq_barrier(cwq, barr, work, worker);
582 return 0; 2351 spin_unlock_irq(&gcwq->lock);
583 2352
584 wait_for_completion(&barr.done); 2353 lock_map_acquire(&cwq->wq->lockdep_map);
585 destroy_work_on_stack(&barr.work); 2354 lock_map_release(&cwq->wq->lockdep_map);
586 return 1; 2355 return true;
2356already_gone:
2357 spin_unlock_irq(&gcwq->lock);
2358 return false;
2359}
2360
2361/**
2362 * flush_work - wait for a work to finish executing the last queueing instance
2363 * @work: the work to flush
2364 *
2365 * Wait until @work has finished execution. This function considers
2366 * only the last queueing instance of @work. If @work has been
2367 * enqueued across different CPUs on a non-reentrant workqueue or on
2368 * multiple workqueues, @work might still be executing on return on
2369 * some of the CPUs from earlier queueing.
2370 *
2371 * If @work was queued only on a non-reentrant, ordered or unbound
2372 * workqueue, @work is guaranteed to be idle on return if it hasn't
2373 * been requeued since flush started.
2374 *
2375 * RETURNS:
2376 * %true if flush_work() waited for the work to finish execution,
2377 * %false if it was already idle.
2378 */
2379bool flush_work(struct work_struct *work)
2380{
2381 struct wq_barrier barr;
2382
2383 if (start_flush_work(work, &barr, true)) {
2384 wait_for_completion(&barr.done);
2385 destroy_work_on_stack(&barr.work);
2386 return true;
2387 } else
2388 return false;
587} 2389}
588EXPORT_SYMBOL_GPL(flush_work); 2390EXPORT_SYMBOL_GPL(flush_work);
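A usage sketch under the semantics documented above, with an invented foo_update work item; only the last queueing instance is waited for.

#include <linux/workqueue.h>

static void foo_update_fn(struct work_struct *work)
{
        /* ... recompute cached state ... */
}
static DECLARE_WORK(foo_update, foo_update_fn);

static void foo_wait_for_update(void)
{
        schedule_work(&foo_update);     /* no-op if already pending */
        flush_work(&foo_update);        /* wait for that queueing instance */
}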
589 2391
2392static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2393{
2394 struct wq_barrier barr;
2395 struct worker *worker;
2396
2397 spin_lock_irq(&gcwq->lock);
2398
2399 worker = find_worker_executing_work(gcwq, work);
2400 if (unlikely(worker))
2401 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2402
2403 spin_unlock_irq(&gcwq->lock);
2404
2405 if (unlikely(worker)) {
2406 wait_for_completion(&barr.done);
2407 destroy_work_on_stack(&barr.work);
2408 return true;
2409 } else
2410 return false;
2411}
2412
2413static bool wait_on_work(struct work_struct *work)
2414{
2415 bool ret = false;
2416 int cpu;
2417
2418 might_sleep();
2419
2420 lock_map_acquire(&work->lockdep_map);
2421 lock_map_release(&work->lockdep_map);
2422
2423 for_each_gcwq_cpu(cpu)
2424 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2425 return ret;
2426}
2427
2428/**
2429 * flush_work_sync - wait until a work has finished execution
2430 * @work: the work to flush
2431 *
2432 * Wait until @work has finished execution. On return, it's
2433 * guaranteed that all queueing instances of @work which happened
2434 * before this function is called are finished. In other words, if
2435 * @work hasn't been requeued since this function was called, @work is
2436 * guaranteed to be idle on return.
2437 *
2438 * RETURNS:
2439 * %true if flush_work_sync() waited for the work to finish execution,
2440 * %false if it was already idle.
2441 */
2442bool flush_work_sync(struct work_struct *work)
2443{
2444 struct wq_barrier barr;
2445 bool pending, waited;
2446
2447 /* we'll wait for executions separately, queue barr only if pending */
2448 pending = start_flush_work(work, &barr, false);
2449
2450 /* wait for executions to finish */
2451 waited = wait_on_work(work);
2452
2453 /* wait for the pending one */
2454 if (pending) {
2455 wait_for_completion(&barr.done);
2456 destroy_work_on_stack(&barr.work);
2457 }
2458
2459 return pending || waited;
2460}
2461EXPORT_SYMBOL_GPL(flush_work_sync);
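By contrast, a hedged fragment using the flush_work_sync() added here: it also waits for executions started by earlier queueings on other CPUs, at the cost of scanning every gcwq via wait_on_work().

#include <linux/workqueue.h>

/* Stronger guarantee when @work may have been queued on several CPUs. */
static bool foo_quiesce(struct work_struct *work)
{
        /* waits for the pending instance and any executing instances */
        return flush_work_sync(work);
}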
2462
590/* 2463/*
591 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, 2464 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
592 * so this work can't be re-armed in any way. 2465 * so this work can't be re-armed in any way.
593 */ 2466 */
594static int try_to_grab_pending(struct work_struct *work) 2467static int try_to_grab_pending(struct work_struct *work)
595{ 2468{
596 struct cpu_workqueue_struct *cwq; 2469 struct global_cwq *gcwq;
597 int ret = -1; 2470 int ret = -1;
598 2471
599 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) 2472 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
600 return 0; 2473 return 0;
601 2474
602 /* 2475 /*
603 * The queueing is in progress, or it is already queued. Try to 2476 * The queueing is in progress, or it is already queued. Try to
604 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 2477 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
605 */ 2478 */
606 2479 gcwq = get_work_gcwq(work);
607 cwq = get_wq_data(work); 2480 if (!gcwq)
608 if (!cwq)
609 return ret; 2481 return ret;
610 2482
611 spin_lock_irq(&cwq->lock); 2483 spin_lock_irq(&gcwq->lock);
612 if (!list_empty(&work->entry)) { 2484 if (!list_empty(&work->entry)) {
613 /* 2485 /*
614 * This work is queued, but perhaps we locked the wrong cwq. 2486 * This work is queued, but perhaps we locked the wrong gcwq.
615 * In that case we must see the new value after rmb(), see 2487 * In that case we must see the new value after rmb(), see
616 * insert_work()->wmb(). 2488 * insert_work()->wmb().
617 */ 2489 */
618 smp_rmb(); 2490 smp_rmb();
619 if (cwq == get_wq_data(work)) { 2491 if (gcwq == get_work_gcwq(work)) {
620 debug_work_deactivate(work); 2492 debug_work_deactivate(work);
621 list_del_init(&work->entry); 2493 list_del_init(&work->entry);
2494 cwq_dec_nr_in_flight(get_work_cwq(work),
2495 get_work_color(work),
2496 *work_data_bits(work) & WORK_STRUCT_DELAYED);
622 ret = 1; 2497 ret = 1;
623 } 2498 }
624 } 2499 }
625 spin_unlock_irq(&cwq->lock); 2500 spin_unlock_irq(&gcwq->lock);
626 2501
627 return ret; 2502 return ret;
628} 2503}
629 2504
630static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, 2505static bool __cancel_work_timer(struct work_struct *work,
631 struct work_struct *work)
632{
633 struct wq_barrier barr;
634 int running = 0;
635
636 spin_lock_irq(&cwq->lock);
637 if (unlikely(cwq->current_work == work)) {
638 insert_wq_barrier(cwq, &barr, cwq->worklist.next);
639 running = 1;
640 }
641 spin_unlock_irq(&cwq->lock);
642
643 if (unlikely(running)) {
644 wait_for_completion(&barr.done);
645 destroy_work_on_stack(&barr.work);
646 }
647}
648
649static void wait_on_work(struct work_struct *work)
650{
651 struct cpu_workqueue_struct *cwq;
652 struct workqueue_struct *wq;
653 const struct cpumask *cpu_map;
654 int cpu;
655
656 might_sleep();
657
658 lock_map_acquire(&work->lockdep_map);
659 lock_map_release(&work->lockdep_map);
660
661 cwq = get_wq_data(work);
662 if (!cwq)
663 return;
664
665 wq = cwq->wq;
666 cpu_map = wq_cpu_map(wq);
667
668 for_each_cpu(cpu, cpu_map)
669 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
670}
671
672static int __cancel_work_timer(struct work_struct *work,
673 struct timer_list* timer) 2506 struct timer_list* timer)
674{ 2507{
675 int ret; 2508 int ret;
@@ -681,54 +2514,91 @@ static int __cancel_work_timer(struct work_struct *work,
681 wait_on_work(work); 2514 wait_on_work(work);
682 } while (unlikely(ret < 0)); 2515 } while (unlikely(ret < 0));
683 2516
684 clear_wq_data(work); 2517 clear_work_data(work);
685 return ret; 2518 return ret;
686} 2519}
687 2520
688/** 2521/**
689 * cancel_work_sync - block until a work_struct's callback has terminated 2522 * cancel_work_sync - cancel a work and wait for it to finish
690 * @work: the work which is to be flushed 2523 * @work: the work to cancel
691 *
692 * Returns true if @work was pending.
693 * 2524 *
694 * cancel_work_sync() will cancel the work if it is queued. If the work's 2525 * Cancel @work and wait for its execution to finish. This function
695 * callback appears to be running, cancel_work_sync() will block until it 2526 * can be used even if the work re-queues itself or migrates to
696 * has completed. 2527 * another workqueue. On return from this function, @work is
2528 * guaranteed to be not pending or executing on any CPU.
697 * 2529 *
698 * It is possible to use this function if the work re-queues itself. It can 2530 * cancel_work_sync(&delayed_work->work) must not be used for
699 * cancel the work even if it migrates to another workqueue, however in that 2531 * delayed_work's. Use cancel_delayed_work_sync() instead.
700 * case it only guarantees that work->func() has completed on the last queued
701 * workqueue.
702 *
703 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
704 * pending, otherwise it goes into a busy-wait loop until the timer expires.
705 * 2532 *
706 * The caller must ensure that workqueue_struct on which this work was last 2533 * The caller must ensure that the workqueue on which @work was last
707 * queued can't be destroyed before this function returns. 2534 * queued can't be destroyed before this function returns.
2535 *
2536 * RETURNS:
2537 * %true if @work was pending, %false otherwise.
708 */ 2538 */
709int cancel_work_sync(struct work_struct *work) 2539bool cancel_work_sync(struct work_struct *work)
710{ 2540{
711 return __cancel_work_timer(work, NULL); 2541 return __cancel_work_timer(work, NULL);
712} 2542}
713EXPORT_SYMBOL_GPL(cancel_work_sync); 2543EXPORT_SYMBOL_GPL(cancel_work_sync);
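A sketch of the canonical teardown pattern for cancel_work_sync(), with invented foo_* names: the work may re-queue itself, and the call still guarantees it is neither pending nor running afterwards.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static void foo_poll_fn(struct work_struct *work)
{
        /* ... poll hardware, possibly schedule_work(work) again ... */
}
static DECLARE_WORK(foo_poll, foo_poll_fn);

static int __init foo_init(void)
{
        schedule_work(&foo_poll);
        return 0;
}
module_init(foo_init);

static void __exit foo_exit(void)
{
        /* safe even though foo_poll_fn may re-queue itself */
        cancel_work_sync(&foo_poll);
}
module_exit(foo_exit);

MODULE_LICENSE("GPL");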
714 2544
715/** 2545/**
716 * cancel_delayed_work_sync - reliably kill off a delayed work. 2546 * flush_delayed_work - wait for a dwork to finish executing the last queueing
717 * @dwork: the delayed work struct 2547 * @dwork: the delayed work to flush
718 * 2548 *
719 * Returns true if @dwork was pending. 2549 * Delayed timer is cancelled and the pending work is queued for
2550 * immediate execution. Like flush_work(), this function only
2551 * considers the last queueing instance of @dwork.
720 * 2552 *
721 * It is possible to use this function if @dwork rearms itself via queue_work() 2553 * RETURNS:
722 * or queue_delayed_work(). See also the comment for cancel_work_sync(). 2554 * %true if flush_work() waited for the work to finish execution,
2555 * %false if it was already idle.
723 */ 2556 */
724int cancel_delayed_work_sync(struct delayed_work *dwork) 2557bool flush_delayed_work(struct delayed_work *dwork)
2558{
2559 if (del_timer_sync(&dwork->timer))
2560 __queue_work(raw_smp_processor_id(),
2561 get_work_cwq(&dwork->work)->wq, &dwork->work);
2562 return flush_work(&dwork->work);
2563}
2564EXPORT_SYMBOL(flush_delayed_work);
2565
2566/**
2567 * flush_delayed_work_sync - wait for a dwork to finish
2568 * @dwork: the delayed work to flush
2569 *
2570 * Delayed timer is cancelled and the pending work is queued for
2571 * execution immediately. Other than timer handling, its behavior
2572 * is identical to flush_work_sync().
2573 *
2574 * RETURNS:
2575 * %true if flush_work_sync() waited for the work to finish execution,
2576 * %false if it was already idle.
2577 */
2578bool flush_delayed_work_sync(struct delayed_work *dwork)
2579{
2580 if (del_timer_sync(&dwork->timer))
2581 __queue_work(raw_smp_processor_id(),
2582 get_work_cwq(&dwork->work)->wq, &dwork->work);
2583 return flush_work_sync(&dwork->work);
2584}
2585EXPORT_SYMBOL(flush_delayed_work_sync);
2586
2587/**
2588 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
2589 * @dwork: the delayed work to cancel
2590 *
2591 * This is cancel_work_sync() for delayed works.
2592 *
2593 * RETURNS:
2594 * %true if @dwork was pending, %false otherwise.
2595 */
2596bool cancel_delayed_work_sync(struct delayed_work *dwork)
725{ 2597{
726 return __cancel_work_timer(&dwork->work, &dwork->timer); 2598 return __cancel_work_timer(&dwork->work, &dwork->timer);
727} 2599}
728EXPORT_SYMBOL(cancel_delayed_work_sync); 2600EXPORT_SYMBOL(cancel_delayed_work_sync);
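A sketch tying the delayed-work helpers above together, using an invented foo_dwork: flush_delayed_work() pulls the timer forward and waits for that instance, while cancel_delayed_work_sync() kills both the timer and any execution.

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void foo_refresh_fn(struct work_struct *work)
{
        /* ... refresh a cache ... */
}
static DECLARE_DELAYED_WORK(foo_dwork, foo_refresh_fn);

static void foo_start(void)
{
        schedule_delayed_work(&foo_dwork, msecs_to_jiffies(500));
}

static void foo_force_refresh(void)
{
        /* cancel the timer, queue the work now, wait for it */
        flush_delayed_work(&foo_dwork);
}

static void foo_stop(void)
{
        /* neither the timer nor the work survives this call */
        cancel_delayed_work_sync(&foo_dwork);
}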
729 2601
730static struct workqueue_struct *keventd_wq __read_mostly;
731
732/** 2602/**
733 * schedule_work - put work task in global workqueue 2603 * schedule_work - put work task in global workqueue
734 * @work: job to be done 2604 * @work: job to be done
@@ -742,7 +2612,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
742 */ 2612 */
743int schedule_work(struct work_struct *work) 2613int schedule_work(struct work_struct *work)
744{ 2614{
745 return queue_work(keventd_wq, work); 2615 return queue_work(system_wq, work);
746} 2616}
747EXPORT_SYMBOL(schedule_work); 2617EXPORT_SYMBOL(schedule_work);
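Illustrative fragment (invented foo_work): with keventd_wq gone, schedule_work() is simply queue_work() on system_wq.

#include <linux/workqueue.h>

static void foo_fn(struct work_struct *work)
{
        /* ... */
}
static DECLARE_WORK(foo_work, foo_fn);

static void foo_kick(void)
{
        /* equivalent to queue_work(system_wq, &foo_work) after this patch */
        schedule_work(&foo_work);
}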
748 2618
@@ -755,7 +2625,7 @@ EXPORT_SYMBOL(schedule_work);
755 */ 2625 */
756int schedule_work_on(int cpu, struct work_struct *work) 2626int schedule_work_on(int cpu, struct work_struct *work)
757{ 2627{
758 return queue_work_on(cpu, keventd_wq, work); 2628 return queue_work_on(cpu, system_wq, work);
759} 2629}
760EXPORT_SYMBOL(schedule_work_on); 2630EXPORT_SYMBOL(schedule_work_on);
761 2631
@@ -770,29 +2640,11 @@ EXPORT_SYMBOL(schedule_work_on);
770int schedule_delayed_work(struct delayed_work *dwork, 2640int schedule_delayed_work(struct delayed_work *dwork,
771 unsigned long delay) 2641 unsigned long delay)
772{ 2642{
773 return queue_delayed_work(keventd_wq, dwork, delay); 2643 return queue_delayed_work(system_wq, dwork, delay);
774} 2644}
775EXPORT_SYMBOL(schedule_delayed_work); 2645EXPORT_SYMBOL(schedule_delayed_work);
776 2646
777/** 2647/**
778 * flush_delayed_work - block until a dwork_struct's callback has terminated
779 * @dwork: the delayed work which is to be flushed
780 *
781 * Any timeout is cancelled, and any pending work is run immediately.
782 */
783void flush_delayed_work(struct delayed_work *dwork)
784{
785 if (del_timer_sync(&dwork->timer)) {
786 struct cpu_workqueue_struct *cwq;
787 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
788 __queue_work(cwq, &dwork->work);
789 put_cpu();
790 }
791 flush_work(&dwork->work);
792}
793EXPORT_SYMBOL(flush_delayed_work);
794
795/**
796 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 2648 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
797 * @cpu: cpu to use 2649 * @cpu: cpu to use
798 * @dwork: job to be done 2650 * @dwork: job to be done
@@ -804,24 +2656,25 @@ EXPORT_SYMBOL(flush_delayed_work);
804int schedule_delayed_work_on(int cpu, 2656int schedule_delayed_work_on(int cpu,
805 struct delayed_work *dwork, unsigned long delay) 2657 struct delayed_work *dwork, unsigned long delay)
806{ 2658{
807 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); 2659 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
808} 2660}
809EXPORT_SYMBOL(schedule_delayed_work_on); 2661EXPORT_SYMBOL(schedule_delayed_work_on);
810 2662
811/** 2663/**
812 * schedule_on_each_cpu - call a function on each online CPU from keventd 2664 * schedule_on_each_cpu - execute a function synchronously on each online CPU
813 * @func: the function to call 2665 * @func: the function to call
814 * 2666 *
815 * Returns zero on success. 2667 * schedule_on_each_cpu() executes @func on each online CPU using the
816 * Returns -ve errno on failure. 2668 * system workqueue and blocks until all CPUs have completed.
817 *
818 * schedule_on_each_cpu() is very slow. 2669 * schedule_on_each_cpu() is very slow.
2670 *
2671 * RETURNS:
2672 * 0 on success, -errno on failure.
819 */ 2673 */
820int schedule_on_each_cpu(work_func_t func) 2674int schedule_on_each_cpu(work_func_t func)
821{ 2675{
822 int cpu; 2676 int cpu;
823 int orig = -1; 2677 struct work_struct __percpu *works;
824 struct work_struct *works;
825 2678
826 works = alloc_percpu(struct work_struct); 2679 works = alloc_percpu(struct work_struct);
827 if (!works) 2680 if (!works)
@@ -829,23 +2682,12 @@ int schedule_on_each_cpu(work_func_t func)
829 2682
830 get_online_cpus(); 2683 get_online_cpus();
831 2684
832 /*
833 * When running in keventd don't schedule a work item on
834 * itself. Can just call directly because the work queue is
835 * already bound. This also is faster.
836 */
837 if (current_is_keventd())
838 orig = raw_smp_processor_id();
839
840 for_each_online_cpu(cpu) { 2685 for_each_online_cpu(cpu) {
841 struct work_struct *work = per_cpu_ptr(works, cpu); 2686 struct work_struct *work = per_cpu_ptr(works, cpu);
842 2687
843 INIT_WORK(work, func); 2688 INIT_WORK(work, func);
844 if (cpu != orig) 2689 schedule_work_on(cpu, work);
845 schedule_work_on(cpu, work);
846 } 2690 }
847 if (orig >= 0)
848 func(per_cpu_ptr(works, orig));
849 2691
850 for_each_online_cpu(cpu) 2692 for_each_online_cpu(cpu)
851 flush_work(per_cpu_ptr(works, cpu)); 2693 flush_work(per_cpu_ptr(works, cpu));
@@ -881,7 +2723,7 @@ int schedule_on_each_cpu(work_func_t func)
881 */ 2723 */
882void flush_scheduled_work(void) 2724void flush_scheduled_work(void)
883{ 2725{
884 flush_workqueue(keventd_wq); 2726 flush_workqueue(system_wq);
885} 2727}
886EXPORT_SYMBOL(flush_scheduled_work); 2728EXPORT_SYMBOL(flush_scheduled_work);
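For schedule_on_each_cpu() above, a sketch with an invented per-CPU counter: the call queues one work per online CPU and sleeps until all of them have run.

#include <linux/percpu.h>
#include <linux/workqueue.h>

static DEFINE_PER_CPU(unsigned long, foo_hits);

static void foo_reset_fn(struct work_struct *unused)
{
        /* runs on the CPU the work was queued for */
        this_cpu_write(foo_hits, 0);
}

static int foo_reset_all(void)
{
        /* slow path: one work per online CPU, waits for every one */
        return schedule_on_each_cpu(foo_reset_fn);
}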
887 2729
@@ -913,170 +2755,178 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
913 2755
914int keventd_up(void) 2756int keventd_up(void)
915{ 2757{
916 return keventd_wq != NULL; 2758 return system_wq != NULL;
917} 2759}
918 2760
919int current_is_keventd(void) 2761static int alloc_cwqs(struct workqueue_struct *wq)
920{ 2762{
921 struct cpu_workqueue_struct *cwq; 2763 /*
922 int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ 2764 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
923 int ret = 0; 2765 * Make sure that the alignment isn't lower than that of
924 2766 * unsigned long long.
925 BUG_ON(!keventd_wq); 2767 */
2768 const size_t size = sizeof(struct cpu_workqueue_struct);
2769 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2770 __alignof__(unsigned long long));
2771#ifdef CONFIG_SMP
2772 bool percpu = !(wq->flags & WQ_UNBOUND);
2773#else
2774 bool percpu = false;
2775#endif
926 2776
927 cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); 2777 if (percpu)
928 if (current == cwq->thread) 2778 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
929 ret = 1; 2779 else {
2780 void *ptr;
930 2781
931 return ret; 2782 /*
2783 * Allocate enough room to align cwq and put an extra
2784 * pointer at the end pointing back to the originally
2785 * allocated pointer which will be used for free.
2786 */
2787 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2788 if (ptr) {
2789 wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2790 *(void **)(wq->cpu_wq.single + 1) = ptr;
2791 }
2792 }
932 2793
2794 /* just in case, make sure it's actually aligned
2795 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2796 */
2797 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2798 return wq->cpu_wq.v ? 0 : -ENOMEM;
933} 2799}
934 2800
935static struct cpu_workqueue_struct * 2801static void free_cwqs(struct workqueue_struct *wq)
936init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
937{ 2802{
938 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 2803#ifdef CONFIG_SMP
939 2804 bool percpu = !(wq->flags & WQ_UNBOUND);
940 cwq->wq = wq; 2805#else
941 spin_lock_init(&cwq->lock); 2806 bool percpu = false;
942 INIT_LIST_HEAD(&cwq->worklist); 2807#endif
943 init_waitqueue_head(&cwq->more_work);
944 2808
945 return cwq; 2809 if (percpu)
2810 free_percpu(wq->cpu_wq.pcpu);
2811 else if (wq->cpu_wq.single) {
2812 /* the pointer to free is stored right after the cwq */
2813 kfree(*(void **)(wq->cpu_wq.single + 1));
2814 }
946} 2815}
947 2816
948static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2817static int wq_clamp_max_active(int max_active, unsigned int flags,
2818 const char *name)
949{ 2819{
950 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 2820 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
951 struct workqueue_struct *wq = cwq->wq;
952 const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
953 struct task_struct *p;
954 2821
955 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); 2822 if (max_active < 1 || max_active > lim)
956 /* 2823 printk(KERN_WARNING "workqueue: max_active %d requested for %s "
957 * Nobody can add the work_struct to this cwq, 2824 "is out of range, clamping between %d and %d\n",
958 * if (caller is __create_workqueue) 2825 max_active, name, 1, lim);
959 * nobody should see this wq
960 * else // caller is CPU_UP_PREPARE
961 * cpu is not on cpu_online_map
962 * so we can abort safely.
963 */
964 if (IS_ERR(p))
965 return PTR_ERR(p);
966 if (cwq->wq->rt)
967 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
968 cwq->thread = p;
969 2826
970 trace_workqueue_creation(cwq->thread, cpu); 2827 return clamp_val(max_active, 1, lim);
971
972 return 0;
973} 2828}
974 2829
975static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2830struct workqueue_struct *__alloc_workqueue_key(const char *name,
2831 unsigned int flags,
2832 int max_active,
2833 struct lock_class_key *key,
2834 const char *lock_name)
976{ 2835{
977 struct task_struct *p = cwq->thread; 2836 struct workqueue_struct *wq;
2837 unsigned int cpu;
978 2838
979 if (p != NULL) { 2839 /*
980 if (cpu >= 0) 2840 * Workqueues which may be used during memory reclaim should
981 kthread_bind(p, cpu); 2841 * have a rescuer to guarantee forward progress.
982 wake_up_process(p); 2842 */
983 } 2843 if (flags & WQ_MEM_RECLAIM)
984} 2844 flags |= WQ_RESCUER;
985 2845
986struct workqueue_struct *__create_workqueue_key(const char *name, 2846 /*
987 int singlethread, 2847 * Unbound workqueues aren't concurrency managed and should be
988 int freezeable, 2848 * dispatched to workers immediately.
989 int rt, 2849 */
990 struct lock_class_key *key, 2850 if (flags & WQ_UNBOUND)
991 const char *lock_name) 2851 flags |= WQ_HIGHPRI;
992{ 2852
993 struct workqueue_struct *wq; 2853 max_active = max_active ?: WQ_DFL_ACTIVE;
994 struct cpu_workqueue_struct *cwq; 2854 max_active = wq_clamp_max_active(max_active, flags, name);
995 int err = 0, cpu;
996 2855
997 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 2856 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
998 if (!wq) 2857 if (!wq)
999 return NULL; 2858 goto err;
1000 2859
1001 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 2860 wq->flags = flags;
1002 if (!wq->cpu_wq) { 2861 wq->saved_max_active = max_active;
1003 kfree(wq); 2862 mutex_init(&wq->flush_mutex);
1004 return NULL; 2863 atomic_set(&wq->nr_cwqs_to_flush, 0);
1005 } 2864 INIT_LIST_HEAD(&wq->flusher_queue);
2865 INIT_LIST_HEAD(&wq->flusher_overflow);
1006 2866
1007 wq->name = name; 2867 wq->name = name;
1008 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 2868 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
1009 wq->singlethread = singlethread;
1010 wq->freezeable = freezeable;
1011 wq->rt = rt;
1012 INIT_LIST_HEAD(&wq->list); 2869 INIT_LIST_HEAD(&wq->list);
1013 2870
1014 if (singlethread) { 2871 if (alloc_cwqs(wq) < 0)
1015 cwq = init_cpu_workqueue(wq, singlethread_cpu); 2872 goto err;
1016 err = create_workqueue_thread(cwq, singlethread_cpu); 2873
1017 start_workqueue_thread(cwq, -1); 2874 for_each_cwq_cpu(cpu, wq) {
1018 } else { 2875 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
1019 cpu_maps_update_begin(); 2876 struct global_cwq *gcwq = get_gcwq(cpu);
1020 /* 2877
1021 * We must place this wq on list even if the code below fails. 2878 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
1022 * cpu_down(cpu) can remove cpu from cpu_populated_map before 2879 cwq->gcwq = gcwq;
1023 * destroy_workqueue() takes the lock, in that case we leak 2880 cwq->wq = wq;
1024 * cwq[cpu]->thread. 2881 cwq->flush_color = -1;
1025 */ 2882 cwq->max_active = max_active;
1026 spin_lock(&workqueue_lock); 2883 INIT_LIST_HEAD(&cwq->delayed_works);
1027 list_add(&wq->list, &workqueues);
1028 spin_unlock(&workqueue_lock);
1029 /*
1030 * We must initialize cwqs for each possible cpu even if we
1031 * are going to call destroy_workqueue() finally. Otherwise
1032 * cpu_up() can hit the uninitialized cwq once we drop the
1033 * lock.
1034 */
1035 for_each_possible_cpu(cpu) {
1036 cwq = init_cpu_workqueue(wq, cpu);
1037 if (err || !cpu_online(cpu))
1038 continue;
1039 err = create_workqueue_thread(cwq, cpu);
1040 start_workqueue_thread(cwq, cpu);
1041 }
1042 cpu_maps_update_done();
1043 } 2884 }
1044 2885
1045 if (err) { 2886 if (flags & WQ_RESCUER) {
1046 destroy_workqueue(wq); 2887 struct worker *rescuer;
1047 wq = NULL; 2888
2889 if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
2890 goto err;
2891
2892 wq->rescuer = rescuer = alloc_worker();
2893 if (!rescuer)
2894 goto err;
2895
2896 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
2897 if (IS_ERR(rescuer->task))
2898 goto err;
2899
2900 rescuer->task->flags |= PF_THREAD_BOUND;
2901 wake_up_process(rescuer->task);
1048 } 2902 }
1049 return wq;
1050}
1051EXPORT_SYMBOL_GPL(__create_workqueue_key);
1052 2903
1053static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1054{
1055 /* 2904 /*
1056 * Our caller is either destroy_workqueue() or CPU_POST_DEAD, 2905 * workqueue_lock protects global freeze state and workqueues
1057 * cpu_add_remove_lock protects cwq->thread. 2906 * list. Grab it, set max_active accordingly and add the new
2907 * workqueue to workqueues list.
1058 */ 2908 */
1059 if (cwq->thread == NULL) 2909 spin_lock(&workqueue_lock);
1060 return;
1061 2910
1062 lock_map_acquire(&cwq->wq->lockdep_map); 2911 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
1063 lock_map_release(&cwq->wq->lockdep_map); 2912 for_each_cwq_cpu(cpu, wq)
2913 get_cwq(cpu, wq)->max_active = 0;
1064 2914
1065 flush_cpu_workqueue(cwq); 2915 list_add(&wq->list, &workqueues);
1066 /* 2916
1067 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, 2917 spin_unlock(&workqueue_lock);
1068 * a concurrent flush_workqueue() can insert a barrier after us. 2918
1069 * However, in that case run_workqueue() won't return and check 2919 return wq;
1070 * kthread_should_stop() until it flushes all work_struct's. 2920err:
1071 * When ->worklist becomes empty it is safe to exit because no 2921 if (wq) {
1072 * more work_structs can be queued on this cwq: flush_workqueue 2922 free_cwqs(wq);
1073 * checks list_empty(), and a "normal" queue_work() can't use 2923 free_mayday_mask(wq->mayday_mask);
1074 * a dead CPU. 2924 kfree(wq->rescuer);
1075 */ 2925 kfree(wq);
1076 trace_workqueue_destruction(cwq->thread); 2926 }
1077 kthread_stop(cwq->thread); 2927 return NULL;
1078 cwq->thread = NULL;
1079} 2928}
2929EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
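A sketch of the new allocation interface (flags and names chosen for illustration): callers normally reach __alloc_workqueue_key() through the alloc_workqueue() wrapper and pair it with destroy_workqueue().

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *foo_wq;

static int foo_create_wq(void)
{
        /*
         * WQ_UNBOUND: not bound to any particular CPU; max_active of 4
         * limits the number of in-flight works.  Passing 0 would select
         * WQ_DFL_ACTIVE, and out-of-range values are clamped by
         * wq_clamp_max_active() above.
         */
        foo_wq = alloc_workqueue("foo", WQ_UNBOUND, 4);
        if (!foo_wq)
                return -ENOMEM;
        return 0;
}

static void foo_destroy_wq(void)
{
        /* flushes the workqueue, then frees cwqs and any rescuer */
        destroy_workqueue(foo_wq);
}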
1080 2930
1081/** 2931/**
1082 * destroy_workqueue - safely terminate a workqueue 2932 * destroy_workqueue - safely terminate a workqueue
@@ -1086,72 +2936,520 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1086 */ 2936 */
1087void destroy_workqueue(struct workqueue_struct *wq) 2937void destroy_workqueue(struct workqueue_struct *wq)
1088{ 2938{
1089 const struct cpumask *cpu_map = wq_cpu_map(wq); 2939 unsigned int cpu;
1090 int cpu; 2940
2941 wq->flags |= WQ_DYING;
2942 flush_workqueue(wq);
1091 2943
1092 cpu_maps_update_begin(); 2944 /*
2945 * wq list is used to freeze wq, remove from list after
2946 * flushing is complete in case freeze races us.
2947 */
1093 spin_lock(&workqueue_lock); 2948 spin_lock(&workqueue_lock);
1094 list_del(&wq->list); 2949 list_del(&wq->list);
1095 spin_unlock(&workqueue_lock); 2950 spin_unlock(&workqueue_lock);
1096 2951
1097 for_each_cpu(cpu, cpu_map) 2952 /* sanity check */
1098 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 2953 for_each_cwq_cpu(cpu, wq) {
1099 cpu_maps_update_done(); 2954 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2955 int i;
2956
2957 for (i = 0; i < WORK_NR_COLORS; i++)
2958 BUG_ON(cwq->nr_in_flight[i]);
2959 BUG_ON(cwq->nr_active);
2960 BUG_ON(!list_empty(&cwq->delayed_works));
2961 }
2962
2963 if (wq->flags & WQ_RESCUER) {
2964 kthread_stop(wq->rescuer->task);
2965 free_mayday_mask(wq->mayday_mask);
2966 kfree(wq->rescuer);
2967 }
1100 2968
1101 free_percpu(wq->cpu_wq); 2969 free_cwqs(wq);
1102 kfree(wq); 2970 kfree(wq);
1103} 2971}
1104EXPORT_SYMBOL_GPL(destroy_workqueue); 2972EXPORT_SYMBOL_GPL(destroy_workqueue);
1105 2973
2974/**
2975 * workqueue_set_max_active - adjust max_active of a workqueue
2976 * @wq: target workqueue
2977 * @max_active: new max_active value.
2978 *
2979 * Set max_active of @wq to @max_active.
2980 *
2981 * CONTEXT:
2982 * Don't call from IRQ context.
2983 */
2984void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
2985{
2986 unsigned int cpu;
2987
2988 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
2989
2990 spin_lock(&workqueue_lock);
2991
2992 wq->saved_max_active = max_active;
2993
2994 for_each_cwq_cpu(cpu, wq) {
2995 struct global_cwq *gcwq = get_gcwq(cpu);
2996
2997 spin_lock_irq(&gcwq->lock);
2998
2999 if (!(wq->flags & WQ_FREEZEABLE) ||
3000 !(gcwq->flags & GCWQ_FREEZING))
3001 get_cwq(gcwq->cpu, wq)->max_active = max_active;
3002
3003 spin_unlock_irq(&gcwq->lock);
3004 }
3005
3006 spin_unlock(&workqueue_lock);
3007}
3008EXPORT_SYMBOL_GPL(workqueue_set_max_active);
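A short tuning fragment (hypothetical policy): throttle a workqueue while a device is degraded and restore the default afterwards; the value is clamped exactly as in wq_clamp_max_active().

#include <linux/workqueue.h>

static void foo_throttle(struct workqueue_struct *wq, bool slow)
{
        workqueue_set_max_active(wq, slow ? 1 : WQ_DFL_ACTIVE);
}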
3009
3010/**
3011 * workqueue_congested - test whether a workqueue is congested
3012 * @cpu: CPU in question
3013 * @wq: target workqueue
3014 *
3015 * Test whether @wq's cpu workqueue for @cpu is congested. There is
3016 * no synchronization around this function and the test result is
3017 * unreliable and only useful as advisory hints or for debugging.
3018 *
3019 * RETURNS:
3020 * %true if congested, %false otherwise.
3021 */
3022bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3023{
3024 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3025
3026 return !list_empty(&cwq->delayed_works);
3027}
3028EXPORT_SYMBOL_GPL(workqueue_congested);
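Since the result is advisory only, a hypothetical producer might use workqueue_congested() to shed optional work, never for correctness:

#include <linux/smp.h>
#include <linux/workqueue.h>

/* Drop an optional, best-effort item if the local cwq is backed up. */
static bool foo_queue_optional(struct workqueue_struct *wq,
                               struct work_struct *work)
{
        /* raw id is fine: the answer is a racy hint either way */
        if (workqueue_congested(raw_smp_processor_id(), wq))
                return false;
        return queue_work(wq, work);
}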
3029
3030/**
3031 * work_cpu - return the last known associated cpu for @work
3032 * @work: the work of interest
3033 *
3034 * RETURNS:
3035 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
3036 */
3037unsigned int work_cpu(struct work_struct *work)
3038{
3039 struct global_cwq *gcwq = get_work_gcwq(work);
3040
3041 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3042}
3043EXPORT_SYMBOL_GPL(work_cpu);
3044
3045/**
3046 * work_busy - test whether a work is currently pending or running
3047 * @work: the work to be tested
3048 *
3049 * Test whether @work is currently pending or running. There is no
3050 * synchronization around this function and the test result is
3051 * unreliable and only useful as advisory hints or for debugging.
3052 * Especially for reentrant wqs, the pending state might hide the
3053 * running state.
3054 *
3055 * RETURNS:
3056 * OR'd bitmask of WORK_BUSY_* bits.
3057 */
3058unsigned int work_busy(struct work_struct *work)
3059{
3060 struct global_cwq *gcwq = get_work_gcwq(work);
3061 unsigned long flags;
3062 unsigned int ret = 0;
3063
3064 if (!gcwq)
3065 return false;
3066
3067 spin_lock_irqsave(&gcwq->lock, flags);
3068
3069 if (work_pending(work))
3070 ret |= WORK_BUSY_PENDING;
3071 if (find_worker_executing_work(gcwq, work))
3072 ret |= WORK_BUSY_RUNNING;
3073
3074 spin_unlock_irqrestore(&gcwq->lock, flags);
3075
3076 return ret;
3077}
3078EXPORT_SYMBOL_GPL(work_busy);
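A debugging-oriented sketch for work_busy() and work_cpu() above (invented names); since the result is unreliable by design, it is only reported.

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void foo_report(struct work_struct *work)
{
        unsigned int busy = work_busy(work);

        pr_info("foo work: last cpu=%u%s%s\n", work_cpu(work),
                (busy & WORK_BUSY_PENDING) ? " pending" : "",
                (busy & WORK_BUSY_RUNNING) ? " running" : "");
}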
3079
3080/*
3081 * CPU hotplug.
3082 *
3083 * There are two challenges in supporting CPU hotplug. Firstly, there
3084 * are a lot of assumptions on strong associations among work, cwq and
3085 * gcwq which make migrating pending and scheduled works very
3086 * difficult to implement without impacting hot paths. Secondly,
3087 * gcwqs serve a mix of short, long and very long running works making
3088 * blocked draining impractical.
3089 *
3090 * This is solved by allowing a gcwq to be detached from CPU, running
3091 * it with unbound (rogue) workers and allowing it to be reattached
3092 * later if the cpu comes back online. A separate thread is created
3093 * to govern a gcwq in such state and is called the trustee of the
3094 * gcwq.
3095 *
3096 * Trustee states and their descriptions.
3097 *
3098 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3099 * new trustee is started with this state.
3100 *
3101 * IN_CHARGE Once started, trustee will enter this state after
3102 * assuming the manager role and making all existing
3103 * workers rogue. DOWN_PREPARE waits for trustee to
3104 * enter this state. After reaching IN_CHARGE, trustee
3105 * tries to execute the pending worklist until it's empty
3106 * and the state is set to BUTCHER, or the state is set
3107 * to RELEASE.
3108 *
3109 * BUTCHER Command state which is set by the cpu callback after
3110 * the cpu has gone down. Once this state is set, the trustee
3111 * knows that there will be no new works on the worklist
3112 * and once the worklist is empty it can proceed to
3113 * killing idle workers.
3114 *
3115 * RELEASE Command state which is set by the cpu callback if the
3116 * cpu down has been canceled or it has come online
3117 * again. After recognizing this state, trustee stops
3118 * trying to drain or butcher and clears ROGUE, rebinds
3119 * all remaining workers back to the cpu and releases
3120 * manager role.
3121 *
3122 * DONE Trustee will enter this state after BUTCHER or RELEASE
3123 * is complete.
3124 *
3125 * trustee CPU draining
3126 * took over down complete
3127 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3128 * | | ^
3129 * | CPU is back online v return workers |
3130 * ----------------> RELEASE --------------
3131 */
3132
3133/**
3134 * trustee_wait_event_timeout - timed event wait for trustee
3135 * @cond: condition to wait for
3136 * @timeout: timeout in jiffies
3137 *
3138 * wait_event_timeout() for trustee to use. Handles locking and
3139 * checks for RELEASE request.
3140 *
3141 * CONTEXT:
3142 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3143 * multiple times. To be used by trustee.
3144 *
3145 * RETURNS:
3146 * Positive indicating left time if @cond is satisfied, 0 if timed
3147 * out, -1 if canceled.
3148 */
3149#define trustee_wait_event_timeout(cond, timeout) ({ \
3150 long __ret = (timeout); \
3151 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3152 __ret) { \
3153 spin_unlock_irq(&gcwq->lock); \
3154 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3155 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3156 __ret); \
3157 spin_lock_irq(&gcwq->lock); \
3158 } \
3159 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3160})
3161
3162/**
3163 * trustee_wait_event - event wait for trustee
3164 * @cond: condition to wait for
3165 *
3166 * wait_event() for trustee to use. Automatically handles locking and
3167 * checks for CANCEL request.
3168 *
3169 * CONTEXT:
3170 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3171 * multiple times. To be used by trustee.
3172 *
3173 * RETURNS:
3174 * 0 if @cond is satisfied, -1 if canceled.
3175 */
3176#define trustee_wait_event(cond) ({ \
3177 long __ret1; \
3178 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3179 __ret1 < 0 ? -1 : 0; \
3180})
3181
3182static int __cpuinit trustee_thread(void *__gcwq)
3183{
3184 struct global_cwq *gcwq = __gcwq;
3185 struct worker *worker;
3186 struct work_struct *work;
3187 struct hlist_node *pos;
3188 long rc;
3189 int i;
3190
3191 BUG_ON(gcwq->cpu != smp_processor_id());
3192
3193 spin_lock_irq(&gcwq->lock);
3194 /*
3195 * Claim the manager position and make all workers rogue.
3196 * Trustee must be bound to the target cpu and can't be
3197 * cancelled.
3198 */
3199 BUG_ON(gcwq->cpu != smp_processor_id());
3200 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3201 BUG_ON(rc < 0);
3202
3203 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3204
3205 list_for_each_entry(worker, &gcwq->idle_list, entry)
3206 worker->flags |= WORKER_ROGUE;
3207
3208 for_each_busy_worker(worker, i, pos, gcwq)
3209 worker->flags |= WORKER_ROGUE;
3210
3211 /*
3212 * Call schedule() so that we cross rq->lock and thus can
3213 * guarantee sched callbacks see the rogue flag. This is
3214 * necessary as scheduler callbacks may be invoked from other
3215 * cpus.
3216 */
3217 spin_unlock_irq(&gcwq->lock);
3218 schedule();
3219 spin_lock_irq(&gcwq->lock);
3220
3221 /*
3222 * Sched callbacks are disabled now. Zap nr_running. After
3223 * this, nr_running stays zero and need_more_worker() and
3224 * keep_working() are always true as long as the worklist is
3225 * not empty.
3226 */
3227 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3228
3229 spin_unlock_irq(&gcwq->lock);
3230 del_timer_sync(&gcwq->idle_timer);
3231 spin_lock_irq(&gcwq->lock);
3232
3233 /*
3234 * We're now in charge. Notify and proceed to drain. We need
3235 * to keep the gcwq running during the whole CPU down
3236 * procedure as other cpu hotunplug callbacks may need to
3237 * flush currently running tasks.
3238 */
3239 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3240 wake_up_all(&gcwq->trustee_wait);
3241
3242 /*
3243 * The original cpu is in the process of dying and may go away
3244 * anytime now. When that happens, we and all workers would
3245 * be migrated to other cpus. Try draining any left work. We
3246 * want to get it over with ASAP - spam rescuers, wake up as
3247 * many idlers as necessary and create new ones till the
3248 * worklist is empty. Note that if the gcwq is frozen, there
3249 * may be frozen works in freezeable cwqs. Don't declare
3250 * completion while frozen.
3251 */
3252 while (gcwq->nr_workers != gcwq->nr_idle ||
3253 gcwq->flags & GCWQ_FREEZING ||
3254 gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3255 int nr_works = 0;
3256
3257 list_for_each_entry(work, &gcwq->worklist, entry) {
3258 send_mayday(work);
3259 nr_works++;
3260 }
3261
3262 list_for_each_entry(worker, &gcwq->idle_list, entry) {
3263 if (!nr_works--)
3264 break;
3265 wake_up_process(worker->task);
3266 }
3267
3268 if (need_to_create_worker(gcwq)) {
3269 spin_unlock_irq(&gcwq->lock);
3270 worker = create_worker(gcwq, false);
3271 spin_lock_irq(&gcwq->lock);
3272 if (worker) {
3273 worker->flags |= WORKER_ROGUE;
3274 start_worker(worker);
3275 }
3276 }
3277
3278 /* give a breather */
3279 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3280 break;
3281 }
3282
3283 /*
3284 * Either all works have been scheduled and cpu is down, or
3285 * cpu down has already been canceled. Wait for and butcher
3286 * all workers till we're canceled.
3287 */
3288 do {
3289 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3290 while (!list_empty(&gcwq->idle_list))
3291 destroy_worker(list_first_entry(&gcwq->idle_list,
3292 struct worker, entry));
3293 } while (gcwq->nr_workers && rc >= 0);
3294
3295 /*
3296 * At this point, either draining has completed and no worker
3297 * is left, or cpu down has been canceled or the cpu is being
3298 * brought back up. There shouldn't be any idle one left.
3299 * Tell the remaining busy ones to rebind once they finish the
3300 * currently scheduled works by scheduling the rebind_work.
3301 */
3302 WARN_ON(!list_empty(&gcwq->idle_list));
3303
3304 for_each_busy_worker(worker, i, pos, gcwq) {
3305 struct work_struct *rebind_work = &worker->rebind_work;
3306
3307 /*
3308 * Rebind_work may race with future cpu hotplug
3309 * operations. Use a separate flag to mark that
3310 * rebinding is scheduled.
3311 */
3312 worker->flags |= WORKER_REBIND;
3313 worker->flags &= ~WORKER_ROGUE;
3314
3315 /* queue rebind_work, wq doesn't matter, use the default one */
3316 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3317 work_data_bits(rebind_work)))
3318 continue;
3319
3320 debug_work_activate(rebind_work);
3321 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3322 worker->scheduled.next,
3323 work_color_to_flags(WORK_NO_COLOR));
3324 }
3325
3326 /* relinquish manager role */
3327 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3328
3329 /* notify completion */
3330 gcwq->trustee = NULL;
3331 gcwq->trustee_state = TRUSTEE_DONE;
3332 wake_up_all(&gcwq->trustee_wait);
3333 spin_unlock_irq(&gcwq->lock);
3334 return 0;
3335}
3336
3337/**
3338 * wait_trustee_state - wait for trustee to enter the specified state
3339 * @gcwq: gcwq the trustee of interest belongs to
3340 * @state: target state to wait for
3341 *
3342 * Wait for the trustee to reach @state. DONE is already matched.
3343 *
3344 * CONTEXT:
3345 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3346 * multiple times. To be used by cpu_callback.
3347 */
3348static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3349__releases(&gcwq->lock)
3350__acquires(&gcwq->lock)
3351{
3352 if (!(gcwq->trustee_state == state ||
3353 gcwq->trustee_state == TRUSTEE_DONE)) {
3354 spin_unlock_irq(&gcwq->lock);
3355 __wait_event(gcwq->trustee_wait,
3356 gcwq->trustee_state == state ||
3357 gcwq->trustee_state == TRUSTEE_DONE);
3358 spin_lock_irq(&gcwq->lock);
3359 }
3360}
3361
1106static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1107						unsigned long action,
1108						void *hcpu)
1109{
1110	unsigned int cpu = (unsigned long)hcpu;
1111	struct cpu_workqueue_struct *cwq;
1112	struct workqueue_struct *wq;
1113	int err = 0;
1114
1115	action &= ~CPU_TASKS_FROZEN;
1116
1117	switch (action) {
1118	case CPU_UP_PREPARE:
1119		cpumask_set_cpu(cpu, cpu_populated_map);
1120	}
1121undo:
1122	list_for_each_entry(wq, &workqueues, list) {
1123		cwq = per_cpu_ptr(wq->cpu_wq, cpu);
1124
1125		switch (action) {
1126		case CPU_UP_PREPARE:
1127			err = create_workqueue_thread(cwq, cpu);
1128			if (!err)
1129				break;
1130			printk(KERN_ERR "workqueue [%s] for %i failed\n",
1131				wq->name, cpu);
1132			action = CPU_UP_CANCELED;
1133			err = -ENOMEM;
1134			goto undo;
1135
1136		case CPU_ONLINE:
1137			start_workqueue_thread(cwq, cpu);
1138			break;
1139
1140		case CPU_UP_CANCELED:
1141			start_workqueue_thread(cwq, -1);
1142		case CPU_POST_DEAD:
1143			cleanup_workqueue_thread(cwq);
1144			break;
1145		}
1146	}
1147
1148	switch (action) {
1149	case CPU_UP_CANCELED:
1150	case CPU_POST_DEAD:
1151		cpumask_clear_cpu(cpu, cpu_populated_map);
1152	}
1153
1154	return notifier_from_errno(err);
1155}
1156
1157#ifdef CONFIG_SMP
3362static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3363						unsigned long action,
3364						void *hcpu)
3365{
3366	unsigned int cpu = (unsigned long)hcpu;
3367	struct global_cwq *gcwq = get_gcwq(cpu);
3368	struct task_struct *new_trustee = NULL;
3369	struct worker *uninitialized_var(new_worker);
3370	unsigned long flags;
3371
3372	action &= ~CPU_TASKS_FROZEN;
3373
3374	switch (action) {
3375	case CPU_DOWN_PREPARE:
3376		new_trustee = kthread_create(trustee_thread, gcwq,
3377					     "workqueue_trustee/%d\n", cpu);
3378		if (IS_ERR(new_trustee))
3379			return notifier_from_errno(PTR_ERR(new_trustee));
3380		kthread_bind(new_trustee, cpu);
3381		/* fall through */
3382	case CPU_UP_PREPARE:
3383		BUG_ON(gcwq->first_idle);
3384		new_worker = create_worker(gcwq, false);
3385		if (!new_worker) {
3386			if (new_trustee)
3387				kthread_stop(new_trustee);
3388			return NOTIFY_BAD;
3389		}
3390	}
3391
3392	/* some are called w/ irq disabled, don't disturb irq status */
3393	spin_lock_irqsave(&gcwq->lock, flags);
3394
3395	switch (action) {
3396	case CPU_DOWN_PREPARE:
3397		/* initialize trustee and tell it to acquire the gcwq */
3398		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3399		gcwq->trustee = new_trustee;
3400		gcwq->trustee_state = TRUSTEE_START;
3401		wake_up_process(gcwq->trustee);
3402		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3403		/* fall through */
3404	case CPU_UP_PREPARE:
3405		BUG_ON(gcwq->first_idle);
3406		gcwq->first_idle = new_worker;
3407		break;
3408
3409	case CPU_DYING:
3410		/*
3411		 * Before this, the trustee and all workers except for
3412		 * the ones which are still executing works from
3413		 * before the last CPU down must be on the cpu.  After
3414		 * this, they'll all be diasporas.
3415		 */
3416		gcwq->flags |= GCWQ_DISASSOCIATED;
3417		break;
3418
3419	case CPU_POST_DEAD:
3420		gcwq->trustee_state = TRUSTEE_BUTCHER;
3421		/* fall through */
3422	case CPU_UP_CANCELED:
3423		destroy_worker(gcwq->first_idle);
3424		gcwq->first_idle = NULL;
3425		break;
3426
3427	case CPU_DOWN_FAILED:
3428	case CPU_ONLINE:
3429		gcwq->flags &= ~GCWQ_DISASSOCIATED;
3430		if (gcwq->trustee_state != TRUSTEE_DONE) {
3431			gcwq->trustee_state = TRUSTEE_RELEASE;
3432			wake_up_process(gcwq->trustee);
3433			wait_trustee_state(gcwq, TRUSTEE_DONE);
3434		}
3435
3436		/*
3437		 * Trustee is done and there might be no worker left.
3438		 * Put the first_idle in and request a real manager to
3439		 * take a look.
3440		 */
3441		spin_unlock_irq(&gcwq->lock);
3442		kthread_bind(gcwq->first_idle->task, cpu);
3443		spin_lock_irq(&gcwq->lock);
3444		gcwq->flags |= GCWQ_MANAGE_WORKERS;
3445		start_worker(gcwq->first_idle);
3446		gcwq->first_idle = NULL;
3447		break;
3448	}
3449
3450	spin_unlock_irqrestore(&gcwq->lock, flags);
3451
3452	return notifier_from_errno(0);
3453}
3454
3455#ifdef CONFIG_SMP
@@ -1201,14 +3499,200 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
1201EXPORT_SYMBOL_GPL(work_on_cpu);
1202#endif /* CONFIG_SMP */
1203
1204void __init init_workqueues(void)
1205{
1206	alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
1207
1208	cpumask_copy(cpu_populated_map, cpu_online_mask);
1209	singlethread_cpu = cpumask_first(cpu_possible_mask);
1210	cpu_singlethread_map = cpumask_of(singlethread_cpu);
1211	hotcpu_notifier(workqueue_cpu_callback, 0);
1212	keventd_wq = create_workqueue("events");
1213	BUG_ON(!keventd_wq);
1214}
3499EXPORT_SYMBOL_GPL(work_on_cpu);
3500#endif /* CONFIG_SMP */
3501
3502#ifdef CONFIG_FREEZER
3503
3504/**
3505 * freeze_workqueues_begin - begin freezing workqueues
3506 *
3507 * Start freezing workqueues. After this function returns, all
3508 * freezeable workqueues will queue new works to their delayed_works
3509 * list instead of gcwq->worklist.
3510 *
3511 * CONTEXT:
3512 * Grabs and releases workqueue_lock and gcwq->lock's.
3513 */
3514void freeze_workqueues_begin(void)
3515{
3516 unsigned int cpu;
3517
3518 spin_lock(&workqueue_lock);
3519
3520 BUG_ON(workqueue_freezing);
3521 workqueue_freezing = true;
3522
3523 for_each_gcwq_cpu(cpu) {
3524 struct global_cwq *gcwq = get_gcwq(cpu);
3525 struct workqueue_struct *wq;
3526
3527 spin_lock_irq(&gcwq->lock);
3528
3529 BUG_ON(gcwq->flags & GCWQ_FREEZING);
3530 gcwq->flags |= GCWQ_FREEZING;
3531
3532 list_for_each_entry(wq, &workqueues, list) {
3533 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3534
3535 if (cwq && wq->flags & WQ_FREEZEABLE)
3536 cwq->max_active = 0;
3537 }
3538
3539 spin_unlock_irq(&gcwq->lock);
3540 }
3541
3542 spin_unlock(&workqueue_lock);
3543}
3544
3545/**
3546 * freeze_workqueues_busy - are freezeable workqueues still busy?
3547 *
3548 * Check whether freezing is complete. This function must be called
3549 * between freeze_workqueues_begin() and thaw_workqueues().
3550 *
3551 * CONTEXT:
3552 * Grabs and releases workqueue_lock.
3553 *
3554 * RETURNS:
3555 * %true if some freezeable workqueues are still busy. %false if
3556 * freezing is complete.
3557 */
3558bool freeze_workqueues_busy(void)
3559{
3560 unsigned int cpu;
3561 bool busy = false;
3562
3563 spin_lock(&workqueue_lock);
3564
3565 BUG_ON(!workqueue_freezing);
3566
3567 for_each_gcwq_cpu(cpu) {
3568 struct workqueue_struct *wq;
3569 /*
3570 * nr_active is monotonically decreasing. It's safe
3571 * to peek without lock.
3572 */
3573 list_for_each_entry(wq, &workqueues, list) {
3574 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3575
3576 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3577 continue;
3578
3579 BUG_ON(cwq->nr_active < 0);
3580 if (cwq->nr_active) {
3581 busy = true;
3582 goto out_unlock;
3583 }
3584 }
3585 }
3586out_unlock:
3587 spin_unlock(&workqueue_lock);
3588 return busy;
3589}
3590
3591/**
3592 * thaw_workqueues - thaw workqueues
3593 *
3594 * Thaw workqueues. Normal queueing is restored and all collected
3595 * frozen works are transferred to their respective gcwq worklists.
3596 *
3597 * CONTEXT:
3598 * Grabs and releases workqueue_lock and gcwq->lock's.
3599 */
3600void thaw_workqueues(void)
3601{
3602	unsigned int cpu;
3603
3604	spin_lock(&workqueue_lock);
3605
3606	if (!workqueue_freezing)
3607		goto out_unlock;
3608
3609	for_each_gcwq_cpu(cpu) {
3610 struct global_cwq *gcwq = get_gcwq(cpu);
3611 struct workqueue_struct *wq;
3612
3613 spin_lock_irq(&gcwq->lock);
3614
3615 BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3616 gcwq->flags &= ~GCWQ_FREEZING;
3617
3618 list_for_each_entry(wq, &workqueues, list) {
3619 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3620
3621 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3622 continue;
3623
3624 /* restore max_active and repopulate worklist */
3625 cwq->max_active = wq->saved_max_active;
3626
3627 while (!list_empty(&cwq->delayed_works) &&
3628 cwq->nr_active < cwq->max_active)
3629 cwq_activate_first_delayed(cwq);
3630 }
3631
3632 wake_up_worker(gcwq);
3633
3634 spin_unlock_irq(&gcwq->lock);
3635 }
3636
3637 workqueue_freezing = false;
3638out_unlock:
3639 spin_unlock(&workqueue_lock);
3640}
3641#endif /* CONFIG_FREEZER */
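The three entry points above are designed to be driven as a begin / poll / thaw sequence by the freezer. A minimal sketch of such a caller, assuming kernel context; the function name, retry budget and message text are made up for illustration and this is not the actual kernel/power code:

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/delay.h>

static int demo_freeze_workqueues(void)
{
	int tries = 100;

	freeze_workqueues_begin();

	/* nr_active only drains; poll until every freezeable cwq is idle */
	while (freeze_workqueues_busy()) {
		if (!--tries) {
			pr_err("workqueues did not freeze in time\n");
			thaw_workqueues();	/* roll back the partial freeze */
			return -EBUSY;
		}
		msleep(10);
	}
	return 0;
}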
3642
3643static int __init init_workqueues(void)
3644{
3645 unsigned int cpu;
3646 int i;
3647
3648 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
3649
3650 /* initialize gcwqs */
3651 for_each_gcwq_cpu(cpu) {
3652 struct global_cwq *gcwq = get_gcwq(cpu);
3653
3654 spin_lock_init(&gcwq->lock);
3655 INIT_LIST_HEAD(&gcwq->worklist);
3656 gcwq->cpu = cpu;
3657 gcwq->flags |= GCWQ_DISASSOCIATED;
3658
3659 INIT_LIST_HEAD(&gcwq->idle_list);
3660 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3661 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3662
3663 init_timer_deferrable(&gcwq->idle_timer);
3664 gcwq->idle_timer.function = idle_worker_timeout;
3665 gcwq->idle_timer.data = (unsigned long)gcwq;
3666
3667 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
3668 (unsigned long)gcwq);
3669
3670 ida_init(&gcwq->worker_ida);
3671
3672 gcwq->trustee_state = TRUSTEE_DONE;
3673 init_waitqueue_head(&gcwq->trustee_wait);
3674 }
3675
3676 /* create the initial worker */
3677 for_each_online_gcwq_cpu(cpu) {
3678 struct global_cwq *gcwq = get_gcwq(cpu);
3679 struct worker *worker;
3680
3681 if (cpu != WORK_CPU_UNBOUND)
3682 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3683 worker = create_worker(gcwq, true);
3684 BUG_ON(!worker);
3685 spin_lock_irq(&gcwq->lock);
3686 start_worker(worker);
3687 spin_unlock_irq(&gcwq->lock);
3688 }
3689
3690 system_wq = alloc_workqueue("events", 0, 0);
3691 system_long_wq = alloc_workqueue("events_long", 0, 0);
3692 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3693 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3694 WQ_UNBOUND_MAX_ACTIVE);
3695	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || !system_unbound_wq);
3696 return 0;
3697}
3698early_initcall(init_workqueues);
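For context, a minimal sketch of how other kernel code consumes the system workqueues created above; the work item and function names here are hypothetical:

#include <linux/kernel.h>
#include <linux/workqueue.h>

/* hypothetical deferred handler; runs in worker (process) context */
static void demo_work_fn(struct work_struct *work)
{
	pr_info("demo work ran\n");
}

static DECLARE_WORK(demo_work, demo_work_fn);

static void demo_kick_work(void)
{
	/*
	 * schedule_work() queues on system_wq, the "events" queue created
	 * in init_workqueues() above.  Work that may sleep for long periods
	 * could instead be queued with queue_work(system_unbound_wq, ...).
	 */
	schedule_work(&demo_work);
}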
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..2d10fc98dc79
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,9 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);
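These two hooks are meant to be called by the scheduler when a workqueue worker blocks or wakes up, so the per-gcwq count of running workers stays accurate. A simplified sketch of such call sites; the demo_* wrappers are invented, PF_WQ_WORKER is the task flag this series introduces, and the real sched.c integration wakes the returned task on the local CPU rather than via a plain wake_up_process():

#include <linux/sched.h>
#include "workqueue_sched.h"

/* called, conceptually, when @prev is about to go to sleep on @cpu */
static void demo_worker_sleeping(struct task_struct *prev, unsigned int cpu)
{
	struct task_struct *to_wakeup;

	if (!(prev->flags & PF_WQ_WORKER))
		return;

	/* ask the workqueue code whether another worker should take over */
	to_wakeup = wq_worker_sleeping(prev, cpu);
	if (to_wakeup)
		wake_up_process(to_wakeup);	/* simplified wake-up path */
}

/* called, conceptually, when @p is woken back up on @cpu */
static void demo_worker_waking(struct task_struct *p, unsigned int cpu)
{
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu);
}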