author     Andrea Bastoni <bastoni@cs.unc.edu>   2011-08-27 09:43:54 -0400
committer  Andrea Bastoni <bastoni@cs.unc.edu>   2011-08-27 10:06:11 -0400
commit     7b1bb388bc879ffcc6c69b567816d5c354afe42b (patch)
tree       5a217fdfb0b5e5a327bdcd624506337c1ae1fe32 /kernel
parent     7d754596756240fa918b94cd0c3011c77a638987 (diff)
parent     02f8c6aee8df3cdc935e9bdd4f2d020306035dbe (diff)
Merge 'Linux v3.0' into Litmus
Some notes:

 * The Litmus^RT scheduling class is the topmost scheduling class
   (above stop_sched_class); see the sketch below.
 * The scheduler_ipi() function (e.g., in smp_reschedule_interrupt())
   may increase IPI latencies.
 * Added a path into schedule() to quickly re-evaluate the scheduling
   decision without becoming preemptible again. This used to be a
   standard path before the removal of the BKL.

Conflicts:
    Makefile
    arch/arm/kernel/calls.S
    arch/arm/kernel/smp.c
    arch/x86/include/asm/unistd_32.h
    arch/x86/kernel/smp.c
    arch/x86/kernel/syscall_table_32.S
    include/linux/hrtimer.h
    kernel/printk.c
    kernel/sched.c
    kernel/sched_fair.c
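The first note above concerns where the Litmus^RT scheduling class sits in the scheduler's class chain. Below is a minimal userspace C sketch of that idea, not the actual kernel or LITMUS^RT code: pick_next_task() queries each class in priority order, so a class linked above stop_sched_class gets the first chance to supply a task. All type, function, and task names in the sketch are illustrative assumptions.

#include <stdio.h>
#include <stddef.h>

struct task {
	const char *comm;
};

struct sched_class {
	const char *name;
	const struct sched_class *next;        /* next lower-priority class */
	struct task *(*pick_next_task)(void);  /* NULL means nothing runnable */
};

static struct task an_rt_task = { "rt-task" };

static struct task *litmus_pick(void) { return NULL; }  /* no real-time job ready in this toy run */
static struct task *stop_pick(void)   { return NULL; }  /* no stopper work pending */
static struct task *rt_pick(void)     { return &an_rt_task; }

static const struct sched_class rt_class     = { "rt",     NULL,         rt_pick };
static const struct sched_class stop_class   = { "stop",   &rt_class,    stop_pick };
static const struct sched_class litmus_class = { "litmus", &stop_class,  litmus_pick };

/* In this model the Litmus^RT class is the head of the chain, i.e. the topmost class. */
#define sched_class_highest (&litmus_class)

static struct task *pick_next_task(void)
{
	const struct sched_class *class;

	for (class = sched_class_highest; class; class = class->next) {
		struct task *p = class->pick_next_task();
		if (p) {
			printf("class '%s' picked '%s'\n", class->name, p->comm);
			return p;
		}
	}
	return NULL;  /* in the real kernel, the idle class always returns a task */
}

int main(void)
{
	pick_next_task();
	return 0;
}

Run as-is, the sketch reports that the rt class picked 'rt-task', because the two classes linked above it return nothing runnable; a job offered by the topmost (litmus) class would be chosen before anything else.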
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 2
-rw-r--r--  kernel/Makefile | 19
-rw-r--r--  kernel/audit.c | 77
-rw-r--r--  kernel/audit.h | 5
-rw-r--r--  kernel/audit_tree.c | 11
-rw-r--r--  kernel/audit_watch.c | 89
-rw-r--r--  kernel/auditfilter.c | 18
-rw-r--r--  kernel/auditsc.c | 45
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/capability.c | 112
-rw-r--r--  kernel/cgroup.c | 880
-rw-r--r--  kernel/cgroup_freezer.c | 88
-rw-r--r--  kernel/compat.c | 191
-rw-r--r--  kernel/configs.c | 1
-rw-r--r--  kernel/cpu.c | 42
-rw-r--r--  kernel/cpuset.c | 195
-rw-r--r--  kernel/crash_dump.c | 34
-rw-r--r--  kernel/cred.c | 36
-rw-r--r--  kernel/debug/debug_core.c | 155
-rw-r--r--  kernel/debug/debug_core.h | 1
-rw-r--r--  kernel/debug/gdbstub.c | 30
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 99
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 48
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 2
-rw-r--r--  kernel/early_res.c | 590
-rw-r--r--  kernel/events/Makefile | 6
-rw-r--r--  kernel/events/core.c (renamed from kernel/perf_event.c) | 4278
-rw-r--r--  kernel/events/hw_breakpoint.c (renamed from kernel/hw_breakpoint.c) | 78
-rw-r--r--  kernel/exit.c | 188
-rw-r--r--  kernel/extable.c | 18
-rw-r--r--  kernel/fork.c | 305
-rw-r--r--  kernel/freezer.c | 13
-rw-r--r--  kernel/futex.c | 524
-rw-r--r--  kernel/futex_compat.c | 16
-rw-r--r--  kernel/gcov/Kconfig | 5
-rw-r--r--  kernel/gcov/Makefile | 2
-rw-r--r--  kernel/gcov/fs.c | 1
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hrtimer.c | 313
-rw-r--r--  kernel/hung_task.c | 6
-rw-r--r--  kernel/irq/Kconfig | 74
-rw-r--r--  kernel/irq/Makefile | 4
-rw-r--r--  kernel/irq/autoprobe.c | 57
-rw-r--r--  kernel/irq/chip.c | 737
-rw-r--r--  kernel/irq/debug.h | 45
-rw-r--r--  kernel/irq/dummychip.c | 59
-rw-r--r--  kernel/irq/generic-chip.c | 368
-rw-r--r--  kernel/irq/handle.c | 559
-rw-r--r--  kernel/irq/internals.h | 174
-rw-r--r--  kernel/irq/irqdesc.c | 466
-rw-r--r--  kernel/irq/manage.c | 690
-rw-r--r--  kernel/irq/migration.c | 43
-rw-r--r--  kernel/irq/numa_migrate.c | 120
-rw-r--r--  kernel/irq/pm.c | 30
-rw-r--r--  kernel/irq/proc.c | 169
-rw-r--r--  kernel/irq/resend.c | 21
-rw-r--r--  kernel/irq/settings.h | 142
-rw-r--r--  kernel/irq/spurious.c | 195
-rw-r--r--  kernel/irq_work.c | 166
-rw-r--r--  kernel/jump_label.c | 393
-rw-r--r--  kernel/kallsyms.c | 58
-rw-r--r--  kernel/kexec.c | 21
-rw-r--r--  kernel/kmod.c | 124
-rw-r--r--  kernel/kprobes.c | 631
-rw-r--r--  kernel/ksysfs.c | 10
-rw-r--r--  kernel/kthread.c | 50
-rw-r--r--  kernel/latencytop.c | 42
-rw-r--r--  kernel/lockdep.c | 281
-rw-r--r--  kernel/lockdep_proc.c | 25
-rw-r--r--  kernel/module.c | 284
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex-debug.h | 2
-rw-r--r--  kernel/mutex.c | 38
-rw-r--r--  kernel/mutex.h | 2
-rw-r--r--  kernel/ns_cgroup.c | 110
-rw-r--r--  kernel/nsproxy.c | 50
-rw-r--r--  kernel/padata.c | 8
-rw-r--r--  kernel/panic.c | 11
-rw-r--r--  kernel/params.c | 84
-rw-r--r--  kernel/pid.c | 10
-rw-r--r--  kernel/pid_namespace.c | 11
-rw-r--r--  kernel/pm_qos_params.c | 101
-rw-r--r--  kernel/posix-cpu-timers.c | 124
-rw-r--r--  kernel/posix-timers.c | 379
-rw-r--r--  kernel/power/Kconfig | 253
-rw-r--r--  kernel/power/Makefile | 5
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/hibernate.c | 331
-rw-r--r--  kernel/power/main.c | 35
-rw-r--r--  kernel/power/nvs.c | 136
-rw-r--r--  kernel/power/power.h | 14
-rw-r--r--  kernel/power/process.c | 23
-rw-r--r--  kernel/power/snapshot.c | 69
-rw-r--r--  kernel/power/suspend.c | 21
-rw-r--r--  kernel/power/swap.c | 336
-rw-r--r--  kernel/power/user.c | 13
-rw-r--r--  kernel/printk.c | 461
-rw-r--r--  kernel/profile.c | 23
-rw-r--r--  kernel/ptrace.c | 206
-rw-r--r--  kernel/range.c | 2
-rw-r--r--  kernel/rcupdate.c | 36
-rw-r--r--  kernel/rcutiny.c | 151
-rw-r--r--  kernel/rcutiny_plugin.h | 976
-rw-r--r--  kernel/rcutorture.c | 288
-rw-r--r--  kernel/rcutree.c | 573
-rw-r--r--  kernel/rcutree.h | 193
-rw-r--r--  kernel/rcutree_plugin.h | 1192
-rw-r--r--  kernel/rcutree_trace.c | 226
-rw-r--r--  kernel/relay.c | 15
-rw-r--r--  kernel/res_counter.c | 14
-rw-r--r--  kernel/resource.c | 171
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 34
-rw-r--r--  kernel/rtmutex.c | 318
-rw-r--r--  kernel/rtmutex_common.h | 16
-rw-r--r--  kernel/sched.c | 3432
-rw-r--r--  kernel/sched_autogroup.c | 275
-rw-r--r--  kernel/sched_autogroup.h | 41
-rw-r--r--  kernel/sched_clock.c | 2
-rw-r--r--  kernel/sched_debug.c | 127
-rw-r--r--  kernel/sched_fair.c | 1084
-rw-r--r--  kernel/sched_features.h | 15
-rw-r--r--  kernel/sched_idletask.c | 30
-rw-r--r--  kernel/sched_rt.c | 192
-rw-r--r--  kernel/sched_stats.h | 24
-rw-r--r--  kernel/sched_stoptask.c | 104
-rw-r--r--  kernel/signal.c | 893
-rw-r--r--  kernel/smp.c | 210
-rw-r--r--  kernel/softirq.c | 195
-rw-r--r--  kernel/srcu.c | 21
-rw-r--r--  kernel/stop_machine.c | 20
-rw-r--r--  kernel/sys.c | 95
-rw-r--r--  kernel/sys_ni.c | 15
-rw-r--r--  kernel/sysctl.c | 188
-rw-r--r--  kernel/sysctl_binary.c | 22
-rw-r--r--  kernel/sysctl_check.c | 10
-rw-r--r--  kernel/taskstats.c | 227
-rw-r--r--  kernel/test_kprobes.c | 12
-rw-r--r--  kernel/time.c | 39
-rw-r--r--  kernel/time/Makefile | 3
-rw-r--r--  kernel/time/alarmtimer.c | 720
-rw-r--r--  kernel/time/clockevents.c | 70
-rw-r--r--  kernel/time/clocksource.c | 77
-rw-r--r--  kernel/time/jiffies.c | 22
-rw-r--r--  kernel/time/ntp.c | 454
-rw-r--r--  kernel/time/posix-clock.c | 445
-rw-r--r--  kernel/time/tick-broadcast.c | 39
-rw-r--r--  kernel/time/tick-common.c | 9
-rw-r--r--  kernel/time/tick-internal.h | 12
-rw-r--r--  kernel/time/tick-oneshot.c | 5
-rw-r--r--  kernel/time/tick-sched.c | 8
-rw-r--r--  kernel/time/timecompare.c | 5
-rw-r--r--  kernel/time/timekeeping.c | 297
-rw-r--r--  kernel/time/timer_list.c | 12
-rw-r--r--  kernel/time/timer_stats.c | 2
-rw-r--r--  kernel/timer.c | 112
-rw-r--r--  kernel/trace/Kconfig | 26
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 128
-rw-r--r--  kernel/trace/ftrace.c | 1457
-rw-r--r--  kernel/trace/power-traces.c | 5
-rw-r--r--  kernel/trace/ring_buffer.c | 398
-rw-r--r--  kernel/trace/trace.c | 96
-rw-r--r--  kernel/trace/trace.h | 62
-rw-r--r--  kernel/trace/trace_clock.c | 2
-rw-r--r--  kernel/trace/trace_entries.h | 10
-rw-r--r--  kernel/trace/trace_event_perf.c | 59
-rw-r--r--  kernel/trace/trace_events.c | 89
-rw-r--r--  kernel/trace/trace_events_filter.c | 885
-rw-r--r--  kernel/trace/trace_export.c | 20
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 209
-rw-r--r--  kernel/trace/trace_irqsoff.c | 163
-rw-r--r--  kernel/trace/trace_kdb.c | 1
-rw-r--r--  kernel/trace/trace_kprobe.c | 125
-rw-r--r--  kernel/trace/trace_output.c | 66
-rw-r--r--  kernel/trace/trace_printk.c | 117
-rw-r--r--  kernel/trace/trace_sched_switch.c | 48
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 257
-rw-r--r--  kernel/trace/trace_selftest.c | 216
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 6
-rw-r--r--  kernel/trace/trace_stack.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 92
-rw-r--r--  kernel/trace/trace_workqueue.c | 10
-rw-r--r--  kernel/tracepoint.c | 46
-rw-r--r--  kernel/tsacct.c | 10
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user-return-notifier.c | 2
-rw-r--r--  kernel/user.c | 10
-rw-r--r--  kernel/user_namespace.c | 15
-rw-r--r--  kernel/utsname.c | 51
-rw-r--r--  kernel/wait.c | 8
-rw-r--r--  kernel/watchdog.c | 185
-rw-r--r--  kernel/workqueue.c | 458
196 files changed, 24954 insertions, 11587 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE 199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
200 200
201config MUTEX_SPIN_ON_OWNER 201config MUTEX_SPIN_ON_OWNER
202 def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES 202 def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a74be0..2d64cfcc8b42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,8 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o 13 async.o range.o jump_label.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
15obj-y += groups.o 14obj-y += groups.o
16 15
17ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -22,7 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
22CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
23CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
24CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
25CFLAGS_REMOVE_perf_event.o = -pg 24CFLAGS_REMOVE_irq_work.o = -pg
26endif 25endif
27 26
28obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
@@ -43,7 +42,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
43obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 42obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
44obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 43obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
45obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 44obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
46obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o 45obj-$(CONFIG_SMP) += smp.o
47ifneq ($(CONFIG_SMP),y) 46ifneq ($(CONFIG_SMP),y)
48obj-y += up.o 47obj-y += up.o
49endif 48endif
@@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
62obj-$(CONFIG_CGROUPS) += cgroup.o 61obj-$(CONFIG_CGROUPS) += cgroup.o
63obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
64obj-$(CONFIG_CPUSETS) += cpuset.o 63obj-$(CONFIG_CPUSETS) += cpuset.o
65obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
66obj-$(CONFIG_UTS_NS) += utsname.o 64obj-$(CONFIG_UTS_NS) += utsname.o
67obj-$(CONFIG_USER_NS) += user_namespace.o 65obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 66obj-$(CONFIG_PID_NS) += pid_namespace.o
@@ -86,6 +84,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
88obj-$(CONFIG_TINY_RCU) += rcutiny.o 86obj-$(CONFIG_TINY_RCU) += rcutiny.o
87obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
89obj-$(CONFIG_RELAY) += relay.o 88obj-$(CONFIG_RELAY) += relay.o
90obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
91obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -99,11 +98,15 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
99obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
100obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
101obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o 102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_PERF_EVENTS) += perf_event.o 103obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 104
105obj-$(CONFIG_PERF_EVENTS) += events/
106
105obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
106obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
107 110
108ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 111ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
109# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 112# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -119,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
119# config_data.h contains the same information as ikconfig.h but gzipped. 122# config_data.h contains the same information as ikconfig.h but gzipped.
120# Info from config_data can be extracted from /proc/config* 123# Info from config_data can be extracted from /proc/config*
121targets += config_data.gz 124targets += config_data.gz
122$(obj)/config_data.gz: .config FORCE 125$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
123 $(call if_changed,gzip) 126 $(call if_changed,gzip)
124 127
125quiet_cmd_ikconfiggz = IKCFG $@ 128quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index d96045789b54..939500317066 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
74int audit_enabled; 74int audit_enabled;
75int audit_ever_enabled; 75int audit_ever_enabled;
76 76
77EXPORT_SYMBOL_GPL(audit_enabled);
78
77/* Default state when kernel boots without any parameters. */ 79/* Default state when kernel boots without any parameters. */
78static int audit_default; 80static int audit_default;
79 81
@@ -400,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
400 if (err < 0) { 402 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 403 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 404 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 405 audit_log_lost("auditd disappeared\n");
404 audit_pid = 0; 406 audit_pid = 0;
405 /* we might get lucky and get this in the next auditd */ 407 /* we might get lucky and get this in the next auditd */
406 audit_hold_skb(skb); 408 audit_hold_skb(skb);
@@ -467,23 +469,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
467 struct task_struct *tsk; 469 struct task_struct *tsk;
468 int err; 470 int err;
469 471
470 read_lock(&tasklist_lock); 472 rcu_read_lock();
471 tsk = find_task_by_vpid(pid); 473 tsk = find_task_by_vpid(pid);
472 err = -ESRCH; 474 if (!tsk) {
473 if (!tsk) 475 rcu_read_unlock();
474 goto out; 476 return -ESRCH;
475 err = 0; 477 }
476 478 get_task_struct(tsk);
477 spin_lock_irq(&tsk->sighand->siglock); 479 rcu_read_unlock();
478 if (!tsk->signal->audit_tty) 480 err = tty_audit_push_task(tsk, loginuid, sessionid);
479 err = -EPERM; 481 put_task_struct(tsk);
480 spin_unlock_irq(&tsk->sighand->siglock);
481 if (err)
482 goto out;
483
484 tty_audit_push_task(tsk, loginuid, sessionid);
485out:
486 read_unlock(&tasklist_lock);
487 return err; 482 return err;
488} 483}
489 484
@@ -506,7 +501,7 @@ int audit_send_list(void *_dest)
506} 501}
507 502
508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 503struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
509 int multi, void *payload, int size) 504 int multi, const void *payload, int size)
510{ 505{
511 struct sk_buff *skb; 506 struct sk_buff *skb;
512 struct nlmsghdr *nlh; 507 struct nlmsghdr *nlh;
@@ -555,8 +550,8 @@ static int audit_send_reply_thread(void *arg)
555 * Allocates an skb, builds the netlink message, and sends it to the pid. 550 * Allocates an skb, builds the netlink message, and sends it to the pid.
556 * No failure notifications. 551 * No failure notifications.
557 */ 552 */
558void audit_send_reply(int pid, int seq, int type, int done, int multi, 553static void audit_send_reply(int pid, int seq, int type, int done, int multi,
559 void *payload, int size) 554 const void *payload, int size)
560{ 555{
561 struct sk_buff *skb; 556 struct sk_buff *skb;
562 struct task_struct *tsk; 557 struct task_struct *tsk;
@@ -678,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
678 673
679 pid = NETLINK_CREDS(skb)->pid; 674 pid = NETLINK_CREDS(skb)->pid;
680 uid = NETLINK_CREDS(skb)->uid; 675 uid = NETLINK_CREDS(skb)->uid;
681 loginuid = NETLINK_CB(skb).loginuid; 676 loginuid = audit_get_loginuid(current);
682 sessionid = NETLINK_CB(skb).sessionid; 677 sessionid = audit_get_sessionid(current);
683 sid = NETLINK_CB(skb).sid; 678 security_task_getsecid(current, &sid);
684 seq = nlh->nlmsg_seq; 679 seq = nlh->nlmsg_seq;
685 data = NLMSG_DATA(nlh); 680 data = NLMSG_DATA(nlh);
686 681
@@ -880,40 +875,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
880 case AUDIT_TTY_GET: { 875 case AUDIT_TTY_GET: {
881 struct audit_tty_status s; 876 struct audit_tty_status s;
882 struct task_struct *tsk; 877 struct task_struct *tsk;
878 unsigned long flags;
883 879
884 read_lock(&tasklist_lock); 880 rcu_read_lock();
885 tsk = find_task_by_vpid(pid); 881 tsk = find_task_by_vpid(pid);
886 if (!tsk) 882 if (tsk && lock_task_sighand(tsk, &flags)) {
887 err = -ESRCH;
888 else {
889 spin_lock_irq(&tsk->sighand->siglock);
890 s.enabled = tsk->signal->audit_tty != 0; 883 s.enabled = tsk->signal->audit_tty != 0;
891 spin_unlock_irq(&tsk->sighand->siglock); 884 unlock_task_sighand(tsk, &flags);
892 } 885 } else
893 read_unlock(&tasklist_lock); 886 err = -ESRCH;
894 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, 887 rcu_read_unlock();
895 &s, sizeof(s)); 888
889 if (!err)
890 audit_send_reply(NETLINK_CB(skb).pid, seq,
891 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
896 break; 892 break;
897 } 893 }
898 case AUDIT_TTY_SET: { 894 case AUDIT_TTY_SET: {
899 struct audit_tty_status *s; 895 struct audit_tty_status *s;
900 struct task_struct *tsk; 896 struct task_struct *tsk;
897 unsigned long flags;
901 898
902 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 899 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
903 return -EINVAL; 900 return -EINVAL;
904 s = data; 901 s = data;
905 if (s->enabled != 0 && s->enabled != 1) 902 if (s->enabled != 0 && s->enabled != 1)
906 return -EINVAL; 903 return -EINVAL;
907 read_lock(&tasklist_lock); 904 rcu_read_lock();
908 tsk = find_task_by_vpid(pid); 905 tsk = find_task_by_vpid(pid);
909 if (!tsk) 906 if (tsk && lock_task_sighand(tsk, &flags)) {
910 err = -ESRCH;
911 else {
912 spin_lock_irq(&tsk->sighand->siglock);
913 tsk->signal->audit_tty = s->enabled != 0; 907 tsk->signal->audit_tty = s->enabled != 0;
914 spin_unlock_irq(&tsk->sighand->siglock); 908 unlock_task_sighand(tsk, &flags);
915 } 909 } else
916 read_unlock(&tasklist_lock); 910 err = -ESRCH;
911 rcu_read_unlock();
917 break; 912 break;
918 } 913 }
919 default: 914 default:
diff --git a/kernel/audit.h b/kernel/audit.h
index f7206db4e13d..91e7071c4d2c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path,
84 int *dirlen); 84 int *dirlen);
85extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 85extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
86 int done, int multi, 86 int done, int multi,
87 void *payload, int size); 87 const void *payload, int size);
88extern void audit_send_reply(int pid, int seq, int type,
89 int done, int multi,
90 void *payload, int size);
91extern void audit_panic(const char *message); 88extern void audit_panic(const char *message);
92 89
93struct audit_netlink_list { 90struct audit_netlink_list {
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7f18d3a4527e..e99dda04b126 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -223,7 +223,7 @@ static void untag_chunk(struct node *p)
223{ 223{
224 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark; 225 struct fsnotify_mark *entry = &chunk->mark;
226 struct audit_chunk *new; 226 struct audit_chunk *new = NULL;
227 struct audit_tree *owner; 227 struct audit_tree *owner;
228 int size = chunk->count - 1; 228 int size = chunk->count - 1;
229 int i, j; 229 int i, j;
@@ -232,9 +232,14 @@ static void untag_chunk(struct node *p)
232 232
233 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
234 234
235 if (size)
236 new = alloc_chunk(size);
237
235 spin_lock(&entry->lock); 238 spin_lock(&entry->lock);
236 if (chunk->dead || !entry->i.inode) { 239 if (chunk->dead || !entry->i.inode) {
237 spin_unlock(&entry->lock); 240 spin_unlock(&entry->lock);
241 if (new)
242 free_chunk(new);
238 goto out; 243 goto out;
239 } 244 }
240 245
@@ -255,9 +260,9 @@ static void untag_chunk(struct node *p)
255 goto out; 260 goto out;
256 } 261 }
257 262
258 new = alloc_chunk(size);
259 if (!new) 263 if (!new)
260 goto Fallback; 264 goto Fallback;
265
261 fsnotify_duplicate_mark(&new->mark, entry); 266 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 267 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
263 free_chunk(new); 268 free_chunk(new);
@@ -602,7 +607,7 @@ void audit_trim_trees(void)
602 spin_lock(&hash_lock); 607 spin_lock(&hash_lock);
603 list_for_each_entry(node, &tree->chunks, list) { 608 list_for_each_entry(node, &tree->chunks, list) {
604 struct audit_chunk *chunk = find_chunk(node); 609 struct audit_chunk *chunk = find_chunk(node);
605 /* this could be NULL if the watch is dieing else where... */ 610 /* this could be NULL if the watch is dying else where... */
606 struct inode *inode = chunk->mark.i.inode; 611 struct inode *inode = chunk->mark.i.inode;
607 node->index |= 1U<<31; 612 node->index |= 1U<<31;
608 if (iterate_mounts(compare_root, inode, root_mnt)) 613 if (iterate_mounts(compare_root, inode, root_mnt))
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f0c9b2e7542d..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -60,7 +60,7 @@ struct audit_parent {
60}; 60};
61 61
62/* fsnotify handle. */ 62/* fsnotify handle. */
63struct fsnotify_group *audit_watch_group; 63static struct fsnotify_group *audit_watch_group;
64 64
65/* fsnotify events we care about. */ 65/* fsnotify events we care about. */
66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
@@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch)
123 } 123 }
124} 124}
125 125
126void audit_remove_watch(struct audit_watch *watch) 126static void audit_remove_watch(struct audit_watch *watch)
127{ 127{
128 list_del(&watch->wlist); 128 list_del(&watch->wlist);
129 audit_put_parent(watch->parent); 129 audit_put_parent(watch->parent);
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
144} 144}
145 145
146/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
147static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct path *path)
148{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode; 149 struct inode *inode = path->dentry->d_inode;
150 struct audit_parent *parent; 150 struct audit_parent *parent;
151 int ret; 151 int ret;
152 152
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
353} 353}
354 354
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata *ndparent, *ndwatch; 358 struct nameidata nd;
359 struct dentry *d;
359 int err; 360 int err;
360 361
361 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); 362 err = kern_path_parent(watch->path, &nd);
362 if (unlikely(!ndparent)) 363 if (err)
363 return -ENOMEM; 364 return err;
364 365
365 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); 366 if (nd.last_type != LAST_NORM) {
366 if (unlikely(!ndwatch)) { 367 path_put(&nd.path);
367 kfree(ndparent); 368 return -EINVAL;
368 return -ENOMEM;
369 } 369 }
370 370
371 err = path_lookup(path, LOOKUP_PARENT, ndparent); 371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 if (err) { 372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 kfree(ndparent); 373 if (IS_ERR(d)) {
374 kfree(ndwatch); 374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 return err; 375 path_put(&nd.path);
376 return PTR_ERR(d);
376 } 377 }
377 378 if (d->d_inode) {
378 err = path_lookup(path, 0, ndwatch); 379 /* update watch filter fields */
379 if (err) { 380 watch->dev = d->d_inode->i_sb->s_dev;
380 kfree(ndwatch); 381 watch->ino = d->d_inode->i_ino;
381 ndwatch = NULL;
382 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
383 384
384 *ndp = ndparent; 385 *parent = nd.path;
385 *ndw = ndwatch; 386 dput(d);
386
387 return 0; 387 return 0;
388} 388}
389 389
390/* Release resources used for watch path information. */
391static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
392{
393 if (ndp) {
394 path_put(&ndp->path);
395 kfree(ndp);
396 }
397 if (ndw) {
398 path_put(&ndw->path);
399 kfree(ndw);
400 }
401}
402
403/* Associate the given rule with an existing parent. 390/* Associate the given rule with an existing parent.
404 * Caller must hold audit_filter_mutex. */ 391 * Caller must hold audit_filter_mutex. */
405static void audit_add_to_parent(struct audit_krule *krule, 392static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
440{ 427{
441 struct audit_watch *watch = krule->watch; 428 struct audit_watch *watch = krule->watch;
442 struct audit_parent *parent; 429 struct audit_parent *parent;
443 struct nameidata *ndp = NULL, *ndw = NULL; 430 struct path parent_path;
444 int h, ret = 0; 431 int h, ret = 0;
445 432
446 mutex_unlock(&audit_filter_mutex); 433 mutex_unlock(&audit_filter_mutex);
447 434
448 /* Avoid calling path_lookup under audit_filter_mutex. */ 435 /* Avoid calling path_lookup under audit_filter_mutex. */
449 ret = audit_get_nd(watch->path, &ndp, &ndw); 436 ret = audit_get_nd(watch, &parent_path);
450 if (ret) {
451 /* caller expects mutex locked */
452 mutex_lock(&audit_filter_mutex);
453 goto error;
454 }
455 437
438 /* caller expects mutex locked */
456 mutex_lock(&audit_filter_mutex); 439 mutex_lock(&audit_filter_mutex);
457 440
458 /* update watch filter fields */ 441 if (ret)
459 if (ndw) { 442 return ret;
460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
461 watch->ino = ndw->path.dentry->d_inode->i_ino;
462 }
463 443
464 /* either find an old parent or attach a new one */ 444 /* either find an old parent or attach a new one */
465 parent = audit_find_parent(ndp->path.dentry->d_inode); 445 parent = audit_find_parent(parent_path.dentry->d_inode);
466 if (!parent) { 446 if (!parent) {
467 parent = audit_init_parent(ndp); 447 parent = audit_init_parent(&parent_path);
468 if (IS_ERR(parent)) { 448 if (IS_ERR(parent)) {
469 ret = PTR_ERR(parent); 449 ret = PTR_ERR(parent);
470 goto error; 450 goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
479 h = audit_hash_ino((u32)watch->ino); 459 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h]; 460 *list = &audit_inode_hash[h];
481error: 461error:
482 audit_put_nd(ndp, ndw); /* NULL args OK */ 462 path_put(&parent_path);
483 return ret; 463 return ret;
484
485} 464}
486 465
487void audit_remove_watch_rule(struct audit_krule *krule) 466void audit_remove_watch_rule(struct audit_krule *krule)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eb7675499fb5..f8277c80d678 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1238 for (i = 0; i < rule->field_count; i++) { 1238 for (i = 0; i < rule->field_count; i++) {
1239 struct audit_field *f = &rule->fields[i]; 1239 struct audit_field *f = &rule->fields[i];
1240 int result = 0; 1240 int result = 0;
1241 u32 sid;
1241 1242
1242 switch (f->type) { 1243 switch (f->type) {
1243 case AUDIT_PID: 1244 case AUDIT_PID:
@@ -1250,7 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1250 result = audit_comparator(cb->creds.gid, f->op, f->val); 1251 result = audit_comparator(cb->creds.gid, f->op, f->val);
1251 break; 1252 break;
1252 case AUDIT_LOGINUID: 1253 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1254 result = audit_comparator(audit_get_loginuid(current),
1255 f->op, f->val);
1256 break;
1257 case AUDIT_SUBJ_USER:
1258 case AUDIT_SUBJ_ROLE:
1259 case AUDIT_SUBJ_TYPE:
1260 case AUDIT_SUBJ_SEN:
1261 case AUDIT_SUBJ_CLR:
1262 if (f->lsm_rule) {
1263 security_task_getsecid(current, &sid);
1264 result = security_audit_rule_match(sid,
1265 f->type,
1266 f->op,
1267 f->lsm_rule,
1268 NULL);
1269 }
1254 break; 1270 break;
1255 } 1271 }
1256 1272
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b31c130d034..00d79df03e76 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -241,6 +241,10 @@ struct audit_context {
241 pid_t pid; 241 pid_t pid;
242 struct audit_cap_data cap; 242 struct audit_cap_data cap;
243 } capset; 243 } capset;
244 struct {
245 int fd;
246 int flags;
247 } mmap;
244 }; 248 };
245 int fds[2]; 249 int fds[2];
246 250
@@ -439,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
439 443
440/* Determine if any context name data matches a rule's watch data */ 444/* Determine if any context name data matches a rule's watch data */
441/* Compare a task_struct with an audit_rule. Return 1 on match, 0 445/* Compare a task_struct with an audit_rule. Return 1 on match, 0
442 * otherwise. */ 446 * otherwise.
447 *
448 * If task_creation is true, this is an explicit indication that we are
449 * filtering a task rule at task creation time. This and tsk == current are
450 * the only situations where tsk->cred may be accessed without an rcu read lock.
451 */
443static int audit_filter_rules(struct task_struct *tsk, 452static int audit_filter_rules(struct task_struct *tsk,
444 struct audit_krule *rule, 453 struct audit_krule *rule,
445 struct audit_context *ctx, 454 struct audit_context *ctx,
446 struct audit_names *name, 455 struct audit_names *name,
447 enum audit_state *state) 456 enum audit_state *state,
457 bool task_creation)
448{ 458{
449 const struct cred *cred = get_task_cred(tsk); 459 const struct cred *cred;
450 int i, j, need_sid = 1; 460 int i, j, need_sid = 1;
451 u32 sid; 461 u32 sid;
452 462
463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
464
453 for (i = 0; i < rule->field_count; i++) { 465 for (i = 0; i < rule->field_count; i++) {
454 struct audit_field *f = &rule->fields[i]; 466 struct audit_field *f = &rule->fields[i];
455 int result = 0; 467 int result = 0;
@@ -633,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
633 break; 645 break;
634 } 646 }
635 647
636 if (!result) { 648 if (!result)
637 put_cred(cred);
638 return 0; 649 return 0;
639 }
640 } 650 }
641 651
642 if (ctx) { 652 if (ctx) {
@@ -652,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
652 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 662 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
653 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 663 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
654 } 664 }
655 put_cred(cred);
656 return 1; 665 return 1;
657} 666}
658 667
@@ -667,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
667 676
668 rcu_read_lock(); 677 rcu_read_lock();
669 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 678 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
670 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { 679 if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
680 &state, true)) {
671 if (state == AUDIT_RECORD_CONTEXT) 681 if (state == AUDIT_RECORD_CONTEXT)
672 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); 682 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
673 rcu_read_unlock(); 683 rcu_read_unlock();
@@ -701,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
701 list_for_each_entry_rcu(e, list, list) { 711 list_for_each_entry_rcu(e, list, list) {
702 if ((e->rule.mask[word] & bit) == bit && 712 if ((e->rule.mask[word] & bit) == bit &&
703 audit_filter_rules(tsk, &e->rule, ctx, NULL, 713 audit_filter_rules(tsk, &e->rule, ctx, NULL,
704 &state)) { 714 &state, false)) {
705 rcu_read_unlock(); 715 rcu_read_unlock();
706 ctx->current_state = state; 716 ctx->current_state = state;
707 return state; 717 return state;
@@ -739,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
739 749
740 list_for_each_entry_rcu(e, list, list) { 750 list_for_each_entry_rcu(e, list, list) {
741 if ((e->rule.mask[word] & bit) == bit && 751 if ((e->rule.mask[word] & bit) == bit &&
742 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { 752 audit_filter_rules(tsk, &e->rule, ctx, n,
753 &state, false)) {
743 rcu_read_unlock(); 754 rcu_read_unlock();
744 ctx->current_state = state; 755 ctx->current_state = state;
745 return; 756 return;
@@ -1007,7 +1018,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1007/* 1018/*
1008 * to_send and len_sent accounting are very loose estimates. We aren't 1019 * to_send and len_sent accounting are very loose estimates. We aren't
1009 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being 1020 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
1010 * within about 500 bytes (next page boundry) 1021 * within about 500 bytes (next page boundary)
1011 * 1022 *
1012 * why snprintf? an int is up to 12 digits long. if we just assumed when 1023 * why snprintf? an int is up to 12 digits long. if we just assumed when
1013 * logging that a[%d]= was going to be 16 characters long we would be wasting 1024 * logging that a[%d]= was going to be 16 characters long we would be wasting
@@ -1305,6 +1316,10 @@ static void show_special(struct audit_context *context, int *call_panic)
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); 1316 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); 1317 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; } 1318 break; }
1319 case AUDIT_MMAP: {
1320 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
1321 context->mmap.flags);
1322 break; }
1308 } 1323 }
1309 audit_log_end(ab); 1324 audit_log_end(ab);
1310} 1325}
@@ -2476,6 +2491,14 @@ void __audit_log_capset(pid_t pid,
2476 context->type = AUDIT_CAPSET; 2491 context->type = AUDIT_CAPSET;
2477} 2492}
2478 2493
2494void __audit_mmap_fd(int fd, int flags)
2495{
2496 struct audit_context *context = current->audit_context;
2497 context->mmap.fd = fd;
2498 context->mmap.flags = flags;
2499 context->type = AUDIT_MMAP;
2500}
2501
2479/** 2502/**
2480 * audit_core_dumps - record information about processes that end abnormally 2503 * audit_core_dumps - record information about processes that end abnormally
2481 * @signr: signal value 2504 * @signr: signal value
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
9#include <linux/page-flags.h> 9#include <linux/page-flags.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h>
12 13
13void foo(void) 14void foo(void)
14{ 15{
15 /* The enum constants to put into include/generated/bounds.h */ 16 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
18 /* End of constants */ 20 /* End of constants */
19} 21}
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <linux/user_namespace.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18 19
19/* 20/*
@@ -21,12 +22,8 @@
21 */ 22 */
22 23
23const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 24const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
24const kernel_cap_t __cap_full_set = CAP_FULL_SET;
25const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
26 25
27EXPORT_SYMBOL(__cap_empty_set); 26EXPORT_SYMBOL(__cap_empty_set);
28EXPORT_SYMBOL(__cap_full_set);
29EXPORT_SYMBOL(__cap_init_eff_set);
30 27
31int file_caps_enabled = 1; 28int file_caps_enabled = 1;
32 29
@@ -290,6 +287,60 @@ error:
290} 287}
291 288
292/** 289/**
290 * has_capability - Does a task have a capability in init_user_ns
291 * @t: The task in question
292 * @cap: The capability to be tested for
293 *
294 * Return true if the specified task has the given superior capability
295 * currently in effect to the initial user namespace, false if not.
296 *
297 * Note that this does not set PF_SUPERPRIV on the task.
298 */
299bool has_capability(struct task_struct *t, int cap)
300{
301 int ret = security_real_capable(t, &init_user_ns, cap);
302
303 return (ret == 0);
304}
305
306/**
307 * has_capability - Does a task have a capability in a specific user ns
308 * @t: The task in question
309 * @ns: target user namespace
310 * @cap: The capability to be tested for
311 *
312 * Return true if the specified task has the given superior capability
313 * currently in effect to the specified user namespace, false if not.
314 *
315 * Note that this does not set PF_SUPERPRIV on the task.
316 */
317bool has_ns_capability(struct task_struct *t,
318 struct user_namespace *ns, int cap)
319{
320 int ret = security_real_capable(t, ns, cap);
321
322 return (ret == 0);
323}
324
325/**
326 * has_capability_noaudit - Does a task have a capability (unaudited)
327 * @t: The task in question
328 * @cap: The capability to be tested for
329 *
330 * Return true if the specified task has the given superior capability
331 * currently in effect to init_user_ns, false if not. Don't write an
332 * audit message for the check.
333 *
334 * Note that this does not set PF_SUPERPRIV on the task.
335 */
336bool has_capability_noaudit(struct task_struct *t, int cap)
337{
338 int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
339
340 return (ret == 0);
341}
342
343/**
293 * capable - Determine if the current task has a superior capability in effect 344 * capable - Determine if the current task has a superior capability in effect
294 * @cap: The capability to be tested for 345 * @cap: The capability to be tested for
295 * 346 *
@@ -299,17 +350,60 @@ error:
299 * This sets PF_SUPERPRIV on the task if the capability is available on the 350 * This sets PF_SUPERPRIV on the task if the capability is available on the
300 * assumption that it's about to be used. 351 * assumption that it's about to be used.
301 */ 352 */
302int capable(int cap) 353bool capable(int cap)
354{
355 return ns_capable(&init_user_ns, cap);
356}
357EXPORT_SYMBOL(capable);
358
359/**
360 * ns_capable - Determine if the current task has a superior capability in effect
361 * @ns: The usernamespace we want the capability in
362 * @cap: The capability to be tested for
363 *
364 * Return true if the current task has the given superior capability currently
365 * available for use, false if not.
366 *
367 * This sets PF_SUPERPRIV on the task if the capability is available on the
368 * assumption that it's about to be used.
369 */
370bool ns_capable(struct user_namespace *ns, int cap)
303{ 371{
304 if (unlikely(!cap_valid(cap))) { 372 if (unlikely(!cap_valid(cap))) {
305 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); 373 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
306 BUG(); 374 BUG();
307 } 375 }
308 376
309 if (security_capable(cap) == 0) { 377 if (security_capable(ns, current_cred(), cap) == 0) {
310 current->flags |= PF_SUPERPRIV; 378 current->flags |= PF_SUPERPRIV;
311 return 1; 379 return true;
312 } 380 }
313 return 0; 381 return false;
382}
383EXPORT_SYMBOL(ns_capable);
384
385/**
386 * task_ns_capable - Determine whether current task has a superior
387 * capability targeted at a specific task's user namespace.
388 * @t: The task whose user namespace is targeted.
389 * @cap: The capability in question.
390 *
391 * Return true if it does, false otherwise.
392 */
393bool task_ns_capable(struct task_struct *t, int cap)
394{
395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
396}
397EXPORT_SYMBOL(task_ns_capable);
398
399/**
400 * nsown_capable - Check superior capability to one's own user_ns
401 * @cap: The capability in question
402 *
403 * Return true if the current task has the given superior capability
404 * targeted at its own user namespace.
405 */
406bool nsown_capable(int cap)
407{
408 return ns_capable(current_user_ns(), cap);
314} 409}
315EXPORT_SYMBOL(capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8f6140..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,12 +52,12 @@
52#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
53#include <linux/hash.h> 53#include <linux/hash.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/smp_lock.h>
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h> 58#include <linux/eventfd.h>
60#include <linux/poll.h> 59#include <linux/poll.h>
60#include <linux/flex_array.h> /* used in cgroup_attach_proc */
61 61
62#include <asm/atomic.h> 62#include <asm/atomic.h>
63 63
@@ -138,7 +138,7 @@ struct css_id {
138 * is called after synchronize_rcu(). But for safe use, css_is_removed() 138 * is called after synchronize_rcu(). But for safe use, css_is_removed()
139 * css_tryget() should be used for avoiding race. 139 * css_tryget() should be used for avoiding race.
140 */ 140 */
141 struct cgroup_subsys_state *css; 141 struct cgroup_subsys_state __rcu *css;
142 /* 142 /*
143 * ID of this css. 143 * ID of this css.
144 */ 144 */
@@ -158,7 +158,7 @@ struct css_id {
158}; 158};
159 159
160/* 160/*
161 * cgroup_event represents events which userspace want to recieve. 161 * cgroup_event represents events which userspace want to receive.
162 */ 162 */
163struct cgroup_event { 163struct cgroup_event {
164 /* 164 /*
@@ -244,6 +244,11 @@ static int notify_on_release(const struct cgroup *cgrp)
244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
245} 245}
246 246
247static int clone_children(const struct cgroup *cgrp)
248{
249 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
250}
251
247/* 252/*
248 * for_each_subsys() allows you to iterate on each subsystem attached to 253 * for_each_subsys() allows you to iterate on each subsystem attached to
249 * an active hierarchy 254 * an active hierarchy
@@ -322,12 +327,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
322 return &css_set_table[index]; 327 return &css_set_table[index];
323} 328}
324 329
325static void free_css_set_rcu(struct rcu_head *obj)
326{
327 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
328 kfree(cg);
329}
330
331/* We don't maintain the lists running through each css_set to its 330/* We don't maintain the lists running through each css_set to its
332 * task until after the first call to cgroup_iter_start(). This 331 * task until after the first call to cgroup_iter_start(). This
333 * reduces the fork()/exit() overhead for people who have cgroups 332 * reduces the fork()/exit() overhead for people who have cgroups
@@ -371,7 +370,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
371 } 370 }
372 371
373 write_unlock(&css_set_lock); 372 write_unlock(&css_set_lock);
374 call_rcu(&cg->rcu_head, free_css_set_rcu); 373 kfree_rcu(cg, rcu_head);
375} 374}
376 375
377/* 376/*
@@ -760,6 +759,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
760 */ 759 */
761 760
762static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 761static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
762static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
763static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 763static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
764static int cgroup_populate_dir(struct cgroup *cgrp); 764static int cgroup_populate_dir(struct cgroup *cgrp);
765static const struct inode_operations cgroup_dir_inode_operations; 765static const struct inode_operations cgroup_dir_inode_operations;
@@ -778,6 +778,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
778 struct inode *inode = new_inode(sb); 778 struct inode *inode = new_inode(sb);
779 779
780 if (inode) { 780 if (inode) {
781 inode->i_ino = get_next_ino();
781 inode->i_mode = mode; 782 inode->i_mode = mode;
782 inode->i_uid = current_fsuid(); 783 inode->i_uid = current_fsuid();
783 inode->i_gid = current_fsgid(); 784 inode->i_gid = current_fsgid();
@@ -806,13 +807,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
806 return ret; 807 return ret;
807} 808}
808 809
809static void free_cgroup_rcu(struct rcu_head *obj)
810{
811 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
812
813 kfree(cgrp);
814}
815
816static void cgroup_diput(struct dentry *dentry, struct inode *inode) 810static void cgroup_diput(struct dentry *dentry, struct inode *inode)
817{ 811{
818 /* is dentry a directory ? if so, kfree() associated cgroup */ 812 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -850,11 +844,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
850 */ 844 */
851 BUG_ON(!list_empty(&cgrp->pidlists)); 845 BUG_ON(!list_empty(&cgrp->pidlists));
852 846
853 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 847 kfree_rcu(cgrp, rcu_head);
854 } 848 }
855 iput(inode); 849 iput(inode);
856} 850}
857 851
852static int cgroup_delete(const struct dentry *d)
853{
854 return 1;
855}
856
858static void remove_dir(struct dentry *d) 857static void remove_dir(struct dentry *d)
859{ 858{
860 struct dentry *parent = dget(d->d_parent); 859 struct dentry *parent = dget(d->d_parent);
@@ -869,25 +868,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
869 struct list_head *node; 868 struct list_head *node;
870 869
871 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 870 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
872 spin_lock(&dcache_lock); 871 spin_lock(&dentry->d_lock);
873 node = dentry->d_subdirs.next; 872 node = dentry->d_subdirs.next;
874 while (node != &dentry->d_subdirs) { 873 while (node != &dentry->d_subdirs) {
875 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 874 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
875
876 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
876 list_del_init(node); 877 list_del_init(node);
877 if (d->d_inode) { 878 if (d->d_inode) {
878 /* This should never be called on a cgroup 879 /* This should never be called on a cgroup
879 * directory with child cgroups */ 880 * directory with child cgroups */
880 BUG_ON(d->d_inode->i_mode & S_IFDIR); 881 BUG_ON(d->d_inode->i_mode & S_IFDIR);
881 d = dget_locked(d); 882 dget_dlock(d);
882 spin_unlock(&dcache_lock); 883 spin_unlock(&d->d_lock);
884 spin_unlock(&dentry->d_lock);
883 d_delete(d); 885 d_delete(d);
884 simple_unlink(dentry->d_inode, d); 886 simple_unlink(dentry->d_inode, d);
885 dput(d); 887 dput(d);
886 spin_lock(&dcache_lock); 888 spin_lock(&dentry->d_lock);
887 } 889 } else
890 spin_unlock(&d->d_lock);
888 node = dentry->d_subdirs.next; 891 node = dentry->d_subdirs.next;
889 } 892 }
890 spin_unlock(&dcache_lock); 893 spin_unlock(&dentry->d_lock);
891} 894}
892 895
893/* 896/*
@@ -895,11 +898,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
895 */ 898 */
896static void cgroup_d_remove_dir(struct dentry *dentry) 899static void cgroup_d_remove_dir(struct dentry *dentry)
897{ 900{
901 struct dentry *parent;
902
898 cgroup_clear_directory(dentry); 903 cgroup_clear_directory(dentry);
899 904
900 spin_lock(&dcache_lock); 905 parent = dentry->d_parent;
906 spin_lock(&parent->d_lock);
907 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
901 list_del_init(&dentry->d_u.d_child); 908 list_del_init(&dentry->d_u.d_child);
902 spin_unlock(&dcache_lock); 909 spin_unlock(&dentry->d_lock);
910 spin_unlock(&parent->d_lock);
903 remove_dir(dentry); 911 remove_dir(dentry);
904} 912}
905 913
@@ -1040,6 +1048,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1040 seq_puts(seq, ",noprefix"); 1048 seq_puts(seq, ",noprefix");
1041 if (strlen(root->release_agent_path)) 1049 if (strlen(root->release_agent_path))
1042 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1050 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1051 if (clone_children(&root->top_cgroup))
1052 seq_puts(seq, ",clone_children");
1043 if (strlen(root->name)) 1053 if (strlen(root->name))
1044 seq_printf(seq, ",name=%s", root->name); 1054 seq_printf(seq, ",name=%s", root->name);
1045 mutex_unlock(&cgroup_mutex); 1055 mutex_unlock(&cgroup_mutex);
@@ -1050,6 +1060,7 @@ struct cgroup_sb_opts {
1050 unsigned long subsys_bits; 1060 unsigned long subsys_bits;
1051 unsigned long flags; 1061 unsigned long flags;
1052 char *release_agent; 1062 char *release_agent;
1063 bool clone_children;
1053 char *name; 1064 char *name;
1054 /* User explicitly requested empty subsystem */ 1065 /* User explicitly requested empty subsystem */
1055 bool none; 1066 bool none;
@@ -1066,7 +1077,8 @@ struct cgroup_sb_opts {
1066 */ 1077 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1078static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1068{ 1079{
1069 char *token, *o = data ?: "all"; 1080 char *token, *o = data;
1081 bool all_ss = false, one_ss = false;
1070 unsigned long mask = (unsigned long)-1; 1082 unsigned long mask = (unsigned long)-1;
1071 int i; 1083 int i;
1072 bool module_pin_failed = false; 1084 bool module_pin_failed = false;
@@ -1082,22 +1094,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1082 while ((token = strsep(&o, ",")) != NULL) { 1094 while ((token = strsep(&o, ",")) != NULL) {
1083 if (!*token) 1095 if (!*token)
1084 return -EINVAL; 1096 return -EINVAL;
1085 if (!strcmp(token, "all")) { 1097 if (!strcmp(token, "none")) {
1086 /* Add all non-disabled subsystems */
1087 opts->subsys_bits = 0;
1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
1092 if (!ss->disabled)
1093 opts->subsys_bits |= 1ul << i;
1094 }
1095 } else if (!strcmp(token, "none")) {
1096 /* Explicitly have no subsystems */ 1098 /* Explicitly have no subsystems */
1097 opts->none = true; 1099 opts->none = true;
1098 } else if (!strcmp(token, "noprefix")) { 1100 continue;
1101 }
1102 if (!strcmp(token, "all")) {
1103 /* Mutually exclusive option 'all' + subsystem name */
1104 if (one_ss)
1105 return -EINVAL;
1106 all_ss = true;
1107 continue;
1108 }
1109 if (!strcmp(token, "noprefix")) {
1099 set_bit(ROOT_NOPREFIX, &opts->flags); 1110 set_bit(ROOT_NOPREFIX, &opts->flags);
1100 } else if (!strncmp(token, "release_agent=", 14)) { 1111 continue;
1112 }
1113 if (!strcmp(token, "clone_children")) {
1114 opts->clone_children = true;
1115 continue;
1116 }
1117 if (!strncmp(token, "release_agent=", 14)) {
1101 /* Specifying two release agents is forbidden */ 1118 /* Specifying two release agents is forbidden */
1102 if (opts->release_agent) 1119 if (opts->release_agent)
1103 return -EINVAL; 1120 return -EINVAL;
@@ -1105,7 +1122,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1105 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1122 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1106 if (!opts->release_agent) 1123 if (!opts->release_agent)
1107 return -ENOMEM; 1124 return -ENOMEM;
1108 } else if (!strncmp(token, "name=", 5)) { 1125 continue;
1126 }
1127 if (!strncmp(token, "name=", 5)) {
1109 const char *name = token + 5; 1128 const char *name = token + 5;
1110 /* Can't specify an empty name */ 1129 /* Can't specify an empty name */
1111 if (!strlen(name)) 1130 if (!strlen(name))
@@ -1127,20 +1146,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1127 GFP_KERNEL); 1146 GFP_KERNEL);
1128 if (!opts->name) 1147 if (!opts->name)
1129 return -ENOMEM; 1148 return -ENOMEM;
1130 } else { 1149
1131 struct cgroup_subsys *ss; 1150 continue;
1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1151 }
1133 ss = subsys[i]; 1152
1134 if (ss == NULL) 1153 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1135 continue; 1154 struct cgroup_subsys *ss = subsys[i];
1136 if (!strcmp(token, ss->name)) { 1155 if (ss == NULL)
1137 if (!ss->disabled) 1156 continue;
1138 set_bit(i, &opts->subsys_bits); 1157 if (strcmp(token, ss->name))
1139 break; 1158 continue;
1140 } 1159 if (ss->disabled)
1141 } 1160 continue;
1142 if (i == CGROUP_SUBSYS_COUNT) 1161
1143 return -ENOENT; 1162 /* Mutually exclusive option 'all' + subsystem name */
1163 if (all_ss)
1164 return -EINVAL;
1165 set_bit(i, &opts->subsys_bits);
1166 one_ss = true;
1167
1168 break;
1169 }
1170 if (i == CGROUP_SUBSYS_COUNT)
1171 return -ENOENT;
1172 }
1173
1174 /*
1175 * If the 'all' option was specified select all the subsystems,
1176 * otherwise 'all, 'none' and a subsystem name options were not
1177 * specified, let's default to 'all'
1178 */
1179 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1180 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1181 struct cgroup_subsys *ss = subsys[i];
1182 if (ss == NULL)
1183 continue;
1184 if (ss->disabled)
1185 continue;
1186 set_bit(i, &opts->subsys_bits);
1144 } 1187 }
1145 } 1188 }
1146 1189
@@ -1222,7 +1265,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1222 struct cgroup *cgrp = &root->top_cgroup; 1265 struct cgroup *cgrp = &root->top_cgroup;
1223 struct cgroup_sb_opts opts; 1266 struct cgroup_sb_opts opts;
1224 1267
1225 lock_kernel();
1226 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1268 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1227 mutex_lock(&cgroup_mutex); 1269 mutex_lock(&cgroup_mutex);
1228 1270
@@ -1255,7 +1297,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1255 kfree(opts.name); 1297 kfree(opts.name);
1256 mutex_unlock(&cgroup_mutex); 1298 mutex_unlock(&cgroup_mutex);
1257 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1299 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1258 unlock_kernel();
1259 return ret; 1300 return ret;
1260} 1301}
1261 1302
@@ -1357,6 +1398,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1357 strcpy(root->release_agent_path, opts->release_agent); 1398 strcpy(root->release_agent_path, opts->release_agent);
1358 if (opts->name) 1399 if (opts->name)
1359 strcpy(root->name, opts->name); 1400 strcpy(root->name, opts->name);
1401 if (opts->clone_children)
1402 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1360 return root; 1403 return root;
1361} 1404}
1362 1405
@@ -1400,6 +1443,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1400 1443
1401static int cgroup_get_rootdir(struct super_block *sb) 1444static int cgroup_get_rootdir(struct super_block *sb)
1402{ 1445{
1446 static const struct dentry_operations cgroup_dops = {
1447 .d_iput = cgroup_diput,
1448 .d_delete = cgroup_delete,
1449 };
1450
1403 struct inode *inode = 1451 struct inode *inode =
1404 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1452 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1405 struct dentry *dentry; 1453 struct dentry *dentry;
@@ -1417,12 +1465,14 @@ static int cgroup_get_rootdir(struct super_block *sb)
1417 return -ENOMEM; 1465 return -ENOMEM;
1418 } 1466 }
1419 sb->s_root = dentry; 1467 sb->s_root = dentry;
1468 /* for everything else we want ->d_op set */
1469 sb->s_d_op = &cgroup_dops;
1420 return 0; 1470 return 0;
1421} 1471}
1422 1472
1423static int cgroup_get_sb(struct file_system_type *fs_type, 1473static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1424 int flags, const char *unused_dev_name, 1474 int flags, const char *unused_dev_name,
1425 void *data, struct vfsmount *mnt) 1475 void *data)
1426{ 1476{
1427 struct cgroup_sb_opts opts; 1477 struct cgroup_sb_opts opts;
1428 struct cgroupfs_root *root; 1478 struct cgroupfs_root *root;
@@ -1556,10 +1606,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1556 drop_parsed_module_refcounts(opts.subsys_bits); 1606 drop_parsed_module_refcounts(opts.subsys_bits);
1557 } 1607 }
1558 1608
1559 simple_set_mnt(mnt, sb);
1560 kfree(opts.release_agent); 1609 kfree(opts.release_agent);
1561 kfree(opts.name); 1610 kfree(opts.name);
1562 return 0; 1611 return dget(sb->s_root);
1563 1612
1564 drop_new_super: 1613 drop_new_super:
1565 deactivate_locked_super(sb); 1614 deactivate_locked_super(sb);
@@ -1568,8 +1617,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1568 out_err: 1617 out_err:
1569 kfree(opts.release_agent); 1618 kfree(opts.release_agent);
1570 kfree(opts.name); 1619 kfree(opts.name);
1571 1620 return ERR_PTR(ret);
1572 return ret;
1573} 1621}
1574 1622
1575static void cgroup_kill_sb(struct super_block *sb) { 1623static void cgroup_kill_sb(struct super_block *sb) {
@@ -1619,7 +1667,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1619 1667
1620static struct file_system_type cgroup_fs_type = { 1668static struct file_system_type cgroup_fs_type = {
1621 .name = "cgroup", 1669 .name = "cgroup",
1622 .get_sb = cgroup_get_sb, 1670 .mount = cgroup_mount,
1623 .kill_sb = cgroup_kill_sb, 1671 .kill_sb = cgroup_kill_sb,
1624}; 1672};
1625 1673
@@ -1688,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1688} 1736}
1689EXPORT_SYMBOL_GPL(cgroup_path); 1737EXPORT_SYMBOL_GPL(cgroup_path);
1690 1738
1739/*
1740 * cgroup_task_migrate - move a task from one cgroup to another.
1741 *
1742 * 'guarantee' is set if the caller promises that a new css_set for the task
1743 * will already exist. If not set, this function might sleep, and can fail with
1744 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
1745 */
1746static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1747 struct task_struct *tsk, bool guarantee)
1748{
1749 struct css_set *oldcg;
1750 struct css_set *newcg;
1751
1752 /*
1753 * get old css_set. we need to take task_lock and refcount it, because
1754 * an exiting task can change its css_set to init_css_set and drop its
1755 * old one without taking cgroup_mutex.
1756 */
1757 task_lock(tsk);
1758 oldcg = tsk->cgroups;
1759 get_css_set(oldcg);
1760 task_unlock(tsk);
1761
1762 /* locate or allocate a new css_set for this task. */
1763 if (guarantee) {
1764 /* we know the css_set we want already exists. */
1765 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1766 read_lock(&css_set_lock);
1767 newcg = find_existing_css_set(oldcg, cgrp, template);
1768 BUG_ON(!newcg);
1769 get_css_set(newcg);
1770 read_unlock(&css_set_lock);
1771 } else {
1772 might_sleep();
1773 /* find_css_set will give us newcg already referenced. */
1774 newcg = find_css_set(oldcg, cgrp);
1775 if (!newcg) {
1776 put_css_set(oldcg);
1777 return -ENOMEM;
1778 }
1779 }
1780 put_css_set(oldcg);
1781
1782 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1783 task_lock(tsk);
1784 if (tsk->flags & PF_EXITING) {
1785 task_unlock(tsk);
1786 put_css_set(newcg);
1787 return -ESRCH;
1788 }
1789 rcu_assign_pointer(tsk->cgroups, newcg);
1790 task_unlock(tsk);
1791
1792 /* Update the css_set linked lists if we're using them */
1793 write_lock(&css_set_lock);
1794 if (!list_empty(&tsk->cg_list))
1795 list_move(&tsk->cg_list, &newcg->tasks);
1796 write_unlock(&css_set_lock);
1797
1798 /*
1799 * We just gained a reference on oldcg by taking it from the task. As
1800 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1801 * it here; it will be freed under RCU.
1802 */
1803 put_css_set(oldcg);
1804
1805 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1806 return 0;
1807}
1808
1691/** 1809/**
1692 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1810 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1693 * @cgrp: the cgroup the task is attaching to 1811 * @cgrp: the cgroup the task is attaching to
@@ -1698,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1698 */ 1816 */
1699int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1817int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1700{ 1818{
1701 int retval = 0; 1819 int retval;
1702 struct cgroup_subsys *ss, *failed_ss = NULL; 1820 struct cgroup_subsys *ss, *failed_ss = NULL;
1703 struct cgroup *oldcgrp; 1821 struct cgroup *oldcgrp;
1704 struct css_set *cg;
1705 struct css_set *newcg;
1706 struct cgroupfs_root *root = cgrp->root; 1822 struct cgroupfs_root *root = cgrp->root;
1707 1823
1708 /* Nothing to do if the task is already in that cgroup */ 1824 /* Nothing to do if the task is already in that cgroup */
@@ -1712,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1712 1828
1713 for_each_subsys(root, ss) { 1829 for_each_subsys(root, ss) {
1714 if (ss->can_attach) { 1830 if (ss->can_attach) {
1715 retval = ss->can_attach(ss, cgrp, tsk, false); 1831 retval = ss->can_attach(ss, cgrp, tsk);
1716 if (retval) { 1832 if (retval) {
1717 /* 1833 /*
1718 * Remember on which subsystem the can_attach() 1834 * Remember on which subsystem the can_attach()
@@ -1724,48 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1724 goto out; 1840 goto out;
1725 } 1841 }
1726 } 1842 }
1843 if (ss->can_attach_task) {
1844 retval = ss->can_attach_task(cgrp, tsk);
1845 if (retval) {
1846 failed_ss = ss;
1847 goto out;
1848 }
1849 }
1727 } 1850 }
1728 1851
1729 task_lock(tsk); 1852 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1730 cg = tsk->cgroups; 1853 if (retval)
1731 get_css_set(cg);
1732 task_unlock(tsk);
1733 /*
1734 * Locate or allocate a new css_set for this task,
1735 * based on its final set of cgroups
1736 */
1737 newcg = find_css_set(cg, cgrp);
1738 put_css_set(cg);
1739 if (!newcg) {
1740 retval = -ENOMEM;
1741 goto out;
1742 }
1743
1744 task_lock(tsk);
1745 if (tsk->flags & PF_EXITING) {
1746 task_unlock(tsk);
1747 put_css_set(newcg);
1748 retval = -ESRCH;
1749 goto out; 1854 goto out;
1750 }
1751 rcu_assign_pointer(tsk->cgroups, newcg);
1752 task_unlock(tsk);
1753
1754 /* Update the css_set linked lists if we're using them */
1755 write_lock(&css_set_lock);
1756 if (!list_empty(&tsk->cg_list)) {
1757 list_del(&tsk->cg_list);
1758 list_add(&tsk->cg_list, &newcg->tasks);
1759 }
1760 write_unlock(&css_set_lock);
1761 1855
1762 for_each_subsys(root, ss) { 1856 for_each_subsys(root, ss) {
1857 if (ss->pre_attach)
1858 ss->pre_attach(cgrp);
1859 if (ss->attach_task)
1860 ss->attach_task(cgrp, tsk);
1763 if (ss->attach) 1861 if (ss->attach)
1764 ss->attach(ss, cgrp, oldcgrp, tsk, false); 1862 ss->attach(ss, cgrp, oldcgrp, tsk);
1765 } 1863 }
1766 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1864
1767 synchronize_rcu(); 1865 synchronize_rcu();
1768 put_css_set(cg);
1769 1866
1770 /* 1867 /*
1771 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1868 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1784,7 +1881,7 @@ out:
1784 */ 1881 */
1785 break; 1882 break;
1786 if (ss->cancel_attach) 1883 if (ss->cancel_attach)
1787 ss->cancel_attach(ss, cgrp, tsk, false); 1884 ss->cancel_attach(ss, cgrp, tsk);
1788 } 1885 }
1789 } 1886 }
1790 return retval; 1887 return retval;
@@ -1815,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1815EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1912EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1816 1913
1817/* 1914/*
1818 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1915 * cgroup_attach_proc works in two stages, the first of which prefetches all
1819 * held. May take task_lock of task 1916 * new css_sets needed (to make sure we have enough memory before committing
1917 * to the move) and stores them in a list of entries of the following type.
1918 * TODO: possible optimization: use css_set->rcu_head for chaining instead
1919 */
1920struct cg_list_entry {
1921 struct css_set *cg;
1922 struct list_head links;
1923};
1924
1925static bool css_set_check_fetched(struct cgroup *cgrp,
1926 struct task_struct *tsk, struct css_set *cg,
1927 struct list_head *newcg_list)
1928{
1929 struct css_set *newcg;
1930 struct cg_list_entry *cg_entry;
1931 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1932
1933 read_lock(&css_set_lock);
1934 newcg = find_existing_css_set(cg, cgrp, template);
1935 if (newcg)
1936 get_css_set(newcg);
1937 read_unlock(&css_set_lock);
1938
1939 /* doesn't exist at all? */
1940 if (!newcg)
1941 return false;
1942 /* see if it's already in the list */
1943 list_for_each_entry(cg_entry, newcg_list, links) {
1944 if (cg_entry->cg == newcg) {
1945 put_css_set(newcg);
1946 return true;
1947 }
1948 }
1949
1950 /* not found */
1951 put_css_set(newcg);
1952 return false;
1953}
1954
1955/*
1956 * Find the new css_set and store it in the list in preparation for moving the
1957 * given task to the given cgroup. Returns 0 or -ENOMEM.
1958 */
1959static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1960 struct list_head *newcg_list)
1961{
1962 struct css_set *newcg;
1963 struct cg_list_entry *cg_entry;
1964
1965 /* ensure a new css_set will exist for this thread */
1966 newcg = find_css_set(cg, cgrp);
1967 if (!newcg)
1968 return -ENOMEM;
1969 /* add it to the list */
1970 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
1971 if (!cg_entry) {
1972 put_css_set(newcg);
1973 return -ENOMEM;
1974 }
1975 cg_entry->cg = newcg;
1976 list_add(&cg_entry->links, newcg_list);
1977 return 0;
1978}
1979
1980/**
1981 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
1982 * @cgrp: the cgroup to attach to
1983 * @leader: the threadgroup leader task_struct of the group to be attached
1984 *
1985 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
1986 * take task_lock of each thread in leader's threadgroup individually in turn.
1820 */ 1987 */
1821static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) 1988int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1989{
1990 int retval, i, group_size;
1991 struct cgroup_subsys *ss, *failed_ss = NULL;
1992 bool cancel_failed_ss = false;
1993 /* guaranteed to be initialized later, but the compiler needs this */
1994 struct cgroup *oldcgrp = NULL;
1995 struct css_set *oldcg;
1996 struct cgroupfs_root *root = cgrp->root;
1997 /* threadgroup list cursor and array */
1998 struct task_struct *tsk;
1999 struct flex_array *group;
2000 /*
2001 * we need to make sure we have css_sets for all the tasks we're
2002 * going to move -before- we actually start moving them, so that in
2003 * case we get an ENOMEM we can bail out before making any changes.
2004 */
2005 struct list_head newcg_list;
2006 struct cg_list_entry *cg_entry, *temp_nobe;
2007
2008 /*
2009 * step 0: in order to do expensive, possibly blocking operations for
2010 * every thread, we cannot iterate the thread group list, since it needs
2011 * rcu or tasklist locked. instead, build an array of all threads in the
2012 * group - threadgroup_fork_lock prevents new threads from appearing,
2013 * and if threads exit, this will just be an over-estimate.
2014 */
2015 group_size = get_nr_threads(leader);
2016 /* flex_array supports very large thread-groups better than kmalloc. */
2017 group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2018 GFP_KERNEL);
2019 if (!group)
2020 return -ENOMEM;
2021 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2022 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2023 if (retval)
2024 goto out_free_group_list;
2025
2026 /* prevent changes to the threadgroup list while we take a snapshot. */
2027 rcu_read_lock();
2028 if (!thread_group_leader(leader)) {
2029 /*
2030 * a race with de_thread from another thread's exec() may strip
2031 * us of our leadership, making while_each_thread unsafe to use
2032 * on this task. if this happens, there is no choice but to
2033 * throw this task away and try again (from cgroup_procs_write);
2034 * this is "double-double-toil-and-trouble-check locking".
2035 */
2036 rcu_read_unlock();
2037 retval = -EAGAIN;
2038 goto out_free_group_list;
2039 }
2040 /* take a reference on each task in the group to go in the array. */
2041 tsk = leader;
2042 i = 0;
2043 do {
2044 /* as per above, nr_threads may decrease, but not increase. */
2045 BUG_ON(i >= group_size);
2046 get_task_struct(tsk);
2047 /*
2048 * saying GFP_ATOMIC has no effect here because we did prealloc
2049 * earlier, but it's good form to communicate our expectations.
2050 */
2051 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2052 BUG_ON(retval != 0);
2053 i++;
2054 } while_each_thread(leader, tsk);
2055 /* remember the number of threads in the array for later. */
2056 group_size = i;
2057 rcu_read_unlock();
2058
2059 /*
2060 * step 1: check that we can legitimately attach to the cgroup.
2061 */
2062 for_each_subsys(root, ss) {
2063 if (ss->can_attach) {
2064 retval = ss->can_attach(ss, cgrp, leader);
2065 if (retval) {
2066 failed_ss = ss;
2067 goto out_cancel_attach;
2068 }
2069 }
2070 /* a callback to be run on every thread in the threadgroup. */
2071 if (ss->can_attach_task) {
2072 /* run on each task in the threadgroup. */
2073 for (i = 0; i < group_size; i++) {
2074 tsk = flex_array_get_ptr(group, i);
2075 retval = ss->can_attach_task(cgrp, tsk);
2076 if (retval) {
2077 failed_ss = ss;
2078 cancel_failed_ss = true;
2079 goto out_cancel_attach;
2080 }
2081 }
2082 }
2083 }
2084
2085 /*
2086 * step 2: make sure css_sets exist for all threads to be migrated.
2087 * we use find_css_set, which allocates a new one if necessary.
2088 */
2089 INIT_LIST_HEAD(&newcg_list);
2090 for (i = 0; i < group_size; i++) {
2091 tsk = flex_array_get_ptr(group, i);
2092 /* nothing to do if this task is already in the cgroup */
2093 oldcgrp = task_cgroup_from_root(tsk, root);
2094 if (cgrp == oldcgrp)
2095 continue;
2096 /* get old css_set pointer */
2097 task_lock(tsk);
2098 if (tsk->flags & PF_EXITING) {
2099 /* ignore this task if it's going away */
2100 task_unlock(tsk);
2101 continue;
2102 }
2103 oldcg = tsk->cgroups;
2104 get_css_set(oldcg);
2105 task_unlock(tsk);
 2106 /* check whether the css_set we need is already on the list */
2107 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2108 /* was already there, nothing to do. */
2109 put_css_set(oldcg);
2110 } else {
2111 /* we don't already have it. get new one. */
2112 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2113 put_css_set(oldcg);
2114 if (retval)
2115 goto out_list_teardown;
2116 }
2117 }
2118
2119 /*
2120 * step 3: now that we're guaranteed success wrt the css_sets, proceed
2121 * to move all tasks to the new cgroup, calling ss->attach_task for each
2122 * one along the way. there are no failure cases after here, so this is
2123 * the commit point.
2124 */
2125 for_each_subsys(root, ss) {
2126 if (ss->pre_attach)
2127 ss->pre_attach(cgrp);
2128 }
2129 for (i = 0; i < group_size; i++) {
2130 tsk = flex_array_get_ptr(group, i);
2131 /* leave current thread as it is if it's already there */
2132 oldcgrp = task_cgroup_from_root(tsk, root);
2133 if (cgrp == oldcgrp)
2134 continue;
2135 /* attach each task to each subsystem */
2136 for_each_subsys(root, ss) {
2137 if (ss->attach_task)
2138 ss->attach_task(cgrp, tsk);
2139 }
2140 /* if the thread is PF_EXITING, it can just get skipped. */
2141 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2142 BUG_ON(retval != 0 && retval != -ESRCH);
2143 }
2144 /* nothing is sensitive to fork() after this point. */
2145
2146 /*
2147 * step 4: do expensive, non-thread-specific subsystem callbacks.
2148 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2149 * being moved, this call will need to be reworked to communicate that.
2150 */
2151 for_each_subsys(root, ss) {
2152 if (ss->attach)
2153 ss->attach(ss, cgrp, oldcgrp, leader);
2154 }
2155
2156 /*
2157 * step 5: success! and cleanup
2158 */
2159 synchronize_rcu();
2160 cgroup_wakeup_rmdir_waiter(cgrp);
2161 retval = 0;
2162out_list_teardown:
2163 /* clean up the list of prefetched css_sets. */
2164 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2165 list_del(&cg_entry->links);
2166 put_css_set(cg_entry->cg);
2167 kfree(cg_entry);
2168 }
2169out_cancel_attach:
2170 /* same deal as in cgroup_attach_task */
2171 if (retval) {
2172 for_each_subsys(root, ss) {
2173 if (ss == failed_ss) {
2174 if (cancel_failed_ss && ss->cancel_attach)
2175 ss->cancel_attach(ss, cgrp, leader);
2176 break;
2177 }
2178 if (ss->cancel_attach)
2179 ss->cancel_attach(ss, cgrp, leader);
2180 }
2181 }
2182 /* clean up the array of referenced threads in the group. */
2183 for (i = 0; i < group_size; i++) {
2184 tsk = flex_array_get_ptr(group, i);
2185 put_task_struct(tsk);
2186 }
2187out_free_group_list:
2188 flex_array_free(group);
2189 return retval;
2190}
2191
2192/*
2193 * Find the task_struct of the task to attach by vpid and pass it along to the
2194 * function to attach either it or all tasks in its threadgroup. Will take
2195 * cgroup_mutex; may take task_lock of task.
2196 */
2197static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
1822{ 2198{
1823 struct task_struct *tsk; 2199 struct task_struct *tsk;
1824 const struct cred *cred = current_cred(), *tcred; 2200 const struct cred *cred = current_cred(), *tcred;
1825 int ret; 2201 int ret;
1826 2202
2203 if (!cgroup_lock_live_group(cgrp))
2204 return -ENODEV;
2205
1827 if (pid) { 2206 if (pid) {
1828 rcu_read_lock(); 2207 rcu_read_lock();
1829 tsk = find_task_by_vpid(pid); 2208 tsk = find_task_by_vpid(pid);
1830 if (!tsk || tsk->flags & PF_EXITING) { 2209 if (!tsk) {
2210 rcu_read_unlock();
2211 cgroup_unlock();
2212 return -ESRCH;
2213 }
2214 if (threadgroup) {
2215 /*
2216 * RCU protects this access, since tsk was found in the
2217 * tid map. a race with de_thread may cause group_leader
2218 * to stop being the leader, but cgroup_attach_proc will
2219 * detect it later.
2220 */
2221 tsk = tsk->group_leader;
2222 } else if (tsk->flags & PF_EXITING) {
2223 /* optimization for the single-task-only case */
1831 rcu_read_unlock(); 2224 rcu_read_unlock();
2225 cgroup_unlock();
1832 return -ESRCH; 2226 return -ESRCH;
1833 } 2227 }
1834 2228
2229 /*
2230 * even if we're attaching all tasks in the thread group, we
2231 * only need to check permissions on one of them.
2232 */
1835 tcred = __task_cred(tsk); 2233 tcred = __task_cred(tsk);
1836 if (cred->euid && 2234 if (cred->euid &&
1837 cred->euid != tcred->uid && 2235 cred->euid != tcred->uid &&
1838 cred->euid != tcred->suid) { 2236 cred->euid != tcred->suid) {
1839 rcu_read_unlock(); 2237 rcu_read_unlock();
2238 cgroup_unlock();
1840 return -EACCES; 2239 return -EACCES;
1841 } 2240 }
1842 get_task_struct(tsk); 2241 get_task_struct(tsk);
1843 rcu_read_unlock(); 2242 rcu_read_unlock();
1844 } else { 2243 } else {
1845 tsk = current; 2244 if (threadgroup)
2245 tsk = current->group_leader;
2246 else
2247 tsk = current;
1846 get_task_struct(tsk); 2248 get_task_struct(tsk);
1847 } 2249 }
1848 2250
1849 ret = cgroup_attach_task(cgrp, tsk); 2251 if (threadgroup) {
2252 threadgroup_fork_write_lock(tsk);
2253 ret = cgroup_attach_proc(cgrp, tsk);
2254 threadgroup_fork_write_unlock(tsk);
2255 } else {
2256 ret = cgroup_attach_task(cgrp, tsk);
2257 }
1850 put_task_struct(tsk); 2258 put_task_struct(tsk);
2259 cgroup_unlock();
1851 return ret; 2260 return ret;
1852} 2261}
1853 2262
1854static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2263static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1855{ 2264{
2265 return attach_task_by_pid(cgrp, pid, false);
2266}
2267
2268static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2269{
1856 int ret; 2270 int ret;
1857 if (!cgroup_lock_live_group(cgrp)) 2271 do {
1858 return -ENODEV; 2272 /*
1859 ret = attach_task_by_pid(cgrp, pid); 2273 * attach_proc fails with -EAGAIN if threadgroup leadership
1860 cgroup_unlock(); 2274 * changes in the middle of the operation, in which case we need
2275 * to find the task_struct for the new leader and start over.
2276 */
2277 ret = attach_task_by_pid(cgrp, tgid, true);
2278 } while (ret == -EAGAIN);
1861 return ret; 2279 return ret;
1862} 2280}
1863 2281
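One detail worth calling out from the hunk above: cgroup_attach_proc() cannot block while walking the thread group (the walk needs RCU or tasklist_lock held), so it first snapshots the threads into a pre-allocated flex_array under rcu_read_lock() and only then does the blocking per-thread work on that snapshot. A minimal userspace sketch of the same snapshot-then-work-outside-the-lock shape, with a mutex and malloc standing in for RCU and flex_array (all names here are illustrative, not kernel APIs):

/* Sketch: copy a shared list into storage allocated up front while holding
 * the lock, then do the slow/blocking work on the snapshot with the lock
 * dropped.  Mutex + malloc stand in for RCU + flex_array. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NITEMS 8

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int shared_list[NITEMS] = { 1, 2, 3, 4, 5, 6, 7, 8 };

int main(void)
{
	int *snapshot;
	int i, n;

	/* pre-allocate before taking the lock: no allocation on the "read side" */
	snapshot = malloc(NITEMS * sizeof(*snapshot));
	if (!snapshot)
		return 1;

	pthread_mutex_lock(&list_lock);
	for (n = 0; n < NITEMS; n++)
		snapshot[n] = shared_list[n];	/* cheap copies only, no blocking here */
	pthread_mutex_unlock(&list_lock);

	for (i = 0; i < n; i++) {
		usleep(1000);			/* blocking work happens with the lock dropped */
		printf("processed %d\n", snapshot[i]);
	}

	free(snapshot);
	return 0;
}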
@@ -1883,6 +2301,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1883 const char *buffer) 2301 const char *buffer)
1884{ 2302{
1885 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2303 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2304 if (strlen(buffer) >= PATH_MAX)
2305 return -EINVAL;
1886 if (!cgroup_lock_live_group(cgrp)) 2306 if (!cgroup_lock_live_group(cgrp))
1887 return -ENODEV; 2307 return -ENODEV;
1888 strcpy(cgrp->root->release_agent_path, buffer); 2308 strcpy(cgrp->root->release_agent_path, buffer);
@@ -2140,12 +2560,20 @@ static const struct file_operations cgroup_file_operations = {
2140}; 2560};
2141 2561
2142static const struct inode_operations cgroup_dir_inode_operations = { 2562static const struct inode_operations cgroup_dir_inode_operations = {
2143 .lookup = simple_lookup, 2563 .lookup = cgroup_lookup,
2144 .mkdir = cgroup_mkdir, 2564 .mkdir = cgroup_mkdir,
2145 .rmdir = cgroup_rmdir, 2565 .rmdir = cgroup_rmdir,
2146 .rename = cgroup_rename, 2566 .rename = cgroup_rename,
2147}; 2567};
2148 2568
2569static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2570{
2571 if (dentry->d_name.len > NAME_MAX)
2572 return ERR_PTR(-ENAMETOOLONG);
2573 d_add(dentry, NULL);
2574 return NULL;
2575}
2576
2149/* 2577/*
2150 * Check if a file is a control file 2578 * Check if a file is a control file
2151 */ 2579 */
@@ -2159,10 +2587,6 @@ static inline struct cftype *__file_cft(struct file *file)
2159static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2587static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2160 struct super_block *sb) 2588 struct super_block *sb)
2161{ 2589{
2162 static const struct dentry_operations cgroup_dops = {
2163 .d_iput = cgroup_diput,
2164 };
2165
2166 struct inode *inode; 2590 struct inode *inode;
2167 2591
2168 if (!dentry) 2592 if (!dentry)
@@ -2188,7 +2612,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2188 inode->i_size = 0; 2612 inode->i_size = 0;
2189 inode->i_fop = &cgroup_file_operations; 2613 inode->i_fop = &cgroup_file_operations;
2190 } 2614 }
2191 dentry->d_op = &cgroup_dops;
2192 d_instantiate(dentry, inode); 2615 d_instantiate(dentry, inode);
2193 dget(dentry); /* Extra count - pin the dentry in core */ 2616 dget(dentry); /* Extra count - pin the dentry in core */
2194 return 0; 2617 return 0;
@@ -3176,6 +3599,23 @@ fail:
3176 return ret; 3599 return ret;
3177} 3600}
3178 3601
3602static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3603 struct cftype *cft)
3604{
3605 return clone_children(cgrp);
3606}
3607
3608static int cgroup_clone_children_write(struct cgroup *cgrp,
3609 struct cftype *cft,
3610 u64 val)
3611{
3612 if (val)
3613 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3614 else
3615 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3616 return 0;
3617}
3618
3179/* 3619/*
3180 * for the common functions, 'private' gives the type of file 3620 * for the common functions, 'private' gives the type of file
3181 */ 3621 */
@@ -3192,9 +3632,9 @@ static struct cftype files[] = {
3192 { 3632 {
3193 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 3633 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3194 .open = cgroup_procs_open, 3634 .open = cgroup_procs_open,
3195 /* .write_u64 = cgroup_procs_write, TODO */ 3635 .write_u64 = cgroup_procs_write,
3196 .release = cgroup_pidlist_release, 3636 .release = cgroup_pidlist_release,
3197 .mode = S_IRUGO, 3637 .mode = S_IRUGO | S_IWUSR,
3198 }, 3638 },
3199 { 3639 {
3200 .name = "notify_on_release", 3640 .name = "notify_on_release",
@@ -3206,6 +3646,11 @@ static struct cftype files[] = {
3206 .write_string = cgroup_write_event_control, 3646 .write_string = cgroup_write_event_control,
3207 .mode = S_IWUGO, 3647 .mode = S_IWUGO,
3208 }, 3648 },
3649 {
3650 .name = "cgroup.clone_children",
3651 .read_u64 = cgroup_clone_children_read,
3652 .write_u64 = cgroup_clone_children_write,
3653 },
3209}; 3654};
3210 3655
3211static struct cftype cft_release_agent = { 3656static struct cftype cft_release_agent = {
@@ -3335,6 +3780,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3335 if (notify_on_release(parent)) 3780 if (notify_on_release(parent))
3336 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3781 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3337 3782
3783 if (clone_children(parent))
3784 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3785
3338 for_each_subsys(root, ss) { 3786 for_each_subsys(root, ss) {
3339 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3787 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3340 3788
@@ -3349,6 +3797,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3349 goto err_destroy; 3797 goto err_destroy;
3350 } 3798 }
3351 /* At error, ->destroy() callback has to free assigned ID. */ 3799 /* At error, ->destroy() callback has to free assigned ID. */
3800 if (clone_children(parent) && ss->post_clone)
3801 ss->post_clone(ss, cgrp);
3352 } 3802 }
3353 3803
3354 cgroup_lock_hierarchy(root); 3804 cgroup_lock_hierarchy(root);
@@ -3563,17 +4013,15 @@ again:
3563 spin_lock(&release_list_lock); 4013 spin_lock(&release_list_lock);
3564 set_bit(CGRP_REMOVED, &cgrp->flags); 4014 set_bit(CGRP_REMOVED, &cgrp->flags);
3565 if (!list_empty(&cgrp->release_list)) 4015 if (!list_empty(&cgrp->release_list))
3566 list_del(&cgrp->release_list); 4016 list_del_init(&cgrp->release_list);
3567 spin_unlock(&release_list_lock); 4017 spin_unlock(&release_list_lock);
3568 4018
3569 cgroup_lock_hierarchy(cgrp->root); 4019 cgroup_lock_hierarchy(cgrp->root);
3570 /* delete this cgroup from parent->children */ 4020 /* delete this cgroup from parent->children */
3571 list_del(&cgrp->sibling); 4021 list_del_init(&cgrp->sibling);
3572 cgroup_unlock_hierarchy(cgrp->root); 4022 cgroup_unlock_hierarchy(cgrp->root);
3573 4023
3574 spin_lock(&cgrp->dentry->d_lock);
3575 d = dget(cgrp->dentry); 4024 d = dget(cgrp->dentry);
3576 spin_unlock(&d->d_lock);
3577 4025
3578 cgroup_d_remove_dir(d); 4026 cgroup_d_remove_dir(d);
3579 dput(d); 4027 dput(d);
@@ -3789,7 +4237,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
3789 subsys[ss->subsys_id] = NULL; 4237 subsys[ss->subsys_id] = NULL;
3790 4238
3791 /* remove subsystem from rootnode's list of subsystems */ 4239 /* remove subsystem from rootnode's list of subsystems */
3792 list_del(&ss->sibling); 4240 list_del_init(&ss->sibling);
3793 4241
3794 /* 4242 /*
3795 * disentangle the css from all css_sets attached to the dummytop. as 4243 * disentangle the css from all css_sets attached to the dummytop. as
@@ -4140,20 +4588,8 @@ void cgroup_post_fork(struct task_struct *child)
4140 */ 4588 */
4141void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4589void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4142{ 4590{
4143 int i;
4144 struct css_set *cg; 4591 struct css_set *cg;
4145 4592 int i;
4146 if (run_callbacks && need_forkexit_callback) {
4147 /*
4148 * modular subsystems can't use callbacks, so no need to lock
4149 * the subsys array
4150 */
4151 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4152 struct cgroup_subsys *ss = subsys[i];
4153 if (ss->exit)
4154 ss->exit(ss, tsk);
4155 }
4156 }
4157 4593
4158 /* 4594 /*
4159 * Unlink from the css_set task list if necessary. 4595 * Unlink from the css_set task list if necessary.
@@ -4163,7 +4599,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4163 if (!list_empty(&tsk->cg_list)) { 4599 if (!list_empty(&tsk->cg_list)) {
4164 write_lock(&css_set_lock); 4600 write_lock(&css_set_lock);
4165 if (!list_empty(&tsk->cg_list)) 4601 if (!list_empty(&tsk->cg_list))
4166 list_del(&tsk->cg_list); 4602 list_del_init(&tsk->cg_list);
4167 write_unlock(&css_set_lock); 4603 write_unlock(&css_set_lock);
4168 } 4604 }
4169 4605
@@ -4171,125 +4607,26 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4171 task_lock(tsk); 4607 task_lock(tsk);
4172 cg = tsk->cgroups; 4608 cg = tsk->cgroups;
4173 tsk->cgroups = &init_css_set; 4609 tsk->cgroups = &init_css_set;
4174 task_unlock(tsk);
4175 if (cg)
4176 put_css_set_taskexit(cg);
4177}
4178
4179/**
4180 * cgroup_clone - clone the cgroup the given subsystem is attached to
4181 * @tsk: the task to be moved
4182 * @subsys: the given subsystem
4183 * @nodename: the name for the new cgroup
4184 *
4185 * Duplicate the current cgroup in the hierarchy that the given
4186 * subsystem is attached to, and move this task into the new
4187 * child.
4188 */
4189int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
4190 char *nodename)
4191{
4192 struct dentry *dentry;
4193 int ret = 0;
4194 struct cgroup *parent, *child;
4195 struct inode *inode;
4196 struct css_set *cg;
4197 struct cgroupfs_root *root;
4198 struct cgroup_subsys *ss;
4199
4200 /* We shouldn't be called by an unregistered subsystem */
4201 BUG_ON(!subsys->active);
4202
4203 /* First figure out what hierarchy and cgroup we're dealing
4204 * with, and pin them so we can drop cgroup_mutex */
4205 mutex_lock(&cgroup_mutex);
4206 again:
4207 root = subsys->root;
4208 if (root == &rootnode) {
4209 mutex_unlock(&cgroup_mutex);
4210 return 0;
4211 }
4212 4610
4213 /* Pin the hierarchy */ 4611 if (run_callbacks && need_forkexit_callback) {
4214 if (!atomic_inc_not_zero(&root->sb->s_active)) { 4612 /*
4215 /* We race with the final deactivate_super() */ 4613 * modular subsystems can't use callbacks, so no need to lock
4216 mutex_unlock(&cgroup_mutex); 4614 * the subsys array
4217 return 0; 4615 */
4616 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4617 struct cgroup_subsys *ss = subsys[i];
4618 if (ss->exit) {
4619 struct cgroup *old_cgrp =
4620 rcu_dereference_raw(cg->subsys[i])->cgroup;
4621 struct cgroup *cgrp = task_cgroup(tsk, i);
4622 ss->exit(ss, cgrp, old_cgrp, tsk);
4623 }
4624 }
4218 } 4625 }
4219
4220 /* Keep the cgroup alive */
4221 task_lock(tsk);
4222 parent = task_cgroup(tsk, subsys->subsys_id);
4223 cg = tsk->cgroups;
4224 get_css_set(cg);
4225 task_unlock(tsk); 4626 task_unlock(tsk);
4226 4627
4227 mutex_unlock(&cgroup_mutex); 4628 if (cg)
4228 4629 put_css_set_taskexit(cg);
4229 /* Now do the VFS work to create a cgroup */
4230 inode = parent->dentry->d_inode;
4231
4232 /* Hold the parent directory mutex across this operation to
4233 * stop anyone else deleting the new cgroup */
4234 mutex_lock(&inode->i_mutex);
4235 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
4236 if (IS_ERR(dentry)) {
4237 printk(KERN_INFO
4238 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
4239 PTR_ERR(dentry));
4240 ret = PTR_ERR(dentry);
4241 goto out_release;
4242 }
4243
4244 /* Create the cgroup directory, which also creates the cgroup */
4245 ret = vfs_mkdir(inode, dentry, 0755);
4246 child = __d_cgrp(dentry);
4247 dput(dentry);
4248 if (ret) {
4249 printk(KERN_INFO
4250 "Failed to create cgroup %s: %d\n", nodename,
4251 ret);
4252 goto out_release;
4253 }
4254
4255 /* The cgroup now exists. Retake cgroup_mutex and check
4256 * that we're still in the same state that we thought we
4257 * were. */
4258 mutex_lock(&cgroup_mutex);
4259 if ((root != subsys->root) ||
4260 (parent != task_cgroup(tsk, subsys->subsys_id))) {
4261 /* Aargh, we raced ... */
4262 mutex_unlock(&inode->i_mutex);
4263 put_css_set(cg);
4264
4265 deactivate_super(root->sb);
4266 /* The cgroup is still accessible in the VFS, but
4267 * we're not going to try to rmdir() it at this
4268 * point. */
4269 printk(KERN_INFO
4270 "Race in cgroup_clone() - leaking cgroup %s\n",
4271 nodename);
4272 goto again;
4273 }
4274
4275 /* do any required auto-setup */
4276 for_each_subsys(root, ss) {
4277 if (ss->post_clone)
4278 ss->post_clone(ss, child);
4279 }
4280
4281 /* All seems fine. Finish by moving the task into the new cgroup */
4282 ret = cgroup_attach_task(child, tsk);
4283 mutex_unlock(&cgroup_mutex);
4284
4285 out_release:
4286 mutex_unlock(&inode->i_mutex);
4287
4288 mutex_lock(&cgroup_mutex);
4289 put_css_set(cg);
4290 mutex_unlock(&cgroup_mutex);
4291 deactivate_super(root->sb);
4292 return ret;
4293} 4630}
4294 4631
4295/** 4632/**
@@ -4530,14 +4867,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
4530 return ret; 4867 return ret;
4531} 4868}
4532 4869
4533static void __free_css_id_cb(struct rcu_head *head)
4534{
4535 struct css_id *id;
4536
4537 id = container_of(head, struct css_id, rcu_head);
4538 kfree(id);
4539}
4540
4541void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4870void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4542{ 4871{
4543 struct css_id *id = css->id; 4872 struct css_id *id = css->id;
@@ -4552,7 +4881,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4552 spin_lock(&ss->id_lock); 4881 spin_lock(&ss->id_lock);
4553 idr_remove(&ss->idr, id->id); 4882 idr_remove(&ss->idr, id->id);
4554 spin_unlock(&ss->id_lock); 4883 spin_unlock(&ss->id_lock);
4555 call_rcu(&id->rcu_head, __free_css_id_cb); 4884 kfree_rcu(id, rcu_head);
4556} 4885}
4557EXPORT_SYMBOL_GPL(free_css_id); 4886EXPORT_SYMBOL_GPL(free_css_id);
4558 4887
@@ -4723,6 +5052,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
4723 return ret; 5052 return ret;
4724} 5053}
4725 5054
5055/*
5056 * get corresponding css from file open on cgroupfs directory
5057 */
5058struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5059{
5060 struct cgroup *cgrp;
5061 struct inode *inode;
5062 struct cgroup_subsys_state *css;
5063
5064 inode = f->f_dentry->d_inode;
5065 /* check in cgroup filesystem dir */
5066 if (inode->i_op != &cgroup_dir_inode_operations)
5067 return ERR_PTR(-EBADF);
5068
5069 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
5070 return ERR_PTR(-EINVAL);
5071
5072 /* get cgroup */
5073 cgrp = __d_cgrp(f->f_dentry);
5074 css = cgrp->subsys[id];
5075 return css ? css : ERR_PTR(-ENOENT);
5076}
5077
4726#ifdef CONFIG_CGROUP_DEBUG 5078#ifdef CONFIG_CGROUP_DEBUG
4727static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 5079static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4728 struct cgroup *cont) 5080 struct cgroup *cont)
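The bulk of the kernel/cgroup.c changes above introduce cgroup_attach_proc(), which splits moving a whole thread group into a prefetch phase (allocate every css_set that might be needed, so any -ENOMEM happens before shared state is touched) and a commit phase with no failure paths. A minimal userspace sketch of that prefetch-then-commit shape, using purely illustrative names rather than kernel APIs:

/* Sketch of the prefetch-then-commit pattern: allocate everything that could
 * fail up front, keep it on a list, and only then mutate state.  None of
 * these names are kernel APIs. */
#include <stdio.h>
#include <stdlib.h>

struct prefetched {
	int payload;			/* stands in for a css_set */
	struct prefetched *next;
};

static int prefetch_one(struct prefetched **list, int payload)
{
	struct prefetched *p = malloc(sizeof(*p));
	if (!p)
		return -1;		/* like -ENOMEM: nothing has changed yet */
	p->payload = payload;
	p->next = *list;
	*list = p;
	return 0;
}

static void commit_all(struct prefetched *list)
{
	/* commit point: no allocation, no failure paths from here on */
	for (; list; list = list->next)
		printf("committing %d\n", list->payload);
}

static void teardown(struct prefetched *list)
{
	while (list) {
		struct prefetched *next = list->next;
		free(list);
		list = next;
	}
}

int main(void)
{
	struct prefetched *list = NULL;
	int i;

	for (i = 0; i < 4; i++) {
		if (prefetch_one(&list, i)) {	/* bail out before committing anything */
			teardown(list);
			return 1;
		}
	}
	commit_all(list);
	teardown(list);
	return 0;
}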
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51int cgroup_freezing_or_frozen(struct task_struct *task) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
52{ 52{
53 struct freezer *freezer; 53 enum freezer_state state = task_freezer(task)->state;
54 enum freezer_state state; 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
55}
55 56
57int cgroup_freezing_or_frozen(struct task_struct *task)
58{
59 int result;
56 task_lock(task); 60 task_lock(task);
57 freezer = task_freezer(task); 61 result = __cgroup_freezing_or_frozen(task);
58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
62 task_unlock(task); 62 task_unlock(task);
63 63 return result;
64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
65} 64}
66 65
67/* 66/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
154 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
155} 154}
156 155
157/* Task is frozen or will freeze immediately when next it gets woken */
158static bool is_task_frozen_enough(struct task_struct *task)
159{
160 return frozen(task) ||
161 (task_is_stopped_or_traced(task) && freezing(task));
162}
163
164/* 156/*
165 * The call to cgroup_lock() in the freezer.state write method prevents 157 * The call to cgroup_lock() in the freezer.state write method prevents
166 * a write to that file racing against an attach, and hence the 158 * a write to that file racing against an attach, and hence the
@@ -168,37 +160,29 @@ static bool is_task_frozen_enough(struct task_struct *task)
168 */ 160 */
169static int freezer_can_attach(struct cgroup_subsys *ss, 161static int freezer_can_attach(struct cgroup_subsys *ss,
170 struct cgroup *new_cgroup, 162 struct cgroup *new_cgroup,
171 struct task_struct *task, bool threadgroup) 163 struct task_struct *task)
172{ 164{
173 struct freezer *freezer; 165 struct freezer *freezer;
174 166
175 /* 167 /*
176 * Anything frozen can't move or be moved to/from. 168 * Anything frozen can't move or be moved to/from.
177 *
178 * Since orig_freezer->state == FROZEN means that @task has been
179 * frozen, so it's sufficient to check the latter condition.
180 */ 169 */
181 170
182 if (is_task_frozen_enough(task))
183 return -EBUSY;
184
185 freezer = cgroup_freezer(new_cgroup); 171 freezer = cgroup_freezer(new_cgroup);
186 if (freezer->state == CGROUP_FROZEN) 172 if (freezer->state != CGROUP_THAWED)
187 return -EBUSY; 173 return -EBUSY;
188 174
189 if (threadgroup) { 175 return 0;
190 struct task_struct *c; 176}
191 177
192 rcu_read_lock(); 178static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
193 list_for_each_entry_rcu(c, &task->thread_group, thread_group) { 179{
194 if (is_task_frozen_enough(c)) { 180 rcu_read_lock();
195 rcu_read_unlock(); 181 if (__cgroup_freezing_or_frozen(tsk)) {
196 return -EBUSY;
197 }
198 }
199 rcu_read_unlock(); 182 rcu_read_unlock();
183 return -EBUSY;
200 } 184 }
201 185 rcu_read_unlock();
202 return 0; 186 return 0;
203} 187}
204 188
@@ -236,31 +220,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
236/* 220/*
237 * caller must hold freezer->lock 221 * caller must hold freezer->lock
238 */ 222 */
239static void update_freezer_state(struct cgroup *cgroup, 223static void update_if_frozen(struct cgroup *cgroup,
240 struct freezer *freezer) 224 struct freezer *freezer)
241{ 225{
242 struct cgroup_iter it; 226 struct cgroup_iter it;
243 struct task_struct *task; 227 struct task_struct *task;
244 unsigned int nfrozen = 0, ntotal = 0; 228 unsigned int nfrozen = 0, ntotal = 0;
229 enum freezer_state old_state = freezer->state;
245 230
246 cgroup_iter_start(cgroup, &it); 231 cgroup_iter_start(cgroup, &it);
247 while ((task = cgroup_iter_next(cgroup, &it))) { 232 while ((task = cgroup_iter_next(cgroup, &it))) {
248 ntotal++; 233 ntotal++;
249 if (is_task_frozen_enough(task)) 234 if (frozen(task))
250 nfrozen++; 235 nfrozen++;
251 } 236 }
252 237
253 /* 238 if (old_state == CGROUP_THAWED) {
254 * Transition to FROZEN when no new tasks can be added ensures 239 BUG_ON(nfrozen > 0);
255 * that we never exist in the FROZEN state while there are unfrozen 240 } else if (old_state == CGROUP_FREEZING) {
256 * tasks. 241 if (nfrozen == ntotal)
257 */ 242 freezer->state = CGROUP_FROZEN;
258 if (nfrozen == ntotal) 243 } else { /* old_state == CGROUP_FROZEN */
259 freezer->state = CGROUP_FROZEN; 244 BUG_ON(nfrozen != ntotal);
260 else if (nfrozen > 0) 245 }
261 freezer->state = CGROUP_FREEZING; 246
262 else
263 freezer->state = CGROUP_THAWED;
264 cgroup_iter_end(cgroup, &it); 247 cgroup_iter_end(cgroup, &it);
265} 248}
266 249
@@ -279,7 +262,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
279 if (state == CGROUP_FREEZING) { 262 if (state == CGROUP_FREEZING) {
280 /* We change from FREEZING to FROZEN lazily if the cgroup was 263 /* We change from FREEZING to FROZEN lazily if the cgroup was
281 * only partially frozen when we exited write. */ 264 * only partially frozen when we exited write. */
282 update_freezer_state(cgroup, freezer); 265 update_if_frozen(cgroup, freezer);
283 state = freezer->state; 266 state = freezer->state;
284 } 267 }
285 spin_unlock_irq(&freezer->lock); 268 spin_unlock_irq(&freezer->lock);
@@ -301,7 +284,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
301 while ((task = cgroup_iter_next(cgroup, &it))) { 284 while ((task = cgroup_iter_next(cgroup, &it))) {
302 if (!freeze_task(task, true)) 285 if (!freeze_task(task, true))
303 continue; 286 continue;
304 if (is_task_frozen_enough(task)) 287 if (frozen(task))
305 continue; 288 continue;
306 if (!freezing(task) && !freezer_should_skip(task)) 289 if (!freezing(task) && !freezer_should_skip(task))
307 num_cant_freeze_now++; 290 num_cant_freeze_now++;
@@ -335,7 +318,7 @@ static int freezer_change_state(struct cgroup *cgroup,
335 318
336 spin_lock_irq(&freezer->lock); 319 spin_lock_irq(&freezer->lock);
337 320
338 update_freezer_state(cgroup, freezer); 321 update_if_frozen(cgroup, freezer);
339 if (goal_state == freezer->state) 322 if (goal_state == freezer->state)
340 goto out; 323 goto out;
341 324
@@ -398,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
398 .populate = freezer_populate, 381 .populate = freezer_populate,
399 .subsys_id = freezer_subsys_id, 382 .subsys_id = freezer_subsys_id,
400 .can_attach = freezer_can_attach, 383 .can_attach = freezer_can_attach,
384 .can_attach_task = freezer_can_attach_task,
385 .pre_attach = NULL,
386 .attach_task = NULL,
401 .attach = NULL, 387 .attach = NULL,
402 .fork = freezer_fork, 388 .fork = freezer_fork,
403 .exit = NULL, 389 .exit = NULL,
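The freezer changes replace the threadgroup-aware can_attach() with a per-task can_attach_task() hook: the cgroup core now iterates the threads itself and lets the subsystem veto one task at a time. A rough userspace sketch of that group-check-plus-per-member-veto split (hypothetical names, not the cgroup_subsys API):

/* Sketch: one group-level precondition plus a per-member veto hook,
 * mirroring the can_attach()/can_attach_task() split.  Names are invented. */
#include <stdio.h>
#include <stdbool.h>

struct member { int id; bool frozen; };

static int group_can_attach(bool target_thawed)
{
	/* group-wide precondition, like "target freezer must be THAWED" */
	return target_thawed ? 0 : -1;
}

static int member_can_attach(const struct member *m)
{
	/* per-member veto, like __cgroup_freezing_or_frozen() */
	return m->frozen ? -1 : 0;
}

int main(void)
{
	struct member group[] = { {1, false}, {2, false}, {3, true} };
	int i;

	if (group_can_attach(true)) {
		puts("group check failed");
		return 1;
	}
	for (i = 0; i < 3; i++) {
		if (member_can_attach(&group[i])) {
			printf("member %d vetoed the attach\n", group[i].id);
			return 1;	/* the caller would run cancel_attach here */
		}
	}
	puts("attach allowed");
	return 0;
}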
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a8..fc9eb093acd5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
53} 53}
54 54
55static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
56{
57 memset(txc, 0, sizeof(struct timex));
58
59 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
60 __get_user(txc->modes, &utp->modes) ||
61 __get_user(txc->offset, &utp->offset) ||
62 __get_user(txc->freq, &utp->freq) ||
63 __get_user(txc->maxerror, &utp->maxerror) ||
64 __get_user(txc->esterror, &utp->esterror) ||
65 __get_user(txc->status, &utp->status) ||
66 __get_user(txc->constant, &utp->constant) ||
67 __get_user(txc->precision, &utp->precision) ||
68 __get_user(txc->tolerance, &utp->tolerance) ||
69 __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
70 __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
71 __get_user(txc->tick, &utp->tick) ||
72 __get_user(txc->ppsfreq, &utp->ppsfreq) ||
73 __get_user(txc->jitter, &utp->jitter) ||
74 __get_user(txc->shift, &utp->shift) ||
75 __get_user(txc->stabil, &utp->stabil) ||
76 __get_user(txc->jitcnt, &utp->jitcnt) ||
77 __get_user(txc->calcnt, &utp->calcnt) ||
78 __get_user(txc->errcnt, &utp->errcnt) ||
79 __get_user(txc->stbcnt, &utp->stbcnt))
80 return -EFAULT;
81
82 return 0;
83}
84
85static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
86{
87 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
88 __put_user(txc->modes, &utp->modes) ||
89 __put_user(txc->offset, &utp->offset) ||
90 __put_user(txc->freq, &utp->freq) ||
91 __put_user(txc->maxerror, &utp->maxerror) ||
92 __put_user(txc->esterror, &utp->esterror) ||
93 __put_user(txc->status, &utp->status) ||
94 __put_user(txc->constant, &utp->constant) ||
95 __put_user(txc->precision, &utp->precision) ||
96 __put_user(txc->tolerance, &utp->tolerance) ||
97 __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
98 __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
99 __put_user(txc->tick, &utp->tick) ||
100 __put_user(txc->ppsfreq, &utp->ppsfreq) ||
101 __put_user(txc->jitter, &utp->jitter) ||
102 __put_user(txc->shift, &utp->shift) ||
103 __put_user(txc->stabil, &utp->stabil) ||
104 __put_user(txc->jitcnt, &utp->jitcnt) ||
105 __put_user(txc->calcnt, &utp->calcnt) ||
106 __put_user(txc->errcnt, &utp->errcnt) ||
107 __put_user(txc->stbcnt, &utp->stbcnt) ||
108 __put_user(txc->tai, &utp->tai))
109 return -EFAULT;
110 return 0;
111}
112
55asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, 113asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
56 struct timezone __user *tz) 114 struct timezone __user *tz)
57{ 115{
@@ -235,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
235 return compat_jiffies_to_clock_t(jiffies); 293 return compat_jiffies_to_clock_t(jiffies);
236} 294}
237 295
296#ifdef __ARCH_WANT_SYS_SIGPENDING
297
238/* 298/*
239 * Assumption: old_sigset_t and compat_old_sigset_t are both 299 * Assumption: old_sigset_t and compat_old_sigset_t are both
240 * types that can be passed to put_user()/get_user(). 300 * types that can be passed to put_user()/get_user().
@@ -254,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
254 return ret; 314 return ret;
255} 315}
256 316
317#endif
318
319#ifdef __ARCH_WANT_SYS_SIGPROCMASK
320
257asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, 321asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
258 compat_old_sigset_t __user *oset) 322 compat_old_sigset_t __user *oset)
259{ 323{
@@ -275,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
275 return ret; 339 return ret;
276} 340}
277 341
342#endif
343
278asmlinkage long compat_sys_setrlimit(unsigned int resource, 344asmlinkage long compat_sys_setrlimit(unsigned int resource,
279 struct compat_rlimit __user *rlim) 345 struct compat_rlimit __user *rlim)
280{ 346{
@@ -617,6 +683,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
617 return err; 683 return err;
618} 684}
619 685
686long compat_sys_clock_adjtime(clockid_t which_clock,
687 struct compat_timex __user *utp)
688{
689 struct timex txc;
690 mm_segment_t oldfs;
691 int err, ret;
692
693 err = compat_get_timex(&txc, utp);
694 if (err)
695 return err;
696
697 oldfs = get_fs();
698 set_fs(KERNEL_DS);
699 ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
700 set_fs(oldfs);
701
702 err = compat_put_timex(utp, &txc);
703 if (err)
704 return err;
705
706 return ret;
707}
708
620long compat_sys_clock_getres(clockid_t which_clock, 709long compat_sys_clock_getres(clockid_t which_clock,
621 struct compat_timespec __user *tp) 710 struct compat_timespec __user *tp)
622{ 711{
@@ -809,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
809{ 898{
810 compat_sigset_t s32; 899 compat_sigset_t s32;
811 sigset_t s; 900 sigset_t s;
812 int sig;
813 struct timespec t; 901 struct timespec t;
814 siginfo_t info; 902 siginfo_t info;
815 long ret, timeout = 0; 903 long ret;
816 904
817 if (sigsetsize != sizeof(sigset_t)) 905 if (sigsetsize != sizeof(sigset_t))
818 return -EINVAL; 906 return -EINVAL;
@@ -820,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
820 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) 908 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
821 return -EFAULT; 909 return -EFAULT;
822 sigset_from_compat(&s, &s32); 910 sigset_from_compat(&s, &s32);
823 sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
824 signotset(&s);
825 911
826 if (uts) { 912 if (uts) {
827 if (get_compat_timespec (&t, uts)) 913 if (get_compat_timespec(&t, uts))
828 return -EFAULT; 914 return -EFAULT;
829 if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
830 || t.tv_sec < 0)
831 return -EINVAL;
832 } 915 }
833 916
834 spin_lock_irq(&current->sighand->siglock); 917 ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
835 sig = dequeue_signal(current, &s, &info);
836 if (!sig) {
837 timeout = MAX_SCHEDULE_TIMEOUT;
838 if (uts)
839 timeout = timespec_to_jiffies(&t)
840 +(t.tv_sec || t.tv_nsec);
841 if (timeout) {
842 current->real_blocked = current->blocked;
843 sigandsets(&current->blocked, &current->blocked, &s);
844
845 recalc_sigpending();
846 spin_unlock_irq(&current->sighand->siglock);
847
848 timeout = schedule_timeout_interruptible(timeout);
849
850 spin_lock_irq(&current->sighand->siglock);
851 sig = dequeue_signal(current, &s, &info);
852 current->blocked = current->real_blocked;
853 siginitset(&current->real_blocked, 0);
854 recalc_sigpending();
855 }
856 }
857 spin_unlock_irq(&current->sighand->siglock);
858 918
859 if (sig) { 919 if (ret > 0 && uinfo) {
860 ret = sig; 920 if (copy_siginfo_to_user32(uinfo, &info))
861 if (uinfo) { 921 ret = -EFAULT;
862 if (copy_siginfo_to_user32(uinfo, &info))
863 ret = -EFAULT;
864 }
865 }else {
866 ret = timeout?-EINTR:-EAGAIN;
867 } 922 }
923
868 return ret; 924 return ret;
869 925
870} 926}
@@ -951,58 +1007,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
951asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1007asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
952{ 1008{
953 struct timex txc; 1009 struct timex txc;
954 int ret; 1010 int err, ret;
955
956 memset(&txc, 0, sizeof(struct timex));
957 1011
958 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || 1012 err = compat_get_timex(&txc, utp);
959 __get_user(txc.modes, &utp->modes) || 1013 if (err)
960 __get_user(txc.offset, &utp->offset) || 1014 return err;
961 __get_user(txc.freq, &utp->freq) ||
962 __get_user(txc.maxerror, &utp->maxerror) ||
963 __get_user(txc.esterror, &utp->esterror) ||
964 __get_user(txc.status, &utp->status) ||
965 __get_user(txc.constant, &utp->constant) ||
966 __get_user(txc.precision, &utp->precision) ||
967 __get_user(txc.tolerance, &utp->tolerance) ||
968 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
969 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
970 __get_user(txc.tick, &utp->tick) ||
971 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
972 __get_user(txc.jitter, &utp->jitter) ||
973 __get_user(txc.shift, &utp->shift) ||
974 __get_user(txc.stabil, &utp->stabil) ||
975 __get_user(txc.jitcnt, &utp->jitcnt) ||
976 __get_user(txc.calcnt, &utp->calcnt) ||
977 __get_user(txc.errcnt, &utp->errcnt) ||
978 __get_user(txc.stbcnt, &utp->stbcnt))
979 return -EFAULT;
980 1015
981 ret = do_adjtimex(&txc); 1016 ret = do_adjtimex(&txc);
982 1017
983 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || 1018 err = compat_put_timex(utp, &txc);
984 __put_user(txc.modes, &utp->modes) || 1019 if (err)
985 __put_user(txc.offset, &utp->offset) || 1020 return err;
986 __put_user(txc.freq, &utp->freq) ||
987 __put_user(txc.maxerror, &utp->maxerror) ||
988 __put_user(txc.esterror, &utp->esterror) ||
989 __put_user(txc.status, &utp->status) ||
990 __put_user(txc.constant, &utp->constant) ||
991 __put_user(txc.precision, &utp->precision) ||
992 __put_user(txc.tolerance, &utp->tolerance) ||
993 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
994 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
995 __put_user(txc.tick, &utp->tick) ||
996 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
997 __put_user(txc.jitter, &utp->jitter) ||
998 __put_user(txc.shift, &utp->shift) ||
999 __put_user(txc.stabil, &utp->stabil) ||
1000 __put_user(txc.jitcnt, &utp->jitcnt) ||
1001 __put_user(txc.calcnt, &utp->calcnt) ||
1002 __put_user(txc.errcnt, &utp->errcnt) ||
1003 __put_user(txc.stbcnt, &utp->stbcnt) ||
1004 __put_user(txc.tai, &utp->tai))
1005 ret = -EFAULT;
1006 1021
1007 return ret; 1022 return ret;
1008} 1023}
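In kernel/compat.c the long field-by-field __get_user()/__put_user() sequences are hoisted into compat_get_timex() and compat_put_timex(), which both compat_sys_adjtimex() and the new compat_sys_clock_adjtime() reuse. The same "marshal in, act, marshal out" factoring in plain C, with an invented two-field wire struct rather than the real timex layout:

/* Sketch of hoisting copy-in/copy-out into shared helpers so two entry
 * points share one marshalling routine.  The struct layout is made up. */
#include <stdio.h>
#include <string.h>

struct wire   { int mode; long offset; };	/* "compat" layout (illustrative) */
struct native { int mode; long offset; };	/* native layout */

static int get_native(struct native *n, const struct wire *w)
{
	memset(n, 0, sizeof(*n));
	n->mode = w->mode;
	n->offset = w->offset;
	return 0;		/* a real helper would return -EFAULT on a bad copy */
}

static int put_wire(struct wire *w, const struct native *n)
{
	w->mode = n->mode;
	w->offset = n->offset;
	return 0;
}

static long do_adjust(struct native *n)	/* stands in for do_adjtimex() */
{
	n->offset += 1;
	return 0;
}

static long entry_adjtimex(struct wire *u)	/* both entry points share the helpers */
{
	struct native n;
	long ret;

	if (get_native(&n, u))
		return -1;
	ret = do_adjust(&n);
	if (put_wire(u, &n))
		return -1;
	return ret;
}

int main(void)
{
	struct wire u = { .mode = 1, .offset = 41 };
	long ret = entry_adjtimex(&u);
	printf("ret=%ld offset=%ld\n", ret, u.offset);
	return 0;
}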
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecbf..b4066b44a99d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
66static const struct file_operations ikconfig_file_ops = { 66static const struct file_operations ikconfig_file_ops = {
67 .owner = THIS_MODULE, 67 .owner = THIS_MODULE,
68 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
69 .llseek = default_llseek,
69}; 70};
70 71
71static int __init ikconfig_init(void) 72static int __init ikconfig_init(void)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..12b7458f23b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void)
126#else /* #if CONFIG_HOTPLUG_CPU */ 126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {} 127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {} 128static void cpu_hotplug_done(void) {}
129#endif /* #esle #if CONFIG_HOTPLUG_CPU */ 129#endif /* #else #if CONFIG_HOTPLUG_CPU */
130 130
131/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
132int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
160{ 160{
161 BUG_ON(cpu_notify(val, v)); 161 BUG_ON(cpu_notify(val, v));
162} 162}
163
164EXPORT_SYMBOL(register_cpu_notifier); 163EXPORT_SYMBOL(register_cpu_notifier);
165 164
166void __ref unregister_cpu_notifier(struct notifier_block *nb) 165void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -189,7 +188,6 @@ static inline void check_for_tasks(int cpu)
189} 188}
190 189
191struct take_cpu_down_param { 190struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 191 unsigned long mod;
194 void *hcpu; 192 void *hcpu;
195}; 193};
@@ -198,7 +196,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 196static int __ref take_cpu_down(void *_param)
199{ 197{
200 struct take_cpu_down_param *param = _param; 198 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 199 int err;
203 200
204 /* Ensure this CPU doesn't handle any more interrupts. */ 201 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -207,12 +204,6 @@ static int __ref take_cpu_down(void *_param)
207 return err; 204 return err;
208 205
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 206 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 207 return 0;
217} 208}
218 209
@@ -223,7 +214,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 214 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 215 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 216 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 217 .mod = mod,
228 .hcpu = hcpu, 218 .hcpu = hcpu,
229 }; 219 };
@@ -235,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
235 return -EINVAL; 225 return -EINVAL;
236 226
237 cpu_hotplug_begin(); 227 cpu_hotplug_begin();
228
238 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 229 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
239 if (err) { 230 if (err) {
240 nr_calls--; 231 nr_calls--;
@@ -253,9 +244,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 244 }
254 BUG_ON(cpu_online(cpu)); 245 BUG_ON(cpu_online(cpu));
255 246
256 /* Wait for it to sleep (leaving idle task). */ 247 /*
248 * The migration_call() CPU_DYING callback will have removed all
249 * runnable tasks from the cpu, there's only the idle task left now
250 * that the migration thread is done doing the stop_machine thing.
251 *
252 * Wait for the stop thread to go away.
253 */
257 while (!idle_cpu(cpu)) 254 while (!idle_cpu(cpu))
258 yield(); 255 cpu_relax();
259 256
260 /* This actually kills the CPU. */ 257 /* This actually kills the CPU. */
261 __cpu_die(cpu); 258 __cpu_die(cpu);
@@ -306,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
306 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 303 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
307 if (ret) { 304 if (ret) {
308 nr_calls--; 305 nr_calls--;
309 printk("%s: attempt to bring up CPU %u failed\n", 306 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
310 __func__, cpu); 307 __func__, cpu);
311 goto out_notify; 308 goto out_notify;
312 } 309 }
@@ -386,6 +383,14 @@ out:
386#ifdef CONFIG_PM_SLEEP_SMP 383#ifdef CONFIG_PM_SLEEP_SMP
387static cpumask_var_t frozen_cpus; 384static cpumask_var_t frozen_cpus;
388 385
386void __weak arch_disable_nonboot_cpus_begin(void)
387{
388}
389
390void __weak arch_disable_nonboot_cpus_end(void)
391{
392}
393
389int disable_nonboot_cpus(void) 394int disable_nonboot_cpus(void)
390{ 395{
391 int cpu, first_cpu, error = 0; 396 int cpu, first_cpu, error = 0;
@@ -397,6 +402,7 @@ int disable_nonboot_cpus(void)
397 * with the userspace trying to use the CPU hotplug at the same time 402 * with the userspace trying to use the CPU hotplug at the same time
398 */ 403 */
399 cpumask_clear(frozen_cpus); 404 cpumask_clear(frozen_cpus);
405 arch_disable_nonboot_cpus_begin();
400 406
401 printk("Disabling non-boot CPUs ...\n"); 407 printk("Disabling non-boot CPUs ...\n");
402 for_each_online_cpu(cpu) { 408 for_each_online_cpu(cpu) {
@@ -412,6 +418,8 @@ int disable_nonboot_cpus(void)
412 } 418 }
413 } 419 }
414 420
421 arch_disable_nonboot_cpus_end();
422
415 if (!error) { 423 if (!error) {
416 BUG_ON(num_online_cpus() > 1); 424 BUG_ON(num_online_cpus() > 1);
417 /* Make sure the CPUs won't be enabled by someone else */ 425 /* Make sure the CPUs won't be enabled by someone else */
@@ -441,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
441 if (cpumask_empty(frozen_cpus)) 449 if (cpumask_empty(frozen_cpus))
442 goto out; 450 goto out;
443 451
444 printk("Enabling non-boot CPUs ...\n"); 452 printk(KERN_INFO "Enabling non-boot CPUs ...\n");
445 453
446 arch_enable_nonboot_cpus_begin(); 454 arch_enable_nonboot_cpus_begin();
447 455
448 for_each_cpu(cpu, frozen_cpus) { 456 for_each_cpu(cpu, frozen_cpus) {
449 error = _cpu_up(cpu, 1); 457 error = _cpu_up(cpu, 1);
450 if (!error) { 458 if (!error) {
451 printk("CPU%d is up\n", cpu); 459 printk(KERN_INFO "CPU%d is up\n", cpu);
452 continue; 460 continue;
453 } 461 }
454 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 462 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -500,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
500 */ 508 */
501 509
502/* cpu_bit_bitmap[0] is empty - so we can back into it */ 510/* cpu_bit_bitmap[0] is empty - so we can back into it */
503#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) 511#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
504#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) 512#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
505#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) 513#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
506#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) 514#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
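
[Annotation] The other cpu.c change of note is the pair of __weak arch_disable_nonboot_cpus_begin()/_end() hooks bracketing the suspend-time hot-unplug loop, mirroring the existing arch_enable_nonboot_cpus_begin()/_end() pair used on resume. A minimal sketch of how an architecture could override them; the empty-body weak defaults above are the fallback, and the bodies below are purely illustrative:

/* arch/<arch>/kernel/smp.c -- illustrative only */
void arch_disable_nonboot_cpus_begin(void)
{
	/* e.g. pre-arm a firmware-assisted fast path before CPUs go down */
}

void arch_disable_nonboot_cpus_end(void)
{
	/* undo whatever _begin() prepared once all non-boot CPUs are down */
}
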
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c0979bbe7..9c9b7545c810 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
231 * users. If someone tries to mount the "cpuset" filesystem, we 231 * users. If someone tries to mount the "cpuset" filesystem, we
232 * silently switch it to mount "cgroup" instead 232 * silently switch it to mount "cgroup" instead
233 */ 233 */
234static int cpuset_get_sb(struct file_system_type *fs_type, 234static struct dentry *cpuset_mount(struct file_system_type *fs_type,
235 int flags, const char *unused_dev_name, 235 int flags, const char *unused_dev_name, void *data)
236 void *data, struct vfsmount *mnt)
237{ 236{
238 struct file_system_type *cgroup_fs = get_fs_type("cgroup"); 237 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
239 int ret = -ENODEV; 238 struct dentry *ret = ERR_PTR(-ENODEV);
240 if (cgroup_fs) { 239 if (cgroup_fs) {
241 char mountopts[] = 240 char mountopts[] =
242 "cpuset,noprefix," 241 "cpuset,noprefix,"
243 "release_agent=/sbin/cpuset_release_agent"; 242 "release_agent=/sbin/cpuset_release_agent";
244 ret = cgroup_fs->get_sb(cgroup_fs, flags, 243 ret = cgroup_fs->mount(cgroup_fs, flags,
245 unused_dev_name, mountopts, mnt); 244 unused_dev_name, mountopts);
246 put_filesystem(cgroup_fs); 245 put_filesystem(cgroup_fs);
247 } 246 }
248 return ret; 247 return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
250 249
251static struct file_system_type cpuset_fs_type = { 250static struct file_system_type cpuset_fs_type = {
252 .name = "cpuset", 251 .name = "cpuset",
253 .get_sb = cpuset_get_sb, 252 .mount = cpuset_mount,
254}; 253};
255 254
256/* 255/*
@@ -1016,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
1016 struct cpuset *cs; 1015 struct cpuset *cs;
1017 int migrate; 1016 int migrate;
1018 const nodemask_t *oldmem = scan->data; 1017 const nodemask_t *oldmem = scan->data;
1019 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); 1018 static nodemask_t newmems; /* protected by cgroup_mutex */
1020
1021 if (!newmems)
1022 return;
1023 1019
1024 cs = cgroup_cs(scan->cg); 1020 cs = cgroup_cs(scan->cg);
1025 guarantee_online_mems(cs, newmems); 1021 guarantee_online_mems(cs, &newmems);
1026 1022
1027 cpuset_change_task_nodemask(p, newmems); 1023 cpuset_change_task_nodemask(p, &newmems);
1028
1029 NODEMASK_FREE(newmems);
1030 1024
1031 mm = get_task_mm(p); 1025 mm = get_task_mm(p);
1032 if (!mm) 1026 if (!mm)
@@ -1165,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
1165static int update_relax_domain_level(struct cpuset *cs, s64 val) 1159static int update_relax_domain_level(struct cpuset *cs, s64 val)
1166{ 1160{
1167#ifdef CONFIG_SMP 1161#ifdef CONFIG_SMP
1168 if (val < -1 || val >= SD_LV_MAX) 1162 if (val < -1 || val >= sched_domain_level_max)
1169 return -EINVAL; 1163 return -EINVAL;
1170#endif 1164#endif
1171 1165
@@ -1373,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1373 return val; 1367 return val;
1374} 1368}
1375 1369
1376/* Protected by cgroup_lock */
1377static cpumask_var_t cpus_attach;
1378
1379/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1370/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1380static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1371static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1381 struct task_struct *tsk, bool threadgroup) 1372 struct task_struct *tsk)
1382{ 1373{
1383 int ret;
1384 struct cpuset *cs = cgroup_cs(cont); 1374 struct cpuset *cs = cgroup_cs(cont);
1385 1375
1386 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1397,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1397 if (tsk->flags & PF_THREAD_BOUND) 1387 if (tsk->flags & PF_THREAD_BOUND)
1398 return -EINVAL; 1388 return -EINVAL;
1399 1389
1400 ret = security_task_setscheduler(tsk, 0, NULL);
1401 if (ret)
1402 return ret;
1403 if (threadgroup) {
1404 struct task_struct *c;
1405
1406 rcu_read_lock();
1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1408 ret = security_task_setscheduler(c, 0, NULL);
1409 if (ret) {
1410 rcu_read_unlock();
1411 return ret;
1412 }
1413 }
1414 rcu_read_unlock();
1415 }
1416 return 0; 1390 return 0;
1417} 1391}
1418 1392
1419static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, 1393static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1420 struct cpuset *cs) 1394{
1395 return security_task_setscheduler(task);
1396}
1397
1398/*
1399 * Protected by cgroup_lock. The nodemasks must be stored globally because
1400 * dynamically allocating them is not allowed in pre_attach, and they must
1401 * persist among pre_attach, attach_task, and attach.
1402 */
1403static cpumask_var_t cpus_attach;
1404static nodemask_t cpuset_attach_nodemask_from;
1405static nodemask_t cpuset_attach_nodemask_to;
1406
1407/* Set-up work for before attaching each task. */
1408static void cpuset_pre_attach(struct cgroup *cont)
1409{
1410 struct cpuset *cs = cgroup_cs(cont);
1411
1412 if (cs == &top_cpuset)
1413 cpumask_copy(cpus_attach, cpu_possible_mask);
1414 else
1415 guarantee_online_cpus(cs, cpus_attach);
1416
1417 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1418}
1419
1420/* Per-thread attachment work. */
1421static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1421{ 1422{
1422 int err; 1423 int err;
1424 struct cpuset *cs = cgroup_cs(cont);
1425
1423 /* 1426 /*
1424 * can_attach beforehand should guarantee that this doesn't fail. 1427 * can_attach beforehand should guarantee that this doesn't fail.
1425 * TODO: have a better way to handle failure here 1428 * TODO: have a better way to handle failure here
@@ -1427,56 +1430,31 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1427 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1430 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1428 WARN_ON_ONCE(err); 1431 WARN_ON_ONCE(err);
1429 1432
1430 cpuset_change_task_nodemask(tsk, to); 1433 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1431 cpuset_update_task_spread_flag(cs, tsk); 1434 cpuset_update_task_spread_flag(cs, tsk);
1432
1433} 1435}
1434 1436
1435static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1437static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1436 struct cgroup *oldcont, struct task_struct *tsk, 1438 struct cgroup *oldcont, struct task_struct *tsk)
1437 bool threadgroup)
1438{ 1439{
1439 struct mm_struct *mm; 1440 struct mm_struct *mm;
1440 struct cpuset *cs = cgroup_cs(cont); 1441 struct cpuset *cs = cgroup_cs(cont);
1441 struct cpuset *oldcs = cgroup_cs(oldcont); 1442 struct cpuset *oldcs = cgroup_cs(oldcont);
1442 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1443 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1444
1445 if (from == NULL || to == NULL)
1446 goto alloc_fail;
1447 1443
1448 if (cs == &top_cpuset) { 1444 /*
1449 cpumask_copy(cpus_attach, cpu_possible_mask); 1445 * Change mm, possibly for multiple threads in a threadgroup. This is
1450 } else { 1446 * expensive and may sleep.
1451 guarantee_online_cpus(cs, cpus_attach); 1447 */
1452 } 1448 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1453 guarantee_online_mems(cs, to); 1449 cpuset_attach_nodemask_to = cs->mems_allowed;
1454
1455 /* do per-task migration stuff possibly for each in the threadgroup */
1456 cpuset_attach_task(tsk, to, cs);
1457 if (threadgroup) {
1458 struct task_struct *c;
1459 rcu_read_lock();
1460 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1461 cpuset_attach_task(c, to, cs);
1462 }
1463 rcu_read_unlock();
1464 }
1465
1466 /* change mm; only needs to be done once even if threadgroup */
1467 *from = oldcs->mems_allowed;
1468 *to = cs->mems_allowed;
1469 mm = get_task_mm(tsk); 1450 mm = get_task_mm(tsk);
1470 if (mm) { 1451 if (mm) {
1471 mpol_rebind_mm(mm, to); 1452 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1472 if (is_memory_migrate(cs)) 1453 if (is_memory_migrate(cs))
1473 cpuset_migrate_mm(mm, from, to); 1454 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
1455 &cpuset_attach_nodemask_to);
1474 mmput(mm); 1456 mmput(mm);
1475 } 1457 }
1476
1477alloc_fail:
1478 NODEMASK_FREE(from);
1479 NODEMASK_FREE(to);
1480} 1458}
1481 1459
1482/* The various types of files and directories in a cpuset file system */ 1460/* The various types of files and directories in a cpuset file system */
@@ -1576,8 +1554,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1576 return -ENODEV; 1554 return -ENODEV;
1577 1555
1578 trialcs = alloc_trial_cpuset(cs); 1556 trialcs = alloc_trial_cpuset(cs);
1579 if (!trialcs) 1557 if (!trialcs) {
1580 return -ENOMEM; 1558 retval = -ENOMEM;
1559 goto out;
1560 }
1581 1561
1582 switch (cft->private) { 1562 switch (cft->private) {
1583 case FILE_CPULIST: 1563 case FILE_CPULIST:
@@ -1592,6 +1572,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1592 } 1572 }
1593 1573
1594 free_trial_cpuset(trialcs); 1574 free_trial_cpuset(trialcs);
1575out:
1595 cgroup_unlock(); 1576 cgroup_unlock();
1596 return retval; 1577 return retval;
1597} 1578}
@@ -1608,34 +1589,26 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1608 * across a page fault. 1589 * across a page fault.
1609 */ 1590 */
1610 1591
1611static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1592static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1612{ 1593{
1613 int ret; 1594 size_t count;
1614 1595
1615 mutex_lock(&callback_mutex); 1596 mutex_lock(&callback_mutex);
1616 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1597 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1617 mutex_unlock(&callback_mutex); 1598 mutex_unlock(&callback_mutex);
1618 1599
1619 return ret; 1600 return count;
1620} 1601}
1621 1602
1622static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1603static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1623{ 1604{
1624 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); 1605 size_t count;
1625 int retval;
1626
1627 if (mask == NULL)
1628 return -ENOMEM;
1629 1606
1630 mutex_lock(&callback_mutex); 1607 mutex_lock(&callback_mutex);
1631 *mask = cs->mems_allowed; 1608 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1632 mutex_unlock(&callback_mutex); 1609 mutex_unlock(&callback_mutex);
1633 1610
1634 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); 1611 return count;
1635
1636 NODEMASK_FREE(mask);
1637
1638 return retval;
1639} 1612}
1640 1613
1641static ssize_t cpuset_common_file_read(struct cgroup *cont, 1614static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1829,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1829} 1802}
1830 1803
1831/* 1804/*
1832 * post_clone() is called at the end of cgroup_clone(). 1805 * post_clone() is called during cgroup_create() when the
1833 * 'cgroup' was just created automatically as a result of 1806 * clone_children mount argument was specified. The cgroup
1834 * a cgroup_clone(), and the current task is about to 1807 * can not yet have any tasks.
1835 * be moved into 'cgroup'.
1836 * 1808 *
1837 * Currently we refuse to set up the cgroup - thereby 1809 * Currently we refuse to set up the cgroup - thereby
1838 * refusing the task to be entered, and as a result refusing 1810 * refusing the task to be entered, and as a result refusing
@@ -1860,8 +1832,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1860 cs = cgroup_cs(cgroup); 1832 cs = cgroup_cs(cgroup);
1861 parent_cs = cgroup_cs(parent); 1833 parent_cs = cgroup_cs(parent);
1862 1834
1835 mutex_lock(&callback_mutex);
1863 cs->mems_allowed = parent_cs->mems_allowed; 1836 cs->mems_allowed = parent_cs->mems_allowed;
1864 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); 1837 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1838 mutex_unlock(&callback_mutex);
1865 return; 1839 return;
1866} 1840}
1867 1841
@@ -1929,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
1929 .create = cpuset_create, 1903 .create = cpuset_create,
1930 .destroy = cpuset_destroy, 1904 .destroy = cpuset_destroy,
1931 .can_attach = cpuset_can_attach, 1905 .can_attach = cpuset_can_attach,
1906 .can_attach_task = cpuset_can_attach_task,
1907 .pre_attach = cpuset_pre_attach,
1908 .attach_task = cpuset_attach_task,
1932 .attach = cpuset_attach, 1909 .attach = cpuset_attach,
1933 .populate = cpuset_populate, 1910 .populate = cpuset_populate,
1934 .post_clone = cpuset_post_clone, 1911 .post_clone = cpuset_post_clone,
@@ -2064,10 +2041,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2064 struct cpuset *cp; /* scans cpusets being updated */ 2041 struct cpuset *cp; /* scans cpusets being updated */
2065 struct cpuset *child; /* scans child cpusets of cp */ 2042 struct cpuset *child; /* scans child cpusets of cp */
2066 struct cgroup *cont; 2043 struct cgroup *cont;
2067 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2044 static nodemask_t oldmems; /* protected by cgroup_mutex */
2068
2069 if (oldmems == NULL)
2070 return;
2071 2045
2072 list_add_tail((struct list_head *)&root->stack_list, &queue); 2046 list_add_tail((struct list_head *)&root->stack_list, &queue);
2073 2047
@@ -2084,7 +2058,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2084 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2058 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2085 continue; 2059 continue;
2086 2060
2087 *oldmems = cp->mems_allowed; 2061 oldmems = cp->mems_allowed;
2088 2062
2089 /* Remove offline cpus and mems from this cpuset. */ 2063 /* Remove offline cpus and mems from this cpuset. */
2090 mutex_lock(&callback_mutex); 2064 mutex_lock(&callback_mutex);
@@ -2100,10 +2074,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2100 remove_tasks_in_empty_cpuset(cp); 2074 remove_tasks_in_empty_cpuset(cp);
2101 else { 2075 else {
2102 update_tasks_cpumask(cp, NULL); 2076 update_tasks_cpumask(cp, NULL);
2103 update_tasks_nodemask(cp, oldmems, NULL); 2077 update_tasks_nodemask(cp, &oldmems, NULL);
2104 } 2078 }
2105 } 2079 }
2106 NODEMASK_FREE(oldmems);
2107} 2080}
2108 2081
2109/* 2082/*
@@ -2145,19 +2118,16 @@ void cpuset_update_active_cpus(void)
2145static int cpuset_track_online_nodes(struct notifier_block *self, 2118static int cpuset_track_online_nodes(struct notifier_block *self,
2146 unsigned long action, void *arg) 2119 unsigned long action, void *arg)
2147{ 2120{
2148 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2121 static nodemask_t oldmems; /* protected by cgroup_mutex */
2149
2150 if (oldmems == NULL)
2151 return NOTIFY_DONE;
2152 2122
2153 cgroup_lock(); 2123 cgroup_lock();
2154 switch (action) { 2124 switch (action) {
2155 case MEM_ONLINE: 2125 case MEM_ONLINE:
2156 *oldmems = top_cpuset.mems_allowed; 2126 oldmems = top_cpuset.mems_allowed;
2157 mutex_lock(&callback_mutex); 2127 mutex_lock(&callback_mutex);
2158 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2128 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2159 mutex_unlock(&callback_mutex); 2129 mutex_unlock(&callback_mutex);
2160 update_tasks_nodemask(&top_cpuset, oldmems, NULL); 2130 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2161 break; 2131 break;
2162 case MEM_OFFLINE: 2132 case MEM_OFFLINE:
2163 /* 2133 /*
@@ -2171,7 +2141,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2171 } 2141 }
2172 cgroup_unlock(); 2142 cgroup_unlock();
2173 2143
2174 NODEMASK_FREE(oldmems);
2175 return NOTIFY_OK; 2144 return NOTIFY_OK;
2176} 2145}
2177#endif 2146#endif
@@ -2221,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2221 rcu_read_lock(); 2190 rcu_read_lock();
2222 cs = task_cs(tsk); 2191 cs = task_cs(tsk);
2223 if (cs) 2192 if (cs)
2224 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); 2193 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2225 rcu_read_unlock(); 2194 rcu_read_unlock();
2226 2195
2227 /* 2196 /*
@@ -2248,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2248 * Like above we can temporary set any mask and rely on 2217 * Like above we can temporary set any mask and rely on
2249 * set_cpus_allowed_ptr() as synchronization point. 2218 * set_cpus_allowed_ptr() as synchronization point.
2250 */ 2219 */
2251 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); 2220 do_set_cpus_allowed(tsk, cpu_possible_mask);
2252 cpu = cpumask_any(cpu_active_mask); 2221 cpu = cpumask_any(cpu_active_mask);
2253 } 2222 }
2254 2223
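
[Annotation] Taken together, the cpuset.c hunks convert the attach path from one monolithic, threadgroup-aware callback pair into the cgroup core's finer-grained hooks: .can_attach for whole-group checks, .can_attach_task for the per-thread security check, .pre_attach to stage the shared cpus_attach/nodemask state, .attach_task for per-thread cpumask and nodemask updates, and .attach for the one-shot mm rebind. A simplified sketch of the call order the core is expected to follow; this illustrates the contract only and is not the core's actual attach code (locking and NULL-callback checks omitted):

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Illustrative only: how a threadgroup attach exercises the new hooks. */
static int sketch_attach_threadgroup(struct cgroup_subsys *ss,
				     struct cgroup *cgrp,
				     struct cgroup *oldcgrp,
				     struct task_struct *leader)
{
	struct task_struct *t;
	int ret;

	ret = ss->can_attach(ss, cgrp, leader);		/* group-wide veto */
	if (ret)
		return ret;

	t = leader;
	do {
		ret = ss->can_attach_task(cgrp, t);	/* per-thread veto */
		if (ret)
			return ret;
	} while_each_thread(leader, t);

	ss->pre_attach(cgrp);				/* stage cpus_attach etc. */
	t = leader;
	do {
		ss->attach_task(cgrp, t);		/* per-thread cpus/mems */
	} while_each_thread(leader, t);

	ss->attach(ss, cgrp, oldcgrp, leader);		/* rebind the shared mm */
	return 0;
}
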
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
1#include <linux/kernel.h>
2#include <linux/crash_dump.h>
3#include <linux/init.h>
4#include <linux/errno.h>
5#include <linux/module.h>
6
7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need
9 * to know the amount of memory that the previous kernel used.
10 */
11unsigned long saved_max_pfn;
12
13/*
14 * stores the physical address of elf header of crash image
15 *
16 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
17 * is_kdump_kernel() to determine if we are booting after a panic. Hence put
18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
19 */
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21
22/*
23 * elfcorehdr= specifies the location of elf core header stored by the crashed
24 * kernel. This option will be passed by kexec loader to the capture kernel.
25 */
26static int __init setup_elfcorehdr(char *arg)
27{
28 char *end;
29 if (!arg)
30 return -EINVAL;
31 elfcorehdr_addr = memparse(arg, &end);
32 return end > arg ? 0 : -EINVAL;
33}
34early_param("elfcorehdr", setup_elfcorehdr);
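
[Annotation] kernel/crash_dump.c is new: it hoists saved_max_pfn, elfcorehdr_addr and the elfcorehdr= parsing out of per-architecture code into one place. The ELFCORE_ADDR_MAX sentinel is what downstream helpers test; a sketch of the usual consumer, roughly what include/linux/crash_dump.h provides (an assumption stated for context, not quoted from this patch):

/* Approximate shape of the helper that keys off elfcorehdr_addr. */
static inline int is_kdump_kernel(void)
{
	return elfcorehdr_addr != ELFCORE_ADDR_MAX;
}

The kexec loader appends something like "elfcorehdr=0x2f000000" (the value here is made up) to the capture kernel's command line, and memparse() also accepts the usual K/M/G suffixes.
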
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e22641fe7..174fa84eca30 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
1/* Task credentials management - see Documentation/credentials.txt 1/* Task credentials management - see Documentation/security/credentials.txt
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
35static struct thread_group_cred init_tgcred = { 35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2), 36 .usage = ATOMIC_INIT(2),
37 .tgid = 0, 37 .tgid = 0,
38 .lock = SPIN_LOCK_UNLOCKED, 38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39}; 39};
40#endif 40#endif
41 41
@@ -49,11 +49,12 @@ struct cred init_cred = {
49 .magic = CRED_MAGIC, 49 .magic = CRED_MAGIC,
50#endif 50#endif
51 .securebits = SECUREBITS_DEFAULT, 51 .securebits = SECUREBITS_DEFAULT,
52 .cap_inheritable = CAP_INIT_INH_SET, 52 .cap_inheritable = CAP_EMPTY_SET,
53 .cap_permitted = CAP_FULL_SET, 53 .cap_permitted = CAP_FULL_SET,
54 .cap_effective = CAP_INIT_EFF_SET, 54 .cap_effective = CAP_FULL_SET,
55 .cap_bset = CAP_INIT_BSET, 55 .cap_bset = CAP_FULL_SET,
56 .user = INIT_USER, 56 .user = INIT_USER,
57 .user_ns = &init_user_ns,
57 .group_info = &init_groups, 58 .group_info = &init_groups,
58#ifdef CONFIG_KEYS 59#ifdef CONFIG_KEYS
59 .tgcred = &init_tgcred, 60 .tgcred = &init_tgcred,
@@ -252,13 +253,13 @@ struct cred *cred_alloc_blank(void)
252#endif 253#endif
253 254
254 atomic_set(&new->usage, 1); 255 atomic_set(&new->usage, 1);
256#ifdef CONFIG_DEBUG_CREDENTIALS
257 new->magic = CRED_MAGIC;
258#endif
255 259
256 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) 260 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
257 goto error; 261 goto error;
258 262
259#ifdef CONFIG_DEBUG_CREDENTIALS
260 new->magic = CRED_MAGIC;
261#endif
262 return new; 263 return new;
263 264
264error: 265error:
@@ -325,7 +326,7 @@ EXPORT_SYMBOL(prepare_creds);
325 326
326/* 327/*
327 * Prepare credentials for current to perform an execve() 328 * Prepare credentials for current to perform an execve()
328 * - The caller must hold current->cred_guard_mutex 329 * - The caller must hold ->cred_guard_mutex
329 */ 330 */
330struct cred *prepare_exec_creds(void) 331struct cred *prepare_exec_creds(void)
331{ 332{
@@ -384,8 +385,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
384 struct cred *new; 385 struct cred *new;
385 int ret; 386 int ret;
386 387
387 mutex_init(&p->cred_guard_mutex);
388
389 if ( 388 if (
390#ifdef CONFIG_KEYS 389#ifdef CONFIG_KEYS
391 !p->cred->thread_keyring && 390 !p->cred->thread_keyring &&
@@ -412,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
412 goto error_put; 411 goto error_put;
413 } 412 }
414 413
414 /* cache user_ns in cred. Doesn't need a refcount because it will
415 * stay pinned by cred->user
416 */
417 new->user_ns = new->user->user_ns;
418
415#ifdef CONFIG_KEYS 419#ifdef CONFIG_KEYS
416 /* new threads get their own thread keyrings if their parent already 420 /* new threads get their own thread keyrings if their parent already
417 * had one */ 421 * had one */
@@ -659,6 +663,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
659 validate_creds(old); 663 validate_creds(old);
660 664
661 *new = *old; 665 *new = *old;
666 atomic_set(&new->usage, 1);
667 set_cred_subscribers(new, 0);
662 get_uid(new->user); 668 get_uid(new->user);
663 get_group_info(new->group_info); 669 get_group_info(new->group_info);
664 670
@@ -676,8 +682,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
676 if (security_prepare_creds(new, old, GFP_KERNEL) < 0) 682 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
677 goto error; 683 goto error;
678 684
679 atomic_set(&new->usage, 1);
680 set_cred_subscribers(new, 0);
681 put_cred(old); 685 put_cred(old);
682 validate_creds(new); 686 validate_creds(new);
683 return new; 687 return new;
@@ -750,7 +754,11 @@ bool creds_are_invalid(const struct cred *cred)
750 if (cred->magic != CRED_MAGIC) 754 if (cred->magic != CRED_MAGIC)
751 return true; 755 return true;
752#ifdef CONFIG_SECURITY_SELINUX 756#ifdef CONFIG_SECURITY_SELINUX
753 if (selinux_is_enabled()) { 757 /*
758 * cred->security == NULL if security_cred_alloc_blank() or
759 * security_prepare_creds() returned an error.
760 */
761 if (selinux_is_enabled() && cred->security) {
754 if ((unsigned long) cred->security < PAGE_SIZE) 762 if ((unsigned long) cred->security < PAGE_SIZE)
755 return true; 763 return true;
756 if ((*(u32 *)cred->security & 0xffffff00) == 764 if ((*(u32 *)cred->security & 0xffffff00) ==
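
[Annotation] Two of the cred.c hunks are ordering fixes: cred_alloc_blank() and prepare_kernel_cred() now set ->magic, ->usage and the subscriber count before calling the LSM hooks, so an early security_*_creds() failure leaves a credential the error path can validate and free. For context, a sketch of the common consumer pattern around prepare_kernel_cred(); this is standard cred API usage, not taken from this patch:

#include <linux/cred.h>
#include <linux/errno.h>

/* Illustrative: temporarily act with init credentials inside the kernel. */
static int do_privileged_work(void)
{
	const struct cred *old;
	struct cred *kcred;

	kcred = prepare_kernel_cred(NULL);	/* NULL means "clone init_cred" */
	if (!kcred)
		return -ENOMEM;

	old = override_creds(kcred);		/* current temporarily uses kcred */
	/* ... do the privileged work ... */
	revert_creds(old);			/* restore the previous creds */

	put_cred(kcred);			/* drop our reference */
	return 0;
}
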
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index de407c78178d..bad6786dee88 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -47,6 +47,7 @@
47#include <linux/pid.h> 47#include <linux/pid.h>
48#include <linux/smp.h> 48#include <linux/smp.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/rcupdate.h>
50 51
51#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
52#include <asm/byteorder.h> 53#include <asm/byteorder.h>
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
109 */ 110 */
110atomic_t kgdb_active = ATOMIC_INIT(-1); 111atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active); 112EXPORT_SYMBOL_GPL(kgdb_active);
113static DEFINE_RAW_SPINLOCK(dbg_master_lock);
114static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
112 115
113/* 116/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early 117 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet): 118 * bootup code (which might not have percpu set up yet):
116 */ 119 */
117static atomic_t passive_cpu_wait[NR_CPUS]; 120static atomic_t masters_in_kgdb;
118static atomic_t cpu_in_kgdb[NR_CPUS]; 121static atomic_t slaves_in_kgdb;
119static atomic_t kgdb_break_tasklet_var; 122static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint; 123atomic_t kgdb_setting_breakpoint;
121 124
@@ -206,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
206 return 0; 209 return 0;
207} 210}
208 211
209/**
210 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
215 * handling exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/* 212/*
222 * Some architectures need cache flushes when we set/clear a 213 * Some architectures need cache flushes when we set/clear a
223 * breakpoint: 214 * breakpoint:
@@ -457,26 +448,34 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
457 return 1; 448 return 1;
458} 449}
459 450
460static void dbg_cpu_switch(int cpu, int next_cpu) 451static void dbg_touch_watchdogs(void)
461{ 452{
462 /* Mark the cpu we are switching away from as a slave when it 453 touch_softlockup_watchdog_sync();
463 * holds the kgdb_active token. This must be done so that the 454 clocksource_touch_watchdog();
464 * that all the cpus wait in for the debug core will not enter 455 rcu_cpu_stall_reset();
465 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471} 456}
472 457
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) 458static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
459 int exception_state)
474{ 460{
475 unsigned long flags; 461 unsigned long flags;
476 int sstep_tries = 100; 462 int sstep_tries = 100;
477 int error; 463 int error;
478 int i, cpu; 464 int cpu;
479 int trace_on = 0; 465 int trace_on = 0;
466 int online_cpus = num_online_cpus();
467
468 kgdb_info[ks->cpu].enter_kgdb++;
469 kgdb_info[ks->cpu].exception_state |= exception_state;
470
471 if (exception_state == DCPU_WANT_MASTER)
472 atomic_inc(&masters_in_kgdb);
473 else
474 atomic_inc(&slaves_in_kgdb);
475
476 if (arch_kgdb_ops.disable_hw_break)
477 arch_kgdb_ops.disable_hw_break(regs);
478
480acquirelock: 479acquirelock:
481 /* 480 /*
482 * Interrupts will be restored by the 'trap return' code, except when 481 * Interrupts will be restored by the 'trap return' code, except when
@@ -489,14 +488,15 @@ acquirelock:
489 kgdb_info[cpu].task = current; 488 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0; 489 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; 490 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497 491
498 if (exception_level == 1) 492 /* Make sure the above info reaches the primary CPU */
493 smp_mb();
494
495 if (exception_level == 1) {
496 if (raw_spin_trylock(&dbg_master_lock))
497 atomic_xchg(&kgdb_active, cpu);
499 goto cpu_master_loop; 498 goto cpu_master_loop;
499 }
500 500
501 /* 501 /*
502 * CPU will loop if it is a slave or request to become a kgdb 502 * CPU will loop if it is a slave or request to become a kgdb
@@ -508,10 +508,12 @@ cpu_loop:
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; 508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop; 509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { 510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) 511 if (raw_spin_trylock(&dbg_master_lock)) {
512 atomic_xchg(&kgdb_active, cpu);
512 break; 513 break;
514 }
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { 515 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu])) 516 if (!raw_spin_is_locked(&dbg_slave_lock))
515 goto return_normal; 517 goto return_normal;
516 } else { 518 } else {
517return_normal: 519return_normal:
@@ -522,9 +524,12 @@ return_normal:
522 arch_kgdb_ops.correct_hw_break(); 524 arch_kgdb_ops.correct_hw_break();
523 if (trace_on) 525 if (trace_on)
524 tracing_on(); 526 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]); 527 kgdb_info[cpu].exception_state &=
526 touch_softlockup_watchdog_sync(); 528 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
527 clocksource_touch_watchdog(); 529 kgdb_info[cpu].enter_kgdb--;
530 smp_mb__before_atomic_dec();
531 atomic_dec(&slaves_in_kgdb);
532 dbg_touch_watchdogs();
528 local_irq_restore(flags); 533 local_irq_restore(flags);
529 return 0; 534 return 0;
530 } 535 }
@@ -533,7 +538,7 @@ return_normal:
533 538
534 /* 539 /*
535 * For single stepping, try to only enter on the processor 540 * For single stepping, try to only enter on the processor
536 * that was single stepping. To gaurd against a deadlock, the 541 * that was single stepping. To guard against a deadlock, the
537 * kernel will only try for the value of sstep_tries before 542 * kernel will only try for the value of sstep_tries before
538 * giving up and continuing on. 543 * giving up and continuing on.
539 */ 544 */
@@ -541,8 +546,8 @@ return_normal:
541 (kgdb_info[cpu].task && 546 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 547 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1); 548 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync(); 549 raw_spin_unlock(&dbg_master_lock);
545 clocksource_touch_watchdog(); 550 dbg_touch_watchdogs();
546 local_irq_restore(flags); 551 local_irq_restore(flags);
547 552
548 goto acquirelock; 553 goto acquirelock;
@@ -563,16 +568,12 @@ return_normal:
563 if (dbg_io_ops->pre_exception) 568 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception(); 569 dbg_io_ops->pre_exception();
565 570
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /* 571 /*
569 * Get the passive CPU lock which will hold all the non-primary 572 * Get the passive CPU lock which will hold all the non-primary
570 * CPU in a spin state while the debugger is active 573 * CPU in a spin state while the debugger is active
571 */ 574 */
572 if (!kgdb_single_step) { 575 if (!kgdb_single_step)
573 for (i = 0; i < NR_CPUS; i++) 576 raw_spin_lock(&dbg_slave_lock);
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576 577
577#ifdef CONFIG_SMP 578#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */ 579 /* Signal the other CPUs to enter kgdb_wait() */
@@ -583,10 +584,9 @@ return_normal:
583 /* 584 /*
584 * Wait for the other CPUs to be notified and be waiting for us: 585 * Wait for the other CPUs to be notified and be waiting for us:
585 */ 586 */
586 for_each_online_cpu(i) { 587 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) 588 atomic_read(&slaves_in_kgdb)) != online_cpus)
588 cpu_relax(); 589 cpu_relax();
589 }
590 590
591 /* 591 /*
592 * At this point the primary processor is completely 592 * At this point the primary processor is completely
@@ -615,7 +615,8 @@ cpu_master_loop:
615 if (error == DBG_PASS_EVENT) { 615 if (error == DBG_PASS_EVENT) {
616 dbg_kdb_mode = !dbg_kdb_mode; 616 dbg_kdb_mode = !dbg_kdb_mode;
617 } else if (error == DBG_SWITCH_CPU_EVENT) { 617 } else if (error == DBG_SWITCH_CPU_EVENT) {
618 dbg_cpu_switch(cpu, dbg_switch_cpu); 618 kgdb_info[dbg_switch_cpu].exception_state |=
619 DCPU_NEXT_MASTER;
619 goto cpu_loop; 620 goto cpu_loop;
620 } else { 621 } else {
621 kgdb_info[cpu].ret_state = error; 622 kgdb_info[cpu].ret_state = error;
@@ -627,24 +628,11 @@ cpu_master_loop:
627 if (dbg_io_ops->post_exception) 628 if (dbg_io_ops->post_exception)
628 dbg_io_ops->post_exception(); 629 dbg_io_ops->post_exception();
629 630
630 atomic_dec(&cpu_in_kgdb[ks->cpu]);
631
632 if (!kgdb_single_step) { 631 if (!kgdb_single_step) {
633 for (i = NR_CPUS-1; i >= 0; i--) 632 raw_spin_unlock(&dbg_slave_lock);
634 atomic_dec(&passive_cpu_wait[i]); 633 /* Wait till all the CPUs have quit from the debugger. */
635 /* 634 while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
636 * Wait till all the CPUs have quit from the debugger, 635 cpu_relax();
637 * but allow a CPU that hit an exception and is
638 * waiting to become the master to remain in the debug
639 * core.
640 */
641 for_each_online_cpu(i) {
642 while (kgdb_do_roundup &&
643 atomic_read(&cpu_in_kgdb[i]) &&
644 !(kgdb_info[i].exception_state &
645 DCPU_WANT_MASTER))
646 cpu_relax();
647 }
648 } 636 }
649 637
650kgdb_restore: 638kgdb_restore:
@@ -655,12 +643,20 @@ kgdb_restore:
655 else 643 else
656 kgdb_sstep_pid = 0; 644 kgdb_sstep_pid = 0;
657 } 645 }
646 if (arch_kgdb_ops.correct_hw_break)
647 arch_kgdb_ops.correct_hw_break();
658 if (trace_on) 648 if (trace_on)
659 tracing_on(); 649 tracing_on();
650
651 kgdb_info[cpu].exception_state &=
652 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
653 kgdb_info[cpu].enter_kgdb--;
654 smp_mb__before_atomic_dec();
655 atomic_dec(&masters_in_kgdb);
660 /* Free kgdb_active */ 656 /* Free kgdb_active */
661 atomic_set(&kgdb_active, -1); 657 atomic_set(&kgdb_active, -1);
662 touch_softlockup_watchdog_sync(); 658 raw_spin_unlock(&dbg_master_lock);
663 clocksource_touch_watchdog(); 659 dbg_touch_watchdogs();
664 local_irq_restore(flags); 660 local_irq_restore(flags);
665 661
666 return kgdb_info[cpu].ret_state; 662 return kgdb_info[cpu].ret_state;
@@ -678,7 +674,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
678{ 674{
679 struct kgdb_state kgdb_var; 675 struct kgdb_state kgdb_var;
680 struct kgdb_state *ks = &kgdb_var; 676 struct kgdb_state *ks = &kgdb_var;
681 int ret;
682 677
683 ks->cpu = raw_smp_processor_id(); 678 ks->cpu = raw_smp_processor_id();
684 ks->ex_vector = evector; 679 ks->ex_vector = evector;
@@ -689,11 +684,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
689 684
690 if (kgdb_reenter_check(ks)) 685 if (kgdb_reenter_check(ks))
691 return 0; /* Ouch, double exception ! */ 686 return 0; /* Ouch, double exception ! */
692 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; 687 if (kgdb_info[ks->cpu].enter_kgdb != 0)
693 ret = kgdb_cpu_enter(ks, regs); 688 return 0;
694 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | 689
695 DCPU_IS_SLAVE); 690 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
696 return ret;
697} 691}
698 692
699int kgdb_nmicallback(int cpu, void *regs) 693int kgdb_nmicallback(int cpu, void *regs)
@@ -706,12 +700,9 @@ int kgdb_nmicallback(int cpu, void *regs)
706 ks->cpu = cpu; 700 ks->cpu = cpu;
707 ks->linux_regs = regs; 701 ks->linux_regs = regs;
708 702
709 if (!atomic_read(&cpu_in_kgdb[cpu]) && 703 if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
710 atomic_read(&kgdb_active) != -1 && 704 raw_spin_is_locked(&dbg_master_lock)) {
711 atomic_read(&kgdb_active) != cpu) { 705 kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
712 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
713 kgdb_cpu_enter(ks, regs);
714 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
715 return 0; 706 return 0;
716 } 707 }
717#endif 708#endif
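
[Annotation] The debug_core.c rework replaces the per-CPU passive_cpu_wait[]/cpu_in_kgdb[] atomics with two raw spinlocks plus masters_in_kgdb/slaves_in_kgdb counters, tracks re-entry with kgdb_info[].enter_kgdb, and drops the weak kgdb_disable_hw_debug() hook in favour of an optional arch_kgdb_ops.disable_hw_break() callback. A hedged sketch of how an architecture would supply that callback; the structure field comes from the hunk above, while the body and the breakpoint opcode are illustrative:

#include <linux/kgdb.h>

/* Illustrative only: park hardware breakpoints while kgdb owns the CPUs. */
static void sketch_disable_hw_break(struct pt_regs *regs)
{
	/* e.g. clear the debug-control register so HW breakpoints stay quiet */
}

struct kgdb_arch arch_kgdb_ops = {
	.gdb_bpt_instr		= { 0xcc },	/* arch trap opcode, x86 shown */
	.disable_hw_break	= sketch_disable_hw_break,
};
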
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index c5d753d80f67..3494c28a7e7a 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -40,6 +40,7 @@ struct debuggerinfo_struct {
40 int exception_state; 40 int exception_state;
41 int ret_state; 41 int ret_state;
42 int irq_depth; 42 int irq_depth;
43 int enter_kgdb;
43}; 44};
44 45
45extern struct debuggerinfo_struct kgdb_info[]; 46extern struct debuggerinfo_struct kgdb_info[];
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe7..a11db956dd62 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1093 put_packet(remcom_out_buffer); 1093 put_packet(remcom_out_buffer);
1094 return 0; 1094 return 0;
1095} 1095}
1096
1097/**
1098 * gdbstub_exit - Send an exit message to GDB
1099 * @status: The exit code to report.
1100 */
1101void gdbstub_exit(int status)
1102{
1103 unsigned char checksum, ch, buffer[3];
1104 int loop;
1105
1106 buffer[0] = 'W';
1107 buffer[1] = hex_asc_hi(status);
1108 buffer[2] = hex_asc_lo(status);
1109
1110 dbg_io_ops->write_char('$');
1111 checksum = 0;
1112
1113 for (loop = 0; loop < 3; loop++) {
1114 ch = buffer[loop];
1115 checksum += ch;
1116 dbg_io_ops->write_char(ch);
1117 }
1118
1119 dbg_io_ops->write_char('#');
1120 dbg_io_ops->write_char(hex_asc_hi(checksum));
1121 dbg_io_ops->write_char(hex_asc_lo(checksum));
1122
1123 /* make sure the output is flushed, lest the bootloader clobber it */
1124 dbg_io_ops->flush();
1125}
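
[Annotation] The new gdbstub_exit() hand-builds a GDB remote-serial-protocol 'W' (process exited) reply so a front end sees a clean exit status before, for example, a reboot clobbers the channel. A worked example of the wire format, derived from the code above:

/*
 * gdbstub_exit(0) sends the three payload bytes 'W', '0', '0', then the
 * checksum of that payload:
 *
 *	'W' + '0' + '0' = 0x57 + 0x30 + 0x30 = 0xb7
 *
 * so the bytes on the wire are:  $W00#b7
 */
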
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index bf6e8270e957..dd0b1b7dd02c 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks)
86 } 86 }
87 /* Set initial kdb state variables */ 87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS); 88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu; 89 kdb_initial_cpu = atomic_read(&kgdb_active);
90 kdb_current_task = kgdb_info[ks->cpu].task; 90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; 91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */ 92 /* Remove any breakpoints as needed by kdb and clear single step */
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks)
105 ks->pass_exception = 1; 105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC); 106 KDB_FLAG_SET(CATASTROPHIC);
107 } 107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 108 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT); 109 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS); 110 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index c9b7f4f90bba..96fdaac46a80 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...)
823 823
824 return r; 824 return r;
825} 825}
826 826EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index caf057a3de0e..be14779bcef6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -78,11 +78,11 @@ static unsigned int kdb_continue_catastrophic;
78static kdbtab_t *kdb_commands; 78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50 79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX; 80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50]; 81static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) 85 num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
86 86
87typedef struct _kdbmsg { 87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */ 88 int km_diag; /* kdb diagnostic */
@@ -441,9 +441,9 @@ static int kdb_check_regs(void)
441 * symbol name, and offset to the caller. 441 * symbol name, and offset to the caller.
442 * 442 *
443 * The argument may consist of a numeric value (decimal or 443 * The argument may consist of a numeric value (decimal or
444 * hexidecimal), a symbol name, a register name (preceeded by the 444 * hexidecimal), a symbol name, a register name (preceded by the
445 * percent sign), an environment variable with a numeric value 445 * percent sign), an environment variable with a numeric value
446 * (preceeded by a dollar sign) or a simple arithmetic expression 446 * (preceded by a dollar sign) or a simple arithmetic expression
447 * consisting of a symbol name, +/-, and a numeric constant value 447 * consisting of a symbol name, +/-, and a numeric constant value
448 * (offset). 448 * (offset).
449 * Parameters: 449 * Parameters:
@@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
646 } 646 }
647 if (!s->usable) 647 if (!s->usable)
648 return KDB_NOTIMP; 648 return KDB_NOTIMP;
649 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); 649 s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
650 if (!s->command) { 650 if (!s->command) {
651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n", 651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
652 cmdstr); 652 cmdstr);
@@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1127 /* special case below */ 1127 /* special case below */
1128 } else { 1128 } else {
1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", 1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1130 kdb_current, kdb_current->pid); 1130 kdb_current, kdb_current ? kdb_current->pid : 0);
1131#if defined(CONFIG_SMP) 1131#if defined(CONFIG_SMP)
1132 kdb_printf("on processor %d ", raw_smp_processor_id()); 1132 kdb_printf("on processor %d ", raw_smp_processor_id());
1133#endif 1133#endif
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value)
1335 * error The hardware-defined error code 1335 * error The hardware-defined error code
1336 * reason2 kdb's current reason code. 1336 * reason2 kdb's current reason code.
1337 * Initially error but can change 1337 * Initially error but can change
1338 * acording to kdb state. 1338 * according to kdb state.
1339 * db_result Result code from break or debug point. 1339 * db_result Result code from break or debug point.
1340 * regs The exception frame at time of fault/breakpoint. 1340 * regs The exception frame at time of fault/breakpoint.
1341 * should always be valid. 1341 * should always be valid.
@@ -1749,13 +1749,13 @@ static int kdb_go(int argc, const char **argv)
1749 int nextarg; 1749 int nextarg;
1750 long offset; 1750 long offset;
1751 1751
1752 if (raw_smp_processor_id() != kdb_initial_cpu) {
1753 kdb_printf("go must execute on the entry cpu, "
1754 "please use \"cpu %d\" and then execute go\n",
1755 kdb_initial_cpu);
1756 return KDB_BADCPUNUM;
1757 }
1752 if (argc == 1) { 1758 if (argc == 1) {
1753 if (raw_smp_processor_id() != kdb_initial_cpu) {
1754 kdb_printf("go <address> must be issued from the "
1755 "initial cpu, do cpu %d first\n",
1756 kdb_initial_cpu);
1757 return KDB_ARGCOUNT;
1758 }
1759 nextarg = 1; 1759 nextarg = 1;
1760 diag = kdbgetaddrarg(argc, argv, &nextarg, 1760 diag = kdbgetaddrarg(argc, argv, &nextarg,
1761 &addr, &offset, NULL); 1761 &addr, &offset, NULL);
@@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv)
2361 */ 2361 */
2362static int kdb_ll(int argc, const char **argv) 2362static int kdb_ll(int argc, const char **argv)
2363{ 2363{
2364 int diag; 2364 int diag = 0;
2365 unsigned long addr; 2365 unsigned long addr;
2366 long offset = 0; 2366 long offset = 0;
2367 unsigned long va; 2367 unsigned long va;
@@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv)
2400 char buf[80]; 2400 char buf[80];
2401 2401
2402 if (KDB_FLAG(CMD_INTERRUPT)) 2402 if (KDB_FLAG(CMD_INTERRUPT))
2403 return 0; 2403 goto out;
2404 2404
2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); 2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2406 diag = kdb_parse(buf); 2406 diag = kdb_parse(buf);
2407 if (diag) 2407 if (diag)
2408 return diag; 2408 goto out;
2409 2409
2410 addr = va + linkoffset; 2410 addr = va + linkoffset;
2411 if (kdb_getword(&va, addr, sizeof(va))) 2411 if (kdb_getword(&va, addr, sizeof(va)))
2412 return 0; 2412 goto out;
2413 } 2413 }
2414 kfree(command);
2415 2414
2416 return 0; 2415out:
2416 kfree(command);
2417 return diag;
2417} 2418}
2418 2419
2419static int kdb_kgdb(int argc, const char **argv) 2420static int kdb_kgdb(int argc, const char **argv)
@@ -2603,20 +2604,17 @@ static int kdb_summary(int argc, const char **argv)
2603 */ 2604 */
2604static int kdb_per_cpu(int argc, const char **argv) 2605static int kdb_per_cpu(int argc, const char **argv)
2605{ 2606{
2606 char buf[256], fmtstr[64]; 2607 char fmtstr[64];
2607 kdb_symtab_t symtab; 2608 int cpu, diag, nextarg = 1;
2608 cpumask_t suppress = CPU_MASK_NONE; 2609 unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
2609 int cpu, diag;
2610 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2611 2610
2612 if (argc < 1 || argc > 3) 2611 if (argc < 1 || argc > 3)
2613 return KDB_ARGCOUNT; 2612 return KDB_ARGCOUNT;
2614 2613
2615 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); 2614 diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
2616 if (!kdbgetsymval(buf, &symtab)) { 2615 if (diag)
2617 kdb_printf("%s is not a per_cpu variable\n", argv[1]); 2616 return diag;
2618 return KDB_BADADDR; 2617
2619 }
2620 if (argc >= 2) { 2618 if (argc >= 2) {
2621 diag = kdbgetularg(argv[2], &bytesperword); 2619 diag = kdbgetularg(argv[2], &bytesperword);
2622 if (diag) 2620 if (diag)
@@ -2649,46 +2647,25 @@ static int kdb_per_cpu(int argc, const char **argv)
2649#define KDB_PCU(cpu) 0 2647#define KDB_PCU(cpu) 0
2650#endif 2648#endif
2651#endif 2649#endif
2652
2653 for_each_online_cpu(cpu) { 2650 for_each_online_cpu(cpu) {
2651 if (KDB_FLAG(CMD_INTERRUPT))
2652 return 0;
2653
2654 if (whichcpu != ~0UL && whichcpu != cpu) 2654 if (whichcpu != ~0UL && whichcpu != cpu)
2655 continue; 2655 continue;
2656 addr = symtab.sym_start + KDB_PCU(cpu); 2656 addr = symaddr + KDB_PCU(cpu);
2657 diag = kdb_getword(&val, addr, bytesperword); 2657 diag = kdb_getword(&val, addr, bytesperword);
2658 if (diag) { 2658 if (diag) {
2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " 2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2660 "read, diag=%d\n", cpu, addr, diag); 2660 "read, diag=%d\n", cpu, addr, diag);
2661 continue; 2661 continue;
2662 } 2662 }
2663#ifdef CONFIG_SMP
2664 if (!val) {
2665 cpu_set(cpu, suppress);
2666 continue;
2667 }
2668#endif /* CONFIG_SMP */
2669 kdb_printf("%5d ", cpu); 2663 kdb_printf("%5d ", cpu);
2670 kdb_md_line(fmtstr, addr, 2664 kdb_md_line(fmtstr, addr,
2671 bytesperword == KDB_WORD_SIZE, 2665 bytesperword == KDB_WORD_SIZE,
2672 1, bytesperword, 1, 1, 0); 2666 1, bytesperword, 1, 1, 0);
2673 } 2667 }
2674 if (cpus_weight(suppress) == 0)
2675 return 0;
2676 kdb_printf("Zero suppressed cpu(s):");
2677 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2678 cpu = next_cpu(cpu, suppress)) {
2679 kdb_printf(" %d", cpu);
2680 if (cpu == num_possible_cpus() - 1 ||
2681 next_cpu(cpu, suppress) != cpu + 1)
2682 continue;
2683 while (cpu < num_possible_cpus() &&
2684 next_cpu(cpu, suppress) == cpu + 1)
2685 ++cpu;
2686 kdb_printf("-%d", cpu);
2687 }
2688 kdb_printf("\n");
2689
2690#undef KDB_PCU 2668#undef KDB_PCU
2691
2692 return 0; 2669 return 0;
2693} 2670}
2694 2671
@@ -2763,13 +2740,13 @@ int kdb_register_repeat(char *cmd,
2763 } 2740 }
2764 if (kdb_commands) { 2741 if (kdb_commands) {
2765 memcpy(new, kdb_commands, 2742 memcpy(new, kdb_commands,
2766 kdb_max_commands * sizeof(*new)); 2743 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2767 kfree(kdb_commands); 2744 kfree(kdb_commands);
2768 } 2745 }
2769 memset(new + kdb_max_commands, 0, 2746 memset(new + kdb_max_commands, 0,
2770 kdb_command_extend * sizeof(*new)); 2747 kdb_command_extend * sizeof(*new));
2771 kdb_commands = new; 2748 kdb_commands = new;
2772 kp = kdb_commands + kdb_max_commands; 2749 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
2773 kdb_max_commands += kdb_command_extend; 2750 kdb_max_commands += kdb_command_extend;
2774 } 2751 }
2775 2752
@@ -2783,6 +2760,8 @@ int kdb_register_repeat(char *cmd,
2783 2760
2784 return 0; 2761 return 0;
2785} 2762}
2763EXPORT_SYMBOL_GPL(kdb_register_repeat);
2764
2786 2765
2787/* 2766/*
2788 * kdb_register - Compatibility register function for commands that do 2767 * kdb_register - Compatibility register function for commands that do
@@ -2805,6 +2784,7 @@ int kdb_register(char *cmd,
2805 return kdb_register_repeat(cmd, func, usage, help, minlen, 2784 return kdb_register_repeat(cmd, func, usage, help, minlen,
2806 KDB_REPEAT_NONE); 2785 KDB_REPEAT_NONE);
2807} 2786}
2787EXPORT_SYMBOL_GPL(kdb_register);
2808 2788
2809/* 2789/*
2810 * kdb_unregister - This function is used to unregister a kernel 2790 * kdb_unregister - This function is used to unregister a kernel
@@ -2823,7 +2803,7 @@ int kdb_unregister(char *cmd)
2823 /* 2803 /*
2824 * find the command. 2804 * find the command.
2825 */ 2805 */
2826 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { 2806 for_each_kdbcmd(kp, i) {
2827 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { 2807 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2828 kp->cmd_name = NULL; 2808 kp->cmd_name = NULL;
2829 return 0; 2809 return 0;
@@ -2833,6 +2813,7 @@ int kdb_unregister(char *cmd)
2833 /* Couldn't find it. */ 2813 /* Couldn't find it. */
2834 return 1; 2814 return 1;
2835} 2815}
2816EXPORT_SYMBOL_GPL(kdb_unregister);
2836 2817
2837/* Initialize the kdb command table. */ 2818/* Initialize the kdb command table. */
2838static void __init kdb_inittab(void) 2819static void __init kdb_inittab(void)
@@ -2911,7 +2892,7 @@ static void __init kdb_inittab(void)
2911 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2892 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2912 kdb_register_repeat("summary", kdb_summary, "", 2893 kdb_register_repeat("summary", kdb_summary, "",
2913 "Summarize the system", 4, KDB_REPEAT_NONE); 2894 "Summarize the system", 4, KDB_REPEAT_NONE);
2914 kdb_register_repeat("per_cpu", kdb_per_cpu, "", 2895 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2915 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2896 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2916 kdb_register_repeat("grephelp", kdb_grep_help, "", 2897 kdb_register_repeat("grephelp", kdb_grep_help, "",
2917 "Display help on | grep", 0, KDB_REPEAT_NONE); 2898 "Display help on | grep", 0, KDB_REPEAT_NONE);
@@ -2933,7 +2914,7 @@ static void __init kdb_cmd_init(void)
2933 } 2914 }
2934} 2915}
2935 2916
2936/* Intialize kdb_printf, breakpoint tables and kdb state */ 2917/* Initialize kdb_printf, breakpoint tables and kdb state */
2937void __init kdb_init(int lvl) 2918void __init kdb_init(int lvl)
2938{ 2919{
2939 static int kdb_init_lvl = KDB_NOT_INITIALIZED; 2920 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
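
[Annotation] With kdb_printf(), kdb_register(), kdb_register_repeat() and kdb_unregister() now EXPORT_SYMBOL_GPL (and their prototypes leaving kdb_private.h in the next file), loadable modules can add kdb commands. A minimal sketch, assuming the declarations now live in the public <linux/kdb.h>; the command itself is made up:

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kdb.h>

static int kdb_hello(int argc, const char **argv)
{
	kdb_printf("hello from a module (argc=%d)\n", argc);
	return 0;
}

static int __init hello_kdb_init(void)
{
	if (kdb_register_repeat("hello", kdb_hello, "",
				"Print a test greeting", 0, KDB_REPEAT_NONE))
		return -EINVAL;	/* name already taken or table full */
	return 0;
}

static void __exit hello_kdb_exit(void)
{
	kdb_unregister("hello");
}

module_init(hello_kdb_init);
module_exit(hello_kdb_exit);
MODULE_LICENSE("GPL");
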
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index be775f7e81e0..35d69ed1dfb5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -15,29 +15,6 @@
15#include <linux/kgdb.h> 15#include <linux/kgdb.h>
16#include "../debug_core.h" 16#include "../debug_core.h"
17 17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */ 18/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
@@ -93,17 +70,6 @@
93 */ 70 */
94#define KDB_MAXBPT 16 71#define KDB_MAXBPT 16
95 72
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */ 73/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab { 74typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */ 75 unsigned long value; /* Address of symbol */
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len); 89extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124 90
125/* Exported Symbols for kernel loadable modules to use. */ 91/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t); 92extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t); 93extern int kdb_putarea_size(unsigned long, void *, size_t);
133 94
@@ -144,6 +105,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t); 105extern int kdb_putword(unsigned long, unsigned long, size_t);
145 106
146extern int kdbgetularg(const char *, unsigned long *); 107extern int kdbgetularg(const char *, unsigned long *);
108extern int kdbgetu64arg(const char *, u64 *);
147extern char *kdbgetenv(const char *); 109extern char *kdbgetenv(const char *);
148extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, 110extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
149 long *, char **); 111 long *, char **);
@@ -255,14 +217,6 @@ extern void kdb_ps1(const struct task_struct *p);
255extern void kdb_print_nameval(const char *name, unsigned long val); 217extern void kdb_print_nameval(const char *name, unsigned long val);
256extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
257extern void kdb_meminfo_proc_show(void); 219extern void kdb_meminfo_proc_show(void);
258#ifdef CONFIG_KALLSYMS
259extern const char *kdb_walk_kallsyms(loff_t *pos);
260#else /* ! CONFIG_KALLSYMS */
261static inline const char *kdb_walk_kallsyms(loff_t *pos)
262{
263 return NULL;
264}
265#endif /* ! CONFIG_KALLSYMS */
266extern char *kdb_getstr(char *, size_t, char *); 220extern char *kdb_getstr(char *, size_t, char *);
267 221
268/* Defines for kdb_symbol_print */ 222/* Defines for kdb_symbol_print */
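For context on the registration interface whose declarations are moved out of this private header above (kdb_register_repeat(), kdb_func_t, kdb_unregister(), KDB_REPEAT_NONE): a minimal, hypothetical sketch of a loadable module registering its own kdb command, based only on the prototypes shown in this hunk and assuming they stay visible through <linux/kdb.h>; the command name, handler and strings are invented for illustration.

#include <linux/module.h>
#include <linux/kdb.h>		/* assumed new home of the declarations above */

static int kdb_hello(int argc, const char **argv)	/* matches kdb_func_t */
{
	kdb_printf("hello from kdb, argc=%d\n", argc);
	return 0;			/* 0 on success, or a KDB_* error code */
}

static int __init hello_init(void)
{
	/* cmd, handler, usage string, help text, minimum match length, repeat */
	return kdb_register_repeat("hello", kdb_hello, "[arg]",
				   "Print a greeting", 0, KDB_REPEAT_NONE);
}

static void __exit hello_exit(void)
{
	kdb_unregister("hello");
}

module_init(hello_init);
module_exit(hello_exit);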
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6b2485dcb050..5532dd37aa86 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
545 * Mask for process state. 545 * Mask for process state.
546 * Notes: 546 * Notes:
547 * The mask folds data from several sources into a single long value, so 547 * The mask folds data from several sources into a single long value, so
548 * be carefull not to overlap the bits. TASK_* bits are in the LSB, 548 * be careful not to overlap the bits. TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there 549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be 550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in 551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
diff --git a/kernel/early_res.c b/kernel/early_res.c
deleted file mode 100644
index 7bfae887f211..000000000000
--- a/kernel/early_res.c
+++ /dev/null
@@ -1,590 +0,0 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10#include <linux/slab.h>
11#include <linux/kmemleak.h>
12
13/*
14 * Early reserved memory areas.
15 */
16/*
17 * need to make sure this one is big enough before
18 * find_fw_memmap_area could be used
19 */
20#define MAX_EARLY_RES_X 32
21
22struct early_res {
23 u64 start, end;
24 char name[15];
25 char overlap_ok;
26};
27static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
28
29static int max_early_res __initdata = MAX_EARLY_RES_X;
30static struct early_res *early_res __initdata = &early_res_x[0];
31static int early_res_count __initdata;
32
33static int __init find_overlapped_early(u64 start, u64 end)
34{
35 int i;
36 struct early_res *r;
37
38 for (i = 0; i < max_early_res && early_res[i].end; i++) {
39 r = &early_res[i];
40 if (end > r->start && start < r->end)
41 break;
42 }
43
44 return i;
45}
46
47/*
48 * Drop the i-th range from the early reservation map,
49 * by copying any higher ranges down one over it, and
50 * clearing what had been the last slot.
51 */
52static void __init drop_range(int i)
53{
54 int j;
55
56 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
57 ;
58
59 memmove(&early_res[i], &early_res[i + 1],
60 (j - 1 - i) * sizeof(struct early_res));
61
62 early_res[j - 1].end = 0;
63 early_res_count--;
64}
65
66static void __init drop_range_partial(int i, u64 start, u64 end)
67{
68 u64 common_start, common_end;
69 u64 old_start, old_end;
70
71 old_start = early_res[i].start;
72 old_end = early_res[i].end;
73 common_start = max(old_start, start);
74 common_end = min(old_end, end);
75
76 /* no overlap ? */
77 if (common_start >= common_end)
78 return;
79
80 if (old_start < common_start) {
81 /* make head segment */
82 early_res[i].end = common_start;
83 if (old_end > common_end) {
84 char name[15];
85
86 /*
87 * Save a local copy of the name, since the
88 * early_res array could get resized inside
89 * reserve_early_without_check() ->
90 * __check_and_double_early_res(), which would
91 * make the current name pointer invalid.
92 */
93 strncpy(name, early_res[i].name,
94 sizeof(early_res[i].name) - 1);
95 /* add another for left over on tail */
96 reserve_early_without_check(common_end, old_end, name);
97 }
98 return;
99 } else {
100 if (old_end > common_end) {
101 /* reuse the entry for tail left */
102 early_res[i].start = common_end;
103 return;
104 }
105 /* all covered */
106 drop_range(i);
107 }
108}
109
110/*
111 * Split any existing ranges that:
112 * 1) are marked 'overlap_ok', and
113 * 2) overlap with the stated range [start, end)
114 * into whatever portion (if any) of the existing range is entirely
115 * below or entirely above the stated range. Drop the portion
116 * of the existing range that overlaps with the stated range,
117 * which will allow the caller of this routine to then add that
118 * stated range without conflicting with any existing range.
119 */
120static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
121{
122 int i;
123 struct early_res *r;
124 u64 lower_start, lower_end;
125 u64 upper_start, upper_end;
126 char name[15];
127
128 for (i = 0; i < max_early_res && early_res[i].end; i++) {
129 r = &early_res[i];
130
131 /* Continue past non-overlapping ranges */
132 if (end <= r->start || start >= r->end)
133 continue;
134
135 /*
136 * Leave non-ok overlaps as is; let caller
137 * panic "Overlapping early reservations"
138 * when it hits this overlap.
139 */
140 if (!r->overlap_ok)
141 return;
142
143 /*
144 * We have an ok overlap. We will drop it from the early
145 * reservation map, and add back in any non-overlapping
146 * portions (lower or upper) as separate, overlap_ok,
147 * non-overlapping ranges.
148 */
149
150 /* 1. Note any non-overlapping (lower or upper) ranges. */
151 strncpy(name, r->name, sizeof(name) - 1);
152
153 lower_start = lower_end = 0;
154 upper_start = upper_end = 0;
155 if (r->start < start) {
156 lower_start = r->start;
157 lower_end = start;
158 }
159 if (r->end > end) {
160 upper_start = end;
161 upper_end = r->end;
162 }
163
164 /* 2. Drop the original ok overlapping range */
165 drop_range(i);
166
167 i--; /* resume for-loop on copied down entry */
168
169 /* 3. Add back in any non-overlapping ranges. */
170 if (lower_end)
171 reserve_early_overlap_ok(lower_start, lower_end, name);
172 if (upper_end)
173 reserve_early_overlap_ok(upper_start, upper_end, name);
174 }
175}
176
177static void __init __reserve_early(u64 start, u64 end, char *name,
178 int overlap_ok)
179{
180 int i;
181 struct early_res *r;
182
183 i = find_overlapped_early(start, end);
184 if (i >= max_early_res)
185 panic("Too many early reservations");
186 r = &early_res[i];
187 if (r->end)
188 panic("Overlapping early reservations "
189 "%llx-%llx %s to %llx-%llx %s\n",
190 start, end - 1, name ? name : "", r->start,
191 r->end - 1, r->name);
192 r->start = start;
193 r->end = end;
194 r->overlap_ok = overlap_ok;
195 if (name)
196 strncpy(r->name, name, sizeof(r->name) - 1);
197 early_res_count++;
198}
199
200/*
201 * A few early reservations come here.
202 *
203 * The 'overlap_ok' in the name of this routine does -not- mean it
204 * is ok for these reservations to overlap an earlier reservation.
205 * Rather it means that it is ok for subsequent reservations to
206 * overlap this one.
207 *
208 * Use this entry point to reserve early ranges when you are doing
209 * so out of "Paranoia", reserving perhaps more memory than you need,
210 * just in case, and don't mind a subsequent overlapping reservation
211 * that is known to be needed.
212 *
213 * The drop_overlaps_that_are_ok() call here isn't really needed.
214 * It would be needed if we had two colliding 'overlap_ok'
215 * reservations, so that the second such would not panic on the
216 * overlap with the first. We don't have any such as of this
217 * writing, but might as well tolerate such if it happens in
218 * the future.
219 */
220void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
221{
222 drop_overlaps_that_are_ok(start, end);
223 __reserve_early(start, end, name, 1);
224}
225
226static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
227{
228 u64 start, end, size, mem;
229 struct early_res *new;
230
231 /* do we have enough slots left ? */
232 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
233 return;
234
235 /* double it */
236 mem = -1ULL;
237 size = sizeof(struct early_res) * max_early_res * 2;
238 if (early_res == early_res_x)
239 start = 0;
240 else
241 start = early_res[0].end;
242 end = ex_start;
243 if (start + size < end)
244 mem = find_fw_memmap_area(start, end, size,
245 sizeof(struct early_res));
246 if (mem == -1ULL) {
247 start = ex_end;
248 end = get_max_mapped();
249 if (start + size < end)
250 mem = find_fw_memmap_area(start, end, size,
251 sizeof(struct early_res));
252 }
253 if (mem == -1ULL)
254 panic("can not find more space for early_res array");
255
256 new = __va(mem);
257 /* save the first one for own */
258 new[0].start = mem;
259 new[0].end = mem + size;
260 new[0].overlap_ok = 0;
261 /* copy old to new */
262 if (early_res == early_res_x) {
263 memcpy(&new[1], &early_res[0],
264 sizeof(struct early_res) * max_early_res);
265 memset(&new[max_early_res+1], 0,
266 sizeof(struct early_res) * (max_early_res - 1));
267 early_res_count++;
268 } else {
269 memcpy(&new[1], &early_res[1],
270 sizeof(struct early_res) * (max_early_res - 1));
271 memset(&new[max_early_res], 0,
272 sizeof(struct early_res) * max_early_res);
273 }
274 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
275 early_res = new;
276 max_early_res *= 2;
277 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
278 max_early_res, mem, mem + size - 1);
279}
280
281/*
282 * Most early reservations come here.
283 *
284 * We first have drop_overlaps_that_are_ok() drop any pre-existing
285 * 'overlap_ok' ranges, so that we can then reserve this memory
286 * range without risk of panic'ing on an overlapping overlap_ok
287 * early reservation.
288 */
289void __init reserve_early(u64 start, u64 end, char *name)
290{
291 if (start >= end)
292 return;
293
294 __check_and_double_early_res(start, end);
295
296 drop_overlaps_that_are_ok(start, end);
297 __reserve_early(start, end, name, 0);
298}
299
300void __init reserve_early_without_check(u64 start, u64 end, char *name)
301{
302 struct early_res *r;
303
304 if (start >= end)
305 return;
306
307 __check_and_double_early_res(start, end);
308
309 r = &early_res[early_res_count];
310
311 r->start = start;
312 r->end = end;
313 r->overlap_ok = 0;
314 if (name)
315 strncpy(r->name, name, sizeof(r->name) - 1);
316 early_res_count++;
317}
318
319void __init free_early(u64 start, u64 end)
320{
321 struct early_res *r;
322 int i;
323
324 kmemleak_free_part(__va(start), end - start);
325
326 i = find_overlapped_early(start, end);
327 r = &early_res[i];
328 if (i >= max_early_res || r->end != end || r->start != start)
329 panic("free_early on not reserved area: %llx-%llx!",
330 start, end - 1);
331
332 drop_range(i);
333}
334
335void __init free_early_partial(u64 start, u64 end)
336{
337 struct early_res *r;
338 int i;
339
340 kmemleak_free_part(__va(start), end - start);
341
342 if (start == end)
343 return;
344
345 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
346 return;
347
348try_next:
349 i = find_overlapped_early(start, end);
350 if (i >= max_early_res)
351 return;
352
353 r = &early_res[i];
354 /* hole ? */
355 if (r->end >= end && r->start <= start) {
356 drop_range_partial(i, start, end);
357 return;
358 }
359
360 drop_range_partial(i, start, end);
361 goto try_next;
362}
363
364#ifdef CONFIG_NO_BOOTMEM
365static void __init subtract_early_res(struct range *range, int az)
366{
367 int i, count;
368 u64 final_start, final_end;
369 int idx = 0;
370
371 count = 0;
372 for (i = 0; i < max_early_res && early_res[i].end; i++)
373 count++;
374
375 /* need to skip first one ?*/
376 if (early_res != early_res_x)
377 idx = 1;
378
379#define DEBUG_PRINT_EARLY_RES 1
380
381#if DEBUG_PRINT_EARLY_RES
382 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
383#endif
384 for (i = idx; i < count; i++) {
385 struct early_res *r = &early_res[i];
386#if DEBUG_PRINT_EARLY_RES
387 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
388 r->start, r->end, r->name);
389#endif
390 final_start = PFN_DOWN(r->start);
391 final_end = PFN_UP(r->end);
392 if (final_start >= final_end)
393 continue;
394 subtract_range(range, az, final_start, final_end);
395 }
396
397}
398
399int __init get_free_all_memory_range(struct range **rangep, int nodeid)
400{
401 int i, count;
402 u64 start = 0, end;
403 u64 size;
404 u64 mem;
405 struct range *range;
406 int nr_range;
407
408 count = 0;
409 for (i = 0; i < max_early_res && early_res[i].end; i++)
410 count++;
411
412 count *= 2;
413
414 size = sizeof(struct range) * count;
415 end = get_max_mapped();
416#ifdef MAX_DMA32_PFN
417 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
418 start = MAX_DMA32_PFN << PAGE_SHIFT;
419#endif
420 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
421 if (mem == -1ULL)
422 panic("can not find more space for range free");
423
424 range = __va(mem);
425 /* use early_node_map[] and early_res to get range array at first */
426 memset(range, 0, size);
427 nr_range = 0;
428
429 /* need to go over early_node_map to find out good range for node */
430 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
431#ifdef CONFIG_X86_32
432 subtract_range(range, count, max_low_pfn, -1ULL);
433#endif
434 subtract_early_res(range, count);
435 nr_range = clean_sort_range(range, count);
436
437 /* need to clear it ? */
438 if (nodeid == MAX_NUMNODES) {
439 memset(&early_res[0], 0,
440 sizeof(struct early_res) * max_early_res);
441 early_res = NULL;
442 max_early_res = 0;
443 }
444
445 *rangep = range;
446 return nr_range;
447}
448#else
449void __init early_res_to_bootmem(u64 start, u64 end)
450{
451 int i, count;
452 u64 final_start, final_end;
453 int idx = 0;
454
455 count = 0;
456 for (i = 0; i < max_early_res && early_res[i].end; i++)
457 count++;
458
459 /* need to skip first one ?*/
460 if (early_res != early_res_x)
461 idx = 1;
462
463 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
464 count - idx, max_early_res, start, end);
465 for (i = idx; i < count; i++) {
466 struct early_res *r = &early_res[i];
467 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
468 r->start, r->end, r->name);
469 final_start = max(start, r->start);
470 final_end = min(end, r->end);
471 if (final_start >= final_end) {
472 printk(KERN_CONT "\n");
473 continue;
474 }
475 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
476 final_start, final_end);
477 reserve_bootmem_generic(final_start, final_end - final_start,
478 BOOTMEM_DEFAULT);
479 }
480 /* clear them */
481 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
482 early_res = NULL;
483 max_early_res = 0;
484 early_res_count = 0;
485}
486#endif
487
488/* Check for already reserved areas */
489static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
490{
491 int i;
492 u64 addr = *addrp;
493 int changed = 0;
494 struct early_res *r;
495again:
496 i = find_overlapped_early(addr, addr + size);
497 r = &early_res[i];
498 if (i < max_early_res && r->end) {
499 *addrp = addr = round_up(r->end, align);
500 changed = 1;
501 goto again;
502 }
503 return changed;
504}
505
506/* Check for already reserved areas */
507static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
508{
509 int i;
510 u64 addr = *addrp, last;
511 u64 size = *sizep;
512 int changed = 0;
513again:
514 last = addr + size;
515 for (i = 0; i < max_early_res && early_res[i].end; i++) {
516 struct early_res *r = &early_res[i];
517 if (last > r->start && addr < r->start) {
518 size = r->start - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last > r->end && addr < r->end) {
523 addr = round_up(r->end, align);
524 size = last - addr;
525 changed = 1;
526 goto again;
527 }
528 if (last <= r->end && addr >= r->start) {
529 (*sizep)++;
530 return 0;
531 }
532 }
533 if (changed) {
534 *addrp = addr;
535 *sizep = size;
536 }
537 return changed;
538}
539
540/*
541 * Find a free area with specified alignment in a specific range.
542 * Only the area between start and end is an active range from early_node_map,
543 * so it is usable as RAM.
544 */
545u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
546 u64 size, u64 align)
547{
548 u64 addr, last;
549
550 addr = round_up(ei_start, align);
551 if (addr < start)
552 addr = round_up(start, align);
553 if (addr >= ei_last)
554 goto out;
555 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
556 ;
557 last = addr + size;
558 if (last > ei_last)
559 goto out;
560 if (last > end)
561 goto out;
562
563 return addr;
564
565out:
566 return -1ULL;
567}
568
569u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
570 u64 *sizep, u64 align)
571{
572 u64 addr, last;
573
574 addr = round_up(ei_start, align);
575 if (addr < start)
576 addr = round_up(start, align);
577 if (addr >= ei_last)
578 goto out;
579 *sizep = ei_last - addr;
580 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
581 ;
582 last = addr + *sizep;
583 if (last > ei_last)
584 goto out;
585
586 return addr;
587
588out:
589 return -1ULL;
590}
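To summarize the semantics of the interface deleted above: reserve_early() panics on a conflicting overlap unless the earlier range was registered with reserve_early_overlap_ok(), in which case drop_overlaps_that_are_ok() silently splits the overlapping piece away first, and free_early() must name a reserved range exactly. A small usage sketch, using only the functions defined above; addresses and names are invented for illustration:

	/* "paranoia" range: later reservations are allowed to overlap it */
	reserve_early_overlap_ok(0x100000, 0x200000, "SETUP-PARANOIA");

	/* overlaps the range above: allowed, the overlap_ok piece is split */
	reserve_early(0x180000, 0x1c0000, "INITRD");

	/* must match a reservation exactly, otherwise free_early() panics */
	free_early(0x180000, 0x1c0000);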
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..1ce23d3d8394
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,6 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg
3endif
4
5obj-y := core.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/perf_event.c b/kernel/events/core.c
index b98bed3d8182..9efe7108ccaf 100644
--- a/kernel/perf_event.c
+++ b/kernel/events/core.c
@@ -2,8 +2,8 @@
2 * Performance events core code: 2 * Performance events core code:
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
@@ -13,6 +13,7 @@
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/idr.h>
16#include <linux/file.h> 17#include <linux/file.h>
17#include <linux/poll.h> 18#include <linux/poll.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
@@ -21,7 +22,9 @@
21#include <linux/dcache.h> 22#include <linux/dcache.h>
22#include <linux/percpu.h> 23#include <linux/percpu.h>
23#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/reboot.h>
24#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h>
25#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
26#include <linux/hardirq.h> 29#include <linux/hardirq.h>
27#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -35,20 +38,104 @@
35 38
36#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
37 40
38/* 41struct remote_function_call {
39 * Each CPU has a list of per CPU events: 42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
69 * be on the current CPU, which just calls the function directly
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
40 */ 74 */
41static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
42 84
43int perf_max_events __read_mostly = 1; 85 if (task_curr(p))
44static int perf_reserved_percpu __read_mostly; 86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
45static int perf_overcommit __read_mostly = 1; 87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
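The two helpers above are the pattern the rest of this file is converted to: run a callback on a given CPU, or on whichever CPU a task is currently running on, and return -ENXIO/-ESRCH/-EAGAIN so the caller can fall back or retry under ctx->lock. A minimal, hypothetical sketch of a caller; the callback and the retry policy are illustrative only:

static int touch_ctx(void *info)
{
	/* runs on the target CPU with interrupts disabled */
	struct perf_event_context *ctx = info;

	raw_spin_lock(&ctx->lock);
	/* ... operate on ctx while it cannot be switched away ... */
	raw_spin_unlock(&ctx->lock);
	return 0;
}

static void example_caller(struct task_struct *task, int cpu,
			   struct perf_event_context *ctx)
{
	/* per-CPU case: stays -ENXIO if the CPU is offline */
	if (cpu_function_call(cpu, touch_ctx, ctx))
		return;

	/* per-task case: -ESRCH/-EAGAIN mean the task is not (or no
	 * longer) running there; callers below retry under ctx->lock */
	if (task_function_call(task, touch_ctx, ctx))
		return;
}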
113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
118enum event_type_t {
119 EVENT_FLEXIBLE = 0x1,
120 EVENT_PINNED = 0x2,
121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
122};
123
124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128struct jump_label_key perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
46 130
47static atomic_t nr_events __read_mostly;
48static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
51 134
135static LIST_HEAD(pmus);
136static DEFINE_MUTEX(pmus_lock);
137static struct srcu_struct pmus_srcu;
138
52/* 139/*
53 * perf event paranoia level: 140 * perf event paranoia level:
54 * -1 - not paranoid at all 141 * -1 - not paranoid at all
@@ -58,58 +145,445 @@ static atomic_t nr_task_events __read_mostly;
58 */ 145 */
59int sysctl_perf_event_paranoid __read_mostly = 1; 146int sysctl_perf_event_paranoid __read_mostly = 1;
60 147
61int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 148/* Minimum for 512 kiB + 1 user control page */
149int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
62 150
63/* 151/*
64 * max perf event sample rate 152 * max perf event sample rate
65 */ 153 */
66int sysctl_perf_event_sample_rate __read_mostly = 100000; 154#define DEFAULT_MAX_SAMPLE_RATE 100000
155int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
156static int max_samples_per_tick __read_mostly =
157 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
158
159int perf_proc_update_handler(struct ctl_table *table, int write,
160 void __user *buffer, size_t *lenp,
161 loff_t *ppos)
162{
163 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
164
165 if (ret || !write)
166 return ret;
167
168 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
169
170 return 0;
171}
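A worked example of the handler above (the values are assumed, not taken from this commit): with HZ=1000 and the default sample rate of 100000, max_samples_per_tick = DIV_ROUND_UP(100000, 1000) = 100; writing 50000 to the sysctl lowers the per-tick budget to 50, and writing 250 leaves DIV_ROUND_UP(250, 1000) = 1.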
67 172
68static atomic64_t perf_event_id; 173static atomic64_t perf_event_id;
69 174
175static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
176 enum event_type_t event_type);
177
178static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
179 enum event_type_t event_type,
180 struct task_struct *task);
181
182static void update_context_time(struct perf_event_context *ctx);
183static u64 perf_event_time(struct perf_event *event);
184
185void __weak perf_event_print_debug(void) { }
186
187extern __weak const char *perf_pmu_name(void)
188{
189 return "pmu";
190}
191
192static inline u64 perf_clock(void)
193{
194 return local_clock();
195}
196
197static inline struct perf_cpu_context *
198__get_cpu_context(struct perf_event_context *ctx)
199{
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201}
202
203#ifdef CONFIG_CGROUP_PERF
204
70/* 205/*
71 * Lock for (sysadmin-configurable) event reservations: 206 * Must ensure cgroup is pinned (css_get) before calling
207 * this function. In other words, we cannot call this function
208 * if there is no cgroup event for the current CPU context.
72 */ 209 */
73static DEFINE_SPINLOCK(perf_resource_lock); 210static inline struct perf_cgroup *
211perf_cgroup_from_task(struct task_struct *task)
212{
213 return container_of(task_subsys_state(task, perf_subsys_id),
214 struct perf_cgroup, css);
215}
216
217static inline bool
218perf_cgroup_match(struct perf_event *event)
219{
220 struct perf_event_context *ctx = event->ctx;
221 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
222
223 return !event->cgrp || event->cgrp == cpuctx->cgrp;
224}
225
226static inline void perf_get_cgroup(struct perf_event *event)
227{
228 css_get(&event->cgrp->css);
229}
230
231static inline void perf_put_cgroup(struct perf_event *event)
232{
233 css_put(&event->cgrp->css);
234}
235
236static inline void perf_detach_cgroup(struct perf_event *event)
237{
238 perf_put_cgroup(event);
239 event->cgrp = NULL;
240}
241
242static inline int is_cgroup_event(struct perf_event *event)
243{
244 return event->cgrp != NULL;
245}
246
247static inline u64 perf_cgroup_event_time(struct perf_event *event)
248{
249 struct perf_cgroup_info *t;
250
251 t = per_cpu_ptr(event->cgrp->info, event->cpu);
252 return t->time;
253}
254
255static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
256{
257 struct perf_cgroup_info *info;
258 u64 now;
259
260 now = perf_clock();
261
262 info = this_cpu_ptr(cgrp->info);
263
264 info->time += now - info->timestamp;
265 info->timestamp = now;
266}
267
268static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
269{
270 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
271 if (cgrp_out)
272 __update_cgrp_time(cgrp_out);
273}
274
275static inline void update_cgrp_time_from_event(struct perf_event *event)
276{
277 struct perf_cgroup *cgrp;
278
279 /*
280 * ensure we access cgroup data only when needed and
281 * when we know the cgroup is pinned (css_get)
282 */
283 if (!is_cgroup_event(event))
284 return;
285
286 cgrp = perf_cgroup_from_task(current);
287 /*
288 * Do not update time when cgroup is not active
289 */
290 if (cgrp == event->cgrp)
291 __update_cgrp_time(event->cgrp);
292}
293
294static inline void
295perf_cgroup_set_timestamp(struct task_struct *task,
296 struct perf_event_context *ctx)
297{
298 struct perf_cgroup *cgrp;
299 struct perf_cgroup_info *info;
300
301 /*
302 * ctx->lock held by caller
303 * ensure we do not access cgroup data
304 * unless we have the cgroup pinned (css_get)
305 */
306 if (!task || !ctx->nr_cgroups)
307 return;
308
309 cgrp = perf_cgroup_from_task(task);
310 info = this_cpu_ptr(cgrp->info);
311 info->timestamp = ctx->timestamp;
312}
313
314#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
315#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
74 316
75/* 317/*
76 * Architecture provided APIs - weak aliases: 318 * reschedule events based on the cgroup constraint of task.
319 *
320 * mode SWOUT : schedule out everything
321 * mode SWIN : schedule in based on cgroup for next
77 */ 322 */
78extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) 323void perf_cgroup_switch(struct task_struct *task, int mode)
79{ 324{
80 return NULL; 325 struct perf_cpu_context *cpuctx;
326 struct pmu *pmu;
327 unsigned long flags;
328
329 /*
330 * disable interrupts to avoid getting nr_cgroup
331 * changes via __perf_event_disable(). Also
332 * avoids preemption.
333 */
334 local_irq_save(flags);
335
336 /*
337 * we reschedule only in the presence of cgroup
338 * constrained events.
339 */
340 rcu_read_lock();
341
342 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /*
349 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events.
351 *
352 * ctx->nr_cgroups reports the number of cgroup
353 * events for a context.
354 */
355 if (cpuctx->ctx.nr_cgroups > 0) {
356
357 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
359 /*
360 * must not be done before ctxswout due
361 * to event_filter_match() in event_sched_out()
362 */
363 cpuctx->cgrp = NULL;
364 }
365
366 if (mode & PERF_CGROUP_SWIN) {
367 WARN_ON_ONCE(cpuctx->cgrp);
368 /* set cgrp before ctxsw in to
369 * allow event_filter_match() to not
370 * have to pass task around
371 */
372 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 }
375 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 }
379
380 rcu_read_unlock();
381
382 local_irq_restore(flags);
81} 383}
82 384
83void __weak hw_perf_disable(void) { barrier(); } 385static inline void perf_cgroup_sched_out(struct task_struct *task)
84void __weak hw_perf_enable(void) { barrier(); } 386{
387 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
388}
85 389
86void __weak perf_event_print_debug(void) { } 390static inline void perf_cgroup_sched_in(struct task_struct *task)
391{
392 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
393}
394
395static inline int perf_cgroup_connect(int fd, struct perf_event *event,
396 struct perf_event_attr *attr,
397 struct perf_event *group_leader)
398{
399 struct perf_cgroup *cgrp;
400 struct cgroup_subsys_state *css;
401 struct file *file;
402 int ret = 0, fput_needed;
403
404 file = fget_light(fd, &fput_needed);
405 if (!file)
406 return -EBADF;
407
408 css = cgroup_css_from_dir(file, perf_subsys_id);
409 if (IS_ERR(css)) {
410 ret = PTR_ERR(css);
411 goto out;
412 }
413
414 cgrp = container_of(css, struct perf_cgroup, css);
415 event->cgrp = cgrp;
416
417 /* must be done before we fput() the file */
418 perf_get_cgroup(event);
87 419
88static DEFINE_PER_CPU(int, perf_disable_count); 420 /*
421 * all events in a group must monitor
422 * the same cgroup because a task belongs
423 * to only one perf cgroup at a time
424 */
425 if (group_leader && group_leader->cgrp != cgrp) {
426 perf_detach_cgroup(event);
427 ret = -EINVAL;
428 }
429out:
430 fput_light(file, fput_needed);
431 return ret;
432}
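A hypothetical userspace sketch of how the cgroup mode wired up here is meant to be driven, inferred from perf_cgroup_connect() taking a file descriptor and from the PERF_FLAG_PID_CGROUP flag defined earlier in this file; the cgroup path and attr setup are illustrative, and the raw syscall is used since there is no libc wrapper:

	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
		.size	= sizeof(attr),
	};
	int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	int ev_fd = syscall(__NR_perf_event_open, &attr,
			    cgrp_fd,	/* with PERF_FLAG_PID_CGROUP, "pid" is a cgroup dir fd */
			    0,		/* cgroup events are per-CPU, so cpu >= 0 */
			    -1,		/* no group leader */
			    PERF_FLAG_PID_CGROUP);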
89 433
90void perf_disable(void) 434static inline void
435perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
91{ 436{
92 if (!__get_cpu_var(perf_disable_count)++) 437 struct perf_cgroup_info *t;
93 hw_perf_disable(); 438 t = per_cpu_ptr(event->cgrp->info, event->cpu);
439 event->shadow_ctx_time = now - t->timestamp;
94} 440}
95 441
96void perf_enable(void) 442static inline void
443perf_cgroup_defer_enabled(struct perf_event *event)
97{ 444{
98 if (!--__get_cpu_var(perf_disable_count)) 445 /*
99 hw_perf_enable(); 446 * when the current task's perf cgroup does not match
447 * the event's, we need to remember to call the
448 * perf_mark_enable() function the first time a task with
449 * a matching perf cgroup is scheduled in.
450 */
451 if (is_cgroup_event(event) && !perf_cgroup_match(event))
452 event->cgrp_defer_enabled = 1;
100} 453}
101 454
102static void get_ctx(struct perf_event_context *ctx) 455static inline void
456perf_cgroup_mark_enabled(struct perf_event *event,
457 struct perf_event_context *ctx)
103{ 458{
104 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 459 struct perf_event *sub;
460 u64 tstamp = perf_event_time(event);
461
462 if (!event->cgrp_defer_enabled)
463 return;
464
465 event->cgrp_defer_enabled = 0;
466
467 event->tstamp_enabled = tstamp - event->total_time_enabled;
468 list_for_each_entry(sub, &event->sibling_list, group_entry) {
469 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
470 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
471 sub->cgrp_defer_enabled = 0;
472 }
473 }
105} 474}
475#else /* !CONFIG_CGROUP_PERF */
106 476
107static void free_ctx(struct rcu_head *head) 477static inline bool
478perf_cgroup_match(struct perf_event *event)
108{ 479{
109 struct perf_event_context *ctx; 480 return true;
481}
482
483static inline void perf_detach_cgroup(struct perf_event *event)
484{}
485
486static inline int is_cgroup_event(struct perf_event *event)
487{
488 return 0;
489}
490
491static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
492{
493 return 0;
494}
495
496static inline void update_cgrp_time_from_event(struct perf_event *event)
497{
498}
110 499
111 ctx = container_of(head, struct perf_event_context, rcu_head); 500static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
112 kfree(ctx); 501{
502}
503
504static inline void perf_cgroup_sched_out(struct task_struct *task)
505{
506}
507
508static inline void perf_cgroup_sched_in(struct task_struct *task)
509{
510}
511
512static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
513 struct perf_event_attr *attr,
514 struct perf_event *group_leader)
515{
516 return -EINVAL;
517}
518
519static inline void
520perf_cgroup_set_timestamp(struct task_struct *task,
521 struct perf_event_context *ctx)
522{
523}
524
525void
526perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
527{
528}
529
530static inline void
531perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
532{
533}
534
535static inline u64 perf_cgroup_event_time(struct perf_event *event)
536{
537 return 0;
538}
539
540static inline void
541perf_cgroup_defer_enabled(struct perf_event *event)
542{
543}
544
545static inline void
546perf_cgroup_mark_enabled(struct perf_event *event,
547 struct perf_event_context *ctx)
548{
549}
550#endif
551
552void perf_pmu_disable(struct pmu *pmu)
553{
554 int *count = this_cpu_ptr(pmu->pmu_disable_count);
555 if (!(*count)++)
556 pmu->pmu_disable(pmu);
557}
558
559void perf_pmu_enable(struct pmu *pmu)
560{
561 int *count = this_cpu_ptr(pmu->pmu_disable_count);
562 if (!--(*count))
563 pmu->pmu_enable(pmu);
564}
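The per-cpu pmu_disable_count above makes disable/enable reference counted, so sections can nest and only the outermost pair touches the hardware. A small sketch of the implied pairing; the pmu pointer is whatever the caller already holds:

	perf_pmu_disable(pmu);	/* count 0 -> 1: pmu->pmu_disable() runs */
	perf_pmu_disable(pmu);	/* count 1 -> 2: no hardware access */
	/* ... reprogram events ... */
	perf_pmu_enable(pmu);	/* count 2 -> 1: still disabled */
	perf_pmu_enable(pmu);	/* count 1 -> 0: pmu->pmu_enable() runs */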
565
566static DEFINE_PER_CPU(struct list_head, rotation_list);
567
568/*
569 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
570 * because they're strictly cpu affine and rotate_start is called with IRQs
571 * disabled, while rotate_context is called from IRQ context.
572 */
573static void perf_pmu_rotate_start(struct pmu *pmu)
574{
575 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
576 struct list_head *head = &__get_cpu_var(rotation_list);
577
578 WARN_ON(!irqs_disabled());
579
580 if (list_empty(&cpuctx->rotation_list))
581 list_add(&cpuctx->rotation_list, head);
582}
583
584static void get_ctx(struct perf_event_context *ctx)
585{
586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
113} 587}
114 588
115static void put_ctx(struct perf_event_context *ctx) 589static void put_ctx(struct perf_event_context *ctx)
@@ -119,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx)
119 put_ctx(ctx->parent_ctx); 593 put_ctx(ctx->parent_ctx);
120 if (ctx->task) 594 if (ctx->task)
121 put_task_struct(ctx->task); 595 put_task_struct(ctx->task);
122 call_rcu(&ctx->rcu_head, free_ctx); 596 kfree_rcu(ctx, rcu_head);
123 } 597 }
124} 598}
125 599
@@ -131,6 +605,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
131 } 605 }
132} 606}
133 607
608static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
609{
610 /*
611 * only top level events have the pid namespace they were created in
612 */
613 if (event->parent)
614 event = event->parent;
615
616 return task_tgid_nr_ns(p, event->ns);
617}
618
619static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
620{
621 /*
622 * only top level events have the pid namespace they were created in
623 */
624 if (event->parent)
625 event = event->parent;
626
627 return task_pid_nr_ns(p, event->ns);
628}
629
134/* 630/*
135 * If we inherit events we want to return the parent event id 631 * If we inherit events we want to return the parent event id
136 * to userspace. 632 * to userspace.
@@ -151,13 +647,13 @@ static u64 primary_event_id(struct perf_event *event)
151 * the context could get moved to another task. 647 * the context could get moved to another task.
152 */ 648 */
153static struct perf_event_context * 649static struct perf_event_context *
154perf_lock_task_context(struct task_struct *task, unsigned long *flags) 650perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
155{ 651{
156 struct perf_event_context *ctx; 652 struct perf_event_context *ctx;
157 653
158 rcu_read_lock(); 654 rcu_read_lock();
159 retry: 655retry:
160 ctx = rcu_dereference(task->perf_event_ctxp); 656 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
161 if (ctx) { 657 if (ctx) {
162 /* 658 /*
163 * If this context is a clone of another, it might 659 * If this context is a clone of another, it might
@@ -170,7 +666,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
170 * can't get swapped on us any more. 666 * can't get swapped on us any more.
171 */ 667 */
172 raw_spin_lock_irqsave(&ctx->lock, *flags); 668 raw_spin_lock_irqsave(&ctx->lock, *flags);
173 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 669 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
174 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 670 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
175 goto retry; 671 goto retry;
176 } 672 }
@@ -189,12 +685,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189 * can't get swapped to another task. This also increments its 685 * can't get swapped to another task. This also increments its
190 * reference count so that the context can't get freed. 686 * reference count so that the context can't get freed.
191 */ 687 */
192static struct perf_event_context *perf_pin_task_context(struct task_struct *task) 688static struct perf_event_context *
689perf_pin_task_context(struct task_struct *task, int ctxn)
193{ 690{
194 struct perf_event_context *ctx; 691 struct perf_event_context *ctx;
195 unsigned long flags; 692 unsigned long flags;
196 693
197 ctx = perf_lock_task_context(task, &flags); 694 ctx = perf_lock_task_context(task, ctxn, &flags);
198 if (ctx) { 695 if (ctx) {
199 ++ctx->pin_count; 696 ++ctx->pin_count;
200 raw_spin_unlock_irqrestore(&ctx->lock, flags); 697 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -209,12 +706,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
209 raw_spin_lock_irqsave(&ctx->lock, flags); 706 raw_spin_lock_irqsave(&ctx->lock, flags);
210 --ctx->pin_count; 707 --ctx->pin_count;
211 raw_spin_unlock_irqrestore(&ctx->lock, flags); 708 raw_spin_unlock_irqrestore(&ctx->lock, flags);
212 put_ctx(ctx);
213}
214
215static inline u64 perf_clock(void)
216{
217 return local_clock();
218} 709}
219 710
220/* 711/*
@@ -228,6 +719,16 @@ static void update_context_time(struct perf_event_context *ctx)
228 ctx->timestamp = now; 719 ctx->timestamp = now;
229} 720}
230 721
722static u64 perf_event_time(struct perf_event *event)
723{
724 struct perf_event_context *ctx = event->ctx;
725
726 if (is_cgroup_event(event))
727 return perf_cgroup_event_time(event);
728
729 return ctx ? ctx->time : 0;
730}
731
231/* 732/*
232 * Update the total_time_enabled and total_time_running fields for a event. 733 * Update the total_time_enabled and total_time_running fields for a event.
233 */ 734 */
@@ -239,8 +740,19 @@ static void update_event_times(struct perf_event *event)
239 if (event->state < PERF_EVENT_STATE_INACTIVE || 740 if (event->state < PERF_EVENT_STATE_INACTIVE ||
240 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 741 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
241 return; 742 return;
242 743 /*
243 if (ctx->is_active) 744 * in cgroup mode, time_enabled represents
745 * the time the event was enabled AND active
746 * tasks were in the monitored cgroup. This is
747 * independent of the activity of the context as
748 * there may be a mix of cgroup and non-cgroup events.
749 *
750 * That is why we treat cgroup events differently
751 * here.
752 */
753 if (is_cgroup_event(event))
754 run_end = perf_event_time(event);
755 else if (ctx->is_active)
244 run_end = ctx->time; 756 run_end = ctx->time;
245 else 757 else
246 run_end = event->tstamp_stopped; 758 run_end = event->tstamp_stopped;
@@ -250,9 +762,10 @@ static void update_event_times(struct perf_event *event)
250 if (event->state == PERF_EVENT_STATE_INACTIVE) 762 if (event->state == PERF_EVENT_STATE_INACTIVE)
251 run_end = event->tstamp_stopped; 763 run_end = event->tstamp_stopped;
252 else 764 else
253 run_end = ctx->time; 765 run_end = perf_event_time(event);
254 766
255 event->total_time_running = run_end - event->tstamp_running; 767 event->total_time_running = run_end - event->tstamp_running;
768
256} 769}
257 770
258/* 771/*
@@ -301,17 +814,102 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
301 list_add_tail(&event->group_entry, list); 814 list_add_tail(&event->group_entry, list);
302 } 815 }
303 816
817 if (is_cgroup_event(event))
818 ctx->nr_cgroups++;
819
304 list_add_rcu(&event->event_entry, &ctx->event_list); 820 list_add_rcu(&event->event_entry, &ctx->event_list);
821 if (!ctx->nr_events)
822 perf_pmu_rotate_start(ctx->pmu);
305 ctx->nr_events++; 823 ctx->nr_events++;
306 if (event->attr.inherit_stat) 824 if (event->attr.inherit_stat)
307 ctx->nr_stat++; 825 ctx->nr_stat++;
308} 826}
309 827
828/*
829 * Called at perf_event creation and when events are attached/detached from a
830 * group.
831 */
832static void perf_event__read_size(struct perf_event *event)
833{
834 int entry = sizeof(u64); /* value */
835 int size = 0;
836 int nr = 1;
837
838 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
839 size += sizeof(u64);
840
841 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
842 size += sizeof(u64);
843
844 if (event->attr.read_format & PERF_FORMAT_ID)
845 entry += sizeof(u64);
846
847 if (event->attr.read_format & PERF_FORMAT_GROUP) {
848 nr += event->group_leader->nr_siblings;
849 size += sizeof(u64);
850 }
851
852 size += entry * nr;
853 event->read_size = size;
854}
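A worked example of the sizing above (the read_format value is assumed for illustration): with read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID | PERF_FORMAT_GROUP and a leader with three siblings, entry = 8 + 8 = 16, nr = 1 + 3 = 4, and read_size = 8 (time_enabled) + 8 (the group's nr field) + 16 * 4 = 80 bytes.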
855
856static void perf_event__header_size(struct perf_event *event)
857{
858 struct perf_sample_data *data;
859 u64 sample_type = event->attr.sample_type;
860 u16 size = 0;
861
862 perf_event__read_size(event);
863
864 if (sample_type & PERF_SAMPLE_IP)
865 size += sizeof(data->ip);
866
867 if (sample_type & PERF_SAMPLE_ADDR)
868 size += sizeof(data->addr);
869
870 if (sample_type & PERF_SAMPLE_PERIOD)
871 size += sizeof(data->period);
872
873 if (sample_type & PERF_SAMPLE_READ)
874 size += event->read_size;
875
876 event->header_size = size;
877}
878
879static void perf_event__id_header_size(struct perf_event *event)
880{
881 struct perf_sample_data *data;
882 u64 sample_type = event->attr.sample_type;
883 u16 size = 0;
884
885 if (sample_type & PERF_SAMPLE_TID)
886 size += sizeof(data->tid_entry);
887
888 if (sample_type & PERF_SAMPLE_TIME)
889 size += sizeof(data->time);
890
891 if (sample_type & PERF_SAMPLE_ID)
892 size += sizeof(data->id);
893
894 if (sample_type & PERF_SAMPLE_STREAM_ID)
895 size += sizeof(data->stream_id);
896
897 if (sample_type & PERF_SAMPLE_CPU)
898 size += sizeof(data->cpu_entry);
899
900 event->id_header_size = size;
901}
902
310static void perf_group_attach(struct perf_event *event) 903static void perf_group_attach(struct perf_event *event)
311{ 904{
312 struct perf_event *group_leader = event->group_leader; 905 struct perf_event *group_leader = event->group_leader, *pos;
906
907 /*
908 * We can have double attach due to group movement in perf_event_open.
909 */
910 if (event->attach_state & PERF_ATTACH_GROUP)
911 return;
313 912
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
315 event->attach_state |= PERF_ATTACH_GROUP; 913 event->attach_state |= PERF_ATTACH_GROUP;
316 914
317 if (group_leader == event) 915 if (group_leader == event)
@@ -323,6 +921,11 @@ static void perf_group_attach(struct perf_event *event)
323 921
324 list_add_tail(&event->group_entry, &group_leader->sibling_list); 922 list_add_tail(&event->group_entry, &group_leader->sibling_list);
325 group_leader->nr_siblings++; 923 group_leader->nr_siblings++;
924
925 perf_event__header_size(group_leader);
926
927 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
928 perf_event__header_size(pos);
326} 929}
327 930
328/* 931/*
@@ -332,6 +935,7 @@ static void perf_group_attach(struct perf_event *event)
332static void 935static void
333list_del_event(struct perf_event *event, struct perf_event_context *ctx) 936list_del_event(struct perf_event *event, struct perf_event_context *ctx)
334{ 937{
938 struct perf_cpu_context *cpuctx;
335 /* 939 /*
336 * We can have double detach due to exit/hot-unplug + close. 940 * We can have double detach due to exit/hot-unplug + close.
337 */ 941 */
@@ -340,6 +944,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
340 944
341 event->attach_state &= ~PERF_ATTACH_CONTEXT; 945 event->attach_state &= ~PERF_ATTACH_CONTEXT;
342 946
947 if (is_cgroup_event(event)) {
948 ctx->nr_cgroups--;
949 cpuctx = __get_cpu_context(ctx);
950 /*
951 * if there are no more cgroup events
952 * then clear cgrp to avoid stale pointer
953 * in update_cgrp_time_from_cpuctx()
954 */
955 if (!ctx->nr_cgroups)
956 cpuctx->cgrp = NULL;
957 }
958
343 ctx->nr_events--; 959 ctx->nr_events--;
344 if (event->attr.inherit_stat) 960 if (event->attr.inherit_stat)
345 ctx->nr_stat--; 961 ctx->nr_stat--;
@@ -381,7 +997,7 @@ static void perf_group_detach(struct perf_event *event)
381 if (event->group_leader != event) { 997 if (event->group_leader != event) {
382 list_del_init(&event->group_entry); 998 list_del_init(&event->group_entry);
383 event->group_leader->nr_siblings--; 999 event->group_leader->nr_siblings--;
384 return; 1000 goto out;
385 } 1001 }
386 1002
387 if (!list_empty(&event->group_entry)) 1003 if (!list_empty(&event->group_entry))
@@ -400,12 +1016,19 @@ static void perf_group_detach(struct perf_event *event)
400 /* Inherit group flags from the previous leader */ 1016 /* Inherit group flags from the previous leader */
401 sibling->group_flags = event->group_flags; 1017 sibling->group_flags = event->group_flags;
402 } 1018 }
1019
1020out:
1021 perf_event__header_size(event->group_leader);
1022
1023 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1024 perf_event__header_size(tmp);
403} 1025}
404 1026
405static inline int 1027static inline int
406event_filter_match(struct perf_event *event) 1028event_filter_match(struct perf_event *event)
407{ 1029{
408 return event->cpu == -1 || event->cpu == smp_processor_id(); 1030 return (event->cpu == -1 || event->cpu == smp_processor_id())
1031 && perf_cgroup_match(event);
409} 1032}
410 1033
411static void 1034static void
@@ -413,6 +1036,7 @@ event_sched_out(struct perf_event *event,
413 struct perf_cpu_context *cpuctx, 1036 struct perf_cpu_context *cpuctx,
414 struct perf_event_context *ctx) 1037 struct perf_event_context *ctx)
415{ 1038{
1039 u64 tstamp = perf_event_time(event);
416 u64 delta; 1040 u64 delta;
417 /* 1041 /*
418 * An event which could not be activated because of 1042 * An event which could not be activated because of
@@ -422,9 +1046,9 @@ event_sched_out(struct perf_event *event,
422 */ 1046 */
423 if (event->state == PERF_EVENT_STATE_INACTIVE 1047 if (event->state == PERF_EVENT_STATE_INACTIVE
424 && !event_filter_match(event)) { 1048 && !event_filter_match(event)) {
425 delta = ctx->time - event->tstamp_stopped; 1049 delta = tstamp - event->tstamp_stopped;
426 event->tstamp_running += delta; 1050 event->tstamp_running += delta;
427 event->tstamp_stopped = ctx->time; 1051 event->tstamp_stopped = tstamp;
428 } 1052 }
429 1053
430 if (event->state != PERF_EVENT_STATE_ACTIVE) 1054 if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -435,8 +1059,8 @@ event_sched_out(struct perf_event *event,
435 event->pending_disable = 0; 1059 event->pending_disable = 0;
436 event->state = PERF_EVENT_STATE_OFF; 1060 event->state = PERF_EVENT_STATE_OFF;
437 } 1061 }
438 event->tstamp_stopped = ctx->time; 1062 event->tstamp_stopped = tstamp;
439 event->pmu->disable(event); 1063 event->pmu->del(event, 0);
440 event->oncpu = -1; 1064 event->oncpu = -1;
441 1065
442 if (!is_software_event(event)) 1066 if (!is_software_event(event))
@@ -472,51 +1096,24 @@ group_sched_out(struct perf_event *group_event,
472 * We disable the event on the hardware level first. After that we 1096 * We disable the event on the hardware level first. After that we
473 * remove it from the context list. 1097 * remove it from the context list.
474 */ 1098 */
475static void __perf_event_remove_from_context(void *info) 1099static int __perf_remove_from_context(void *info)
476{ 1100{
477 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
478 struct perf_event *event = info; 1101 struct perf_event *event = info;
479 struct perf_event_context *ctx = event->ctx; 1102 struct perf_event_context *ctx = event->ctx;
480 1103 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
481 /*
482 * If this is a task context, we need to check whether it is
483 * the current task context of this cpu. If not it has been
484 * scheduled out before the smp call arrived.
485 */
486 if (ctx->task && cpuctx->task_ctx != ctx)
487 return;
488 1104
489 raw_spin_lock(&ctx->lock); 1105 raw_spin_lock(&ctx->lock);
490 /*
491 * Protect the list operation against NMI by disabling the
492 * events on a global level.
493 */
494 perf_disable();
495
496 event_sched_out(event, cpuctx, ctx); 1106 event_sched_out(event, cpuctx, ctx);
497
498 list_del_event(event, ctx); 1107 list_del_event(event, ctx);
499
500 if (!ctx->task) {
501 /*
502 * Allow more per task events with respect to the
503 * reservation:
504 */
505 cpuctx->max_pertask =
506 min(perf_max_events - ctx->nr_events,
507 perf_max_events - perf_reserved_percpu);
508 }
509
510 perf_enable();
511 raw_spin_unlock(&ctx->lock); 1108 raw_spin_unlock(&ctx->lock);
1109
1110 return 0;
512} 1111}
513 1112
514 1113
515/* 1114/*
516 * Remove the event from a task's (or a CPU's) list of events. 1115 * Remove the event from a task's (or a CPU's) list of events.
517 * 1116 *
518 * Must be called with ctx->mutex held.
519 *
520 * CPU events are removed with a smp call. For task events we only 1117 * CPU events are removed with a smp call. For task events we only
521 * call when the task is on a CPU. 1118 * call when the task is on a CPU.
522 * 1119 *
@@ -527,60 +1124,62 @@ static void __perf_event_remove_from_context(void *info)
527 * When called from perf_event_exit_task, it's OK because the 1124 * When called from perf_event_exit_task, it's OK because the
528 * context has been detached from its task. 1125 * context has been detached from its task.
529 */ 1126 */
530static void perf_event_remove_from_context(struct perf_event *event) 1127static void perf_remove_from_context(struct perf_event *event)
531{ 1128{
532 struct perf_event_context *ctx = event->ctx; 1129 struct perf_event_context *ctx = event->ctx;
533 struct task_struct *task = ctx->task; 1130 struct task_struct *task = ctx->task;
534 1131
1132 lockdep_assert_held(&ctx->mutex);
1133
535 if (!task) { 1134 if (!task) {
536 /* 1135 /*
537 * Per cpu events are removed via an smp call and 1136 * Per cpu events are removed via an smp call and
538 * the removal is always successful. 1137 * the removal is always successful.
539 */ 1138 */
540 smp_call_function_single(event->cpu, 1139 cpu_function_call(event->cpu, __perf_remove_from_context, event);
541 __perf_event_remove_from_context,
542 event, 1);
543 return; 1140 return;
544 } 1141 }
545 1142
546retry: 1143retry:
547 task_oncpu_function_call(task, __perf_event_remove_from_context, 1144 if (!task_function_call(task, __perf_remove_from_context, event))
548 event); 1145 return;
549 1146
550 raw_spin_lock_irq(&ctx->lock); 1147 raw_spin_lock_irq(&ctx->lock);
551 /* 1148 /*
552 * If the context is active we need to retry the smp call. 1149 * If we failed to find a running task, but find the context active now
1150 * that we've acquired the ctx->lock, retry.
553 */ 1151 */
554 if (ctx->nr_active && !list_empty(&event->group_entry)) { 1152 if (ctx->is_active) {
555 raw_spin_unlock_irq(&ctx->lock); 1153 raw_spin_unlock_irq(&ctx->lock);
556 goto retry; 1154 goto retry;
557 } 1155 }
558 1156
559 /* 1157 /*
560 * The lock prevents that this context is scheduled in so we 1158 * Since the task isn't running, its safe to remove the event, us
561 * can remove the event safely, if the call above did not 1159 * holding the ctx->lock ensures the task won't get scheduled in.
562 * succeed.
563 */ 1160 */
564 if (!list_empty(&event->group_entry)) 1161 list_del_event(event, ctx);
565 list_del_event(event, ctx);
566 raw_spin_unlock_irq(&ctx->lock); 1162 raw_spin_unlock_irq(&ctx->lock);
567} 1163}
568 1164
569/* 1165/*
570 * Cross CPU call to disable a performance event 1166 * Cross CPU call to disable a performance event
571 */ 1167 */
572static void __perf_event_disable(void *info) 1168static int __perf_event_disable(void *info)
573{ 1169{
574 struct perf_event *event = info; 1170 struct perf_event *event = info;
575 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
576 struct perf_event_context *ctx = event->ctx; 1171 struct perf_event_context *ctx = event->ctx;
1172 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
577 1173
578 /* 1174 /*
579 * If this is a per-task event, need to check whether this 1175 * If this is a per-task event, need to check whether this
580 * event's task is the current task on this cpu. 1176 * event's task is the current task on this cpu.
1177 *
1178 * Can trigger due to concurrent perf_event_context_sched_out()
1179 * flipping contexts around.
581 */ 1180 */
582 if (ctx->task && cpuctx->task_ctx != ctx) 1181 if (ctx->task && cpuctx->task_ctx != ctx)
583 return; 1182 return -EINVAL;
584 1183
585 raw_spin_lock(&ctx->lock); 1184 raw_spin_lock(&ctx->lock);
586 1185
@@ -590,6 +1189,7 @@ static void __perf_event_disable(void *info)
590 */ 1189 */
591 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1190 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
592 update_context_time(ctx); 1191 update_context_time(ctx);
1192 update_cgrp_time_from_event(event);
593 update_group_times(event); 1193 update_group_times(event);
594 if (event == event->group_leader) 1194 if (event == event->group_leader)
595 group_sched_out(event, cpuctx, ctx); 1195 group_sched_out(event, cpuctx, ctx);
@@ -599,6 +1199,8 @@ static void __perf_event_disable(void *info)
599 } 1199 }
600 1200
601 raw_spin_unlock(&ctx->lock); 1201 raw_spin_unlock(&ctx->lock);
1202
1203 return 0;
602} 1204}
603 1205
604/* 1206/*
@@ -623,13 +1225,13 @@ void perf_event_disable(struct perf_event *event)
623 /* 1225 /*
624 * Disable the event on the cpu that it's on 1226 * Disable the event on the cpu that it's on
625 */ 1227 */
626 smp_call_function_single(event->cpu, __perf_event_disable, 1228 cpu_function_call(event->cpu, __perf_event_disable, event);
627 event, 1);
628 return; 1229 return;
629 } 1230 }
630 1231
631 retry: 1232retry:
632 task_oncpu_function_call(task, __perf_event_disable, event); 1233 if (!task_function_call(task, __perf_event_disable, event))
1234 return;
633 1235
634 raw_spin_lock_irq(&ctx->lock); 1236 raw_spin_lock_irq(&ctx->lock);
635 /* 1237 /*
@@ -637,6 +1239,11 @@ void perf_event_disable(struct perf_event *event)
637 */ 1239 */
638 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1240 if (event->state == PERF_EVENT_STATE_ACTIVE) {
639 raw_spin_unlock_irq(&ctx->lock); 1241 raw_spin_unlock_irq(&ctx->lock);
1242 /*
1243 * Reload the task pointer, it might have been changed by
1244 * a concurrent perf_event_context_sched_out().
1245 */
1246 task = ctx->task;
640 goto retry; 1247 goto retry;
641 } 1248 }
642 1249
@@ -648,32 +1255,85 @@ void perf_event_disable(struct perf_event *event)
648 update_group_times(event); 1255 update_group_times(event);
649 event->state = PERF_EVENT_STATE_OFF; 1256 event->state = PERF_EVENT_STATE_OFF;
650 } 1257 }
651
652 raw_spin_unlock_irq(&ctx->lock); 1258 raw_spin_unlock_irq(&ctx->lock);
653} 1259}
654 1260
1261static void perf_set_shadow_time(struct perf_event *event,
1262 struct perf_event_context *ctx,
1263 u64 tstamp)
1264{
1265 /*
1266 * use the correct time source for the time snapshot
1267 *
1268 * We could get by without this by leveraging the
1269 * fact that to get to this function, the caller
1270 * has most likely already called update_context_time()
1271 * and update_cgrp_time_xx() and thus both timestamp
1272 * are identical (or very close). Given that tstamp is
1273 * already adjusted for cgroup, we could say that:
1274 * tstamp - ctx->timestamp
1275 * is equivalent to
1276 * tstamp - cgrp->timestamp.
1277 *
1278 * Then, in perf_output_read(), the calculation would
1279 * work with no changes because:
1280 * - event is guaranteed scheduled in
1281 * - no scheduled out in between
1282 * - thus the timestamp would be the same
1283 *
1284 * But this is a bit hairy.
1285 *
1286 * So instead, we have an explicit cgroup call to remain
1287 * within the time source all along. We believe it
1288 * is cleaner and simpler to understand.
1289 */
1290 if (is_cgroup_event(event))
1291 perf_cgroup_set_shadow_time(event, tstamp);
1292 else
1293 event->shadow_ctx_time = tstamp - ctx->timestamp;
1294}
1295
1296#define MAX_INTERRUPTS (~0ULL)
1297
1298static void perf_log_throttle(struct perf_event *event, int enable);
1299
655static int 1300static int
656event_sched_in(struct perf_event *event, 1301event_sched_in(struct perf_event *event,
657 struct perf_cpu_context *cpuctx, 1302 struct perf_cpu_context *cpuctx,
658 struct perf_event_context *ctx) 1303 struct perf_event_context *ctx)
659{ 1304{
1305 u64 tstamp = perf_event_time(event);
1306
660 if (event->state <= PERF_EVENT_STATE_OFF) 1307 if (event->state <= PERF_EVENT_STATE_OFF)
661 return 0; 1308 return 0;
662 1309
663 event->state = PERF_EVENT_STATE_ACTIVE; 1310 event->state = PERF_EVENT_STATE_ACTIVE;
664 event->oncpu = smp_processor_id(); 1311 event->oncpu = smp_processor_id();
1312
1313 /*
1314 * Unthrottle events, since we scheduled we might have missed several
1315 * ticks already, also for a heavily scheduling task there is little
1316 * guarantee it'll get a tick in a timely manner.
1317 */
1318 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1319 perf_log_throttle(event, 1);
1320 event->hw.interrupts = 0;
1321 }
1322
665 /* 1323 /*
666 * The new state must be visible before we turn it on in the hardware: 1324 * The new state must be visible before we turn it on in the hardware:
667 */ 1325 */
668 smp_wmb(); 1326 smp_wmb();
669 1327
670 if (event->pmu->enable(event)) { 1328 if (event->pmu->add(event, PERF_EF_START)) {
671 event->state = PERF_EVENT_STATE_INACTIVE; 1329 event->state = PERF_EVENT_STATE_INACTIVE;
672 event->oncpu = -1; 1330 event->oncpu = -1;
673 return -EAGAIN; 1331 return -EAGAIN;
674 } 1332 }
675 1333
676 event->tstamp_running += ctx->time - event->tstamp_stopped; 1334 event->tstamp_running += tstamp - event->tstamp_stopped;
1335
1336 perf_set_shadow_time(event, ctx, tstamp);
677 1337
678 if (!is_software_event(event)) 1338 if (!is_software_event(event))
679 cpuctx->active_oncpu++; 1339 cpuctx->active_oncpu++;
@@ -691,22 +1351,17 @@ group_sched_in(struct perf_event *group_event,
691 struct perf_event_context *ctx) 1351 struct perf_event_context *ctx)
692{ 1352{
693 struct perf_event *event, *partial_group = NULL; 1353 struct perf_event *event, *partial_group = NULL;
694 const struct pmu *pmu = group_event->pmu; 1354 struct pmu *pmu = group_event->pmu;
695 bool txn = false; 1355 u64 now = ctx->time;
1356 bool simulate = false;
696 1357
697 if (group_event->state == PERF_EVENT_STATE_OFF) 1358 if (group_event->state == PERF_EVENT_STATE_OFF)
698 return 0; 1359 return 0;
699 1360
700 /* Check if group transaction availabe */ 1361 pmu->start_txn(pmu);
701 if (pmu->start_txn)
702 txn = true;
703
704 if (txn)
705 pmu->start_txn(pmu);
706 1362
707 if (event_sched_in(group_event, cpuctx, ctx)) { 1363 if (event_sched_in(group_event, cpuctx, ctx)) {
708 if (txn) 1364 pmu->cancel_txn(pmu);
709 pmu->cancel_txn(pmu);
710 return -EAGAIN; 1365 return -EAGAIN;
711 } 1366 }
712 1367
@@ -720,23 +1375,38 @@ group_sched_in(struct perf_event *group_event,
720 } 1375 }
721 } 1376 }
722 1377
723 if (!txn || !pmu->commit_txn(pmu)) 1378 if (!pmu->commit_txn(pmu))
724 return 0; 1379 return 0;
725 1380
726group_error: 1381group_error:
727 /* 1382 /*
728 * Groups can be scheduled in as one unit only, so undo any 1383 * Groups can be scheduled in as one unit only, so undo any
729 * partial group before returning: 1384 * partial group before returning:
1385 * The events up to the failed event are scheduled out normally,
1386 * tstamp_stopped will be updated.
1387 *
1388 * The failed events and the remaining siblings need to have
1389 * their timings updated as if they had gone through event_sched_in()
1390 * and event_sched_out(). This is required to get consistent timings
1391 * across the group. This also takes care of the case where the group
1392 * could never be scheduled by ensuring tstamp_stopped is set to mark
1393 * the time the event was actually stopped, such that time delta
1394 * calculation in update_event_times() is correct.
730 */ 1395 */
731 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 1396 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
732 if (event == partial_group) 1397 if (event == partial_group)
733 break; 1398 simulate = true;
734 event_sched_out(event, cpuctx, ctx); 1399
1400 if (simulate) {
1401 event->tstamp_running += now - event->tstamp_stopped;
1402 event->tstamp_stopped = now;
1403 } else {
1404 event_sched_out(event, cpuctx, ctx);
1405 }
735 } 1406 }
736 event_sched_out(group_event, cpuctx, ctx); 1407 event_sched_out(group_event, cpuctx, ctx);
737 1408
738 if (txn) 1409 pmu->cancel_txn(pmu);
739 pmu->cancel_txn(pmu);
740 1410
741 return -EAGAIN; 1411 return -EAGAIN;
742} 1412}
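With this change group_sched_in() always drives the PMU through the start_txn()/commit_txn()/cancel_txn() trio instead of first checking whether the callbacks exist. The protocol is: open a transaction, add the leader and every sibling, then either commit the whole set or cancel and back out the partial group. A toy userspace model of that all-or-nothing behaviour (toy_pmu and its two-counter limit are invented for the example, not part of the perf API):

#include <stdbool.h>
#include <stdio.h>

#define MAX_COUNTERS 2           /* assumed tiny "PMU" with two counters */

struct toy_pmu {
    int used;                    /* counters handed out so far */
    int checkpoint;              /* state saved at start_txn() */
};

static void start_txn(struct toy_pmu *pmu)  { pmu->checkpoint = pmu->used; }
static void cancel_txn(struct toy_pmu *pmu) { pmu->used = pmu->checkpoint; }

static bool add_event(struct toy_pmu *pmu)
{
    if (pmu->used >= MAX_COUNTERS)
        return false;            /* no hardware counter left */
    pmu->used++;
    return true;
}

/* commit_txn(): a real PMU would validate the whole set at once here. */
static bool commit_txn(struct toy_pmu *pmu) { (void)pmu; return true; }

/* Schedule a leader plus @nr_siblings as one unit, or not at all. */
static bool toy_group_sched_in(struct toy_pmu *pmu, int nr_siblings)
{
    start_txn(pmu);

    for (int i = 0; i < 1 + nr_siblings; i++) {
        if (!add_event(pmu)) {
            cancel_txn(pmu);     /* undo the partial group */
            return false;
        }
    }

    if (commit_txn(pmu))
        return true;

    cancel_txn(pmu);
    return false;
}

int main(void)
{
    struct toy_pmu pmu = { 0, 0 };

    printf("group of 2: %s\n", toy_group_sched_in(&pmu, 1) ? "ok" : "rejected");
    printf("group of 3: %s\n", toy_group_sched_in(&pmu, 2) ? "ok" : "rejected");
    printf("counters in use: %d\n", pmu.used);
    return 0;
}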
@@ -775,52 +1445,52 @@ static int group_can_go_on(struct perf_event *event,
775static void add_event_to_ctx(struct perf_event *event, 1445static void add_event_to_ctx(struct perf_event *event,
776 struct perf_event_context *ctx) 1446 struct perf_event_context *ctx)
777{ 1447{
1448 u64 tstamp = perf_event_time(event);
1449
778 list_add_event(event, ctx); 1450 list_add_event(event, ctx);
779 perf_group_attach(event); 1451 perf_group_attach(event);
780 event->tstamp_enabled = ctx->time; 1452 event->tstamp_enabled = tstamp;
781 event->tstamp_running = ctx->time; 1453 event->tstamp_running = tstamp;
782 event->tstamp_stopped = ctx->time; 1454 event->tstamp_stopped = tstamp;
783} 1455}
784 1456
1457static void perf_event_context_sched_in(struct perf_event_context *ctx,
1458 struct task_struct *tsk);
1459
785/* 1460/*
786 * Cross CPU call to install and enable a performance event 1461 * Cross CPU call to install and enable a performance event
787 * 1462 *
788 * Must be called with ctx->mutex held 1463 * Must be called with ctx->mutex held
789 */ 1464 */
790static void __perf_install_in_context(void *info) 1465static int __perf_install_in_context(void *info)
791{ 1466{
792 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
793 struct perf_event *event = info; 1467 struct perf_event *event = info;
794 struct perf_event_context *ctx = event->ctx; 1468 struct perf_event_context *ctx = event->ctx;
795 struct perf_event *leader = event->group_leader; 1469 struct perf_event *leader = event->group_leader;
1470 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
796 int err; 1471 int err;
797 1472
798 /* 1473 /*
799 * If this is a task context, we need to check whether it is 1474 * In case we're installing a new context to an already running task,
800 * the current task context of this cpu. If not it has been 1475 * could also happen before perf_event_task_sched_in() on architectures
801 * scheduled out before the smp call arrived. 1476 * which do context switches with IRQs enabled.
802 * Or possibly this is the right context but it isn't
803 * on this cpu because it had no events.
804 */ 1477 */
805 if (ctx->task && cpuctx->task_ctx != ctx) { 1478 if (ctx->task && !cpuctx->task_ctx)
806 if (cpuctx->task_ctx || ctx->task != current) 1479 perf_event_context_sched_in(ctx, ctx->task);
807 return;
808 cpuctx->task_ctx = ctx;
809 }
810 1480
811 raw_spin_lock(&ctx->lock); 1481 raw_spin_lock(&ctx->lock);
812 ctx->is_active = 1; 1482 ctx->is_active = 1;
813 update_context_time(ctx); 1483 update_context_time(ctx);
814
815 /* 1484 /*
816 * Protect the list operation against NMI by disabling the 1485 * update cgrp time only if current cgrp
817 * events on a global level. NOP for non NMI based events. 1486 * matches event->cgrp. Must be done before
1487 * calling add_event_to_ctx()
818 */ 1488 */
819 perf_disable(); 1489 update_cgrp_time_from_event(event);
820 1490
821 add_event_to_ctx(event, ctx); 1491 add_event_to_ctx(event, ctx);
822 1492
823 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1493 if (!event_filter_match(event))
824 goto unlock; 1494 goto unlock;
825 1495
826 /* 1496 /*
@@ -855,13 +1525,10 @@ static void __perf_install_in_context(void *info)
855 } 1525 }
856 } 1526 }
857 1527
858 if (!err && !ctx->task && cpuctx->max_pertask) 1528unlock:
859 cpuctx->max_pertask--;
860
861 unlock:
862 perf_enable();
863
864 raw_spin_unlock(&ctx->lock); 1529 raw_spin_unlock(&ctx->lock);
1530
1531 return 0;
865} 1532}
866 1533
867/* 1534/*
@@ -873,8 +1540,6 @@ static void __perf_install_in_context(void *info)
873 * If the event is attached to a task which is on a CPU we use a smp 1540 * If the event is attached to a task which is on a CPU we use a smp
874 * call to enable it in the task context. The task might have been 1541 * call to enable it in the task context. The task might have been
875 * scheduled away, but we check this in the smp call again. 1542 * scheduled away, but we check this in the smp call again.
876 *
877 * Must be called with ctx->mutex held.
878 */ 1543 */
879static void 1544static void
880perf_install_in_context(struct perf_event_context *ctx, 1545perf_install_in_context(struct perf_event_context *ctx,
@@ -883,36 +1548,38 @@ perf_install_in_context(struct perf_event_context *ctx,
883{ 1548{
884 struct task_struct *task = ctx->task; 1549 struct task_struct *task = ctx->task;
885 1550
1551 lockdep_assert_held(&ctx->mutex);
1552
1553 event->ctx = ctx;
1554
886 if (!task) { 1555 if (!task) {
887 /* 1556 /*
888 * Per cpu events are installed via an smp call and 1557 * Per cpu events are installed via an smp call and
889 * the install is always successful. 1558 * the install is always successful.
890 */ 1559 */
891 smp_call_function_single(cpu, __perf_install_in_context, 1560 cpu_function_call(cpu, __perf_install_in_context, event);
892 event, 1);
893 return; 1561 return;
894 } 1562 }
895 1563
896retry: 1564retry:
897 task_oncpu_function_call(task, __perf_install_in_context, 1565 if (!task_function_call(task, __perf_install_in_context, event))
898 event); 1566 return;
899 1567
900 raw_spin_lock_irq(&ctx->lock); 1568 raw_spin_lock_irq(&ctx->lock);
901 /* 1569 /*
902 * we need to retry the smp call. 1570 * If we failed to find a running task, but find the context active now
1571 * that we've acquired the ctx->lock, retry.
903 */ 1572 */
904 if (ctx->is_active && list_empty(&event->group_entry)) { 1573 if (ctx->is_active) {
905 raw_spin_unlock_irq(&ctx->lock); 1574 raw_spin_unlock_irq(&ctx->lock);
906 goto retry; 1575 goto retry;
907 } 1576 }
908 1577
909 /* 1578 /*
910 * The lock prevents that this context is scheduled in so we 1579 * Since the task isn't running, it's safe to add the event, us holding
911 * can add the event safely, if it the call above did not 1580 * the ctx->lock ensures the task won't get scheduled in.
912 * succeed.
913 */ 1581 */
914 if (list_empty(&event->group_entry)) 1582 add_event_to_ctx(event, ctx);
915 add_event_to_ctx(event, ctx);
916 raw_spin_unlock_irq(&ctx->lock); 1583 raw_spin_unlock_irq(&ctx->lock);
917} 1584}
918 1585
@@ -928,46 +1595,48 @@ static void __perf_event_mark_enabled(struct perf_event *event,
928 struct perf_event_context *ctx) 1595 struct perf_event_context *ctx)
929{ 1596{
930 struct perf_event *sub; 1597 struct perf_event *sub;
1598 u64 tstamp = perf_event_time(event);
931 1599
932 event->state = PERF_EVENT_STATE_INACTIVE; 1600 event->state = PERF_EVENT_STATE_INACTIVE;
933 event->tstamp_enabled = ctx->time - event->total_time_enabled; 1601 event->tstamp_enabled = tstamp - event->total_time_enabled;
934 list_for_each_entry(sub, &event->sibling_list, group_entry) 1602 list_for_each_entry(sub, &event->sibling_list, group_entry) {
935 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 1603 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
936 sub->tstamp_enabled = 1604 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
937 ctx->time - sub->total_time_enabled; 1605 }
938} 1606}
939 1607
940/* 1608/*
941 * Cross CPU call to enable a performance event 1609 * Cross CPU call to enable a performance event
942 */ 1610 */
943static void __perf_event_enable(void *info) 1611static int __perf_event_enable(void *info)
944{ 1612{
945 struct perf_event *event = info; 1613 struct perf_event *event = info;
946 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
947 struct perf_event_context *ctx = event->ctx; 1614 struct perf_event_context *ctx = event->ctx;
948 struct perf_event *leader = event->group_leader; 1615 struct perf_event *leader = event->group_leader;
1616 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
949 int err; 1617 int err;
950 1618
951 /* 1619 if (WARN_ON_ONCE(!ctx->is_active))
952 * If this is a per-task event, need to check whether this 1620 return -EINVAL;
953 * event's task is the current task on this cpu.
954 */
955 if (ctx->task && cpuctx->task_ctx != ctx) {
956 if (cpuctx->task_ctx || ctx->task != current)
957 return;
958 cpuctx->task_ctx = ctx;
959 }
960 1621
961 raw_spin_lock(&ctx->lock); 1622 raw_spin_lock(&ctx->lock);
962 ctx->is_active = 1;
963 update_context_time(ctx); 1623 update_context_time(ctx);
964 1624
965 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1625 if (event->state >= PERF_EVENT_STATE_INACTIVE)
966 goto unlock; 1626 goto unlock;
1627
1628 /*
1629 * set current task's cgroup time reference point
1630 */
1631 perf_cgroup_set_timestamp(current, ctx);
1632
967 __perf_event_mark_enabled(event, ctx); 1633 __perf_event_mark_enabled(event, ctx);
968 1634
969 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1635 if (!event_filter_match(event)) {
1636 if (is_cgroup_event(event))
1637 perf_cgroup_defer_enabled(event);
970 goto unlock; 1638 goto unlock;
1639 }
971 1640
972 /* 1641 /*
973 * If the event is in a group and isn't the group leader, 1642 * If the event is in a group and isn't the group leader,
@@ -979,12 +1648,10 @@ static void __perf_event_enable(void *info)
979 if (!group_can_go_on(event, cpuctx, 1)) { 1648 if (!group_can_go_on(event, cpuctx, 1)) {
980 err = -EEXIST; 1649 err = -EEXIST;
981 } else { 1650 } else {
982 perf_disable();
983 if (event == leader) 1651 if (event == leader)
984 err = group_sched_in(event, cpuctx, ctx); 1652 err = group_sched_in(event, cpuctx, ctx);
985 else 1653 else
986 err = event_sched_in(event, cpuctx, ctx); 1654 err = event_sched_in(event, cpuctx, ctx);
987 perf_enable();
988 } 1655 }
989 1656
990 if (err) { 1657 if (err) {
@@ -1000,8 +1667,10 @@ static void __perf_event_enable(void *info)
1000 } 1667 }
1001 } 1668 }
1002 1669
1003 unlock: 1670unlock:
1004 raw_spin_unlock(&ctx->lock); 1671 raw_spin_unlock(&ctx->lock);
1672
1673 return 0;
1005} 1674}
1006 1675
1007/* 1676/*
@@ -1022,8 +1691,7 @@ void perf_event_enable(struct perf_event *event)
1022 /* 1691 /*
1023 * Enable the event on the cpu that it's on 1692 * Enable the event on the cpu that it's on
1024 */ 1693 */
1025 smp_call_function_single(event->cpu, __perf_event_enable, 1694 cpu_function_call(event->cpu, __perf_event_enable, event);
1026 event, 1);
1027 return; 1695 return;
1028 } 1696 }
1029 1697
@@ -1041,9 +1709,16 @@ void perf_event_enable(struct perf_event *event)
1041 if (event->state == PERF_EVENT_STATE_ERROR) 1709 if (event->state == PERF_EVENT_STATE_ERROR)
1042 event->state = PERF_EVENT_STATE_OFF; 1710 event->state = PERF_EVENT_STATE_OFF;
1043 1711
1044 retry: 1712retry:
1713 if (!ctx->is_active) {
1714 __perf_event_mark_enabled(event, ctx);
1715 goto out;
1716 }
1717
1045 raw_spin_unlock_irq(&ctx->lock); 1718 raw_spin_unlock_irq(&ctx->lock);
1046 task_oncpu_function_call(task, __perf_event_enable, event); 1719
1720 if (!task_function_call(task, __perf_event_enable, event))
1721 return;
1047 1722
1048 raw_spin_lock_irq(&ctx->lock); 1723 raw_spin_lock_irq(&ctx->lock);
1049 1724
@@ -1051,17 +1726,16 @@ void perf_event_enable(struct perf_event *event)
1051 * If the context is active and the event is still off, 1726 * If the context is active and the event is still off,
1052 * we need to retry the cross-call. 1727 * we need to retry the cross-call.
1053 */ 1728 */
1054 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1729 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1730 /*
1731 * task could have been flipped by a concurrent
1732 * perf_event_context_sched_out()
1733 */
1734 task = ctx->task;
1055 goto retry; 1735 goto retry;
1736 }
1056 1737
1057 /* 1738out:
1058 * Since we have the lock this context can't be scheduled
1059 * in, so we can change the state safely.
1060 */
1061 if (event->state == PERF_EVENT_STATE_OFF)
1062 __perf_event_mark_enabled(event, ctx);
1063
1064 out:
1065 raw_spin_unlock_irq(&ctx->lock); 1739 raw_spin_unlock_irq(&ctx->lock);
1066} 1740}
1067 1741
@@ -1070,7 +1744,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1070 /* 1744 /*
1071 * not supported on inherited events 1745 * not supported on inherited events
1072 */ 1746 */
1073 if (event->attr.inherit) 1747 if (event->attr.inherit || !is_sampling_event(event))
1074 return -EINVAL; 1748 return -EINVAL;
1075 1749
1076 atomic_add(refresh, &event->event_limit); 1750 atomic_add(refresh, &event->event_limit);
@@ -1079,12 +1753,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1079 return 0; 1753 return 0;
1080} 1754}
1081 1755
1082enum event_type_t {
1083 EVENT_FLEXIBLE = 0x1,
1084 EVENT_PINNED = 0x2,
1085 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1086};
1087
1088static void ctx_sched_out(struct perf_event_context *ctx, 1756static void ctx_sched_out(struct perf_event_context *ctx,
1089 struct perf_cpu_context *cpuctx, 1757 struct perf_cpu_context *cpuctx,
1090 enum event_type_t event_type) 1758 enum event_type_t event_type)
@@ -1092,26 +1760,27 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1092 struct perf_event *event; 1760 struct perf_event *event;
1093 1761
1094 raw_spin_lock(&ctx->lock); 1762 raw_spin_lock(&ctx->lock);
1763 perf_pmu_disable(ctx->pmu);
1095 ctx->is_active = 0; 1764 ctx->is_active = 0;
1096 if (likely(!ctx->nr_events)) 1765 if (likely(!ctx->nr_events))
1097 goto out; 1766 goto out;
1098 update_context_time(ctx); 1767 update_context_time(ctx);
1768 update_cgrp_time_from_cpuctx(cpuctx);
1099 1769
1100 perf_disable();
1101 if (!ctx->nr_active) 1770 if (!ctx->nr_active)
1102 goto out_enable; 1771 goto out;
1103 1772
1104 if (event_type & EVENT_PINNED) 1773 if (event_type & EVENT_PINNED) {
1105 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1774 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1106 group_sched_out(event, cpuctx, ctx); 1775 group_sched_out(event, cpuctx, ctx);
1776 }
1107 1777
1108 if (event_type & EVENT_FLEXIBLE) 1778 if (event_type & EVENT_FLEXIBLE) {
1109 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1779 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1110 group_sched_out(event, cpuctx, ctx); 1780 group_sched_out(event, cpuctx, ctx);
1111 1781 }
1112 out_enable: 1782out:
1113 perf_enable(); 1783 perf_pmu_enable(ctx->pmu);
1114 out:
1115 raw_spin_unlock(&ctx->lock); 1784 raw_spin_unlock(&ctx->lock);
1116} 1785}
1117 1786
@@ -1209,34 +1878,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1209 } 1878 }
1210} 1879}
1211 1880
1212/* 1881static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1213 * Called from scheduler to remove the events of the current task, 1882 struct task_struct *next)
1214 * with interrupts disabled.
1215 *
1216 * We stop each event and update the event value in event->count.
1217 *
1218 * This does not protect us against NMI, but disable()
1219 * sets the disabled bit in the control field of event _before_
1220 * accessing the event control register. If a NMI hits, then it will
1221 * not restart the event.
1222 */
1223void perf_event_task_sched_out(struct task_struct *task,
1224 struct task_struct *next)
1225{ 1883{
1226 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1884 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1227 struct perf_event_context *ctx = task->perf_event_ctxp;
1228 struct perf_event_context *next_ctx; 1885 struct perf_event_context *next_ctx;
1229 struct perf_event_context *parent; 1886 struct perf_event_context *parent;
1887 struct perf_cpu_context *cpuctx;
1230 int do_switch = 1; 1888 int do_switch = 1;
1231 1889
1232 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1890 if (likely(!ctx))
1891 return;
1233 1892
1234 if (likely(!ctx || !cpuctx->task_ctx)) 1893 cpuctx = __get_cpu_context(ctx);
1894 if (!cpuctx->task_ctx)
1235 return; 1895 return;
1236 1896
1237 rcu_read_lock(); 1897 rcu_read_lock();
1238 parent = rcu_dereference(ctx->parent_ctx); 1898 parent = rcu_dereference(ctx->parent_ctx);
1239 next_ctx = next->perf_event_ctxp; 1899 next_ctx = next->perf_event_ctxp[ctxn];
1240 if (parent && next_ctx && 1900 if (parent && next_ctx &&
1241 rcu_dereference(next_ctx->parent_ctx) == parent) { 1901 rcu_dereference(next_ctx->parent_ctx) == parent) {
1242 /* 1902 /*
@@ -1255,8 +1915,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1255 * XXX do we need a memory barrier of sorts 1915 * XXX do we need a memory barrier of sorts
1256 * wrt to rcu_dereference() of perf_event_ctxp 1916 * wrt to rcu_dereference() of perf_event_ctxp
1257 */ 1917 */
1258 task->perf_event_ctxp = next_ctx; 1918 task->perf_event_ctxp[ctxn] = next_ctx;
1259 next->perf_event_ctxp = ctx; 1919 next->perf_event_ctxp[ctxn] = ctx;
1260 ctx->task = next; 1920 ctx->task = next;
1261 next_ctx->task = task; 1921 next_ctx->task = task;
1262 do_switch = 0; 1922 do_switch = 0;
@@ -1274,10 +1934,41 @@ void perf_event_task_sched_out(struct task_struct *task,
1274 } 1934 }
1275} 1935}
1276 1936
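perf_event_context_sched_out() keeps the existing optimization: when the outgoing and incoming tasks carry clones of the same inherited context, the two context pointers are simply swapped instead of scheduling every event out and back in. A stripped-down sketch of that check and swap (toy_ctx and toy_task are illustrative stand-ins; the kernel additionally holds both ctx->lock instances and goes through rcu_dereference() for parent_ctx):

#include <stdio.h>

struct toy_ctx {
    const char *owner;
    struct toy_ctx *parent_ctx;  /* clones of one parent are interchangeable */
};

struct toy_task {
    const char *comm;
    struct toy_ctx *ctx;
};

/* If prev and next carry clones of the same inherited context, swap the
 * pointers instead of scheduling every event out and back in. */
static int try_flip_contexts(struct toy_task *prev, struct toy_task *next)
{
    struct toy_ctx *a = prev->ctx, *b = next->ctx;

    if (!a || !b || !a->parent_ctx || a->parent_ctx != b->parent_ctx)
        return 0;                /* caller falls back to the slow path */

    prev->ctx = b;
    next->ctx = a;
    return 1;
}

int main(void)
{
    struct toy_ctx parent = { "parent", NULL };
    struct toy_ctx c1 = { "child-1", &parent };
    struct toy_ctx c2 = { "child-2", &parent };
    struct toy_task prev = { "prev", &c1 };
    struct toy_task next = { "next", &c2 };

    printf("flipped: %d\n", try_flip_contexts(&prev, &next));
    printf("prev now carries %s, next carries %s\n",
           prev.ctx->owner, next.ctx->owner);
    return 0;
}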
1937#define for_each_task_context_nr(ctxn) \
1938 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1939
1940/*
1941 * Called from scheduler to remove the events of the current task,
1942 * with interrupts disabled.
1943 *
1944 * We stop each event and update the event value in event->count.
1945 *
1946 * This does not protect us against NMI, but disable()
1947 * sets the disabled bit in the control field of event _before_
1948 * accessing the event control register. If a NMI hits, then it will
1949 * not restart the event.
1950 */
1951void __perf_event_task_sched_out(struct task_struct *task,
1952 struct task_struct *next)
1953{
1954 int ctxn;
1955
1956 for_each_task_context_nr(ctxn)
1957 perf_event_context_sched_out(task, ctxn, next);
1958
1959 /*
1960 * if cgroup events exist on this CPU, then we need
1961 * to check if we have to switch out PMU state.
1962 * cgroup events are system-wide mode only
1963 */
1964 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1965 perf_cgroup_sched_out(task);
1966}
1967
1277static void task_ctx_sched_out(struct perf_event_context *ctx, 1968static void task_ctx_sched_out(struct perf_event_context *ctx,
1278 enum event_type_t event_type) 1969 enum event_type_t event_type)
1279{ 1970{
1280 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1971 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1281 1972
1282 if (!cpuctx->task_ctx) 1973 if (!cpuctx->task_ctx)
1283 return; 1974 return;
@@ -1292,14 +1983,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1292/* 1983/*
1293 * Called with IRQs disabled 1984 * Called with IRQs disabled
1294 */ 1985 */
1295static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1296{
1297 task_ctx_sched_out(ctx, EVENT_ALL);
1298}
1299
1300/*
1301 * Called with IRQs disabled
1302 */
1303static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 1986static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1304 enum event_type_t event_type) 1987 enum event_type_t event_type)
1305{ 1988{
@@ -1315,9 +1998,13 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1315 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1998 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1316 if (event->state <= PERF_EVENT_STATE_OFF) 1999 if (event->state <= PERF_EVENT_STATE_OFF)
1317 continue; 2000 continue;
1318 if (event->cpu != -1 && event->cpu != smp_processor_id()) 2001 if (!event_filter_match(event))
1319 continue; 2002 continue;
1320 2003
2004 /* may need to reset tstamp_enabled */
2005 if (is_cgroup_event(event))
2006 perf_cgroup_mark_enabled(event, ctx);
2007
1321 if (group_can_go_on(event, cpuctx, 1)) 2008 if (group_can_go_on(event, cpuctx, 1))
1322 group_sched_in(event, cpuctx, ctx); 2009 group_sched_in(event, cpuctx, ctx);
1323 2010
@@ -1347,29 +2034,36 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1347 * Listen to the 'cpu' scheduling filter constraint 2034 * Listen to the 'cpu' scheduling filter constraint
1348 * of events: 2035 * of events:
1349 */ 2036 */
1350 if (event->cpu != -1 && event->cpu != smp_processor_id()) 2037 if (!event_filter_match(event))
1351 continue; 2038 continue;
1352 2039
1353 if (group_can_go_on(event, cpuctx, can_add_hw)) 2040 /* may need to reset tstamp_enabled */
2041 if (is_cgroup_event(event))
2042 perf_cgroup_mark_enabled(event, ctx);
2043
2044 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1354 if (group_sched_in(event, cpuctx, ctx)) 2045 if (group_sched_in(event, cpuctx, ctx))
1355 can_add_hw = 0; 2046 can_add_hw = 0;
2047 }
1356 } 2048 }
1357} 2049}
1358 2050
1359static void 2051static void
1360ctx_sched_in(struct perf_event_context *ctx, 2052ctx_sched_in(struct perf_event_context *ctx,
1361 struct perf_cpu_context *cpuctx, 2053 struct perf_cpu_context *cpuctx,
1362 enum event_type_t event_type) 2054 enum event_type_t event_type,
2055 struct task_struct *task)
1363{ 2056{
2057 u64 now;
2058
1364 raw_spin_lock(&ctx->lock); 2059 raw_spin_lock(&ctx->lock);
1365 ctx->is_active = 1; 2060 ctx->is_active = 1;
1366 if (likely(!ctx->nr_events)) 2061 if (likely(!ctx->nr_events))
1367 goto out; 2062 goto out;
1368 2063
1369 ctx->timestamp = perf_clock(); 2064 now = perf_clock();
1370 2065 ctx->timestamp = now;
1371 perf_disable(); 2066 perf_cgroup_set_timestamp(task, ctx);
1372
1373 /* 2067 /*
1374 * First go through the list and put on any pinned groups 2068 * First go through the list and put on any pinned groups
1375 * in order to give them the best chance of going on. 2069 * in order to give them the best chance of going on.
@@ -1381,56 +2075,42 @@ ctx_sched_in(struct perf_event_context *ctx,
1381 if (event_type & EVENT_FLEXIBLE) 2075 if (event_type & EVENT_FLEXIBLE)
1382 ctx_flexible_sched_in(ctx, cpuctx); 2076 ctx_flexible_sched_in(ctx, cpuctx);
1383 2077
1384 perf_enable(); 2078out:
1385 out:
1386 raw_spin_unlock(&ctx->lock); 2079 raw_spin_unlock(&ctx->lock);
1387} 2080}
1388 2081
1389static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2082static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1390 enum event_type_t event_type) 2083 enum event_type_t event_type,
2084 struct task_struct *task)
1391{ 2085{
1392 struct perf_event_context *ctx = &cpuctx->ctx; 2086 struct perf_event_context *ctx = &cpuctx->ctx;
1393 2087
1394 ctx_sched_in(ctx, cpuctx, event_type); 2088 ctx_sched_in(ctx, cpuctx, event_type, task);
1395} 2089}
1396 2090
1397static void task_ctx_sched_in(struct task_struct *task, 2091static void task_ctx_sched_in(struct perf_event_context *ctx,
1398 enum event_type_t event_type) 2092 enum event_type_t event_type)
1399{ 2093{
1400 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 2094 struct perf_cpu_context *cpuctx;
1401 struct perf_event_context *ctx = task->perf_event_ctxp;
1402 2095
1403 if (likely(!ctx)) 2096 cpuctx = __get_cpu_context(ctx);
1404 return;
1405 if (cpuctx->task_ctx == ctx) 2097 if (cpuctx->task_ctx == ctx)
1406 return; 2098 return;
1407 ctx_sched_in(ctx, cpuctx, event_type); 2099
2100 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1408 cpuctx->task_ctx = ctx; 2101 cpuctx->task_ctx = ctx;
1409} 2102}
1410/*
1411 * Called from scheduler to add the events of the current task
1412 * with interrupts disabled.
1413 *
1414 * We restore the event value and then enable it.
1415 *
1416 * This does not protect us against NMI, but enable()
1417 * sets the enabled bit in the control field of event _before_
1418 * accessing the event control register. If a NMI hits, then it will
1419 * keep the event running.
1420 */
1421void perf_event_task_sched_in(struct task_struct *task)
1422{
1423 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1424 struct perf_event_context *ctx = task->perf_event_ctxp;
1425 2103
1426 if (likely(!ctx)) 2104static void perf_event_context_sched_in(struct perf_event_context *ctx,
1427 return; 2105 struct task_struct *task)
2106{
2107 struct perf_cpu_context *cpuctx;
1428 2108
2109 cpuctx = __get_cpu_context(ctx);
1429 if (cpuctx->task_ctx == ctx) 2110 if (cpuctx->task_ctx == ctx)
1430 return; 2111 return;
1431 2112
1432 perf_disable(); 2113 perf_pmu_disable(ctx->pmu);
1433
1434 /* 2114 /*
1435 * We want to keep the following priority order: 2115 * We want to keep the following priority order:
1436 * cpu pinned (that don't need to move), task pinned, 2116 * cpu pinned (that don't need to move), task pinned,
@@ -1438,18 +2118,51 @@ void perf_event_task_sched_in(struct task_struct *task)
1438 */ 2118 */
1439 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2119 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1440 2120
1441 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2121 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1442 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2122 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1443 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2123 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1444 2124
1445 cpuctx->task_ctx = ctx; 2125 cpuctx->task_ctx = ctx;
1446 2126
1447 perf_enable(); 2127 /*
2128 * Since these rotations are per-cpu, we need to ensure the
2129 * cpu-context we got scheduled on is actually rotating.
2130 */
2131 perf_pmu_rotate_start(ctx->pmu);
2132 perf_pmu_enable(ctx->pmu);
1448} 2133}
1449 2134
1450#define MAX_INTERRUPTS (~0ULL) 2135/*
2136 * Called from scheduler to add the events of the current task
2137 * with interrupts disabled.
2138 *
2139 * We restore the event value and then enable it.
2140 *
2141 * This does not protect us against NMI, but enable()
2142 * sets the enabled bit in the control field of event _before_
2143 * accessing the event control register. If a NMI hits, then it will
2144 * keep the event running.
2145 */
2146void __perf_event_task_sched_in(struct task_struct *task)
2147{
2148 struct perf_event_context *ctx;
2149 int ctxn;
1451 2150
1452static void perf_log_throttle(struct perf_event *event, int enable); 2151 for_each_task_context_nr(ctxn) {
2152 ctx = task->perf_event_ctxp[ctxn];
2153 if (likely(!ctx))
2154 continue;
2155
2156 perf_event_context_sched_in(ctx, task);
2157 }
2158 /*
2159 * if cgroup events exist on this CPU, then we need
2160 * to check if we have to switch in PMU state.
2161 * cgroup event are system-wide mode only
2162 */
2163 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2164 perf_cgroup_sched_in(task);
2165}
1453 2166
1454static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2167static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1455{ 2168{
@@ -1478,7 +2191,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1478 * Reduce accuracy by one bit such that @a and @b converge 2191 * Reduce accuracy by one bit such that @a and @b converge
1479 * to a similar magnitude. 2192 * to a similar magnitude.
1480 */ 2193 */
1481#define REDUCE_FLS(a, b) \ 2194#define REDUCE_FLS(a, b) \
1482do { \ 2195do { \
1483 if (a##_fls > b##_fls) { \ 2196 if (a##_fls > b##_fls) { \
1484 a >>= 1; \ 2197 a >>= 1; \
@@ -1524,22 +2237,6 @@ do { \
1524 return div64_u64(dividend, divisor); 2237 return div64_u64(dividend, divisor);
1525} 2238}
1526 2239
1527static void perf_event_stop(struct perf_event *event)
1528{
1529 if (!event->pmu->stop)
1530 return event->pmu->disable(event);
1531
1532 return event->pmu->stop(event);
1533}
1534
1535static int perf_event_start(struct perf_event *event)
1536{
1537 if (!event->pmu->start)
1538 return event->pmu->enable(event);
1539
1540 return event->pmu->start(event);
1541}
1542
1543static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 2240static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1544{ 2241{
1545 struct hw_perf_event *hwc = &event->hw; 2242 struct hw_perf_event *hwc = &event->hw;
@@ -1559,15 +2256,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1559 hwc->sample_period = sample_period; 2256 hwc->sample_period = sample_period;
1560 2257
1561 if (local64_read(&hwc->period_left) > 8*sample_period) { 2258 if (local64_read(&hwc->period_left) > 8*sample_period) {
1562 perf_disable(); 2259 event->pmu->stop(event, PERF_EF_UPDATE);
1563 perf_event_stop(event);
1564 local64_set(&hwc->period_left, 0); 2260 local64_set(&hwc->period_left, 0);
1565 perf_event_start(event); 2261 event->pmu->start(event, PERF_EF_RELOAD);
1566 perf_enable();
1567 } 2262 }
1568} 2263}
1569 2264
1570static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 2265static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1571{ 2266{
1572 struct perf_event *event; 2267 struct perf_event *event;
1573 struct hw_perf_event *hwc; 2268 struct hw_perf_event *hwc;
@@ -1579,7 +2274,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1579 if (event->state != PERF_EVENT_STATE_ACTIVE) 2274 if (event->state != PERF_EVENT_STATE_ACTIVE)
1580 continue; 2275 continue;
1581 2276
1582 if (event->cpu != -1 && event->cpu != smp_processor_id()) 2277 if (!event_filter_match(event))
1583 continue; 2278 continue;
1584 2279
1585 hwc = &event->hw; 2280 hwc = &event->hw;
@@ -1592,23 +2287,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1592 */ 2287 */
1593 if (interrupts == MAX_INTERRUPTS) { 2288 if (interrupts == MAX_INTERRUPTS) {
1594 perf_log_throttle(event, 1); 2289 perf_log_throttle(event, 1);
1595 perf_disable(); 2290 event->pmu->start(event, 0);
1596 event->pmu->unthrottle(event);
1597 perf_enable();
1598 } 2291 }
1599 2292
1600 if (!event->attr.freq || !event->attr.sample_freq) 2293 if (!event->attr.freq || !event->attr.sample_freq)
1601 continue; 2294 continue;
1602 2295
1603 perf_disable();
1604 event->pmu->read(event); 2296 event->pmu->read(event);
1605 now = local64_read(&event->count); 2297 now = local64_read(&event->count);
1606 delta = now - hwc->freq_count_stamp; 2298 delta = now - hwc->freq_count_stamp;
1607 hwc->freq_count_stamp = now; 2299 hwc->freq_count_stamp = now;
1608 2300
1609 if (delta > 0) 2301 if (delta > 0)
1610 perf_adjust_period(event, TICK_NSEC, delta); 2302 perf_adjust_period(event, period, delta);
1611 perf_enable();
1612 } 2303 }
1613 raw_spin_unlock(&ctx->lock); 2304 raw_spin_unlock(&ctx->lock);
1614} 2305}
@@ -1620,38 +2311,48 @@ static void rotate_ctx(struct perf_event_context *ctx)
1620{ 2311{
1621 raw_spin_lock(&ctx->lock); 2312 raw_spin_lock(&ctx->lock);
1622 2313
1623 /* Rotate the first entry last of non-pinned groups */ 2314 /*
1624 list_rotate_left(&ctx->flexible_groups); 2315 * Rotate the first entry last of non-pinned groups. Rotation might be
2316 * disabled by the inheritance code.
2317 */
2318 if (!ctx->rotate_disable)
2319 list_rotate_left(&ctx->flexible_groups);
1625 2320
1626 raw_spin_unlock(&ctx->lock); 2321 raw_spin_unlock(&ctx->lock);
1627} 2322}
1628 2323
1629void perf_event_task_tick(struct task_struct *curr) 2324/*
2325 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2326 * because they're strictly cpu affine and rotate_start is called with IRQs
2327 * disabled, while rotate_context is called from IRQ context.
2328 */
2329static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1630{ 2330{
1631 struct perf_cpu_context *cpuctx; 2331 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
1632 struct perf_event_context *ctx; 2332 struct perf_event_context *ctx = NULL;
1633 int rotate = 0; 2333 int rotate = 0, remove = 1;
1634
1635 if (!atomic_read(&nr_events))
1636 return;
1637 2334
1638 cpuctx = &__get_cpu_var(perf_cpu_context); 2335 if (cpuctx->ctx.nr_events) {
1639 if (cpuctx->ctx.nr_events && 2336 remove = 0;
1640 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 2337 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1641 rotate = 1; 2338 rotate = 1;
2339 }
1642 2340
1643 ctx = curr->perf_event_ctxp; 2341 ctx = cpuctx->task_ctx;
1644 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) 2342 if (ctx && ctx->nr_events) {
1645 rotate = 1; 2343 remove = 0;
2344 if (ctx->nr_events != ctx->nr_active)
2345 rotate = 1;
2346 }
1646 2347
1647 perf_ctx_adjust_freq(&cpuctx->ctx); 2348 perf_pmu_disable(cpuctx->ctx.pmu);
2349 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1648 if (ctx) 2350 if (ctx)
1649 perf_ctx_adjust_freq(ctx); 2351 perf_ctx_adjust_freq(ctx, interval);
1650 2352
1651 if (!rotate) 2353 if (!rotate)
1652 return; 2354 goto done;
1653 2355
1654 perf_disable();
1655 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2356 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1656 if (ctx) 2357 if (ctx)
1657 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 2358 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1660,10 +2361,29 @@ void perf_event_task_tick(struct task_struct *curr)
1660 if (ctx) 2361 if (ctx)
1661 rotate_ctx(ctx); 2362 rotate_ctx(ctx);
1662 2363
1663 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2364 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1664 if (ctx) 2365 if (ctx)
1665 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 2366 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1666 perf_enable(); 2367
2368done:
2369 if (remove)
2370 list_del_init(&cpuctx->rotation_list);
2371
2372 perf_pmu_enable(cpuctx->ctx.pmu);
2373}
2374
2375void perf_event_task_tick(void)
2376{
2377 struct list_head *head = &__get_cpu_var(rotation_list);
2378 struct perf_cpu_context *cpuctx, *tmp;
2379
2380 WARN_ON(!irqs_disabled());
2381
2382 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2383 if (cpuctx->jiffies_interval == 1 ||
2384 !(jiffies % cpuctx->jiffies_interval))
2385 perf_rotate_context(cpuctx);
2386 }
1667} 2387}
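perf_event_task_tick() no longer takes the current task: it walks the per-cpu rotation_list and rotates each cpu context only on ticks that match its jiffies_interval, and perf_rotate_context() then moves the first flexible group to the tail so starved groups eventually get counter time. A compact userspace model of that cadence (toy_cpuctx, the three-entry array and the interval values are all made up for illustration):

#include <stdio.h>

#define NR_CTX 2

struct toy_cpuctx {
    const char *name;
    unsigned int jiffies_interval;  /* rotate every N ticks */
    int flexible[3];                /* stand-in for ctx->flexible_groups */
};

/* Move the first flexible group to the tail, like rotate_ctx(). */
static void rotate_one(struct toy_cpuctx *c)
{
    int first = c->flexible[0];

    c->flexible[0] = c->flexible[1];
    c->flexible[1] = c->flexible[2];
    c->flexible[2] = first;
}

/* Tick handler: only rotate contexts whose interval matches this tick. */
static void tick(struct toy_cpuctx *ctxs, unsigned long jiffies)
{
    for (int i = 0; i < NR_CTX; i++) {
        struct toy_cpuctx *c = &ctxs[i];

        if (c->jiffies_interval == 1 || !(jiffies % c->jiffies_interval))
            rotate_one(c);
    }
}

int main(void)
{
    struct toy_cpuctx ctxs[NR_CTX] = {
        { "fast-pmu", 1, { 1, 2, 3 } },
        { "slow-pmu", 4, { 1, 2, 3 } },
    };

    for (unsigned long j = 1; j <= 4; j++)
        tick(ctxs, j);

    for (int i = 0; i < NR_CTX; i++)
        printf("%s: %d %d %d\n", ctxs[i].name, ctxs[i].flexible[0],
               ctxs[i].flexible[1], ctxs[i].flexible[2]);
    return 0;
}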
1668 2388
1669static int event_enable_on_exec(struct perf_event *event, 2389static int event_enable_on_exec(struct perf_event *event,
@@ -1685,20 +2405,26 @@ static int event_enable_on_exec(struct perf_event *event,
1685 * Enable all of a task's events that have been marked enable-on-exec. 2405 * Enable all of a task's events that have been marked enable-on-exec.
1686 * This expects task == current. 2406 * This expects task == current.
1687 */ 2407 */
1688static void perf_event_enable_on_exec(struct task_struct *task) 2408static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1689{ 2409{
1690 struct perf_event_context *ctx;
1691 struct perf_event *event; 2410 struct perf_event *event;
1692 unsigned long flags; 2411 unsigned long flags;
1693 int enabled = 0; 2412 int enabled = 0;
1694 int ret; 2413 int ret;
1695 2414
1696 local_irq_save(flags); 2415 local_irq_save(flags);
1697 ctx = task->perf_event_ctxp;
1698 if (!ctx || !ctx->nr_events) 2416 if (!ctx || !ctx->nr_events)
1699 goto out; 2417 goto out;
1700 2418
1701 __perf_event_task_sched_out(ctx); 2419 /*
2420 * We must ctxsw out cgroup events to avoid conflict
2421 * when invoking perf_task_event_sched_in() later on
2422 * in this function. Otherwise we end up trying to
2423 * ctxswin cgroup events which are already scheduled
2424 * in.
2425 */
2426 perf_cgroup_sched_out(current);
2427 task_ctx_sched_out(ctx, EVENT_ALL);
1702 2428
1703 raw_spin_lock(&ctx->lock); 2429 raw_spin_lock(&ctx->lock);
1704 2430
@@ -1722,8 +2448,11 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1722 2448
1723 raw_spin_unlock(&ctx->lock); 2449 raw_spin_unlock(&ctx->lock);
1724 2450
1725 perf_event_task_sched_in(task); 2451 /*
1726 out: 2452 * Also calls ctxswin for cgroup events, if any:
2453 */
2454 perf_event_context_sched_in(ctx, ctx->task);
2455out:
1727 local_irq_restore(flags); 2456 local_irq_restore(flags);
1728} 2457}
1729 2458
@@ -1732,9 +2461,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1732 */ 2461 */
1733static void __perf_event_read(void *info) 2462static void __perf_event_read(void *info)
1734{ 2463{
1735 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1736 struct perf_event *event = info; 2464 struct perf_event *event = info;
1737 struct perf_event_context *ctx = event->ctx; 2465 struct perf_event_context *ctx = event->ctx;
2466 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1738 2467
1739 /* 2468 /*
1740 * If this is a task context, we need to check whether it is 2469 * If this is a task context, we need to check whether it is
@@ -1747,11 +2476,14 @@ static void __perf_event_read(void *info)
1747 return; 2476 return;
1748 2477
1749 raw_spin_lock(&ctx->lock); 2478 raw_spin_lock(&ctx->lock);
1750 update_context_time(ctx); 2479 if (ctx->is_active) {
2480 update_context_time(ctx);
2481 update_cgrp_time_from_event(event);
2482 }
1751 update_event_times(event); 2483 update_event_times(event);
2484 if (event->state == PERF_EVENT_STATE_ACTIVE)
2485 event->pmu->read(event);
1752 raw_spin_unlock(&ctx->lock); 2486 raw_spin_unlock(&ctx->lock);
1753
1754 event->pmu->read(event);
1755} 2487}
1756 2488
1757static inline u64 perf_event_count(struct perf_event *event) 2489static inline u64 perf_event_count(struct perf_event *event)
@@ -1773,7 +2505,15 @@ static u64 perf_event_read(struct perf_event *event)
1773 unsigned long flags; 2505 unsigned long flags;
1774 2506
1775 raw_spin_lock_irqsave(&ctx->lock, flags); 2507 raw_spin_lock_irqsave(&ctx->lock, flags);
1776 update_context_time(ctx); 2508 /*
2509 * may read while context is not active
2510 * (e.g., thread is blocked), in that case
2511 * we cannot update context time
2512 */
2513 if (ctx->is_active) {
2514 update_context_time(ctx);
2515 update_cgrp_time_from_event(event);
2516 }
1777 update_event_times(event); 2517 update_event_times(event);
1778 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2518 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1779 } 2519 }
@@ -1782,11 +2522,218 @@ static u64 perf_event_read(struct perf_event *event)
1782} 2522}
1783 2523
1784/* 2524/*
1785 * Initialize the perf_event context in a task_struct: 2525 * Callchain support
1786 */ 2526 */
2527
2528struct callchain_cpus_entries {
2529 struct rcu_head rcu_head;
2530 struct perf_callchain_entry *cpu_entries[0];
2531};
2532
2533static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2534static atomic_t nr_callchain_events;
2535static DEFINE_MUTEX(callchain_mutex);
2536struct callchain_cpus_entries *callchain_cpus_entries;
2537
2538
2539__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2540 struct pt_regs *regs)
2541{
2542}
2543
2544__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2545 struct pt_regs *regs)
2546{
2547}
2548
2549static void release_callchain_buffers_rcu(struct rcu_head *head)
2550{
2551 struct callchain_cpus_entries *entries;
2552 int cpu;
2553
2554 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2555
2556 for_each_possible_cpu(cpu)
2557 kfree(entries->cpu_entries[cpu]);
2558
2559 kfree(entries);
2560}
2561
2562static void release_callchain_buffers(void)
2563{
2564 struct callchain_cpus_entries *entries;
2565
2566 entries = callchain_cpus_entries;
2567 rcu_assign_pointer(callchain_cpus_entries, NULL);
2568 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2569}
2570
2571static int alloc_callchain_buffers(void)
2572{
2573 int cpu;
2574 int size;
2575 struct callchain_cpus_entries *entries;
2576
2577 /*
2578 * We can't use the percpu allocation API for data that can be
2579 * accessed from NMI. Use a temporary manual per cpu allocation
2580 * until that gets sorted out.
2581 */
2582 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2583
2584 entries = kzalloc(size, GFP_KERNEL);
2585 if (!entries)
2586 return -ENOMEM;
2587
2588 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2589
2590 for_each_possible_cpu(cpu) {
2591 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2592 cpu_to_node(cpu));
2593 if (!entries->cpu_entries[cpu])
2594 goto fail;
2595 }
2596
2597 rcu_assign_pointer(callchain_cpus_entries, entries);
2598
2599 return 0;
2600
2601fail:
2602 for_each_possible_cpu(cpu)
2603 kfree(entries->cpu_entries[cpu]);
2604 kfree(entries);
2605
2606 return -ENOMEM;
2607}
2608
2609static int get_callchain_buffers(void)
2610{
2611 int err = 0;
2612 int count;
2613
2614 mutex_lock(&callchain_mutex);
2615
2616 count = atomic_inc_return(&nr_callchain_events);
2617 if (WARN_ON_ONCE(count < 1)) {
2618 err = -EINVAL;
2619 goto exit;
2620 }
2621
2622 if (count > 1) {
2623 /* If the allocation failed, give up */
2624 if (!callchain_cpus_entries)
2625 err = -ENOMEM;
2626 goto exit;
2627 }
2628
2629 err = alloc_callchain_buffers();
2630 if (err)
2631 release_callchain_buffers();
2632exit:
2633 mutex_unlock(&callchain_mutex);
2634
2635 return err;
2636}
2637
2638static void put_callchain_buffers(void)
2639{
2640 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2641 release_callchain_buffers();
2642 mutex_unlock(&callchain_mutex);
2643 }
2644}
2645
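get_callchain_buffers()/put_callchain_buffers() above implement a first-user-allocates, last-user-frees scheme under callchain_mutex, with the counter tracking how many sampling events requested callchains. A simplified userspace sketch of the same idea (the kernel version additionally frees through RCU and allocates one entry array per possible CPU; the single calloc() here is just a placeholder, and the error handling is condensed):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t callchain_mutex = PTHREAD_MUTEX_INITIALIZER;
static int nr_callchain_users;
static void *callchain_buffers;

static int get_buffers(void)
{
    int err = 0;

    pthread_mutex_lock(&callchain_mutex);
    if (++nr_callchain_users == 1) {
        /* first user allocates for everybody */
        callchain_buffers = calloc(1, 4096);
        if (!callchain_buffers) {
            nr_callchain_users--;
            err = -1;
        }
    } else if (!callchain_buffers) {
        /* an earlier allocation failed; refuse new users too */
        nr_callchain_users--;
        err = -1;
    }
    pthread_mutex_unlock(&callchain_mutex);
    return err;
}

static void put_buffers(void)
{
    pthread_mutex_lock(&callchain_mutex);
    if (--nr_callchain_users == 0) {
        /* last user frees */
        free(callchain_buffers);
        callchain_buffers = NULL;
    }
    pthread_mutex_unlock(&callchain_mutex);
}

int main(void)
{
    if (get_buffers() == 0 && get_buffers() == 0) {
        put_buffers();
        put_buffers();
    }
    printf("buffers released: %s\n", callchain_buffers ? "no" : "yes");
    return 0;
}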
2646static int get_recursion_context(int *recursion)
2647{
2648 int rctx;
2649
2650 if (in_nmi())
2651 rctx = 3;
2652 else if (in_irq())
2653 rctx = 2;
2654 else if (in_softirq())
2655 rctx = 1;
2656 else
2657 rctx = 0;
2658
2659 if (recursion[rctx])
2660 return -1;
2661
2662 recursion[rctx]++;
2663 barrier();
2664
2665 return rctx;
2666}
2667
2668static inline void put_recursion_context(int *recursion, int rctx)
2669{
2670 barrier();
2671 recursion[rctx]--;
2672}
2673
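The recursion counters above give each execution level (task, softirq, irq, NMI) its own slot, so a callchain capture may interrupt one running at a different level, while re-entry at the same level bails out. In the kernel the slot is derived from in_nmi()/in_irq()/in_softirq() and the array is per-cpu; this little userspace demo passes the slot explicitly just to show the accounting:

#include <stdio.h>

#define NR_CONTEXTS 4    /* task, softirq, irq, nmi */

static int recursion[NR_CONTEXTS];   /* per-cpu in the kernel */

static int get_recursion(int rctx)
{
    if (recursion[rctx])
        return -1;       /* already capturing at this level */
    recursion[rctx]++;
    return rctx;
}

static void put_recursion(int rctx)
{
    recursion[rctx]--;
}

int main(void)
{
    int task = get_recursion(0);     /* task level: ok */
    int irq = get_recursion(2);      /* irq interrupts task: ok */
    int nested = get_recursion(2);   /* same level again: refused */

    printf("task=%d irq=%d nested-irq=%d\n", task, irq, nested);

    put_recursion(irq);
    put_recursion(task);
    return 0;
}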
2674static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2675{
2676 int cpu;
2677 struct callchain_cpus_entries *entries;
2678
2679 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2680 if (*rctx == -1)
2681 return NULL;
2682
2683 entries = rcu_dereference(callchain_cpus_entries);
2684 if (!entries)
2685 return NULL;
2686
2687 cpu = smp_processor_id();
2688
2689 return &entries->cpu_entries[cpu][*rctx];
2690}
2691
1787static void 2692static void
1788__perf_event_init_context(struct perf_event_context *ctx, 2693put_callchain_entry(int rctx)
1789 struct task_struct *task) 2694{
2695 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2696}
2697
2698static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2699{
2700 int rctx;
2701 struct perf_callchain_entry *entry;
2702
2703
2704 entry = get_callchain_entry(&rctx);
2705 if (rctx == -1)
2706 return NULL;
2707
2708 if (!entry)
2709 goto exit_put;
2710
2711 entry->nr = 0;
2712
2713 if (!user_mode(regs)) {
2714 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2715 perf_callchain_kernel(entry, regs);
2716 if (current->mm)
2717 regs = task_pt_regs(current);
2718 else
2719 regs = NULL;
2720 }
2721
2722 if (regs) {
2723 perf_callchain_store(entry, PERF_CONTEXT_USER);
2724 perf_callchain_user(entry, regs);
2725 }
2726
2727exit_put:
2728 put_callchain_entry(rctx);
2729
2730 return entry;
2731}
2732
2733/*
2734 * Initialize the perf_event context in a task_struct:
2735 */
2736static void __perf_event_init_context(struct perf_event_context *ctx)
1790{ 2737{
1791 raw_spin_lock_init(&ctx->lock); 2738 raw_spin_lock_init(&ctx->lock);
1792 mutex_init(&ctx->mutex); 2739 mutex_init(&ctx->mutex);
@@ -1794,25 +2741,73 @@ __perf_event_init_context(struct perf_event_context *ctx,
1794 INIT_LIST_HEAD(&ctx->flexible_groups); 2741 INIT_LIST_HEAD(&ctx->flexible_groups);
1795 INIT_LIST_HEAD(&ctx->event_list); 2742 INIT_LIST_HEAD(&ctx->event_list);
1796 atomic_set(&ctx->refcount, 1); 2743 atomic_set(&ctx->refcount, 1);
1797 ctx->task = task;
1798} 2744}
1799 2745
1800static struct perf_event_context *find_get_context(pid_t pid, int cpu) 2746static struct perf_event_context *
2747alloc_perf_context(struct pmu *pmu, struct task_struct *task)
1801{ 2748{
1802 struct perf_event_context *ctx; 2749 struct perf_event_context *ctx;
1803 struct perf_cpu_context *cpuctx; 2750
2751 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2752 if (!ctx)
2753 return NULL;
2754
2755 __perf_event_init_context(ctx);
2756 if (task) {
2757 ctx->task = task;
2758 get_task_struct(task);
2759 }
2760 ctx->pmu = pmu;
2761
2762 return ctx;
2763}
2764
2765static struct task_struct *
2766find_lively_task_by_vpid(pid_t vpid)
2767{
1804 struct task_struct *task; 2768 struct task_struct *task;
1805 unsigned long flags;
1806 int err; 2769 int err;
1807 2770
1808 if (pid == -1 && cpu != -1) { 2771 rcu_read_lock();
2772 if (!vpid)
2773 task = current;
2774 else
2775 task = find_task_by_vpid(vpid);
2776 if (task)
2777 get_task_struct(task);
2778 rcu_read_unlock();
2779
2780 if (!task)
2781 return ERR_PTR(-ESRCH);
2782
2783 /* Reuse ptrace permission checks for now. */
2784 err = -EACCES;
2785 if (!ptrace_may_access(task, PTRACE_MODE_READ))
2786 goto errout;
2787
2788 return task;
2789errout:
2790 put_task_struct(task);
2791 return ERR_PTR(err);
2792
2793}
2794
2795/*
2796 * Returns a matching context with refcount and pincount.
2797 */
2798static struct perf_event_context *
2799find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2800{
2801 struct perf_event_context *ctx;
2802 struct perf_cpu_context *cpuctx;
2803 unsigned long flags;
2804 int ctxn, err;
2805
2806 if (!task) {
1809 /* Must be root to operate on a CPU event: */ 2807 /* Must be root to operate on a CPU event: */
1810 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2808 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1811 return ERR_PTR(-EACCES); 2809 return ERR_PTR(-EACCES);
1812 2810
1813 if (cpu < 0 || cpu >= nr_cpumask_bits)
1814 return ERR_PTR(-EINVAL);
1815
1816 /* 2811 /*
1817 * We could be clever and allow to attach a event to an 2812 * We could be clever and allow to attach a event to an
1818 * offline CPU and activate it when the CPU comes up, but 2813 * offline CPU and activate it when the CPU comes up, but
@@ -1821,67 +2816,64 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1821 if (!cpu_online(cpu)) 2816 if (!cpu_online(cpu))
1822 return ERR_PTR(-ENODEV); 2817 return ERR_PTR(-ENODEV);
1823 2818
1824 cpuctx = &per_cpu(perf_cpu_context, cpu); 2819 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
1825 ctx = &cpuctx->ctx; 2820 ctx = &cpuctx->ctx;
1826 get_ctx(ctx); 2821 get_ctx(ctx);
2822 ++ctx->pin_count;
1827 2823
1828 return ctx; 2824 return ctx;
1829 } 2825 }
1830 2826
1831 rcu_read_lock(); 2827 err = -EINVAL;
1832 if (!pid) 2828 ctxn = pmu->task_ctx_nr;
1833 task = current; 2829 if (ctxn < 0)
1834 else
1835 task = find_task_by_vpid(pid);
1836 if (task)
1837 get_task_struct(task);
1838 rcu_read_unlock();
1839
1840 if (!task)
1841 return ERR_PTR(-ESRCH);
1842
1843 /*
1844 * Can't attach events to a dying task.
1845 */
1846 err = -ESRCH;
1847 if (task->flags & PF_EXITING)
1848 goto errout;
1849
1850 /* Reuse ptrace permission checks for now. */
1851 err = -EACCES;
1852 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1853 goto errout; 2830 goto errout;
1854 2831
1855 retry: 2832retry:
1856 ctx = perf_lock_task_context(task, &flags); 2833 ctx = perf_lock_task_context(task, ctxn, &flags);
1857 if (ctx) { 2834 if (ctx) {
1858 unclone_ctx(ctx); 2835 unclone_ctx(ctx);
2836 ++ctx->pin_count;
1859 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2837 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1860 } 2838 }
1861 2839
1862 if (!ctx) { 2840 if (!ctx) {
1863 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 2841 ctx = alloc_perf_context(pmu, task);
1864 err = -ENOMEM; 2842 err = -ENOMEM;
1865 if (!ctx) 2843 if (!ctx)
1866 goto errout; 2844 goto errout;
1867 __perf_event_init_context(ctx, task); 2845
1868 get_ctx(ctx); 2846 get_ctx(ctx);
1869 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 2847
1870 /* 2848 err = 0;
1871 * We raced with some other task; use 2849 mutex_lock(&task->perf_event_mutex);
1872 * the context they set. 2850 /*
1873 */ 2851 * If it has already passed perf_event_exit_task().
2852 * we must see PF_EXITING, it takes this mutex too.
2853 */
2854 if (task->flags & PF_EXITING)
2855 err = -ESRCH;
2856 else if (task->perf_event_ctxp[ctxn])
2857 err = -EAGAIN;
2858 else {
2859 ++ctx->pin_count;
2860 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2861 }
2862 mutex_unlock(&task->perf_event_mutex);
2863
2864 if (unlikely(err)) {
2865 put_task_struct(task);
1874 kfree(ctx); 2866 kfree(ctx);
1875 goto retry; 2867
2868 if (err == -EAGAIN)
2869 goto retry;
2870 goto errout;
1876 } 2871 }
1877 get_task_struct(task);
1878 } 2872 }
1879 2873
1880 put_task_struct(task);
1881 return ctx; 2874 return ctx;
1882 2875
1883 errout: 2876errout:
1884 put_task_struct(task);
1885 return ERR_PTR(err); 2877 return ERR_PTR(err);
1886} 2878}
1887 2879
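The cmpxchg()-based install is replaced by publication under task->perf_event_mutex, which also lets find_get_context() observe PF_EXITING reliably: allocate a context, take the mutex, fail with -ESRCH if the task is exiting, retry with the winner's context on -EAGAIN, otherwise publish. A simplified, single-threaded userspace sketch of that publish step (toy_task is invented; the kernel looks the existing context up via RCU in perf_lock_task_context() rather than under the mutex as done here):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_task {
    pthread_mutex_t perf_event_mutex;
    int exiting;               /* stand-in for PF_EXITING */
    void *perf_event_ctxp;     /* published context, if any */
};

static void *toy_find_get_context(struct toy_task *task)
{
    void *ctx;

retry:
    pthread_mutex_lock(&task->perf_event_mutex);
    ctx = task->perf_event_ctxp;
    pthread_mutex_unlock(&task->perf_event_mutex);
    if (ctx)
        return ctx;            /* reuse the published context */

    ctx = malloc(64);          /* alloc_perf_context() stand-in */
    if (!ctx)
        return NULL;

    pthread_mutex_lock(&task->perf_event_mutex);
    if (task->exiting) {
        /* like the kernel's -ESRCH: the task already ran its exit path */
        pthread_mutex_unlock(&task->perf_event_mutex);
        free(ctx);
        errno = ESRCH;
        return NULL;
    }
    if (task->perf_event_ctxp) {
        /* lost the race (-EAGAIN): drop ours, go use the winner's */
        pthread_mutex_unlock(&task->perf_event_mutex);
        free(ctx);
        goto retry;
    }
    task->perf_event_ctxp = ctx;   /* publish */
    pthread_mutex_unlock(&task->perf_event_mutex);
    return ctx;
}

int main(void)
{
    struct toy_task task = { PTHREAD_MUTEX_INITIALIZER, 0, NULL };
    void *a = toy_find_get_context(&task);
    void *b = toy_find_get_context(&task);

    printf("same context reused: %s\n", a == b ? "yes" : "no");
    free(a);
    return 0;
}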
@@ -1898,21 +2890,27 @@ static void free_event_rcu(struct rcu_head *head)
1898 kfree(event); 2890 kfree(event);
1899} 2891}
1900 2892
1901static void perf_pending_sync(struct perf_event *event);
1902static void perf_buffer_put(struct perf_buffer *buffer); 2893static void perf_buffer_put(struct perf_buffer *buffer);
1903 2894
1904static void free_event(struct perf_event *event) 2895static void free_event(struct perf_event *event)
1905{ 2896{
1906 perf_pending_sync(event); 2897 irq_work_sync(&event->pending);
1907 2898
1908 if (!event->parent) { 2899 if (!event->parent) {
1909 atomic_dec(&nr_events); 2900 if (event->attach_state & PERF_ATTACH_TASK)
2901 jump_label_dec(&perf_sched_events);
1910 if (event->attr.mmap || event->attr.mmap_data) 2902 if (event->attr.mmap || event->attr.mmap_data)
1911 atomic_dec(&nr_mmap_events); 2903 atomic_dec(&nr_mmap_events);
1912 if (event->attr.comm) 2904 if (event->attr.comm)
1913 atomic_dec(&nr_comm_events); 2905 atomic_dec(&nr_comm_events);
1914 if (event->attr.task) 2906 if (event->attr.task)
1915 atomic_dec(&nr_task_events); 2907 atomic_dec(&nr_task_events);
2908 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2909 put_callchain_buffers();
2910 if (is_cgroup_event(event)) {
2911 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2912 jump_label_dec(&perf_sched_events);
2913 }
1916 } 2914 }
1917 2915
1918 if (event->buffer) { 2916 if (event->buffer) {
@@ -1920,10 +2918,15 @@ static void free_event(struct perf_event *event)
1920 event->buffer = NULL; 2918 event->buffer = NULL;
1921 } 2919 }
1922 2920
2921 if (is_cgroup_event(event))
2922 perf_detach_cgroup(event);
2923
1923 if (event->destroy) 2924 if (event->destroy)
1924 event->destroy(event); 2925 event->destroy(event);
1925 2926
1926 put_ctx(event->ctx); 2927 if (event->ctx)
2928 put_ctx(event->ctx);
2929
1927 call_rcu(&event->rcu_head, free_event_rcu); 2930 call_rcu(&event->rcu_head, free_event_rcu);
1928} 2931}
1929 2932
@@ -1957,11 +2960,6 @@ int perf_event_release_kernel(struct perf_event *event)
1957 raw_spin_unlock_irq(&ctx->lock); 2960 raw_spin_unlock_irq(&ctx->lock);
1958 mutex_unlock(&ctx->mutex); 2961 mutex_unlock(&ctx->mutex);
1959 2962
1960 mutex_lock(&event->owner->perf_event_mutex);
1961 list_del_init(&event->owner_entry);
1962 mutex_unlock(&event->owner->perf_event_mutex);
1963 put_task_struct(event->owner);
1964
1965 free_event(event); 2963 free_event(event);
1966 2964
1967 return 0; 2965 return 0;
@@ -1974,35 +2972,44 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1974static int perf_release(struct inode *inode, struct file *file) 2972static int perf_release(struct inode *inode, struct file *file)
1975{ 2973{
1976 struct perf_event *event = file->private_data; 2974 struct perf_event *event = file->private_data;
2975 struct task_struct *owner;
1977 2976
1978 file->private_data = NULL; 2977 file->private_data = NULL;
1979 2978
1980 return perf_event_release_kernel(event); 2979 rcu_read_lock();
1981} 2980 owner = ACCESS_ONCE(event->owner);
1982 2981 /*
1983static int perf_event_read_size(struct perf_event *event) 2982 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
1984{ 2983 * !owner it means the list deletion is complete and we can indeed
1985 int entry = sizeof(u64); /* value */ 2984 * free this event, otherwise we need to serialize on
1986 int size = 0; 2985 * owner->perf_event_mutex.
1987 int nr = 1; 2986 */
1988 2987 smp_read_barrier_depends();
1989 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 2988 if (owner) {
1990 size += sizeof(u64); 2989 /*
1991 2990 * Since delayed_put_task_struct() also drops the last
1992 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 2991 * task reference we can safely take a new reference
1993 size += sizeof(u64); 2992 * while holding the rcu_read_lock().
1994 2993 */
1995 if (event->attr.read_format & PERF_FORMAT_ID) 2994 get_task_struct(owner);
1996 entry += sizeof(u64);
1997
1998 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1999 nr += event->group_leader->nr_siblings;
2000 size += sizeof(u64);
2001 } 2995 }
2996 rcu_read_unlock();
2002 2997
2003 size += entry * nr; 2998 if (owner) {
2999 mutex_lock(&owner->perf_event_mutex);
3000 /*
3001 * We have to re-check the event->owner field, if it is cleared
3002 * we raced with perf_event_exit_task(), acquiring the mutex
3003 * ensured they're done, and we can proceed with freeing the
3004 * event.
3005 */
3006 if (event->owner)
3007 list_del_init(&event->owner_entry);
3008 mutex_unlock(&owner->perf_event_mutex);
3009 put_task_struct(owner);
3010 }
2004 3011
2005 return size; 3012 return perf_event_release_kernel(event);
2006} 3013}
2007 3014
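A compact sketch of the RCU-protected owner lookup that perf_release() now performs (example_* is illustrative, not a new kernel helper): take the task reference inside the RCU read side, then re-check event->owner under owner->perf_event_mutex before touching owner_entry, as the hunk above does.

static struct task_struct *example_get_owner(struct perf_event *event)
{
	struct task_struct *owner;

	rcu_read_lock();
	owner = ACCESS_ONCE(event->owner);
	/* pairs with the smp_wmb() in perf_event_exit_task() */
	smp_read_barrier_depends();
	if (owner)
		get_task_struct(owner);	/* safe: the task itself is freed via RCU */
	rcu_read_unlock();

	return owner;			/* NULL means exit already detached the event */
}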
2008u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 3015u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -2119,7 +3126,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2119 if (event->state == PERF_EVENT_STATE_ERROR) 3126 if (event->state == PERF_EVENT_STATE_ERROR)
2120 return 0; 3127 return 0;
2121 3128
2122 if (count < perf_event_read_size(event)) 3129 if (count < event->read_size)
2123 return -ENOSPC; 3130 return -ENOSPC;
2124 3131
2125 WARN_ON_ONCE(event->ctx->parent_ctx); 3132 WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2205,7 +3212,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2205 int ret = 0; 3212 int ret = 0;
2206 u64 value; 3213 u64 value;
2207 3214
2208 if (!event->attr.sample_period) 3215 if (!is_sampling_event(event))
2209 return -EINVAL; 3216 return -EINVAL;
2210 3217
2211 if (copy_from_user(&value, arg, sizeof(value))) 3218 if (copy_from_user(&value, arg, sizeof(value)))
@@ -2342,6 +3349,9 @@ int perf_event_task_disable(void)
2342 3349
2343static int perf_event_index(struct perf_event *event) 3350static int perf_event_index(struct perf_event *event)
2344{ 3351{
3352 if (event->hw.state & PERF_HES_STOPPED)
3353 return 0;
3354
2345 if (event->state != PERF_EVENT_STATE_ACTIVE) 3355 if (event->state != PERF_EVENT_STATE_ACTIVE)
2346 return 0; 3356 return 0;
2347 3357
@@ -2845,16 +3855,7 @@ void perf_event_wakeup(struct perf_event *event)
2845 } 3855 }
2846} 3856}
2847 3857
2848/* 3858static void perf_pending_event(struct irq_work *entry)
2849 * Pending wakeups
2850 *
2851 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2852 *
2853 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2854 * single linked list and use cmpxchg() to add entries lockless.
2855 */
2856
2857static void perf_pending_event(struct perf_pending_entry *entry)
2858{ 3859{
2859 struct perf_event *event = container_of(entry, 3860 struct perf_event *event = container_of(entry,
2860 struct perf_event, pending); 3861 struct perf_event, pending);
@@ -2870,99 +3871,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
2870 } 3871 }
2871} 3872}
2872 3873
2873#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2874
2875static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2876 PENDING_TAIL,
2877};
2878
2879static void perf_pending_queue(struct perf_pending_entry *entry,
2880 void (*func)(struct perf_pending_entry *))
2881{
2882 struct perf_pending_entry **head;
2883
2884 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2885 return;
2886
2887 entry->func = func;
2888
2889 head = &get_cpu_var(perf_pending_head);
2890
2891 do {
2892 entry->next = *head;
2893 } while (cmpxchg(head, entry->next, entry) != entry->next);
2894
2895 set_perf_event_pending();
2896
2897 put_cpu_var(perf_pending_head);
2898}
2899
2900static int __perf_pending_run(void)
2901{
2902 struct perf_pending_entry *list;
2903 int nr = 0;
2904
2905 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2906 while (list != PENDING_TAIL) {
2907 void (*func)(struct perf_pending_entry *);
2908 struct perf_pending_entry *entry = list;
2909
2910 list = list->next;
2911
2912 func = entry->func;
2913 entry->next = NULL;
2914 /*
2915 * Ensure we observe the unqueue before we issue the wakeup,
2916 * so that we won't be waiting forever.
2917 * -- see perf_not_pending().
2918 */
2919 smp_wmb();
2920
2921 func(entry);
2922 nr++;
2923 }
2924
2925 return nr;
2926}
2927
2928static inline int perf_not_pending(struct perf_event *event)
2929{
2930 /*
2931 * If we flush on whatever cpu we run, there is a chance we don't
2932 * need to wait.
2933 */
2934 get_cpu();
2935 __perf_pending_run();
2936 put_cpu();
2937
2938 /*
2939 * Ensure we see the proper queue state before going to sleep
2940 * so that we do not miss the wakeup. -- see perf_pending_handle()
2941 */
2942 smp_rmb();
2943 return event->pending.next == NULL;
2944}
2945
2946static void perf_pending_sync(struct perf_event *event)
2947{
2948 wait_event(event->waitq, perf_not_pending(event));
2949}
2950
2951void perf_event_do_pending(void)
2952{
2953 __perf_pending_run();
2954}
2955
2956/*
2957 * Callchain support -- arch specific
2958 */
2959
2960__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2961{
2962 return NULL;
2963}
2964
2965
2966/* 3874/*
2967 * We assume there is only KVM supporting the callbacks. 3875 * We assume there is only KVM supporting the callbacks.
2968 * Later on, we might change it to a list if there is 3876 * Later on, we might change it to a list if there is
@@ -3012,8 +3920,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
3012 3920
3013 if (handle->nmi) { 3921 if (handle->nmi) {
3014 handle->event->pending_wakeup = 1; 3922 handle->event->pending_wakeup = 1;
3015 perf_pending_queue(&handle->event->pending, 3923 irq_work_queue(&handle->event->pending);
3016 perf_pending_event);
3017 } else 3924 } else
3018 perf_event_wakeup(handle->event); 3925 perf_event_wakeup(handle->event);
3019} 3926}
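The hand-rolled perf_pending_* queue removed above is replaced by the generic irq_work facility; a hedged usage sketch, assuming the irq_work API introduced alongside this rework (the example_* names are illustrative):

#include <linux/irq_work.h>

static void example_cb(struct irq_work *entry)
{
	/* runs in hard-irq context shortly after the queueing NMI returns */
}

static struct irq_work example_work;

static void example_use(void)
{
	init_irq_work(&example_work, example_cb);
	irq_work_queue(&example_work);	/* NMI-safe, lockless enqueue */
	irq_work_sync(&example_work);	/* wait for the callback before freeing */
}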
@@ -3069,7 +3976,7 @@ again:
3069 if (handle->wakeup != local_read(&buffer->wakeup)) 3976 if (handle->wakeup != local_read(&buffer->wakeup))
3070 perf_output_wakeup(handle); 3977 perf_output_wakeup(handle);
3071 3978
3072 out: 3979out:
3073 preempt_enable(); 3980 preempt_enable();
3074} 3981}
3075 3982
@@ -3096,6 +4003,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3096 } while (len); 4003 } while (len);
3097} 4004}
3098 4005
4006static void __perf_event_header__init_id(struct perf_event_header *header,
4007 struct perf_sample_data *data,
4008 struct perf_event *event)
4009{
4010 u64 sample_type = event->attr.sample_type;
4011
4012 data->type = sample_type;
4013 header->size += event->id_header_size;
4014
4015 if (sample_type & PERF_SAMPLE_TID) {
4016 /* namespace issues */
4017 data->tid_entry.pid = perf_event_pid(event, current);
4018 data->tid_entry.tid = perf_event_tid(event, current);
4019 }
4020
4021 if (sample_type & PERF_SAMPLE_TIME)
4022 data->time = perf_clock();
4023
4024 if (sample_type & PERF_SAMPLE_ID)
4025 data->id = primary_event_id(event);
4026
4027 if (sample_type & PERF_SAMPLE_STREAM_ID)
4028 data->stream_id = event->id;
4029
4030 if (sample_type & PERF_SAMPLE_CPU) {
4031 data->cpu_entry.cpu = raw_smp_processor_id();
4032 data->cpu_entry.reserved = 0;
4033 }
4034}
4035
4036static void perf_event_header__init_id(struct perf_event_header *header,
4037 struct perf_sample_data *data,
4038 struct perf_event *event)
4039{
4040 if (event->attr.sample_id_all)
4041 __perf_event_header__init_id(header, data, event);
4042}
4043
4044static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4045 struct perf_sample_data *data)
4046{
4047 u64 sample_type = data->type;
4048
4049 if (sample_type & PERF_SAMPLE_TID)
4050 perf_output_put(handle, data->tid_entry);
4051
4052 if (sample_type & PERF_SAMPLE_TIME)
4053 perf_output_put(handle, data->time);
4054
4055 if (sample_type & PERF_SAMPLE_ID)
4056 perf_output_put(handle, data->id);
4057
4058 if (sample_type & PERF_SAMPLE_STREAM_ID)
4059 perf_output_put(handle, data->stream_id);
4060
4061 if (sample_type & PERF_SAMPLE_CPU)
4062 perf_output_put(handle, data->cpu_entry);
4063}
4064
4065static void perf_event__output_id_sample(struct perf_event *event,
4066 struct perf_output_handle *handle,
4067 struct perf_sample_data *sample)
4068{
4069 if (event->attr.sample_id_all)
4070 __perf_event__output_id_sample(handle, sample);
4071}
4072
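The two helpers above give every side-band record (COMM, MMAP, LOST, ...) the optional trailing id sample requested by attr.sample_id_all. A sketch of the intended call pattern, using the file-local helpers and a hypothetical example_emit_record():

static void example_emit_record(struct perf_event *event,
				struct perf_event_header *hdr)
{
	struct perf_output_handle handle;
	struct perf_sample_data sample;

	/* grows hdr->size by event->id_header_size when sample_id_all is set */
	perf_event_header__init_id(hdr, &sample, event);

	if (perf_output_begin(&handle, event, hdr->size, 0, 0))
		return;

	perf_output_put(&handle, *hdr);
	/* ... record-specific payload via perf_output_copy()/perf_output_put() ... */
	perf_event__output_id_sample(event, &handle, &sample);
	perf_output_end(&handle);
}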
3099int perf_output_begin(struct perf_output_handle *handle, 4073int perf_output_begin(struct perf_output_handle *handle,
3100 struct perf_event *event, unsigned int size, 4074 struct perf_event *event, unsigned int size,
3101 int nmi, int sample) 4075 int nmi, int sample)
@@ -3103,6 +4077,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3103 struct perf_buffer *buffer; 4077 struct perf_buffer *buffer;
3104 unsigned long tail, offset, head; 4078 unsigned long tail, offset, head;
3105 int have_lost; 4079 int have_lost;
4080 struct perf_sample_data sample_data;
3106 struct { 4081 struct {
3107 struct perf_event_header header; 4082 struct perf_event_header header;
3108 u64 id; 4083 u64 id;
@@ -3129,8 +4104,12 @@ int perf_output_begin(struct perf_output_handle *handle,
3129 goto out; 4104 goto out;
3130 4105
3131 have_lost = local_read(&buffer->lost); 4106 have_lost = local_read(&buffer->lost);
3132 if (have_lost) 4107 if (have_lost) {
3133 size += sizeof(lost_event); 4108 lost_event.header.size = sizeof(lost_event);
4109 perf_event_header__init_id(&lost_event.header, &sample_data,
4110 event);
4111 size += lost_event.header.size;
4112 }
3134 4113
3135 perf_output_get_handle(handle); 4114 perf_output_get_handle(handle);
3136 4115
@@ -3161,11 +4140,11 @@ int perf_output_begin(struct perf_output_handle *handle,
3161 if (have_lost) { 4140 if (have_lost) {
3162 lost_event.header.type = PERF_RECORD_LOST; 4141 lost_event.header.type = PERF_RECORD_LOST;
3163 lost_event.header.misc = 0; 4142 lost_event.header.misc = 0;
3164 lost_event.header.size = sizeof(lost_event);
3165 lost_event.id = event->id; 4143 lost_event.id = event->id;
3166 lost_event.lost = local_xchg(&buffer->lost, 0); 4144 lost_event.lost = local_xchg(&buffer->lost, 0);
3167 4145
3168 perf_output_put(handle, lost_event); 4146 perf_output_put(handle, lost_event);
4147 perf_event__output_id_sample(event, handle, &sample_data);
3169 } 4148 }
3170 4149
3171 return 0; 4150 return 0;
@@ -3198,30 +4177,9 @@ void perf_output_end(struct perf_output_handle *handle)
3198 rcu_read_unlock(); 4177 rcu_read_unlock();
3199} 4178}
3200 4179
3201static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3202{
3203 /*
3204 * only top level events have the pid namespace they were created in
3205 */
3206 if (event->parent)
3207 event = event->parent;
3208
3209 return task_tgid_nr_ns(p, event->ns);
3210}
3211
3212static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3213{
3214 /*
3215 * only top level events have the pid namespace they were created in
3216 */
3217 if (event->parent)
3218 event = event->parent;
3219
3220 return task_pid_nr_ns(p, event->ns);
3221}
3222
3223static void perf_output_read_one(struct perf_output_handle *handle, 4180static void perf_output_read_one(struct perf_output_handle *handle,
3224 struct perf_event *event) 4181 struct perf_event *event,
4182 u64 enabled, u64 running)
3225{ 4183{
3226 u64 read_format = event->attr.read_format; 4184 u64 read_format = event->attr.read_format;
3227 u64 values[4]; 4185 u64 values[4];
@@ -3229,11 +4187,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3229 4187
3230 values[n++] = perf_event_count(event); 4188 values[n++] = perf_event_count(event);
3231 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 4189 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3232 values[n++] = event->total_time_enabled + 4190 values[n++] = enabled +
3233 atomic64_read(&event->child_total_time_enabled); 4191 atomic64_read(&event->child_total_time_enabled);
3234 } 4192 }
3235 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 4193 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3236 values[n++] = event->total_time_running + 4194 values[n++] = running +
3237 atomic64_read(&event->child_total_time_running); 4195 atomic64_read(&event->child_total_time_running);
3238 } 4196 }
3239 if (read_format & PERF_FORMAT_ID) 4197 if (read_format & PERF_FORMAT_ID)
@@ -3246,7 +4204,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3246 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 4204 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3247 */ 4205 */
3248static void perf_output_read_group(struct perf_output_handle *handle, 4206static void perf_output_read_group(struct perf_output_handle *handle,
3249 struct perf_event *event) 4207 struct perf_event *event,
4208 u64 enabled, u64 running)
3250{ 4209{
3251 struct perf_event *leader = event->group_leader, *sub; 4210 struct perf_event *leader = event->group_leader, *sub;
3252 u64 read_format = event->attr.read_format; 4211 u64 read_format = event->attr.read_format;
@@ -3256,10 +4215,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3256 values[n++] = 1 + leader->nr_siblings; 4215 values[n++] = 1 + leader->nr_siblings;
3257 4216
3258 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 4217 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3259 values[n++] = leader->total_time_enabled; 4218 values[n++] = enabled;
3260 4219
3261 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 4220 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3262 values[n++] = leader->total_time_running; 4221 values[n++] = running;
3263 4222
3264 if (leader != event) 4223 if (leader != event)
3265 leader->pmu->read(leader); 4224 leader->pmu->read(leader);
@@ -3284,13 +4243,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3284 } 4243 }
3285} 4244}
3286 4245
4246#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4247 PERF_FORMAT_TOTAL_TIME_RUNNING)
4248
3287static void perf_output_read(struct perf_output_handle *handle, 4249static void perf_output_read(struct perf_output_handle *handle,
3288 struct perf_event *event) 4250 struct perf_event *event)
3289{ 4251{
4252 u64 enabled = 0, running = 0, now, ctx_time;
4253 u64 read_format = event->attr.read_format;
4254
4255 /*
4256 * compute total_time_enabled, total_time_running
4257 * based on snapshot values taken when the event
4258 * was last scheduled in.
4259 *
4260 * we cannot simply call update_context_time()
4261 * because of locking issues, as we are called in
4262 * NMI context
4263 */
4264 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
4265 now = perf_clock();
4266 ctx_time = event->shadow_ctx_time + now;
4267 enabled = ctx_time - event->tstamp_enabled;
4268 running = ctx_time - event->tstamp_running;
4269 }
4270
3290 if (event->attr.read_format & PERF_FORMAT_GROUP) 4271 if (event->attr.read_format & PERF_FORMAT_GROUP)
3291 perf_output_read_group(handle, event); 4272 perf_output_read_group(handle, event, enabled, running);
3292 else 4273 else
3293 perf_output_read_one(handle, event); 4274 perf_output_read_one(handle, event, enabled, running);
3294} 4275}
3295 4276
3296void perf_output_sample(struct perf_output_handle *handle, 4277void perf_output_sample(struct perf_output_handle *handle,
@@ -3370,61 +4351,16 @@ void perf_prepare_sample(struct perf_event_header *header,
3370{ 4351{
3371 u64 sample_type = event->attr.sample_type; 4352 u64 sample_type = event->attr.sample_type;
3372 4353
3373 data->type = sample_type;
3374
3375 header->type = PERF_RECORD_SAMPLE; 4354 header->type = PERF_RECORD_SAMPLE;
3376 header->size = sizeof(*header); 4355 header->size = sizeof(*header) + event->header_size;
3377 4356
3378 header->misc = 0; 4357 header->misc = 0;
3379 header->misc |= perf_misc_flags(regs); 4358 header->misc |= perf_misc_flags(regs);
3380 4359
3381 if (sample_type & PERF_SAMPLE_IP) { 4360 __perf_event_header__init_id(header, data, event);
3382 data->ip = perf_instruction_pointer(regs);
3383
3384 header->size += sizeof(data->ip);
3385 }
3386
3387 if (sample_type & PERF_SAMPLE_TID) {
3388 /* namespace issues */
3389 data->tid_entry.pid = perf_event_pid(event, current);
3390 data->tid_entry.tid = perf_event_tid(event, current);
3391
3392 header->size += sizeof(data->tid_entry);
3393 }
3394
3395 if (sample_type & PERF_SAMPLE_TIME) {
3396 data->time = perf_clock();
3397
3398 header->size += sizeof(data->time);
3399 }
3400
3401 if (sample_type & PERF_SAMPLE_ADDR)
3402 header->size += sizeof(data->addr);
3403
3404 if (sample_type & PERF_SAMPLE_ID) {
3405 data->id = primary_event_id(event);
3406
3407 header->size += sizeof(data->id);
3408 }
3409
3410 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3411 data->stream_id = event->id;
3412
3413 header->size += sizeof(data->stream_id);
3414 }
3415
3416 if (sample_type & PERF_SAMPLE_CPU) {
3417 data->cpu_entry.cpu = raw_smp_processor_id();
3418 data->cpu_entry.reserved = 0;
3419 4361
3420 header->size += sizeof(data->cpu_entry); 4362 if (sample_type & PERF_SAMPLE_IP)
3421 } 4363 data->ip = perf_instruction_pointer(regs);
3422
3423 if (sample_type & PERF_SAMPLE_PERIOD)
3424 header->size += sizeof(data->period);
3425
3426 if (sample_type & PERF_SAMPLE_READ)
3427 header->size += perf_event_read_size(event);
3428 4364
3429 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 4365 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3430 int size = 1; 4366 int size = 1;
@@ -3457,14 +4393,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
3457 struct perf_output_handle handle; 4393 struct perf_output_handle handle;
3458 struct perf_event_header header; 4394 struct perf_event_header header;
3459 4395
4396 /* protect the callchain buffers */
4397 rcu_read_lock();
4398
3460 perf_prepare_sample(&header, data, event, regs); 4399 perf_prepare_sample(&header, data, event, regs);
3461 4400
3462 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 4401 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3463 return; 4402 goto exit;
3464 4403
3465 perf_output_sample(&handle, &header, data, event); 4404 perf_output_sample(&handle, &header, data, event);
3466 4405
3467 perf_output_end(&handle); 4406 perf_output_end(&handle);
4407
4408exit:
4409 rcu_read_unlock();
3468} 4410}
3469 4411
3470/* 4412/*
@@ -3483,23 +4425,26 @@ perf_event_read_event(struct perf_event *event,
3483 struct task_struct *task) 4425 struct task_struct *task)
3484{ 4426{
3485 struct perf_output_handle handle; 4427 struct perf_output_handle handle;
4428 struct perf_sample_data sample;
3486 struct perf_read_event read_event = { 4429 struct perf_read_event read_event = {
3487 .header = { 4430 .header = {
3488 .type = PERF_RECORD_READ, 4431 .type = PERF_RECORD_READ,
3489 .misc = 0, 4432 .misc = 0,
3490 .size = sizeof(read_event) + perf_event_read_size(event), 4433 .size = sizeof(read_event) + event->read_size,
3491 }, 4434 },
3492 .pid = perf_event_pid(event, task), 4435 .pid = perf_event_pid(event, task),
3493 .tid = perf_event_tid(event, task), 4436 .tid = perf_event_tid(event, task),
3494 }; 4437 };
3495 int ret; 4438 int ret;
3496 4439
4440 perf_event_header__init_id(&read_event.header, &sample, event);
3497 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 4441 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3498 if (ret) 4442 if (ret)
3499 return; 4443 return;
3500 4444
3501 perf_output_put(&handle, read_event); 4445 perf_output_put(&handle, read_event);
3502 perf_output_read(&handle, event); 4446 perf_output_read(&handle, event);
4447 perf_event__output_id_sample(event, &handle, &sample);
3503 4448
3504 perf_output_end(&handle); 4449 perf_output_end(&handle);
3505} 4450}
@@ -3529,14 +4474,16 @@ static void perf_event_task_output(struct perf_event *event,
3529 struct perf_task_event *task_event) 4474 struct perf_task_event *task_event)
3530{ 4475{
3531 struct perf_output_handle handle; 4476 struct perf_output_handle handle;
4477 struct perf_sample_data sample;
3532 struct task_struct *task = task_event->task; 4478 struct task_struct *task = task_event->task;
3533 int size, ret; 4479 int ret, size = task_event->event_id.header.size;
3534 4480
3535 size = task_event->event_id.header.size; 4481 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
3536 ret = perf_output_begin(&handle, event, size, 0, 0);
3537 4482
4483 ret = perf_output_begin(&handle, event,
4484 task_event->event_id.header.size, 0, 0);
3538 if (ret) 4485 if (ret)
3539 return; 4486 goto out;
3540 4487
3541 task_event->event_id.pid = perf_event_pid(event, task); 4488 task_event->event_id.pid = perf_event_pid(event, task);
3542 task_event->event_id.ppid = perf_event_pid(event, current); 4489 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3546,7 +4493,11 @@ static void perf_event_task_output(struct perf_event *event,
3546 4493
3547 perf_output_put(&handle, task_event->event_id); 4494 perf_output_put(&handle, task_event->event_id);
3548 4495
4496 perf_event__output_id_sample(event, &handle, &sample);
4497
3549 perf_output_end(&handle); 4498 perf_output_end(&handle);
4499out:
4500 task_event->event_id.header.size = size;
3550} 4501}
3551 4502
3552static int perf_event_task_match(struct perf_event *event) 4503static int perf_event_task_match(struct perf_event *event)
@@ -3554,7 +4505,7 @@ static int perf_event_task_match(struct perf_event *event)
3554 if (event->state < PERF_EVENT_STATE_INACTIVE) 4505 if (event->state < PERF_EVENT_STATE_INACTIVE)
3555 return 0; 4506 return 0;
3556 4507
3557 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4508 if (!event_filter_match(event))
3558 return 0; 4509 return 0;
3559 4510
3560 if (event->attr.comm || event->attr.mmap || 4511 if (event->attr.comm || event->attr.mmap ||
@@ -3578,16 +4529,29 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3578static void perf_event_task_event(struct perf_task_event *task_event) 4529static void perf_event_task_event(struct perf_task_event *task_event)
3579{ 4530{
3580 struct perf_cpu_context *cpuctx; 4531 struct perf_cpu_context *cpuctx;
3581 struct perf_event_context *ctx = task_event->task_ctx; 4532 struct perf_event_context *ctx;
4533 struct pmu *pmu;
4534 int ctxn;
3582 4535
3583 rcu_read_lock(); 4536 rcu_read_lock();
3584 cpuctx = &get_cpu_var(perf_cpu_context); 4537 list_for_each_entry_rcu(pmu, &pmus, entry) {
3585 perf_event_task_ctx(&cpuctx->ctx, task_event); 4538 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3586 if (!ctx) 4539 if (cpuctx->active_pmu != pmu)
3587 ctx = rcu_dereference(current->perf_event_ctxp); 4540 goto next;
3588 if (ctx) 4541 perf_event_task_ctx(&cpuctx->ctx, task_event);
3589 perf_event_task_ctx(ctx, task_event); 4542
3590 put_cpu_var(perf_cpu_context); 4543 ctx = task_event->task_ctx;
4544 if (!ctx) {
4545 ctxn = pmu->task_ctx_nr;
4546 if (ctxn < 0)
4547 goto next;
4548 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4549 }
4550 if (ctx)
4551 perf_event_task_ctx(ctx, task_event);
4552next:
4553 put_cpu_ptr(pmu->pmu_cpu_context);
4554 }
3591 rcu_read_unlock(); 4555 rcu_read_unlock();
3592} 4556}
3593 4557
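The same per-pmu walk is repeated below for comm and mmap events; a skeleton of the pattern, assuming the file-local pmus list (the example_* name is illustrative). A negative task_ctx_nr means the pmu has no per-task context, and the active_pmu check de-duplicates pmus that share one cpu context.

static void example_for_each_context(void (*fn)(struct perf_event_context *))
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;
	int ctxn;

	rcu_read_lock();
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
		if (cpuctx->active_pmu == pmu) {
			fn(&cpuctx->ctx);

			ctxn = pmu->task_ctx_nr;
			if (ctxn >= 0) {
				ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
				if (ctx)
					fn(ctx);
			}
		}
		put_cpu_ptr(pmu->pmu_cpu_context);
	}
	rcu_read_unlock();
}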
@@ -3648,11 +4612,16 @@ static void perf_event_comm_output(struct perf_event *event,
3648 struct perf_comm_event *comm_event) 4612 struct perf_comm_event *comm_event)
3649{ 4613{
3650 struct perf_output_handle handle; 4614 struct perf_output_handle handle;
4615 struct perf_sample_data sample;
3651 int size = comm_event->event_id.header.size; 4616 int size = comm_event->event_id.header.size;
3652 int ret = perf_output_begin(&handle, event, size, 0, 0); 4617 int ret;
4618
4619 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4620 ret = perf_output_begin(&handle, event,
4621 comm_event->event_id.header.size, 0, 0);
3653 4622
3654 if (ret) 4623 if (ret)
3655 return; 4624 goto out;
3656 4625
3657 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 4626 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3658 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4627 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3660,7 +4629,12 @@ static void perf_event_comm_output(struct perf_event *event,
3660 perf_output_put(&handle, comm_event->event_id); 4629 perf_output_put(&handle, comm_event->event_id);
3661 perf_output_copy(&handle, comm_event->comm, 4630 perf_output_copy(&handle, comm_event->comm,
3662 comm_event->comm_size); 4631 comm_event->comm_size);
4632
4633 perf_event__output_id_sample(event, &handle, &sample);
4634
3663 perf_output_end(&handle); 4635 perf_output_end(&handle);
4636out:
4637 comm_event->event_id.header.size = size;
3664} 4638}
3665 4639
3666static int perf_event_comm_match(struct perf_event *event) 4640static int perf_event_comm_match(struct perf_event *event)
@@ -3668,7 +4642,7 @@ static int perf_event_comm_match(struct perf_event *event)
3668 if (event->state < PERF_EVENT_STATE_INACTIVE) 4642 if (event->state < PERF_EVENT_STATE_INACTIVE)
3669 return 0; 4643 return 0;
3670 4644
3671 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4645 if (!event_filter_match(event))
3672 return 0; 4646 return 0;
3673 4647
3674 if (event->attr.comm) 4648 if (event->attr.comm)
@@ -3692,8 +4666,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3692{ 4666{
3693 struct perf_cpu_context *cpuctx; 4667 struct perf_cpu_context *cpuctx;
3694 struct perf_event_context *ctx; 4668 struct perf_event_context *ctx;
3695 unsigned int size;
3696 char comm[TASK_COMM_LEN]; 4669 char comm[TASK_COMM_LEN];
4670 unsigned int size;
4671 struct pmu *pmu;
4672 int ctxn;
3697 4673
3698 memset(comm, 0, sizeof(comm)); 4674 memset(comm, 0, sizeof(comm));
3699 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 4675 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3703,23 +4679,39 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3703 comm_event->comm_size = size; 4679 comm_event->comm_size = size;
3704 4680
3705 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4681 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3706
3707 rcu_read_lock(); 4682 rcu_read_lock();
3708 cpuctx = &get_cpu_var(perf_cpu_context); 4683 list_for_each_entry_rcu(pmu, &pmus, entry) {
3709 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 4684 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3710 ctx = rcu_dereference(current->perf_event_ctxp); 4685 if (cpuctx->active_pmu != pmu)
3711 if (ctx) 4686 goto next;
3712 perf_event_comm_ctx(ctx, comm_event); 4687 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3713 put_cpu_var(perf_cpu_context); 4688
4689 ctxn = pmu->task_ctx_nr;
4690 if (ctxn < 0)
4691 goto next;
4692
4693 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4694 if (ctx)
4695 perf_event_comm_ctx(ctx, comm_event);
4696next:
4697 put_cpu_ptr(pmu->pmu_cpu_context);
4698 }
3714 rcu_read_unlock(); 4699 rcu_read_unlock();
3715} 4700}
3716 4701
3717void perf_event_comm(struct task_struct *task) 4702void perf_event_comm(struct task_struct *task)
3718{ 4703{
3719 struct perf_comm_event comm_event; 4704 struct perf_comm_event comm_event;
4705 struct perf_event_context *ctx;
4706 int ctxn;
4707
4708 for_each_task_context_nr(ctxn) {
4709 ctx = task->perf_event_ctxp[ctxn];
4710 if (!ctx)
4711 continue;
3720 4712
3721 if (task->perf_event_ctxp) 4713 perf_event_enable_on_exec(ctx);
3722 perf_event_enable_on_exec(task); 4714 }
3723 4715
3724 if (!atomic_read(&nr_comm_events)) 4716 if (!atomic_read(&nr_comm_events))
3725 return; 4717 return;
@@ -3767,11 +4759,15 @@ static void perf_event_mmap_output(struct perf_event *event,
3767 struct perf_mmap_event *mmap_event) 4759 struct perf_mmap_event *mmap_event)
3768{ 4760{
3769 struct perf_output_handle handle; 4761 struct perf_output_handle handle;
4762 struct perf_sample_data sample;
3770 int size = mmap_event->event_id.header.size; 4763 int size = mmap_event->event_id.header.size;
3771 int ret = perf_output_begin(&handle, event, size, 0, 0); 4764 int ret;
3772 4765
4766 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4767 ret = perf_output_begin(&handle, event,
4768 mmap_event->event_id.header.size, 0, 0);
3773 if (ret) 4769 if (ret)
3774 return; 4770 goto out;
3775 4771
3776 mmap_event->event_id.pid = perf_event_pid(event, current); 4772 mmap_event->event_id.pid = perf_event_pid(event, current);
3777 mmap_event->event_id.tid = perf_event_tid(event, current); 4773 mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -3779,7 +4775,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3779 perf_output_put(&handle, mmap_event->event_id); 4775 perf_output_put(&handle, mmap_event->event_id);
3780 perf_output_copy(&handle, mmap_event->file_name, 4776 perf_output_copy(&handle, mmap_event->file_name,
3781 mmap_event->file_size); 4777 mmap_event->file_size);
4778
4779 perf_event__output_id_sample(event, &handle, &sample);
4780
3782 perf_output_end(&handle); 4781 perf_output_end(&handle);
4782out:
4783 mmap_event->event_id.header.size = size;
3783} 4784}
3784 4785
3785static int perf_event_mmap_match(struct perf_event *event, 4786static int perf_event_mmap_match(struct perf_event *event,
@@ -3789,7 +4790,7 @@ static int perf_event_mmap_match(struct perf_event *event,
3789 if (event->state < PERF_EVENT_STATE_INACTIVE) 4790 if (event->state < PERF_EVENT_STATE_INACTIVE)
3790 return 0; 4791 return 0;
3791 4792
3792 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4793 if (!event_filter_match(event))
3793 return 0; 4794 return 0;
3794 4795
3795 if ((!executable && event->attr.mmap_data) || 4796 if ((!executable && event->attr.mmap_data) ||
@@ -3821,6 +4822,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3821 char tmp[16]; 4822 char tmp[16];
3822 char *buf = NULL; 4823 char *buf = NULL;
3823 const char *name; 4824 const char *name;
4825 struct pmu *pmu;
4826 int ctxn;
3824 4827
3825 memset(tmp, 0, sizeof(tmp)); 4828 memset(tmp, 0, sizeof(tmp));
3826 4829
@@ -3873,12 +4876,25 @@ got_name:
3873 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4876 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3874 4877
3875 rcu_read_lock(); 4878 rcu_read_lock();
3876 cpuctx = &get_cpu_var(perf_cpu_context); 4879 list_for_each_entry_rcu(pmu, &pmus, entry) {
3877 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); 4880 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3878 ctx = rcu_dereference(current->perf_event_ctxp); 4881 if (cpuctx->active_pmu != pmu)
3879 if (ctx) 4882 goto next;
3880 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); 4883 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
3881 put_cpu_var(perf_cpu_context); 4884 vma->vm_flags & VM_EXEC);
4885
4886 ctxn = pmu->task_ctx_nr;
4887 if (ctxn < 0)
4888 goto next;
4889
4890 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4891 if (ctx) {
4892 perf_event_mmap_ctx(ctx, mmap_event,
4893 vma->vm_flags & VM_EXEC);
4894 }
4895next:
4896 put_cpu_ptr(pmu->pmu_cpu_context);
4897 }
3882 rcu_read_unlock(); 4898 rcu_read_unlock();
3883 4899
3884 kfree(buf); 4900 kfree(buf);
@@ -3919,6 +4935,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
3919static void perf_log_throttle(struct perf_event *event, int enable) 4935static void perf_log_throttle(struct perf_event *event, int enable)
3920{ 4936{
3921 struct perf_output_handle handle; 4937 struct perf_output_handle handle;
4938 struct perf_sample_data sample;
3922 int ret; 4939 int ret;
3923 4940
3924 struct { 4941 struct {
@@ -3940,11 +4957,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
3940 if (enable) 4957 if (enable)
3941 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 4958 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3942 4959
3943 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); 4960 perf_event_header__init_id(&throttle_event.header, &sample, event);
4961
4962 ret = perf_output_begin(&handle, event,
4963 throttle_event.header.size, 1, 0);
3944 if (ret) 4964 if (ret)
3945 return; 4965 return;
3946 4966
3947 perf_output_put(&handle, throttle_event); 4967 perf_output_put(&handle, throttle_event);
4968 perf_event__output_id_sample(event, &handle, &sample);
3948 perf_output_end(&handle); 4969 perf_output_end(&handle);
3949} 4970}
3950 4971
@@ -3960,28 +4981,21 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3960 struct hw_perf_event *hwc = &event->hw; 4981 struct hw_perf_event *hwc = &event->hw;
3961 int ret = 0; 4982 int ret = 0;
3962 4983
3963 throttle = (throttle && event->pmu->unthrottle != NULL); 4984 /*
4985 * Non-sampling counters might still use the PMI to fold short
4986 * hardware counters, ignore those.
4987 */
4988 if (unlikely(!is_sampling_event(event)))
4989 return 0;
3964 4990
3965 if (!throttle) { 4991 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
3966 hwc->interrupts++; 4992 if (throttle) {
3967 } else { 4993 hwc->interrupts = MAX_INTERRUPTS;
3968 if (hwc->interrupts != MAX_INTERRUPTS) { 4994 perf_log_throttle(event, 0);
3969 hwc->interrupts++;
3970 if (HZ * hwc->interrupts >
3971 (u64)sysctl_perf_event_sample_rate) {
3972 hwc->interrupts = MAX_INTERRUPTS;
3973 perf_log_throttle(event, 0);
3974 ret = 1;
3975 }
3976 } else {
3977 /*
3978 * Keep re-disabling events even though on the previous
3979 * pass we disabled it - just in case we raced with a
3980 * sched-in and the event got enabled again:
3981 */
3982 ret = 1; 4995 ret = 1;
3983 } 4996 }
3984 } 4997 } else
4998 hwc->interrupts++;
3985 4999
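The rewritten check above replaces the old HZ * interrupts > sysctl_perf_event_sample_rate test with a per-tick budget; roughly as follows (illustrative sketch; the real path also emits a PERF_RECORD_UNTHROTTLE/THROTTLE pair via perf_log_throttle()):

static int example_should_throttle(struct hw_perf_event *hwc, int throttle)
{
	if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
		if (throttle) {
			hwc->interrupts = MAX_INTERRUPTS;	/* parked until the tick unthrottles it */
			return 1;
		}
		return 0;
	}
	hwc->interrupts++;
	return 0;
}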
3986 if (event->attr.freq) { 5000 if (event->attr.freq) {
3987 u64 now = perf_clock(); 5001 u64 now = perf_clock();
@@ -4004,8 +5018,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4004 event->pending_kill = POLL_HUP; 5018 event->pending_kill = POLL_HUP;
4005 if (nmi) { 5019 if (nmi) {
4006 event->pending_disable = 1; 5020 event->pending_disable = 1;
4007 perf_pending_queue(&event->pending, 5021 irq_work_queue(&event->pending);
4008 perf_pending_event);
4009 } else 5022 } else
4010 perf_event_disable(event); 5023 perf_event_disable(event);
4011 } 5024 }
@@ -4015,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4015 else 5028 else
4016 perf_event_output(event, nmi, data, regs); 5029 perf_event_output(event, nmi, data, regs);
4017 5030
5031 if (event->fasync && event->pending_kill) {
5032 if (nmi) {
5033 event->pending_wakeup = 1;
5034 irq_work_queue(&event->pending);
5035 } else
5036 perf_event_wakeup(event);
5037 }
5038
4018 return ret; 5039 return ret;
4019} 5040}
4020 5041
@@ -4029,6 +5050,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
4029 * Generic software event infrastructure 5050 * Generic software event infrastructure
4030 */ 5051 */
4031 5052
5053struct swevent_htable {
5054 struct swevent_hlist *swevent_hlist;
5055 struct mutex hlist_mutex;
5056 int hlist_refcount;
5057
5058 /* Recursion avoidance in each contexts */
5059 int recursion[PERF_NR_CONTEXTS];
5060};
5061
5062static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5063
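swevent_htable moves the software-event hash list and the recursion counters (one slot per context: task, softirq, irq, NMI) out of perf_cpu_context into their own per-cpu structure. The delivery path keeps the same shape as __perf_sw_event(); a sketch using the file-local helpers (illustrative only):

static void example_sw_event(u32 event_id, u64 nr, struct pt_regs *regs)
{
	struct perf_sample_data data;
	int rctx;

	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return;				/* this cpu is already inside the swevent path */

	perf_sample_data_init(&data, 0);
	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, 1, &data, regs);

	perf_swevent_put_recursion_context(rctx);
}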
4032/* 5064/*
4033 * We directly increment event->count and keep a second value in 5065 * We directly increment event->count and keep a second value in
4034 * event->hw.period_left to count intervals. This period event 5066 * event->hw.period_left to count intervals. This period event
@@ -4086,7 +5118,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4086 } 5118 }
4087} 5119}
4088 5120
4089static void perf_swevent_add(struct perf_event *event, u64 nr, 5121static void perf_swevent_event(struct perf_event *event, u64 nr,
4090 int nmi, struct perf_sample_data *data, 5122 int nmi, struct perf_sample_data *data,
4091 struct pt_regs *regs) 5123 struct pt_regs *regs)
4092{ 5124{
@@ -4097,7 +5129,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4097 if (!regs) 5129 if (!regs)
4098 return; 5130 return;
4099 5131
4100 if (!hwc->sample_period) 5132 if (!is_sampling_event(event))
4101 return; 5133 return;
4102 5134
4103 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4112,6 +5144,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4112static int perf_exclude_event(struct perf_event *event, 5144static int perf_exclude_event(struct perf_event *event,
4113 struct pt_regs *regs) 5145 struct pt_regs *regs)
4114{ 5146{
5147 if (event->hw.state & PERF_HES_STOPPED)
5148 return 1;
5149
4115 if (regs) { 5150 if (regs) {
4116 if (event->attr.exclude_user && user_mode(regs)) 5151 if (event->attr.exclude_user && user_mode(regs))
4117 return 1; 5152 return 1;
@@ -4158,11 +5193,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4158 5193
4159/* For the read side: events when they trigger */ 5194/* For the read side: events when they trigger */
4160static inline struct hlist_head * 5195static inline struct hlist_head *
4161find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 5196find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4162{ 5197{
4163 struct swevent_hlist *hlist; 5198 struct swevent_hlist *hlist;
4164 5199
4165 hlist = rcu_dereference(ctx->swevent_hlist); 5200 hlist = rcu_dereference(swhash->swevent_hlist);
4166 if (!hlist) 5201 if (!hlist)
4167 return NULL; 5202 return NULL;
4168 5203
@@ -4171,7 +5206,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4171 5206
4172/* For the event head insertion and removal in the hlist */ 5207/* For the event head insertion and removal in the hlist */
4173static inline struct hlist_head * 5208static inline struct hlist_head *
4174find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 5209find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4175{ 5210{
4176 struct swevent_hlist *hlist; 5211 struct swevent_hlist *hlist;
4177 u32 event_id = event->attr.config; 5212 u32 event_id = event->attr.config;
@@ -4182,7 +5217,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4182 * and release. Which makes the protected version suitable here. 5217 * and release. Which makes the protected version suitable here.
4183 * The context lock guarantees that. 5218 * The context lock guarantees that.
4184 */ 5219 */
4185 hlist = rcu_dereference_protected(ctx->swevent_hlist, 5220 hlist = rcu_dereference_protected(swhash->swevent_hlist,
4186 lockdep_is_held(&event->ctx->lock)); 5221 lockdep_is_held(&event->ctx->lock));
4187 if (!hlist) 5222 if (!hlist)
4188 return NULL; 5223 return NULL;
@@ -4195,23 +5230,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4195 struct perf_sample_data *data, 5230 struct perf_sample_data *data,
4196 struct pt_regs *regs) 5231 struct pt_regs *regs)
4197{ 5232{
4198 struct perf_cpu_context *cpuctx; 5233 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4199 struct perf_event *event; 5234 struct perf_event *event;
4200 struct hlist_node *node; 5235 struct hlist_node *node;
4201 struct hlist_head *head; 5236 struct hlist_head *head;
4202 5237
4203 cpuctx = &__get_cpu_var(perf_cpu_context);
4204
4205 rcu_read_lock(); 5238 rcu_read_lock();
4206 5239 head = find_swevent_head_rcu(swhash, type, event_id);
4207 head = find_swevent_head_rcu(cpuctx, type, event_id);
4208
4209 if (!head) 5240 if (!head)
4210 goto end; 5241 goto end;
4211 5242
4212 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4213 if (perf_swevent_match(event, type, event_id, data, regs)) 5244 if (perf_swevent_match(event, type, event_id, data, regs))
4214 perf_swevent_add(event, nr, nmi, data, regs); 5245 perf_swevent_event(event, nr, nmi, data, regs);
4215 } 5246 }
4216end: 5247end:
4217 rcu_read_unlock(); 5248 rcu_read_unlock();
@@ -4219,33 +5250,17 @@ end:
4219 5250
4220int perf_swevent_get_recursion_context(void) 5251int perf_swevent_get_recursion_context(void)
4221{ 5252{
4222 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 5253 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4223 int rctx;
4224
4225 if (in_nmi())
4226 rctx = 3;
4227 else if (in_irq())
4228 rctx = 2;
4229 else if (in_softirq())
4230 rctx = 1;
4231 else
4232 rctx = 0;
4233
4234 if (cpuctx->recursion[rctx])
4235 return -1;
4236
4237 cpuctx->recursion[rctx]++;
4238 barrier();
4239 5254
4240 return rctx; 5255 return get_recursion_context(swhash->recursion);
4241} 5256}
4242EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 5257EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4243 5258
4244void inline perf_swevent_put_recursion_context(int rctx) 5259inline void perf_swevent_put_recursion_context(int rctx)
4245{ 5260{
4246 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 5261 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4247 barrier(); 5262
4248 cpuctx->recursion[rctx]--; 5263 put_recursion_context(swhash->recursion, rctx);
4249} 5264}
4250 5265
4251void __perf_sw_event(u32 event_id, u64 nr, int nmi, 5266void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4271,20 +5286,20 @@ static void perf_swevent_read(struct perf_event *event)
4271{ 5286{
4272} 5287}
4273 5288
4274static int perf_swevent_enable(struct perf_event *event) 5289static int perf_swevent_add(struct perf_event *event, int flags)
4275{ 5290{
5291 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4276 struct hw_perf_event *hwc = &event->hw; 5292 struct hw_perf_event *hwc = &event->hw;
4277 struct perf_cpu_context *cpuctx;
4278 struct hlist_head *head; 5293 struct hlist_head *head;
4279 5294
4280 cpuctx = &__get_cpu_var(perf_cpu_context); 5295 if (is_sampling_event(event)) {
4281
4282 if (hwc->sample_period) {
4283 hwc->last_period = hwc->sample_period; 5296 hwc->last_period = hwc->sample_period;
4284 perf_swevent_set_period(event); 5297 perf_swevent_set_period(event);
4285 } 5298 }
4286 5299
4287 head = find_swevent_head(cpuctx, event); 5300 hwc->state = !(flags & PERF_EF_START);
5301
5302 head = find_swevent_head(swhash, event);
4288 if (WARN_ON_ONCE(!head)) 5303 if (WARN_ON_ONCE(!head))
4289 return -EINVAL; 5304 return -EINVAL;
4290 5305
@@ -4293,233 +5308,50 @@ static int perf_swevent_enable(struct perf_event *event)
4293 return 0; 5308 return 0;
4294} 5309}
4295 5310
4296static void perf_swevent_disable(struct perf_event *event) 5311static void perf_swevent_del(struct perf_event *event, int flags)
4297{ 5312{
4298 hlist_del_rcu(&event->hlist_entry); 5313 hlist_del_rcu(&event->hlist_entry);
4299} 5314}
4300 5315
4301static void perf_swevent_void(struct perf_event *event) 5316static void perf_swevent_start(struct perf_event *event, int flags)
4302{ 5317{
5318 event->hw.state = 0;
4303} 5319}
4304 5320
4305static int perf_swevent_int(struct perf_event *event) 5321static void perf_swevent_stop(struct perf_event *event, int flags)
4306{ 5322{
4307 return 0; 5323 event->hw.state = PERF_HES_STOPPED;
4308} 5324}
4309 5325
4310static const struct pmu perf_ops_generic = {
4311 .enable = perf_swevent_enable,
4312 .disable = perf_swevent_disable,
4313 .start = perf_swevent_int,
4314 .stop = perf_swevent_void,
4315 .read = perf_swevent_read,
4316 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4317};
4318
4319/*
4320 * hrtimer based swevent callback
4321 */
4322
4323static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4324{
4325 enum hrtimer_restart ret = HRTIMER_RESTART;
4326 struct perf_sample_data data;
4327 struct pt_regs *regs;
4328 struct perf_event *event;
4329 u64 period;
4330
4331 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4332 event->pmu->read(event);
4333
4334 perf_sample_data_init(&data, 0);
4335 data.period = event->hw.last_period;
4336 regs = get_irq_regs();
4337
4338 if (regs && !perf_exclude_event(event, regs)) {
4339 if (!(event->attr.exclude_idle && current->pid == 0))
4340 if (perf_event_overflow(event, 0, &data, regs))
4341 ret = HRTIMER_NORESTART;
4342 }
4343
4344 period = max_t(u64, 10000, event->hw.sample_period);
4345 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4346
4347 return ret;
4348}
4349
4350static void perf_swevent_start_hrtimer(struct perf_event *event)
4351{
4352 struct hw_perf_event *hwc = &event->hw;
4353
4354 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4355 hwc->hrtimer.function = perf_swevent_hrtimer;
4356 if (hwc->sample_period) {
4357 u64 period;
4358
4359 if (hwc->remaining) {
4360 if (hwc->remaining < 0)
4361 period = 10000;
4362 else
4363 period = hwc->remaining;
4364 hwc->remaining = 0;
4365 } else {
4366 period = max_t(u64, 10000, hwc->sample_period);
4367 }
4368 __hrtimer_start_range_ns(&hwc->hrtimer,
4369 ns_to_ktime(period), 0,
4370 HRTIMER_MODE_REL, 0);
4371 }
4372}
4373
4374static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4375{
4376 struct hw_perf_event *hwc = &event->hw;
4377
4378 if (hwc->sample_period) {
4379 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4380 hwc->remaining = ktime_to_ns(remaining);
4381
4382 hrtimer_cancel(&hwc->hrtimer);
4383 }
4384}
4385
4386/*
4387 * Software event: cpu wall time clock
4388 */
4389
4390static void cpu_clock_perf_event_update(struct perf_event *event)
4391{
4392 int cpu = raw_smp_processor_id();
4393 s64 prev;
4394 u64 now;
4395
4396 now = cpu_clock(cpu);
4397 prev = local64_xchg(&event->hw.prev_count, now);
4398 local64_add(now - prev, &event->count);
4399}
4400
4401static int cpu_clock_perf_event_enable(struct perf_event *event)
4402{
4403 struct hw_perf_event *hwc = &event->hw;
4404 int cpu = raw_smp_processor_id();
4405
4406 local64_set(&hwc->prev_count, cpu_clock(cpu));
4407 perf_swevent_start_hrtimer(event);
4408
4409 return 0;
4410}
4411
4412static void cpu_clock_perf_event_disable(struct perf_event *event)
4413{
4414 perf_swevent_cancel_hrtimer(event);
4415 cpu_clock_perf_event_update(event);
4416}
4417
4418static void cpu_clock_perf_event_read(struct perf_event *event)
4419{
4420 cpu_clock_perf_event_update(event);
4421}
4422
4423static const struct pmu perf_ops_cpu_clock = {
4424 .enable = cpu_clock_perf_event_enable,
4425 .disable = cpu_clock_perf_event_disable,
4426 .read = cpu_clock_perf_event_read,
4427};
4428
4429/*
4430 * Software event: task time clock
4431 */
4432
4433static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4434{
4435 u64 prev;
4436 s64 delta;
4437
4438 prev = local64_xchg(&event->hw.prev_count, now);
4439 delta = now - prev;
4440 local64_add(delta, &event->count);
4441}
4442
4443static int task_clock_perf_event_enable(struct perf_event *event)
4444{
4445 struct hw_perf_event *hwc = &event->hw;
4446 u64 now;
4447
4448 now = event->ctx->time;
4449
4450 local64_set(&hwc->prev_count, now);
4451
4452 perf_swevent_start_hrtimer(event);
4453
4454 return 0;
4455}
4456
4457static void task_clock_perf_event_disable(struct perf_event *event)
4458{
4459 perf_swevent_cancel_hrtimer(event);
4460 task_clock_perf_event_update(event, event->ctx->time);
4461
4462}
4463
4464static void task_clock_perf_event_read(struct perf_event *event)
4465{
4466 u64 time;
4467
4468 if (!in_nmi()) {
4469 update_context_time(event->ctx);
4470 time = event->ctx->time;
4471 } else {
4472 u64 now = perf_clock();
4473 u64 delta = now - event->ctx->timestamp;
4474 time = event->ctx->time + delta;
4475 }
4476
4477 task_clock_perf_event_update(event, time);
4478}
4479
4480static const struct pmu perf_ops_task_clock = {
4481 .enable = task_clock_perf_event_enable,
4482 .disable = task_clock_perf_event_disable,
4483 .read = task_clock_perf_event_read,
4484};
4485
4486/* Deref the hlist from the update side */ 5326/* Deref the hlist from the update side */
4487static inline struct swevent_hlist * 5327static inline struct swevent_hlist *
4488swevent_hlist_deref(struct perf_cpu_context *cpuctx) 5328swevent_hlist_deref(struct swevent_htable *swhash)
4489{
4490 return rcu_dereference_protected(cpuctx->swevent_hlist,
4491 lockdep_is_held(&cpuctx->hlist_mutex));
4492}
4493
4494static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4495{ 5329{
4496 struct swevent_hlist *hlist; 5330 return rcu_dereference_protected(swhash->swevent_hlist,
4497 5331 lockdep_is_held(&swhash->hlist_mutex));
4498 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4499 kfree(hlist);
4500} 5332}
4501 5333
4502static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 5334static void swevent_hlist_release(struct swevent_htable *swhash)
4503{ 5335{
4504 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 5336 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4505 5337
4506 if (!hlist) 5338 if (!hlist)
4507 return; 5339 return;
4508 5340
4509 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 5341 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4510 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 5342 kfree_rcu(hlist, rcu_head);
4511} 5343}
4512 5344
4513static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 5345static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4514{ 5346{
4515 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5347 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4516 5348
4517 mutex_lock(&cpuctx->hlist_mutex); 5349 mutex_lock(&swhash->hlist_mutex);
4518 5350
4519 if (!--cpuctx->hlist_refcount) 5351 if (!--swhash->hlist_refcount)
4520 swevent_hlist_release(cpuctx); 5352 swevent_hlist_release(swhash);
4521 5353
4522 mutex_unlock(&cpuctx->hlist_mutex); 5354 mutex_unlock(&swhash->hlist_mutex);
4523} 5355}
4524 5356
4525static void swevent_hlist_put(struct perf_event *event) 5357static void swevent_hlist_put(struct perf_event *event)
@@ -4537,12 +5369,12 @@ static void swevent_hlist_put(struct perf_event *event)
4537 5369
4538static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 5370static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4539{ 5371{
4540 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5372 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4541 int err = 0; 5373 int err = 0;
4542 5374
4543 mutex_lock(&cpuctx->hlist_mutex); 5375 mutex_lock(&swhash->hlist_mutex);
4544 5376
4545 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 5377 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4546 struct swevent_hlist *hlist; 5378 struct swevent_hlist *hlist;
4547 5379
4548 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 5380 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4550,11 +5382,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4550 err = -ENOMEM; 5382 err = -ENOMEM;
4551 goto exit; 5383 goto exit;
4552 } 5384 }
4553 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 5385 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4554 } 5386 }
4555 cpuctx->hlist_refcount++; 5387 swhash->hlist_refcount++;
4556 exit: 5388exit:
4557 mutex_unlock(&cpuctx->hlist_mutex); 5389 mutex_unlock(&swhash->hlist_mutex);
4558 5390
4559 return err; 5391 return err;
4560} 5392}
@@ -4578,7 +5410,7 @@ static int swevent_hlist_get(struct perf_event *event)
4578 put_online_cpus(); 5410 put_online_cpus();
4579 5411
4580 return 0; 5412 return 0;
4581 fail: 5413fail:
4582 for_each_possible_cpu(cpu) { 5414 for_each_possible_cpu(cpu) {
4583 if (cpu == failed_cpu) 5415 if (cpu == failed_cpu)
4584 break; 5416 break;
@@ -4589,17 +5421,64 @@ static int swevent_hlist_get(struct perf_event *event)
4589 return err; 5421 return err;
4590} 5422}
4591 5423
4592#ifdef CONFIG_EVENT_TRACING 5424struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5425
5426static void sw_perf_event_destroy(struct perf_event *event)
5427{
5428 u64 event_id = event->attr.config;
5429
5430 WARN_ON(event->parent);
5431
5432 jump_label_dec(&perf_swevent_enabled[event_id]);
5433 swevent_hlist_put(event);
5434}
5435
5436static int perf_swevent_init(struct perf_event *event)
5437{
5438 int event_id = event->attr.config;
5439
5440 if (event->attr.type != PERF_TYPE_SOFTWARE)
5441 return -ENOENT;
5442
5443 switch (event_id) {
5444 case PERF_COUNT_SW_CPU_CLOCK:
5445 case PERF_COUNT_SW_TASK_CLOCK:
5446 return -ENOENT;
5447
5448 default:
5449 break;
5450 }
5451
5452 if (event_id >= PERF_COUNT_SW_MAX)
5453 return -ENOENT;
5454
5455 if (!event->parent) {
5456 int err;
5457
5458 err = swevent_hlist_get(event);
5459 if (err)
5460 return err;
5461
5462 jump_label_inc(&perf_swevent_enabled[event_id]);
5463 event->destroy = sw_perf_event_destroy;
5464 }
4593 5465
4594static const struct pmu perf_ops_tracepoint = { 5466 return 0;
4595 .enable = perf_trace_enable, 5467}
4596 .disable = perf_trace_disable, 5468
4597 .start = perf_swevent_int, 5469static struct pmu perf_swevent = {
4598 .stop = perf_swevent_void, 5470 .task_ctx_nr = perf_sw_context,
5471
5472 .event_init = perf_swevent_init,
5473 .add = perf_swevent_add,
5474 .del = perf_swevent_del,
5475 .start = perf_swevent_start,
5476 .stop = perf_swevent_stop,
4599 .read = perf_swevent_read, 5477 .read = perf_swevent_read,
4600 .unthrottle = perf_swevent_void,
4601}; 5478};
4602 5479
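perf_swevent above is the first user of the reworked struct pmu interface: event_init claims events of a matching type (returning -ENOENT lets the core try the next pmu), and add/del/start/stop replace the old enable/disable/unthrottle hooks. A hedged skeleton of a minimal pmu under those assumptions (all example_* names are illustrative):

static int example_event_init(struct perf_event *event)
{
	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;			/* not ours; core moves on to the next pmu */
	return 0;
}

static int example_add(struct perf_event *event, int flags)
{
	event->hw.state = (flags & PERF_EF_START) ? 0 : PERF_HES_STOPPED;
	return 0;
}

static void example_del(struct perf_event *event, int flags)	{ }
static void example_start(struct perf_event *event, int flags)	{ event->hw.state = 0; }
static void example_stop(struct perf_event *event, int flags)	{ event->hw.state = PERF_HES_STOPPED; }
static void example_read(struct perf_event *event)		{ }

static struct pmu example_pmu = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= example_event_init,
	.add		= example_add,
	.del		= example_del,
	.start		= example_start,
	.stop		= example_stop,
	.read		= example_read,
};

/* registered once at init, like perf_tp_register() below:
 *	perf_pmu_register(&example_pmu, "example", -1);
 */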
5480#ifdef CONFIG_EVENT_TRACING
5481
4603static int perf_tp_filter_match(struct perf_event *event, 5482static int perf_tp_filter_match(struct perf_event *event,
4604 struct perf_sample_data *data) 5483 struct perf_sample_data *data)
4605{ 5484{
@@ -4614,6 +5493,8 @@ static int perf_tp_event_match(struct perf_event *event,
4614 struct perf_sample_data *data, 5493 struct perf_sample_data *data,
4615 struct pt_regs *regs) 5494 struct pt_regs *regs)
4616{ 5495{
5496 if (event->hw.state & PERF_HES_STOPPED)
5497 return 0;
4617 /* 5498 /*
4618 * All tracepoints are from kernel-space. 5499 * All tracepoints are from kernel-space.
4619 */ 5500 */
@@ -4643,7 +5524,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4643 5524
4644 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5525 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4645 if (perf_tp_event_match(event, &data, regs)) 5526 if (perf_tp_event_match(event, &data, regs))
4646 perf_swevent_add(event, count, 1, &data, regs); 5527 perf_swevent_event(event, count, 1, &data, regs);
4647 } 5528 }
4648 5529
4649 perf_swevent_put_recursion_context(rctx); 5530 perf_swevent_put_recursion_context(rctx);
@@ -4655,26 +5536,36 @@ static void tp_perf_event_destroy(struct perf_event *event)
4655 perf_trace_destroy(event); 5536 perf_trace_destroy(event);
4656} 5537}
4657 5538
4658static const struct pmu *tp_perf_event_init(struct perf_event *event) 5539static int perf_tp_event_init(struct perf_event *event)
4659{ 5540{
4660 int err; 5541 int err;
4661 5542
4662 /* 5543 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4663 * Raw tracepoint data is a severe data leak, only allow root to 5544 return -ENOENT;
4664 * have these.
4665 */
4666 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4667 perf_paranoid_tracepoint_raw() &&
4668 !capable(CAP_SYS_ADMIN))
4669 return ERR_PTR(-EPERM);
4670 5545
4671 err = perf_trace_init(event); 5546 err = perf_trace_init(event);
4672 if (err) 5547 if (err)
4673 return NULL; 5548 return err;
4674 5549
4675 event->destroy = tp_perf_event_destroy; 5550 event->destroy = tp_perf_event_destroy;
4676 5551
4677 return &perf_ops_tracepoint; 5552 return 0;
5553}
5554
5555static struct pmu perf_tracepoint = {
5556 .task_ctx_nr = perf_sw_context,
5557
5558 .event_init = perf_tp_event_init,
5559 .add = perf_trace_add,
5560 .del = perf_trace_del,
5561 .start = perf_swevent_start,
5562 .stop = perf_swevent_stop,
5563 .read = perf_swevent_read,
5564};
5565
5566static inline void perf_tp_register(void)
5567{
5568 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
4678} 5569}
4679 5570
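
Tracepoint events now go through the same struct pmu machinery: perf_tp_register() registers perf_tracepoint under the fixed PERF_TYPE_TRACEPOINT id and event_init simply defers to perf_trace_init(). From userspace the selection is unchanged: attr.config carries the tracepoint id exported under the tracing debugfs tree. A hedged sketch (the /sys/kernel/debug mount point and the sched_switch event are only the usual conventions):

	/* Sketch: open a tracepoint event by id; assumes debugfs at /sys/kernel/debug. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	static long long read_tp_id(const char *path)
	{
		long long id = -1;
		FILE *f = fopen(path, "r");

		if (f) {
			if (fscanf(f, "%lld", &id) != 1)
				id = -1;
			fclose(f);
		}
		return id;
	}

	int main(void)
	{
		struct perf_event_attr attr;
		long long id = read_tp_id(
			"/sys/kernel/debug/tracing/events/sched/sched_switch/id");
		int fd;

		if (id < 0)
			return 1;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_TRACEPOINT;	/* routed to the perf_tracepoint pmu */
		attr.config = id;			/* which tracepoint */

		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		return fd < 0;
	}
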
4680static int perf_event_set_filter(struct perf_event *event, void __user *arg) 5571static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4702,9 +5593,8 @@ static void perf_event_free_filter(struct perf_event *event)
4702 5593
4703#else 5594#else
4704 5595
4705static const struct pmu *tp_perf_event_init(struct perf_event *event) 5596static inline void perf_tp_register(void)
4706{ 5597{
4707 return NULL;
4708} 5598}
4709 5599
4710static int perf_event_set_filter(struct perf_event *event, void __user *arg) 5600static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4719,105 +5609,535 @@ static void perf_event_free_filter(struct perf_event *event)
4719#endif /* CONFIG_EVENT_TRACING */ 5609#endif /* CONFIG_EVENT_TRACING */
4720 5610
4721#ifdef CONFIG_HAVE_HW_BREAKPOINT 5611#ifdef CONFIG_HAVE_HW_BREAKPOINT
4722static void bp_perf_event_destroy(struct perf_event *event) 5612void perf_bp_event(struct perf_event *bp, void *data)
4723{ 5613{
4724 release_bp_slot(event); 5614 struct perf_sample_data sample;
5615 struct pt_regs *regs = data;
5616
5617 perf_sample_data_init(&sample, bp->attr.bp_addr);
5618
5619 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5620 perf_swevent_event(bp, 1, 1, &sample, regs);
4725} 5621}
5622#endif
5623
5624/*
5625 * hrtimer based swevent callback
5626 */
4726 5627
4727static const struct pmu *bp_perf_event_init(struct perf_event *bp) 5628static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4728{ 5629{
4729 int err; 5630 enum hrtimer_restart ret = HRTIMER_RESTART;
5631 struct perf_sample_data data;
5632 struct pt_regs *regs;
5633 struct perf_event *event;
5634 u64 period;
4730 5635
4731 err = register_perf_hw_breakpoint(bp); 5636 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4732 if (err) 5637
4733 return ERR_PTR(err); 5638 if (event->state != PERF_EVENT_STATE_ACTIVE)
5639 return HRTIMER_NORESTART;
5640
5641 event->pmu->read(event);
5642
5643 perf_sample_data_init(&data, 0);
5644 data.period = event->hw.last_period;
5645 regs = get_irq_regs();
5646
5647 if (regs && !perf_exclude_event(event, regs)) {
5648 if (!(event->attr.exclude_idle && current->pid == 0))
5649 if (perf_event_overflow(event, 0, &data, regs))
5650 ret = HRTIMER_NORESTART;
5651 }
4734 5652
4735 bp->destroy = bp_perf_event_destroy; 5653 period = max_t(u64, 10000, event->hw.sample_period);
5654 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4736 5655
4737 return &perf_ops_bp; 5656 return ret;
4738} 5657}
4739 5658
4740void perf_bp_event(struct perf_event *bp, void *data) 5659static void perf_swevent_start_hrtimer(struct perf_event *event)
4741{ 5660{
4742 struct perf_sample_data sample; 5661 struct hw_perf_event *hwc = &event->hw;
4743 struct pt_regs *regs = data; 5662 s64 period;
4744 5663
4745 perf_sample_data_init(&sample, bp->attr.bp_addr); 5664 if (!is_sampling_event(event))
5665 return;
4746 5666
4747 if (!perf_exclude_event(bp, regs)) 5667 period = local64_read(&hwc->period_left);
4748 perf_swevent_add(bp, 1, 1, &sample, regs); 5668 if (period) {
5669 if (period < 0)
5670 period = 10000;
5671
5672 local64_set(&hwc->period_left, 0);
5673 } else {
5674 period = max_t(u64, 10000, hwc->sample_period);
5675 }
5676 __hrtimer_start_range_ns(&hwc->hrtimer,
5677 ns_to_ktime(period), 0,
5678 HRTIMER_MODE_REL_PINNED, 0);
4749} 5679}
4750#else 5680
4751static const struct pmu *bp_perf_event_init(struct perf_event *bp) 5681static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5682{
5683 struct hw_perf_event *hwc = &event->hw;
5684
5685 if (is_sampling_event(event)) {
5686 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
5687 local64_set(&hwc->period_left, ktime_to_ns(remaining));
5688
5689 hrtimer_cancel(&hwc->hrtimer);
5690 }
5691}
5692
5693static void perf_swevent_init_hrtimer(struct perf_event *event)
5694{
5695 struct hw_perf_event *hwc = &event->hw;
5696
5697 if (!is_sampling_event(event))
5698 return;
5699
5700 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5701 hwc->hrtimer.function = perf_swevent_hrtimer;
5702
5703 /*
5704 * Since hrtimers have a fixed rate, we can do a static freq->period
5705 * mapping and avoid the whole period adjust feedback stuff.
5706 */
5707 if (event->attr.freq) {
5708 long freq = event->attr.sample_freq;
5709
5710 event->attr.sample_period = NSEC_PER_SEC / freq;
5711 hwc->sample_period = event->attr.sample_period;
5712 local64_set(&hwc->period_left, hwc->sample_period);
5713 event->attr.freq = 0;
5714 }
5715}
5716
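
Because the hrtimer fires at whatever period it is programmed with, perf_swevent_init_hrtimer() can convert a requested sample_freq into a fixed period once, up front, instead of using the adaptive period-adjustment feedback that hardware counters need. A worked example of that arithmetic, plus the 10 us floor applied when the timer is armed (illustrative numbers only):

	/* Worked example of the static freq -> period mapping (illustrative only). */
	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000ULL

	int main(void)
	{
		uint64_t freq = 4000;				/* requested sample_freq, Hz */
		uint64_t period = NSEC_PER_SEC / freq;		/* 250000 ns between samples */

		/* perf_swevent_start_hrtimer() floors freshly computed periods at 10 us */
		if (period < 10000)
			period = 10000;

		printf("freq=%lluHz -> period=%lluns\n",
		       (unsigned long long)freq, (unsigned long long)period);
		return 0;
	}
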
5717/*
5718 * Software event: cpu wall time clock
5719 */
5720
5721static void cpu_clock_event_update(struct perf_event *event)
5722{
5723 s64 prev;
5724 u64 now;
5725
5726 now = local_clock();
5727 prev = local64_xchg(&event->hw.prev_count, now);
5728 local64_add(now - prev, &event->count);
5729}
5730
5731static void cpu_clock_event_start(struct perf_event *event, int flags)
5732{
5733 local64_set(&event->hw.prev_count, local_clock());
5734 perf_swevent_start_hrtimer(event);
5735}
5736
5737static void cpu_clock_event_stop(struct perf_event *event, int flags)
5738{
5739 perf_swevent_cancel_hrtimer(event);
5740 cpu_clock_event_update(event);
5741}
5742
5743static int cpu_clock_event_add(struct perf_event *event, int flags)
5744{
5745 if (flags & PERF_EF_START)
5746 cpu_clock_event_start(event, flags);
5747
5748 return 0;
5749}
5750
5751static void cpu_clock_event_del(struct perf_event *event, int flags)
5752{
5753 cpu_clock_event_stop(event, flags);
5754}
5755
5756static void cpu_clock_event_read(struct perf_event *event)
5757{
5758 cpu_clock_event_update(event);
5759}
5760
5761static int cpu_clock_event_init(struct perf_event *event)
5762{
5763 if (event->attr.type != PERF_TYPE_SOFTWARE)
5764 return -ENOENT;
5765
5766 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5767 return -ENOENT;
5768
5769 perf_swevent_init_hrtimer(event);
5770
5771 return 0;
5772}
5773
5774static struct pmu perf_cpu_clock = {
5775 .task_ctx_nr = perf_sw_context,
5776
5777 .event_init = cpu_clock_event_init,
5778 .add = cpu_clock_event_add,
5779 .del = cpu_clock_event_del,
5780 .start = cpu_clock_event_start,
5781 .stop = cpu_clock_event_stop,
5782 .read = cpu_clock_event_read,
5783};
5784
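
cpu_clock_event_update() relies on an exchange-and-accumulate idiom: swap the current timestamp into prev_count and add only the difference to the event count, so each elapsed slice is accounted exactly once no matter how start/stop/read interleave. A userspace analog of the same pattern, with local64_t approximated by C11 atomics (illustrative, not kernel code):

	/* Userspace analog of the prev_count/xchg delta accumulation (illustrative). */
	#include <stdio.h>
	#include <stdint.h>
	#include <time.h>
	#include <stdatomic.h>

	static _Atomic uint64_t prev_count;
	static _Atomic uint64_t count;

	static uint64_t now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	}

	static void clock_event_update(void)
	{
		uint64_t now = now_ns();
		uint64_t prev = atomic_exchange(&prev_count, now);

		atomic_fetch_add(&count, now - prev);	/* only the new delta is added */
	}

	int main(void)
	{
		atomic_store(&prev_count, now_ns());	/* "start": snapshot the clock */
		/* ... time passes ... */
		clock_event_update();			/* "read" or "stop" */
		printf("elapsed: %llu ns\n", (unsigned long long)atomic_load(&count));
		return 0;
	}
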
5785/*
5786 * Software event: task time clock
5787 */
5788
5789static void task_clock_event_update(struct perf_event *event, u64 now)
5790{
5791 u64 prev;
5792 s64 delta;
5793
5794 prev = local64_xchg(&event->hw.prev_count, now);
5795 delta = now - prev;
5796 local64_add(delta, &event->count);
5797}
5798
5799static void task_clock_event_start(struct perf_event *event, int flags)
5800{
5801 local64_set(&event->hw.prev_count, event->ctx->time);
5802 perf_swevent_start_hrtimer(event);
5803}
5804
5805static void task_clock_event_stop(struct perf_event *event, int flags)
5806{
5807 perf_swevent_cancel_hrtimer(event);
5808 task_clock_event_update(event, event->ctx->time);
5809}
5810
5811static int task_clock_event_add(struct perf_event *event, int flags)
5812{
5813 if (flags & PERF_EF_START)
5814 task_clock_event_start(event, flags);
5815
5816 return 0;
5817}
5818
5819static void task_clock_event_del(struct perf_event *event, int flags)
5820{
5821 task_clock_event_stop(event, PERF_EF_UPDATE);
5822}
5823
5824static void task_clock_event_read(struct perf_event *event)
5825{
5826 u64 now = perf_clock();
5827 u64 delta = now - event->ctx->timestamp;
5828 u64 time = event->ctx->time + delta;
5829
5830 task_clock_event_update(event, time);
5831}
5832
5833static int task_clock_event_init(struct perf_event *event)
5834{
5835 if (event->attr.type != PERF_TYPE_SOFTWARE)
5836 return -ENOENT;
5837
5838 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5839 return -ENOENT;
5840
5841 perf_swevent_init_hrtimer(event);
5842
5843 return 0;
5844}
5845
5846static struct pmu perf_task_clock = {
5847 .task_ctx_nr = perf_sw_context,
5848
5849 .event_init = task_clock_event_init,
5850 .add = task_clock_event_add,
5851 .del = task_clock_event_del,
5852 .start = task_clock_event_start,
5853 .stop = task_clock_event_stop,
5854 .read = task_clock_event_read,
5855};
5856
5857static void perf_pmu_nop_void(struct pmu *pmu)
5858{
5859}
5860
5861static int perf_pmu_nop_int(struct pmu *pmu)
5862{
5863 return 0;
5864}
5865
5866static void perf_pmu_start_txn(struct pmu *pmu)
5867{
5868 perf_pmu_disable(pmu);
5869}
5870
5871static int perf_pmu_commit_txn(struct pmu *pmu)
5872{
5873 perf_pmu_enable(pmu);
5874 return 0;
5875}
5876
5877static void perf_pmu_cancel_txn(struct pmu *pmu)
5878{
5879 perf_pmu_enable(pmu);
5880}
5881
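
These transaction stubs give every pmu a uniform start_txn/commit_txn/cancel_txn interface: for drivers that expose pmu_enable/pmu_disable, a group add is simply bracketed by one disable and one enable so the hardware reprogramming can be batched; drivers without them get no-ops. A userspace analog of how a group add uses that bracket, with invented names and a pretend two-counter limit:

	/* Userspace analog of the start_txn/add/commit_txn bracket (illustrative). */
	#include <stdio.h>

	struct fake_pmu {
		int disabled;
		int programmed;
	};

	static void pmu_disable(struct fake_pmu *p) { p->disabled = 1; }
	static void pmu_enable(struct fake_pmu *p)  { p->disabled = 0; }

	static void start_txn(struct fake_pmu *p)  { pmu_disable(p); }	/* like perf_pmu_start_txn */
	static int  commit_txn(struct fake_pmu *p) { pmu_enable(p); return 0; }
	static void cancel_txn(struct fake_pmu *p) { pmu_enable(p); }

	static int add_event(struct fake_pmu *p, int ev)
	{
		(void)ev;
		if (p->programmed >= 2)		/* pretend only two counters exist */
			return -1;
		p->programmed++;
		return 0;
	}

	static int group_sched_in(struct fake_pmu *p, const int *group, int n)
	{
		int i;

		start_txn(p);			/* batch the hardware accesses */
		for (i = 0; i < n; i++) {
			if (add_event(p, group[i])) {
				while (i--)
					p->programmed--;	/* roll back partial adds */
				cancel_txn(p);
				return -1;
			}
		}
		return commit_txn(p);		/* one enable flushes the whole group */
	}

	int main(void)
	{
		struct fake_pmu pmu = { 0, 0 };
		int group[] = { 1, 2, 3 };

		printf("group of 3: %s\n",
		       group_sched_in(&pmu, group, 3) ? "rejected" : "scheduled");
		return 0;
	}
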
5882/*
5883 * Ensures all contexts with the same task_ctx_nr have the same
5884 * pmu_cpu_context too.
5885 */
5886static void *find_pmu_context(int ctxn)
4752{ 5887{
5888 struct pmu *pmu;
5889
5890 if (ctxn < 0)
5891 return NULL;
5892
5893 list_for_each_entry(pmu, &pmus, entry) {
5894 if (pmu->task_ctx_nr == ctxn)
5895 return pmu->pmu_cpu_context;
5896 }
5897
4753 return NULL; 5898 return NULL;
4754} 5899}
4755 5900
4756void perf_bp_event(struct perf_event *bp, void *regs) 5901static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
4757{ 5902{
5903 int cpu;
5904
5905 for_each_possible_cpu(cpu) {
5906 struct perf_cpu_context *cpuctx;
5907
5908 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5909
5910 if (cpuctx->active_pmu == old_pmu)
5911 cpuctx->active_pmu = pmu;
5912 }
4758} 5913}
4759#endif
4760 5914
4761atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5915static void free_pmu_context(struct pmu *pmu)
5916{
5917 struct pmu *i;
4762 5918
4763static void sw_perf_event_destroy(struct perf_event *event) 5919 mutex_lock(&pmus_lock);
5920 /*
5921 * Like a real lame refcount.
5922 */
5923 list_for_each_entry(i, &pmus, entry) {
5924 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5925 update_pmu_context(i, pmu);
5926 goto out;
5927 }
5928 }
5929
5930 free_percpu(pmu->pmu_cpu_context);
5931out:
5932 mutex_unlock(&pmus_lock);
5933}
5934static struct idr pmu_idr;
5935
5936static ssize_t
5937type_show(struct device *dev, struct device_attribute *attr, char *page)
4764{ 5938{
4765 u64 event_id = event->attr.config; 5939 struct pmu *pmu = dev_get_drvdata(dev);
4766 5940
4767 WARN_ON(event->parent); 5941 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5942}
4768 5943
4769 atomic_dec(&perf_swevent_enabled[event_id]); 5944static struct device_attribute pmu_dev_attrs[] = {
4770 swevent_hlist_put(event); 5945 __ATTR_RO(type),
5946 __ATTR_NULL,
5947};
5948
5949static int pmu_bus_running;
5950static struct bus_type pmu_bus = {
5951 .name = "event_source",
5952 .dev_attrs = pmu_dev_attrs,
5953};
5954
5955static void pmu_dev_release(struct device *dev)
5956{
5957 kfree(dev);
4771} 5958}
4772 5959
4773static const struct pmu *sw_perf_event_init(struct perf_event *event) 5960static int pmu_dev_alloc(struct pmu *pmu)
4774{ 5961{
4775 const struct pmu *pmu = NULL; 5962 int ret = -ENOMEM;
4776 u64 event_id = event->attr.config; 5963
5964 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5965 if (!pmu->dev)
5966 goto out;
5967
5968 device_initialize(pmu->dev);
5969 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5970 if (ret)
5971 goto free_dev;
5972
5973 dev_set_drvdata(pmu->dev, pmu);
5974 pmu->dev->bus = &pmu_bus;
5975 pmu->dev->release = pmu_dev_release;
5976 ret = device_add(pmu->dev);
5977 if (ret)
5978 goto free_dev;
5979
5980out:
5981 return ret;
5982
5983free_dev:
5984 put_device(pmu->dev);
5985 goto out;
5986}
5987
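
pmu_dev_alloc() gives every named pmu a device on the new "event_source" bus, whose type attribute (type_show() above) lets tools discover dynamically assigned pmu types at run time instead of hard-coding PERF_TYPE_* values. A sketch of the userspace side, assuming the usual /sys/bus/event_source/devices/<pmu>/type layout produced by this bus/attribute pair:

	/* Sketch: look up a PMU's type from sysfs (path assumed conventional). */
	#include <stdio.h>
	#include <string.h>
	#include <linux/perf_event.h>

	static int read_pmu_type(const char *pmu_name)
	{
		char path[256];
		int type = -1;
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/bus/event_source/devices/%s/type", pmu_name);
		f = fopen(path, "r");
		if (f) {
			if (fscanf(f, "%d", &type) != 1)
				type = -1;
			fclose(f);
		}
		return type;
	}

	int main(void)
	{
		struct perf_event_attr attr;
		int type = read_pmu_type("software");	/* registered in perf_event_init() below */

		if (type < 0)
			return 1;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = type;		/* matches pmu->type shown by type_show() */
		/* ... set attr.config and call perf_event_open() ... */
		printf("software pmu type = %d\n", type);
		return 0;
	}
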
5988static struct lock_class_key cpuctx_mutex;
5989
5990int perf_pmu_register(struct pmu *pmu, char *name, int type)
5991{
5992 int cpu, ret;
5993
5994 mutex_lock(&pmus_lock);
5995 ret = -ENOMEM;
5996 pmu->pmu_disable_count = alloc_percpu(int);
5997 if (!pmu->pmu_disable_count)
5998 goto unlock;
5999
6000 pmu->type = -1;
6001 if (!name)
6002 goto skip_type;
6003 pmu->name = name;
6004
6005 if (type < 0) {
6006 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
6007 if (!err)
6008 goto free_pdc;
6009
6010 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
6011 if (err) {
6012 ret = err;
6013 goto free_pdc;
6014 }
6015 }
6016 pmu->type = type;
6017
6018 if (pmu_bus_running) {
6019 ret = pmu_dev_alloc(pmu);
6020 if (ret)
6021 goto free_idr;
6022 }
6023
6024skip_type:
6025 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
6026 if (pmu->pmu_cpu_context)
6027 goto got_cpu_context;
6028
6029 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
6030 if (!pmu->pmu_cpu_context)
6031 goto free_dev;
6032
6033 for_each_possible_cpu(cpu) {
6034 struct perf_cpu_context *cpuctx;
6035
6036 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6037 __perf_event_init_context(&cpuctx->ctx);
6038 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
6039 cpuctx->ctx.type = cpu_context;
6040 cpuctx->ctx.pmu = pmu;
6041 cpuctx->jiffies_interval = 1;
6042 INIT_LIST_HEAD(&cpuctx->rotation_list);
6043 cpuctx->active_pmu = pmu;
6044 }
6045
6046got_cpu_context:
6047 if (!pmu->start_txn) {
6048 if (pmu->pmu_enable) {
6049 /*
6050 * If we have pmu_enable/pmu_disable calls, install
6051 * transaction stubs that use that to try and batch
6052 * hardware accesses.
6053 */
6054 pmu->start_txn = perf_pmu_start_txn;
6055 pmu->commit_txn = perf_pmu_commit_txn;
6056 pmu->cancel_txn = perf_pmu_cancel_txn;
6057 } else {
6058 pmu->start_txn = perf_pmu_nop_void;
6059 pmu->commit_txn = perf_pmu_nop_int;
6060 pmu->cancel_txn = perf_pmu_nop_void;
6061 }
6062 }
6063
6064 if (!pmu->pmu_enable) {
6065 pmu->pmu_enable = perf_pmu_nop_void;
6066 pmu->pmu_disable = perf_pmu_nop_void;
6067 }
6068
6069 list_add_rcu(&pmu->entry, &pmus);
6070 ret = 0;
6071unlock:
6072 mutex_unlock(&pmus_lock);
6073
6074 return ret;
6075
6076free_dev:
6077 device_del(pmu->dev);
6078 put_device(pmu->dev);
6079
6080free_idr:
6081 if (pmu->type >= PERF_TYPE_MAX)
6082 idr_remove(&pmu_idr, pmu->type);
6083
6084free_pdc:
6085 free_percpu(pmu->pmu_disable_count);
6086 goto unlock;
6087}
6088
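
perf_pmu_register() is now the single registration point for event providers: a non-NULL name later gets a node on the event_source bus, type == -1 requests a dynamic id from pmu_idr, and the built-in pmus pass their fixed PERF_TYPE_* values. A hypothetical stub, shown only to illustrate the callbacks and signatures a minimal pmu needs (all demo_* names are invented, it counts nothing, and it would have to be built in, since this patch does not export perf_pmu_register to modules):

	/* Hypothetical built-in stub pmu, illustrating the registration API only. */
	#include <linux/init.h>
	#include <linux/errno.h>
	#include <linux/perf_event.h>

	static struct pmu demo_pmu;

	static int demo_event_init(struct perf_event *event)
	{
		if (event->attr.type != demo_pmu.type)
			return -ENOENT;		/* not ours: the core keeps searching */
		return 0;
	}

	static int  demo_add(struct perf_event *event, int flags)   { return 0; }
	static void demo_del(struct perf_event *event, int flags)   { }
	static void demo_start(struct perf_event *event, int flags) { }
	static void demo_stop(struct perf_event *event, int flags)  { }
	static void demo_read(struct perf_event *event)              { }

	static struct pmu demo_pmu = {
		.task_ctx_nr	= perf_sw_context,
		.event_init	= demo_event_init,
		.add		= demo_add,
		.del		= demo_del,
		.start		= demo_start,
		.stop		= demo_stop,
		.read		= demo_read,
	};

	static int __init demo_pmu_init(void)
	{
		/* name => sysfs node on the event_source bus, -1 => dynamic type id */
		return perf_pmu_register(&demo_pmu, "demo", -1);
	}
	device_initcall(demo_pmu_init);
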
6089void perf_pmu_unregister(struct pmu *pmu)
6090{
6091 mutex_lock(&pmus_lock);
6092 list_del_rcu(&pmu->entry);
6093 mutex_unlock(&pmus_lock);
4777 6094
4778 /* 6095 /*
4779 * Software events (currently) can't in general distinguish 6096 * We dereference the pmu list under both SRCU and regular RCU, so
4780 * between user, kernel and hypervisor events. 6097 * synchronize against both of those.
4781 * However, context switches and cpu migrations are considered
4782 * to be kernel events, and page faults are never hypervisor
4783 * events.
4784 */ 6098 */
4785 switch (event_id) { 6099 synchronize_srcu(&pmus_srcu);
4786 case PERF_COUNT_SW_CPU_CLOCK: 6100 synchronize_rcu();
4787 pmu = &perf_ops_cpu_clock;
4788 6101
4789 break; 6102 free_percpu(pmu->pmu_disable_count);
4790 case PERF_COUNT_SW_TASK_CLOCK: 6103 if (pmu->type >= PERF_TYPE_MAX)
4791 /* 6104 idr_remove(&pmu_idr, pmu->type);
4792 * If the user instantiates this as a per-cpu event, 6105 device_del(pmu->dev);
4793 * use the cpu_clock event instead. 6106 put_device(pmu->dev);
4794 */ 6107 free_pmu_context(pmu);
4795 if (event->ctx->task) 6108}
4796 pmu = &perf_ops_task_clock;
4797 else
4798 pmu = &perf_ops_cpu_clock;
4799 6109
4800 break; 6110struct pmu *perf_init_event(struct perf_event *event)
4801 case PERF_COUNT_SW_PAGE_FAULTS: 6111{
4802 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 6112 struct pmu *pmu = NULL;
4803 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 6113 int idx;
4804 case PERF_COUNT_SW_CONTEXT_SWITCHES: 6114 int ret;
4805 case PERF_COUNT_SW_CPU_MIGRATIONS: 6115
4806 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 6116 idx = srcu_read_lock(&pmus_srcu);
4807 case PERF_COUNT_SW_EMULATION_FAULTS: 6117
4808 if (!event->parent) { 6118 rcu_read_lock();
4809 int err; 6119 pmu = idr_find(&pmu_idr, event->attr.type);
4810 6120 rcu_read_unlock();
4811 err = swevent_hlist_get(event); 6121 if (pmu) {
4812 if (err) 6122 ret = pmu->event_init(event);
4813 return ERR_PTR(err); 6123 if (ret)
6124 pmu = ERR_PTR(ret);
6125 goto unlock;
6126 }
4814 6127
4815 atomic_inc(&perf_swevent_enabled[event_id]); 6128 list_for_each_entry_rcu(pmu, &pmus, entry) {
4816 event->destroy = sw_perf_event_destroy; 6129 ret = pmu->event_init(event);
6130 if (!ret)
6131 goto unlock;
6132
6133 if (ret != -ENOENT) {
6134 pmu = ERR_PTR(ret);
6135 goto unlock;
4817 } 6136 }
4818 pmu = &perf_ops_generic;
4819 break;
4820 } 6137 }
6138 pmu = ERR_PTR(-ENOENT);
6139unlock:
6140 srcu_read_unlock(&pmus_srcu, idx);
4821 6141
4822 return pmu; 6142 return pmu;
4823} 6143}
@@ -4826,20 +6146,23 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4826 * Allocate and initialize an event structure 6146
4827 */ 6147 */
4828static struct perf_event * 6148static struct perf_event *
4829perf_event_alloc(struct perf_event_attr *attr, 6149perf_event_alloc(struct perf_event_attr *attr, int cpu,
4830 int cpu, 6150 struct task_struct *task,
4831 struct perf_event_context *ctx, 6151 struct perf_event *group_leader,
4832 struct perf_event *group_leader, 6152 struct perf_event *parent_event,
4833 struct perf_event *parent_event, 6153 perf_overflow_handler_t overflow_handler)
4834 perf_overflow_handler_t overflow_handler, 6154{
4835 gfp_t gfpflags) 6155 struct pmu *pmu;
4836{
4837 const struct pmu *pmu;
4838 struct perf_event *event; 6156 struct perf_event *event;
4839 struct hw_perf_event *hwc; 6157 struct hw_perf_event *hwc;
4840 long err; 6158 long err;
4841 6159
4842 event = kzalloc(sizeof(*event), gfpflags); 6160 if ((unsigned)cpu >= nr_cpu_ids) {
6161 if (!task || cpu != -1)
6162 return ERR_PTR(-EINVAL);
6163 }
6164
6165 event = kzalloc(sizeof(*event), GFP_KERNEL);
4843 if (!event) 6166 if (!event)
4844 return ERR_PTR(-ENOMEM); 6167 return ERR_PTR(-ENOMEM);
4845 6168
@@ -4857,6 +6180,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4857 INIT_LIST_HEAD(&event->event_entry); 6180 INIT_LIST_HEAD(&event->event_entry);
4858 INIT_LIST_HEAD(&event->sibling_list); 6181 INIT_LIST_HEAD(&event->sibling_list);
4859 init_waitqueue_head(&event->waitq); 6182 init_waitqueue_head(&event->waitq);
6183 init_irq_work(&event->pending, perf_pending_event);
4860 6184
4861 mutex_init(&event->mmap_mutex); 6185 mutex_init(&event->mmap_mutex);
4862 6186
@@ -4864,7 +6188,6 @@ perf_event_alloc(struct perf_event_attr *attr,
4864 event->attr = *attr; 6188 event->attr = *attr;
4865 event->group_leader = group_leader; 6189 event->group_leader = group_leader;
4866 event->pmu = NULL; 6190 event->pmu = NULL;
4867 event->ctx = ctx;
4868 event->oncpu = -1; 6191 event->oncpu = -1;
4869 6192
4870 event->parent = parent_event; 6193 event->parent = parent_event;
@@ -4874,9 +6197,20 @@ perf_event_alloc(struct perf_event_attr *attr,
4874 6197
4875 event->state = PERF_EVENT_STATE_INACTIVE; 6198 event->state = PERF_EVENT_STATE_INACTIVE;
4876 6199
6200 if (task) {
6201 event->attach_state = PERF_ATTACH_TASK;
6202#ifdef CONFIG_HAVE_HW_BREAKPOINT
6203 /*
6204 * hw_breakpoint is a bit difficult here..
6205 */
6206 if (attr->type == PERF_TYPE_BREAKPOINT)
6207 event->hw.bp_target = task;
6208#endif
6209 }
6210
4877 if (!overflow_handler && parent_event) 6211 if (!overflow_handler && parent_event)
4878 overflow_handler = parent_event->overflow_handler; 6212 overflow_handler = parent_event->overflow_handler;
4879 6213
4880 event->overflow_handler = overflow_handler; 6214 event->overflow_handler = overflow_handler;
4881 6215
4882 if (attr->disabled) 6216 if (attr->disabled)
@@ -4898,29 +6232,8 @@ perf_event_alloc(struct perf_event_attr *attr,
4898 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 6232 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4899 goto done; 6233 goto done;
4900 6234
4901 switch (attr->type) { 6235 pmu = perf_init_event(event);
4902 case PERF_TYPE_RAW:
4903 case PERF_TYPE_HARDWARE:
4904 case PERF_TYPE_HW_CACHE:
4905 pmu = hw_perf_event_init(event);
4906 break;
4907
4908 case PERF_TYPE_SOFTWARE:
4909 pmu = sw_perf_event_init(event);
4910 break;
4911
4912 case PERF_TYPE_TRACEPOINT:
4913 pmu = tp_perf_event_init(event);
4914 break;
4915
4916 case PERF_TYPE_BREAKPOINT:
4917 pmu = bp_perf_event_init(event);
4918 break;
4919
4920 6236
4921 default:
4922 break;
4923 }
4924done: 6237done:
4925 err = 0; 6238 err = 0;
4926 if (!pmu) 6239 if (!pmu)
@@ -4938,13 +6251,21 @@ done:
4938 event->pmu = pmu; 6251 event->pmu = pmu;
4939 6252
4940 if (!event->parent) { 6253 if (!event->parent) {
4941 atomic_inc(&nr_events); 6254 if (event->attach_state & PERF_ATTACH_TASK)
6255 jump_label_inc(&perf_sched_events);
4942 if (event->attr.mmap || event->attr.mmap_data) 6256 if (event->attr.mmap || event->attr.mmap_data)
4943 atomic_inc(&nr_mmap_events); 6257 atomic_inc(&nr_mmap_events);
4944 if (event->attr.comm) 6258 if (event->attr.comm)
4945 atomic_inc(&nr_comm_events); 6259 atomic_inc(&nr_comm_events);
4946 if (event->attr.task) 6260 if (event->attr.task)
4947 atomic_inc(&nr_task_events); 6261 atomic_inc(&nr_task_events);
6262 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6263 err = get_callchain_buffers();
6264 if (err) {
6265 free_event(event);
6266 return ERR_PTR(err);
6267 }
6268 }
4948 } 6269 }
4949 6270
4950 return event; 6271 return event;
@@ -5092,17 +6413,21 @@ SYSCALL_DEFINE5(perf_event_open,
5092 struct perf_event_attr __user *, attr_uptr, 6413 struct perf_event_attr __user *, attr_uptr,
5093 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 6414 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5094{ 6415{
5095 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 6416 struct perf_event *group_leader = NULL, *output_event = NULL;
6417 struct perf_event *event, *sibling;
5096 struct perf_event_attr attr; 6418 struct perf_event_attr attr;
5097 struct perf_event_context *ctx; 6419 struct perf_event_context *ctx;
5098 struct file *event_file = NULL; 6420 struct file *event_file = NULL;
5099 struct file *group_file = NULL; 6421 struct file *group_file = NULL;
6422 struct task_struct *task = NULL;
6423 struct pmu *pmu;
5100 int event_fd; 6424 int event_fd;
6425 int move_group = 0;
5101 int fput_needed = 0; 6426 int fput_needed = 0;
5102 int err; 6427 int err;
5103 6428
5104 /* for future expandability... */ 6429 /* for future expandability... */
5105 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6430 if (flags & ~PERF_FLAG_ALL)
5106 return -EINVAL; 6431 return -EINVAL;
5107 6432
5108 err = perf_copy_attr(attr_uptr, &attr); 6433 err = perf_copy_attr(attr_uptr, &attr);
@@ -5119,24 +6444,24 @@ SYSCALL_DEFINE5(perf_event_open,
5119 return -EINVAL; 6444 return -EINVAL;
5120 } 6445 }
5121 6446
6447 /*
6448 * In cgroup mode, the pid argument is used to pass the fd
6449 * opened to the cgroup directory in cgroupfs. The cpu argument
6450 * designates the cpu on which to monitor threads from that
6451 * cgroup.
6452 */
6453 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6454 return -EINVAL;
6455
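
So in cgroup mode the caller passes no pid at all: the pid argument carries a file descriptor for the monitored cgroup's directory in the perf_event cgroup filesystem, and cpu must name a real CPU. A hedged usage sketch (/sys/fs/cgroup/perf_event/mygroup is only the customary mount point plus an invented group name):

	/* Sketch: per-cgroup, per-cpu counting via PERF_FLAG_PID_CGROUP. */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	int main(void)
	{
		struct perf_event_attr attr;
		int cgrp_fd, fd;

		/* the cgroup directory fd stands in for the pid argument */
		cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
		if (cgrp_fd < 0)
			return 1;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_SOFTWARE;
		attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;

		/* cpu must be >= 0 in cgroup mode, as checked above */
		fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0 /* cpu */,
			     -1 /* group_fd */, PERF_FLAG_PID_CGROUP);
		close(cgrp_fd);
		return fd < 0;
	}
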
5122 event_fd = get_unused_fd_flags(O_RDWR); 6456 event_fd = get_unused_fd_flags(O_RDWR);
5123 if (event_fd < 0) 6457 if (event_fd < 0)
5124 return event_fd; 6458 return event_fd;
5125 6459
5126 /*
5127 * Get the target context (task or percpu):
5128 */
5129 ctx = find_get_context(pid, cpu);
5130 if (IS_ERR(ctx)) {
5131 err = PTR_ERR(ctx);
5132 goto err_fd;
5133 }
5134
5135 if (group_fd != -1) { 6460 if (group_fd != -1) {
5136 group_leader = perf_fget_light(group_fd, &fput_needed); 6461 group_leader = perf_fget_light(group_fd, &fput_needed);
5137 if (IS_ERR(group_leader)) { 6462 if (IS_ERR(group_leader)) {
5138 err = PTR_ERR(group_leader); 6463 err = PTR_ERR(group_leader);
5139 goto err_put_context; 6464 goto err_fd;
5140 } 6465 }
5141 group_file = group_leader->filp; 6466 group_file = group_leader->filp;
5142 if (flags & PERF_FLAG_FD_OUTPUT) 6467 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5145,6 +6470,76 @@ SYSCALL_DEFINE5(perf_event_open,
5145 group_leader = NULL; 6470 group_leader = NULL;
5146 } 6471 }
5147 6472
6473 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
6474 task = find_lively_task_by_vpid(pid);
6475 if (IS_ERR(task)) {
6476 err = PTR_ERR(task);
6477 goto err_group_fd;
6478 }
6479 }
6480
6481 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
6482 if (IS_ERR(event)) {
6483 err = PTR_ERR(event);
6484 goto err_task;
6485 }
6486
6487 if (flags & PERF_FLAG_PID_CGROUP) {
6488 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6489 if (err)
6490 goto err_alloc;
6491 /*
6492 * one more event:
6493 * - that has cgroup constraint on event->cpu
6494 * - that may need work on context switch
6495 */
6496 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6497 jump_label_inc(&perf_sched_events);
6498 }
6499
6500 /*
6501 * Special case software events and allow them to be part of
6502 * any hardware group.
6503 */
6504 pmu = event->pmu;
6505
6506 if (group_leader &&
6507 (is_software_event(event) != is_software_event(group_leader))) {
6508 if (is_software_event(event)) {
6509 /*
6510 * If event and group_leader are not both a software
6511 * event, and event is, then group leader is not.
6512 *
6513 * Allow the addition of software events to !software
6514 * groups, this is safe because software events never
6515 * fail to schedule.
6516 */
6517 pmu = group_leader->pmu;
6518 } else if (is_software_event(group_leader) &&
6519 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
6520 /*
6521 * In case the group is a pure software group, and we
6522 * try to add a hardware event, move the whole group to
6523 * the hardware context.
6524 */
6525 move_group = 1;
6526 }
6527 }
6528
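
The net effect of the block above: software events may always join a hardware leader's group, and adding the first hardware event to a purely software group migrates the whole group into the hardware context (the move_group path below). A hedged sketch of the first, simpler case from userspace:

	/* Sketch: a software event joining a hardware group via group_fd. */
	#define _GNU_SOURCE
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	static int open_event(struct perf_event_attr *attr, int group_fd)
	{
		return syscall(__NR_perf_event_open, attr, 0 /* this task */,
			       -1 /* any cpu */, group_fd, 0);
	}

	int main(void)
	{
		struct perf_event_attr hw, sw;
		int leader, member;

		memset(&hw, 0, sizeof(hw));
		hw.size = sizeof(hw);
		hw.type = PERF_TYPE_HARDWARE;
		hw.config = PERF_COUNT_HW_CPU_CYCLES;

		memset(&sw, 0, sizeof(sw));
		sw.size = sizeof(sw);
		sw.type = PERF_TYPE_SOFTWARE;		/* different pmu than the leader */
		sw.config = PERF_COUNT_SW_CONTEXT_SWITCHES;

		leader = open_event(&hw, -1);		/* hardware group leader */
		if (leader < 0)
			return 1;

		/* allowed: software events never fail to schedule alongside hw */
		member = open_event(&sw, leader);
		if (member >= 0)
			close(member);
		close(leader);
		return member < 0;
	}
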
6529 /*
6530 * Get the target context (task or percpu):
6531 */
6532 ctx = find_get_context(pmu, task, cpu);
6533 if (IS_ERR(ctx)) {
6534 err = PTR_ERR(ctx);
6535 goto err_alloc;
6536 }
6537
6538 if (task) {
6539 put_task_struct(task);
6540 task = NULL;
6541 }
6542
5148 /* 6543 /*
5149 * Look up the group leader (we will attach this event to it): 6544 * Look up the group leader (we will attach this event to it):
5150 */ 6545 */
@@ -5156,53 +6551,84 @@ SYSCALL_DEFINE5(perf_event_open,
5156 * becoming part of another group-sibling): 6551 * becoming part of another group-sibling):
5157 */ 6552 */
5158 if (group_leader->group_leader != group_leader) 6553 if (group_leader->group_leader != group_leader)
5159 goto err_put_context; 6554 goto err_context;
5160 /* 6555 /*
5161 * Do not allow to attach to a group in a different 6556 * Do not allow to attach to a group in a different
5162 * task or CPU context: 6557 * task or CPU context:
5163 */ 6558 */
5164 if (group_leader->ctx != ctx) 6559 if (move_group) {
5165 goto err_put_context; 6560 if (group_leader->ctx->type != ctx->type)
6561 goto err_context;
6562 } else {
6563 if (group_leader->ctx != ctx)
6564 goto err_context;
6565 }
6566
5166 /* 6567 /*
5167 * Only a group leader can be exclusive or pinned 6568 * Only a group leader can be exclusive or pinned
5168 */ 6569 */
5169 if (attr.exclusive || attr.pinned) 6570 if (attr.exclusive || attr.pinned)
5170 goto err_put_context; 6571 goto err_context;
5171 }
5172
5173 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5174 NULL, NULL, GFP_KERNEL);
5175 if (IS_ERR(event)) {
5176 err = PTR_ERR(event);
5177 goto err_put_context;
5178 } 6572 }
5179 6573
5180 if (output_event) { 6574 if (output_event) {
5181 err = perf_event_set_output(event, output_event); 6575 err = perf_event_set_output(event, output_event);
5182 if (err) 6576 if (err)
5183 goto err_free_put_context; 6577 goto err_context;
5184 } 6578 }
5185 6579
5186 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 6580 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5187 if (IS_ERR(event_file)) { 6581 if (IS_ERR(event_file)) {
5188 err = PTR_ERR(event_file); 6582 err = PTR_ERR(event_file);
5189 goto err_free_put_context; 6583 goto err_context;
6584 }
6585
6586 if (move_group) {
6587 struct perf_event_context *gctx = group_leader->ctx;
6588
6589 mutex_lock(&gctx->mutex);
6590 perf_remove_from_context(group_leader);
6591 list_for_each_entry(sibling, &group_leader->sibling_list,
6592 group_entry) {
6593 perf_remove_from_context(sibling);
6594 put_ctx(gctx);
6595 }
6596 mutex_unlock(&gctx->mutex);
6597 put_ctx(gctx);
5190 } 6598 }
5191 6599
5192 event->filp = event_file; 6600 event->filp = event_file;
5193 WARN_ON_ONCE(ctx->parent_ctx); 6601 WARN_ON_ONCE(ctx->parent_ctx);
5194 mutex_lock(&ctx->mutex); 6602 mutex_lock(&ctx->mutex);
6603
6604 if (move_group) {
6605 perf_install_in_context(ctx, group_leader, cpu);
6606 get_ctx(ctx);
6607 list_for_each_entry(sibling, &group_leader->sibling_list,
6608 group_entry) {
6609 perf_install_in_context(ctx, sibling, cpu);
6610 get_ctx(ctx);
6611 }
6612 }
6613
5195 perf_install_in_context(ctx, event, cpu); 6614 perf_install_in_context(ctx, event, cpu);
5196 ++ctx->generation; 6615 ++ctx->generation;
6616 perf_unpin_context(ctx);
5197 mutex_unlock(&ctx->mutex); 6617 mutex_unlock(&ctx->mutex);
5198 6618
5199 event->owner = current; 6619 event->owner = current;
5200 get_task_struct(current); 6620
5201 mutex_lock(&current->perf_event_mutex); 6621 mutex_lock(&current->perf_event_mutex);
5202 list_add_tail(&event->owner_entry, &current->perf_event_list); 6622 list_add_tail(&event->owner_entry, &current->perf_event_list);
5203 mutex_unlock(&current->perf_event_mutex); 6623 mutex_unlock(&current->perf_event_mutex);
5204 6624
5205 /* 6625 /*
6626 * Precalculate sample_data sizes
6627 */
6628 perf_event__header_size(event);
6629 perf_event__id_header_size(event);
6630
6631 /*
5206 * Drop the reference on the group_event after placing the 6632 * Drop the reference on the group_event after placing the
5207 * new event on the sibling_list. This ensures destruction 6633 * new event on the sibling_list. This ensures destruction
5208 * of the group leader will find the pointer to itself in 6634 * of the group leader will find the pointer to itself in
@@ -5212,11 +6638,16 @@ SYSCALL_DEFINE5(perf_event_open,
5212 fd_install(event_fd, event_file); 6638 fd_install(event_fd, event_file);
5213 return event_fd; 6639 return event_fd;
5214 6640
5215err_free_put_context: 6641err_context:
6642 perf_unpin_context(ctx);
6643 put_ctx(ctx);
6644err_alloc:
5216 free_event(event); 6645 free_event(event);
5217err_put_context: 6646err_task:
6647 if (task)
6648 put_task_struct(task);
6649err_group_fd:
5218 fput_light(group_file, fput_needed); 6650 fput_light(group_file, fput_needed);
5219 put_ctx(ctx);
5220err_fd: 6651err_fd:
5221 put_unused_fd(event_fd); 6652 put_unused_fd(event_fd);
5222 return err; 6653 return err;
@@ -5227,32 +6658,31 @@ err_fd:
5227 * 6658 *
5228 * @attr: attributes of the counter to create 6659 * @attr: attributes of the counter to create
5229 * @cpu: cpu in which the counter is bound 6660 * @cpu: cpu in which the counter is bound
5230 * @pid: task to profile 6661 * @task: task to profile (NULL for percpu)
5231 */ 6662 */
5232struct perf_event * 6663struct perf_event *
5233perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 6664perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5234 pid_t pid, 6665 struct task_struct *task,
5235 perf_overflow_handler_t overflow_handler) 6666 perf_overflow_handler_t overflow_handler)
5236{ 6667{
5237 struct perf_event *event;
5238 struct perf_event_context *ctx; 6668 struct perf_event_context *ctx;
6669 struct perf_event *event;
5239 int err; 6670 int err;
5240 6671
5241 /* 6672 /*
5242 * Get the target context (task or percpu): 6673 * Get the target context (task or percpu):
5243 */ 6674 */
5244 6675
5245 ctx = find_get_context(pid, cpu); 6676 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
5246 if (IS_ERR(ctx)) {
5247 err = PTR_ERR(ctx);
5248 goto err_exit;
5249 }
5250
5251 event = perf_event_alloc(attr, cpu, ctx, NULL,
5252 NULL, overflow_handler, GFP_KERNEL);
5253 if (IS_ERR(event)) { 6677 if (IS_ERR(event)) {
5254 err = PTR_ERR(event); 6678 err = PTR_ERR(event);
5255 goto err_put_context; 6679 goto err;
6680 }
6681
6682 ctx = find_get_context(event->pmu, task, cpu);
6683 if (IS_ERR(ctx)) {
6684 err = PTR_ERR(ctx);
6685 goto err_free;
5256 } 6686 }
5257 6687
5258 event->filp = NULL; 6688 event->filp = NULL;
@@ -5260,122 +6690,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5260 mutex_lock(&ctx->mutex); 6690 mutex_lock(&ctx->mutex);
5261 perf_install_in_context(ctx, event, cpu); 6691 perf_install_in_context(ctx, event, cpu);
5262 ++ctx->generation; 6692 ++ctx->generation;
6693 perf_unpin_context(ctx);
5263 mutex_unlock(&ctx->mutex); 6694 mutex_unlock(&ctx->mutex);
5264 6695
5265 event->owner = current;
5266 get_task_struct(current);
5267 mutex_lock(&current->perf_event_mutex);
5268 list_add_tail(&event->owner_entry, &current->perf_event_list);
5269 mutex_unlock(&current->perf_event_mutex);
5270
5271 return event; 6696 return event;
5272 6697
5273 err_put_context: 6698err_free:
5274 put_ctx(ctx); 6699 free_event(event);
5275 err_exit: 6700err:
5276 return ERR_PTR(err); 6701 return ERR_PTR(err);
5277} 6702}
5278EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 6703EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
5279 6704
5280/*
5281 * inherit an event from parent task to child task:
5282 */
5283static struct perf_event *
5284inherit_event(struct perf_event *parent_event,
5285 struct task_struct *parent,
5286 struct perf_event_context *parent_ctx,
5287 struct task_struct *child,
5288 struct perf_event *group_leader,
5289 struct perf_event_context *child_ctx)
5290{
5291 struct perf_event *child_event;
5292
5293 /*
5294 * Instead of creating recursive hierarchies of events,
5295 * we link inherited events back to the original parent,
5296 * which has a filp for sure, which we use as the reference
5297 * count:
5298 */
5299 if (parent_event->parent)
5300 parent_event = parent_event->parent;
5301
5302 child_event = perf_event_alloc(&parent_event->attr,
5303 parent_event->cpu, child_ctx,
5304 group_leader, parent_event,
5305 NULL, GFP_KERNEL);
5306 if (IS_ERR(child_event))
5307 return child_event;
5308 get_ctx(child_ctx);
5309
5310 /*
5311 * Make the child state follow the state of the parent event,
5312 * not its attr.disabled bit. We hold the parent's mutex,
5313 * so we won't race with perf_event_{en, dis}able_family.
5314 */
5315 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5316 child_event->state = PERF_EVENT_STATE_INACTIVE;
5317 else
5318 child_event->state = PERF_EVENT_STATE_OFF;
5319
5320 if (parent_event->attr.freq) {
5321 u64 sample_period = parent_event->hw.sample_period;
5322 struct hw_perf_event *hwc = &child_event->hw;
5323
5324 hwc->sample_period = sample_period;
5325 hwc->last_period = sample_period;
5326
5327 local64_set(&hwc->period_left, sample_period);
5328 }
5329
5330 child_event->overflow_handler = parent_event->overflow_handler;
5331
5332 /*
5333 * Link it up in the child's context:
5334 */
5335 add_event_to_ctx(child_event, child_ctx);
5336
5337 /*
5338 * Get a reference to the parent filp - we will fput it
5339 * when the child event exits. This is safe to do because
5340 * we are in the parent and we know that the filp still
5341 * exists and has a nonzero count:
5342 */
5343 atomic_long_inc(&parent_event->filp->f_count);
5344
5345 /*
5346 * Link this into the parent event's child list
5347 */
5348 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5349 mutex_lock(&parent_event->child_mutex);
5350 list_add_tail(&child_event->child_list, &parent_event->child_list);
5351 mutex_unlock(&parent_event->child_mutex);
5352
5353 return child_event;
5354}
5355
5356static int inherit_group(struct perf_event *parent_event,
5357 struct task_struct *parent,
5358 struct perf_event_context *parent_ctx,
5359 struct task_struct *child,
5360 struct perf_event_context *child_ctx)
5361{
5362 struct perf_event *leader;
5363 struct perf_event *sub;
5364 struct perf_event *child_ctr;
5365
5366 leader = inherit_event(parent_event, parent, parent_ctx,
5367 child, NULL, child_ctx);
5368 if (IS_ERR(leader))
5369 return PTR_ERR(leader);
5370 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5371 child_ctr = inherit_event(sub, parent, parent_ctx,
5372 child, leader, child_ctx);
5373 if (IS_ERR(child_ctr))
5374 return PTR_ERR(child_ctr);
5375 }
5376 return 0;
5377}
5378
5379static void sync_child_event(struct perf_event *child_event, 6705static void sync_child_event(struct perf_event *child_event,
5380 struct task_struct *child) 6706 struct task_struct *child)
5381{ 6707{
@@ -5416,32 +6742,32 @@ __perf_event_exit_task(struct perf_event *child_event,
5416 struct perf_event_context *child_ctx, 6742 struct perf_event_context *child_ctx,
5417 struct task_struct *child) 6743 struct task_struct *child)
5418{ 6744{
5419 struct perf_event *parent_event; 6745 if (child_event->parent) {
6746 raw_spin_lock_irq(&child_ctx->lock);
6747 perf_group_detach(child_event);
6748 raw_spin_unlock_irq(&child_ctx->lock);
6749 }
5420 6750
5421 perf_event_remove_from_context(child_event); 6751 perf_remove_from_context(child_event);
5422 6752
5423 parent_event = child_event->parent;
5424 /* 6753 /*
5425 * It can happen that parent exits first, and has events 6754 * It can happen that the parent exits first, and has events
5426 * that are still around due to the child reference. These 6755 * that are still around due to the child reference. These
5427 * events need to be zapped - but otherwise linger. 6756 * events need to be zapped.
5428 */ 6757 */
5429 if (parent_event) { 6758 if (child_event->parent) {
5430 sync_child_event(child_event, child); 6759 sync_child_event(child_event, child);
5431 free_event(child_event); 6760 free_event(child_event);
5432 } 6761 }
5433} 6762}
5434 6763
5435/* 6764static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5436 * When a child task exits, feed back event values to parent events.
5437 */
5438void perf_event_exit_task(struct task_struct *child)
5439{ 6765{
5440 struct perf_event *child_event, *tmp; 6766 struct perf_event *child_event, *tmp;
5441 struct perf_event_context *child_ctx; 6767 struct perf_event_context *child_ctx;
5442 unsigned long flags; 6768 unsigned long flags;
5443 6769
5444 if (likely(!child->perf_event_ctxp)) { 6770 if (likely(!child->perf_event_ctxp[ctxn])) {
5445 perf_event_task(child, NULL, 0); 6771 perf_event_task(child, NULL, 0);
5446 return; 6772 return;
5447 } 6773 }
@@ -5453,8 +6779,8 @@ void perf_event_exit_task(struct task_struct *child)
5453 * scheduled, so we are now safe from rescheduling changing 6779 * scheduled, so we are now safe from rescheduling changing
5454 * our context. 6780 * our context.
5455 */ 6781 */
5456 child_ctx = child->perf_event_ctxp; 6782 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
5457 __perf_event_task_sched_out(child_ctx); 6783 task_ctx_sched_out(child_ctx, EVENT_ALL);
5458 6784
5459 /* 6785 /*
5460 * Take the context lock here so that if find_get_context is 6786 * Take the context lock here so that if find_get_context is
@@ -5462,7 +6788,7 @@ void perf_event_exit_task(struct task_struct *child)
5462 * incremented the context's refcount before we do put_ctx below. 6788 * incremented the context's refcount before we do put_ctx below.
5463 */ 6789 */
5464 raw_spin_lock(&child_ctx->lock); 6790 raw_spin_lock(&child_ctx->lock);
5465 child->perf_event_ctxp = NULL; 6791 child->perf_event_ctxp[ctxn] = NULL;
5466 /* 6792 /*
5467 * If this context is a clone; unclone it so it can't get 6793 * If this context is a clone; unclone it so it can't get
5468 * swapped to another process while we're removing all 6794 * swapped to another process while we're removing all
@@ -5515,6 +6841,33 @@ again:
5515 put_ctx(child_ctx); 6841 put_ctx(child_ctx);
5516} 6842}
5517 6843
6844/*
6845 * When a child task exits, feed back event values to parent events.
6846 */
6847void perf_event_exit_task(struct task_struct *child)
6848{
6849 struct perf_event *event, *tmp;
6850 int ctxn;
6851
6852 mutex_lock(&child->perf_event_mutex);
6853 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6854 owner_entry) {
6855 list_del_init(&event->owner_entry);
6856
6857 /*
6858 * Ensure the list deletion is visible before we clear
6859 * the owner, closes a race against perf_release() where
6860 * we need to serialize on the owner->perf_event_mutex.
6861 */
6862 smp_wmb();
6863 event->owner = NULL;
6864 }
6865 mutex_unlock(&child->perf_event_mutex);
6866
6867 for_each_task_context_nr(ctxn)
6868 perf_event_exit_task_context(child, ctxn);
6869}
6870
5518static void perf_free_event(struct perf_event *event, 6871static void perf_free_event(struct perf_event *event,
5519 struct perf_event_context *ctx) 6872 struct perf_event_context *ctx)
5520{ 6873{
@@ -5536,48 +6889,172 @@ static void perf_free_event(struct perf_event *event,
5536 6889
5537/* 6890/*
5538 * free an unexposed, unused context as created by inheritance by 6891 * free an unexposed, unused context as created by inheritance by
5539 * init_task below, used by fork() in case of fail. 6892 * perf_event_init_task below, used by fork() in case of fail.
5540 */ 6893 */
5541void perf_event_free_task(struct task_struct *task) 6894void perf_event_free_task(struct task_struct *task)
5542{ 6895{
5543 struct perf_event_context *ctx = task->perf_event_ctxp; 6896 struct perf_event_context *ctx;
5544 struct perf_event *event, *tmp; 6897 struct perf_event *event, *tmp;
6898 int ctxn;
5545 6899
5546 if (!ctx) 6900 for_each_task_context_nr(ctxn) {
5547 return; 6901 ctx = task->perf_event_ctxp[ctxn];
6902 if (!ctx)
6903 continue;
5548 6904
5549 mutex_lock(&ctx->mutex); 6905 mutex_lock(&ctx->mutex);
5550again: 6906again:
5551 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 6907 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
5552 perf_free_event(event, ctx); 6908 group_entry)
6909 perf_free_event(event, ctx);
5553 6910
5554 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 6911 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5555 group_entry) 6912 group_entry)
5556 perf_free_event(event, ctx); 6913 perf_free_event(event, ctx);
5557 6914
5558 if (!list_empty(&ctx->pinned_groups) || 6915 if (!list_empty(&ctx->pinned_groups) ||
5559 !list_empty(&ctx->flexible_groups)) 6916 !list_empty(&ctx->flexible_groups))
5560 goto again; 6917 goto again;
5561 6918
5562 mutex_unlock(&ctx->mutex); 6919 mutex_unlock(&ctx->mutex);
5563 6920
5564 put_ctx(ctx); 6921 put_ctx(ctx);
6922 }
6923}
6924
6925void perf_event_delayed_put(struct task_struct *task)
6926{
6927 int ctxn;
6928
6929 for_each_task_context_nr(ctxn)
6930 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
6931}
6932
6933/*
6934 * inherit an event from parent task to child task:
6935 */
6936static struct perf_event *
6937inherit_event(struct perf_event *parent_event,
6938 struct task_struct *parent,
6939 struct perf_event_context *parent_ctx,
6940 struct task_struct *child,
6941 struct perf_event *group_leader,
6942 struct perf_event_context *child_ctx)
6943{
6944 struct perf_event *child_event;
6945 unsigned long flags;
6946
6947 /*
6948 * Instead of creating recursive hierarchies of events,
6949 * we link inherited events back to the original parent,
6950 * which has a filp for sure, which we use as the reference
6951 * count:
6952 */
6953 if (parent_event->parent)
6954 parent_event = parent_event->parent;
6955
6956 child_event = perf_event_alloc(&parent_event->attr,
6957 parent_event->cpu,
6958 child,
6959 group_leader, parent_event,
6960 NULL);
6961 if (IS_ERR(child_event))
6962 return child_event;
6963 get_ctx(child_ctx);
6964
6965 /*
6966 * Make the child state follow the state of the parent event,
6967 * not its attr.disabled bit. We hold the parent's mutex,
6968 * so we won't race with perf_event_{en, dis}able_family.
6969 */
6970 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6971 child_event->state = PERF_EVENT_STATE_INACTIVE;
6972 else
6973 child_event->state = PERF_EVENT_STATE_OFF;
6974
6975 if (parent_event->attr.freq) {
6976 u64 sample_period = parent_event->hw.sample_period;
6977 struct hw_perf_event *hwc = &child_event->hw;
6978
6979 hwc->sample_period = sample_period;
6980 hwc->last_period = sample_period;
6981
6982 local64_set(&hwc->period_left, sample_period);
6983 }
6984
6985 child_event->ctx = child_ctx;
6986 child_event->overflow_handler = parent_event->overflow_handler;
6987
6988 /*
6989 * Precalculate sample_data sizes
6990 */
6991 perf_event__header_size(child_event);
6992 perf_event__id_header_size(child_event);
6993
6994 /*
6995 * Link it up in the child's context:
6996 */
6997 raw_spin_lock_irqsave(&child_ctx->lock, flags);
6998 add_event_to_ctx(child_event, child_ctx);
6999 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7000
7001 /*
7002 * Get a reference to the parent filp - we will fput it
7003 * when the child event exits. This is safe to do because
7004 * we are in the parent and we know that the filp still
7005 * exists and has a nonzero count:
7006 */
7007 atomic_long_inc(&parent_event->filp->f_count);
7008
7009 /*
7010 * Link this into the parent event's child list
7011 */
7012 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
7013 mutex_lock(&parent_event->child_mutex);
7014 list_add_tail(&child_event->child_list, &parent_event->child_list);
7015 mutex_unlock(&parent_event->child_mutex);
7016
7017 return child_event;
7018}
7019
7020static int inherit_group(struct perf_event *parent_event,
7021 struct task_struct *parent,
7022 struct perf_event_context *parent_ctx,
7023 struct task_struct *child,
7024 struct perf_event_context *child_ctx)
7025{
7026 struct perf_event *leader;
7027 struct perf_event *sub;
7028 struct perf_event *child_ctr;
7029
7030 leader = inherit_event(parent_event, parent, parent_ctx,
7031 child, NULL, child_ctx);
7032 if (IS_ERR(leader))
7033 return PTR_ERR(leader);
7034 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
7035 child_ctr = inherit_event(sub, parent, parent_ctx,
7036 child, leader, child_ctx);
7037 if (IS_ERR(child_ctr))
7038 return PTR_ERR(child_ctr);
7039 }
7040 return 0;
5565} 7041}
5566 7042
5567static int 7043static int
5568inherit_task_group(struct perf_event *event, struct task_struct *parent, 7044inherit_task_group(struct perf_event *event, struct task_struct *parent,
5569 struct perf_event_context *parent_ctx, 7045 struct perf_event_context *parent_ctx,
5570 struct task_struct *child, 7046 struct task_struct *child, int ctxn,
5571 int *inherited_all) 7047 int *inherited_all)
5572{ 7048{
5573 int ret; 7049 int ret;
5574 struct perf_event_context *child_ctx = child->perf_event_ctxp; 7050 struct perf_event_context *child_ctx;
5575 7051
5576 if (!event->attr.inherit) { 7052 if (!event->attr.inherit) {
5577 *inherited_all = 0; 7053 *inherited_all = 0;
5578 return 0; 7054 return 0;
5579 } 7055 }
5580 7056
7057 child_ctx = child->perf_event_ctxp[ctxn];
5581 if (!child_ctx) { 7058 if (!child_ctx) {
5582 /* 7059 /*
5583 * This is executed from the parent task context, so 7060 * This is executed from the parent task context, so
@@ -5586,14 +7063,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5586 * child. 7063 * child.
5587 */ 7064 */
5588 7065
5589 child_ctx = kzalloc(sizeof(struct perf_event_context), 7066 child_ctx = alloc_perf_context(event->pmu, child);
5590 GFP_KERNEL);
5591 if (!child_ctx) 7067 if (!child_ctx)
5592 return -ENOMEM; 7068 return -ENOMEM;
5593 7069
5594 __perf_event_init_context(child_ctx, child); 7070 child->perf_event_ctxp[ctxn] = child_ctx;
5595 child->perf_event_ctxp = child_ctx;
5596 get_task_struct(child);
5597 } 7071 }
5598 7072
5599 ret = inherit_group(event, parent, parent_ctx, 7073 ret = inherit_group(event, parent, parent_ctx,
@@ -5605,32 +7079,27 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5605 return ret; 7079 return ret;
5606} 7080}
5607 7081
5608
5609/* 7082/*
5610 * Initialize the perf_event context in task_struct 7083 * Initialize the perf_event context in task_struct
5611 */ 7084 */
5612int perf_event_init_task(struct task_struct *child) 7085int perf_event_init_context(struct task_struct *child, int ctxn)
5613{ 7086{
5614 struct perf_event_context *child_ctx, *parent_ctx; 7087 struct perf_event_context *child_ctx, *parent_ctx;
5615 struct perf_event_context *cloned_ctx; 7088 struct perf_event_context *cloned_ctx;
5616 struct perf_event *event; 7089 struct perf_event *event;
5617 struct task_struct *parent = current; 7090 struct task_struct *parent = current;
5618 int inherited_all = 1; 7091 int inherited_all = 1;
7092 unsigned long flags;
5619 int ret = 0; 7093 int ret = 0;
5620 7094
5621 child->perf_event_ctxp = NULL; 7095 if (likely(!parent->perf_event_ctxp[ctxn]))
5622
5623 mutex_init(&child->perf_event_mutex);
5624 INIT_LIST_HEAD(&child->perf_event_list);
5625
5626 if (likely(!parent->perf_event_ctxp))
5627 return 0; 7096 return 0;
5628 7097
5629 /* 7098 /*
5630 * If the parent's context is a clone, pin it so it won't get 7099 * If the parent's context is a clone, pin it so it won't get
5631 * swapped under us. 7100 * swapped under us.
5632 */ 7101 */
5633 parent_ctx = perf_pin_task_context(parent); 7102 parent_ctx = perf_pin_task_context(parent, ctxn);
5634 7103
5635 /* 7104 /*
5636 * No need to check if parent_ctx != NULL here; since we saw 7105 * No need to check if parent_ctx != NULL here; since we saw
@@ -5650,31 +7119,42 @@ int perf_event_init_task(struct task_struct *child)
5650 * the list, not manipulating it: 7119 * the list, not manipulating it:
5651 */ 7120 */
5652 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 7121 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5653 ret = inherit_task_group(event, parent, parent_ctx, child, 7122 ret = inherit_task_group(event, parent, parent_ctx,
5654 &inherited_all); 7123 child, ctxn, &inherited_all);
5655 if (ret) 7124 if (ret)
5656 break; 7125 break;
5657 } 7126 }
5658 7127
7128 /*
7129 * We can't hold ctx->lock when iterating the ->flexible_group list due
7130 * to allocations, but we need to prevent rotation because
7131 * rotate_ctx() will change the list from interrupt context.
7132 */
7133 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7134 parent_ctx->rotate_disable = 1;
7135 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7136
5659 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 7137 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5660 ret = inherit_task_group(event, parent, parent_ctx, child, 7138 ret = inherit_task_group(event, parent, parent_ctx,
5661 &inherited_all); 7139 child, ctxn, &inherited_all);
5662 if (ret) 7140 if (ret)
5663 break; 7141 break;
5664 } 7142 }
5665 7143
5666 child_ctx = child->perf_event_ctxp; 7144 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7145 parent_ctx->rotate_disable = 0;
7146
7147 child_ctx = child->perf_event_ctxp[ctxn];
5667 7148
5668 if (child_ctx && inherited_all) { 7149 if (child_ctx && inherited_all) {
5669 /* 7150 /*
5670 * Mark the child context as a clone of the parent 7151 * Mark the child context as a clone of the parent
5671 * context, or of whatever the parent is a clone of. 7152 * context, or of whatever the parent is a clone of.
5672 * Note that if the parent is a clone, it could get 7153 *
5673 * uncloned at any point, but that doesn't matter 7154 * Note that if the parent is a clone, the holding of
5674 * because the list of events and the generation 7155 * parent_ctx->lock avoids it from being uncloned.
5675 * count can't have changed since we took the mutex.
5676 */ 7156 */
5677 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); 7157 cloned_ctx = parent_ctx->parent_ctx;
5678 if (cloned_ctx) { 7158 if (cloned_ctx) {
5679 child_ctx->parent_ctx = cloned_ctx; 7159 child_ctx->parent_ctx = cloned_ctx;
5680 child_ctx->parent_gen = parent_ctx->parent_gen; 7160 child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -5685,75 +7165,136 @@ int perf_event_init_task(struct task_struct *child)
5685 get_ctx(child_ctx->parent_ctx); 7165 get_ctx(child_ctx->parent_ctx);
5686 } 7166 }
5687 7167
7168 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
5688 mutex_unlock(&parent_ctx->mutex); 7169 mutex_unlock(&parent_ctx->mutex);
5689 7170
5690 perf_unpin_context(parent_ctx); 7171 perf_unpin_context(parent_ctx);
7172 put_ctx(parent_ctx);
5691 7173
5692 return ret; 7174 return ret;
5693} 7175}
5694 7176
7177/*
7178 * Initialize the perf_event context in task_struct
7179 */
7180int perf_event_init_task(struct task_struct *child)
7181{
7182 int ctxn, ret;
7183
7184 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
7185 mutex_init(&child->perf_event_mutex);
7186 INIT_LIST_HEAD(&child->perf_event_list);
7187
7188 for_each_task_context_nr(ctxn) {
7189 ret = perf_event_init_context(child, ctxn);
7190 if (ret)
7191 return ret;
7192 }
7193
7194 return 0;
7195}
7196
5695static void __init perf_event_init_all_cpus(void) 7197static void __init perf_event_init_all_cpus(void)
5696{ 7198{
7199 struct swevent_htable *swhash;
5697 int cpu; 7200 int cpu;
5698 struct perf_cpu_context *cpuctx;
5699 7201
5700 for_each_possible_cpu(cpu) { 7202 for_each_possible_cpu(cpu) {
5701 cpuctx = &per_cpu(perf_cpu_context, cpu); 7203 swhash = &per_cpu(swevent_htable, cpu);
5702 mutex_init(&cpuctx->hlist_mutex); 7204 mutex_init(&swhash->hlist_mutex);
5703 __perf_event_init_context(&cpuctx->ctx, NULL); 7205 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
5704 } 7206 }
5705} 7207}
5706 7208
5707static void __cpuinit perf_event_init_cpu(int cpu) 7209static void __cpuinit perf_event_init_cpu(int cpu)
5708{ 7210{
5709 struct perf_cpu_context *cpuctx; 7211 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5710 7212
5711 cpuctx = &per_cpu(perf_cpu_context, cpu); 7213 mutex_lock(&swhash->hlist_mutex);
5712 7214 if (swhash->hlist_refcount > 0) {
5713 spin_lock(&perf_resource_lock);
5714 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5715 spin_unlock(&perf_resource_lock);
5716
5717 mutex_lock(&cpuctx->hlist_mutex);
5718 if (cpuctx->hlist_refcount > 0) {
5719 struct swevent_hlist *hlist; 7215 struct swevent_hlist *hlist;
5720 7216
5721 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 7217 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
5722 WARN_ON_ONCE(!hlist); 7218 WARN_ON(!hlist);
5723 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 7219 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5724 } 7220 }
5725 mutex_unlock(&cpuctx->hlist_mutex); 7221 mutex_unlock(&swhash->hlist_mutex);
5726} 7222}
5727 7223
5728#ifdef CONFIG_HOTPLUG_CPU 7224#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
5729static void __perf_event_exit_cpu(void *info) 7225static void perf_pmu_rotate_stop(struct pmu *pmu)
5730{ 7226{
5731 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 7227 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
5732 struct perf_event_context *ctx = &cpuctx->ctx; 7228
7229 WARN_ON(!irqs_disabled());
7230
7231 list_del_init(&cpuctx->rotation_list);
7232}
7233
7234static void __perf_event_exit_context(void *__info)
7235{
7236 struct perf_event_context *ctx = __info;
5733 struct perf_event *event, *tmp; 7237 struct perf_event *event, *tmp;
5734 7238
7239 perf_pmu_rotate_stop(ctx->pmu);
7240
5735 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7241 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5736 __perf_event_remove_from_context(event); 7242 __perf_remove_from_context(event);
5737 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7243 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5738 __perf_event_remove_from_context(event); 7244 __perf_remove_from_context(event);
7245}
7246
7247static void perf_event_exit_cpu_context(int cpu)
7248{
7249 struct perf_event_context *ctx;
7250 struct pmu *pmu;
7251 int idx;
7252
7253 idx = srcu_read_lock(&pmus_srcu);
7254 list_for_each_entry_rcu(pmu, &pmus, entry) {
7255 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
7256
7257 mutex_lock(&ctx->mutex);
7258 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
7259 mutex_unlock(&ctx->mutex);
7260 }
7261 srcu_read_unlock(&pmus_srcu, idx);
5739} 7262}
7263
5740static void perf_event_exit_cpu(int cpu) 7264static void perf_event_exit_cpu(int cpu)
5741{ 7265{
5742 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 7266 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5743 struct perf_event_context *ctx = &cpuctx->ctx;
5744 7267
5745 mutex_lock(&cpuctx->hlist_mutex); 7268 mutex_lock(&swhash->hlist_mutex);
5746 swevent_hlist_release(cpuctx); 7269 swevent_hlist_release(swhash);
5747 mutex_unlock(&cpuctx->hlist_mutex); 7270 mutex_unlock(&swhash->hlist_mutex);
5748 7271
5749 mutex_lock(&ctx->mutex); 7272 perf_event_exit_cpu_context(cpu);
5750 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5751 mutex_unlock(&ctx->mutex);
5752} 7273}
5753#else 7274#else
5754static inline void perf_event_exit_cpu(int cpu) { } 7275static inline void perf_event_exit_cpu(int cpu) { }
5755#endif 7276#endif
5756 7277
7278static int
7279perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
7280{
7281 int cpu;
7282
7283 for_each_online_cpu(cpu)
7284 perf_event_exit_cpu(cpu);
7285
7286 return NOTIFY_OK;
7287}
7288
7289/*
7290 * Run the perf reboot notifier at the very last possible moment so that
7291 * the generic watchdog code runs as long as possible.
7292 */
7293static struct notifier_block perf_reboot_notifier = {
7294 .notifier_call = perf_reboot,
7295 .priority = INT_MIN,
7296};
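
For comparison, any module can hook the same reboot notifier chain; a minimal sketch follows (the example_* names and the message are made up, and priority 0 is the ordinary default — the point of the perf notifier above is that INT_MIN makes it run after everybody else):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int example_reboot(struct notifier_block *nb, unsigned long action,
                          void *data)
{
        pr_info("example: system going down (action=%lu)\n", action);
        return NOTIFY_DONE;
}

static struct notifier_block example_reboot_nb = {
        .notifier_call  = example_reboot,
        .priority       = 0,    /* default ordering; perf deliberately uses INT_MIN */
};

static int __init example_init(void)
{
        return register_reboot_notifier(&example_reboot_nb);
}

static void __exit example_exit(void)
{
        unregister_reboot_notifier(&example_reboot_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
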
7297
5757static int __cpuinit 7298static int __cpuinit
5758perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 7299perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5759{ 7300{
@@ -5778,118 +7319,115 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5778 return NOTIFY_OK; 7319 return NOTIFY_OK;
5779} 7320}
5780 7321
5781/*
5782 * This has to have a higher priority than migration_notifier in sched.c.
5783 */
5784static struct notifier_block __cpuinitdata perf_cpu_nb = {
5785 .notifier_call = perf_cpu_notify,
5786 .priority = 20,
5787};
5788
5789void __init perf_event_init(void) 7322void __init perf_event_init(void)
5790{ 7323{
7324 int ret;
7325
7326 idr_init(&pmu_idr);
7327
5791 perf_event_init_all_cpus(); 7328 perf_event_init_all_cpus();
5792 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 7329 init_srcu_struct(&pmus_srcu);
5793 (void *)(long)smp_processor_id()); 7330 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
5794 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 7331 perf_pmu_register(&perf_cpu_clock, NULL, -1);
5795 (void *)(long)smp_processor_id()); 7332 perf_pmu_register(&perf_task_clock, NULL, -1);
5796 register_cpu_notifier(&perf_cpu_nb); 7333 perf_tp_register();
5797} 7334 perf_cpu_notifier(perf_cpu_notify);
7335 register_reboot_notifier(&perf_reboot_notifier);
5798 7336
5799static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, 7337 ret = init_hw_breakpoint();
5800 struct sysdev_class_attribute *attr, 7338 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
5801 char *buf)
5802{
5803 return sprintf(buf, "%d\n", perf_reserved_percpu);
5804} 7339}
5805 7340
5806static ssize_t 7341static int __init perf_event_sysfs_init(void)
5807perf_set_reserve_percpu(struct sysdev_class *class,
5808 struct sysdev_class_attribute *attr,
5809 const char *buf,
5810 size_t count)
5811{ 7342{
5812 struct perf_cpu_context *cpuctx; 7343 struct pmu *pmu;
5813 unsigned long val; 7344 int ret;
5814 int err, cpu, mpt;
5815 7345
5816 err = strict_strtoul(buf, 10, &val); 7346 mutex_lock(&pmus_lock);
5817 if (err) 7347
5818 return err; 7348 ret = bus_register(&pmu_bus);
5819 if (val > perf_max_events) 7349 if (ret)
5820 return -EINVAL; 7350 goto unlock;
7351
7352 list_for_each_entry(pmu, &pmus, entry) {
7353 if (!pmu->name || pmu->type < 0)
7354 continue;
5821 7355
5822 spin_lock(&perf_resource_lock); 7356 ret = pmu_dev_alloc(pmu);
5823 perf_reserved_percpu = val; 7357 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
5824 for_each_online_cpu(cpu) {
5825 cpuctx = &per_cpu(perf_cpu_context, cpu);
5826 raw_spin_lock_irq(&cpuctx->ctx.lock);
5827 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5828 perf_max_events - perf_reserved_percpu);
5829 cpuctx->max_pertask = mpt;
5830 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5831 } 7358 }
5832 spin_unlock(&perf_resource_lock); 7359 pmu_bus_running = 1;
7360 ret = 0;
5833 7361
5834 return count; 7362unlock:
5835} 7363 mutex_unlock(&pmus_lock);
5836 7364
5837static ssize_t perf_show_overcommit(struct sysdev_class *class, 7365 return ret;
5838 struct sysdev_class_attribute *attr,
5839 char *buf)
5840{
5841 return sprintf(buf, "%d\n", perf_overcommit);
5842} 7366}
7367device_initcall(perf_event_sysfs_init);
5843 7368
5844static ssize_t 7369#ifdef CONFIG_CGROUP_PERF
5845perf_set_overcommit(struct sysdev_class *class, 7370static struct cgroup_subsys_state *perf_cgroup_create(
5846 struct sysdev_class_attribute *attr, 7371 struct cgroup_subsys *ss, struct cgroup *cont)
5847 const char *buf, size_t count)
5848{ 7372{
5849 unsigned long val; 7373 struct perf_cgroup *jc;
5850 int err;
5851 7374
5852 err = strict_strtoul(buf, 10, &val); 7375 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
5853 if (err) 7376 if (!jc)
5854 return err; 7377 return ERR_PTR(-ENOMEM);
5855 if (val > 1)
5856 return -EINVAL;
5857 7378
5858 spin_lock(&perf_resource_lock); 7379 jc->info = alloc_percpu(struct perf_cgroup_info);
5859 perf_overcommit = val; 7380 if (!jc->info) {
5860 spin_unlock(&perf_resource_lock); 7381 kfree(jc);
7382 return ERR_PTR(-ENOMEM);
7383 }
5861 7384
5862 return count; 7385 return &jc->css;
5863} 7386}
5864 7387
5865static SYSDEV_CLASS_ATTR( 7388static void perf_cgroup_destroy(struct cgroup_subsys *ss,
5866 reserve_percpu, 7389 struct cgroup *cont)
5867 0644, 7390{
5868 perf_show_reserve_percpu, 7391 struct perf_cgroup *jc;
5869 perf_set_reserve_percpu 7392 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
5870 ); 7393 struct perf_cgroup, css);
5871 7394 free_percpu(jc->info);
5872static SYSDEV_CLASS_ATTR( 7395 kfree(jc);
5873 overcommit, 7396}
5874 0644,
5875 perf_show_overcommit,
5876 perf_set_overcommit
5877 );
5878 7397
5879static struct attribute *perfclass_attrs[] = { 7398static int __perf_cgroup_move(void *info)
5880 &attr_reserve_percpu.attr, 7399{
5881 &attr_overcommit.attr, 7400 struct task_struct *task = info;
5882 NULL 7401 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
5883}; 7402 return 0;
7403}
5884 7404
5885static struct attribute_group perfclass_attr_group = { 7405static void
5886 .attrs = perfclass_attrs, 7406perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
5887 .name = "perf_events", 7407{
5888}; 7408 task_function_call(task, __perf_cgroup_move, task);
7409}
5889 7410
5890static int __init perf_event_sysfs_init(void) 7411static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7412 struct cgroup *old_cgrp, struct task_struct *task)
5891{ 7413{
5892 return sysfs_create_group(&cpu_sysdev_class.kset.kobj, 7414 /*
5893 &perfclass_attr_group); 7415 * cgroup_exit() is called in the copy_process() failure path.
7416 * Ignore this case since the task hasn't ran yet, this avoids
7417 * trying to poke a half freed task state from generic code.
7418 */
7419 if (!(task->flags & PF_EXITING))
7420 return;
7421
7422 perf_cgroup_attach_task(cgrp, task);
5894} 7423}
5895device_initcall(perf_event_sysfs_init); 7424
7425struct cgroup_subsys perf_subsys = {
7426 .name = "perf_event",
7427 .subsys_id = perf_subsys_id,
7428 .create = perf_cgroup_create,
7429 .destroy = perf_cgroup_destroy,
7430 .exit = perf_cgroup_exit,
7431 .attach_task = perf_cgroup_attach_task,
7432};
7433#endif /* CONFIG_CGROUP_PERF */
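
A hedged sketch of how the new perf_event cgroup controller can be driven from user space: assuming the controller is mounted at /sys/fs/cgroup/perf_event and a group named "mygrp" already exists, the cgroup directory fd is passed in place of a pid together with PERF_FLAG_PID_CGROUP (cgroup events are per-CPU, so cpu must name a real CPU). The fallback define covers older headers that lack the flag.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#ifndef PERF_FLAG_PID_CGROUP
#define PERF_FLAG_PID_CGROUP    (1U << 2)
#endif

int main(void)
{
        struct perf_event_attr attr;
        long long count;
        int cgrp_fd, fd;

        /* assumed mount point and pre-existing group */
        cgrp_fd = open("/sys/fs/cgroup/perf_event/mygrp", O_RDONLY);
        if (cgrp_fd < 0) {
                perror("open cgroup dir");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;

        /* the cgroup fd takes the place of a pid; cpu 0 here */
        fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
                     PERF_FLAG_PID_CGROUP);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        sleep(1);
        read(fd, &count, sizeof(count));
        printf("context switches on cpu0 for the cgroup: %lld\n", count);
        return 0;
}
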
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c7c2aed9e2dc..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
113 */ 113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct perf_event_context *ctx = bp->ctx; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->ctx == ctx && find_slot_idx(iter) == type) 121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
122 count += hw_breakpoint_weight(iter); 122 count += hw_breakpoint_weight(iter);
123 } 123 }
124 124
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
134 enum bp_type_idx type) 134 enum bp_type_idx type)
135{ 135{
136 int cpu = bp->cpu; 136 int cpu = bp->cpu;
137 struct task_struct *tsk = bp->ctx->task; 137 struct task_struct *tsk = bp->hw.bp_target;
138 138
139 if (cpu >= 0) { 139 if (cpu >= 0) {
140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); 140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
213 int weight) 213 int weight)
214{ 214{
215 int cpu = bp->cpu; 215 int cpu = bp->cpu;
216 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->hw.bp_target;
217 217
218 /* Pinned counter cpu profiling */ 218 /* Pinned counter cpu profiling */
219 if (!tsk) { 219 if (!tsk) {
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 struct task_struct *tsk) 434 struct task_struct *tsk)
435{ 435{
436 return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), 436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
437 triggered);
438} 437}
439EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
440 439
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
516 get_online_cpus(); 515 get_online_cpus();
517 for_each_online_cpu(cpu) { 516 for_each_online_cpu(cpu) {
518 pevent = per_cpu_ptr(cpu_events, cpu); 517 pevent = per_cpu_ptr(cpu_events, cpu);
519 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
520 519
521 *pevent = bp; 520 *pevent = bp;
522 521
@@ -566,7 +565,62 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
566 .priority = 0x7fffffff 565 .priority = 0x7fffffff
567}; 566};
568 567
569static int __init init_hw_breakpoint(void) 568static void bp_perf_event_destroy(struct perf_event *event)
569{
570 release_bp_slot(event);
571}
572
573static int hw_breakpoint_event_init(struct perf_event *bp)
574{
575 int err;
576
577 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
578 return -ENOENT;
579
580 err = register_perf_hw_breakpoint(bp);
581 if (err)
582 return err;
583
584 bp->destroy = bp_perf_event_destroy;
585
586 return 0;
587}
588
589static int hw_breakpoint_add(struct perf_event *bp, int flags)
590{
591 if (!(flags & PERF_EF_START))
592 bp->hw.state = PERF_HES_STOPPED;
593
594 return arch_install_hw_breakpoint(bp);
595}
596
597static void hw_breakpoint_del(struct perf_event *bp, int flags)
598{
599 arch_uninstall_hw_breakpoint(bp);
600}
601
602static void hw_breakpoint_start(struct perf_event *bp, int flags)
603{
604 bp->hw.state = 0;
605}
606
607static void hw_breakpoint_stop(struct perf_event *bp, int flags)
608{
609 bp->hw.state = PERF_HES_STOPPED;
610}
611
612static struct pmu perf_breakpoint = {
613 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
614
615 .event_init = hw_breakpoint_event_init,
616 .add = hw_breakpoint_add,
617 .del = hw_breakpoint_del,
618 .start = hw_breakpoint_start,
619 .stop = hw_breakpoint_stop,
620 .read = hw_breakpoint_pmu_read,
621};
622
623int __init init_hw_breakpoint(void)
570{ 624{
571 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
572 int cpu, err_cpu; 626 int cpu, err_cpu;
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void)
587 641
588 constraints_initialized = 1; 642 constraints_initialized = 1;
589 643
644 perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
645
590 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
591 647
592 err_alloc: 648 err_alloc:
@@ -599,11 +655,5 @@ static int __init init_hw_breakpoint(void)
599 655
600 return -ENOMEM; 656 return -ENOMEM;
601} 657}
602core_initcall(init_hw_breakpoint);
603 658
604 659
605struct pmu perf_ops_bp = {
606 .enable = arch_install_hw_breakpoint,
607 .disable = arch_uninstall_hw_breakpoint,
608 .read = hw_breakpoint_pmu_read,
609};
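
With the breakpoint logic now expressed as a regular pmu, a hardware watchpoint can still be requested from user space through perf_event_open(). A minimal sketch (the watched variable and the LEN_4 choice are arbitrary; depending on perf_event_paranoid this may need elevated privileges):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static int watched;     /* the word being watched */

int main(void)
{
        struct perf_event_attr attr;
        long long hits;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_BREAKPOINT;       /* routed to the "breakpoint" pmu */
        attr.bp_type = HW_BREAKPOINT_W;
        attr.bp_addr = (unsigned long)&watched;
        attr.bp_len = HW_BREAKPOINT_LEN_4;

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        watched = 1;    /* each write bumps the counter */
        watched = 2;

        read(fd, &hits, sizeof(hits));
        printf("writes observed: %lld\n", hits);
        return 0;
}
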
diff --git a/kernel/exit.c b/kernel/exit.c
index b9d3bc6c21ec..64879bdff921 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -70,7 +71,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
70 71
71 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
72 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
73 __get_cpu_var(process_counts)--; 74 __this_cpu_dec(process_counts);
74 } 75 }
75 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
76} 77}
@@ -97,6 +98,14 @@ static void __exit_signal(struct task_struct *tsk)
97 sig->tty = NULL; 98 sig->tty = NULL;
98 } else { 99 } else {
99 /* 100 /*
101 * This can only happen if the caller is de_thread().
 102 * FIXME: this is a temporary hack; we should teach
103 * posix-cpu-timers to handle this case correctly.
104 */
105 if (unlikely(has_group_leader_pid(tsk)))
106 posix_cpu_timers_exit_group(tsk);
107
108 /*
100 * If there is any task waiting for the group exit 109 * If there is any task waiting for the group exit
101 * then notify it: 110 * then notify it:
102 */ 111 */
@@ -151,9 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
151{ 160{
152 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 161 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
153 162
154#ifdef CONFIG_PERF_EVENTS 163 perf_event_delayed_put(tsk);
155 WARN_ON_ONCE(tsk->perf_event_ctxp);
156#endif
157 trace_sched_process_free(tsk); 164 trace_sched_process_free(tsk);
158 put_task_struct(tsk); 165 put_task_struct(tsk);
159} 166}
@@ -556,29 +563,28 @@ void exit_files(struct task_struct *tsk)
556 563
557#ifdef CONFIG_MM_OWNER 564#ifdef CONFIG_MM_OWNER
558/* 565/*
559 * Task p is exiting and it owned mm, lets find a new owner for it 566 * A task is exiting. If it owned this mm, find a new owner for the mm.
560 */ 567 */
561static inline int
562mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
563{
564 /*
565 * If there are other users of the mm and the owner (us) is exiting
566 * we need to find a new owner to take on the responsibility.
567 */
568 if (atomic_read(&mm->mm_users) <= 1)
569 return 0;
570 if (mm->owner != p)
571 return 0;
572 return 1;
573}
574
575void mm_update_next_owner(struct mm_struct *mm) 568void mm_update_next_owner(struct mm_struct *mm)
576{ 569{
577 struct task_struct *c, *g, *p = current; 570 struct task_struct *c, *g, *p = current;
578 571
579retry: 572retry:
580 if (!mm_need_new_owner(mm, p)) 573 /*
574 * If the exiting or execing task is not the owner, it's
575 * someone else's problem.
576 */
577 if (mm->owner != p)
578 return;
579 /*
580 * The current owner is exiting/execing and there are no other
581 * candidates. Do not leave the mm pointing to a possibly
582 * freed task structure.
583 */
584 if (atomic_read(&mm->mm_users) <= 1) {
585 mm->owner = NULL;
581 return; 586 return;
587 }
582 588
583 read_lock(&tasklist_lock); 589 read_lock(&tasklist_lock);
584 /* 590 /*
@@ -691,6 +697,8 @@ static void exit_mm(struct task_struct * tsk)
691 enter_lazy_tlb(mm, current); 697 enter_lazy_tlb(mm, current);
692 /* We don't want this task to be frozen prematurely */ 698 /* We don't want this task to be frozen prematurely */
693 clear_freeze_flag(tsk); 699 clear_freeze_flag(tsk);
700 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
701 atomic_dec(&mm->oom_disable_count);
694 task_unlock(tsk); 702 task_unlock(tsk);
695 mm_update_next_owner(mm); 703 mm_update_next_owner(mm);
696 mmput(mm); 704 mmput(mm);
@@ -704,6 +712,8 @@ static void exit_mm(struct task_struct * tsk)
704 * space. 712 * space.
705 */ 713 */
706static struct task_struct *find_new_reaper(struct task_struct *father) 714static struct task_struct *find_new_reaper(struct task_struct *father)
715 __releases(&tasklist_lock)
716 __acquires(&tasklist_lock)
707{ 717{
708 struct pid_namespace *pid_ns = task_active_pid_ns(father); 718 struct pid_namespace *pid_ns = task_active_pid_ns(father);
709 struct task_struct *thread; 719 struct task_struct *thread;
@@ -832,7 +842,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
832 /* Let father know we died 842 /* Let father know we died
833 * 843 *
834 * Thread signals are configurable, but you aren't going to use 844 * Thread signals are configurable, but you aren't going to use
835 * that to send signals to arbitary processes. 845 * that to send signals to arbitrary processes.
836 * That stops right now. 846 * That stops right now.
837 * 847 *
838 * If the parent exec id doesn't match the exec id we saved 848 * If the parent exec id doesn't match the exec id we saved
@@ -899,12 +909,22 @@ NORET_TYPE void do_exit(long code)
899 profile_task_exit(tsk); 909 profile_task_exit(tsk);
900 910
901 WARN_ON(atomic_read(&tsk->fs_excl)); 911 WARN_ON(atomic_read(&tsk->fs_excl));
912 WARN_ON(blk_needs_flush_plug(tsk));
902 913
903 if (unlikely(in_interrupt())) 914 if (unlikely(in_interrupt()))
904 panic("Aiee, killing interrupt handler!"); 915 panic("Aiee, killing interrupt handler!");
905 if (unlikely(!tsk->pid)) 916 if (unlikely(!tsk->pid))
906 panic("Attempted to kill the idle task!"); 917 panic("Attempted to kill the idle task!");
907 918
919 /*
 920 * If do_exit is called because this process oopsed, it's possible
921 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
922 * continuing. Amongst other possible reasons, this is to prevent
923 * mm_release()->clear_child_tid() from writing to a user-controlled
924 * kernel address.
925 */
926 set_fs(USER_DS);
927
908 tracehook_report_exit(&code); 928 tracehook_report_exit(&code);
909 929
910 validate_creds_for_do_exit(tsk); 930 validate_creds_for_do_exit(tsk);
@@ -978,6 +998,15 @@ NORET_TYPE void do_exit(long code)
978 exit_fs(tsk); 998 exit_fs(tsk);
979 check_stack_usage(); 999 check_stack_usage();
980 exit_thread(); 1000 exit_thread();
1001
1002 /*
1003 * Flush inherited counters to the parent - before the parent
1004 * gets woken up by child-exit notifications.
1005 *
1006 * because of cgroup mode, must be called before cgroup_exit()
1007 */
1008 perf_event_exit_task(tsk);
1009
981 cgroup_exit(tsk, 1); 1010 cgroup_exit(tsk, 1);
982 1011
983 if (group_dead) 1012 if (group_dead)
@@ -990,12 +1019,7 @@ NORET_TYPE void do_exit(long code)
990 /* 1019 /*
991 * FIXME: do that only when needed, using sched_exit tracepoint 1020 * FIXME: do that only when needed, using sched_exit tracepoint
992 */ 1021 */
993 flush_ptrace_hw_breakpoint(tsk); 1022 ptrace_put_breakpoints(tsk);
994 /*
995 * Flush inherited counters to the parent - before the parent
996 * gets woken up by child-exit notifications.
997 */
998 perf_event_exit_task(tsk);
999 1023
1000 exit_notify(tsk, group_dead); 1024 exit_notify(tsk, group_dead);
1001#ifdef CONFIG_NUMA 1025#ifdef CONFIG_NUMA
@@ -1356,11 +1380,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1356 return NULL; 1380 return NULL;
1357} 1381}
1358 1382
1359/* 1383/**
1360 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold 1384 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1361 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1385 * @wo: wait options
1362 * the lock and this task is uninteresting. If we return nonzero, we have 1386 * @ptrace: is the wait for ptrace
1363 * released the lock and the system call should return. 1387 * @p: task to wait for
1388 *
1389 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1390 *
1391 * CONTEXT:
1392 * read_lock(&tasklist_lock), which is released if return value is
1393 * non-zero. Also, grabs and releases @p->sighand->siglock.
1394 *
1395 * RETURNS:
1396 * 0 if wait condition didn't exist and search for other wait conditions
1397 * should continue. Non-zero return, -errno on failure and @p's pid on
1398 * success, implies that tasklist_lock is released and wait condition
1399 * search should terminate.
1364 */ 1400 */
1365static int wait_task_stopped(struct wait_opts *wo, 1401static int wait_task_stopped(struct wait_opts *wo,
1366 int ptrace, struct task_struct *p) 1402 int ptrace, struct task_struct *p)
@@ -1376,6 +1412,9 @@ static int wait_task_stopped(struct wait_opts *wo,
1376 if (!ptrace && !(wo->wo_flags & WUNTRACED)) 1412 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1377 return 0; 1413 return 0;
1378 1414
1415 if (!task_stopped_code(p, ptrace))
1416 return 0;
1417
1379 exit_code = 0; 1418 exit_code = 0;
1380 spin_lock_irq(&p->sighand->siglock); 1419 spin_lock_irq(&p->sighand->siglock);
1381 1420
@@ -1517,33 +1556,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1517 return 0; 1556 return 0;
1518 } 1557 }
1519 1558
1520 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1559 /* dead body doesn't have much to contribute */
1560 if (p->exit_state == EXIT_DEAD)
1561 return 0;
1562
1563 /* slay zombie? */
1564 if (p->exit_state == EXIT_ZOMBIE) {
1521 /* 1565 /*
1522 * This child is hidden by ptrace. 1566 * A zombie ptracee is only visible to its ptracer.
1523 * We aren't allowed to see it now, but eventually we will. 1567 * Notification and reaping will be cascaded to the real
1568 * parent when the ptracer detaches.
1569 */
1570 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1571 /* it will become visible, clear notask_error */
1572 wo->notask_error = 0;
1573 return 0;
1574 }
1575
1576 /* we don't reap group leaders with subthreads */
1577 if (!delay_group_leader(p))
1578 return wait_task_zombie(wo, p);
1579
1580 /*
1581 * Allow access to stopped/continued state via zombie by
1582 * falling through. Clearing of notask_error is complex.
1583 *
1584 * When !@ptrace:
1585 *
1586 * If WEXITED is set, notask_error should naturally be
1587 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1588 * so, if there are live subthreads, there are events to
1589 * wait for. If all subthreads are dead, it's still safe
1590 * to clear - this function will be called again in finite
1591 * amount time once all the subthreads are released and
1592 * will then return without clearing.
1593 *
1594 * When @ptrace:
1595 *
1596 * Stopped state is per-task and thus can't change once the
1597 * target task dies. Only continued and exited can happen.
1598 * Clear notask_error if WCONTINUED | WEXITED.
1599 */
1600 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1601 wo->notask_error = 0;
1602 } else {
1603 /*
1604 * If @p is ptraced by a task in its real parent's group,
1605 * hide group stop/continued state when looking at @p as
1606 * the real parent; otherwise, a single stop can be
1607 * reported twice as group and ptrace stops.
1608 *
1609 * If a ptracer wants to distinguish the two events for its
1610 * own children, it should create a separate process which
1611 * takes the role of real parent.
1612 */
1613 if (likely(!ptrace) && task_ptrace(p) &&
1614 same_thread_group(p->parent, p->real_parent))
1615 return 0;
1616
1617 /*
1618 * @p is alive and it's gonna stop, continue or exit, so
1619 * there always is something to wait for.
1524 */ 1620 */
1525 wo->notask_error = 0; 1621 wo->notask_error = 0;
1526 return 0;
1527 } 1622 }
1528 1623
1529 if (p->exit_state == EXIT_DEAD)
1530 return 0;
1531
1532 /* 1624 /*
1533 * We don't reap group leaders with subthreads. 1625 * Wait for stopped. Depending on @ptrace, different stopped state
1626 * is used and the two don't interact with each other.
1534 */ 1627 */
1535 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1628 ret = wait_task_stopped(wo, ptrace, p);
1536 return wait_task_zombie(wo, p); 1629 if (ret)
1630 return ret;
1537 1631
1538 /* 1632 /*
1539 * It's stopped or running now, so it might 1633 * Wait for continued. There's only one continued state and the
1540 * later continue, exit, or stop again. 1634 * ptracer can consume it which can confuse the real parent. Don't
1635 * use WCONTINUED from ptracer. You don't need or want it.
1541 */ 1636 */
1542 wo->notask_error = 0;
1543
1544 if (task_stopped_code(p, ptrace))
1545 return wait_task_stopped(wo, ptrace, p);
1546
1547 return wait_task_continued(wo, p); 1637 return wait_task_continued(wo, p);
1548} 1638}
1549 1639
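
The stopped/continued states that this rework juggles are the ones user space observes through waitpid(). A small illustrative program, independent of the patch itself:

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        int status;
        pid_t pid = fork();

        if (pid == 0) {                 /* child just idles */
                pause();
                _exit(0);
        }

        kill(pid, SIGSTOP);
        waitpid(pid, &status, WUNTRACED);
        if (WIFSTOPPED(status))
                printf("child stopped by signal %d\n", WSTOPSIG(status));

        kill(pid, SIGCONT);
        waitpid(pid, &status, WCONTINUED);
        if (WIFCONTINUED(status))
                printf("child continued\n");

        kill(pid, SIGKILL);
        waitpid(pid, &status, 0);       /* reap the zombie */
        return 0;
}
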
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f8524..5339705b8241 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr)
72 return 0; 72 return 0;
73} 73}
74 74
75/**
76 * core_kernel_data - tell if addr points to kernel data
77 * @addr: address to test
78 *
79 * Returns true if @addr passed in is from the core kernel data
80 * section.
81 *
82 * Note: On some archs it may return true for core RODATA, and false
83 * for others. But will always be true for core RW data.
84 */
85int core_kernel_data(unsigned long addr)
86{
87 if (addr >= (unsigned long)_sdata &&
88 addr < (unsigned long)_edata)
89 return 1;
90 return 0;
91}
92
75int __kernel_text_address(unsigned long addr) 93int __kernel_text_address(unsigned long addr)
76{ 94{
77 if (core_kernel_text(addr)) 95 if (core_kernel_text(addr))
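
The new helper is aimed at callers that must decide whether a structure handed to them lives in static kernel data (ftrace, for instance, uses it to distinguish statically defined ops from dynamically allocated ones). A short, hypothetical fragment — not a standalone module, and the ops_is_core_static() name is made up — showing the intended pattern:

#include <linux/kernel.h>

/* hypothetical helper: true when "ops" sits in the core kernel's .data
 * section, i.e. it was neither allocated dynamically nor defined in a module */
static bool ops_is_core_static(void *ops)
{
        return core_kernel_data((unsigned long)ops);
}
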
diff --git a/kernel/fork.c b/kernel/fork.c
index ab7f29d906c7..25c6111fe3a6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/tracehook.h> 40#include <linux/tracehook.h>
41#include <linux/futex.h> 41#include <linux/futex.h>
42#include <linux/compat.h> 42#include <linux/compat.h>
43#include <linux/kthread.h>
43#include <linux/task_io_accounting_ops.h> 44#include <linux/task_io_accounting_ops.h>
44#include <linux/rcupdate.h> 45#include <linux/rcupdate.h>
45#include <linux/ptrace.h> 46#include <linux/ptrace.h>
@@ -58,13 +59,14 @@
58#include <linux/taskstats_kern.h> 59#include <linux/taskstats_kern.h>
59#include <linux/random.h> 60#include <linux/random.h>
60#include <linux/tty.h> 61#include <linux/tty.h>
61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h>
69#include <linux/khugepaged.h>
68 70
69#include <asm/pgtable.h> 71#include <asm/pgtable.h>
70#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -110,20 +112,25 @@ int nr_processes(void)
110} 112}
111 113
112#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 114#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
113# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 115# define alloc_task_struct_node(node) \
114# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 116 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
117# define free_task_struct(tsk) \
118 kmem_cache_free(task_struct_cachep, (tsk))
115static struct kmem_cache *task_struct_cachep; 119static struct kmem_cache *task_struct_cachep;
116#endif 120#endif
117 121
118#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR 122#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
119static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) 123static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
124 int node)
120{ 125{
121#ifdef CONFIG_DEBUG_STACK_USAGE 126#ifdef CONFIG_DEBUG_STACK_USAGE
122 gfp_t mask = GFP_KERNEL | __GFP_ZERO; 127 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
123#else 128#else
124 gfp_t mask = GFP_KERNEL; 129 gfp_t mask = GFP_KERNEL;
125#endif 130#endif
126 return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); 131 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
132
133 return page ? page_address(page) : NULL;
127} 134}
128 135
129static inline void free_thread_info(struct thread_info *ti) 136static inline void free_thread_info(struct thread_info *ti)
@@ -171,6 +178,7 @@ EXPORT_SYMBOL(free_task);
171static inline void free_signal_struct(struct signal_struct *sig) 178static inline void free_signal_struct(struct signal_struct *sig)
172{ 179{
173 taskstats_tgid_free(sig); 180 taskstats_tgid_free(sig);
181 sched_autogroup_exit(sig);
174 kmem_cache_free(signal_cachep, sig); 182 kmem_cache_free(signal_cachep, sig);
175} 183}
176 184
@@ -194,6 +202,7 @@ void __put_task_struct(struct task_struct *tsk)
194 if (!profile_handoff_task(tsk)) 202 if (!profile_handoff_task(tsk))
195 free_task(tsk); 203 free_task(tsk);
196} 204}
205EXPORT_SYMBOL_GPL(__put_task_struct);
197 206
198/* 207/*
199 * macro override instead of weak attribute alias, to workaround 208 * macro override instead of weak attribute alias, to workaround
@@ -249,16 +258,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
249 struct task_struct *tsk; 258 struct task_struct *tsk;
250 struct thread_info *ti; 259 struct thread_info *ti;
251 unsigned long *stackend; 260 unsigned long *stackend;
252 261 int node = tsk_fork_get_node(orig);
253 int err; 262 int err;
254 263
255 prepare_to_copy(orig); 264 prepare_to_copy(orig);
256 265
257 tsk = alloc_task_struct(); 266 tsk = alloc_task_struct_node(node);
258 if (!tsk) 267 if (!tsk)
259 return NULL; 268 return NULL;
260 269
261 ti = alloc_thread_info(tsk); 270 ti = alloc_thread_info_node(tsk, node);
262 if (!ti) { 271 if (!ti) {
263 free_task_struct(tsk); 272 free_task_struct(tsk);
264 return NULL; 273 return NULL;
@@ -279,6 +288,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
279 288
280 setup_thread_stack(tsk, orig); 289 setup_thread_stack(tsk, orig);
281 clear_user_return_notifier(tsk); 290 clear_user_return_notifier(tsk);
291 clear_tsk_need_resched(tsk);
282 stackend = end_of_stack(tsk); 292 stackend = end_of_stack(tsk);
283 *stackend = STACK_END_MAGIC; /* for overflow detection */ 293 *stackend = STACK_END_MAGIC; /* for overflow detection */
284 294
@@ -334,6 +344,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
334 retval = ksm_fork(mm, oldmm); 344 retval = ksm_fork(mm, oldmm);
335 if (retval) 345 if (retval)
336 goto out; 346 goto out;
347 retval = khugepaged_fork(mm, oldmm);
348 if (retval)
349 goto out;
337 350
338 prev = NULL; 351 prev = NULL;
339 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 352 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -376,15 +389,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
376 get_file(file); 389 get_file(file);
377 if (tmp->vm_flags & VM_DENYWRITE) 390 if (tmp->vm_flags & VM_DENYWRITE)
378 atomic_dec(&inode->i_writecount); 391 atomic_dec(&inode->i_writecount);
379 spin_lock(&mapping->i_mmap_lock); 392 mutex_lock(&mapping->i_mmap_mutex);
380 if (tmp->vm_flags & VM_SHARED) 393 if (tmp->vm_flags & VM_SHARED)
381 mapping->i_mmap_writable++; 394 mapping->i_mmap_writable++;
382 tmp->vm_truncate_count = mpnt->vm_truncate_count;
383 flush_dcache_mmap_lock(mapping); 395 flush_dcache_mmap_lock(mapping);
384 /* insert tmp into the share list, just after mpnt */ 396 /* insert tmp into the share list, just after mpnt */
385 vma_prio_tree_add(tmp, mpnt); 397 vma_prio_tree_add(tmp, mpnt);
386 flush_dcache_mmap_unlock(mapping); 398 flush_dcache_mmap_unlock(mapping);
387 spin_unlock(&mapping->i_mmap_lock); 399 mutex_unlock(&mapping->i_mmap_mutex);
388 } 400 }
389 401
390 /* 402 /*
@@ -495,6 +507,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
495 mm->cached_hole_size = ~0UL; 507 mm->cached_hole_size = ~0UL;
496 mm_init_aio(mm); 508 mm_init_aio(mm);
497 mm_init_owner(mm, p); 509 mm_init_owner(mm, p);
510 atomic_set(&mm->oom_disable_count, 0);
498 511
499 if (likely(!mm_alloc_pgd(mm))) { 512 if (likely(!mm_alloc_pgd(mm))) {
500 mm->def_flags = 0; 513 mm->def_flags = 0;
@@ -514,11 +527,12 @@ struct mm_struct * mm_alloc(void)
514 struct mm_struct * mm; 527 struct mm_struct * mm;
515 528
516 mm = allocate_mm(); 529 mm = allocate_mm();
517 if (mm) { 530 if (!mm)
518 memset(mm, 0, sizeof(*mm)); 531 return NULL;
519 mm = mm_init(mm, current); 532
520 } 533 memset(mm, 0, sizeof(*mm));
521 return mm; 534 mm_init_cpumask(mm);
535 return mm_init(mm, current);
522} 536}
523 537
524/* 538/*
@@ -532,6 +546,9 @@ void __mmdrop(struct mm_struct *mm)
532 mm_free_pgd(mm); 546 mm_free_pgd(mm);
533 destroy_context(mm); 547 destroy_context(mm);
534 mmu_notifier_mm_destroy(mm); 548 mmu_notifier_mm_destroy(mm);
549#ifdef CONFIG_TRANSPARENT_HUGEPAGE
550 VM_BUG_ON(mm->pmd_huge_pte);
551#endif
535 free_mm(mm); 552 free_mm(mm);
536} 553}
537EXPORT_SYMBOL_GPL(__mmdrop); 554EXPORT_SYMBOL_GPL(__mmdrop);
@@ -546,6 +563,7 @@ void mmput(struct mm_struct *mm)
546 if (atomic_dec_and_test(&mm->mm_users)) { 563 if (atomic_dec_and_test(&mm->mm_users)) {
547 exit_aio(mm); 564 exit_aio(mm);
548 ksm_exit(mm); 565 ksm_exit(mm);
566 khugepaged_exit(mm); /* must run before exit_mmap */
549 exit_mmap(mm); 567 exit_mmap(mm);
550 set_mm_exe_file(mm, NULL); 568 set_mm_exe_file(mm, NULL);
551 if (!list_empty(&mm->mmlist)) { 569 if (!list_empty(&mm->mmlist)) {
@@ -561,6 +579,57 @@ void mmput(struct mm_struct *mm)
561} 579}
562EXPORT_SYMBOL_GPL(mmput); 580EXPORT_SYMBOL_GPL(mmput);
563 581
582/*
583 * We added or removed a vma mapping the executable. The vmas are only mapped
584 * during exec and are not mapped with the mmap system call.
585 * Callers must hold down_write() on the mm's mmap_sem for these
586 */
587void added_exe_file_vma(struct mm_struct *mm)
588{
589 mm->num_exe_file_vmas++;
590}
591
592void removed_exe_file_vma(struct mm_struct *mm)
593{
594 mm->num_exe_file_vmas--;
595 if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
596 fput(mm->exe_file);
597 mm->exe_file = NULL;
598 }
599
600}
601
602void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
603{
604 if (new_exe_file)
605 get_file(new_exe_file);
606 if (mm->exe_file)
607 fput(mm->exe_file);
608 mm->exe_file = new_exe_file;
609 mm->num_exe_file_vmas = 0;
610}
611
612struct file *get_mm_exe_file(struct mm_struct *mm)
613{
614 struct file *exe_file;
615
616 /* We need mmap_sem to protect against races with removal of
617 * VM_EXECUTABLE vmas */
618 down_read(&mm->mmap_sem);
619 exe_file = mm->exe_file;
620 if (exe_file)
621 get_file(exe_file);
622 up_read(&mm->mmap_sem);
623 return exe_file;
624}
625
626static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
627{
628 /* It's safe to write the exe_file pointer without exe_file_lock because
629 * this is called during fork when the task is not yet in /proc */
630 newmm->exe_file = get_mm_exe_file(oldmm);
631}
632
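
These exe_file helpers track the struct file of the executable mapping and, among other things, back the /proc/<pid>/exe symlink. From user space the reference is simply visible as a readlink target, e.g.:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char path[4096];
        ssize_t n = readlink("/proc/self/exe", path, sizeof(path) - 1);

        if (n < 0) {
                perror("readlink");
                return 1;
        }
        path[n] = '\0';
        printf("running from %s\n", path);
        return 0;
}
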
564/** 633/**
565 * get_task_mm - acquire a reference to the task's mm 634 * get_task_mm - acquire a reference to the task's mm
566 * 635 *
@@ -667,11 +736,16 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
667 goto fail_nomem; 736 goto fail_nomem;
668 737
669 memcpy(mm, oldmm, sizeof(*mm)); 738 memcpy(mm, oldmm, sizeof(*mm));
739 mm_init_cpumask(mm);
670 740
671 /* Initializing for Swap token stuff */ 741 /* Initializing for Swap token stuff */
672 mm->token_priority = 0; 742 mm->token_priority = 0;
673 mm->last_interval = 0; 743 mm->last_interval = 0;
674 744
745#ifdef CONFIG_TRANSPARENT_HUGEPAGE
746 mm->pmd_huge_pte = NULL;
747#endif
748
675 if (!mm_init(mm, tsk)) 749 if (!mm_init(mm, tsk))
676 goto fail_nomem; 750 goto fail_nomem;
677 751
@@ -748,6 +822,8 @@ good_mm:
748 /* Initializing for Swap token stuff */ 822 /* Initializing for Swap token stuff */
749 mm->token_priority = 0; 823 mm->token_priority = 0;
750 mm->last_interval = 0; 824 mm->last_interval = 0;
825 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
826 atomic_inc(&mm->oom_disable_count);
751 827
752 tsk->mm = mm; 828 tsk->mm = mm;
753 tsk->active_mm = mm; 829 tsk->active_mm = mm;
@@ -907,9 +983,17 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
907 posix_cpu_timers_init_group(sig); 983 posix_cpu_timers_init_group(sig);
908 984
909 tty_audit_fork(sig); 985 tty_audit_fork(sig);
986 sched_autogroup_fork(sig);
987
988#ifdef CONFIG_CGROUPS
989 init_rwsem(&sig->threadgroup_fork_lock);
990#endif
910 991
911 sig->oom_adj = current->signal->oom_adj; 992 sig->oom_adj = current->signal->oom_adj;
912 sig->oom_score_adj = current->signal->oom_score_adj; 993 sig->oom_score_adj = current->signal->oom_score_adj;
994 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
995
996 mutex_init(&sig->cred_guard_mutex);
913 997
914 return 0; 998 return 0;
915} 999}
@@ -1081,12 +1165,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1081 1165
1082 posix_cpu_timers_init(p); 1166 posix_cpu_timers_init(p);
1083 1167
1084 p->lock_depth = -1; /* -1 = no lock */
1085 do_posix_clock_monotonic_gettime(&p->start_time); 1168 do_posix_clock_monotonic_gettime(&p->start_time);
1086 p->real_start_time = p->start_time; 1169 p->real_start_time = p->start_time;
1087 monotonic_to_bootbased(&p->real_start_time); 1170 monotonic_to_bootbased(&p->real_start_time);
1088 p->io_context = NULL; 1171 p->io_context = NULL;
1089 p->audit_context = NULL; 1172 p->audit_context = NULL;
1173 if (clone_flags & CLONE_THREAD)
1174 threadgroup_fork_read_lock(current);
1090 cgroup_fork(p); 1175 cgroup_fork(p);
1091#ifdef CONFIG_NUMA 1176#ifdef CONFIG_NUMA
1092 p->mempolicy = mpol_dup(p->mempolicy); 1177 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1131,7 +1216,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1131#endif 1216#endif
1132 1217
1133 /* Perform scheduler related setup. Assign this task to a CPU. */ 1218 /* Perform scheduler related setup. Assign this task to a CPU. */
1134 sched_fork(p, clone_flags); 1219 sched_fork(p);
1135 1220
1136 retval = perf_event_init_task(p); 1221 retval = perf_event_init_task(p);
1137 if (retval) 1222 if (retval)
@@ -1165,12 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1165 pid = alloc_pid(p->nsproxy->pid_ns); 1250 pid = alloc_pid(p->nsproxy->pid_ns);
1166 if (!pid) 1251 if (!pid)
1167 goto bad_fork_cleanup_io; 1252 goto bad_fork_cleanup_io;
1168
1169 if (clone_flags & CLONE_NEWPID) {
1170 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1171 if (retval < 0)
1172 goto bad_fork_free_pid;
1173 }
1174 } 1253 }
1175 1254
1176 p->pid = pid_nr(pid); 1255 p->pid = pid_nr(pid);
@@ -1178,17 +1257,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1178 if (clone_flags & CLONE_THREAD) 1257 if (clone_flags & CLONE_THREAD)
1179 p->tgid = current->tgid; 1258 p->tgid = current->tgid;
1180 1259
1181 if (current->nsproxy != p->nsproxy) {
1182 retval = ns_cgroup_clone(p, pid);
1183 if (retval)
1184 goto bad_fork_free_pid;
1185 }
1186
1187 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1260 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1188 /* 1261 /*
1189 * Clear TID on mm_release()? 1262 * Clear TID on mm_release()?
1190 */ 1263 */
1191 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1264 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1265#ifdef CONFIG_BLOCK
1266 p->plug = NULL;
1267#endif
1192#ifdef CONFIG_FUTEX 1268#ifdef CONFIG_FUTEX
1193 p->robust_list = NULL; 1269 p->robust_list = NULL;
1194#ifdef CONFIG_COMPAT 1270#ifdef CONFIG_COMPAT
@@ -1274,7 +1350,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1274 tracehook_finish_clone(p, clone_flags, trace); 1350 tracehook_finish_clone(p, clone_flags, trace);
1275 1351
1276 if (thread_group_leader(p)) { 1352 if (thread_group_leader(p)) {
1277 if (clone_flags & CLONE_NEWPID) 1353 if (is_child_reaper(pid))
1278 p->nsproxy->pid_ns->child_reaper = p; 1354 p->nsproxy->pid_ns->child_reaper = p;
1279 1355
1280 p->signal->leader_pid = pid; 1356 p->signal->leader_pid = pid;
@@ -1283,7 +1359,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1283 attach_pid(p, PIDTYPE_SID, task_session(current)); 1359 attach_pid(p, PIDTYPE_SID, task_session(current));
1284 list_add_tail(&p->sibling, &p->real_parent->children); 1360 list_add_tail(&p->sibling, &p->real_parent->children);
1285 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1361 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1286 __get_cpu_var(process_counts)++; 1362 __this_cpu_inc(process_counts);
1287 } 1363 }
1288 attach_pid(p, PIDTYPE_PID, pid); 1364 attach_pid(p, PIDTYPE_PID, pid);
1289 nr_threads++; 1365 nr_threads++;
@@ -1294,6 +1370,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1294 write_unlock_irq(&tasklist_lock); 1370 write_unlock_irq(&tasklist_lock);
1295 proc_fork_connector(p); 1371 proc_fork_connector(p);
1296 cgroup_post_fork(p); 1372 cgroup_post_fork(p);
1373 if (clone_flags & CLONE_THREAD)
1374 threadgroup_fork_read_unlock(current);
1297 perf_event_fork(p); 1375 perf_event_fork(p);
1298 return p; 1376 return p;
1299 1377
@@ -1306,8 +1384,13 @@ bad_fork_cleanup_io:
1306bad_fork_cleanup_namespaces: 1384bad_fork_cleanup_namespaces:
1307 exit_task_namespaces(p); 1385 exit_task_namespaces(p);
1308bad_fork_cleanup_mm: 1386bad_fork_cleanup_mm:
1309 if (p->mm) 1387 if (p->mm) {
1388 task_lock(p);
1389 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1390 atomic_dec(&p->mm->oom_disable_count);
1391 task_unlock(p);
1310 mmput(p->mm); 1392 mmput(p->mm);
1393 }
1311bad_fork_cleanup_signal: 1394bad_fork_cleanup_signal:
1312 if (!(clone_flags & CLONE_THREAD)) 1395 if (!(clone_flags & CLONE_THREAD))
1313 free_signal_struct(p->signal); 1396 free_signal_struct(p->signal);
@@ -1327,6 +1410,8 @@ bad_fork_cleanup_policy:
1327 mpol_put(p->mempolicy); 1410 mpol_put(p->mempolicy);
1328bad_fork_cleanup_cgroup: 1411bad_fork_cleanup_cgroup:
1329#endif 1412#endif
1413 if (clone_flags & CLONE_THREAD)
1414 threadgroup_fork_read_unlock(current);
1330 cgroup_exit(p, cgroup_callbacks_done); 1415 cgroup_exit(p, cgroup_callbacks_done);
1331 delayacct_tsk_free(p); 1416 delayacct_tsk_free(p);
1332 module_put(task_thread_info(p)->exec_domain->module); 1417 module_put(task_thread_info(p)->exec_domain->module);
@@ -1403,23 +1488,6 @@ long do_fork(unsigned long clone_flags,
1403 } 1488 }
1404 1489
1405 /* 1490 /*
1406 * We hope to recycle these flags after 2.6.26
1407 */
1408 if (unlikely(clone_flags & CLONE_STOPPED)) {
1409 static int __read_mostly count = 100;
1410
1411 if (count > 0 && printk_ratelimit()) {
1412 char comm[TASK_COMM_LEN];
1413
1414 count--;
1415 printk(KERN_INFO "fork(): process `%s' used deprecated "
1416 "clone flags 0x%lx\n",
1417 get_task_comm(comm, current),
1418 clone_flags & CLONE_STOPPED);
1419 }
1420 }
1421
1422 /*
1423 * When called from kernel_thread, don't do user tracing stuff. 1491 * When called from kernel_thread, don't do user tracing stuff.
1424 */ 1492 */
1425 if (likely(user_mode(regs))) 1493 if (likely(user_mode(regs)))
@@ -1457,16 +1525,7 @@ long do_fork(unsigned long clone_flags,
1457 */ 1525 */
1458 p->flags &= ~PF_STARTING; 1526 p->flags &= ~PF_STARTING;
1459 1527
1460 if (unlikely(clone_flags & CLONE_STOPPED)) { 1528 wake_up_new_task(p);
1461 /*
1462 * We'll start up with an immediate SIGSTOP.
1463 */
1464 sigaddset(&p->pending.signal, SIGSTOP);
1465 set_tsk_thread_flag(p, TIF_SIGPENDING);
1466 __set_task_state(p, TASK_STOPPED);
1467 } else {
1468 wake_up_new_task(p, clone_flags);
1469 }
1470 1529
1471 tracehook_report_clone_complete(trace, regs, 1530 tracehook_report_clone_complete(trace, regs,
1472 clone_flags, nr, p); 1531 clone_flags, nr, p);
@@ -1510,6 +1569,13 @@ void __init proc_caches_init(void)
1510 fs_cachep = kmem_cache_create("fs_cache", 1569 fs_cachep = kmem_cache_create("fs_cache",
1511 sizeof(struct fs_struct), 0, 1570 sizeof(struct fs_struct), 0,
1512 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1571 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1572 /*
1573 * FIXME! The "sizeof(struct mm_struct)" currently includes the
1574 * whole struct cpumask for the OFFSTACK case. We could change
1575 * this to *only* allocate as much of it as required by the
1576 * maximum number of CPU's we can ever have. The cpumask_allocation
1577 * is at the end of the structure, exactly for that reason.
1578 */
1513 mm_cachep = kmem_cache_create("mm_struct", 1579 mm_cachep = kmem_cache_create("mm_struct",
1514 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1580 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1515 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1581 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
@@ -1518,38 +1584,24 @@ void __init proc_caches_init(void)
1518} 1584}
1519 1585
1520/* 1586/*
1521 * Check constraints on flags passed to the unshare system call and 1587 * Check constraints on flags passed to the unshare system call.
1522 * force unsharing of additional process context as appropriate.
1523 */ 1588 */
1524static void check_unshare_flags(unsigned long *flags_ptr) 1589static int check_unshare_flags(unsigned long unshare_flags)
1525{ 1590{
1591 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1592 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1593 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1594 return -EINVAL;
1526 /* 1595 /*
1527 * If unsharing a thread from a thread group, must also 1596 * Not implemented, but pretend it works if there is nothing to
1528 * unshare vm. 1597 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
1529 */ 1598 * needs to unshare vm.
1530 if (*flags_ptr & CLONE_THREAD)
1531 *flags_ptr |= CLONE_VM;
1532
1533 /*
1534 * If unsharing vm, must also unshare signal handlers.
1535 */
1536 if (*flags_ptr & CLONE_VM)
1537 *flags_ptr |= CLONE_SIGHAND;
1538
1539 /*
1540 * If unsharing namespace, must also unshare filesystem information.
1541 */ 1599 */
1542 if (*flags_ptr & CLONE_NEWNS) 1600 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
1543 *flags_ptr |= CLONE_FS; 1601 /* FIXME: get_task_mm() increments ->mm_users */
1544} 1602 if (atomic_read(&current->mm->mm_users) > 1)
1545 1603 return -EINVAL;
1546/* 1604 }
1547 * Unsharing of tasks created with CLONE_THREAD is not supported yet
1548 */
1549static int unshare_thread(unsigned long unshare_flags)
1550{
1551 if (unshare_flags & CLONE_THREAD)
1552 return -EINVAL;
1553 1605
1554 return 0; 1606 return 0;
1555} 1607}
@@ -1576,34 +1628,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1576} 1628}
1577 1629
1578/* 1630/*
1579 * Unsharing of sighand is not supported yet
1580 */
1581static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1582{
1583 struct sighand_struct *sigh = current->sighand;
1584
1585 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1586 return -EINVAL;
1587 else
1588 return 0;
1589}
1590
1591/*
1592 * Unshare vm if it is being shared
1593 */
1594static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1595{
1596 struct mm_struct *mm = current->mm;
1597
1598 if ((unshare_flags & CLONE_VM) &&
1599 (mm && atomic_read(&mm->mm_users) > 1)) {
1600 return -EINVAL;
1601 }
1602
1603 return 0;
1604}
1605
1606/*
1607 * Unshare file descriptor table if it is being shared 1631 * Unshare file descriptor table if it is being shared
1608 */ 1632 */
1609static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) 1633static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1631,45 +1655,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1631 */ 1655 */
1632SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) 1656SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1633{ 1657{
1634 int err = 0;
1635 struct fs_struct *fs, *new_fs = NULL; 1658 struct fs_struct *fs, *new_fs = NULL;
1636 struct sighand_struct *new_sigh = NULL;
1637 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1638 struct files_struct *fd, *new_fd = NULL; 1659 struct files_struct *fd, *new_fd = NULL;
1639 struct nsproxy *new_nsproxy = NULL; 1660 struct nsproxy *new_nsproxy = NULL;
1640 int do_sysvsem = 0; 1661 int do_sysvsem = 0;
1662 int err;
1641 1663
1642 check_unshare_flags(&unshare_flags); 1664 err = check_unshare_flags(unshare_flags);
1643 1665 if (err)
1644 /* Return -EINVAL for all unsupported flags */
1645 err = -EINVAL;
1646 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1647 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1648 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1649 goto bad_unshare_out; 1666 goto bad_unshare_out;
1650 1667
1651 /* 1668 /*
1669 * If unsharing namespace, must also unshare filesystem information.
1670 */
1671 if (unshare_flags & CLONE_NEWNS)
1672 unshare_flags |= CLONE_FS;
1673 /*
1652 * CLONE_NEWIPC must also detach from the undolist: after switching 1674 * CLONE_NEWIPC must also detach from the undolist: after switching
1653 * to a new ipc namespace, the semaphore arrays from the old 1675 * to a new ipc namespace, the semaphore arrays from the old
1654 * namespace are unreachable. 1676 * namespace are unreachable.
1655 */ 1677 */
1656 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1678 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1657 do_sysvsem = 1; 1679 do_sysvsem = 1;
1658 if ((err = unshare_thread(unshare_flags)))
1659 goto bad_unshare_out;
1660 if ((err = unshare_fs(unshare_flags, &new_fs))) 1680 if ((err = unshare_fs(unshare_flags, &new_fs)))
1661 goto bad_unshare_cleanup_thread; 1681 goto bad_unshare_out;
1662 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1663 goto bad_unshare_cleanup_fs;
1664 if ((err = unshare_vm(unshare_flags, &new_mm)))
1665 goto bad_unshare_cleanup_sigh;
1666 if ((err = unshare_fd(unshare_flags, &new_fd))) 1682 if ((err = unshare_fd(unshare_flags, &new_fd)))
1667 goto bad_unshare_cleanup_vm; 1683 goto bad_unshare_cleanup_fs;
1668 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1684 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1669 new_fs))) 1685 new_fs)))
1670 goto bad_unshare_cleanup_fd; 1686 goto bad_unshare_cleanup_fd;
1671 1687
1672 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { 1688 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
1673 if (do_sysvsem) { 1689 if (do_sysvsem) {
1674 /* 1690 /*
1675 * CLONE_SYSVSEM is equivalent to sys_exit(). 1691 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1695,15 +1711,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1695 spin_unlock(&fs->lock); 1711 spin_unlock(&fs->lock);
1696 } 1712 }
1697 1713
1698 if (new_mm) {
1699 mm = current->mm;
1700 active_mm = current->active_mm;
1701 current->mm = new_mm;
1702 current->active_mm = new_mm;
1703 activate_mm(active_mm, new_mm);
1704 new_mm = mm;
1705 }
1706
1707 if (new_fd) { 1714 if (new_fd) {
1708 fd = current->files; 1715 fd = current->files;
1709 current->files = new_fd; 1716 current->files = new_fd;
@@ -1720,20 +1727,10 @@ bad_unshare_cleanup_fd:
1720 if (new_fd) 1727 if (new_fd)
1721 put_files_struct(new_fd); 1728 put_files_struct(new_fd);
1722 1729
1723bad_unshare_cleanup_vm:
1724 if (new_mm)
1725 mmput(new_mm);
1726
1727bad_unshare_cleanup_sigh:
1728 if (new_sigh)
1729 if (atomic_dec_and_test(&new_sigh->count))
1730 kmem_cache_free(sighand_cachep, new_sigh);
1731
1732bad_unshare_cleanup_fs: 1730bad_unshare_cleanup_fs:
1733 if (new_fs) 1731 if (new_fs)
1734 free_fs_struct(new_fs); 1732 free_fs_struct(new_fs);
1735 1733
1736bad_unshare_cleanup_thread:
1737bad_unshare_out: 1734bad_unshare_out:
1738 return err; 1735 return err;
1739} 1736}
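
The simplified check_unshare_flags() above still accepts the namespace flags. A small user-space example exercising one of them (requires CAP_SYS_ADMIN; the "sandbox" hostname is arbitrary):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/utsname.h>

int main(void)
{
        struct utsname uts;

        if (unshare(CLONE_NEWUTS) < 0) {        /* one of the accepted flags */
                perror("unshare");              /* needs CAP_SYS_ADMIN */
                return 1;
        }
        sethostname("sandbox", strlen("sandbox"));
        uname(&uts);
        printf("hostname inside the new UTS namespace: %s\n", uts.nodename);
        return 0;
}
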
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -17,7 +17,7 @@ static inline void frozen_process(void)
17{ 17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN; 19 current->flags |= PF_FROZEN;
20 wmb(); 20 smp_wmb();
21 } 21 }
22 clear_freeze_flag(current); 22 clear_freeze_flag(current);
23} 23}
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only)
93 * the task as frozen and next clears its TIF_FREEZE. 93 * the task as frozen and next clears its TIF_FREEZE.
94 */ 94 */
95 if (!freezing(p)) { 95 if (!freezing(p)) {
96 rmb(); 96 smp_rmb();
97 if (frozen(p)) 97 if (frozen(p))
98 return false; 98 return false;
99 99
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
104 } 104 }
105 105
106 if (should_send_signal(p)) { 106 if (should_send_signal(p)) {
107 if (!signal_pending(p)) 107 fake_signal_wake_up(p);
108 fake_signal_wake_up(p); 108 /*
109 * fake_signal_wake_up() goes through p's scheduler
110 * lock and guarantees that TASK_STOPPED/TRACED ->
111 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks().
113 */
109 } else if (sig_only) { 114 } else if (sig_only) {
110 return false; 115 return false;
111 } else { 116 } else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..fe28dc282eae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
70 70
71/* 71/*
72 * Futex flags used to encode options to functions and preserve them across
73 * restarts.
74 */
75#define FLAGS_SHARED 0x01
76#define FLAGS_CLOCKRT 0x02
77#define FLAGS_HAS_TIMEOUT 0x04
78
79/*
72 * Priority Inheritance state: 80 * Priority Inheritance state:
73 */ 81 */
74struct futex_pi_state { 82struct futex_pi_state {
@@ -91,6 +99,7 @@ struct futex_pi_state {
91 99
92/** 100/**
93 * struct futex_q - The hashed futex queue entry, one per waiting task 101 * struct futex_q - The hashed futex queue entry, one per waiting task
102 * @list: priority-sorted list of tasks waiting on this futex
94 * @task: the task waiting on the futex 103 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock 104 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on 105 * @key: the key the futex is hashed on
@@ -104,7 +113,7 @@ struct futex_pi_state {
104 * 113 *
105 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 114 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 115 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
107 * The order of wakup is always to make the first condition true, then 116 * The order of wakeup is always to make the first condition true, then
108 * the second. 117 * the second.
109 * 118 *
110 * PI futexes are typically woken before they are removed from the hash list via 119 * PI futexes are typically woken before they are removed from the hash list via
@@ -122,6 +131,12 @@ struct futex_q {
122 u32 bitset; 131 u32 bitset;
123}; 132};
124 133
134static const struct futex_q futex_q_init = {
135 /* list gets initialized in queue_me()*/
136 .key = FUTEX_KEY_INIT,
137 .bitset = FUTEX_BITSET_MATCH_ANY
138};
139
125/* 140/*
126 * Hash buckets are shared by all the futex_keys that hash to the same 141 * Hash buckets are shared by all the futex_keys that hash to the same
127 * location. Each key may have multiple futex_q structures, one for each task 142 * location. Each key may have multiple futex_q structures, one for each task
@@ -168,7 +183,7 @@ static void get_futex_key_refs(union futex_key *key)
168 183
169 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 184 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
170 case FUT_OFF_INODE: 185 case FUT_OFF_INODE:
171 atomic_inc(&key->shared.inode->i_count); 186 ihold(key->shared.inode);
172 break; 187 break;
173 case FUT_OFF_MMSHARED: 188 case FUT_OFF_MMSHARED:
174 atomic_inc(&key->private.mm->mm_count); 189 atomic_inc(&key->private.mm->mm_count);
@@ -218,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
218{ 233{
219 unsigned long address = (unsigned long)uaddr; 234 unsigned long address = (unsigned long)uaddr;
220 struct mm_struct *mm = current->mm; 235 struct mm_struct *mm = current->mm;
221 struct page *page; 236 struct page *page, *page_head;
222 int err; 237 int err;
223 238
224 /* 239 /*
@@ -250,11 +265,46 @@ again:
250 if (err < 0) 265 if (err < 0)
251 return err; 266 return err;
252 267
253 page = compound_head(page); 268#ifdef CONFIG_TRANSPARENT_HUGEPAGE
254 lock_page(page); 269 page_head = page;
255 if (!page->mapping) { 270 if (unlikely(PageTail(page))) {
256 unlock_page(page);
257 put_page(page); 271 put_page(page);
272 /* serialize against __split_huge_page_splitting() */
273 local_irq_disable();
274 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
275 page_head = compound_head(page);
276 /*
277 * page_head is valid pointer but we must pin
278 * it before taking the PG_lock and/or
279 * PG_compound_lock. The moment we re-enable
280 * irqs __split_huge_page_splitting() can
281 * return and the head page can be freed from
282 * under us. We can't take the PG_lock and/or
283 * PG_compound_lock on a page that could be
284 * freed from under us.
285 */
286 if (page != page_head) {
287 get_page(page_head);
288 put_page(page);
289 }
290 local_irq_enable();
291 } else {
292 local_irq_enable();
293 goto again;
294 }
295 }
296#else
297 page_head = compound_head(page);
298 if (page != page_head) {
299 get_page(page_head);
300 put_page(page);
301 }
302#endif
303
304 lock_page(page_head);
305 if (!page_head->mapping) {
306 unlock_page(page_head);
307 put_page(page_head);
258 goto again; 308 goto again;
259 } 309 }
260 310
@@ -265,25 +315,24 @@ again:
265 * it's a read-only handle, it's expected that futexes attach to 315 * it's a read-only handle, it's expected that futexes attach to
266 * the object not the particular process. 316 * the object not the particular process.
267 */ 317 */
268 if (PageAnon(page)) { 318 if (PageAnon(page_head)) {
269 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
270 key->private.mm = mm; 320 key->private.mm = mm;
271 key->private.address = address; 321 key->private.address = address;
272 } else { 322 } else {
273 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 323 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
274 key->shared.inode = page->mapping->host; 324 key->shared.inode = page_head->mapping->host;
275 key->shared.pgoff = page->index; 325 key->shared.pgoff = page_head->index;
276 } 326 }
277 327
278 get_futex_key_refs(key); 328 get_futex_key_refs(key);
279 329
280 unlock_page(page); 330 unlock_page(page_head);
281 put_page(page); 331 put_page(page_head);
282 return 0; 332 return 0;
283} 333}
284 334
285static inline 335static inline void put_futex_key(union futex_key *key)
286void put_futex_key(int fshared, union futex_key *key)
287{ 336{
288 drop_futex_key_refs(key); 337 drop_futex_key_refs(key);
289} 338}
@@ -295,7 +344,7 @@ void put_futex_key(int fshared, union futex_key *key)
295 * Slow path to fixup the fault we just took in the atomic write 344 * Slow path to fixup the fault we just took in the atomic write
296 * access to @uaddr. 345 * access to @uaddr.
297 * 346 *
298 * We have no generic implementation of a non destructive write to the 347 * We have no generic implementation of a non-destructive write to the
299 * user address. We know that we faulted in the atomic pagefault 348 * user address. We know that we faulted in the atomic pagefault
300 * disabled section so we can as well avoid the #PF overhead by 349 * disabled section so we can as well avoid the #PF overhead by
301 * calling get_user_pages() right away. 350 * calling get_user_pages() right away.
@@ -332,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
332 return NULL; 381 return NULL;
333} 382}
334 383
335static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 384static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
385 u32 uval, u32 newval)
336{ 386{
337 u32 curval; 387 int ret;
338 388
339 pagefault_disable(); 389 pagefault_disable();
340 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 390 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
341 pagefault_enable(); 391 pagefault_enable();
342 392
343 return curval; 393 return ret;
344} 394}
345 395
346static int get_futex_value_locked(u32 *dest, u32 __user *from) 396static int get_futex_value_locked(u32 *dest, u32 __user *from)
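
Review note: the hunk above changes cmpxchg_futex_value_locked() from returning the old futex value (with -EFAULT overloaded onto that same u32) to returning an error code and storing the old value through an out-parameter, so a fault can no longer be confused with a futex that legitimately contains the bit pattern of -EFAULT. A hedged userspace model of the new calling convention; fake_user_cmpxchg stands in for futex_atomic_cmpxchg_inatomic() and is not a real kernel API.

#include <stdatomic.h>
#include <stdio.h>

#define EFAULT 14

static atomic_uint futex_word = 0;

/* Stand-in for futex_atomic_cmpxchg_inatomic(): returns 0 or -EFAULT,
 * and reports the value found at the address via *curval.             */
static int fake_user_cmpxchg(unsigned int *curval, atomic_uint *uaddr,
			     unsigned int uval, unsigned int newval)
{
	if (!uaddr)
		return -EFAULT;            /* models the page-fault case */

	unsigned int expected = uval;
	atomic_compare_exchange_strong(uaddr, &expected, newval);
	*curval = expected;                /* old value, whatever it was */
	return 0;
}

int main(void)
{
	unsigned int curval, tid = 1234;

	/* New-style caller: the return value is only success/failure,
	 * curval carries the data. Compare futex_lock_pi_atomic() above. */
	if (fake_user_cmpxchg(&curval, &futex_word, 0, tid))
		return EFAULT;
	if (curval != 0)
		printf("lost the race, owner tid %u\n",
		       curval & 0x3fffffff);          /* ~ FUTEX_TID_MASK */
	else
		printf("acquired, word now %u\n", atomic_load(&futex_word));
	return 0;
}
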
@@ -515,7 +565,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
515 */ 565 */
516 pi_state = this->pi_state; 566 pi_state = this->pi_state;
517 /* 567 /*
518 * Userspace might have messed up non PI and PI futexes 568 * Userspace might have messed up non-PI and PI futexes
519 */ 569 */
520 if (unlikely(!pi_state)) 570 if (unlikely(!pi_state))
521 return -EINVAL; 571 return -EINVAL;
@@ -625,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
625 struct task_struct *task, int set_waiters) 675 struct task_struct *task, int set_waiters)
626{ 676{
627 int lock_taken, ret, ownerdied = 0; 677 int lock_taken, ret, ownerdied = 0;
628 u32 uval, newval, curval; 678 u32 uval, newval, curval, vpid = task_pid_vnr(task);
629 679
630retry: 680retry:
631 ret = lock_taken = 0; 681 ret = lock_taken = 0;
@@ -635,19 +685,17 @@ retry:
635 * (by doing a 0 -> TID atomic cmpxchg), while holding all 685 * (by doing a 0 -> TID atomic cmpxchg), while holding all
636 * the locks. It will most likely not succeed. 686 * the locks. It will most likely not succeed.
637 */ 687 */
638 newval = task_pid_vnr(task); 688 newval = vpid;
639 if (set_waiters) 689 if (set_waiters)
640 newval |= FUTEX_WAITERS; 690 newval |= FUTEX_WAITERS;
641 691
642 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 692 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
643
644 if (unlikely(curval == -EFAULT))
645 return -EFAULT; 693 return -EFAULT;
646 694
647 /* 695 /*
648 * Detect deadlocks. 696 * Detect deadlocks.
649 */ 697 */
650 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) 698 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
651 return -EDEADLK; 699 return -EDEADLK;
652 700
653 /* 701 /*
@@ -674,14 +722,12 @@ retry:
674 */ 722 */
675 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 723 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
676 /* Keep the OWNER_DIED bit */ 724 /* Keep the OWNER_DIED bit */
677 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); 725 newval = (curval & ~FUTEX_TID_MASK) | vpid;
678 ownerdied = 0; 726 ownerdied = 0;
679 lock_taken = 1; 727 lock_taken = 1;
680 } 728 }
681 729
682 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 730 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
683
684 if (unlikely(curval == -EFAULT))
685 return -EFAULT; 731 return -EFAULT;
686 if (unlikely(curval != uval)) 732 if (unlikely(curval != uval))
687 goto retry; 733 goto retry;
@@ -726,6 +772,24 @@ retry:
726 return ret; 772 return ret;
727} 773}
728 774
775/**
776 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
777 * @q: The futex_q to unqueue
778 *
779 * The q->lock_ptr must not be NULL and must be held by the caller.
780 */
781static void __unqueue_futex(struct futex_q *q)
782{
783 struct futex_hash_bucket *hb;
784
785 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
786 || WARN_ON(plist_node_empty(&q->list)))
787 return;
788
789 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
790 plist_del(&q->list, &hb->chain);
791}
792
729/* 793/*
730 * The hash bucket lock must be held when this is called. 794 * The hash bucket lock must be held when this is called.
731 * Afterwards, the futex_q must not be accessed. 795 * Afterwards, the futex_q must not be accessed.
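
Review note: the new __unqueue_futex() above recovers the futex_hash_bucket from q->lock_ptr with container_of() instead of keeping a separate back-pointer, which is also what allows the q->list.plist back-pointer bookkeeping to be deleted further down. A self-contained sketch of that pointer arithmetic; the struct names here are illustrative, not the kernel's.

#include <stddef.h>
#include <stdio.h>

/* Same idea as the kernel's container_of() macro. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bucket {
	int lock;          /* stand-in for spinlock_t      */
	int nr_waiters;    /* stand-in for the plist chain */
};

struct waiter {
	int *lock_ptr;     /* points at some bucket's lock */
};

int main(void)
{
	struct bucket hb = { .lock = 0, .nr_waiters = 1 };
	struct waiter q  = { .lock_ptr = &hb.lock };

	/* Given only q.lock_ptr, recover the enclosing bucket, exactly
	 * like __unqueue_futex() does before plist_del() on hb->chain. */
	struct bucket *found = container_of(q.lock_ptr, struct bucket, lock);

	found->nr_waiters--;
	printf("bucket %p now has %d waiters\n",
	       (void *)found, found->nr_waiters);
	return 0;
}
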
@@ -736,14 +800,14 @@ static void wake_futex(struct futex_q *q)
736 800
737 /* 801 /*
738 * We set q->lock_ptr = NULL _before_ we wake up the task. If 802 * We set q->lock_ptr = NULL _before_ we wake up the task. If
739 * a non futex wake up happens on another CPU then the task 803 * a non-futex wake up happens on another CPU then the task
740 * might exit and p would dereference a non existing task 804 * might exit and p would dereference a non-existing task
741 * struct. Prevent this by holding a reference on p across the 805 * struct. Prevent this by holding a reference on p across the
742 * wake up. 806 * wake up.
743 */ 807 */
744 get_task_struct(p); 808 get_task_struct(p);
745 809
746 plist_del(&q->list, &q->list.plist); 810 __unqueue_futex(q);
747 /* 811 /*
748 * The waiting task can free the futex_q as soon as 812 * The waiting task can free the futex_q as soon as
749 * q->lock_ptr = NULL is written, without taking any locks. A 813 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -777,10 +841,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
777 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 841 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
778 842
779 /* 843 /*
780 * This happens when we have stolen the lock and the original 844 * It is possible that the next waiter (the one that brought
781 * pending owner did not enqueue itself back on the rt_mutex. 845 * this owner to the kernel) timed out and is no longer
782 * Thats not a tragedy. We know that way, that a lock waiter 846 * waiting on the lock.
783 * is on the fly. We make the futex_q waiter the pending owner.
784 */ 847 */
785 if (!new_owner) 848 if (!new_owner)
786 new_owner = this->task; 849 new_owner = this->task;
@@ -795,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
795 858
796 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 859 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
797 860
798 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 861 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
799
800 if (curval == -EFAULT)
801 ret = -EFAULT; 862 ret = -EFAULT;
802 else if (curval != uval) 863 else if (curval != uval)
803 ret = -EINVAL; 864 ret = -EINVAL;
@@ -832,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
832 * There is no waiter, so we unlock the futex. The owner died 893 * There is no waiter, so we unlock the futex. The owner died
833 * bit has not to be preserved here. We are the owner: 894 * bit has not to be preserved here. We are the owner:
834 */ 895 */
835 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 896 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
836 897 return -EFAULT;
837 if (oldval == -EFAULT)
838 return oldval;
839 if (oldval != uval) 898 if (oldval != uval)
840 return -EAGAIN; 899 return -EAGAIN;
841 900
@@ -869,7 +928,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
869/* 928/*
870 * Wake up waiters matching bitset queued on this futex (uaddr). 929 * Wake up waiters matching bitset queued on this futex (uaddr).
871 */ 930 */
872static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) 931static int
932futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
873{ 933{
874 struct futex_hash_bucket *hb; 934 struct futex_hash_bucket *hb;
875 struct futex_q *this, *next; 935 struct futex_q *this, *next;
@@ -880,7 +940,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
880 if (!bitset) 940 if (!bitset)
881 return -EINVAL; 941 return -EINVAL;
882 942
883 ret = get_futex_key(uaddr, fshared, &key); 943 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
884 if (unlikely(ret != 0)) 944 if (unlikely(ret != 0))
885 goto out; 945 goto out;
886 946
@@ -906,7 +966,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
906 } 966 }
907 967
908 spin_unlock(&hb->lock); 968 spin_unlock(&hb->lock);
909 put_futex_key(fshared, &key); 969 put_futex_key(&key);
910out: 970out:
911 return ret; 971 return ret;
912} 972}
@@ -916,7 +976,7 @@ out:
916 * to this virtual address: 976 * to this virtual address:
917 */ 977 */
918static int 978static int
919futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 979futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
920 int nr_wake, int nr_wake2, int op) 980 int nr_wake, int nr_wake2, int op)
921{ 981{
922 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 982 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -926,10 +986,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
926 int ret, op_ret; 986 int ret, op_ret;
927 987
928retry: 988retry:
929 ret = get_futex_key(uaddr1, fshared, &key1); 989 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
930 if (unlikely(ret != 0)) 990 if (unlikely(ret != 0))
931 goto out; 991 goto out;
932 ret = get_futex_key(uaddr2, fshared, &key2); 992 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
933 if (unlikely(ret != 0)) 993 if (unlikely(ret != 0))
934 goto out_put_key1; 994 goto out_put_key1;
935 995
@@ -961,11 +1021,11 @@ retry_private:
961 if (ret) 1021 if (ret)
962 goto out_put_keys; 1022 goto out_put_keys;
963 1023
964 if (!fshared) 1024 if (!(flags & FLAGS_SHARED))
965 goto retry_private; 1025 goto retry_private;
966 1026
967 put_futex_key(fshared, &key2); 1027 put_futex_key(&key2);
968 put_futex_key(fshared, &key1); 1028 put_futex_key(&key1);
969 goto retry; 1029 goto retry;
970 } 1030 }
971 1031
@@ -995,9 +1055,9 @@ retry_private:
995 1055
996 double_unlock_hb(hb1, hb2); 1056 double_unlock_hb(hb1, hb2);
997out_put_keys: 1057out_put_keys:
998 put_futex_key(fshared, &key2); 1058 put_futex_key(&key2);
999out_put_key1: 1059out_put_key1:
1000 put_futex_key(fshared, &key1); 1060 put_futex_key(&key1);
1001out: 1061out:
1002 return ret; 1062 return ret;
1003} 1063}
@@ -1022,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1022 plist_del(&q->list, &hb1->chain); 1082 plist_del(&q->list, &hb1->chain);
1023 plist_add(&q->list, &hb2->chain); 1083 plist_add(&q->list, &hb2->chain);
1024 q->lock_ptr = &hb2->lock; 1084 q->lock_ptr = &hb2->lock;
1025#ifdef CONFIG_DEBUG_PI_LIST
1026 q->list.plist.spinlock = &hb2->lock;
1027#endif
1028 } 1085 }
1029 get_futex_key_refs(key2); 1086 get_futex_key_refs(key2);
1030 q->key = *key2; 1087 q->key = *key2;
@@ -1051,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1051 get_futex_key_refs(key); 1108 get_futex_key_refs(key);
1052 q->key = *key; 1109 q->key = *key;
1053 1110
1054 WARN_ON(plist_node_empty(&q->list)); 1111 __unqueue_futex(q);
1055 plist_del(&q->list, &q->list.plist);
1056 1112
1057 WARN_ON(!q->rt_waiter); 1113 WARN_ON(!q->rt_waiter);
1058 q->rt_waiter = NULL; 1114 q->rt_waiter = NULL;
1059 1115
1060 q->lock_ptr = &hb->lock; 1116 q->lock_ptr = &hb->lock;
1061#ifdef CONFIG_DEBUG_PI_LIST
1062 q->list.plist.spinlock = &hb->lock;
1063#endif
1064 1117
1065 wake_up_state(q->task, TASK_NORMAL); 1118 wake_up_state(q->task, TASK_NORMAL);
1066} 1119}
@@ -1131,12 +1184,14 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1131 1184
1132/** 1185/**
1133 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1186 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1134 * uaddr1: source futex user address 1187 * @uaddr1: source futex user address
1135 * uaddr2: target futex user address 1188 * @flags: futex flags (FLAGS_SHARED, etc.)
1136 * nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1189 * @uaddr2: target futex user address
1137 * nr_requeue: number of waiters to requeue (0-INT_MAX) 1190 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1138 * requeue_pi: if we are attempting to requeue from a non-pi futex to a 1191 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1139 * pi futex (pi to pi requeue is not supported) 1192 * @cmpval: @uaddr1 expected value (or %NULL)
1193 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1194 * pi futex (pi to pi requeue is not supported)
1140 * 1195 *
1141 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1196 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1142 * uaddr2 atomically on behalf of the top waiter. 1197 * uaddr2 atomically on behalf of the top waiter.
@@ -1145,9 +1200,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1145 * >=0 - on success, the number of tasks requeued or woken 1200 * >=0 - on success, the number of tasks requeued or woken
1146 * <0 - on error 1201 * <0 - on error
1147 */ 1202 */
1148static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1203static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1149 int nr_wake, int nr_requeue, u32 *cmpval, 1204 u32 __user *uaddr2, int nr_wake, int nr_requeue,
1150 int requeue_pi) 1205 u32 *cmpval, int requeue_pi)
1151{ 1206{
1152 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1207 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1153 int drop_count = 0, task_count = 0, ret; 1208 int drop_count = 0, task_count = 0, ret;
@@ -1188,10 +1243,10 @@ retry:
1188 pi_state = NULL; 1243 pi_state = NULL;
1189 } 1244 }
1190 1245
1191 ret = get_futex_key(uaddr1, fshared, &key1); 1246 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
1192 if (unlikely(ret != 0)) 1247 if (unlikely(ret != 0))
1193 goto out; 1248 goto out;
1194 ret = get_futex_key(uaddr2, fshared, &key2); 1249 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
1195 if (unlikely(ret != 0)) 1250 if (unlikely(ret != 0))
1196 goto out_put_key1; 1251 goto out_put_key1;
1197 1252
@@ -1213,11 +1268,11 @@ retry_private:
1213 if (ret) 1268 if (ret)
1214 goto out_put_keys; 1269 goto out_put_keys;
1215 1270
1216 if (!fshared) 1271 if (!(flags & FLAGS_SHARED))
1217 goto retry_private; 1272 goto retry_private;
1218 1273
1219 put_futex_key(fshared, &key2); 1274 put_futex_key(&key2);
1220 put_futex_key(fshared, &key1); 1275 put_futex_key(&key1);
1221 goto retry; 1276 goto retry;
1222 } 1277 }
1223 if (curval != *cmpval) { 1278 if (curval != *cmpval) {
@@ -1257,8 +1312,8 @@ retry_private:
1257 break; 1312 break;
1258 case -EFAULT: 1313 case -EFAULT:
1259 double_unlock_hb(hb1, hb2); 1314 double_unlock_hb(hb1, hb2);
1260 put_futex_key(fshared, &key2); 1315 put_futex_key(&key2);
1261 put_futex_key(fshared, &key1); 1316 put_futex_key(&key1);
1262 ret = fault_in_user_writeable(uaddr2); 1317 ret = fault_in_user_writeable(uaddr2);
1263 if (!ret) 1318 if (!ret)
1264 goto retry; 1319 goto retry;
@@ -1266,8 +1321,8 @@ retry_private:
1266 case -EAGAIN: 1321 case -EAGAIN:
1267 /* The owner was exiting, try again. */ 1322 /* The owner was exiting, try again. */
1268 double_unlock_hb(hb1, hb2); 1323 double_unlock_hb(hb1, hb2);
1269 put_futex_key(fshared, &key2); 1324 put_futex_key(&key2);
1270 put_futex_key(fshared, &key1); 1325 put_futex_key(&key1);
1271 cond_resched(); 1326 cond_resched();
1272 goto retry; 1327 goto retry;
1273 default: 1328 default:
@@ -1349,9 +1404,9 @@ out_unlock:
1349 drop_futex_key_refs(&key1); 1404 drop_futex_key_refs(&key1);
1350 1405
1351out_put_keys: 1406out_put_keys:
1352 put_futex_key(fshared, &key2); 1407 put_futex_key(&key2);
1353out_put_key1: 1408out_put_key1:
1354 put_futex_key(fshared, &key1); 1409 put_futex_key(&key1);
1355out: 1410out:
1356 if (pi_state != NULL) 1411 if (pi_state != NULL)
1357 free_pi_state(pi_state); 1412 free_pi_state(pi_state);
@@ -1360,10 +1415,10 @@ out:
1360 1415
1361/* The key must be already stored in q->key. */ 1416/* The key must be already stored in q->key. */
1362static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 1417static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1418 __acquires(&hb->lock)
1363{ 1419{
1364 struct futex_hash_bucket *hb; 1420 struct futex_hash_bucket *hb;
1365 1421
1366 get_futex_key_refs(&q->key);
1367 hb = hash_futex(&q->key); 1422 hb = hash_futex(&q->key);
1368 q->lock_ptr = &hb->lock; 1423 q->lock_ptr = &hb->lock;
1369 1424
@@ -1373,9 +1428,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1373 1428
1374static inline void 1429static inline void
1375queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1430queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1431 __releases(&hb->lock)
1376{ 1432{
1377 spin_unlock(&hb->lock); 1433 spin_unlock(&hb->lock);
1378 drop_futex_key_refs(&q->key);
1379} 1434}
1380 1435
1381/** 1436/**
@@ -1391,6 +1446,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1391 * an example). 1446 * an example).
1392 */ 1447 */
1393static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1448static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1449 __releases(&hb->lock)
1394{ 1450{
1395 int prio; 1451 int prio;
1396 1452
@@ -1405,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1405 prio = min(current->normal_prio, MAX_RT_PRIO); 1461 prio = min(current->normal_prio, MAX_RT_PRIO);
1406 1462
1407 plist_node_init(&q->list, prio); 1463 plist_node_init(&q->list, prio);
1408#ifdef CONFIG_DEBUG_PI_LIST
1409 q->list.plist.spinlock = &hb->lock;
1410#endif
1411 plist_add(&q->list, &hb->chain); 1464 plist_add(&q->list, &hb->chain);
1412 q->task = current; 1465 q->task = current;
1413 spin_unlock(&hb->lock); 1466 spin_unlock(&hb->lock);
@@ -1452,8 +1505,7 @@ retry:
1452 spin_unlock(lock_ptr); 1505 spin_unlock(lock_ptr);
1453 goto retry; 1506 goto retry;
1454 } 1507 }
1455 WARN_ON(plist_node_empty(&q->list)); 1508 __unqueue_futex(q);
1456 plist_del(&q->list, &q->list.plist);
1457 1509
1458 BUG_ON(q->pi_state); 1510 BUG_ON(q->pi_state);
1459 1511
@@ -1471,17 +1523,15 @@ retry:
1471 * and dropped here. 1523 * and dropped here.
1472 */ 1524 */
1473static void unqueue_me_pi(struct futex_q *q) 1525static void unqueue_me_pi(struct futex_q *q)
1526 __releases(q->lock_ptr)
1474{ 1527{
1475 WARN_ON(plist_node_empty(&q->list)); 1528 __unqueue_futex(q);
1476 plist_del(&q->list, &q->list.plist);
1477 1529
1478 BUG_ON(!q->pi_state); 1530 BUG_ON(!q->pi_state);
1479 free_pi_state(q->pi_state); 1531 free_pi_state(q->pi_state);
1480 q->pi_state = NULL; 1532 q->pi_state = NULL;
1481 1533
1482 spin_unlock(q->lock_ptr); 1534 spin_unlock(q->lock_ptr);
1483
1484 drop_futex_key_refs(&q->key);
1485} 1535}
1486 1536
1487/* 1537/*
@@ -1491,7 +1541,7 @@ static void unqueue_me_pi(struct futex_q *q)
1491 * private futexes. 1541 * private futexes.
1492 */ 1542 */
1493static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1543static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1494 struct task_struct *newowner, int fshared) 1544 struct task_struct *newowner)
1495{ 1545{
1496 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1546 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1497 struct futex_pi_state *pi_state = q->pi_state; 1547 struct futex_pi_state *pi_state = q->pi_state;
@@ -1505,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1505 1555
1506 /* 1556 /*
1507 * We are here either because we stole the rtmutex from the 1557 * We are here either because we stole the rtmutex from the
1508 * pending owner or we are the pending owner which failed to 1558 * previous highest priority waiter or we are the highest priority
1509 * get the rtmutex. We have to replace the pending owner TID 1559 * waiter but failed to get the rtmutex the first time.
1510 * in the user space variable. This must be atomic as we have 1560 * We have to replace the newowner TID in the user space variable.
1511 * to preserve the owner died bit here. 1561 * This must be atomic as we have to preserve the owner died bit here.
1512 * 1562 *
1513 * Note: We write the user space value _before_ changing the pi_state 1563 * Note: We write the user space value _before_ changing the pi_state
1514 * because we can fault here. Imagine swapped out pages or a fork 1564 * because we can fault here. Imagine swapped out pages or a fork
@@ -1527,9 +1577,7 @@ retry:
1527 while (1) { 1577 while (1) {
1528 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1578 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1529 1579
1530 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1580 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1531
1532 if (curval == -EFAULT)
1533 goto handle_fault; 1581 goto handle_fault;
1534 if (curval == uval) 1582 if (curval == uval)
1535 break; 1583 break;
@@ -1557,8 +1605,8 @@ retry:
1557 1605
1558 /* 1606 /*
1559 * To handle the page fault we need to drop the hash bucket 1607 * To handle the page fault we need to drop the hash bucket
1560 * lock here. That gives the other task (either the pending 1608 * lock here. That gives the other task (either the highest priority
1561 * owner itself or the task which stole the rtmutex) the 1609 * waiter itself or the task which stole the rtmutex) the
1562 * chance to try the fixup of the pi_state. So once we are 1610 * chance to try the fixup of the pi_state. So once we are
1563 * back from handling the fault we need to check the pi_state 1611 * back from handling the fault we need to check the pi_state
1564 * after reacquiring the hash bucket lock and before trying to 1612 * after reacquiring the hash bucket lock and before trying to
@@ -1584,20 +1632,11 @@ handle_fault:
1584 goto retry; 1632 goto retry;
1585} 1633}
1586 1634
1587/*
1588 * In case we must use restart_block to restart a futex_wait,
1589 * we encode in the 'flags' shared capability
1590 */
1591#define FLAGS_SHARED 0x01
1592#define FLAGS_CLOCKRT 0x02
1593#define FLAGS_HAS_TIMEOUT 0x04
1594
1595static long futex_wait_restart(struct restart_block *restart); 1635static long futex_wait_restart(struct restart_block *restart);
1596 1636
1597/** 1637/**
1598 * fixup_owner() - Post lock pi_state and corner case management 1638 * fixup_owner() - Post lock pi_state and corner case management
1599 * @uaddr: user address of the futex 1639 * @uaddr: user address of the futex
1600 * @fshared: whether the futex is shared (1) or not (0)
1601 * @q: futex_q (contains pi_state and access to the rt_mutex) 1640 * @q: futex_q (contains pi_state and access to the rt_mutex)
1602 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 1641 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1603 * 1642 *
@@ -1610,8 +1649,7 @@ static long futex_wait_restart(struct restart_block *restart);
1610 * 0 - success, lock not taken 1649 * 0 - success, lock not taken
1611 * <0 - on error (-EFAULT) 1650 * <0 - on error (-EFAULT)
1612 */ 1651 */
1613static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, 1652static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1614 int locked)
1615{ 1653{
1616 struct task_struct *owner; 1654 struct task_struct *owner;
1617 int ret = 0; 1655 int ret = 0;
@@ -1622,7 +1660,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1622 * did a lock-steal - fix up the PI-state in that case: 1660 * did a lock-steal - fix up the PI-state in that case:
1623 */ 1661 */
1624 if (q->pi_state->owner != current) 1662 if (q->pi_state->owner != current)
1625 ret = fixup_pi_state_owner(uaddr, q, current, fshared); 1663 ret = fixup_pi_state_owner(uaddr, q, current);
1626 goto out; 1664 goto out;
1627 } 1665 }
1628 1666
@@ -1644,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1644 /* 1682 /*
1645 * pi_state is incorrect, some other task did a lock steal and 1683 * pi_state is incorrect, some other task did a lock steal and
1646 * we returned due to timeout or signal without taking the 1684 * we returned due to timeout or signal without taking the
1647 * rt_mutex. Too late. We can access the rt_mutex_owner without 1685 * rt_mutex. Too late.
1648 * locking, as the other task is now blocked on the hash bucket
1649 * lock. Fix the state up.
1650 */ 1686 */
1687 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1651 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1688 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1652 ret = fixup_pi_state_owner(uaddr, q, owner, fshared); 1689 if (!owner)
1690 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1691 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1692 ret = fixup_pi_state_owner(uaddr, q, owner);
1653 goto out; 1693 goto out;
1654 } 1694 }
1655 1695
1656 /* 1696 /*
1657 * Paranoia check. If we did not take the lock, then we should not be 1697 * Paranoia check. If we did not take the lock, then we should not be
1658 * the owner, nor the pending owner, of the rt_mutex. 1698 * the owner of the rt_mutex.
1659 */ 1699 */
1660 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 1700 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1661 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 1701 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1712,7 +1752,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1712 * futex_wait_setup() - Prepare to wait on a futex 1752 * futex_wait_setup() - Prepare to wait on a futex
1713 * @uaddr: the futex userspace address 1753 * @uaddr: the futex userspace address
1714 * @val: the expected value 1754 * @val: the expected value
1715 * @fshared: whether the futex is shared (1) or not (0) 1755 * @flags: futex flags (FLAGS_SHARED, etc.)
1716 * @q: the associated futex_q 1756 * @q: the associated futex_q
1717 * @hb: storage for hash_bucket pointer to be returned to caller 1757 * @hb: storage for hash_bucket pointer to be returned to caller
1718 * 1758 *
@@ -1725,7 +1765,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1725 * 0 - uaddr contains val and hb has been locked 1765 * 0 - uaddr contains val and hb has been locked
1726 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1766 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
1727 */ 1767 */
1728static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, 1768static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1729 struct futex_q *q, struct futex_hash_bucket **hb) 1769 struct futex_q *q, struct futex_hash_bucket **hb)
1730{ 1770{
1731 u32 uval; 1771 u32 uval;
@@ -1740,17 +1780,17 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1740 * 1780 *
1741 * The basic logical guarantee of a futex is that it blocks ONLY 1781 * The basic logical guarantee of a futex is that it blocks ONLY
1742 * if cond(var) is known to be true at the time of blocking, for 1782 * if cond(var) is known to be true at the time of blocking, for
1743 * any cond. If we queued after testing *uaddr, that would open 1783 * any cond. If we locked the hash-bucket after testing *uaddr, that
1744 * a race condition where we could block indefinitely with 1784 * would open a race condition where we could block indefinitely with
1745 * cond(var) false, which would violate the guarantee. 1785 * cond(var) false, which would violate the guarantee.
1746 * 1786 *
1747 * A consequence is that futex_wait() can return zero and absorb 1787 * On the other hand, we insert q and release the hash-bucket only
1748 * a wakeup when *uaddr != val on entry to the syscall. This is 1788 * after testing *uaddr. This guarantees that futex_wait() will NOT
1749 * rare, but normal. 1789 * absorb a wakeup if *uaddr does not match the desired values
1790 * while the syscall executes.
1750 */ 1791 */
1751retry: 1792retry:
1752 q->key = FUTEX_KEY_INIT; 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
1753 ret = get_futex_key(uaddr, fshared, &q->key);
1754 if (unlikely(ret != 0)) 1794 if (unlikely(ret != 0))
1755 return ret; 1795 return ret;
1756 1796
@@ -1766,10 +1806,10 @@ retry_private:
1766 if (ret) 1806 if (ret)
1767 goto out; 1807 goto out;
1768 1808
1769 if (!fshared) 1809 if (!(flags & FLAGS_SHARED))
1770 goto retry_private; 1810 goto retry_private;
1771 1811
1772 put_futex_key(fshared, &q->key); 1812 put_futex_key(&q->key);
1773 goto retry; 1813 goto retry;
1774 } 1814 }
1775 1815
@@ -1780,40 +1820,40 @@ retry_private:
1780 1820
1781out: 1821out:
1782 if (ret) 1822 if (ret)
1783 put_futex_key(fshared, &q->key); 1823 put_futex_key(&q->key);
1784 return ret; 1824 return ret;
1785} 1825}
1786 1826
1787static int futex_wait(u32 __user *uaddr, int fshared, 1827static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1788 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1828 ktime_t *abs_time, u32 bitset)
1789{ 1829{
1790 struct hrtimer_sleeper timeout, *to = NULL; 1830 struct hrtimer_sleeper timeout, *to = NULL;
1791 struct restart_block *restart; 1831 struct restart_block *restart;
1792 struct futex_hash_bucket *hb; 1832 struct futex_hash_bucket *hb;
1793 struct futex_q q; 1833 struct futex_q q = futex_q_init;
1794 int ret; 1834 int ret;
1795 1835
1796 if (!bitset) 1836 if (!bitset)
1797 return -EINVAL; 1837 return -EINVAL;
1798
1799 q.pi_state = NULL;
1800 q.bitset = bitset; 1838 q.bitset = bitset;
1801 q.rt_waiter = NULL;
1802 q.requeue_pi_key = NULL;
1803 1839
1804 if (abs_time) { 1840 if (abs_time) {
1805 to = &timeout; 1841 to = &timeout;
1806 1842
1807 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 1843 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1808 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1844 CLOCK_REALTIME : CLOCK_MONOTONIC,
1845 HRTIMER_MODE_ABS);
1809 hrtimer_init_sleeper(to, current); 1846 hrtimer_init_sleeper(to, current);
1810 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 1847 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1811 current->timer_slack_ns); 1848 current->timer_slack_ns);
1812 } 1849 }
1813 1850
1814retry: 1851retry:
1815 /* Prepare to wait on uaddr. */ 1852 /*
1816 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1853 * Prepare to wait on uaddr. On success, holds hb lock and increments
1854 * q.key refs.
1855 */
1856 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1817 if (ret) 1857 if (ret)
1818 goto out; 1858 goto out;
1819 1859
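
Review note: in the futex_wait() hunk above the boolean clockrt parameter becomes a test of FLAGS_CLOCKRT when arming the absolute-expiry hrtimer. A small userspace analogue of picking the clock from a flag and waiting on an absolute deadline; it uses clock_nanosleep() rather than hrtimers, purely as an illustration of the CLOCK_REALTIME vs. CLOCK_MONOTONIC choice.

#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>

#define FLAGS_CLOCKRT 0x02   /* same bit as in the patch above */

/* Sleep until an absolute deadline on the clock selected by @flags,
 * loosely mirroring the clock choice made for the hrtimer_sleeper. */
static int wait_until(unsigned int flags, long delta_ns)
{
	clockid_t clk = (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME
						: CLOCK_MONOTONIC;
	struct timespec deadline;

	clock_gettime(clk, &deadline);
	deadline.tv_nsec += delta_ns;
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec  += 1;
		deadline.tv_nsec -= 1000000000L;
	}
	/* TIMER_ABSTIME ~ HRTIMER_MODE_ABS: the deadline is absolute. */
	return clock_nanosleep(clk, TIMER_ABSTIME, &deadline, NULL);
}

int main(void)
{
	int err = wait_until(FLAGS_CLOCKRT, 1000000);  /* 1 ms, realtime clock */

	printf("clock_nanosleep returned %d\n", err);
	return 0;
}
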
@@ -1822,42 +1862,34 @@ retry:
1822 1862
1823 /* If we were woken (and unqueued), we succeeded, whatever. */ 1863 /* If we were woken (and unqueued), we succeeded, whatever. */
1824 ret = 0; 1864 ret = 0;
1865 /* unqueue_me() drops q.key ref */
1825 if (!unqueue_me(&q)) 1866 if (!unqueue_me(&q))
1826 goto out_put_key; 1867 goto out;
1827 ret = -ETIMEDOUT; 1868 ret = -ETIMEDOUT;
1828 if (to && !to->task) 1869 if (to && !to->task)
1829 goto out_put_key; 1870 goto out;
1830 1871
1831 /* 1872 /*
1832 * We expect signal_pending(current), but we might be the 1873 * We expect signal_pending(current), but we might be the
1833 * victim of a spurious wakeup as well. 1874 * victim of a spurious wakeup as well.
1834 */ 1875 */
1835 if (!signal_pending(current)) { 1876 if (!signal_pending(current))
1836 put_futex_key(fshared, &q.key);
1837 goto retry; 1877 goto retry;
1838 }
1839 1878
1840 ret = -ERESTARTSYS; 1879 ret = -ERESTARTSYS;
1841 if (!abs_time) 1880 if (!abs_time)
1842 goto out_put_key; 1881 goto out;
1843 1882
1844 restart = &current_thread_info()->restart_block; 1883 restart = &current_thread_info()->restart_block;
1845 restart->fn = futex_wait_restart; 1884 restart->fn = futex_wait_restart;
1846 restart->futex.uaddr = (u32 *)uaddr; 1885 restart->futex.uaddr = uaddr;
1847 restart->futex.val = val; 1886 restart->futex.val = val;
1848 restart->futex.time = abs_time->tv64; 1887 restart->futex.time = abs_time->tv64;
1849 restart->futex.bitset = bitset; 1888 restart->futex.bitset = bitset;
1850 restart->futex.flags = FLAGS_HAS_TIMEOUT; 1889 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
1851
1852 if (fshared)
1853 restart->futex.flags |= FLAGS_SHARED;
1854 if (clockrt)
1855 restart->futex.flags |= FLAGS_CLOCKRT;
1856 1890
1857 ret = -ERESTART_RESTARTBLOCK; 1891 ret = -ERESTART_RESTARTBLOCK;
1858 1892
1859out_put_key:
1860 put_futex_key(fshared, &q.key);
1861out: 1893out:
1862 if (to) { 1894 if (to) {
1863 hrtimer_cancel(&to->timer); 1895 hrtimer_cancel(&to->timer);
@@ -1869,8 +1901,7 @@ out:
1869 1901
1870static long futex_wait_restart(struct restart_block *restart) 1902static long futex_wait_restart(struct restart_block *restart)
1871{ 1903{
1872 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1904 u32 __user *uaddr = restart->futex.uaddr;
1873 int fshared = 0;
1874 ktime_t t, *tp = NULL; 1905 ktime_t t, *tp = NULL;
1875 1906
1876 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 1907 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1878,11 +1909,9 @@ static long futex_wait_restart(struct restart_block *restart)
1878 tp = &t; 1909 tp = &t;
1879 } 1910 }
1880 restart->fn = do_no_restart_syscall; 1911 restart->fn = do_no_restart_syscall;
1881 if (restart->futex.flags & FLAGS_SHARED) 1912
1882 fshared = 1; 1913 return (long)futex_wait(uaddr, restart->futex.flags,
1883 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, 1914 restart->futex.val, tp, restart->futex.bitset);
1884 restart->futex.bitset,
1885 restart->futex.flags & FLAGS_CLOCKRT);
1886} 1915}
1887 1916
1888 1917
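
Review note: with the whole flags word carried in restart->futex.flags (plus FLAGS_HAS_TIMEOUT), futex_wait_restart() above no longer has to re-derive fshared and clockrt bit by bit. A hedged sketch of saving call parameters for a restart and replaying them; struct wait_restart and do_wait() are invented for the example and only model the restart_block bookkeeping.

#include <stdio.h>

#define FLAGS_SHARED       0x01
#define FLAGS_CLOCKRT      0x02
#define FLAGS_HAS_TIMEOUT  0x04

/* Invented stand-in for the futex part of struct restart_block. */
struct wait_restart {
	unsigned int *uaddr;
	unsigned int  val;
	unsigned int  flags;
	long long     time;   /* absolute expiry, valid only with HAS_TIMEOUT */
};

static long do_wait(unsigned int *uaddr, unsigned int flags,
		    unsigned int val, const long long *abs_time)
{
	printf("wait on %p flags=%#x val=%u timeout=%s\n",
	       (void *)uaddr, flags, val, abs_time ? "yes" : "none");
	return 0;
}

/* Save everything needed to restart the call after a signal ... */
static void save_restart(struct wait_restart *r, unsigned int *uaddr,
			 unsigned int flags, unsigned int val,
			 const long long *abs_time)
{
	r->uaddr = uaddr;
	r->val   = val;
	r->time  = abs_time ? *abs_time : 0;
	r->flags = flags | (abs_time ? FLAGS_HAS_TIMEOUT : 0);
}

/* ... and replay it, the way futex_wait_restart() calls futex_wait(). */
static long replay(const struct wait_restart *r)
{
	long long t = r->time;

	return do_wait(r->uaddr, r->flags, r->val,
		       (r->flags & FLAGS_HAS_TIMEOUT) ? &t : NULL);
}

int main(void)
{
	unsigned int word = 0;
	long long deadline = 123456789;
	struct wait_restart r;

	save_restart(&r, &word, FLAGS_SHARED | FLAGS_CLOCKRT, 0, &deadline);
	return (int)replay(&r);
}
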
@@ -1892,12 +1921,12 @@ static long futex_wait_restart(struct restart_block *restart)
1892 * if there are waiters then it will block, it does PI, etc. (Due to 1921 * if there are waiters then it will block, it does PI, etc. (Due to
1893 * races the kernel might see a 0 value of the futex too.) 1922 * races the kernel might see a 0 value of the futex too.)
1894 */ 1923 */
1895static int futex_lock_pi(u32 __user *uaddr, int fshared, 1924static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1896 int detect, ktime_t *time, int trylock) 1925 ktime_t *time, int trylock)
1897{ 1926{
1898 struct hrtimer_sleeper timeout, *to = NULL; 1927 struct hrtimer_sleeper timeout, *to = NULL;
1899 struct futex_hash_bucket *hb; 1928 struct futex_hash_bucket *hb;
1900 struct futex_q q; 1929 struct futex_q q = futex_q_init;
1901 int res, ret; 1930 int res, ret;
1902 1931
1903 if (refill_pi_state_cache()) 1932 if (refill_pi_state_cache())
@@ -1911,12 +1940,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1911 hrtimer_set_expires(&to->timer, *time); 1940 hrtimer_set_expires(&to->timer, *time);
1912 } 1941 }
1913 1942
1914 q.pi_state = NULL;
1915 q.rt_waiter = NULL;
1916 q.requeue_pi_key = NULL;
1917retry: 1943retry:
1918 q.key = FUTEX_KEY_INIT; 1944 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
1919 ret = get_futex_key(uaddr, fshared, &q.key);
1920 if (unlikely(ret != 0)) 1945 if (unlikely(ret != 0))
1921 goto out; 1946 goto out;
1922 1947
@@ -1938,7 +1963,7 @@ retry_private:
1938 * exit to complete. 1963 * exit to complete.
1939 */ 1964 */
1940 queue_unlock(&q, hb); 1965 queue_unlock(&q, hb);
1941 put_futex_key(fshared, &q.key); 1966 put_futex_key(&q.key);
1942 cond_resched(); 1967 cond_resched();
1943 goto retry; 1968 goto retry;
1944 default: 1969 default:
@@ -1968,7 +1993,7 @@ retry_private:
1968 * Fixup the pi_state owner and possibly acquire the lock if we 1993 * Fixup the pi_state owner and possibly acquire the lock if we
1969 * haven't already. 1994 * haven't already.
1970 */ 1995 */
1971 res = fixup_owner(uaddr, fshared, &q, !ret); 1996 res = fixup_owner(uaddr, &q, !ret);
1972 /* 1997 /*
1973 * If fixup_owner() returned an error, proprogate that. If it acquired 1998 * If fixup_owner() returned an error, proprogate that. If it acquired
1974 * the lock, clear our -ETIMEDOUT or -EINTR. 1999 * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1992,7 +2017,7 @@ out_unlock_put_key:
1992 queue_unlock(&q, hb); 2017 queue_unlock(&q, hb);
1993 2018
1994out_put_key: 2019out_put_key:
1995 put_futex_key(fshared, &q.key); 2020 put_futex_key(&q.key);
1996out: 2021out:
1997 if (to) 2022 if (to)
1998 destroy_hrtimer_on_stack(&to->timer); 2023 destroy_hrtimer_on_stack(&to->timer);
@@ -2005,10 +2030,10 @@ uaddr_faulted:
2005 if (ret) 2030 if (ret)
2006 goto out_put_key; 2031 goto out_put_key;
2007 2032
2008 if (!fshared) 2033 if (!(flags & FLAGS_SHARED))
2009 goto retry_private; 2034 goto retry_private;
2010 2035
2011 put_futex_key(fshared, &q.key); 2036 put_futex_key(&q.key);
2012 goto retry; 2037 goto retry;
2013} 2038}
2014 2039
@@ -2017,13 +2042,13 @@ uaddr_faulted:
2017 * This is the in-kernel slowpath: we look up the PI state (if any), 2042 * This is the in-kernel slowpath: we look up the PI state (if any),
2018 * and do the rt-mutex unlock. 2043 * and do the rt-mutex unlock.
2019 */ 2044 */
2020static int futex_unlock_pi(u32 __user *uaddr, int fshared) 2045static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2021{ 2046{
2022 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2023 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
2024 u32 uval;
2025 struct plist_head *head; 2049 struct plist_head *head;
2026 union futex_key key = FUTEX_KEY_INIT; 2050 union futex_key key = FUTEX_KEY_INIT;
2051 u32 uval, vpid = task_pid_vnr(current);
2027 int ret; 2052 int ret;
2028 2053
2029retry: 2054retry:
@@ -2032,10 +2057,10 @@ retry:
2032 /* 2057 /*
2033 * We release only a lock we actually own: 2058 * We release only a lock we actually own:
2034 */ 2059 */
2035 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != vpid)
2036 return -EPERM; 2061 return -EPERM;
2037 2062
2038 ret = get_futex_key(uaddr, fshared, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
2039 if (unlikely(ret != 0)) 2064 if (unlikely(ret != 0))
2040 goto out; 2065 goto out;
2041 2066
@@ -2047,17 +2072,14 @@ retry:
2047 * again. If it succeeds then we can return without waking 2072 * again. If it succeeds then we can return without waking
2048 * anyone else up: 2073 * anyone else up:
2049 */ 2074 */
2050 if (!(uval & FUTEX_OWNER_DIED)) 2075 if (!(uval & FUTEX_OWNER_DIED) &&
2051 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); 2076 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2052
2053
2054 if (unlikely(uval == -EFAULT))
2055 goto pi_faulted; 2077 goto pi_faulted;
2056 /* 2078 /*
2057 * Rare case: we managed to release the lock atomically, 2079 * Rare case: we managed to release the lock atomically,
2058 * no need to wake anyone else up: 2080 * no need to wake anyone else up:
2059 */ 2081 */
2060 if (unlikely(uval == task_pid_vnr(current))) 2082 if (unlikely(uval == vpid))
2061 goto out_unlock; 2083 goto out_unlock;
2062 2084
2063 /* 2085 /*
@@ -2090,14 +2112,14 @@ retry:
2090 2112
2091out_unlock: 2113out_unlock:
2092 spin_unlock(&hb->lock); 2114 spin_unlock(&hb->lock);
2093 put_futex_key(fshared, &key); 2115 put_futex_key(&key);
2094 2116
2095out: 2117out:
2096 return ret; 2118 return ret;
2097 2119
2098pi_faulted: 2120pi_faulted:
2099 spin_unlock(&hb->lock); 2121 spin_unlock(&hb->lock);
2100 put_futex_key(fshared, &key); 2122 put_futex_key(&key);
2101 2123
2102 ret = fault_in_user_writeable(uaddr); 2124 ret = fault_in_user_writeable(uaddr);
2103 if (!ret) 2125 if (!ret)
@@ -2142,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2142 * We were woken prior to requeue by a timeout or a signal. 2164 * We were woken prior to requeue by a timeout or a signal.
2143 * Unqueue the futex_q and determine which it was. 2165 * Unqueue the futex_q and determine which it was.
2144 */ 2166 */
2145 plist_del(&q->list, &q->list.plist); 2167 plist_del(&q->list, &hb->chain);
2146 2168
2147 /* Handle spurious wakeups gracefully */ 2169 /* Handle spurious wakeups gracefully */
2148 ret = -EWOULDBLOCK; 2170 ret = -EWOULDBLOCK;
@@ -2157,7 +2179,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2157/** 2179/**
2158 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2180 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2159 * @uaddr: the futex we initially wait on (non-pi) 2181 * @uaddr: the futex we initially wait on (non-pi)
2160 * @fshared: whether the futexes are shared (1) or not (0). They must be 2182 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2161 * the same type, no requeueing from private to shared, etc. 2183 * the same type, no requeueing from private to shared, etc.
2162 * @val: the expected value of uaddr 2184 * @val: the expected value of uaddr
2163 * @abs_time: absolute timeout 2185 * @abs_time: absolute timeout
@@ -2195,16 +2217,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2195 * 0 - On success 2217 * 0 - On success
2196 * <0 - On error 2218 * <0 - On error
2197 */ 2219 */
2198static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, 2220static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2199 u32 val, ktime_t *abs_time, u32 bitset, 2221 u32 val, ktime_t *abs_time, u32 bitset,
2200 int clockrt, u32 __user *uaddr2) 2222 u32 __user *uaddr2)
2201{ 2223{
2202 struct hrtimer_sleeper timeout, *to = NULL; 2224 struct hrtimer_sleeper timeout, *to = NULL;
2203 struct rt_mutex_waiter rt_waiter; 2225 struct rt_mutex_waiter rt_waiter;
2204 struct rt_mutex *pi_mutex = NULL; 2226 struct rt_mutex *pi_mutex = NULL;
2205 struct futex_hash_bucket *hb; 2227 struct futex_hash_bucket *hb;
2206 union futex_key key2; 2228 union futex_key key2 = FUTEX_KEY_INIT;
2207 struct futex_q q; 2229 struct futex_q q = futex_q_init;
2208 int res, ret; 2230 int res, ret;
2209 2231
2210 if (!bitset) 2232 if (!bitset)
@@ -2212,8 +2234,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2212 2234
2213 if (abs_time) { 2235 if (abs_time) {
2214 to = &timeout; 2236 to = &timeout;
2215 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 2237 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2216 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2238 CLOCK_REALTIME : CLOCK_MONOTONIC,
2239 HRTIMER_MODE_ABS);
2217 hrtimer_init_sleeper(to, current); 2240 hrtimer_init_sleeper(to, current);
2218 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2241 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2219 current->timer_slack_ns); 2242 current->timer_slack_ns);
@@ -2226,18 +2249,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2226 debug_rt_mutex_init_waiter(&rt_waiter); 2249 debug_rt_mutex_init_waiter(&rt_waiter);
2227 rt_waiter.task = NULL; 2250 rt_waiter.task = NULL;
2228 2251
2229 key2 = FUTEX_KEY_INIT; 2252 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
2230 ret = get_futex_key(uaddr2, fshared, &key2);
2231 if (unlikely(ret != 0)) 2253 if (unlikely(ret != 0))
2232 goto out; 2254 goto out;
2233 2255
2234 q.pi_state = NULL;
2235 q.bitset = bitset; 2256 q.bitset = bitset;
2236 q.rt_waiter = &rt_waiter; 2257 q.rt_waiter = &rt_waiter;
2237 q.requeue_pi_key = &key2; 2258 q.requeue_pi_key = &key2;
2238 2259
2239 /* Prepare to wait on uaddr. */ 2260 /*
2240 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2261 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2262 * count.
2263 */
2264 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2241 if (ret) 2265 if (ret)
2242 goto out_key2; 2266 goto out_key2;
2243 2267
@@ -2254,7 +2278,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2254 * In order for us to be here, we know our q.key == key2, and since 2278 * In order for us to be here, we know our q.key == key2, and since
2255 * we took the hb->lock above, we also know that futex_requeue() has 2279 * we took the hb->lock above, we also know that futex_requeue() has
2256 * completed and we no longer have to concern ourselves with a wakeup 2280 * completed and we no longer have to concern ourselves with a wakeup
2257 * race with the atomic proxy lock acquition by the requeue code. 2281 * race with the atomic proxy lock acquisition by the requeue code. The
2282 * futex_requeue dropped our key1 reference and incremented our key2
2283 * reference count.
2258 */ 2284 */
2259 2285
2260 /* Check if the requeue code acquired the second futex for us. */ 2286 /* Check if the requeue code acquired the second futex for us. */
@@ -2265,8 +2291,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2265 */ 2291 */
2266 if (q.pi_state && (q.pi_state->owner != current)) { 2292 if (q.pi_state && (q.pi_state->owner != current)) {
2267 spin_lock(q.lock_ptr); 2293 spin_lock(q.lock_ptr);
2268 ret = fixup_pi_state_owner(uaddr2, &q, current, 2294 ret = fixup_pi_state_owner(uaddr2, &q, current);
2269 fshared);
2270 spin_unlock(q.lock_ptr); 2295 spin_unlock(q.lock_ptr);
2271 } 2296 }
2272 } else { 2297 } else {
@@ -2285,7 +2310,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2285 * Fixup the pi_state owner and possibly acquire the lock if we 2310 * Fixup the pi_state owner and possibly acquire the lock if we
2286 * haven't already. 2311 * haven't already.
2287 */ 2312 */
2288 res = fixup_owner(uaddr2, fshared, &q, !ret); 2313 res = fixup_owner(uaddr2, &q, !ret);
2289 /* 2314 /*
2290 * If fixup_owner() returned an error, proprogate that. If it 2315 * If fixup_owner() returned an error, proprogate that. If it
2291 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2316 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2316,9 +2341,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2316 } 2341 }
2317 2342
2318out_put_keys: 2343out_put_keys:
2319 put_futex_key(fshared, &q.key); 2344 put_futex_key(&q.key);
2320out_key2: 2345out_key2:
2321 put_futex_key(fshared, &key2); 2346 put_futex_key(&key2);
2322 2347
2323out: 2348out:
2324 if (to) { 2349 if (to) {
@@ -2393,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2393 goto err_unlock; 2418 goto err_unlock;
2394 ret = -EPERM; 2419 ret = -EPERM;
2395 pcred = __task_cred(p); 2420 pcred = __task_cred(p);
2421 /* If victim is in different user_ns, then uids are not
2422 comparable, so we must have CAP_SYS_PTRACE */
2423 if (cred->user->user_ns != pcred->user->user_ns) {
2424 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2425 goto err_unlock;
2426 goto ok;
2427 }
2428 /* If victim is in same user_ns, then uids are comparable */
2396 if (cred->euid != pcred->euid && 2429 if (cred->euid != pcred->euid &&
2397 cred->euid != pcred->uid && 2430 cred->euid != pcred->uid &&
2398 !capable(CAP_SYS_PTRACE)) 2431 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2399 goto err_unlock; 2432 goto err_unlock;
2433ok:
2400 head = p->robust_list; 2434 head = p->robust_list;
2401 rcu_read_unlock(); 2435 rcu_read_unlock();
2402 } 2436 }
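
Review note: the get_robust_list() hunk above only falls back to UID comparison when both credentials live in the same user namespace; across namespaces it requires CAP_SYS_PTRACE in the target's namespace (ns_capable() rather than the global capable()). A simplified model of that decision, with integers standing in for struct user_namespace and a stubbed capability check.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative credential model; the kernel uses struct cred. */
struct creds {
	unsigned int uid, euid;
	int user_ns;                     /* namespace identity */
};

/* Stand-in for ns_capable(target_ns, CAP_SYS_PTRACE). */
static bool has_ptrace_cap_in(int target_ns)
{
	(void)target_ns;
	return false;                    /* unprivileged for the demo */
}

/* May @cred inspect @pcred's robust list? Mirrors the patched check. */
static bool may_inspect(const struct creds *cred, const struct creds *pcred)
{
	if (cred->user_ns != pcred->user_ns)
		/* Different namespaces: uids are not comparable at all. */
		return has_ptrace_cap_in(pcred->user_ns);

	/* Same namespace: uid match suffices, capability is the fallback. */
	return cred->euid == pcred->euid ||
	       cred->euid == pcred->uid  ||
	       has_ptrace_cap_in(pcred->user_ns);
}

int main(void)
{
	struct creds me     = { .uid = 1000, .euid = 1000, .user_ns = 1 };
	struct creds victim = { .uid = 1000, .euid = 1000, .user_ns = 2 };

	printf("allowed: %d\n", may_inspect(&me, &victim));   /* 0 -> EPERM */
	victim.user_ns = 1;
	printf("allowed: %d\n", may_inspect(&me, &victim));   /* 1 */
	return 0;
}
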
@@ -2435,11 +2469,20 @@ retry:
2435 * userspace. 2469 * userspace.
2436 */ 2470 */
2437 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2471 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2438 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2472 /*
2439 2473 * We are not holding a lock here, but we want to have
2440 if (nval == -EFAULT) 2474 * the pagefault_disable/enable() protection because
2441 return -1; 2475 * we want to handle the fault gracefully. If the
2442 2476 * access fails we try to fault in the futex with R/W
2477 * verification via get_user_pages. get_user() above
2478 * does not guarantee R/W access. If that fails we
2479 * give up and leave the futex locked.
2480 */
2481 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2482 if (fault_in_user_writeable(uaddr))
2483 return -1;
2484 goto retry;
2485 }
2443 if (nval != uval) 2486 if (nval != uval)
2444 goto retry; 2487 goto retry;
2445 2488
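
Review note: the handle_futex_death() hunk above turns a single cmpxchg attempt into a retry loop: on a value mismatch it re-reads and retries, and on a fault it tries to fault the page in writeably before retrying, giving up only if that fails. Below is a compile-able userspace model of the value-retry half using a C11 compare-exchange loop that preserves FUTEX_WAITERS while setting FUTEX_OWNER_DIED; the fault path has no userspace equivalent and is only marked in a comment.

#include <stdatomic.h>
#include <stdio.h>

#define FUTEX_WAITERS    0x80000000u
#define FUTEX_OWNER_DIED 0x40000000u

static atomic_uint futex_word;

/* Mark the futex of a dying owner: keep the WAITERS bit, set OWNER_DIED,
 * clear the TID - retrying as long as other threads change the word.   */
static int mark_owner_died(atomic_uint *uaddr)
{
	unsigned int uval = atomic_load(uaddr);

	for (;;) {
		unsigned int mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

		/* In the kernel, a fault here would call
		 * fault_in_user_writeable() and retry instead of failing. */
		if (atomic_compare_exchange_weak(uaddr, &uval, mval))
			return 0;
		/* On failure uval was refreshed with the current value;
		 * loop and recompute mval from it, like "goto retry".    */
	}
}

int main(void)
{
	atomic_store(&futex_word, 1234u | FUTEX_WAITERS);  /* TID + waiters */
	mark_owner_died(&futex_word);
	printf("word now %#x\n", atomic_load(&futex_word));
	return 0;
}
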
@@ -2458,7 +2501,7 @@ retry:
2458 */ 2501 */
2459static inline int fetch_robust_entry(struct robust_list __user **entry, 2502static inline int fetch_robust_entry(struct robust_list __user **entry,
2460 struct robust_list __user * __user *head, 2503 struct robust_list __user * __user *head,
2461 int *pi) 2504 unsigned int *pi)
2462{ 2505{
2463 unsigned long uentry; 2506 unsigned long uentry;
2464 2507
@@ -2481,7 +2524,8 @@ void exit_robust_list(struct task_struct *curr)
2481{ 2524{
2482 struct robust_list_head __user *head = curr->robust_list; 2525 struct robust_list_head __user *head = curr->robust_list;
2483 struct robust_list __user *entry, *next_entry, *pending; 2526 struct robust_list __user *entry, *next_entry, *pending;
2484 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 2527 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2528 unsigned int uninitialized_var(next_pi);
2485 unsigned long futex_offset; 2529 unsigned long futex_offset;
2486 int rc; 2530 int rc;
2487 2531
@@ -2542,58 +2586,57 @@ void exit_robust_list(struct task_struct *curr)
2542long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2586long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2543 u32 __user *uaddr2, u32 val2, u32 val3) 2587 u32 __user *uaddr2, u32 val2, u32 val3)
2544{ 2588{
2545 int clockrt, ret = -ENOSYS; 2589 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2546 int cmd = op & FUTEX_CMD_MASK; 2590 unsigned int flags = 0;
2547 int fshared = 0;
2548 2591
2549 if (!(op & FUTEX_PRIVATE_FLAG)) 2592 if (!(op & FUTEX_PRIVATE_FLAG))
2550 fshared = 1; 2593 flags |= FLAGS_SHARED;
2551 2594
2552 clockrt = op & FUTEX_CLOCK_REALTIME; 2595 if (op & FUTEX_CLOCK_REALTIME) {
2553 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2596 flags |= FLAGS_CLOCKRT;
2554 return -ENOSYS; 2597 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2598 return -ENOSYS;
2599 }
2555 2600
2556 switch (cmd) { 2601 switch (cmd) {
2557 case FUTEX_WAIT: 2602 case FUTEX_WAIT:
2558 val3 = FUTEX_BITSET_MATCH_ANY; 2603 val3 = FUTEX_BITSET_MATCH_ANY;
2559 case FUTEX_WAIT_BITSET: 2604 case FUTEX_WAIT_BITSET:
2560 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); 2605 ret = futex_wait(uaddr, flags, val, timeout, val3);
2561 break; 2606 break;
2562 case FUTEX_WAKE: 2607 case FUTEX_WAKE:
2563 val3 = FUTEX_BITSET_MATCH_ANY; 2608 val3 = FUTEX_BITSET_MATCH_ANY;
2564 case FUTEX_WAKE_BITSET: 2609 case FUTEX_WAKE_BITSET:
2565 ret = futex_wake(uaddr, fshared, val, val3); 2610 ret = futex_wake(uaddr, flags, val, val3);
2566 break; 2611 break;
2567 case FUTEX_REQUEUE: 2612 case FUTEX_REQUEUE:
2568 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); 2613 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2569 break; 2614 break;
2570 case FUTEX_CMP_REQUEUE: 2615 case FUTEX_CMP_REQUEUE:
2571 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2616 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2572 0);
2573 break; 2617 break;
2574 case FUTEX_WAKE_OP: 2618 case FUTEX_WAKE_OP:
2575 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2619 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2576 break; 2620 break;
2577 case FUTEX_LOCK_PI: 2621 case FUTEX_LOCK_PI:
2578 if (futex_cmpxchg_enabled) 2622 if (futex_cmpxchg_enabled)
2579 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); 2623 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2580 break; 2624 break;
2581 case FUTEX_UNLOCK_PI: 2625 case FUTEX_UNLOCK_PI:
2582 if (futex_cmpxchg_enabled) 2626 if (futex_cmpxchg_enabled)
2583 ret = futex_unlock_pi(uaddr, fshared); 2627 ret = futex_unlock_pi(uaddr, flags);
2584 break; 2628 break;
2585 case FUTEX_TRYLOCK_PI: 2629 case FUTEX_TRYLOCK_PI:
2586 if (futex_cmpxchg_enabled) 2630 if (futex_cmpxchg_enabled)
2587 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2631 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2588 break; 2632 break;
2589 case FUTEX_WAIT_REQUEUE_PI: 2633 case FUTEX_WAIT_REQUEUE_PI:
2590 val3 = FUTEX_BITSET_MATCH_ANY; 2634 val3 = FUTEX_BITSET_MATCH_ANY;
2591 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, 2635 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2592 clockrt, uaddr2); 2636 uaddr2);
2593 break; 2637 break;
2594 case FUTEX_CMP_REQUEUE_PI: 2638 case FUTEX_CMP_REQUEUE_PI:
2595 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2639 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2596 1);
2597 break; 2640 break;
2598 default: 2641 default:
2599 ret = -ENOSYS; 2642 ret = -ENOSYS;
@@ -2647,11 +2690,10 @@ static int __init futex_init(void)
2647 * of the complex code paths. Also we want to prevent 2690 * of the complex code paths. Also we want to prevent
2648 * registration of robust lists in that case. NULL is 2691 * registration of robust lists in that case. NULL is
2649 * guaranteed to fault and we get -EFAULT on functional 2692 * guaranteed to fault and we get -EFAULT on functional
2650 * implementation, the non functional ones will return 2693 * implementation, the non-functional ones will return
2651 * -ENOSYS. 2694 * -ENOSYS.
2652 */ 2695 */
2653 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2696 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2654 if (curval == -EFAULT)
2655 futex_cmpxchg_enabled = 1; 2697 futex_cmpxchg_enabled = 1;
2656 2698
2657 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d49afb2395e5..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
19 */ 19 */
20static inline int 20static inline int
21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, 21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
22 compat_uptr_t __user *head, int *pi) 22 compat_uptr_t __user *head, unsigned int *pi)
23{ 23{
24 if (get_user(*uentry, head)) 24 if (get_user(*uentry, head))
25 return -EFAULT; 25 return -EFAULT;
@@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr)
49{ 49{
50 struct compat_robust_list_head __user *head = curr->compat_robust_list; 50 struct compat_robust_list_head __user *head = curr->compat_robust_list;
51 struct robust_list __user *entry, *next_entry, *pending; 51 struct robust_list __user *entry, *next_entry, *pending;
52 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 52 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
53 unsigned int uninitialized_var(next_pi);
53 compat_uptr_t uentry, next_uentry, upending; 54 compat_uptr_t uentry, next_uentry, upending;
54 compat_long_t futex_offset; 55 compat_long_t futex_offset;
55 int rc; 56 int rc;
@@ -152,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
152 goto err_unlock; 153 goto err_unlock;
153 ret = -EPERM; 154 ret = -EPERM;
154 pcred = __task_cred(p); 155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
155 if (cred->euid != pcred->euid && 164 if (cred->euid != pcred->euid &&
156 cred->euid != pcred->uid && 165 cred->euid != pcred->uid &&
157 !capable(CAP_SYS_PTRACE)) 166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
158 goto err_unlock; 167 goto err_unlock;
168ok:
159 head = p->compat_robust_list; 169 head = p->compat_robust_list;
160 rcu_read_unlock(); 170 rcu_read_unlock();
161 } 171 }
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da71..5bf924d80b5c 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling"
2 2
3config GCOV_KERNEL 3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling" 4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS 5 depends on DEBUG_FS
6 select CONSTRUCTORS
6 default n 7 default n
7 ---help--- 8 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage 9 This option enables gcov-based code profiling (e.g. for code coverage
@@ -34,7 +35,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 39 default n
39 ---help--- 40 ---help---
40 This options activates profiling for the entire kernel. 41 This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d517..e97ca59e2520 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2 2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o 3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index f83972b16564..9bd0934f6c33 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -561,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
561static const struct file_operations gcov_reset_fops = { 561static const struct file_operations gcov_reset_fops = {
562 .write = reset_write, 562 .write = reset_write,
563 .read = reset_read, 563 .read = reset_read,
564 .llseek = noop_llseek,
564}; 565};
565 566
566/* 567/*
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
233 struct group_info *group_info; 233 struct group_info *group_info;
234 int retval; 234 int retval;
235 235
236 if (!capable(CAP_SETGID)) 236 if (!nsown_capable(CAP_SETGID))
237 return -EPERM; 237 return -EPERM;
238 if ((unsigned)gidsetsize > NGROUPS_MAX) 238 if ((unsigned)gidsetsize > NGROUPS_MAX)
239 return -EINVAL; 239 return -EINVAL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb49883b64e5..11e896903828 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -55,11 +55,10 @@
55/* 55/*
56 * The timer bases: 56 * The timer bases:
57 * 57 *
58 * Note: If we want to add new timer bases, we have to skip the two 58 * There are more clockids then hrtimer bases. Thus, we index
59 * clock ids captured by the cpu-timers. We do this by holding empty 59 * into the timer bases by the hrtimer_base_type enum. When trying
60 * entries rather than doing math adjustment of the clock ids. 60 * to reach a base using a clockid, hrtimer_clockid_to_base()
61 * This ensures that we capture erroneous accesses to these clock ids 61 * is used to convert from clockid to the proper hrtimer_base_type.
62 * rather than moving them into the range of valid clock id's.
63 */ 62 */
64DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 63DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
65{ 64{
@@ -67,39 +66,55 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
67 .clock_base = 66 .clock_base =
68 { 67 {
69 { 68 {
70 .index = CLOCK_REALTIME, 69 .index = HRTIMER_BASE_MONOTONIC,
70 .clockid = CLOCK_MONOTONIC,
71 .get_time = &ktime_get,
72 .resolution = KTIME_LOW_RES,
73 },
74 {
75 .index = HRTIMER_BASE_REALTIME,
76 .clockid = CLOCK_REALTIME,
71 .get_time = &ktime_get_real, 77 .get_time = &ktime_get_real,
72 .resolution = KTIME_LOW_RES, 78 .resolution = KTIME_LOW_RES,
73 }, 79 },
74 { 80 {
75 .index = CLOCK_MONOTONIC, 81 .index = HRTIMER_BASE_BOOTTIME,
76 .get_time = &ktime_get, 82 .clockid = CLOCK_BOOTTIME,
83 .get_time = &ktime_get_boottime,
77 .resolution = KTIME_LOW_RES, 84 .resolution = KTIME_LOW_RES,
78 }, 85 },
79 } 86 }
80}; 87};
81 88
89static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
90 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
91 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
92 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
93};
94
95static inline int hrtimer_clockid_to_base(clockid_t clock_id)
96{
97 return hrtimer_clock_to_base_table[clock_id];
98}
99
100
82/* 101/*
83 * Get the coarse grained time at the softirq based on xtime and 102 * Get the coarse grained time at the softirq based on xtime and
84 * wall_to_monotonic. 103 * wall_to_monotonic.
85 */ 104 */
86static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 105static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
87{ 106{
88 ktime_t xtim, tomono; 107 ktime_t xtim, mono, boot;
89 struct timespec xts, tom; 108 struct timespec xts, tom, slp;
90 unsigned long seq;
91 109
92 do { 110 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
93 seq = read_seqbegin(&xtime_lock);
94 xts = __current_kernel_time();
95 tom = __get_wall_to_monotonic();
96 } while (read_seqretry(&xtime_lock, seq));
97 111
98 xtim = timespec_to_ktime(xts); 112 xtim = timespec_to_ktime(xts);
99 tomono = timespec_to_ktime(tom); 113 mono = ktime_add(xtim, timespec_to_ktime(tom));
100 base->clock_base[CLOCK_REALTIME].softirq_time = xtim; 114 boot = ktime_add(mono, timespec_to_ktime(slp));
101 base->clock_base[CLOCK_MONOTONIC].softirq_time = 115 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
102 ktime_add(xtim, tomono); 116 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
117 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
103} 118}
104 119
105/* 120/*
@@ -186,10 +201,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
186 struct hrtimer_cpu_base *new_cpu_base; 201 struct hrtimer_cpu_base *new_cpu_base;
187 int this_cpu = smp_processor_id(); 202 int this_cpu = smp_processor_id();
188 int cpu = hrtimer_get_target(this_cpu, pinned); 203 int cpu = hrtimer_get_target(this_cpu, pinned);
204 int basenum = base->index;
189 205
190again: 206again:
191 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 207 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
192 new_base = &new_cpu_base->clock_base[base->index]; 208 new_base = &new_cpu_base->clock_base[basenum];
193 209
194 if (base != new_base) { 210 if (base != new_base) {
195 /* 211 /*
@@ -336,6 +352,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
336 352
337static struct debug_obj_descr hrtimer_debug_descr; 353static struct debug_obj_descr hrtimer_debug_descr;
338 354
355static void *hrtimer_debug_hint(void *addr)
356{
357 return ((struct hrtimer *) addr)->function;
358}
359
339/* 360/*
340 * fixup_init is called when: 361 * fixup_init is called when:
341 * - an active object is initialized 362 * - an active object is initialized
@@ -395,6 +416,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
395 416
396static struct debug_obj_descr hrtimer_debug_descr = { 417static struct debug_obj_descr hrtimer_debug_descr = {
397 .name = "hrtimer", 418 .name = "hrtimer",
419 .debug_hint = hrtimer_debug_hint,
398 .fixup_init = hrtimer_fixup_init, 420 .fixup_init = hrtimer_fixup_init,
399 .fixup_activate = hrtimer_fixup_activate, 421 .fixup_activate = hrtimer_fixup_activate,
400 .fixup_free = hrtimer_fixup_free, 422 .fixup_free = hrtimer_fixup_free,
@@ -499,7 +521,7 @@ static inline int hrtimer_is_hres_enabled(void)
499 */ 521 */
500static inline int hrtimer_hres_active(void) 522static inline int hrtimer_hres_active(void)
501{ 523{
502 return __get_cpu_var(hrtimer_bases).hres_active; 524 return __this_cpu_read(hrtimer_bases.hres_active);
503} 525}
504 526
505/* 527/*
@@ -518,10 +540,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
518 540
519 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 541 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
520 struct hrtimer *timer; 542 struct hrtimer *timer;
543 struct timerqueue_node *next;
521 544
522 if (!base->first) 545 next = timerqueue_getnext(&base->active);
546 if (!next)
523 continue; 547 continue;
524 timer = rb_entry(base->first, struct hrtimer, node); 548 timer = container_of(next, struct hrtimer, node);
549
525 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 550 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
526 /* 551 /*
527 * clock_was_set() has changed base->offset so the 552 * clock_was_set() has changed base->offset so the
@@ -601,67 +626,6 @@ static int hrtimer_reprogram(struct hrtimer *timer,
601 return res; 626 return res;
602} 627}
603 628
604
605/*
606 * Retrigger next event is called after clock was set
607 *
608 * Called with interrupts disabled via on_each_cpu()
609 */
610static void retrigger_next_event(void *arg)
611{
612 struct hrtimer_cpu_base *base;
613 struct timespec realtime_offset, wtm;
614 unsigned long seq;
615
616 if (!hrtimer_hres_active())
617 return;
618
619 do {
620 seq = read_seqbegin(&xtime_lock);
621 wtm = __get_wall_to_monotonic();
622 } while (read_seqretry(&xtime_lock, seq));
623 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
624
625 base = &__get_cpu_var(hrtimer_bases);
626
627 /* Adjust CLOCK_REALTIME offset */
628 raw_spin_lock(&base->lock);
629 base->clock_base[CLOCK_REALTIME].offset =
630 timespec_to_ktime(realtime_offset);
631
632 hrtimer_force_reprogram(base, 0);
633 raw_spin_unlock(&base->lock);
634}
635
636/*
637 * Clock realtime was set
638 *
639 * Change the offset of the realtime clock vs. the monotonic
640 * clock.
641 *
642 * We might have to reprogram the high resolution timer interrupt. On
643 * SMP we call the architecture specific code to retrigger _all_ high
644 * resolution timer interrupts. On UP we just disable interrupts and
645 * call the high resolution interrupt code.
646 */
647void clock_was_set(void)
648{
649 /* Retrigger the CPU local events everywhere */
650 on_each_cpu(retrigger_next_event, NULL, 1);
651}
652
653/*
654 * During resume we might have to reprogram the high resolution timer
655 * interrupt (on the local CPU):
656 */
657void hres_timers_resume(void)
658{
659 WARN_ONCE(!irqs_disabled(),
660 KERN_INFO "hres_timers_resume() called with IRQs enabled!");
661
662 retrigger_next_event(NULL);
663}
664
665/* 629/*
666 * Initialize the high resolution related parts of cpu_base 630 * Initialize the high resolution related parts of cpu_base
667 */ 631 */
@@ -672,14 +636,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
672} 636}
673 637
674/* 638/*
675 * Initialize the high resolution related parts of a hrtimer
676 */
677static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
678{
679}
680
681
682/*
683 * When High resolution timers are active, try to reprogram. Note, that in case 639 * When High resolution timers are active, try to reprogram. Note, that in case
684 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 640 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
685 * check happens. The timer gets enqueued into the rbtree. The reprogramming 641 * check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -704,11 +660,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
704} 660}
705 661
706/* 662/*
663 * Retrigger next event is called after clock was set
664 *
665 * Called with interrupts disabled via on_each_cpu()
666 */
667static void retrigger_next_event(void *arg)
668{
669 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
670 struct timespec realtime_offset, xtim, wtm, sleep;
671
672 if (!hrtimer_hres_active())
673 return;
674
675 /* Optimized out for !HIGH_RES */
676 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
677 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
678
679 /* Adjust CLOCK_REALTIME offset */
680 raw_spin_lock(&base->lock);
681 base->clock_base[HRTIMER_BASE_REALTIME].offset =
682 timespec_to_ktime(realtime_offset);
683 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
684 timespec_to_ktime(sleep);
685
686 hrtimer_force_reprogram(base, 0);
687 raw_spin_unlock(&base->lock);
688}
689
690/*
707 * Switch to high resolution mode 691 * Switch to high resolution mode
708 */ 692 */
709static int hrtimer_switch_to_hres(void) 693static int hrtimer_switch_to_hres(void)
710{ 694{
711 int cpu = smp_processor_id(); 695 int i, cpu = smp_processor_id();
712 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); 696 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
713 unsigned long flags; 697 unsigned long flags;
714 698
@@ -724,8 +708,8 @@ static int hrtimer_switch_to_hres(void)
724 return 0; 708 return 0;
725 } 709 }
726 base->hres_active = 1; 710 base->hres_active = 1;
727 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; 711 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
728 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; 712 base->clock_base[i].resolution = KTIME_HIGH_RES;
729 713
730 tick_setup_sched_timer(); 714 tick_setup_sched_timer();
731 715
@@ -749,10 +733,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
749 return 0; 733 return 0;
750} 734}
751static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 735static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
752static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 736static inline void retrigger_next_event(void *arg) { }
753 737
754#endif /* CONFIG_HIGH_RES_TIMERS */ 738#endif /* CONFIG_HIGH_RES_TIMERS */
755 739
740/*
741 * Clock realtime was set
742 *
743 * Change the offset of the realtime clock vs. the monotonic
744 * clock.
745 *
746 * We might have to reprogram the high resolution timer interrupt. On
747 * SMP we call the architecture specific code to retrigger _all_ high
748 * resolution timer interrupts. On UP we just disable interrupts and
749 * call the high resolution interrupt code.
750 */
751void clock_was_set(void)
752{
753#ifdef CONFIG_HIGH_RES_TIMERS
754 /* Retrigger the CPU local events everywhere */
755 on_each_cpu(retrigger_next_event, NULL, 1);
756#endif
757 timerfd_clock_was_set();
758}
759
760/*
761 * During resume we might have to reprogram the high resolution timer
762 * interrupt (on the local CPU):
763 */
764void hrtimers_resume(void)
765{
766 WARN_ONCE(!irqs_disabled(),
767 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
768
769 retrigger_next_event(NULL);
770 timerfd_clock_was_set();
771}
772
756static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) 773static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
757{ 774{
758#ifdef CONFIG_TIMER_STATS 775#ifdef CONFIG_TIMER_STATS
@@ -842,48 +859,18 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
842static int enqueue_hrtimer(struct hrtimer *timer, 859static int enqueue_hrtimer(struct hrtimer *timer,
843 struct hrtimer_clock_base *base) 860 struct hrtimer_clock_base *base)
844{ 861{
845 struct rb_node **link = &base->active.rb_node;
846 struct rb_node *parent = NULL;
847 struct hrtimer *entry;
848 int leftmost = 1;
849
850 debug_activate(timer); 862 debug_activate(timer);
851 863
852 /* 864 timerqueue_add(&base->active, &timer->node);
853 * Find the right place in the rbtree: 865 base->cpu_base->active_bases |= 1 << base->index;
854 */
855 while (*link) {
856 parent = *link;
857 entry = rb_entry(parent, struct hrtimer, node);
858 /*
859 * We dont care about collisions. Nodes with
860 * the same expiry time stay together.
861 */
862 if (hrtimer_get_expires_tv64(timer) <
863 hrtimer_get_expires_tv64(entry)) {
864 link = &(*link)->rb_left;
865 } else {
866 link = &(*link)->rb_right;
867 leftmost = 0;
868 }
869 }
870
871 /*
872 * Insert the timer to the rbtree and check whether it
873 * replaces the first pending timer
874 */
875 if (leftmost)
876 base->first = &timer->node;
877 866
878 rb_link_node(&timer->node, parent, link);
879 rb_insert_color(&timer->node, &base->active);
880 /* 867 /*
881 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 868 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
882 * state of a possibly running callback. 869 * state of a possibly running callback.
883 */ 870 */
884 timer->state |= HRTIMER_STATE_ENQUEUED; 871 timer->state |= HRTIMER_STATE_ENQUEUED;
885 872
886 return leftmost; 873 return (&timer->node == base->active.next);
887} 874}
888 875
889/* 876/*
@@ -903,12 +890,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
903 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 890 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
904 goto out; 891 goto out;
905 892
906 /* 893 if (&timer->node == timerqueue_getnext(&base->active)) {
907 * Remove the timer from the rbtree and replace the first
908 * entry pointer if necessary.
909 */
910 if (base->first == &timer->node) {
911 base->first = rb_next(&timer->node);
912#ifdef CONFIG_HIGH_RES_TIMERS 894#ifdef CONFIG_HIGH_RES_TIMERS
913 /* Reprogram the clock event device. if enabled */ 895 /* Reprogram the clock event device. if enabled */
914 if (reprogram && hrtimer_hres_active()) { 896 if (reprogram && hrtimer_hres_active()) {
@@ -921,7 +903,9 @@ static void __remove_hrtimer(struct hrtimer *timer,
921 } 903 }
922#endif 904#endif
923 } 905 }
924 rb_erase(&timer->node, &base->active); 906 timerqueue_del(&base->active, &timer->node);
907 if (!timerqueue_getnext(&base->active))
908 base->cpu_base->active_bases &= ~(1 << base->index);
925out: 909out:
926 timer->state = newstate; 910 timer->state = newstate;
927} 911}
@@ -1222,11 +1206,13 @@ ktime_t hrtimer_get_next_event(void)
1222 if (!hrtimer_hres_active()) { 1206 if (!hrtimer_hres_active()) {
1223 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1207 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1224 struct hrtimer *timer; 1208 struct hrtimer *timer;
1209 struct timerqueue_node *next;
1225 1210
1226 if (!base->first) 1211 next = timerqueue_getnext(&base->active);
1212 if (!next)
1227 continue; 1213 continue;
1228 1214
1229 timer = rb_entry(base->first, struct hrtimer, node); 1215 timer = container_of(next, struct hrtimer, node);
1230 delta.tv64 = hrtimer_get_expires_tv64(timer); 1216 delta.tv64 = hrtimer_get_expires_tv64(timer);
1231 delta = ktime_sub(delta, base->get_time()); 1217 delta = ktime_sub(delta, base->get_time());
1232 if (delta.tv64 < mindelta.tv64) 1218 if (delta.tv64 < mindelta.tv64)
@@ -1246,6 +1232,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1246 enum hrtimer_mode mode) 1232 enum hrtimer_mode mode)
1247{ 1233{
1248 struct hrtimer_cpu_base *cpu_base; 1234 struct hrtimer_cpu_base *cpu_base;
1235 int base;
1249 1236
1250 memset(timer, 0, sizeof(struct hrtimer)); 1237 memset(timer, 0, sizeof(struct hrtimer));
1251 1238
@@ -1254,8 +1241,9 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1254 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1241 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1255 clock_id = CLOCK_MONOTONIC; 1242 clock_id = CLOCK_MONOTONIC;
1256 1243
1257 timer->base = &cpu_base->clock_base[clock_id]; 1244 base = hrtimer_clockid_to_base(clock_id);
1258 hrtimer_init_timer_hres(timer); 1245 timer->base = &cpu_base->clock_base[base];
1246 timerqueue_init(&timer->node);
1259 1247
1260#ifdef CONFIG_TIMER_STATS 1248#ifdef CONFIG_TIMER_STATS
1261 timer->start_site = NULL; 1249 timer->start_site = NULL;
@@ -1289,9 +1277,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
1289int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 1277int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1290{ 1278{
1291 struct hrtimer_cpu_base *cpu_base; 1279 struct hrtimer_cpu_base *cpu_base;
1280 int base = hrtimer_clockid_to_base(which_clock);
1292 1281
1293 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1282 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1294 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); 1283 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1295 1284
1296 return 0; 1285 return 0;
1297} 1286}
@@ -1346,7 +1335,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1346void hrtimer_interrupt(struct clock_event_device *dev) 1335void hrtimer_interrupt(struct clock_event_device *dev)
1347{ 1336{
1348 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1337 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1349 struct hrtimer_clock_base *base;
1350 ktime_t expires_next, now, entry_time, delta; 1338 ktime_t expires_next, now, entry_time, delta;
1351 int i, retries = 0; 1339 int i, retries = 0;
1352 1340
@@ -1368,18 +1356,21 @@ retry:
1368 */ 1356 */
1369 cpu_base->expires_next.tv64 = KTIME_MAX; 1357 cpu_base->expires_next.tv64 = KTIME_MAX;
1370 1358
1371 base = cpu_base->clock_base;
1372
1373 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1359 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1360 struct hrtimer_clock_base *base;
1361 struct timerqueue_node *node;
1374 ktime_t basenow; 1362 ktime_t basenow;
1375 struct rb_node *node;
1376 1363
1364 if (!(cpu_base->active_bases & (1 << i)))
1365 continue;
1366
1367 base = cpu_base->clock_base + i;
1377 basenow = ktime_add(now, base->offset); 1368 basenow = ktime_add(now, base->offset);
1378 1369
1379 while ((node = base->first)) { 1370 while ((node = timerqueue_getnext(&base->active))) {
1380 struct hrtimer *timer; 1371 struct hrtimer *timer;
1381 1372
1382 timer = rb_entry(node, struct hrtimer, node); 1373 timer = container_of(node, struct hrtimer, node);
1383 1374
1384 /* 1375 /*
1385 * The immediate goal for using the softexpires is 1376 * The immediate goal for using the softexpires is
@@ -1406,7 +1397,6 @@ retry:
1406 1397
1407 __run_hrtimer(timer, &basenow); 1398 __run_hrtimer(timer, &basenow);
1408 } 1399 }
1409 base++;
1410 } 1400 }
1411 1401
1412 /* 1402 /*
@@ -1535,7 +1525,7 @@ void hrtimer_run_pending(void)
1535 */ 1525 */
1536void hrtimer_run_queues(void) 1526void hrtimer_run_queues(void)
1537{ 1527{
1538 struct rb_node *node; 1528 struct timerqueue_node *node;
1539 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1529 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1540 struct hrtimer_clock_base *base; 1530 struct hrtimer_clock_base *base;
1541 int index, gettime = 1; 1531 int index, gettime = 1;
@@ -1545,8 +1535,7 @@ void hrtimer_run_queues(void)
1545 1535
1546 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { 1536 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1547 base = &cpu_base->clock_base[index]; 1537 base = &cpu_base->clock_base[index];
1548 1538 if (!timerqueue_getnext(&base->active))
1549 if (!base->first)
1550 continue; 1539 continue;
1551 1540
1552 if (gettime) { 1541 if (gettime) {
@@ -1556,10 +1545,10 @@ void hrtimer_run_queues(void)
1556 1545
1557 raw_spin_lock(&cpu_base->lock); 1546 raw_spin_lock(&cpu_base->lock);
1558 1547
1559 while ((node = base->first)) { 1548 while ((node = timerqueue_getnext(&base->active))) {
1560 struct hrtimer *timer; 1549 struct hrtimer *timer;
1561 1550
1562 timer = rb_entry(node, struct hrtimer, node); 1551 timer = container_of(node, struct hrtimer, node);
1563 if (base->softirq_time.tv64 <= 1552 if (base->softirq_time.tv64 <=
1564 hrtimer_get_expires_tv64(timer)) 1553 hrtimer_get_expires_tv64(timer))
1565 break; 1554 break;
@@ -1638,7 +1627,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1638 struct timespec __user *rmtp; 1627 struct timespec __user *rmtp;
1639 int ret = 0; 1628 int ret = 0;
1640 1629
1641 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, 1630 hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
1642 HRTIMER_MODE_ABS); 1631 HRTIMER_MODE_ABS);
1643 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); 1632 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1644 1633
@@ -1690,7 +1679,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1690 1679
1691 restart = &current_thread_info()->restart_block; 1680 restart = &current_thread_info()->restart_block;
1692 restart->fn = hrtimer_nanosleep_restart; 1681 restart->fn = hrtimer_nanosleep_restart;
1693 restart->nanosleep.index = t.timer.base->index; 1682 restart->nanosleep.clockid = t.timer.base->clockid;
1694 restart->nanosleep.rmtp = rmtp; 1683 restart->nanosleep.rmtp = rmtp;
1695 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); 1684 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1696 1685
@@ -1724,8 +1713,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1724 1713
1725 raw_spin_lock_init(&cpu_base->lock); 1714 raw_spin_lock_init(&cpu_base->lock);
1726 1715
1727 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1716 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1728 cpu_base->clock_base[i].cpu_base = cpu_base; 1717 cpu_base->clock_base[i].cpu_base = cpu_base;
1718 timerqueue_init_head(&cpu_base->clock_base[i].active);
1719 }
1729 1720
1730 hrtimer_init_hres(cpu_base); 1721 hrtimer_init_hres(cpu_base);
1731 INIT_LIST_HEAD(&cpu_base->to_pull); 1722 INIT_LIST_HEAD(&cpu_base->to_pull);
@@ -1737,10 +1728,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1737 struct hrtimer_clock_base *new_base) 1728 struct hrtimer_clock_base *new_base)
1738{ 1729{
1739 struct hrtimer *timer; 1730 struct hrtimer *timer;
1740 struct rb_node *node; 1731 struct timerqueue_node *node;
1741 1732
1742 while ((node = rb_first(&old_base->active))) { 1733 while ((node = timerqueue_getnext(&old_base->active))) {
1743 timer = rb_entry(node, struct hrtimer, node); 1734 timer = container_of(node, struct hrtimer, node);
1744 BUG_ON(hrtimer_callback_running(timer)); 1735 BUG_ON(hrtimer_callback_running(timer));
1745 debug_deactivate(timer); 1736 debug_deactivate(timer);
1746 1737
@@ -1869,7 +1860,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1869 } 1860 }
1870 1861
1871 /* 1862 /*
1872 * A NULL parameter means "inifinte" 1863 * A NULL parameter means "infinite"
1873 */ 1864 */
1874 if (!expires) { 1865 if (!expires) {
1875 schedule(); 1866 schedule();
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac2..ea640120ab86 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
33/* 33/*
34 * Zero means infinite timeout - no checking done: 34 * Zero means infinite timeout - no checking done:
35 */ 35 */
36unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 36unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
37 37
38unsigned long __read_mostly sysctl_hung_task_warnings = 10; 38unsigned long __read_mostly sysctl_hung_task_warnings = 10;
39 39
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
99 " disables this message.\n"); 99 " disables this message.\n");
100 sched_show_task(t); 100 sched_show_task(t);
101 __debug_show_held_locks(t); 101 debug_show_held_locks(t);
102 102
103 touch_nmi_watchdog(); 103 touch_nmi_watchdog();
104 104
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
111 * periodically exit the critical section and enter a new one. 111 * periodically exit the critical section and enter a new one.
112 * 112 *
113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
114 * exit the grace period. For classic RCU, a reschedule is required. 114 * to exit the grace period. For classic RCU, a reschedule is required.
115 */ 115 */
116static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 116static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
117{ 117{
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
new file mode 100644
index 000000000000..d1d051b38e0b
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,74 @@
1# Select this to activate the generic irq options below
2config HAVE_GENERIC_HARDIRQS
3 bool
4
5if HAVE_GENERIC_HARDIRQS
6menu "IRQ subsystem"
7#
8# Interrupt subsystem related configuration options
9#
10config GENERIC_HARDIRQS
11 def_bool y
12
13# Options selectable by the architecture code
14
15# Make sparse irq Kconfig switch below available
16config HAVE_SPARSE_IRQ
17 bool
18
19# Enable the generic irq autoprobe mechanism
20config GENERIC_IRQ_PROBE
21 bool
22
23# Use the generic /proc/interrupts implementation
24config GENERIC_IRQ_SHOW
25 bool
26
27# Print level/edge extra information
28config GENERIC_IRQ_SHOW_LEVEL
29 bool
30
31# Support for delayed migration from interrupt context
32config GENERIC_PENDING_IRQ
33 bool
34
35# Alpha specific irq affinity mechanism
36config AUTO_IRQ_AFFINITY
37 bool
38
39# Tasklet based software resend for pending interrupts on enable_irq()
40config HARDIRQS_SW_RESEND
41 bool
42
43# Preflow handler support for fasteoi (sparc64)
44config IRQ_PREFLOW_FASTEOI
45 bool
46
47# Edge style eoi based handler (cell)
48config IRQ_EDGE_EOI_HANDLER
49 bool
50
51# Generic configurable interrupt chip implementation
52config GENERIC_IRQ_CHIP
53 bool
54
55# Support forced irq threading
56config IRQ_FORCED_THREADING
57 bool
58
59config SPARSE_IRQ
60 bool "Support sparse irq numbering"
61 depends on HAVE_SPARSE_IRQ
62 ---help---
63
64 Sparse irq numbering is useful for distro kernels that want
65 to define a high CONFIG_NR_CPUS value but still want to have
66 low kernel memory footprint on smaller machines.
67
68 ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
69 out the interrupt descriptors in a more NUMA-friendly way. )
70
71 If you don't know what to do here, say N.
72
73endmenu
74endif
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 7d047808419d..73290056cfb6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,7 +1,7 @@
1 1
2obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 5obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 6obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 2295a31ef110..342d8f44e401 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
17/* 17/*
18 * Autodetection depends on the fact that any interrupt that 18 * Autodetection depends on the fact that any interrupt that
19 * comes in on to an unassigned handler will get stuck with 19 * comes in on to an unassigned handler will get stuck with
20 * "IRQ_WAITING" cleared and the interrupt disabled. 20 * "IRQS_WAITING" cleared and the interrupt disabled.
21 */ 21 */
22static DEFINE_MUTEX(probing_active); 22static DEFINE_MUTEX(probing_active);
23 23
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
32{ 32{
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 unsigned long mask = 0; 34 unsigned long mask = 0;
35 unsigned int status;
36 int i; 35 int i;
37 36
38 /* 37 /*
@@ -46,20 +45,15 @@ unsigned long probe_irq_on(void)
46 */ 45 */
47 for_each_irq_desc_reverse(i, desc) { 46 for_each_irq_desc_reverse(i, desc) {
48 raw_spin_lock_irq(&desc->lock); 47 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 48 if (!desc->action && irq_settings_can_probe(desc)) {
50 /*
51 * An old-style architecture might still have
52 * the handle_bad_irq handler there:
53 */
54 compat_irq_chip_set_default_handler(desc);
55
56 /* 49 /*
57 * Some chips need to know about probing in 50 * Some chips need to know about probing in
58 * progress: 51 * progress:
59 */ 52 */
60 if (desc->chip->set_type) 53 if (desc->irq_data.chip->irq_set_type)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 desc->chip->startup(i); 55 IRQ_TYPE_PROBE);
56 irq_startup(desc);
63 } 57 }
64 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
65 } 59 }
@@ -74,10 +68,10 @@ unsigned long probe_irq_on(void)
74 */ 68 */
75 for_each_irq_desc_reverse(i, desc) { 69 for_each_irq_desc_reverse(i, desc) {
76 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
79 if (desc->chip->startup(i)) 73 if (irq_startup(desc))
80 desc->status |= IRQ_PENDING; 74 desc->istate |= IRQS_PENDING;
81 } 75 }
82 raw_spin_unlock_irq(&desc->lock); 76 raw_spin_unlock_irq(&desc->lock);
83 } 77 }
@@ -92,13 +86,12 @@ unsigned long probe_irq_on(void)
92 */ 86 */
93 for_each_irq_desc(i, desc) { 87 for_each_irq_desc(i, desc) {
94 raw_spin_lock_irq(&desc->lock); 88 raw_spin_lock_irq(&desc->lock);
95 status = desc->status;
96 89
97 if (status & IRQ_AUTODETECT) { 90 if (desc->istate & IRQS_AUTODETECT) {
98 /* It triggered already - consider it spurious. */ 91 /* It triggered already - consider it spurious. */
99 if (!(status & IRQ_WAITING)) { 92 if (!(desc->istate & IRQS_WAITING)) {
100 desc->status = status & ~IRQ_AUTODETECT; 93 desc->istate &= ~IRQS_AUTODETECT;
101 desc->chip->shutdown(i); 94 irq_shutdown(desc);
102 } else 95 } else
103 if (i < 32) 96 if (i < 32)
104 mask |= 1 << i; 97 mask |= 1 << i;
@@ -124,20 +117,18 @@ EXPORT_SYMBOL(probe_irq_on);
124 */ 117 */
125unsigned int probe_irq_mask(unsigned long val) 118unsigned int probe_irq_mask(unsigned long val)
126{ 119{
127 unsigned int status, mask = 0; 120 unsigned int mask = 0;
128 struct irq_desc *desc; 121 struct irq_desc *desc;
129 int i; 122 int i;
130 123
131 for_each_irq_desc(i, desc) { 124 for_each_irq_desc(i, desc) {
132 raw_spin_lock_irq(&desc->lock); 125 raw_spin_lock_irq(&desc->lock);
133 status = desc->status; 126 if (desc->istate & IRQS_AUTODETECT) {
134 127 if (i < 16 && !(desc->istate & IRQS_WAITING))
135 if (status & IRQ_AUTODETECT) {
136 if (i < 16 && !(status & IRQ_WAITING))
137 mask |= 1 << i; 128 mask |= 1 << i;
138 129
139 desc->status = status & ~IRQ_AUTODETECT; 130 desc->istate &= ~IRQS_AUTODETECT;
140 desc->chip->shutdown(i); 131 irq_shutdown(desc);
141 } 132 }
142 raw_spin_unlock_irq(&desc->lock); 133 raw_spin_unlock_irq(&desc->lock);
143 } 134 }
@@ -168,20 +159,18 @@ int probe_irq_off(unsigned long val)
168{ 159{
169 int i, irq_found = 0, nr_of_irqs = 0; 160 int i, irq_found = 0, nr_of_irqs = 0;
170 struct irq_desc *desc; 161 struct irq_desc *desc;
171 unsigned int status;
172 162
173 for_each_irq_desc(i, desc) { 163 for_each_irq_desc(i, desc) {
174 raw_spin_lock_irq(&desc->lock); 164 raw_spin_lock_irq(&desc->lock);
175 status = desc->status;
176 165
177 if (status & IRQ_AUTODETECT) { 166 if (desc->istate & IRQS_AUTODETECT) {
178 if (!(status & IRQ_WAITING)) { 167 if (!(desc->istate & IRQS_WAITING)) {
179 if (!nr_of_irqs) 168 if (!nr_of_irqs)
180 irq_found = i; 169 irq_found = i;
181 nr_of_irqs++; 170 nr_of_irqs++;
182 } 171 }
183 desc->status = status & ~IRQ_AUTODETECT; 172 desc->istate &= ~IRQS_AUTODETECT;
184 desc->chip->shutdown(i); 173 irq_shutdown(desc);
185 } 174 }
186 raw_spin_unlock_irq(&desc->lock); 175 raw_spin_unlock_irq(&desc->lock);
187 } 176 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b7091d5ca2f8..d5a3009da71a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,363 +18,217 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22{
23 struct irq_desc *desc;
24 unsigned long flags;
25
26 desc = irq_to_desc(irq);
27 if (!desc) {
28 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
29 return;
30 }
31
32 /* Ensure we don't have left over values from a previous use of this irq */
33 raw_spin_lock_irqsave(&desc->lock, flags);
34 desc->status = IRQ_DISABLED;
35 desc->chip = &no_irq_chip;
36 desc->handle_irq = handle_bad_irq;
37 desc->depth = 1;
38 desc->msi_desc = NULL;
39 desc->handler_data = NULL;
40 if (!keep_chip_data)
41 desc->chip_data = NULL;
42 desc->action = NULL;
43 desc->irq_count = 0;
44 desc->irqs_unhandled = 0;
45#ifdef CONFIG_SMP
46 cpumask_setall(desc->affinity);
47#ifdef CONFIG_GENERIC_PENDING_IRQ
48 cpumask_clear(desc->pending_mask);
49#endif
50#endif
51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52}
53
54/** 21/**
55 * dynamic_irq_init - initialize a dynamically allocated irq 22 * irq_set_chip - set the irq chip for an irq
56 * @irq: irq number to initialize
57 */
58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
75{
76 struct irq_desc *desc = irq_to_desc(irq);
77 unsigned long flags;
78
79 if (!desc) {
80 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
81 return;
82 }
83
84 raw_spin_lock_irqsave(&desc->lock, flags);
85 if (desc->action) {
86 raw_spin_unlock_irqrestore(&desc->lock, flags);
87 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
88 irq);
89 return;
90 }
91 desc->msi_desc = NULL;
92 desc->handler_data = NULL;
93 if (!keep_chip_data)
94 desc->chip_data = NULL;
95 desc->handle_irq = handle_bad_irq;
96 desc->chip = &no_irq_chip;
97 desc->name = NULL;
98 clear_kstat_irqs(desc);
99 raw_spin_unlock_irqrestore(&desc->lock, flags);
100}
101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
122
123/**
124 * set_irq_chip - set the irq chip for an irq
125 * @irq: irq number 23 * @irq: irq number
126 * @chip: pointer to irq chip description structure 24 * @chip: pointer to irq chip description structure
127 */ 25 */
128int set_irq_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
129{ 27{
130 struct irq_desc *desc = irq_to_desc(irq);
131 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
132 30
133 if (!desc) { 31 if (!desc)
134 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
135 return -EINVAL; 32 return -EINVAL;
136 }
137 33
138 if (!chip) 34 if (!chip)
139 chip = &no_irq_chip; 35 chip = &no_irq_chip;
140 36
141 raw_spin_lock_irqsave(&desc->lock, flags); 37 desc->irq_data.chip = chip;
142 irq_chip_set_defaults(chip); 38 irq_put_desc_unlock(desc, flags);
143 desc->chip = chip; 39 /*
144 raw_spin_unlock_irqrestore(&desc->lock, flags); 40 * For !CONFIG_SPARSE_IRQ make the irq show up in
145 41 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
42 * already marked, and this call is harmless.
43 */
44 irq_reserve_irq(irq);
146 return 0; 45 return 0;
147} 46}
148EXPORT_SYMBOL(set_irq_chip); 47EXPORT_SYMBOL(irq_set_chip);
149 48
150/** 49/**
151 * set_irq_type - set the irq trigger type for an irq 50 * irq_set_type - set the irq trigger type for an irq
152 * @irq: irq number 51 * @irq: irq number
153 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h 52 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
154 */ 53 */
155int set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
156{ 55{
157 struct irq_desc *desc = irq_to_desc(irq);
158 unsigned long flags; 56 unsigned long flags;
159 int ret = -ENXIO; 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
58 int ret = 0;
160 59
161 if (!desc) { 60 if (!desc)
162 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 61 return -EINVAL;
163 return -ENODEV;
164 }
165 62
166 type &= IRQ_TYPE_SENSE_MASK; 63 type &= IRQ_TYPE_SENSE_MASK;
167 if (type == IRQ_TYPE_NONE) 64 if (type != IRQ_TYPE_NONE)
168 return 0; 65 ret = __irq_set_trigger(desc, irq, type);
169 66 irq_put_desc_busunlock(desc, flags);
170 raw_spin_lock_irqsave(&desc->lock, flags);
171 ret = __irq_set_trigger(desc, irq, type);
172 raw_spin_unlock_irqrestore(&desc->lock, flags);
173 return ret; 67 return ret;
174} 68}
175EXPORT_SYMBOL(set_irq_type); 69EXPORT_SYMBOL(irq_set_irq_type);
176 70
177/** 71/**
178 * set_irq_data - set irq type data for an irq 72 * irq_set_handler_data - set irq handler data for an irq
179 * @irq: Interrupt number 73 * @irq: Interrupt number
180 * @data: Pointer to interrupt specific data 74 * @data: Pointer to interrupt specific data
181 * 75 *
182 * Set the hardware irq controller data for an irq 76 * Set the hardware irq controller data for an irq
183 */ 77 */
184int set_irq_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
185{ 79{
186 struct irq_desc *desc = irq_to_desc(irq);
187 unsigned long flags; 80 unsigned long flags;
81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
188 82
189 if (!desc) { 83 if (!desc)
190 printk(KERN_ERR
191 "Trying to install controller data for IRQ%d\n", irq);
192 return -EINVAL; 84 return -EINVAL;
193 } 85 desc->irq_data.handler_data = data;
194 86 irq_put_desc_unlock(desc, flags);
195 raw_spin_lock_irqsave(&desc->lock, flags);
196 desc->handler_data = data;
197 raw_spin_unlock_irqrestore(&desc->lock, flags);
198 return 0; 87 return 0;
199} 88}
200EXPORT_SYMBOL(set_irq_data); 89EXPORT_SYMBOL(irq_set_handler_data);
201 90
202/** 91/**
203 * set_irq_msi - set MSI descriptor data for an irq 92 * irq_set_msi_desc - set MSI descriptor data for an irq
204 * @irq: Interrupt number 93 * @irq: Interrupt number
205 * @entry: Pointer to MSI descriptor data 94 * @entry: Pointer to MSI descriptor data
206 * 95 *
207 * Set the MSI descriptor entry for an irq 96 * Set the MSI descriptor entry for an irq
208 */ 97 */
209int set_irq_msi(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
210{ 99{
211 struct irq_desc *desc = irq_to_desc(irq);
212 unsigned long flags; 100 unsigned long flags;
101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
213 102
214 if (!desc) { 103 if (!desc)
215 printk(KERN_ERR
216 "Trying to install msi data for IRQ%d\n", irq);
217 return -EINVAL; 104 return -EINVAL;
218 } 105 desc->irq_data.msi_desc = entry;
219
220 raw_spin_lock_irqsave(&desc->lock, flags);
221 desc->msi_desc = entry;
222 if (entry) 106 if (entry)
223 entry->irq = irq; 107 entry->irq = irq;
224 raw_spin_unlock_irqrestore(&desc->lock, flags); 108 irq_put_desc_unlock(desc, flags);
225 return 0; 109 return 0;
226} 110}
227 111
228/** 112/**
229 * set_irq_chip_data - set irq chip data for an irq 113 * irq_set_chip_data - set irq chip data for an irq
230 * @irq: Interrupt number 114 * @irq: Interrupt number
231 * @data: Pointer to chip specific data 115 * @data: Pointer to chip specific data
232 * 116 *
233 * Set the hardware irq chip data for an irq 117 * Set the hardware irq chip data for an irq
234 */ 118 */
235int set_irq_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
236{ 120{
237 struct irq_desc *desc = irq_to_desc(irq);
238 unsigned long flags; 121 unsigned long flags;
122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
239 123
240 if (!desc) { 124 if (!desc)
241 printk(KERN_ERR
242 "Trying to install chip data for IRQ%d\n", irq);
243 return -EINVAL;
244 }
245
246 if (!desc->chip) {
247 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
248 return -EINVAL; 125 return -EINVAL;
249 } 126 desc->irq_data.chip_data = data;
250 127 irq_put_desc_unlock(desc, flags);
251 raw_spin_lock_irqsave(&desc->lock, flags);
252 desc->chip_data = data;
253 raw_spin_unlock_irqrestore(&desc->lock, flags);
254
255 return 0; 128 return 0;
256} 129}
257EXPORT_SYMBOL(set_irq_chip_data); 130EXPORT_SYMBOL(irq_set_chip_data);
258 131
259/** 132struct irq_data *irq_get_irq_data(unsigned int irq)
260 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
261 *
262 * @irq: Interrupt number
263 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
264 *
265 * The IRQ_NESTED_THREAD flag indicates that on
266 * request_threaded_irq() no separate interrupt thread should be
267 * created for the irq as the handler are called nested in the
268 * context of a demultiplexing interrupt handler thread.
269 */
270void set_irq_nested_thread(unsigned int irq, int nest)
271{ 133{
272 struct irq_desc *desc = irq_to_desc(irq); 134 struct irq_desc *desc = irq_to_desc(irq);
273 unsigned long flags;
274 135
275 if (!desc) 136 return desc ? &desc->irq_data : NULL;
276 return; 137}
138EXPORT_SYMBOL_GPL(irq_get_irq_data);
277 139
278 raw_spin_lock_irqsave(&desc->lock, flags); 140static void irq_state_clr_disabled(struct irq_desc *desc)
279 if (nest) 141{
280 desc->status |= IRQ_NESTED_THREAD; 142 irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
281 else
282 desc->status &= ~IRQ_NESTED_THREAD;
283 raw_spin_unlock_irqrestore(&desc->lock, flags);
284} 143}
285EXPORT_SYMBOL_GPL(set_irq_nested_thread);
286 144
287/* 145static void irq_state_set_disabled(struct irq_desc *desc)
288 * default enable function
289 */
290static void default_enable(unsigned int irq)
291{ 146{
292 struct irq_desc *desc = irq_to_desc(irq); 147 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
148}
293 149
294 desc->chip->unmask(irq); 150static void irq_state_clr_masked(struct irq_desc *desc)
295 desc->status &= ~IRQ_MASKED; 151{
152 irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
296} 153}
297 154
298/* 155static void irq_state_set_masked(struct irq_desc *desc)
299 * default disable function
300 */
301static void default_disable(unsigned int irq)
302{ 156{
157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
303} 158}
304 159
305/* 160int irq_startup(struct irq_desc *desc)
306 * default startup function
307 */
308static unsigned int default_startup(unsigned int irq)
309{ 161{
310 struct irq_desc *desc = irq_to_desc(irq); 162 irq_state_clr_disabled(desc);
163 desc->depth = 0;
311 164
312 desc->chip->enable(irq); 165 if (desc->irq_data.chip->irq_startup) {
166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
167 irq_state_clr_masked(desc);
168 return ret;
169 }
170
171 irq_enable(desc);
313 return 0; 172 return 0;
314} 173}
315 174
316/* 175void irq_shutdown(struct irq_desc *desc)
317 * default shutdown function
318 */
319static void default_shutdown(unsigned int irq)
320{ 176{
321 struct irq_desc *desc = irq_to_desc(irq); 177 irq_state_set_disabled(desc);
178 desc->depth = 1;
179 if (desc->irq_data.chip->irq_shutdown)
180 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
181 if (desc->irq_data.chip->irq_disable)
182 desc->irq_data.chip->irq_disable(&desc->irq_data);
183 else
184 desc->irq_data.chip->irq_mask(&desc->irq_data);
185 irq_state_set_masked(desc);
186}
322 187
323 desc->chip->mask(irq); 188void irq_enable(struct irq_desc *desc)
324 desc->status |= IRQ_MASKED; 189{
190 irq_state_clr_disabled(desc);
191 if (desc->irq_data.chip->irq_enable)
192 desc->irq_data.chip->irq_enable(&desc->irq_data);
193 else
194 desc->irq_data.chip->irq_unmask(&desc->irq_data);
195 irq_state_clr_masked(desc);
325} 196}
326 197
327/* 198void irq_disable(struct irq_desc *desc)
328 * Fixup enable/disable function pointers
329 */
330void irq_chip_set_defaults(struct irq_chip *chip)
331{ 199{
332 if (!chip->enable) 200 irq_state_set_disabled(desc);
333 chip->enable = default_enable; 201 if (desc->irq_data.chip->irq_disable) {
334 if (!chip->disable) 202 desc->irq_data.chip->irq_disable(&desc->irq_data);
335 chip->disable = default_disable; 203 irq_state_set_masked(desc);
336 if (!chip->startup) 204 }
337 chip->startup = default_startup;
338 /*
339 * We use chip->disable, when the user provided its own. When
340 * we have default_disable set for chip->disable, then we need
341 * to use default_shutdown, otherwise the irq line is not
342 * disabled on free_irq():
343 */
344 if (!chip->shutdown)
345 chip->shutdown = chip->disable != default_disable ?
346 chip->disable : default_shutdown;
347 if (!chip->name)
348 chip->name = chip->typename;
349 if (!chip->end)
350 chip->end = dummy_irq_chip.end;
351} 205}
352 206
353static inline void mask_ack_irq(struct irq_desc *desc, int irq) 207static inline void mask_ack_irq(struct irq_desc *desc)
354{ 208{
355 if (desc->chip->mask_ack) 209 if (desc->irq_data.chip->irq_mask_ack)
356 desc->chip->mask_ack(irq); 210 desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
357 else { 211 else {
358 desc->chip->mask(irq); 212 desc->irq_data.chip->irq_mask(&desc->irq_data);
359 if (desc->chip->ack) 213 if (desc->irq_data.chip->irq_ack)
360 desc->chip->ack(irq); 214 desc->irq_data.chip->irq_ack(&desc->irq_data);
361 } 215 }
362 desc->status |= IRQ_MASKED; 216 irq_state_set_masked(desc);
363} 217}
364 218
365static inline void mask_irq(struct irq_desc *desc, int irq) 219void mask_irq(struct irq_desc *desc)
366{ 220{
367 if (desc->chip->mask) { 221 if (desc->irq_data.chip->irq_mask) {
368 desc->chip->mask(irq); 222 desc->irq_data.chip->irq_mask(&desc->irq_data);
369 desc->status |= IRQ_MASKED; 223 irq_state_set_masked(desc);
370 } 224 }
371} 225}
372 226
373static inline void unmask_irq(struct irq_desc *desc, int irq) 227void unmask_irq(struct irq_desc *desc)
374{ 228{
375 if (desc->chip->unmask) { 229 if (desc->irq_data.chip->irq_unmask) {
376 desc->chip->unmask(irq); 230 desc->irq_data.chip->irq_unmask(&desc->irq_data);
377 desc->status &= ~IRQ_MASKED; 231 irq_state_clr_masked(desc);
378 } 232 }
379} 233}
380 234
@@ -399,10 +253,10 @@ void handle_nested_irq(unsigned int irq)
399 kstat_incr_irqs_this_cpu(irq, desc); 253 kstat_incr_irqs_this_cpu(irq, desc);
400 254
401 action = desc->action; 255 action = desc->action;
402 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 256 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
403 goto out_unlock; 257 goto out_unlock;
404 258
405 desc->status |= IRQ_INPROGRESS; 259 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
406 raw_spin_unlock_irq(&desc->lock); 260 raw_spin_unlock_irq(&desc->lock);
407 261
408 action_ret = action->thread_fn(action->irq, action->dev_id); 262 action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -410,13 +264,20 @@ void handle_nested_irq(unsigned int irq)
410 note_interrupt(irq, desc, action_ret); 264 note_interrupt(irq, desc, action_ret);
411 265
412 raw_spin_lock_irq(&desc->lock); 266 raw_spin_lock_irq(&desc->lock);
413 desc->status &= ~IRQ_INPROGRESS; 267 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
414 268
415out_unlock: 269out_unlock:
416 raw_spin_unlock_irq(&desc->lock); 270 raw_spin_unlock_irq(&desc->lock);
417} 271}
418EXPORT_SYMBOL_GPL(handle_nested_irq); 272EXPORT_SYMBOL_GPL(handle_nested_irq);
419 273
274static bool irq_check_poll(struct irq_desc *desc)
275{
276 if (!(desc->istate & IRQS_POLL_INPROGRESS))
277 return false;
278 return irq_wait_for_poll(desc);
279}
280
420/** 281/**
421 * handle_simple_irq - Simple and software-decoded IRQs. 282 * handle_simple_irq - Simple and software-decoded IRQs.
422 * @irq: the interrupt number 283 * @irq: the interrupt number
@@ -432,32 +293,24 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
432void 293void
433handle_simple_irq(unsigned int irq, struct irq_desc *desc) 294handle_simple_irq(unsigned int irq, struct irq_desc *desc)
434{ 295{
435 struct irqaction *action;
436 irqreturn_t action_ret;
437
438 raw_spin_lock(&desc->lock); 296 raw_spin_lock(&desc->lock);
439 297
440 if (unlikely(desc->status & IRQ_INPROGRESS)) 298 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
441 goto out_unlock; 299 if (!irq_check_poll(desc))
442 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 300 goto out_unlock;
301
302 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
443 kstat_incr_irqs_this_cpu(irq, desc); 303 kstat_incr_irqs_this_cpu(irq, desc);
444 304
445 action = desc->action; 305 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
446 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
447 goto out_unlock; 306 goto out_unlock;
448 307
449 desc->status |= IRQ_INPROGRESS; 308 handle_irq_event(desc);
450 raw_spin_unlock(&desc->lock);
451 309
452 action_ret = handle_IRQ_event(irq, action);
453 if (!noirqdebug)
454 note_interrupt(irq, desc, action_ret);
455
456 raw_spin_lock(&desc->lock);
457 desc->status &= ~IRQ_INPROGRESS;
458out_unlock: 310out_unlock:
459 raw_spin_unlock(&desc->lock); 311 raw_spin_unlock(&desc->lock);
460} 312}
313EXPORT_SYMBOL_GPL(handle_simple_irq);
461 314
462/** 315/**
463 * handle_level_irq - Level type irq handler 316 * handle_level_irq - Level type irq handler
@@ -472,42 +325,42 @@ out_unlock:
472void 325void
473handle_level_irq(unsigned int irq, struct irq_desc *desc) 326handle_level_irq(unsigned int irq, struct irq_desc *desc)
474{ 327{
475 struct irqaction *action;
476 irqreturn_t action_ret;
477
478 raw_spin_lock(&desc->lock); 328 raw_spin_lock(&desc->lock);
479 mask_ack_irq(desc, irq); 329 mask_ack_irq(desc);
480 330
481 if (unlikely(desc->status & IRQ_INPROGRESS)) 331 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
482 goto out_unlock; 332 if (!irq_check_poll(desc))
483 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 333 goto out_unlock;
334
335 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
484 kstat_incr_irqs_this_cpu(irq, desc); 336 kstat_incr_irqs_this_cpu(irq, desc);
485 337
486 /* 338 /*
487 * If its disabled or no action available 339 * If its disabled or no action available
488 * keep it masked and get out of here 340 * keep it masked and get out of here
489 */ 341 */
490 action = desc->action; 342 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
491 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
492 goto out_unlock; 343 goto out_unlock;
493 344
494 desc->status |= IRQ_INPROGRESS; 345 handle_irq_event(desc);
495 raw_spin_unlock(&desc->lock);
496
497 action_ret = handle_IRQ_event(irq, action);
498 if (!noirqdebug)
499 note_interrupt(irq, desc, action_ret);
500
501 raw_spin_lock(&desc->lock);
502 desc->status &= ~IRQ_INPROGRESS;
503 346
504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) 347 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
505 unmask_irq(desc, irq); 348 unmask_irq(desc);
506out_unlock: 349out_unlock:
507 raw_spin_unlock(&desc->lock); 350 raw_spin_unlock(&desc->lock);
508} 351}
509EXPORT_SYMBOL_GPL(handle_level_irq); 352EXPORT_SYMBOL_GPL(handle_level_irq);
510 353
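handle_level_irq() and the other flow handlers in this file are installed per interrupt by the platform or driver that knows the trigger type. A hedged sketch of that wiring, using irq_set_chip_and_handler_name() and irq_modify_status() defined later in this file; my_map_level_irq and my_chip are illustrative names:

#include <linux/irq.h>

/* Sketch only: my_map_level_irq and my_chip are illustrative. */
static void my_map_level_irq(unsigned int irq, struct irq_chip *my_chip)
{
        irq_set_chip_and_handler_name(irq, my_chip, handle_level_irq, "level");
        /* Clear IRQ_NOREQUEST so drivers may request_irq() this line. */
        irq_modify_status(irq, IRQ_NOREQUEST, 0);
}
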
354#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
355static inline void preflow_handler(struct irq_desc *desc)
356{
357 if (desc->preflow_handler)
358 desc->preflow_handler(&desc->irq_data);
359}
360#else
361static inline void preflow_handler(struct irq_desc *desc) { }
362#endif
363
511/** 364/**
512 * handle_fasteoi_irq - irq handler for transparent controllers 365 * handle_fasteoi_irq - irq handler for transparent controllers
513 * @irq: the interrupt number 366 * @irq: the interrupt number
@@ -521,42 +374,40 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
521void 374void
522handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 375handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
523{ 376{
524 struct irqaction *action;
525 irqreturn_t action_ret;
526
527 raw_spin_lock(&desc->lock); 377 raw_spin_lock(&desc->lock);
528 378
529 if (unlikely(desc->status & IRQ_INPROGRESS)) 379 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
530 goto out; 380 if (!irq_check_poll(desc))
381 goto out;
531 382
532 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 383 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
533 kstat_incr_irqs_this_cpu(irq, desc); 384 kstat_incr_irqs_this_cpu(irq, desc);
534 385
535 /* 386 /*
536 * If its disabled or no action available 387 * If its disabled or no action available
537 * then mask it and get out of here: 388 * then mask it and get out of here:
538 */ 389 */
539 action = desc->action; 390 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 391 desc->istate |= IRQS_PENDING;
541 desc->status |= IRQ_PENDING; 392 mask_irq(desc);
542 mask_irq(desc, irq);
543 goto out; 393 goto out;
544 } 394 }
545 395
546 desc->status |= IRQ_INPROGRESS; 396 if (desc->istate & IRQS_ONESHOT)
547 desc->status &= ~IRQ_PENDING; 397 mask_irq(desc);
548 raw_spin_unlock(&desc->lock);
549
550 action_ret = handle_IRQ_event(irq, action);
551 if (!noirqdebug)
552 note_interrupt(irq, desc, action_ret);
553 398
554 raw_spin_lock(&desc->lock); 399 preflow_handler(desc);
555 desc->status &= ~IRQ_INPROGRESS; 400 handle_irq_event(desc);
556out:
557 desc->chip->eoi(irq);
558 401
402out_eoi:
403 desc->irq_data.chip->irq_eoi(&desc->irq_data);
404out_unlock:
559 raw_spin_unlock(&desc->lock); 405 raw_spin_unlock(&desc->lock);
406 return;
407out:
408 if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
409 goto out_eoi;
410 goto out_unlock;
560} 411}
561 412
562/** 413/**
@@ -565,7 +416,7 @@ out:
565 * @desc: the interrupt description structure for this irq 416 * @desc: the interrupt description structure for this irq
566 * 417 *
567 * Interrupt occures on the falling and/or rising edge of a hardware 418 * Interrupt occures on the falling and/or rising edge of a hardware
568 * signal. The occurence is latched into the irq controller hardware 419 * signal. The occurrence is latched into the irq controller hardware
569 * and must be acked in order to be reenabled. After the ack another 420 * and must be acked in order to be reenabled. After the ack another
570 * interrupt can happen on the same source even before the first one 421 * interrupt can happen on the same source even before the first one
571 * is handled by the associated event handler. If this happens it 422 * is handled by the associated event handler. If this happens it
@@ -580,34 +431,28 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
580{ 431{
581 raw_spin_lock(&desc->lock); 432 raw_spin_lock(&desc->lock);
582 433
583 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 434 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
584
585 /* 435 /*
586 * If we're currently running this IRQ, or its disabled, 436 * If we're currently running this IRQ, or its disabled,
587 * we shouldn't process the IRQ. Mark it pending, handle 437 * we shouldn't process the IRQ. Mark it pending, handle
588 * the necessary masking and go out 438 * the necessary masking and go out
589 */ 439 */
590 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 440 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
591 !desc->action)) { 441 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
592 desc->status |= (IRQ_PENDING | IRQ_MASKED); 442 if (!irq_check_poll(desc)) {
593 mask_ack_irq(desc, irq); 443 desc->istate |= IRQS_PENDING;
594 goto out_unlock; 444 mask_ack_irq(desc);
445 goto out_unlock;
446 }
595 } 447 }
596 kstat_incr_irqs_this_cpu(irq, desc); 448 kstat_incr_irqs_this_cpu(irq, desc);
597 449
598 /* Start handling the irq */ 450 /* Start handling the irq */
599 if (desc->chip->ack) 451 desc->irq_data.chip->irq_ack(&desc->irq_data);
600 desc->chip->ack(irq);
601
602 /* Mark the IRQ currently in progress.*/
603 desc->status |= IRQ_INPROGRESS;
604 452
605 do { 453 do {
606 struct irqaction *action = desc->action; 454 if (unlikely(!desc->action)) {
607 irqreturn_t action_ret; 455 mask_irq(desc);
608
609 if (unlikely(!action)) {
610 mask_irq(desc, irq);
611 goto out_unlock; 456 goto out_unlock;
612 } 457 }
613 458
@@ -616,26 +461,66 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
616 * one, we could have masked the irq. 461 * one, we could have masked the irq.
617 * Renable it, if it was not disabled in meantime. 462 * Renable it, if it was not disabled in meantime.
618 */ 463 */
619 if (unlikely((desc->status & 464 if (unlikely(desc->istate & IRQS_PENDING)) {
620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 465 if (!irqd_irq_disabled(&desc->irq_data) &&
621 (IRQ_PENDING | IRQ_MASKED))) { 466 irqd_irq_masked(&desc->irq_data))
622 unmask_irq(desc, irq); 467 unmask_irq(desc);
623 } 468 }
624 469
625 desc->status &= ~IRQ_PENDING; 470 handle_irq_event(desc);
626 raw_spin_unlock(&desc->lock);
627 action_ret = handle_IRQ_event(irq, action);
628 if (!noirqdebug)
629 note_interrupt(irq, desc, action_ret);
630 raw_spin_lock(&desc->lock);
631 471
632 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 472 } while ((desc->istate & IRQS_PENDING) &&
473 !irqd_irq_disabled(&desc->irq_data));
633 474
634 desc->status &= ~IRQ_INPROGRESS;
635out_unlock: 475out_unlock:
636 raw_spin_unlock(&desc->lock); 476 raw_spin_unlock(&desc->lock);
637} 477}
638 478
479#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
480/**
481 * handle_edge_eoi_irq - edge eoi type IRQ handler
482 * @irq: the interrupt number
483 * @desc: the interrupt description structure for this irq
484 *
485 * Similar to the above handle_edge_irq, but using eoi and w/o the
486 * mask/unmask logic.
487 */
488void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
489{
490 struct irq_chip *chip = irq_desc_get_chip(desc);
491
492 raw_spin_lock(&desc->lock);
493
494 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
495 /*
496 * If we're currently running this IRQ, or it's disabled,
497 * we shouldn't process the IRQ. Mark it pending, handle
498 * the necessary masking and go out
499 */
500 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
501 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
502 if (!irq_check_poll(desc)) {
503 desc->istate |= IRQS_PENDING;
504 goto out_eoi;
505 }
506 }
507 kstat_incr_irqs_this_cpu(irq, desc);
508
509 do {
510 if (unlikely(!desc->action))
511 goto out_eoi;
512
513 handle_irq_event(desc);
514
515 } while ((desc->istate & IRQS_PENDING) &&
516 !irqd_irq_disabled(&desc->irq_data));
517
518out_eoi:
519 chip->irq_eoi(&desc->irq_data);
520 raw_spin_unlock(&desc->lock);
521}
522#endif
523
639/** 524/**
640 * handle_percpu_irq - Per CPU local irq handler 525 * handle_percpu_irq - Per CPU local irq handler
641 * @irq: the interrupt number 526 * @irq: the interrupt number
@@ -646,115 +531,147 @@ out_unlock:
646void 531void
647handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 532handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
648{ 533{
649 irqreturn_t action_ret; 534 struct irq_chip *chip = irq_desc_get_chip(desc);
650 535
651 kstat_incr_irqs_this_cpu(irq, desc); 536 kstat_incr_irqs_this_cpu(irq, desc);
652 537
653 if (desc->chip->ack) 538 if (chip->irq_ack)
654 desc->chip->ack(irq); 539 chip->irq_ack(&desc->irq_data);
655 540
656 action_ret = handle_IRQ_event(irq, desc->action); 541 handle_irq_event_percpu(desc, desc->action);
657 if (!noirqdebug)
658 note_interrupt(irq, desc, action_ret);
659 542
660 if (desc->chip->eoi) 543 if (chip->irq_eoi)
661 desc->chip->eoi(irq); 544 chip->irq_eoi(&desc->irq_data);
662} 545}
663 546
664void 547void
665__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 548__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
666 const char *name) 549 const char *name)
667{ 550{
668 struct irq_desc *desc = irq_to_desc(irq);
669 unsigned long flags; 551 unsigned long flags;
552 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
670 553
671 if (!desc) { 554 if (!desc)
672 printk(KERN_ERR
673 "Trying to install type control for IRQ%d\n", irq);
674 return; 555 return;
675 }
676 556
677 if (!handle) 557 if (!handle) {
678 handle = handle_bad_irq; 558 handle = handle_bad_irq;
679 else if (desc->chip == &no_irq_chip) { 559 } else {
680 printk(KERN_WARNING "Trying to install %sinterrupt handler " 560 if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
681 "for IRQ%d\n", is_chained ? "chained " : "", irq); 561 goto out;
682 /*
683 * Some ARM implementations install a handler for really dumb
684 * interrupt hardware without setting an irq_chip. This worked
685 * with the ARM no_irq_chip but the check in setup_irq would
686 * prevent us to setup the interrupt at all. Switch it to
687 * dummy_irq_chip for easy transition.
688 */
689 desc->chip = &dummy_irq_chip;
690 } 562 }
691 563
692 chip_bus_lock(irq, desc);
693 raw_spin_lock_irqsave(&desc->lock, flags);
694
695 /* Uninstall? */ 564 /* Uninstall? */
696 if (handle == handle_bad_irq) { 565 if (handle == handle_bad_irq) {
697 if (desc->chip != &no_irq_chip) 566 if (desc->irq_data.chip != &no_irq_chip)
698 mask_ack_irq(desc, irq); 567 mask_ack_irq(desc);
699 desc->status |= IRQ_DISABLED; 568 irq_state_set_disabled(desc);
700 desc->depth = 1; 569 desc->depth = 1;
701 } 570 }
702 desc->handle_irq = handle; 571 desc->handle_irq = handle;
703 desc->name = name; 572 desc->name = name;
704 573
705 if (handle != handle_bad_irq && is_chained) { 574 if (handle != handle_bad_irq && is_chained) {
706 desc->status &= ~IRQ_DISABLED; 575 irq_settings_set_noprobe(desc);
707 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 576 irq_settings_set_norequest(desc);
708 desc->depth = 0; 577 irq_settings_set_nothread(desc);
709 desc->chip->startup(irq); 578 irq_startup(desc);
710 } 579 }
711 raw_spin_unlock_irqrestore(&desc->lock, flags); 580out:
712 chip_bus_sync_unlock(irq, desc); 581 irq_put_desc_busunlock(desc, flags);
713} 582}
714EXPORT_SYMBOL_GPL(__set_irq_handler); 583EXPORT_SYMBOL_GPL(__irq_set_handler);
715 584
716void 585void
717set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, 586irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
718 irq_flow_handler_t handle) 587 irq_flow_handler_t handle, const char *name)
719{ 588{
720 set_irq_chip(irq, chip); 589 irq_set_chip(irq, chip);
721 __set_irq_handler(irq, handle, 0, NULL); 590 __irq_set_handler(irq, handle, 0, name);
722} 591}
723 592
724void 593void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
725set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
726 irq_flow_handler_t handle, const char *name)
727{ 594{
728 set_irq_chip(irq, chip); 595 unsigned long flags;
729 __set_irq_handler(irq, handle, 0, name); 596 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
597
598 if (!desc)
599 return;
600 irq_settings_clr_and_set(desc, clr, set);
601
602 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
603 IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
604 if (irq_settings_has_no_balance_set(desc))
605 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
606 if (irq_settings_is_per_cpu(desc))
607 irqd_set(&desc->irq_data, IRQD_PER_CPU);
608 if (irq_settings_can_move_pcntxt(desc))
609 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
610 if (irq_settings_is_level(desc))
611 irqd_set(&desc->irq_data, IRQD_LEVEL);
612
613 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
614
615 irq_put_desc_unlock(desc, flags);
730} 616}
617EXPORT_SYMBOL_GPL(irq_modify_status);
731 618
732void set_irq_noprobe(unsigned int irq) 619/**
620 * irq_cpu_online - Invoke all irq_cpu_online functions.
621 *
622 * Iterate through all irqs and invoke the chip.irq_cpu_online()
623 * for each.
624 */
625void irq_cpu_online(void)
733{ 626{
734 struct irq_desc *desc = irq_to_desc(irq); 627 struct irq_desc *desc;
628 struct irq_chip *chip;
735 unsigned long flags; 629 unsigned long flags;
630 unsigned int irq;
736 631
737 if (!desc) { 632 for_each_active_irq(irq) {
738 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); 633 desc = irq_to_desc(irq);
739 return; 634 if (!desc)
740 } 635 continue;
636
637 raw_spin_lock_irqsave(&desc->lock, flags);
741 638
742 raw_spin_lock_irqsave(&desc->lock, flags); 639 chip = irq_data_get_irq_chip(&desc->irq_data);
743 desc->status |= IRQ_NOPROBE; 640 if (chip && chip->irq_cpu_online &&
744 raw_spin_unlock_irqrestore(&desc->lock, flags); 641 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
642 !irqd_irq_disabled(&desc->irq_data)))
643 chip->irq_cpu_online(&desc->irq_data);
644
645 raw_spin_unlock_irqrestore(&desc->lock, flags);
646 }
745} 647}
746 648
747void set_irq_probe(unsigned int irq) 649/**
650 * irq_cpu_offline - Invoke all irq_cpu_offline functions.
651 *
652 * Iterate through all irqs and invoke the chip.irq_cpu_offline()
653 * for each.
654 */
655void irq_cpu_offline(void)
748{ 656{
749 struct irq_desc *desc = irq_to_desc(irq); 657 struct irq_desc *desc;
658 struct irq_chip *chip;
750 unsigned long flags; 659 unsigned long flags;
660 unsigned int irq;
751 661
752 if (!desc) { 662 for_each_active_irq(irq) {
753 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); 663 desc = irq_to_desc(irq);
754 return; 664 if (!desc)
755 } 665 continue;
666
667 raw_spin_lock_irqsave(&desc->lock, flags);
756 668
757 raw_spin_lock_irqsave(&desc->lock, flags); 669 chip = irq_data_get_irq_chip(&desc->irq_data);
758 desc->status &= ~IRQ_NOPROBE; 670 if (chip && chip->irq_cpu_offline &&
759 raw_spin_unlock_irqrestore(&desc->lock, flags); 671 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
672 !irqd_irq_disabled(&desc->irq_data)))
673 chip->irq_cpu_offline(&desc->irq_data);
674
675 raw_spin_unlock_irqrestore(&desc->lock, flags);
676 }
760} 677}
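
The file above moves every chip callback from the old unsigned-int-irq prototypes (chip->mask(irq), chip->ack(irq)) to callbacks that receive a struct irq_data pointer. A minimal sketch of what a converted driver-side irq_chip looks like against this interface; the register layout and all my_*/MY_* names are placeholders, not from this commit:

#include <linux/io.h>
#include <linux/irq.h>

/* Illustrative register layout; not from this commit. */
#define MY_MASK_SET     0x00            /* write 1 to mask a line */
#define MY_MASK_CLR     0x04            /* write 1 to unmask a line */

static void __iomem *my_base;           /* mapped controller registers */

static void my_irq_mask(struct irq_data *d)
{
        writel(1 << (d->irq & 31), my_base + MY_MASK_SET);
}

static void my_irq_unmask(struct irq_data *d)
{
        writel(1 << (d->irq & 31), my_base + MY_MASK_CLR);
}

static struct irq_chip my_irq_chip = {
        .name           = "my-intc",
        .irq_mask       = my_irq_mask,
        .irq_unmask     = my_irq_unmask,
};
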
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 000000000000..97a8bfadc88a
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,45 @@
1/*
2 * Debugging printout:
3 */
4
5#include <linux/kallsyms.h>
6
7#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9/* FIXME */
10#define PD(f) do { } while (0)
11
12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
13{
14 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
15 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
16 printk("->handle_irq(): %p, ", desc->handle_irq);
17 print_symbol("%s\n", (unsigned long)desc->handle_irq);
18 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
19 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
20 printk("->action(): %p\n", desc->action);
21 if (desc->action) {
22 printk("->action->handler(): %p, ", desc->action->handler);
23 print_symbol("%s\n", (unsigned long)desc->action->handler);
24 }
25
26 P(IRQ_LEVEL);
27 P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST);
30 P(IRQ_NOTHREAD);
31 P(IRQ_NOAUTOEN);
32
33 PS(IRQS_AUTODETECT);
34 PS(IRQS_REPLAY);
35 PS(IRQS_WAITING);
36 PS(IRQS_PENDING);
37
38 PD(IRQS_INPROGRESS);
39 PD(IRQS_DISABLED);
40 PD(IRQS_MASKED);
41}
42
43#undef P
44#undef PS
45#undef PD
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
new file mode 100644
index 000000000000..b5fcd96c7102
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,59 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the dummy interrupt chip implementation
6 */
7#include <linux/interrupt.h>
8#include <linux/irq.h>
9
10#include "internals.h"
11
12/*
13 * What should we do if we get a hw irq event on an illegal vector?
14 * Each architecture has to answer this itself.
15 */
16static void ack_bad(struct irq_data *data)
17{
18 struct irq_desc *desc = irq_data_to_desc(data);
19
20 print_irq_desc(data->irq, desc);
21 ack_bad_irq(data->irq);
22}
23
24/*
25 * NOP functions
26 */
27static void noop(struct irq_data *data) { }
28
29static unsigned int noop_ret(struct irq_data *data)
30{
31 return 0;
32}
33
34/*
35 * Generic no controller implementation
36 */
37struct irq_chip no_irq_chip = {
38 .name = "none",
39 .irq_startup = noop_ret,
40 .irq_shutdown = noop,
41 .irq_enable = noop,
42 .irq_disable = noop,
43 .irq_ack = ack_bad,
44};
45
46/*
47 * Generic dummy implementation which can be used for
48 * real dumb interrupt sources
49 */
50struct irq_chip dummy_irq_chip = {
51 .name = "dummy",
52 .irq_startup = noop_ret,
53 .irq_shutdown = noop,
54 .irq_enable = noop,
55 .irq_disable = noop,
56 .irq_ack = noop,
57 .irq_mask = noop,
58 .irq_unmask = noop,
59};
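
dummy_irq_chip is intended for demultiplexed child interrupts whose "controller" has no mask/ack hardware of its own; the parent's chained flow handler does the real work and typically forwards each child via generic_handle_irq(). A hedged sketch of the child-side setup; my_setup_child is illustrative:

#include <linux/irq.h>

/* Sketch only: my_setup_child and child_irq are illustrative. */
static void my_setup_child(unsigned int child_irq)
{
        irq_set_chip_and_handler(child_irq, &dummy_irq_chip, handle_simple_irq);
        /* Allow request_irq() on the demultiplexed child line. */
        irq_modify_status(child_irq, IRQ_NOREQUEST, 0);
}
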
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000000..3a2cab407b93
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,368 @@
1/*
2 * Library implementing the most common irq chip callback functions
3 *
4 * Copyright (C) 2011, Thomas Gleixner
5 */
6#include <linux/io.h>
7#include <linux/irq.h>
8#include <linux/slab.h>
9#include <linux/interrupt.h>
10#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h>
12
13#include "internals.h"
14
15static LIST_HEAD(gc_list);
16static DEFINE_RAW_SPINLOCK(gc_lock);
17
18static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
19{
20 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
21}
22
23/**
24 * irq_gc_noop - NOOP function
25 * @d: irq_data
26 */
27void irq_gc_noop(struct irq_data *d)
28{
29}
30
31/**
32 * irq_gc_mask_disable_reg - Mask chip via disable register
33 * @d: irq_data
34 *
35 * Chip has separate enable/disable registers instead of a single mask
36 * register.
37 */
38void irq_gc_mask_disable_reg(struct irq_data *d)
39{
40 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
41 u32 mask = 1 << (d->irq - gc->irq_base);
42
43 irq_gc_lock(gc);
44 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable);
45 gc->mask_cache &= ~mask;
46 irq_gc_unlock(gc);
47}
48
49/**
50 * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
51 * @d: irq_data
52 *
53 * Chip has a single mask register. Values of this register are cached
54 * and protected by gc->lock
55 */
56void irq_gc_mask_set_bit(struct irq_data *d)
57{
58 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
59 u32 mask = 1 << (d->irq - gc->irq_base);
60
61 irq_gc_lock(gc);
62 gc->mask_cache |= mask;
63 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
64 irq_gc_unlock(gc);
65}
66
67/**
68 * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
69 * @d: irq_data
70 *
71 * Chip has a single mask register. Values of this register are cached
72 * and protected by gc->lock
73 */
74void irq_gc_mask_clr_bit(struct irq_data *d)
75{
76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
77 u32 mask = 1 << (d->irq - gc->irq_base);
78
79 irq_gc_lock(gc);
80 gc->mask_cache &= ~mask;
81 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
82 irq_gc_unlock(gc);
83}
84
85/**
86 * irq_gc_unmask_enable_reg - Unmask chip via enable register
87 * @d: irq_data
88 *
89 * Chip has separate enable/disable registers instead of a single mask
90 * register.
91 */
92void irq_gc_unmask_enable_reg(struct irq_data *d)
93{
94 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
95 u32 mask = 1 << (d->irq - gc->irq_base);
96
97 irq_gc_lock(gc);
98 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable);
99 gc->mask_cache |= mask;
100 irq_gc_unlock(gc);
101}
102
103/**
104 * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
105 * @d: irq_data
106 */
107void irq_gc_ack_set_bit(struct irq_data *d)
108{
109 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
110 u32 mask = 1 << (d->irq - gc->irq_base);
111
112 irq_gc_lock(gc);
113 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
114 irq_gc_unlock(gc);
115}
116
117/**
118 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
119 * @d: irq_data
120 */
121void irq_gc_ack_clr_bit(struct irq_data *d)
122{
123 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
124 u32 mask = ~(1 << (d->irq - gc->irq_base));
125
126 irq_gc_lock(gc);
127 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
128 irq_gc_unlock(gc);
129}
130
131/**
132 * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
133 * @d: irq_data
134 */
135void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
136{
137 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
138 u32 mask = 1 << (d->irq - gc->irq_base);
139
140 irq_gc_lock(gc);
141 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
143 irq_gc_unlock(gc);
144}
145
146/**
147 * irq_gc_eoi - EOI interrupt
148 * @d: irq_data
149 */
150void irq_gc_eoi(struct irq_data *d)
151{
152 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
153 u32 mask = 1 << (d->irq - gc->irq_base);
154
155 irq_gc_lock(gc);
156 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi);
157 irq_gc_unlock(gc);
158}
159
160/**
161 * irq_gc_set_wake - Set/clr wake bit for an interrupt
162 * @d: irq_data
163 *
164 * For chips where the wake from suspend functionality is not
165 * configured in a separate register and the wakeup active state is
166 * just stored in a bitmask.
167 */
168int irq_gc_set_wake(struct irq_data *d, unsigned int on)
169{
170 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
171 u32 mask = 1 << (d->irq - gc->irq_base);
172
173 if (!(mask & gc->wake_enabled))
174 return -EINVAL;
175
176 irq_gc_lock(gc);
177 if (on)
178 gc->wake_active |= mask;
179 else
180 gc->wake_active &= ~mask;
181 irq_gc_unlock(gc);
182 return 0;
183}
184
185/**
186 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
187 * @name: Name of the irq chip
188 * @num_ct: Number of irq_chip_type instances associated with this chip
189 * @irq_base: Interrupt base nr for this chip
190 * @reg_base: Register base address (virtual)
191 * @handler: Default flow handler associated with this chip
192 *
193 * Returns an initialized irq_chip_generic structure. The chip defaults
194 * to the primary (index 0) irq_chip_type and @handler
195 */
196struct irq_chip_generic *
197irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
198 void __iomem *reg_base, irq_flow_handler_t handler)
199{
200 struct irq_chip_generic *gc;
201 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
202
203 gc = kzalloc(sz, GFP_KERNEL);
204 if (gc) {
205 raw_spin_lock_init(&gc->lock);
206 gc->num_ct = num_ct;
207 gc->irq_base = irq_base;
208 gc->reg_base = reg_base;
209 gc->chip_types->chip.name = name;
210 gc->chip_types->handler = handler;
211 }
212 return gc;
213}
214
215/*
216 * Separate lockdep class for interrupt chip which can nest irq_desc
217 * lock.
218 */
219static struct lock_class_key irq_nested_lock_class;
220
221/**
222 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
223 * @gc: Generic irq chip holding all data
224 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
225 * @flags: Flags for initialization
226 * @clr: IRQ_* bits to clear
227 * @set: IRQ_* bits to set
228 *
229 * Set up max. 32 interrupts starting from gc->irq_base. Note, this
230 * initializes all interrupts to the primary irq_chip_type and its
231 * associated handler.
232 */
233void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
234 enum irq_gc_flags flags, unsigned int clr,
235 unsigned int set)
236{
237 struct irq_chip_type *ct = gc->chip_types;
238 unsigned int i;
239
240 raw_spin_lock(&gc_lock);
241 list_add_tail(&gc->list, &gc_list);
242 raw_spin_unlock(&gc_lock);
243
244 /* Init mask cache ? */
245 if (flags & IRQ_GC_INIT_MASK_CACHE)
246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
247
248 for (i = gc->irq_base; msk; msk >>= 1, i++) {
249 if (!(msk & 0x01))
250 continue;
251
252 if (flags & IRQ_GC_INIT_NESTED_LOCK)
253 irq_set_lockdep_class(i, &irq_nested_lock_class);
254
255 irq_set_chip_and_handler(i, &ct->chip, ct->handler);
256 irq_set_chip_data(i, gc);
257 irq_modify_status(i, clr, set);
258 }
259 gc->irq_cnt = i - gc->irq_base;
260}
261
262/**
263 * irq_setup_alt_chip - Switch to alternative chip
264 * @d: irq_data for this interrupt
265 * @type: Flow type to be initialized
266 *
267 * Only to be called from chip->irq_set_type() callbacks.
268 */
269int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
270{
271 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
272 struct irq_chip_type *ct = gc->chip_types;
273 unsigned int i;
274
275 for (i = 0; i < gc->num_ct; i++, ct++) {
276 if (ct->type & type) {
277 d->chip = &ct->chip;
278 irq_data_to_desc(d)->handle_irq = ct->handler;
279 return 0;
280 }
281 }
282 return -EINVAL;
283}
284
285/**
286 * irq_remove_generic_chip - Remove a chip
287 * @gc: Generic irq chip holding all data
288 * @msk: Bitmask holding the irqs to remove relative to gc->irq_base
289 * @clr: IRQ_* bits to clear
290 * @set: IRQ_* bits to set
291 *
292 * Remove up to 32 interrupts starting from gc->irq_base.
293 */
294void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
295 unsigned int clr, unsigned int set)
296{
297 unsigned int i = gc->irq_base;
298
299 raw_spin_lock(&gc_lock);
300 list_del(&gc->list);
301 raw_spin_unlock(&gc_lock);
302
303 for (; msk; msk >>= 1, i++) {
304 if (!(msk & 0x01))
305 continue;
306
307 /* Remove handler first. That will mask the irq line */
308 irq_set_handler(i, NULL);
309 irq_set_chip(i, &no_irq_chip);
310 irq_set_chip_data(i, NULL);
311 irq_modify_status(i, clr, set);
312 }
313}
314
315#ifdef CONFIG_PM
316static int irq_gc_suspend(void)
317{
318 struct irq_chip_generic *gc;
319
320 list_for_each_entry(gc, &gc_list, list) {
321 struct irq_chip_type *ct = gc->chip_types;
322
323 if (ct->chip.irq_suspend)
324 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
325 }
326 return 0;
327}
328
329static void irq_gc_resume(void)
330{
331 struct irq_chip_generic *gc;
332
333 list_for_each_entry(gc, &gc_list, list) {
334 struct irq_chip_type *ct = gc->chip_types;
335
336 if (ct->chip.irq_resume)
337 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
338 }
339}
340#else
341#define irq_gc_suspend NULL
342#define irq_gc_resume NULL
343#endif
344
345static void irq_gc_shutdown(void)
346{
347 struct irq_chip_generic *gc;
348
349 list_for_each_entry(gc, &gc_list, list) {
350 struct irq_chip_type *ct = gc->chip_types;
351
352 if (ct->chip.irq_pm_shutdown)
353 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
354 }
355}
356
357static struct syscore_ops irq_gc_syscore_ops = {
358 .suspend = irq_gc_suspend,
359 .resume = irq_gc_resume,
360 .shutdown = irq_gc_shutdown,
361};
362
363static int __init irq_gc_init_ops(void)
364{
365 register_syscore_ops(&irq_gc_syscore_ops);
366 return 0;
367}
368device_initcall(irq_gc_init_ops);
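
Putting the pieces of this new library together, a driver typically allocates one generic chip per register block, points the primary irq_chip_type at the irq_gc_* helpers that match its registers, and then sets up the whole block. A sketch under assumed hardware (a single 32-bit mask register plus a write-to-ack register); the register offsets and my_* names are illustrative only:

#include <linux/init.h>
#include <linux/irq.h>

/* Illustrative hardware: one 32-bit mask register and one ack register. */
static void __init my_init_intc(void __iomem *reg_base, unsigned int irq_base)
{
        struct irq_chip_generic *gc;
        struct irq_chip_type *ct;

        gc = irq_alloc_generic_chip("my-intc", 1, irq_base, reg_base,
                                    handle_level_irq);
        if (!gc)
                return;

        ct = gc->chip_types;
        ct->chip.irq_mask       = irq_gc_mask_set_bit;
        ct->chip.irq_unmask     = irq_gc_mask_clr_bit;
        ct->chip.irq_ack        = irq_gc_ack_set_bit;
        ct->regs.mask           = 0x10;         /* illustrative offsets */
        ct->regs.ack            = 0x14;

        /* 32 interrupts, prime the mask cache, make the lines requestable */
        irq_setup_generic_chip(gc, 0xffffffff, IRQ_GC_INIT_MASK_CACHE,
                               IRQ_NOREQUEST, 0);
}
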
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27e5c6911223..470d08c82bbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,24 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/random.h> 14#include <linux/random.h>
15#include <linux/sched.h>
18#include <linux/interrupt.h> 16#include <linux/interrupt.h>
19#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 18
21#include <linux/hash.h>
22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 19#include <trace/events/irq.h>
24 20
25#include "internals.h" 21#include "internals.h"
26 22
27/*
28 * lockdep: we want to handle all irq_desc locks as a single lock-class:
29 */
30struct lock_class_key irq_desc_lock_class;
31
32/** 23/**
33 * handle_bad_irq - handle spurious and unhandled irqs 24 * handle_bad_irq - handle spurious and unhandled irqs
34 * @irq: the interrupt number 25 * @irq: the interrupt number
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
43 ack_bad_irq(irq); 34 ack_bad_irq(irq);
44} 35}
45 36
46#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
47static void __init init_irq_default_affinity(void)
48{
49 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
50 cpumask_setall(irq_default_affinity);
51}
52#else
53static void __init init_irq_default_affinity(void)
54{
55}
56#endif
57
58/*
59 * Linux has a controller-independent interrupt architecture.
60 * Every controller has a 'controller-template', that is used
61 * by the main code to do the right thing. Each driver-visible
62 * interrupt source is transparently wired to the appropriate
63 * controller. Thus drivers need not be aware of the
64 * interrupt-controller.
65 *
66 * The code is designed to be easily extended with new/different
67 * interrupt controllers, without having to do assembly magic or
68 * having to touch the generic code.
69 *
70 * Controller mappings for all interrupt sources:
71 */
72int nr_irqs = NR_IRQS;
73EXPORT_SYMBOL_GPL(nr_irqs);
74
75#ifdef CONFIG_SPARSE_IRQ
76
77static struct irq_desc irq_desc_init = {
78 .irq = -1,
79 .status = IRQ_DISABLED,
80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq,
82 .depth = 1,
83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84};
85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{
88 void *ptr;
89
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92
93 /*
94 * don't overwite if can not get new one
95 * init_copy_kstat_irqs() could still use old one
96 */
97 if (ptr) {
98 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
99 desc->kstat_irqs = ptr;
100 }
101}
102
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106
107 raw_spin_lock_init(&desc->lock);
108 desc->irq = irq;
109#ifdef CONFIG_SMP
110 desc->node = node;
111#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1);
117 }
118 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1);
121 }
122 init_desc_masks(desc);
123 arch_init_chip_data(desc, node);
124}
125
126/*
127 * Protect the sparse_irqs:
128 */
129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
130
131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
151
152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
153 [0 ... NR_IRQS_LEGACY-1] = {
154 .irq = -1,
155 .status = IRQ_DISABLED,
156 .chip = &no_irq_chip,
157 .handle_irq = handle_bad_irq,
158 .depth = 1,
159 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
160 }
161};
162
163static unsigned int *kstat_irqs_legacy;
164
165int __init early_irq_init(void)
166{
167 struct irq_desc *desc;
168 int legacy_count;
169 int node;
170 int i;
171
172 init_irq_default_affinity();
173
174 /* initialize nr_irqs based on nr_cpu_ids */
175 arch_probe_nr_irqs();
176 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
177
178 desc = irq_desc_legacy;
179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
180 node = first_online_node;
181
182 /* allocate based on nr_cpu_ids */
183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
184 sizeof(int), GFP_NOWAIT, node);
185
186 for (i = 0; i < legacy_count; i++) {
187 desc[i].irq = i;
188#ifdef CONFIG_SMP
189 desc[i].node = node;
190#endif
191 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
193 alloc_desc_masks(&desc[i], node, true);
194 init_desc_masks(&desc[i]);
195 set_irq_desc(i, &desc[i]);
196 }
197
198 return arch_early_irq_init();
199}
200
201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
202{
203 struct irq_desc *desc;
204 unsigned long flags;
205
206 if (irq >= nr_irqs) {
207 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
208 irq, nr_irqs);
209 return NULL;
210 }
211
212 desc = irq_to_desc(irq);
213 if (desc)
214 return desc;
215
216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
217
218 /* We have to check it to avoid races with another CPU */
219 desc = irq_to_desc(irq);
220 if (desc)
221 goto out_unlock;
222
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224
225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
226 if (!desc) {
227 printk(KERN_ERR "can not alloc irq_desc\n");
228 BUG_ON(1);
229 }
230 init_one_irq_desc(irq, desc, node);
231
232 set_irq_desc(irq, desc);
233
234out_unlock:
235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
236
237 return desc;
238}
239
240#else /* !CONFIG_SPARSE_IRQ */
241
242struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
243 [0 ... NR_IRQS-1] = {
244 .status = IRQ_DISABLED,
245 .chip = &no_irq_chip,
246 .handle_irq = handle_bad_irq,
247 .depth = 1,
248 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
249 }
250};
251
252static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
253int __init early_irq_init(void)
254{
255 struct irq_desc *desc;
256 int count;
257 int i;
258
259 init_irq_default_affinity();
260
261 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
262
263 desc = irq_desc;
264 count = ARRAY_SIZE(irq_desc);
265
266 for (i = 0; i < count; i++) {
267 desc[i].irq = i;
268 alloc_desc_masks(&desc[i], 0, true);
269 init_desc_masks(&desc[i]);
270 desc[i].kstat_irqs = kstat_irqs_all[i];
271 }
272 return arch_early_irq_init();
273}
274
275struct irq_desc *irq_to_desc(unsigned int irq)
276{
277 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
278}
279
280struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
281{
282 return irq_to_desc(irq);
283}
284#endif /* !CONFIG_SPARSE_IRQ */
285
286void clear_kstat_irqs(struct irq_desc *desc)
287{
288 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
289}
290
291/*
292 * What should we do if we get a hw irq event on an illegal vector?
293 * Each architecture has to answer this themself.
294 */
295static void ack_bad(unsigned int irq)
296{
297 struct irq_desc *desc = irq_to_desc(irq);
298
299 print_irq_desc(irq, desc);
300 ack_bad_irq(irq);
301}
302
303/*
304 * NOP functions
305 */
306static void noop(unsigned int irq)
307{
308}
309
310static unsigned int noop_ret(unsigned int irq)
311{
312 return 0;
313}
314
315/*
316 * Generic no controller implementation
317 */
318struct irq_chip no_irq_chip = {
319 .name = "none",
320 .startup = noop_ret,
321 .shutdown = noop,
322 .enable = noop,
323 .disable = noop,
324 .ack = ack_bad,
325 .end = noop,
326};
327
328/*
329 * Generic dummy implementation which can be used for
330 * real dumb interrupt sources
331 */
332struct irq_chip dummy_irq_chip = {
333 .name = "dummy",
334 .startup = noop_ret,
335 .shutdown = noop,
336 .enable = noop,
337 .disable = noop,
338 .ack = noop,
339 .mask = noop,
340 .unmask = noop,
341 .end = noop,
342};
343
344/* 37/*
345 * Special, empty irq handler: 38 * Special, empty irq handler:
346 */ 39 */
@@ -358,31 +51,87 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
358 "but no thread function available.", irq, action->name); 51 "but no thread function available.", irq, action->name);
359} 52}
360 53
361/** 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
362 * handle_IRQ_event - irq action chain handler 55{
363 * @irq: the interrupt number 56 /*
364 * @action: the interrupt action chain for this irq 57 * Wake up the handler thread for this action. In case the
365 * 58 * thread crashed and was killed we just pretend that we
366 * Handles the action chain of an irq event 59 * handled the interrupt. The hardirq handler has disabled the
367 */ 60 * device interrupt, so no irq storm is lurking. If the
368irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) 61 * RUNTHREAD bit is already set, nothing to do.
62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return;
66
67 /*
68 * It's safe to OR the mask lockless here. We have only two
69 * places which write to threads_oneshot: This code and the
70 * irq thread.
71 *
72 * This code is the hard irq context and can never run on two
73 * cpus in parallel. If it ever does we have more serious
74 * problems than this bitmask.
75 *
76 * The irq threads of this irq which clear their "running" bit
77 * in threads_oneshot are serialized via desc->lock against
78 * each other and they are serialized against this code by
79 * IRQS_INPROGRESS.
80 *
81 * Hard irq handler:
82 *
83 * spin_lock(desc->lock);
84 * desc->state |= IRQS_INPROGRESS;
85 * spin_unlock(desc->lock);
86 * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
87 * desc->threads_oneshot |= mask;
88 * spin_lock(desc->lock);
89 * desc->state &= ~IRQS_INPROGRESS;
90 * spin_unlock(desc->lock);
91 *
92 * irq thread:
93 *
94 * again:
95 * spin_lock(desc->lock);
96 * if (desc->state & IRQS_INPROGRESS) {
97 * spin_unlock(desc->lock);
98 * while(desc->state & IRQS_INPROGRESS)
99 * cpu_relax();
100 * goto again;
101 * }
102 * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
103 * desc->threads_oneshot &= ~mask;
104 * spin_unlock(desc->lock);
105 *
106 * So either the thread waits for us to clear IRQS_INPROGRESS
107 * or we are waiting in the flow handler for desc->lock to be
108 * released before we reach this point. The thread also checks
109 * IRQTF_RUNTHREAD under desc->lock. If set it leaves
110 * threads_oneshot untouched and runs the thread another time.
111 */
112 desc->threads_oneshot |= action->thread_mask;
113 wake_up_process(action->thread);
114}
115
116irqreturn_t
117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
369{ 118{
370 irqreturn_t ret, retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
371 unsigned int status = 0; 120 unsigned int random = 0, irq = desc->irq_data.irq;
372 121
373 do { 122 do {
123 irqreturn_t res;
124
374 trace_irq_handler_entry(irq, action); 125 trace_irq_handler_entry(irq, action);
375 ret = action->handler(irq, action->dev_id); 126 res = action->handler(irq, action->dev_id);
376 trace_irq_handler_exit(irq, action, ret); 127 trace_irq_handler_exit(irq, action, res);
377 128
378 switch (ret) { 129 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
379 case IRQ_WAKE_THREAD: 130 irq, action->handler))
380 /* 131 local_irq_disable();
381 * Set result to handled so the spurious check
382 * does not trigger.
383 */
384 ret = IRQ_HANDLED;
385 132
133 switch (res) {
134 case IRQ_WAKE_THREAD:
386 /* 135 /*
387 * Catch drivers which return WAKE_THREAD but 136 * Catch drivers which return WAKE_THREAD but
388 * did not set up a thread function 137 * did not set up a thread function
@@ -392,165 +141,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
392 break; 141 break;
393 } 142 }
394 143
395 /* 144 irq_wake_thread(desc, action);
396 * Wake up the handler thread for this
397 * action. In case the thread crashed and was
398 * killed we just pretend that we handled the
399 * interrupt. The hardirq handler above has
400 * disabled the device interrupt, so no irq
401 * storm is lurking.
402 */
403 if (likely(!test_bit(IRQTF_DIED,
404 &action->thread_flags))) {
405 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
406 wake_up_process(action->thread);
407 }
408 145
409 /* Fall through to add to randomness */ 146 /* Fall through to add to randomness */
410 case IRQ_HANDLED: 147 case IRQ_HANDLED:
411 status |= action->flags; 148 random |= action->flags;
412 break; 149 break;
413 150
414 default: 151 default:
415 break; 152 break;
416 } 153 }
417 154
418 retval |= ret; 155 retval |= res;
419 action = action->next; 156 action = action->next;
420 } while (action); 157 } while (action);
421 158
422 if (status & IRQF_SAMPLE_RANDOM) 159 if (random & IRQF_SAMPLE_RANDOM)
423 add_interrupt_randomness(irq); 160 add_interrupt_randomness(irq);
424 local_irq_disable();
425 161
162 if (!noirqdebug)
163 note_interrupt(irq, desc, retval);
426 return retval; 164 return retval;
427} 165}
428 166
429#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ 167irqreturn_t handle_irq_event(struct irq_desc *desc)
430
431#ifdef CONFIG_ENABLE_WARN_DEPRECATED
432# warning __do_IRQ is deprecated. Please convert to proper flow handlers
433#endif
434
435/**
436 * __do_IRQ - original all in one highlevel IRQ handler
437 * @irq: the interrupt number
438 *
439 * __do_IRQ handles all normal device IRQ's (the special
440 * SMP cross-CPU interrupts have their own specific
441 * handlers).
442 *
443 * This is the original x86 implementation which is used for every
444 * interrupt type.
445 */
446unsigned int __do_IRQ(unsigned int irq)
447{ 168{
448 struct irq_desc *desc = irq_to_desc(irq); 169 struct irqaction *action = desc->action;
449 struct irqaction *action; 170 irqreturn_t ret;
450 unsigned int status;
451
452 kstat_incr_irqs_this_cpu(irq, desc);
453
454 if (CHECK_IRQ_PER_CPU(desc->status)) {
455 irqreturn_t action_ret;
456
457 /*
458 * No locking required for CPU-local interrupts:
459 */
460 if (desc->chip->ack)
461 desc->chip->ack(irq);
462 if (likely(!(desc->status & IRQ_DISABLED))) {
463 action_ret = handle_IRQ_event(irq, desc->action);
464 if (!noirqdebug)
465 note_interrupt(irq, desc, action_ret);
466 }
467 desc->chip->end(irq);
468 return 1;
469 }
470
471 raw_spin_lock(&desc->lock);
472 if (desc->chip->ack)
473 desc->chip->ack(irq);
474 /*
475 * REPLAY is when Linux resends an IRQ that was dropped earlier
476 * WAITING is used by probe to mark irqs that are being tested
477 */
478 status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
479 status |= IRQ_PENDING; /* we _want_ to handle it */
480
481 /*
482 * If the IRQ is disabled for whatever reason, we cannot
483 * use the action we have.
484 */
485 action = NULL;
486 if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
487 action = desc->action;
488 status &= ~IRQ_PENDING; /* we commit to handling */
489 status |= IRQ_INPROGRESS; /* we are handling it */
490 }
491 desc->status = status;
492 171
493 /* 172 desc->istate &= ~IRQS_PENDING;
494 * If there is no IRQ handler or it was disabled, exit early. 173 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
495 * Since we set PENDING, if another processor is handling
496 * a different instance of this same irq, the other processor
497 * will take care of it.
498 */
499 if (unlikely(!action))
500 goto out;
501
502 /*
503 * Edge triggered interrupts need to remember
504 * pending events.
505 * This applies to any hw interrupts that allow a second
506 * instance of the same irq to arrive while we are in do_IRQ
507 * or in the handler. But the code here only handles the _second_
508 * instance of the irq, not the third or fourth. So it is mostly
509 * useful for irq hardware that does not mask cleanly in an
510 * SMP environment.
511 */
512 for (;;) {
513 irqreturn_t action_ret;
514
515 raw_spin_unlock(&desc->lock);
516
517 action_ret = handle_IRQ_event(irq, action);
518 if (!noirqdebug)
519 note_interrupt(irq, desc, action_ret);
520
521 raw_spin_lock(&desc->lock);
522 if (likely(!(desc->status & IRQ_PENDING)))
523 break;
524 desc->status &= ~IRQ_PENDING;
525 }
526 desc->status &= ~IRQ_INPROGRESS;
527
528out:
529 /*
530 * The ->end() handler has to deal with interrupts which got
531 * disabled while the handler was running.
532 */
533 desc->chip->end(irq);
534 raw_spin_unlock(&desc->lock); 174 raw_spin_unlock(&desc->lock);
535 175
536 return 1; 176 ret = handle_irq_event_percpu(desc, action);
537}
538#endif
539
540void early_init_irq_lock_class(void)
541{
542 struct irq_desc *desc;
543 int i;
544
545 for_each_irq_desc(i, desc) {
546 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
547 }
548}
549 177
550unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 178 raw_spin_lock(&desc->lock);
551{ 179 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
552 struct irq_desc *desc = irq_to_desc(irq); 180 return ret;
553 return desc ? desc->kstat_irqs[cpu] : 0;
554} 181}
555EXPORT_SYMBOL(kstat_irqs_cpu);
556
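handle_irq_event_percpu() above is also where an IRQ_WAKE_THREAD return is turned into a wakeup of the handler thread via irq_wake_thread(). For context, a hedged sketch of the driver side that exercises this path through request_threaded_irq(); struct my_dev and the my_* helpers are illustrative stubs, not from this commit:

#include <linux/interrupt.h>

struct my_dev { void __iomem *regs; };                          /* illustrative */

static bool my_quick_check(struct my_dev *dev) { return true; } /* stub */
static void my_mask_device(struct my_dev *dev) { }              /* stub */
static void my_slow_work(struct my_dev *dev)   { }              /* stub */

/* Primary handler: runs in hard irq context and must not sleep. */
static irqreturn_t my_hardirq(int irq, void *dev_id)
{
        struct my_dev *dev = dev_id;

        if (!my_quick_check(dev))
                return IRQ_NONE;                /* not our interrupt */

        my_mask_device(dev);                    /* silence the source */
        return IRQ_WAKE_THREAD;                 /* defer to my_thread_fn() */
}

/* Threaded handler: runs in process context and may sleep. */
static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
        my_slow_work(dev_id);
        return IRQ_HANDLED;
}

static int my_request(unsigned int irq, struct my_dev *dev)
{
        return request_threaded_irq(irq, my_hardirq, my_thread_fn, 0,
                                    "my-dev", dev);
}
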
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c63f3bc88f0b..6546431447d7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,95 +1,171 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 *
4 * Do not ever include this file from anything else than
5 * kernel/irq/. Do not even think about using any information outside
6 * of this file for your non core code.
3 */ 7 */
8#include <linux/irqdesc.h>
9
10#ifdef CONFIG_SPARSE_IRQ
11# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
12#else
13# define IRQ_BITMAP_BITS NR_IRQS
14#endif
15
16#define istate core_internal_state__do_not_mess_with_it
4 17
5extern int noirqdebug; 18extern int noirqdebug;
6 19
7/* Set default functions for irq_chip structures: */ 20/*
8extern void irq_chip_set_defaults(struct irq_chip *chip); 21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
26 * IRQTF_FORCED_THREAD - irq action is force threaded
27 */
28enum {
29 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
31 IRQTF_WARNED,
32 IRQTF_AFFINITY,
33 IRQTF_FORCED_THREAD,
34};
9 35
10/* Set default handler: */ 36/*
11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); 37 * Bit masks for desc->state
38 *
39 * IRQS_AUTODETECT - autodetection in progress
40 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
41 * detection
42 * IRQS_POLL_INPROGRESS - polling in progress
43 * IRQS_ONESHOT - irq is not unmasked in primary handler
44 * IRQS_REPLAY - irq is replayed
45 * IRQS_WAITING - irq is waiting
46 * IRQS_PENDING - irq is pending and replayed later
47 * IRQS_SUSPENDED - irq is suspended
48 */
49enum {
50 IRQS_AUTODETECT = 0x00000001,
51 IRQS_SPURIOUS_DISABLED = 0x00000002,
52 IRQS_POLL_INPROGRESS = 0x00000008,
53 IRQS_ONESHOT = 0x00000020,
54 IRQS_REPLAY = 0x00000040,
55 IRQS_WAITING = 0x00000080,
56 IRQS_PENDING = 0x00000200,
57 IRQS_SUSPENDED = 0x00000800,
58};
59
60#include "debug.h"
61#include "settings.h"
62
63#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
12 64
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 65extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags); 66 unsigned long flags);
15extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 69
18extern struct lock_class_key irq_desc_lock_class; 70extern int irq_startup(struct irq_desc *desc);
71extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc);
74extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc);
76
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 77extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock;
22 78
23#ifdef CONFIG_SPARSE_IRQ 79irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
24void replace_irq_desc(unsigned int irq, struct irq_desc *desc); 80irqreturn_t handle_irq_event(struct irq_desc *desc);
25#endif 81
82/* Resending of interrupts :*/
83void check_irq_resend(struct irq_desc *desc, unsigned int irq);
84bool irq_wait_for_poll(struct irq_desc *desc);
26 85
27#ifdef CONFIG_PROC_FS 86#ifdef CONFIG_PROC_FS
28extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 87extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
88extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
29extern void register_handler_proc(unsigned int irq, struct irqaction *action); 89extern void register_handler_proc(unsigned int irq, struct irqaction *action);
30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); 90extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
31#else 91#else
32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } 92static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
93static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void register_handler_proc(unsigned int irq, 94static inline void register_handler_proc(unsigned int irq,
34 struct irqaction *action) { } 95 struct irqaction *action) { }
35static inline void unregister_handler_proc(unsigned int irq, 96static inline void unregister_handler_proc(unsigned int irq,
36 struct irqaction *action) { } 97 struct irqaction *action) { }
37#endif 98#endif
38 99
39extern int irq_select_affinity_usr(unsigned int irq); 100extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
40 101
41extern void irq_set_thread_affinity(struct irq_desc *desc); 102extern void irq_set_thread_affinity(struct irq_desc *desc);
42 103
43/* Inline functions for support of irq chips on slow busses */ 104/* Inline functions for support of irq chips on slow busses */
44static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) 105static inline void chip_bus_lock(struct irq_desc *desc)
106{
107 if (unlikely(desc->irq_data.chip->irq_bus_lock))
108 desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
109}
110
111static inline void chip_bus_sync_unlock(struct irq_desc *desc)
112{
113 if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
115}
116
117struct irq_desc *
118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
120
121static inline struct irq_desc *
122irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
123{
124 return __irq_get_desc_lock(irq, flags, true);
125}
126
127static inline void
128irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
45{ 129{
46 if (unlikely(desc->chip->bus_lock)) 130 __irq_put_desc_unlock(desc, flags, true);
47 desc->chip->bus_lock(irq);
48} 131}
49 132
50static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) 133static inline struct irq_desc *
134irq_get_desc_lock(unsigned int irq, unsigned long *flags)
51{ 135{
52 if (unlikely(desc->chip->bus_sync_unlock)) 136 return __irq_get_desc_lock(irq, flags, false);
53 desc->chip->bus_sync_unlock(irq); 137}
138
139static inline void
140irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
141{
142 __irq_put_desc_unlock(desc, flags, false);
54} 143}
55 144
56/* 145/*
57 * Debugging printout: 146 * Manipulation functions for irq_data.state
58 */ 147 */
148static inline void irqd_set_move_pending(struct irq_data *d)
149{
150 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
151}
59 152
60#include <linux/kallsyms.h> 153static inline void irqd_clr_move_pending(struct irq_data *d)
61 154{
62#define P(f) if (desc->status & f) printk("%14s set\n", #f) 155 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
156}
63 157
64static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 158static inline void irqd_clear(struct irq_data *d, unsigned int mask)
65{ 159{
66 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 160 d->state_use_accessors &= ~mask;
67 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
68 printk("->handle_irq(): %p, ", desc->handle_irq);
69 print_symbol("%s\n", (unsigned long)desc->handle_irq);
70 printk("->chip(): %p, ", desc->chip);
71 print_symbol("%s\n", (unsigned long)desc->chip);
72 printk("->action(): %p\n", desc->action);
73 if (desc->action) {
74 printk("->action->handler(): %p, ", desc->action->handler);
75 print_symbol("%s\n", (unsigned long)desc->action->handler);
76 }
77
78 P(IRQ_INPROGRESS);
79 P(IRQ_DISABLED);
80 P(IRQ_PENDING);
81 P(IRQ_REPLAY);
82 P(IRQ_AUTODETECT);
83 P(IRQ_WAITING);
84 P(IRQ_LEVEL);
85 P(IRQ_MASKED);
86#ifdef CONFIG_IRQ_PER_CPU
87 P(IRQ_PER_CPU);
88#endif
89 P(IRQ_NOPROBE);
90 P(IRQ_NOREQUEST);
91 P(IRQ_NOAUTOEN);
92} 161}
93 162
94#undef P 163static inline void irqd_set(struct irq_data *d, unsigned int mask)
164{
165 d->state_use_accessors |= mask;
166}
95 167
168static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
169{
170 return d->state_use_accessors & mask;
171}
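
The irq_get_desc_*/irq_put_desc_* helpers above bracket every setter in this directory: look up the descriptor, take the (optionally bus-) lock, make the change, and drop everything in reverse order. A sketch of the pattern as it would appear in another kernel/irq/ file; my_irq_set_quirk is illustrative, and these helpers are internal, not usable outside kernel/irq/:

#include "internals.h"

static void my_irq_set_quirk(unsigned int irq)
{
        unsigned long flags;
        struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);

        if (!desc)
                return;

        /* desc->lock is held here; the chip's bus lock was taken if needed */
        irqd_set(&desc->irq_data, IRQD_LEVEL);

        irq_put_desc_busunlock(desc, flags);
}
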
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
new file mode 100644
index 000000000000..4c60a50e66b2
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,466 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the interrupt descriptor management code
6 *
7 * Detailed information is available in Documentation/DocBook/genericirq
8 *
9 */
10#include <linux/irq.h>
11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h>
16#include <linux/bitmap.h>
17
18#include "internals.h"
19
20/*
21 * lockdep: we want to handle all irq_desc locks as a single lock-class:
22 */
23static struct lock_class_key irq_desc_lock_class;
24
25#if defined(CONFIG_SMP)
26static void __init init_irq_default_affinity(void)
27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
29 cpumask_setall(irq_default_affinity);
30}
31#else
32static void __init init_irq_default_affinity(void)
33{
34}
35#endif
36
37#ifdef CONFIG_SMP
38static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
39{
40 if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
41 return -ENOMEM;
42
43#ifdef CONFIG_GENERIC_PENDING_IRQ
44 if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
45 free_cpumask_var(desc->irq_data.affinity);
46 return -ENOMEM;
47 }
48#endif
49 return 0;
50}
51
52static void desc_smp_init(struct irq_desc *desc, int node)
53{
54 desc->irq_data.node = node;
55 cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
56#ifdef CONFIG_GENERIC_PENDING_IRQ
57 cpumask_clear(desc->pending_mask);
58#endif
59}
60
61static inline int desc_node(struct irq_desc *desc)
62{
63 return desc->irq_data.node;
64}
65
66#else
67static inline int
68alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
69static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif
72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{
75 int cpu;
76
77 desc->irq_data.irq = irq;
78 desc->irq_data.chip = &no_irq_chip;
79 desc->irq_data.chip_data = NULL;
80 desc->irq_data.handler_data = NULL;
81 desc->irq_data.msi_desc = NULL;
82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
83 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
84 desc->handle_irq = handle_bad_irq;
85 desc->depth = 1;
86 desc->irq_count = 0;
87 desc->irqs_unhandled = 0;
88 desc->name = NULL;
89 for_each_possible_cpu(cpu)
90 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
91 desc_smp_init(desc, node);
92}
93
94int nr_irqs = NR_IRQS;
95EXPORT_SYMBOL_GPL(nr_irqs);
96
97static DEFINE_MUTEX(sparse_irq_lock);
98static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
99
100#ifdef CONFIG_SPARSE_IRQ
101
102static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
103
104static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
105{
106 radix_tree_insert(&irq_desc_tree, irq, desc);
107}
108
109struct irq_desc *irq_to_desc(unsigned int irq)
110{
111 return radix_tree_lookup(&irq_desc_tree, irq);
112}
113
114static void delete_irq_desc(unsigned int irq)
115{
116 radix_tree_delete(&irq_desc_tree, irq);
117}
118
119#ifdef CONFIG_SMP
120static void free_masks(struct irq_desc *desc)
121{
122#ifdef CONFIG_GENERIC_PENDING_IRQ
123 free_cpumask_var(desc->pending_mask);
124#endif
125 free_cpumask_var(desc->irq_data.affinity);
126}
127#else
128static inline void free_masks(struct irq_desc *desc) { }
129#endif
130
131static struct irq_desc *alloc_desc(int irq, int node)
132{
133 struct irq_desc *desc;
134 gfp_t gfp = GFP_KERNEL;
135
136 desc = kzalloc_node(sizeof(*desc), gfp, node);
137 if (!desc)
138 return NULL;
139 /* allocate based on nr_cpu_ids */
140 desc->kstat_irqs = alloc_percpu(unsigned int);
141 if (!desc->kstat_irqs)
142 goto err_desc;
143
144 if (alloc_masks(desc, gfp, node))
145 goto err_kstat;
146
147 raw_spin_lock_init(&desc->lock);
148 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
149
150 desc_set_defaults(irq, desc, node);
151
152 return desc;
153
154err_kstat:
155 free_percpu(desc->kstat_irqs);
156err_desc:
157 kfree(desc);
158 return NULL;
159}
160
161static void free_desc(unsigned int irq)
162{
163 struct irq_desc *desc = irq_to_desc(irq);
164
165 unregister_irq_proc(irq, desc);
166
167 mutex_lock(&sparse_irq_lock);
168 delete_irq_desc(irq);
169 mutex_unlock(&sparse_irq_lock);
170
171 free_masks(desc);
172 free_percpu(desc->kstat_irqs);
173 kfree(desc);
174}
175
176static int alloc_descs(unsigned int start, unsigned int cnt, int node)
177{
178 struct irq_desc *desc;
179 int i;
180
181 for (i = 0; i < cnt; i++) {
182 desc = alloc_desc(start + i, node);
183 if (!desc)
184 goto err;
185 mutex_lock(&sparse_irq_lock);
186 irq_insert_desc(start + i, desc);
187 mutex_unlock(&sparse_irq_lock);
188 }
189 return start;
190
191err:
192 for (i--; i >= 0; i--)
193 free_desc(start + i);
194
195 mutex_lock(&sparse_irq_lock);
196 bitmap_clear(allocated_irqs, start, cnt);
197 mutex_unlock(&sparse_irq_lock);
198 return -ENOMEM;
199}
200
201static int irq_expand_nr_irqs(unsigned int nr)
202{
203 if (nr > IRQ_BITMAP_BITS)
204 return -ENOMEM;
205 nr_irqs = nr;
206 return 0;
207}
208
209int __init early_irq_init(void)
210{
211 int i, initcnt, node = first_online_node;
212 struct irq_desc *desc;
213
214 init_irq_default_affinity();
215
216 /* Let arch update nr_irqs and return the nr of preallocated irqs */
217 initcnt = arch_probe_nr_irqs();
218 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
219
220 if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
221 nr_irqs = IRQ_BITMAP_BITS;
222
223 if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
224 initcnt = IRQ_BITMAP_BITS;
225
226 if (initcnt > nr_irqs)
227 nr_irqs = initcnt;
228
229 for (i = 0; i < initcnt; i++) {
230 desc = alloc_desc(i, node);
231 set_bit(i, allocated_irqs);
232 irq_insert_desc(i, desc);
233 }
234 return arch_early_irq_init();
235}
236
237#else /* !CONFIG_SPARSE_IRQ */
238
239struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
240 [0 ... NR_IRQS-1] = {
241 .handle_irq = handle_bad_irq,
242 .depth = 1,
243 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
244 }
245};
246
247int __init early_irq_init(void)
248{
249 int count, i, node = first_online_node;
250 struct irq_desc *desc;
251
252 init_irq_default_affinity();
253
254 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
255
256 desc = irq_desc;
257 count = ARRAY_SIZE(irq_desc);
258
259 for (i = 0; i < count; i++) {
260 desc[i].kstat_irqs = alloc_percpu(unsigned int);
261 alloc_masks(&desc[i], GFP_KERNEL, node);
262 raw_spin_lock_init(&desc[i].lock);
263 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
264 desc_set_defaults(i, &desc[i], node);
265 }
266 return arch_early_irq_init();
267}
268
269struct irq_desc *irq_to_desc(unsigned int irq)
270{
271 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
272}
273
274static void free_desc(unsigned int irq)
275{
276 dynamic_irq_cleanup(irq);
277}
278
279static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
280{
281 return start;
282}
283
284static int irq_expand_nr_irqs(unsigned int nr)
285{
286 return -ENOMEM;
287}
288
289#endif /* !CONFIG_SPARSE_IRQ */
290
291/**
292 * generic_handle_irq - Invoke the handler for a particular irq
293 * @irq: The irq number to handle
294 *
295 */
296int generic_handle_irq(unsigned int irq)
297{
298 struct irq_desc *desc = irq_to_desc(irq);
299
300 if (!desc)
301 return -EINVAL;
302 generic_handle_irq_desc(irq, desc);
303 return 0;
304}
305EXPORT_SYMBOL_GPL(generic_handle_irq);
306
307/* Dynamic interrupt handling */
308
309/**
310 * irq_free_descs - free irq descriptors
311 * @from: Start of descriptor range
312 * @cnt: Number of consecutive irqs to free
313 */
314void irq_free_descs(unsigned int from, unsigned int cnt)
315{
316 int i;
317
318 if (from >= nr_irqs || (from + cnt) > nr_irqs)
319 return;
320
321 for (i = 0; i < cnt; i++)
322 free_desc(from + i);
323
324 mutex_lock(&sparse_irq_lock);
325 bitmap_clear(allocated_irqs, from, cnt);
326 mutex_unlock(&sparse_irq_lock);
327}
328EXPORT_SYMBOL_GPL(irq_free_descs);
329
330/**
331 * irq_alloc_descs - allocate and initialize a range of irq descriptors
332 * @irq: Allocate for specific irq number if irq >= 0
333 * @from: Start the search from this irq number
334 * @cnt: Number of consecutive irqs to allocate.
335 * @node: Preferred node on which the irq descriptor should be allocated
336 *
337 * Returns the first irq number or error code
338 */
339int __ref
340irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
341{
342 int start, ret;
343
344 if (!cnt)
345 return -EINVAL;
346
347 if (irq >= 0) {
348 if (from > irq)
349 return -EINVAL;
350 from = irq;
351 }
352
353 mutex_lock(&sparse_irq_lock);
354
355 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
356 from, cnt, 0);
357 ret = -EEXIST;
358	if (irq >= 0 && start != irq)
359 goto err;
360
361 if (start + cnt > nr_irqs) {
362 ret = irq_expand_nr_irqs(start + cnt);
363 if (ret)
364 goto err;
365 }
366
367 bitmap_set(allocated_irqs, start, cnt);
368 mutex_unlock(&sparse_irq_lock);
369 return alloc_descs(start, cnt, node);
370
371err:
372 mutex_unlock(&sparse_irq_lock);
373 return ret;
374}
375EXPORT_SYMBOL_GPL(irq_alloc_descs);
376
377/**
378 * irq_reserve_irqs - mark irqs allocated
379 * @from: mark from irq number
380 * @cnt: number of irqs to mark
381 *
382 * Returns 0 on success or an appropriate error code
383 */
384int irq_reserve_irqs(unsigned int from, unsigned int cnt)
385{
386 unsigned int start;
387 int ret = 0;
388
389 if (!cnt || (from + cnt) > nr_irqs)
390 return -EINVAL;
391
392 mutex_lock(&sparse_irq_lock);
393 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
394 if (start == from)
395 bitmap_set(allocated_irqs, start, cnt);
396 else
397 ret = -EEXIST;
398 mutex_unlock(&sparse_irq_lock);
399 return ret;
400}
401
402/**
403 * irq_get_next_irq - get next allocated irq number
404 * @offset: where to start the search
405 *
406 * Returns next irq number after offset or nr_irqs if none is found.
407 */
408unsigned int irq_get_next_irq(unsigned int offset)
409{
410 return find_next_bit(allocated_irqs, nr_irqs, offset);
411}
412
413struct irq_desc *
414__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
415{
416 struct irq_desc *desc = irq_to_desc(irq);
417
418 if (desc) {
419 if (bus)
420 chip_bus_lock(desc);
421 raw_spin_lock_irqsave(&desc->lock, *flags);
422 }
423 return desc;
424}
425
426void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
427{
428 raw_spin_unlock_irqrestore(&desc->lock, flags);
429 if (bus)
430 chip_bus_sync_unlock(desc);
431}
432
433/**
434 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
435 * @irq: irq number to initialize
436 */
437void dynamic_irq_cleanup(unsigned int irq)
438{
439 struct irq_desc *desc = irq_to_desc(irq);
440 unsigned long flags;
441
442 raw_spin_lock_irqsave(&desc->lock, flags);
443 desc_set_defaults(irq, desc, desc_node(desc));
444 raw_spin_unlock_irqrestore(&desc->lock, flags);
445}
446
447unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
448{
449 struct irq_desc *desc = irq_to_desc(irq);
450
451 return desc && desc->kstat_irqs ?
452 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
453}
454
455unsigned int kstat_irqs(unsigned int irq)
456{
457 struct irq_desc *desc = irq_to_desc(irq);
458 int cpu;
459 int sum = 0;
460
461 if (!desc || !desc->kstat_irqs)
462 return 0;
463 for_each_possible_cpu(cpu)
464 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
465 return sum;
466}
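The new irqdesc.c above exposes irq_alloc_descs()/irq_free_descs() as the dynamic descriptor interface. A hedged usage sketch, not taken from this patch (the demo_* names and the DEMO_NR_IRQS count are invented), of how an interrupt controller driver might consume it:

#include <linux/irq.h>
#include <linux/topology.h>

#define DEMO_NR_IRQS	4	/* illustrative block size */

static int demo_setup_irq_range(void)
{
	/* irq = -1: no fixed number requested, search upwards from 16 */
	int irq_base = irq_alloc_descs(-1, 16, DEMO_NR_IRQS, numa_node_id());

	if (irq_base < 0)
		return irq_base;	/* -EEXIST or -ENOMEM from the code above */

	/* per-irq chip/handler setup would follow here */
	return irq_base;
}

static void demo_teardown_irq_range(unsigned int irq_base)
{
	irq_free_descs(irq_base, DEMO_NR_IRQS);
}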
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c3003e9d91a3..0a7840aeb0fb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
17 17
18#include "internals.h" 18#include "internals.h"
19 19
20#ifdef CONFIG_IRQ_FORCED_THREADING
21__read_mostly bool force_irqthreads;
22
23static int __init setup_forced_irqthreads(char *arg)
24{
25 force_irqthreads = true;
26 return 0;
27}
28early_param("threadirqs", setup_forced_irqthreads);
29#endif
30
20/** 31/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 32 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 33 * @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
30void synchronize_irq(unsigned int irq) 41void synchronize_irq(unsigned int irq)
31{ 42{
32 struct irq_desc *desc = irq_to_desc(irq); 43 struct irq_desc *desc = irq_to_desc(irq);
33 unsigned int status; 44 bool inprogress;
34 45
35 if (!desc) 46 if (!desc)
36 return; 47 return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
42 * Wait until we're out of the critical section. This might 53 * Wait until we're out of the critical section. This might
43 * give the wrong answer due to the lack of memory barriers. 54 * give the wrong answer due to the lack of memory barriers.
44 */ 55 */
45 while (desc->status & IRQ_INPROGRESS) 56 while (irqd_irq_inprogress(&desc->irq_data))
46 cpu_relax(); 57 cpu_relax();
47 58
48 /* Ok, that indicated we're done: double-check carefully. */ 59 /* Ok, that indicated we're done: double-check carefully. */
49 raw_spin_lock_irqsave(&desc->lock, flags); 60 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 61 inprogress = irqd_irq_inprogress(&desc->irq_data);
51 raw_spin_unlock_irqrestore(&desc->lock, flags); 62 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 63
53 /* Oops, that failed? */ 64 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 65 } while (inprogress);
55 66
56 /* 67 /*
57 * We made sure that no hardirq handler is running. Now verify 68 * We made sure that no hardirq handler is running. Now verify
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 84{
74 struct irq_desc *desc = irq_to_desc(irq); 85 struct irq_desc *desc = irq_to_desc(irq);
75 86
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || 87 if (!desc || !irqd_can_balance(&desc->irq_data) ||
77 !desc->chip->set_affinity) 88 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
78 return 0; 89 return 0;
79 90
80 return 1; 91 return 1;
@@ -100,66 +111,180 @@ void irq_set_thread_affinity(struct irq_desc *desc)
100 } 111 }
101} 112}
102 113
114#ifdef CONFIG_GENERIC_PENDING_IRQ
115static inline bool irq_can_move_pcntxt(struct irq_data *data)
116{
117 return irqd_can_move_in_process_context(data);
118}
119static inline bool irq_move_pending(struct irq_data *data)
120{
121 return irqd_is_setaffinity_pending(data);
122}
123static inline void
124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
125{
126 cpumask_copy(desc->pending_mask, mask);
127}
128static inline void
129irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
130{
131 cpumask_copy(mask, desc->pending_mask);
132}
133#else
134static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
135static inline bool irq_move_pending(struct irq_data *data) { return false; }
136static inline void
137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
138static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif
141
142int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
143{
144 struct irq_chip *chip = irq_data_get_irq_chip(data);
145 struct irq_desc *desc = irq_data_to_desc(data);
146 int ret = 0;
147
148 if (!chip || !chip->irq_set_affinity)
149 return -EINVAL;
150
151 if (irq_can_move_pcntxt(data)) {
152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160 } else {
161 irqd_set_move_pending(data);
162 irq_copy_pending(desc, mask);
163 }
164
165 if (desc->affinity_notify) {
166 kref_get(&desc->affinity_notify->kref);
167 schedule_work(&desc->affinity_notify->work);
168 }
169 irqd_set(data, IRQD_AFFINITY_SET);
170
171 return ret;
172}
173
103/** 174/**
104 * irq_set_affinity - Set the irq affinity of a given irq 175 * irq_set_affinity - Set the irq affinity of a given irq
105 * @irq: Interrupt to set affinity 176 * @irq: Interrupt to set affinity
106 * @cpumask: cpumask 177 * @mask: cpumask
107 * 178 *
108 */ 179 */
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 180int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
110{ 181{
111 struct irq_desc *desc = irq_to_desc(irq); 182 struct irq_desc *desc = irq_to_desc(irq);
112 unsigned long flags; 183 unsigned long flags;
184 int ret;
113 185
114 if (!desc->chip->set_affinity) 186 if (!desc)
115 return -EINVAL; 187 return -EINVAL;
116 188
117 raw_spin_lock_irqsave(&desc->lock, flags); 189 raw_spin_lock_irqsave(&desc->lock, flags);
118 190 ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
119#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) {
121 if (!desc->chip->set_affinity(irq, cpumask)) {
122 cpumask_copy(desc->affinity, cpumask);
123 irq_set_thread_affinity(desc);
124 }
125 }
126 else {
127 desc->status |= IRQ_MOVE_PENDING;
128 cpumask_copy(desc->pending_mask, cpumask);
129 }
130#else
131 if (!desc->chip->set_affinity(irq, cpumask)) {
132 cpumask_copy(desc->affinity, cpumask);
133 irq_set_thread_affinity(desc);
134 }
135#endif
136 desc->status |= IRQ_AFFINITY_SET;
137 raw_spin_unlock_irqrestore(&desc->lock, flags); 191 raw_spin_unlock_irqrestore(&desc->lock, flags);
138 return 0; 192 return ret;
139} 193}
140 194
141int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
142{ 196{
197 unsigned long flags;
198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
199
200 if (!desc)
201 return -EINVAL;
202 desc->affinity_hint = m;
203 irq_put_desc_unlock(desc, flags);
204 return 0;
205}
206EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
207
208static void irq_affinity_notify(struct work_struct *work)
209{
210 struct irq_affinity_notify *notify =
211 container_of(work, struct irq_affinity_notify, work);
212 struct irq_desc *desc = irq_to_desc(notify->irq);
213 cpumask_var_t cpumask;
214 unsigned long flags;
215
216 if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
217 goto out;
218
219 raw_spin_lock_irqsave(&desc->lock, flags);
220 if (irq_move_pending(&desc->irq_data))
221 irq_get_pending(cpumask, desc);
222 else
223 cpumask_copy(cpumask, desc->irq_data.affinity);
224 raw_spin_unlock_irqrestore(&desc->lock, flags);
225
226 notify->notify(notify, cpumask);
227
228 free_cpumask_var(cpumask);
229out:
230 kref_put(&notify->kref, notify->release);
231}
232
233/**
234 * irq_set_affinity_notifier - control notification of IRQ affinity changes
235 * @irq: Interrupt for which to enable/disable notification
236 * @notify: Context for notification, or %NULL to disable
237 * notification. Function pointers must be initialised;
238 * the other fields will be initialised by this function.
239 *
240 * Must be called in process context. Notification may only be enabled
241 * after the IRQ is allocated and must be disabled before the IRQ is
242 * freed using free_irq().
243 */
244int
245irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
246{
143 struct irq_desc *desc = irq_to_desc(irq); 247 struct irq_desc *desc = irq_to_desc(irq);
248 struct irq_affinity_notify *old_notify;
144 unsigned long flags; 249 unsigned long flags;
145 250
251 /* The release function is promised process context */
252 might_sleep();
253
146 if (!desc) 254 if (!desc)
147 return -EINVAL; 255 return -EINVAL;
148 256
257 /* Complete initialisation of *notify */
258 if (notify) {
259 notify->irq = irq;
260 kref_init(&notify->kref);
261 INIT_WORK(&notify->work, irq_affinity_notify);
262 }
263
149 raw_spin_lock_irqsave(&desc->lock, flags); 264 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->affinity_hint = m; 265 old_notify = desc->affinity_notify;
266 desc->affinity_notify = notify;
151 raw_spin_unlock_irqrestore(&desc->lock, flags); 267 raw_spin_unlock_irqrestore(&desc->lock, flags);
152 268
269 if (old_notify)
270 kref_put(&old_notify->kref, old_notify->release);
271
153 return 0; 272 return 0;
154} 273}
155EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 274EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
156 275
157#ifndef CONFIG_AUTO_IRQ_AFFINITY 276#ifndef CONFIG_AUTO_IRQ_AFFINITY
158/* 277/*
159 * Generic version of the affinity autoselector. 278 * Generic version of the affinity autoselector.
160 */ 279 */
161static int setup_affinity(unsigned int irq, struct irq_desc *desc) 280static int
281setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
162{ 282{
283 struct irq_chip *chip = irq_desc_get_chip(desc);
284 struct cpumask *set = irq_default_affinity;
285 int ret;
286
287 /* Excludes PER_CPU and NO_BALANCE interrupts */
163 if (!irq_can_set_affinity(irq)) 288 if (!irq_can_set_affinity(irq))
164 return 0; 289 return 0;
165 290
@@ -167,22 +292,27 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
167 * Preserve an userspace affinity setup, but make sure that 292 * Preserve an userspace affinity setup, but make sure that
168 * one of the targets is online. 293 * one of the targets is online.
169 */ 294 */
170 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 295 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
171 if (cpumask_any_and(desc->affinity, cpu_online_mask) 296 if (cpumask_intersects(desc->irq_data.affinity,
172 < nr_cpu_ids) 297 cpu_online_mask))
173 goto set_affinity; 298 set = desc->irq_data.affinity;
174 else 299 else
175 desc->status &= ~IRQ_AFFINITY_SET; 300 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
176 } 301 }
177 302
178 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); 303 cpumask_and(mask, cpu_online_mask, set);
179set_affinity: 304 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
180 desc->chip->set_affinity(irq, desc->affinity); 305 switch (ret) {
181 306 case IRQ_SET_MASK_OK:
307 cpumask_copy(desc->irq_data.affinity, mask);
308 case IRQ_SET_MASK_OK_NOCOPY:
309 irq_set_thread_affinity(desc);
310 }
182 return 0; 311 return 0;
183} 312}
184#else 313#else
185static inline int setup_affinity(unsigned int irq, struct irq_desc *d) 314static inline int
315setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
186{ 316{
187 return irq_select_affinity(irq); 317 return irq_select_affinity(irq);
188} 318}
@@ -191,23 +321,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
191/* 321/*
192 * Called when affinity is set via /proc/irq 322 * Called when affinity is set via /proc/irq
193 */ 323 */
194int irq_select_affinity_usr(unsigned int irq) 324int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
195{ 325{
196 struct irq_desc *desc = irq_to_desc(irq); 326 struct irq_desc *desc = irq_to_desc(irq);
197 unsigned long flags; 327 unsigned long flags;
198 int ret; 328 int ret;
199 329
200 raw_spin_lock_irqsave(&desc->lock, flags); 330 raw_spin_lock_irqsave(&desc->lock, flags);
201 ret = setup_affinity(irq, desc); 331 ret = setup_affinity(irq, desc, mask);
202 if (!ret)
203 irq_set_thread_affinity(desc);
204 raw_spin_unlock_irqrestore(&desc->lock, flags); 332 raw_spin_unlock_irqrestore(&desc->lock, flags);
205
206 return ret; 333 return ret;
207} 334}
208 335
209#else 336#else
210static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) 337static inline int
338setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
211{ 339{
212 return 0; 340 return 0;
213} 341}
@@ -218,13 +346,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
218 if (suspend) { 346 if (suspend) {
219 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) 347 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
220 return; 348 return;
221 desc->status |= IRQ_SUSPENDED; 349 desc->istate |= IRQS_SUSPENDED;
222 } 350 }
223 351
224 if (!desc->depth++) { 352 if (!desc->depth++)
225 desc->status |= IRQ_DISABLED; 353 irq_disable(desc);
226 desc->chip->disable(irq); 354}
227 } 355
356static int __disable_irq_nosync(unsigned int irq)
357{
358 unsigned long flags;
359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
360
361 if (!desc)
362 return -EINVAL;
363 __disable_irq(desc, irq, false);
364 irq_put_desc_busunlock(desc, flags);
365 return 0;
228} 366}
229 367
230/** 368/**
@@ -240,17 +378,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
240 */ 378 */
241void disable_irq_nosync(unsigned int irq) 379void disable_irq_nosync(unsigned int irq)
242{ 380{
243 struct irq_desc *desc = irq_to_desc(irq); 381 __disable_irq_nosync(irq);
244 unsigned long flags;
245
246 if (!desc)
247 return;
248
249 chip_bus_lock(irq, desc);
250 raw_spin_lock_irqsave(&desc->lock, flags);
251 __disable_irq(desc, irq, false);
252 raw_spin_unlock_irqrestore(&desc->lock, flags);
253 chip_bus_sync_unlock(irq, desc);
254} 382}
255EXPORT_SYMBOL(disable_irq_nosync); 383EXPORT_SYMBOL(disable_irq_nosync);
256 384
@@ -268,21 +396,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
268 */ 396 */
269void disable_irq(unsigned int irq) 397void disable_irq(unsigned int irq)
270{ 398{
271 struct irq_desc *desc = irq_to_desc(irq); 399 if (!__disable_irq_nosync(irq))
272
273 if (!desc)
274 return;
275
276 disable_irq_nosync(irq);
277 if (desc->action)
278 synchronize_irq(irq); 400 synchronize_irq(irq);
279} 401}
280EXPORT_SYMBOL(disable_irq); 402EXPORT_SYMBOL(disable_irq);
281 403
282void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 404void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
283{ 405{
284 if (resume) 406 if (resume) {
285 desc->status &= ~IRQ_SUSPENDED; 407 if (!(desc->istate & IRQS_SUSPENDED)) {
408 if (!desc->action)
409 return;
410 if (!(desc->action->flags & IRQF_FORCE_RESUME))
411 return;
 412 /* Pretend that it got disabled! */
413 desc->depth++;
414 }
415 desc->istate &= ~IRQS_SUSPENDED;
416 }
286 417
287 switch (desc->depth) { 418 switch (desc->depth) {
288 case 0: 419 case 0:
@@ -290,12 +421,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
290 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 421 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
291 break; 422 break;
292 case 1: { 423 case 1: {
293 unsigned int status = desc->status & ~IRQ_DISABLED; 424 if (desc->istate & IRQS_SUSPENDED)
294
295 if (desc->status & IRQ_SUSPENDED)
296 goto err_out; 425 goto err_out;
297 /* Prevent probing on this irq: */ 426 /* Prevent probing on this irq: */
298 desc->status = status | IRQ_NOPROBE; 427 irq_settings_set_noprobe(desc);
428 irq_enable(desc);
299 check_irq_resend(desc, irq); 429 check_irq_resend(desc, irq);
300 /* fall-through */ 430 /* fall-through */
301 } 431 }
@@ -313,21 +443,22 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
313 * IRQ line is re-enabled. 443 * IRQ line is re-enabled.
314 * 444 *
315 * This function may be called from IRQ context only when 445 * This function may be called from IRQ context only when
316 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! 446 * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
317 */ 447 */
318void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
319{ 449{
320 struct irq_desc *desc = irq_to_desc(irq);
321 unsigned long flags; 450 unsigned long flags;
451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
322 452
323 if (!desc) 453 if (!desc)
324 return; 454 return;
455 if (WARN(!desc->irq_data.chip,
456 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
457 goto out;
325 458
326 chip_bus_lock(irq, desc);
327 raw_spin_lock_irqsave(&desc->lock, flags);
328 __enable_irq(desc, irq, false); 459 __enable_irq(desc, irq, false);
329 raw_spin_unlock_irqrestore(&desc->lock, flags); 460out:
330 chip_bus_sync_unlock(irq, desc); 461 irq_put_desc_busunlock(desc, flags);
331} 462}
332EXPORT_SYMBOL(enable_irq); 463EXPORT_SYMBOL(enable_irq);
333 464
@@ -336,14 +467,14 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
336 struct irq_desc *desc = irq_to_desc(irq); 467 struct irq_desc *desc = irq_to_desc(irq);
337 int ret = -ENXIO; 468 int ret = -ENXIO;
338 469
339 if (desc->chip->set_wake) 470 if (desc->irq_data.chip->irq_set_wake)
340 ret = desc->chip->set_wake(irq, on); 471 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
341 472
342 return ret; 473 return ret;
343} 474}
344 475
345/** 476/**
346 * set_irq_wake - control irq power management wakeup 477 * irq_set_irq_wake - control irq power management wakeup
347 * @irq: interrupt to control 478 * @irq: interrupt to control
348 * @on: enable/disable power management wakeup 479 * @on: enable/disable power management wakeup
349 * 480 *
@@ -354,23 +485,25 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
354 * Wakeup mode lets this IRQ wake the system from sleep 485 * Wakeup mode lets this IRQ wake the system from sleep
355 * states like "suspend to RAM". 486 * states like "suspend to RAM".
356 */ 487 */
357int set_irq_wake(unsigned int irq, unsigned int on) 488int irq_set_irq_wake(unsigned int irq, unsigned int on)
358{ 489{
359 struct irq_desc *desc = irq_to_desc(irq);
360 unsigned long flags; 490 unsigned long flags;
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
361 int ret = 0; 492 int ret = 0;
362 493
494 if (!desc)
495 return -EINVAL;
496
363 /* wakeup-capable irqs can be shared between drivers that 497 /* wakeup-capable irqs can be shared between drivers that
364 * don't need to have the same sleep mode behaviors. 498 * don't need to have the same sleep mode behaviors.
365 */ 499 */
366 raw_spin_lock_irqsave(&desc->lock, flags);
367 if (on) { 500 if (on) {
368 if (desc->wake_depth++ == 0) { 501 if (desc->wake_depth++ == 0) {
369 ret = set_irq_wake_real(irq, on); 502 ret = set_irq_wake_real(irq, on);
370 if (ret) 503 if (ret)
371 desc->wake_depth = 0; 504 desc->wake_depth = 0;
372 else 505 else
373 desc->status |= IRQ_WAKEUP; 506 irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
374 } 507 }
375 } else { 508 } else {
376 if (desc->wake_depth == 0) { 509 if (desc->wake_depth == 0) {
@@ -380,14 +513,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
380 if (ret) 513 if (ret)
381 desc->wake_depth = 1; 514 desc->wake_depth = 1;
382 else 515 else
383 desc->status &= ~IRQ_WAKEUP; 516 irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
384 } 517 }
385 } 518 }
386 519 irq_put_desc_busunlock(desc, flags);
387 raw_spin_unlock_irqrestore(&desc->lock, flags);
388 return ret; 520 return ret;
389} 521}
390EXPORT_SYMBOL(set_irq_wake); 522EXPORT_SYMBOL(irq_set_irq_wake);
391 523
392/* 524/*
393 * Internal function that tells the architecture code whether a 525 * Internal function that tells the architecture code whether a
@@ -396,45 +528,29 @@ EXPORT_SYMBOL(set_irq_wake);
396 */ 528 */
397int can_request_irq(unsigned int irq, unsigned long irqflags) 529int can_request_irq(unsigned int irq, unsigned long irqflags)
398{ 530{
399 struct irq_desc *desc = irq_to_desc(irq);
400 struct irqaction *action;
401 unsigned long flags; 531 unsigned long flags;
532 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
533 int canrequest = 0;
402 534
403 if (!desc) 535 if (!desc)
404 return 0; 536 return 0;
405 537
406 if (desc->status & IRQ_NOREQUEST) 538 if (irq_settings_can_request(desc)) {
407 return 0; 539 if (desc->action)
408 540 if (irqflags & desc->action->flags & IRQF_SHARED)
409 raw_spin_lock_irqsave(&desc->lock, flags); 541 canrequest = 1;
410 action = desc->action; 542 }
411 if (action) 543 irq_put_desc_unlock(desc, flags);
412 if (irqflags & action->flags & IRQF_SHARED) 544 return canrequest;
413 action = NULL;
414
415 raw_spin_unlock_irqrestore(&desc->lock, flags);
416
417 return !action;
418}
419
420void compat_irq_chip_set_default_handler(struct irq_desc *desc)
421{
422 /*
423 * If the architecture still has not overriden
424 * the flow handler then zap the default. This
425 * should catch incorrect flow-type setting.
426 */
427 if (desc->handle_irq == &handle_bad_irq)
428 desc->handle_irq = NULL;
429} 545}
430 546
431int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 547int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
432 unsigned long flags) 548 unsigned long flags)
433{ 549{
434 int ret; 550 struct irq_chip *chip = desc->irq_data.chip;
435 struct irq_chip *chip = desc->chip; 551 int ret, unmask = 0;
436 552
437 if (!chip || !chip->set_type) { 553 if (!chip || !chip->irq_set_type) {
438 /* 554 /*
439 * IRQF_TRIGGER_* but the PIC does not support multiple 555 * IRQF_TRIGGER_* but the PIC does not support multiple
440 * flow-types? 556 * flow-types?
@@ -444,23 +560,41 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
444 return 0; 560 return 0;
445 } 561 }
446 562
447 /* caller masked out all except trigger mode flags */ 563 flags &= IRQ_TYPE_SENSE_MASK;
448 ret = chip->set_type(irq, flags); 564
449 565 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
450 if (ret) 566 if (!irqd_irq_masked(&desc->irq_data))
451 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 567 mask_irq(desc);
452 (int)flags, irq, chip->set_type); 568 if (!irqd_irq_disabled(&desc->irq_data))
453 else { 569 unmask = 1;
454 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
455 flags |= IRQ_LEVEL;
456 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
458 desc->status |= flags;
459
460 if (chip != desc->chip)
461 irq_chip_set_defaults(desc->chip);
462 } 570 }
463 571
572 /* caller masked out all except trigger mode flags */
573 ret = chip->irq_set_type(&desc->irq_data, flags);
574
575 switch (ret) {
576 case IRQ_SET_MASK_OK:
577 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
578 irqd_set(&desc->irq_data, flags);
579
580 case IRQ_SET_MASK_OK_NOCOPY:
581 flags = irqd_get_trigger_type(&desc->irq_data);
582 irq_settings_set_trigger_mask(desc, flags);
583 irqd_clear(&desc->irq_data, IRQD_LEVEL);
584 irq_settings_clr_level(desc);
585 if (flags & IRQ_TYPE_LEVEL_MASK) {
586 irq_settings_set_level(desc);
587 irqd_set(&desc->irq_data, IRQD_LEVEL);
588 }
589
590 ret = 0;
591 break;
592 default:
593 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
594 flags, irq, chip->irq_set_type);
595 }
596 if (unmask)
597 unmask_irq(desc);
464 return ret; 598 return ret;
465} 599}
466 600
@@ -504,10 +638,13 @@ static int irq_wait_for_interrupt(struct irqaction *action)
504 * handler finished. unmask if the interrupt has not been disabled and 638 * handler finished. unmask if the interrupt has not been disabled and
505 * is marked MASKED. 639 * is marked MASKED.
506 */ 640 */
507static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 641static void irq_finalize_oneshot(struct irq_desc *desc,
642 struct irqaction *action, bool force)
508{ 643{
644 if (!(desc->istate & IRQS_ONESHOT))
645 return;
509again: 646again:
510 chip_bus_lock(irq, desc); 647 chip_bus_lock(desc);
511 raw_spin_lock_irq(&desc->lock); 648 raw_spin_lock_irq(&desc->lock);
512 649
513 /* 650 /*
@@ -517,26 +654,42 @@ again:
517 * The thread is faster done than the hard interrupt handler 654 * The thread is faster done than the hard interrupt handler
518 * on the other CPU. If we unmask the irq line then the 655 * on the other CPU. If we unmask the irq line then the
519 * interrupt can come in again and masks the line, leaves due 656 * interrupt can come in again and masks the line, leaves due
520 * to IRQ_INPROGRESS and the irq line is masked forever. 657 * to IRQS_INPROGRESS and the irq line is masked forever.
658 *
659 * This also serializes the state of shared oneshot handlers
 660 * versus "desc->threads_oneshot |= action->thread_mask;" in
661 * irq_wake_thread(). See the comment there which explains the
662 * serialization.
521 */ 663 */
522 if (unlikely(desc->status & IRQ_INPROGRESS)) { 664 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
523 raw_spin_unlock_irq(&desc->lock); 665 raw_spin_unlock_irq(&desc->lock);
524 chip_bus_sync_unlock(irq, desc); 666 chip_bus_sync_unlock(desc);
525 cpu_relax(); 667 cpu_relax();
526 goto again; 668 goto again;
527 } 669 }
528 670
529 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 671 /*
530 desc->status &= ~IRQ_MASKED; 672 * Now check again, whether the thread should run. Otherwise
531 desc->chip->unmask(irq); 673 * we would clear the threads_oneshot bit of this thread which
532 } 674 * was just set.
675 */
676 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
677 goto out_unlock;
678
679 desc->threads_oneshot &= ~action->thread_mask;
680
681 if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
682 irqd_irq_masked(&desc->irq_data))
683 unmask_irq(desc);
684
685out_unlock:
533 raw_spin_unlock_irq(&desc->lock); 686 raw_spin_unlock_irq(&desc->lock);
534 chip_bus_sync_unlock(irq, desc); 687 chip_bus_sync_unlock(desc);
535} 688}
536 689
537#ifdef CONFIG_SMP 690#ifdef CONFIG_SMP
538/* 691/*
539 * Check whether we need to change the affinity of the interrupt thread. 692 * Check whether we need to chasnge the affinity of the interrupt thread.
540 */ 693 */
541static void 694static void
542irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 695irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -556,7 +709,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
556 } 709 }
557 710
558 raw_spin_lock_irq(&desc->lock); 711 raw_spin_lock_irq(&desc->lock);
559 cpumask_copy(mask, desc->affinity); 712 cpumask_copy(mask, desc->irq_data.affinity);
560 raw_spin_unlock_irq(&desc->lock); 713 raw_spin_unlock_irq(&desc->lock);
561 714
562 set_cpus_allowed_ptr(current, mask); 715 set_cpus_allowed_ptr(current, mask);
@@ -568,14 +721,57 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
568#endif 721#endif
569 722
570/* 723/*
 724 * Interrupts which are not explicitly requested as threaded
725 * interrupts rely on the implicit bh/preempt disable of the hard irq
726 * context. So we need to disable bh here to avoid deadlocks and other
727 * side effects.
728 */
729static irqreturn_t
730irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
731{
732 irqreturn_t ret;
733
734 local_bh_disable();
735 ret = action->thread_fn(action->irq, action->dev_id);
736 irq_finalize_oneshot(desc, action, false);
737 local_bh_enable();
738 return ret;
739}
740
741/*
 742 * Interrupts explicitly requested as threaded interrupts want to be
 743 * preemptible - many of them need to sleep and wait for slow buses to
744 * complete.
745 */
746static irqreturn_t irq_thread_fn(struct irq_desc *desc,
747 struct irqaction *action)
748{
749 irqreturn_t ret;
750
751 ret = action->thread_fn(action->irq, action->dev_id);
752 irq_finalize_oneshot(desc, action, false);
753 return ret;
754}
755
756/*
571 * Interrupt handler thread 757 * Interrupt handler thread
572 */ 758 */
573static int irq_thread(void *data) 759static int irq_thread(void *data)
574{ 760{
575 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 761 static const struct sched_param param = {
762 .sched_priority = MAX_USER_RT_PRIO/2,
763 };
576 struct irqaction *action = data; 764 struct irqaction *action = data;
577 struct irq_desc *desc = irq_to_desc(action->irq); 765 struct irq_desc *desc = irq_to_desc(action->irq);
578 int wake, oneshot = desc->status & IRQ_ONESHOT; 766 irqreturn_t (*handler_fn)(struct irq_desc *desc,
767 struct irqaction *action);
768 int wake;
769
 770 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
771 &action->thread_flags))
772 handler_fn = irq_forced_thread_fn;
773 else
774 handler_fn = irq_thread_fn;
579 775
580 sched_setscheduler(current, SCHED_FIFO, &param); 776 sched_setscheduler(current, SCHED_FIFO, &param);
581 current->irqaction = action; 777 current->irqaction = action;
@@ -587,23 +783,23 @@ static int irq_thread(void *data)
587 atomic_inc(&desc->threads_active); 783 atomic_inc(&desc->threads_active);
588 784
589 raw_spin_lock_irq(&desc->lock); 785 raw_spin_lock_irq(&desc->lock);
590 if (unlikely(desc->status & IRQ_DISABLED)) { 786 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
591 /* 787 /*
592 * CHECKME: We might need a dedicated 788 * CHECKME: We might need a dedicated
593 * IRQ_THREAD_PENDING flag here, which 789 * IRQ_THREAD_PENDING flag here, which
594 * retriggers the thread in check_irq_resend() 790 * retriggers the thread in check_irq_resend()
595 * but AFAICT IRQ_PENDING should be fine as it 791 * but AFAICT IRQS_PENDING should be fine as it
596 * retriggers the interrupt itself --- tglx 792 * retriggers the interrupt itself --- tglx
597 */ 793 */
598 desc->status |= IRQ_PENDING; 794 desc->istate |= IRQS_PENDING;
599 raw_spin_unlock_irq(&desc->lock); 795 raw_spin_unlock_irq(&desc->lock);
600 } else { 796 } else {
601 raw_spin_unlock_irq(&desc->lock); 797 irqreturn_t action_ret;
602
603 action->thread_fn(action->irq, action->dev_id);
604 798
605 if (oneshot) 799 raw_spin_unlock_irq(&desc->lock);
606 irq_finalize_oneshot(action->irq, desc); 800 action_ret = handler_fn(desc, action);
801 if (!noirqdebug)
802 note_interrupt(action->irq, desc, action_ret);
607 } 803 }
608 804
609 wake = atomic_dec_and_test(&desc->threads_active); 805 wake = atomic_dec_and_test(&desc->threads_active);
@@ -612,6 +808,9 @@ static int irq_thread(void *data)
612 wake_up(&desc->wait_for_threads); 808 wake_up(&desc->wait_for_threads);
613 } 809 }
614 810
811 /* Prevent a stale desc->threads_oneshot */
812 irq_finalize_oneshot(desc, action, true);
813
615 /* 814 /*
616 * Clear irqaction. Otherwise exit_irq_thread() would make 815 * Clear irqaction. Otherwise exit_irq_thread() would make
617 * fuzz about an active irq thread going into nirvana. 816 * fuzz about an active irq thread going into nirvana.
@@ -626,6 +825,7 @@ static int irq_thread(void *data)
626void exit_irq_thread(void) 825void exit_irq_thread(void)
627{ 826{
628 struct task_struct *tsk = current; 827 struct task_struct *tsk = current;
828 struct irq_desc *desc;
629 829
630 if (!tsk->irqaction) 830 if (!tsk->irqaction)
631 return; 831 return;
@@ -634,6 +834,14 @@ void exit_irq_thread(void)
634 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 834 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
635 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 835 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
636 836
837 desc = irq_to_desc(tsk->irqaction->irq);
838
839 /*
840 * Prevent a stale desc->threads_oneshot. Must be called
841 * before setting the IRQTF_DIED flag.
842 */
843 irq_finalize_oneshot(desc, tsk->irqaction, true);
844
637 /* 845 /*
638 * Set the THREAD DIED flag to prevent further wakeups of the 846 * Set the THREAD DIED flag to prevent further wakeups of the
639 * soon to be gone threaded handler. 847 * soon to be gone threaded handler.
@@ -641,6 +849,22 @@ void exit_irq_thread(void)
641 set_bit(IRQTF_DIED, &tsk->irqaction->flags); 849 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
642} 850}
643 851
852static void irq_setup_forced_threading(struct irqaction *new)
853{
854 if (!force_irqthreads)
855 return;
856 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
857 return;
858
859 new->flags |= IRQF_ONESHOT;
860
861 if (!new->thread_fn) {
862 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
863 new->thread_fn = new->handler;
864 new->handler = irq_default_primary_handler;
865 }
866}
867
644/* 868/*
645 * Internal function to register an irqaction - typically used to 869 * Internal function to register an irqaction - typically used to
646 * allocate special interrupts that are part of the architecture. 870 * allocate special interrupts that are part of the architecture.
@@ -650,14 +874,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
650{ 874{
651 struct irqaction *old, **old_ptr; 875 struct irqaction *old, **old_ptr;
652 const char *old_name = NULL; 876 const char *old_name = NULL;
653 unsigned long flags; 877 unsigned long flags, thread_mask = 0;
654 int nested, shared = 0; 878 int ret, nested, shared = 0;
655 int ret; 879 cpumask_var_t mask;
656 880
657 if (!desc) 881 if (!desc)
658 return -EINVAL; 882 return -EINVAL;
659 883
660 if (desc->chip == &no_irq_chip) 884 if (desc->irq_data.chip == &no_irq_chip)
661 return -ENOSYS; 885 return -ENOSYS;
662 /* 886 /*
663 * Some drivers like serial.c use request_irq() heavily, 887 * Some drivers like serial.c use request_irq() heavily,
@@ -676,15 +900,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
676 rand_initialize_irq(irq); 900 rand_initialize_irq(irq);
677 } 901 }
678 902
679 /* Oneshot interrupts are not allowed with shared */
680 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
681 return -EINVAL;
682
683 /* 903 /*
684 * Check whether the interrupt nests into another interrupt 904 * Check whether the interrupt nests into another interrupt
685 * thread. 905 * thread.
686 */ 906 */
687 nested = desc->status & IRQ_NESTED_THREAD; 907 nested = irq_settings_is_nested_thread(desc);
688 if (nested) { 908 if (nested) {
689 if (!new->thread_fn) 909 if (!new->thread_fn)
690 return -EINVAL; 910 return -EINVAL;
@@ -694,6 +914,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
694 * dummy function which warns when called. 914 * dummy function which warns when called.
695 */ 915 */
696 new->handler = irq_nested_primary_handler; 916 new->handler = irq_nested_primary_handler;
917 } else {
918 if (irq_settings_can_thread(desc))
919 irq_setup_forced_threading(new);
697 } 920 }
698 921
699 /* 922 /*
@@ -717,6 +940,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
717 new->thread = t; 940 new->thread = t;
718 } 941 }
719 942
943 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
944 ret = -ENOMEM;
945 goto out_thread;
946 }
947
720 /* 948 /*
721 * The following block of code has to be executed atomically 949 * The following block of code has to be executed atomically
722 */ 950 */
@@ -728,32 +956,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
728 * Can't share interrupts unless both agree to and are 956 * Can't share interrupts unless both agree to and are
729 * the same type (level, edge, polarity). So both flag 957 * the same type (level, edge, polarity). So both flag
730 * fields must have IRQF_SHARED set and the bits which 958 * fields must have IRQF_SHARED set and the bits which
731 * set the trigger type must match. 959 * set the trigger type must match. Also all must
960 * agree on ONESHOT.
732 */ 961 */
733 if (!((old->flags & new->flags) & IRQF_SHARED) || 962 if (!((old->flags & new->flags) & IRQF_SHARED) ||
734 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { 963 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
964 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
735 old_name = old->name; 965 old_name = old->name;
736 goto mismatch; 966 goto mismatch;
737 } 967 }
738 968
739#if defined(CONFIG_IRQ_PER_CPU)
740 /* All handlers must agree on per-cpuness */ 969 /* All handlers must agree on per-cpuness */
741 if ((old->flags & IRQF_PERCPU) != 970 if ((old->flags & IRQF_PERCPU) !=
742 (new->flags & IRQF_PERCPU)) 971 (new->flags & IRQF_PERCPU))
743 goto mismatch; 972 goto mismatch;
744#endif
745 973
746 /* add new interrupt at end of irq queue */ 974 /* add new interrupt at end of irq queue */
747 do { 975 do {
976 thread_mask |= old->thread_mask;
748 old_ptr = &old->next; 977 old_ptr = &old->next;
749 old = *old_ptr; 978 old = *old_ptr;
750 } while (old); 979 } while (old);
751 shared = 1; 980 shared = 1;
752 } 981 }
753 982
754 if (!shared) { 983 /*
755 irq_chip_set_defaults(desc->chip); 984 * Setup the thread mask for this irqaction. Unlikely to have
 985 * 32 or 64 irqs sharing one line, but who knows.
986 */
987 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
988 ret = -EBUSY;
989 goto out_mask;
990 }
991 new->thread_mask = 1 << ffz(thread_mask);
756 992
993 if (!shared) {
757 init_waitqueue_head(&desc->wait_for_threads); 994 init_waitqueue_head(&desc->wait_for_threads);
758 995
759 /* Setup the type (level, edge polarity) if configured: */ 996 /* Setup the type (level, edge polarity) if configured: */
@@ -762,42 +999,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
762 new->flags & IRQF_TRIGGER_MASK); 999 new->flags & IRQF_TRIGGER_MASK);
763 1000
764 if (ret) 1001 if (ret)
765 goto out_thread; 1002 goto out_mask;
766 } else 1003 }
767 compat_irq_chip_set_default_handler(desc); 1004
768#if defined(CONFIG_IRQ_PER_CPU) 1005 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
769 if (new->flags & IRQF_PERCPU) 1006 IRQS_ONESHOT | IRQS_WAITING);
770 desc->status |= IRQ_PER_CPU; 1007 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
771#endif
772 1008
773 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | 1009 if (new->flags & IRQF_PERCPU) {
774 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 1010 irqd_set(&desc->irq_data, IRQD_PER_CPU);
1011 irq_settings_set_per_cpu(desc);
1012 }
775 1013
776 if (new->flags & IRQF_ONESHOT) 1014 if (new->flags & IRQF_ONESHOT)
777 desc->status |= IRQ_ONESHOT; 1015 desc->istate |= IRQS_ONESHOT;
778 1016
779 if (!(desc->status & IRQ_NOAUTOEN)) { 1017 if (irq_settings_can_autoenable(desc))
780 desc->depth = 0; 1018 irq_startup(desc);
781 desc->status &= ~IRQ_DISABLED; 1019 else
782 desc->chip->startup(irq);
783 } else
784 /* Undo nested disables: */ 1020 /* Undo nested disables: */
785 desc->depth = 1; 1021 desc->depth = 1;
786 1022
787 /* Exclude IRQ from balancing if requested */ 1023 /* Exclude IRQ from balancing if requested */
788 if (new->flags & IRQF_NOBALANCING) 1024 if (new->flags & IRQF_NOBALANCING) {
789 desc->status |= IRQ_NO_BALANCING; 1025 irq_settings_set_no_balancing(desc);
1026 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1027 }
790 1028
791 /* Set default affinity mask once everything is setup */ 1029 /* Set default affinity mask once everything is setup */
792 setup_affinity(irq, desc); 1030 setup_affinity(irq, desc, mask);
793 1031
794 } else if ((new->flags & IRQF_TRIGGER_MASK) 1032 } else if (new->flags & IRQF_TRIGGER_MASK) {
795 && (new->flags & IRQF_TRIGGER_MASK) 1033 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
796 != (desc->status & IRQ_TYPE_SENSE_MASK)) { 1034 unsigned int omsk = irq_settings_get_trigger_mask(desc);
797 /* hope the handler works with the actual trigger mode... */ 1035
798 pr_warning("IRQ %d uses trigger mode %d; requested %d\n", 1036 if (nmsk != omsk)
799 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), 1037 /* hope the handler works with current trigger mode */
800 (int)(new->flags & IRQF_TRIGGER_MASK)); 1038 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1039 irq, nmsk, omsk);
801 } 1040 }
802 1041
803 new->irq = irq; 1042 new->irq = irq;
@@ -811,8 +1050,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
811 * Check whether we disabled the irq via the spurious handler 1050 * Check whether we disabled the irq via the spurious handler
812 * before. Reenable it and give it another chance. 1051 * before. Reenable it and give it another chance.
813 */ 1052 */
814 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 1053 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
815 desc->status &= ~IRQ_SPURIOUS_DISABLED; 1054 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
816 __enable_irq(desc, irq, false); 1055 __enable_irq(desc, irq, false);
817 } 1056 }
818 1057
@@ -828,6 +1067,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
828 register_irq_proc(irq, desc); 1067 register_irq_proc(irq, desc);
829 new->dir = NULL; 1068 new->dir = NULL;
830 register_handler_proc(irq, new); 1069 register_handler_proc(irq, new);
1070 free_cpumask_var(mask);
831 1071
832 return 0; 1072 return 0;
833 1073
@@ -842,8 +1082,11 @@ mismatch:
842#endif 1082#endif
843 ret = -EBUSY; 1083 ret = -EBUSY;
844 1084
845out_thread: 1085out_mask:
846 raw_spin_unlock_irqrestore(&desc->lock, flags); 1086 raw_spin_unlock_irqrestore(&desc->lock, flags);
1087 free_cpumask_var(mask);
1088
1089out_thread:
847 if (new->thread) { 1090 if (new->thread) {
848 struct task_struct *t = new->thread; 1091 struct task_struct *t = new->thread;
849 1092
@@ -864,9 +1107,14 @@ out_thread:
864 */ 1107 */
865int setup_irq(unsigned int irq, struct irqaction *act) 1108int setup_irq(unsigned int irq, struct irqaction *act)
866{ 1109{
1110 int retval;
867 struct irq_desc *desc = irq_to_desc(irq); 1111 struct irq_desc *desc = irq_to_desc(irq);
868 1112
869 return __setup_irq(irq, desc, act); 1113 chip_bus_lock(desc);
1114 retval = __setup_irq(irq, desc, act);
1115 chip_bus_sync_unlock(desc);
1116
1117 return retval;
870} 1118}
871EXPORT_SYMBOL_GPL(setup_irq); 1119EXPORT_SYMBOL_GPL(setup_irq);
872 1120
@@ -912,18 +1160,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
912 1160
913 /* Currently used only by UML, might disappear one day: */ 1161 /* Currently used only by UML, might disappear one day: */
914#ifdef CONFIG_IRQ_RELEASE_METHOD 1162#ifdef CONFIG_IRQ_RELEASE_METHOD
915 if (desc->chip->release) 1163 if (desc->irq_data.chip->release)
916 desc->chip->release(irq, dev_id); 1164 desc->irq_data.chip->release(irq, dev_id);
917#endif 1165#endif
918 1166
919 /* If this was the last handler, shut down the IRQ line: */ 1167 /* If this was the last handler, shut down the IRQ line: */
920 if (!desc->action) { 1168 if (!desc->action)
921 desc->status |= IRQ_DISABLED; 1169 irq_shutdown(desc);
922 if (desc->chip->shutdown)
923 desc->chip->shutdown(irq);
924 else
925 desc->chip->disable(irq);
926 }
927 1170
928#ifdef CONFIG_SMP 1171#ifdef CONFIG_SMP
929 /* make sure affinity_hint is cleaned up */ 1172 /* make sure affinity_hint is cleaned up */
@@ -997,9 +1240,14 @@ void free_irq(unsigned int irq, void *dev_id)
997 if (!desc) 1240 if (!desc)
998 return; 1241 return;
999 1242
1000 chip_bus_lock(irq, desc); 1243#ifdef CONFIG_SMP
1244 if (WARN_ON(desc->affinity_notify))
1245 desc->affinity_notify = NULL;
1246#endif
1247
1248 chip_bus_lock(desc);
1001 kfree(__free_irq(irq, dev_id)); 1249 kfree(__free_irq(irq, dev_id));
1002 chip_bus_sync_unlock(irq, desc); 1250 chip_bus_sync_unlock(desc);
1003} 1251}
1004EXPORT_SYMBOL(free_irq); 1252EXPORT_SYMBOL(free_irq);
1005 1253
@@ -1067,7 +1315,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1067 if (!desc) 1315 if (!desc)
1068 return -EINVAL; 1316 return -EINVAL;
1069 1317
1070 if (desc->status & IRQ_NOREQUEST) 1318 if (!irq_settings_can_request(desc))
1071 return -EINVAL; 1319 return -EINVAL;
1072 1320
1073 if (!handler) { 1321 if (!handler) {
@@ -1086,14 +1334,14 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1086 action->name = devname; 1334 action->name = devname;
1087 action->dev_id = dev_id; 1335 action->dev_id = dev_id;
1088 1336
1089 chip_bus_lock(irq, desc); 1337 chip_bus_lock(desc);
1090 retval = __setup_irq(irq, desc, action); 1338 retval = __setup_irq(irq, desc, action);
1091 chip_bus_sync_unlock(irq, desc); 1339 chip_bus_sync_unlock(desc);
1092 1340
1093 if (retval) 1341 if (retval)
1094 kfree(action); 1342 kfree(action);
1095 1343
1096#ifdef CONFIG_DEBUG_SHIRQ 1344#ifdef CONFIG_DEBUG_SHIRQ_FIXME
1097 if (!retval && (irqflags & IRQF_SHARED)) { 1345 if (!retval && (irqflags & IRQF_SHARED)) {
1098 /* 1346 /*
1099 * It's a shared IRQ -- the driver ought to be prepared for it 1347 * It's a shared IRQ -- the driver ought to be prepared for it
@@ -1142,7 +1390,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1142 if (!desc) 1390 if (!desc)
1143 return -EINVAL; 1391 return -EINVAL;
1144 1392
1145 if (desc->status & IRQ_NESTED_THREAD) { 1393 if (irq_settings_is_nested_thread(desc)) {
1146 ret = request_threaded_irq(irq, NULL, handler, 1394 ret = request_threaded_irq(irq, NULL, handler,
1147 flags, name, dev_id); 1395 flags, name, dev_id);
1148 return !ret ? IRQC_IS_NESTED : ret; 1396 return !ret ? IRQC_IS_NESTED : ret;
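The manage.c rework above (forced threading, oneshot thread_mask accounting, bus-locked descriptor helpers) is ultimately exercised through request_threaded_irq(). Below is a sketch of a driver-side call that goes through those paths; the demo_* identifiers are invented and nothing here is mandated by the patch.

#include <linux/interrupt.h>

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/* quick check in hard irq context, defer the heavy work */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/* runs in the per-action thread created by __setup_irq() */
	return IRQ_HANDLED;
}

static int demo_request(unsigned int irq, void *dev)
{
	/*
	 * IRQF_ONESHOT keeps the line masked until demo_thread_fn()
	 * returns; the new thread_mask bookkeeping lets such actions
	 * share one line as long as all of them agree on ONESHOT.
	 */
	return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
				    IRQF_SHARED | IRQF_ONESHOT, "demo", dev);
}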
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 241962280836..47420908fba0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,27 +4,28 @@
4 4
5#include "internals.h" 5#include "internals.h"
6 6
7void move_masked_irq(int irq) 7void irq_move_masked_irq(struct irq_data *idata)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_data_to_desc(idata);
10 struct irq_chip *chip = idata->chip;
10 11
11 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
12 return; 13 return;
13 14
14 /* 15 /*
15 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. 16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
16 */ 17 */
17 if (CHECK_IRQ_PER_CPU(desc->status)) { 18 if (!irqd_can_balance(&desc->irq_data)) {
18 WARN_ON(1); 19 WARN_ON(1);
19 return; 20 return;
20 } 21 }
21 22
22 desc->status &= ~IRQ_MOVE_PENDING; 23 irqd_clr_move_pending(&desc->irq_data);
23 24
24 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
25 return; 26 return;
26 27
27 if (!desc->chip->set_affinity) 28 if (!chip->irq_set_affinity)
28 return; 29 return;
29 30
30 assert_raw_spin_locked(&desc->lock); 31 assert_raw_spin_locked(&desc->lock);
@@ -34,7 +35,7 @@ void move_masked_irq(int irq)
34 * do the disable, re-program, enable sequence. 35 * do the disable, re-program, enable sequence.
35 * This is *not* particularly important for level triggered 36 * This is *not* particularly important for level triggered
36 * but in a edge trigger case, we might be setting rte 37 * but in a edge trigger case, we might be setting rte
37 * when an active trigger is comming in. This could 38 * when an active trigger is coming in. This could
38 * cause some ioapics to mal-function. 39 * cause some ioapics to mal-function.
39 * Being paranoid i guess! 40 * Being paranoid i guess!
40 * 41 *
@@ -43,26 +44,34 @@ void move_masked_irq(int irq)
43 */ 44 */
44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
45 < nr_cpu_ids)) 46 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 47 if (!chip->irq_set_affinity(&desc->irq_data,
47 cpumask_copy(desc->affinity, desc->pending_mask); 48 desc->pending_mask, false)) {
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc); 50 irq_set_thread_affinity(desc);
49 } 51 }
50 52
51 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
52} 54}
53 55
54void move_native_irq(int irq) 56void irq_move_irq(struct irq_data *idata)
55{ 57{
56 struct irq_desc *desc = irq_to_desc(irq); 58 bool masked;
57 59
58 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 60 if (likely(!irqd_is_setaffinity_pending(idata)))
59 return; 61 return;
60 62
61 if (unlikely(desc->status & IRQ_DISABLED)) 63 if (unlikely(irqd_irq_disabled(idata)))
62 return; 64 return;
63 65
64 desc->chip->mask(irq); 66 /*
65 move_masked_irq(irq); 67 * Be careful vs. already masked interrupts. If this is a
66 desc->chip->unmask(irq); 68 * threaded interrupt with ONESHOT set, we can end up with an
69 * interrupt storm.
70 */
71 masked = irqd_irq_masked(idata);
72 if (!masked)
73 idata->chip->irq_mask(idata);
74 irq_move_masked_irq(idata);
75 if (!masked)
76 idata->chip->irq_unmask(idata);
67} 77}
68
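
The rewritten irq_move_irq() above only masks the line if it was not already masked, so a threaded ONESHOT interrupt (already masked by its flow handler) is not unmasked early and turned into an interrupt storm. A standalone restatement of that guard pattern, using only the irq_data accessors visible in this hunk; the helper itself is hypothetical:

#include <linux/irq.h>

/* Hypothetical helper: run @fn with the line masked, restoring the
 * previous mask state afterwards (the pattern irq_move_irq() now uses). */
static void with_line_masked(struct irq_data *idata,
			     void (*fn)(struct irq_data *idata))
{
	bool was_masked = irqd_irq_masked(idata);

	if (!was_masked)
		idata->chip->irq_mask(idata);
	fn(idata);
	if (!was_masked)
		idata->chip->irq_unmask(idata);
}
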
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
deleted file mode 100644
index 65d3845665ac..000000000000
--- a/kernel/irq/numa_migrate.c
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/slab.h>
10#include <linux/module.h>
11#include <linux/random.h>
12#include <linux/interrupt.h>
13#include <linux/kernel_stat.h>
14
15#include "internals.h"
16
17static void init_copy_kstat_irqs(struct irq_desc *old_desc,
18 struct irq_desc *desc,
19 int node, int nr)
20{
21 init_kstat_irqs(desc, node, nr);
22
23 if (desc->kstat_irqs != old_desc->kstat_irqs)
24 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
25 nr * sizeof(*desc->kstat_irqs));
26}
27
28static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
29{
30 if (old_desc->kstat_irqs == desc->kstat_irqs)
31 return;
32
33 kfree(old_desc->kstat_irqs);
34 old_desc->kstat_irqs = NULL;
35}
36
37static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
38 struct irq_desc *desc, int node)
39{
40 memcpy(desc, old_desc, sizeof(struct irq_desc));
41 if (!alloc_desc_masks(desc, node, false)) {
42 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
43 "for migration.\n", irq);
44 return false;
45 }
46 raw_spin_lock_init(&desc->lock);
47 desc->node = node;
48 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
49 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
50 init_copy_desc_masks(old_desc, desc);
51 arch_init_copy_chip_data(old_desc, desc, node);
52 return true;
53}
54
55static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
56{
57 free_kstat_irqs(old_desc, desc);
58 free_desc_masks(old_desc, desc);
59 arch_free_chip_data(old_desc, desc);
60}
61
62static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
63 int node)
64{
65 struct irq_desc *desc;
66 unsigned int irq;
67 unsigned long flags;
68
69 irq = old_desc->irq;
70
71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
72
73 /* We have to check it to avoid races with another CPU */
74 desc = irq_to_desc(irq);
75
76 if (desc && old_desc != desc)
77 goto out_unlock;
78
79 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
80 if (!desc) {
81 printk(KERN_ERR "irq %d: can not get new irq_desc "
82 "for migration.\n", irq);
83 /* still use old one */
84 desc = old_desc;
85 goto out_unlock;
86 }
87 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
88 /* still use old one */
89 kfree(desc);
90 desc = old_desc;
91 goto out_unlock;
92 }
93
94 replace_irq_desc(irq, desc);
95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
96
97 /* free the old one */
98 free_one_irq_desc(old_desc, desc);
99 kfree(old_desc);
100
101 return desc;
102
103out_unlock:
104 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
105
106 return desc;
107}
108
109struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
110{
111 /* those static or target node is -1, do not move them */
112 if (desc->irq < NR_IRQS_LEGACY || node == -1)
113 return desc;
114
115 if (desc->node != node)
116 desc = __real_move_irq_desc(desc, node);
117
118 return desc;
119}
120
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b03..f76fc00c9877 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
18 * During system-wide suspend or hibernation device drivers need to be prevented 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * from receiving interrupts and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It marks all interrupt lines in use, except for the timer ones, as disabled 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * and sets the IRQ_SUSPENDED flag for each of them. 21 * and sets the IRQS_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED) 37 if (desc->istate & IRQS_SUSPENDED)
38 synchronize_irq(irq); 38 synchronize_irq(irq);
39} 39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 40EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() 43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 * 44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that 45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set. 46 * have the IRQS_SUSPENDED flag set.
47 */ 47 */
48void resume_device_irqs(void) 48void resume_device_irqs(void)
49{ 49{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
53 for_each_irq_desc(irq, desc) { 53 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 54 unsigned long flags;
55 55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
61 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
71 struct irq_desc *desc; 68 struct irq_desc *desc;
72 int irq; 69 int irq;
73 70
74 for_each_irq_desc(irq, desc) 71 for_each_irq_desc(irq, desc) {
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) 72 if (irqd_is_wakeup_set(&desc->irq_data)) {
76 return -EBUSY; 73 if (desc->istate & IRQS_PENDING)
74 return -EBUSY;
75 continue;
76 }
77 /*
78 * Check the non wakeup interrupts whether they need
79 * to be masked before finally going into suspend
80 * state. That's for hardware which has no wakeup
81 * source configuration facility. The chip
82 * implementation indicates that with
83 * IRQCHIP_MASK_ON_SUSPEND.
84 */
85 if (desc->istate & IRQS_SUSPENDED &&
86 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
87 mask_irq(desc);
88 }
77 89
78 return 0; 90 return 0;
79} 91}
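
For the IRQCHIP_MASK_ON_SUSPEND branch added to check_wakeup_irqs() above, a sketch of the two sides involved: an irqchip with no wakeup-source register that asks the core to mask its non-wakeup lines before suspend, and a driver that marks its interrupt as a wakeup source with enable_irq_wake(). All names other than the flag and the two kernel calls are hypothetical.

#include <linux/irq.h>
#include <linux/interrupt.h>

static void my_chip_mask(struct irq_data *d)   { /* hypothetical hw access */ }
static void my_chip_unmask(struct irq_data *d) { /* hypothetical hw access */ }

static struct irq_chip my_chip = {
	.name		= "my-chip",
	.irq_mask	= my_chip_mask,
	.irq_unmask	= my_chip_unmask,
	/* No wakeup configuration register in hardware: let the core mask
	 * non-wakeup lines in check_wakeup_irqs() before suspend. */
	.flags		= IRQCHIP_MASK_ON_SUSPEND,
};

static int my_driver_suspend_prepare(int irq)
{
	/* Sets the wakeup state tested by irqd_is_wakeup_set() above. */
	return enable_irq_wake(irq);
}
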
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd2..4bd4faa6323a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -18,16 +19,19 @@ static struct proc_dir_entry *root_irq_dir;
18 19
19#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
20 21
21static int irq_affinity_proc_show(struct seq_file *m, void *v) 22static int show_irq_affinity(int type, struct seq_file *m, void *v)
22{ 23{
23 struct irq_desc *desc = irq_to_desc((long)m->private); 24 struct irq_desc *desc = irq_to_desc((long)m->private);
24 const struct cpumask *mask = desc->affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
25 26
26#ifdef CONFIG_GENERIC_PENDING_IRQ 27#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
28 mask = desc->pending_mask; 29 mask = desc->pending_mask;
29#endif 30#endif
30 seq_cpumask(m, mask); 31 if (type)
32 seq_cpumask_list(m, mask);
33 else
34 seq_cpumask(m, mask);
31 seq_putc(m, '\n'); 35 seq_putc(m, '\n');
32 return 0; 36 return 0;
33} 37}
@@ -58,21 +62,34 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
58#endif 62#endif
59 63
60int no_irq_affinity; 64int no_irq_affinity;
61static ssize_t irq_affinity_proc_write(struct file *file, 65static int irq_affinity_proc_show(struct seq_file *m, void *v)
66{
67 return show_irq_affinity(0, m, v);
68}
69
70static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
71{
72 return show_irq_affinity(1, m, v);
73}
74
75
76static ssize_t write_irq_affinity(int type, struct file *file,
62 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
63{ 78{
64 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
65 cpumask_var_t new_value; 80 cpumask_var_t new_value;
66 int err; 81 int err;
67 82
68 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 83 if (!irq_can_set_affinity(irq) || no_irq_affinity)
69 irq_balancing_disabled(irq))
70 return -EIO; 84 return -EIO;
71 85
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 86 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
73 return -ENOMEM; 87 return -ENOMEM;
74 88
75 err = cpumask_parse_user(buffer, count, new_value); 89 if (type)
90 err = cpumask_parselist_user(buffer, count, new_value);
91 else
92 err = cpumask_parse_user(buffer, count, new_value);
76 if (err) 93 if (err)
77 goto free_cpumask; 94 goto free_cpumask;
78 95
@@ -89,7 +106,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
89 if (!cpumask_intersects(new_value, cpu_online_mask)) { 106 if (!cpumask_intersects(new_value, cpu_online_mask)) {
90 /* Special case for empty set - allow the architecture 107 /* Special case for empty set - allow the architecture
91 code to set default SMP affinity. */ 108 code to set default SMP affinity. */
92 err = irq_select_affinity_usr(irq) ? -EINVAL : count; 109 err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
93 } else { 110 } else {
94 irq_set_affinity(irq, new_value); 111 irq_set_affinity(irq, new_value);
95 err = count; 112 err = count;
@@ -100,11 +117,28 @@ free_cpumask:
100 return err; 117 return err;
101} 118}
102 119
120static ssize_t irq_affinity_proc_write(struct file *file,
121 const char __user *buffer, size_t count, loff_t *pos)
122{
123 return write_irq_affinity(0, file, buffer, count, pos);
124}
125
126static ssize_t irq_affinity_list_proc_write(struct file *file,
127 const char __user *buffer, size_t count, loff_t *pos)
128{
129 return write_irq_affinity(1, file, buffer, count, pos);
130}
131
103static int irq_affinity_proc_open(struct inode *inode, struct file *file) 132static int irq_affinity_proc_open(struct inode *inode, struct file *file)
104{ 133{
105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 134 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
106} 135}
107 136
137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
138{
139 return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
140}
141
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) 142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{ 143{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); 144 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {
125 .release = single_release, 159 .release = single_release,
126}; 160};
127 161
162static const struct file_operations irq_affinity_list_proc_fops = {
163 .open = irq_affinity_list_proc_open,
164 .read = seq_read,
165 .llseek = seq_lseek,
166 .release = single_release,
167 .write = irq_affinity_list_proc_write,
168};
169
128static int default_affinity_show(struct seq_file *m, void *v) 170static int default_affinity_show(struct seq_file *m, void *v)
129{ 171{
130 seq_cpumask(m, irq_default_affinity); 172 seq_cpumask(m, irq_default_affinity);
@@ -185,7 +227,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
185{ 227{
186 struct irq_desc *desc = irq_to_desc((long) m->private); 228 struct irq_desc *desc = irq_to_desc((long) m->private);
187 229
188 seq_printf(m, "%d\n", desc->node); 230 seq_printf(m, "%d\n", desc->irq_data.node);
189 return 0; 231 return 0;
190} 232}
191 233
@@ -214,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
214 256
215static int irq_spurious_proc_open(struct inode *inode, struct file *file) 257static int irq_spurious_proc_open(struct inode *inode, struct file *file)
216{ 258{
217 return single_open(file, irq_spurious_proc_show, NULL); 259 return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
218} 260}
219 261
220static const struct file_operations irq_spurious_proc_fops = { 262static const struct file_operations irq_spurious_proc_fops = {
@@ -269,7 +311,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
269{ 311{
270 char name [MAX_NAMELEN]; 312 char name [MAX_NAMELEN];
271 313
272 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 314 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
273 return; 315 return;
274 316
275 memset(name, 0, MAX_NAMELEN); 317 memset(name, 0, MAX_NAMELEN);
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
289 proc_create_data("affinity_hint", 0400, desc->dir, 331 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq); 332 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291 333
334 /* create /proc/irq/<irq>/smp_affinity_list */
335 proc_create_data("smp_affinity_list", 0600, desc->dir,
336 &irq_affinity_list_proc_fops, (void *)(long)irq);
337
292 proc_create_data("node", 0444, desc->dir, 338 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq); 339 &irq_node_proc_fops, (void *)(long)irq);
294#endif 340#endif
@@ -297,6 +343,25 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
297 &irq_spurious_proc_fops, (void *)(long)irq); 343 &irq_spurious_proc_fops, (void *)(long)irq);
298} 344}
299 345
346void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
347{
348 char name [MAX_NAMELEN];
349
350 if (!root_irq_dir || !desc->dir)
351 return;
352#ifdef CONFIG_SMP
353 remove_proc_entry("smp_affinity", desc->dir);
354 remove_proc_entry("affinity_hint", desc->dir);
355 remove_proc_entry("smp_affinity_list", desc->dir);
356 remove_proc_entry("node", desc->dir);
357#endif
358 remove_proc_entry("spurious", desc->dir);
359
360 memset(name, 0, MAX_NAMELEN);
361 sprintf(name, "%u", irq);
362 remove_proc_entry(name, root_irq_dir);
363}
364
300#undef MAX_NAMELEN 365#undef MAX_NAMELEN
301 366
302void unregister_handler_proc(unsigned int irq, struct irqaction *action) 367void unregister_handler_proc(unsigned int irq, struct irqaction *action)
@@ -339,3 +404,83 @@ void init_irq_proc(void)
339 } 404 }
340} 405}
341 406
407#ifdef CONFIG_GENERIC_IRQ_SHOW
408
409int __weak arch_show_interrupts(struct seq_file *p, int prec)
410{
411 return 0;
412}
413
414#ifndef ACTUAL_NR_IRQS
415# define ACTUAL_NR_IRQS nr_irqs
416#endif
417
418int show_interrupts(struct seq_file *p, void *v)
419{
420 static int prec;
421
422 unsigned long flags, any_count = 0;
423 int i = *(loff_t *) v, j;
424 struct irqaction *action;
425 struct irq_desc *desc;
426
427 if (i > ACTUAL_NR_IRQS)
428 return 0;
429
430 if (i == ACTUAL_NR_IRQS)
431 return arch_show_interrupts(p, prec);
432
433 /* print header and calculate the width of the first column */
434 if (i == 0) {
435 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
436 j *= 10;
437
438 seq_printf(p, "%*s", prec + 8, "");
439 for_each_online_cpu(j)
440 seq_printf(p, "CPU%-8d", j);
441 seq_putc(p, '\n');
442 }
443
444 desc = irq_to_desc(i);
445 if (!desc)
446 return 0;
447
448 raw_spin_lock_irqsave(&desc->lock, flags);
449 for_each_online_cpu(j)
450 any_count |= kstat_irqs_cpu(i, j);
451 action = desc->action;
452 if (!action && !any_count)
453 goto out;
454
455 seq_printf(p, "%*d: ", prec, i);
456 for_each_online_cpu(j)
457 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
458
459 if (desc->irq_data.chip) {
460 if (desc->irq_data.chip->irq_print_chip)
461 desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
462 else if (desc->irq_data.chip->name)
463 seq_printf(p, " %8s", desc->irq_data.chip->name);
464 else
465 seq_printf(p, " %8s", "-");
466 } else {
467 seq_printf(p, " %8s", "None");
468 }
469#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
470 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
471#endif
472 if (desc->name)
473 seq_printf(p, "-%-8s", desc->name);
474
475 if (action) {
476 seq_printf(p, " %s", action->name);
477 while ((action = action->next) != NULL)
478 seq_printf(p, ", %s", action->name);
479 }
480
481 seq_putc(p, '\n');
482out:
483 raw_spin_unlock_irqrestore(&desc->lock, flags);
484 return 0;
485}
486#endif
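
The new /proc/irq/<irq>/smp_affinity_list file accepts and reports a human-readable CPU list (parsed with cpumask_parselist_user()), while the existing smp_affinity file keeps its hex-mask format. A small userspace sketch of the difference; IRQ number 19 is just an example:

#include <stdio.h>

int main(void)
{
	/* Pin IRQ 19 to CPUs 2-3.  The list file takes "2-3"; the older
	 * smp_affinity file would need the equivalent hex mask "c". */
	FILE *f = fopen("/proc/irq/19/smp_affinity_list", "w");

	if (!f)
		return 1;
	fprintf(f, "2-3\n");
	return fclose(f) ? 1 : 0;
}
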
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 090c3763f3a2..14dd5761e8c9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -23,7 +23,7 @@
23#ifdef CONFIG_HARDIRQS_SW_RESEND 23#ifdef CONFIG_HARDIRQS_SW_RESEND
24 24
25/* Bitmap to handle software resend of interrupts: */ 25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS); 26static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
27 27
28/* 28/*
29 * Run software resends of IRQ's 29 * Run software resends of IRQ's
@@ -55,22 +55,21 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->chip->enable(irq);
64
65 /* 58 /*
66 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
67 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
68 * active. 61 * active.
69 */ 62 */
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 63 if (irq_settings_is_level(desc))
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) {
68 desc->istate &= ~IRQS_PENDING;
69 desc->istate |= IRQS_REPLAY;
72 70
73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { 71 if (!desc->irq_data.chip->irq_retrigger ||
72 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
74#ifdef CONFIG_HARDIRQS_SW_RESEND 73#ifdef CONFIG_HARDIRQS_SW_RESEND
75 /* Set it pending and activate the softirq: */ 74 /* Set it pending and activate the softirq: */
76 set_bit(irq, irqs_resend); 75 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 000000000000..f1667833d444
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,142 @@
1/*
2 * Internal header to deal with irq_desc->status which will be renamed
3 * to irq_desc->settings.
4 */
5enum {
6 _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
7 _IRQ_PER_CPU = IRQ_PER_CPU,
8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOTHREAD = IRQ_NOTHREAD,
12 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17};
18
19#define IRQ_PER_CPU GOT_YOU_MORON
20#define IRQ_NO_BALANCING GOT_YOU_MORON
21#define IRQ_LEVEL GOT_YOU_MORON
22#define IRQ_NOPROBE GOT_YOU_MORON
23#define IRQ_NOREQUEST GOT_YOU_MORON
24#define IRQ_NOTHREAD GOT_YOU_MORON
25#define IRQ_NOAUTOEN GOT_YOU_MORON
26#define IRQ_NESTED_THREAD GOT_YOU_MORON
27#undef IRQF_MODIFY_MASK
28#define IRQF_MODIFY_MASK GOT_YOU_MORON
29
30static inline void
31irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
32{
33 desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
34 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
35}
36
37static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
38{
39 return desc->status_use_accessors & _IRQ_PER_CPU;
40}
41
42static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
43{
44 desc->status_use_accessors |= _IRQ_PER_CPU;
45}
46
47static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
48{
49 desc->status_use_accessors |= _IRQ_NO_BALANCING;
50}
51
52static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
53{
54 return desc->status_use_accessors & _IRQ_NO_BALANCING;
55}
56
57static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
58{
59 return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
60}
61
62static inline void
63irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
64{
65 desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK;
66 desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK;
67}
68
69static inline bool irq_settings_is_level(struct irq_desc *desc)
70{
71 return desc->status_use_accessors & _IRQ_LEVEL;
72}
73
74static inline void irq_settings_clr_level(struct irq_desc *desc)
75{
76 desc->status_use_accessors &= ~_IRQ_LEVEL;
77}
78
79static inline void irq_settings_set_level(struct irq_desc *desc)
80{
81 desc->status_use_accessors |= _IRQ_LEVEL;
82}
83
84static inline bool irq_settings_can_request(struct irq_desc *desc)
85{
86 return !(desc->status_use_accessors & _IRQ_NOREQUEST);
87}
88
89static inline void irq_settings_clr_norequest(struct irq_desc *desc)
90{
91 desc->status_use_accessors &= ~_IRQ_NOREQUEST;
92}
93
94static inline void irq_settings_set_norequest(struct irq_desc *desc)
95{
96 desc->status_use_accessors |= _IRQ_NOREQUEST;
97}
98
99static inline bool irq_settings_can_thread(struct irq_desc *desc)
100{
101 return !(desc->status_use_accessors & _IRQ_NOTHREAD);
102}
103
104static inline void irq_settings_clr_nothread(struct irq_desc *desc)
105{
106 desc->status_use_accessors &= ~_IRQ_NOTHREAD;
107}
108
109static inline void irq_settings_set_nothread(struct irq_desc *desc)
110{
111 desc->status_use_accessors |= _IRQ_NOTHREAD;
112}
113
114static inline bool irq_settings_can_probe(struct irq_desc *desc)
115{
116 return !(desc->status_use_accessors & _IRQ_NOPROBE);
117}
118
119static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
120{
121 desc->status_use_accessors &= ~_IRQ_NOPROBE;
122}
123
124static inline void irq_settings_set_noprobe(struct irq_desc *desc)
125{
126 desc->status_use_accessors |= _IRQ_NOPROBE;
127}
128
129static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
130{
131 return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
132}
133
134static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
135{
136 return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
137}
138
139static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
140{
141 return desc->status_use_accessors & _IRQ_NESTED_THREAD;
142}
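
The point of the GOT_YOU_MORON redefinitions above is compile-time enforcement: core irq code that includes this header can no longer touch the raw flag names and must go through the accessors, which read the preserved _IRQ_* copies out of status_use_accessors. A hypothetical illustration of core code after the conversion (the usual kernel/irq core includes are assumed):

/* Hypothetical user inside kernel/irq/ */
#include "settings.h"

static bool core_checks_level(struct irq_desc *desc)
{
	/*
	 * Writing "desc->status_use_accessors & IRQ_LEVEL" here would no
	 * longer compile: IRQ_LEVEL now expands to GOT_YOU_MORON.  The
	 * accessor reads the preserved _IRQ_LEVEL value instead.
	 */
	return irq_settings_is_level(desc);
}
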
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 89fb90ae534f..aa57d5da18c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -14,75 +14,100 @@
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16 16
17#include "internals.h"
18
17static int irqfixup __read_mostly; 19static int irqfixup __read_mostly;
18 20
19#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
20static void poll_spurious_irqs(unsigned long dummy); 22static void poll_spurious_irqs(unsigned long dummy);
21static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); 23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
24static int irq_poll_cpu;
25static atomic_t irq_poll_active;
26
27/*
28 * We wait here for a poller to finish.
29 *
30 * If the poll runs on this CPU, then we yell loudly and return
31 * false. That will leave the interrupt line disabled in the worst
32 * case, but it should never happen.
33 *
34 * We wait until the poller is done and then recheck disabled and
35 * action (about to be disabled). Only if it's still active, we return
36 * true and let the handler run.
37 */
38bool irq_wait_for_poll(struct irq_desc *desc)
39{
40 if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
41 "irq poll in progress on cpu %d for irq %d\n",
42 smp_processor_id(), desc->irq_data.irq))
43 return false;
44
45#ifdef CONFIG_SMP
46 do {
47 raw_spin_unlock(&desc->lock);
48 while (irqd_irq_inprogress(&desc->irq_data))
49 cpu_relax();
50 raw_spin_lock(&desc->lock);
51 } while (irqd_irq_inprogress(&desc->irq_data));
52 /* Might have been disabled in meantime */
53 return !irqd_irq_disabled(&desc->irq_data) && desc->action;
54#else
55 return false;
56#endif
57}
58
22 59
23/* 60/*
24 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
25 */ 62 */
26static int try_one_irq(int irq, struct irq_desc *desc) 63static int try_one_irq(int irq, struct irq_desc *desc, bool force)
27{ 64{
65 irqreturn_t ret = IRQ_NONE;
28 struct irqaction *action; 66 struct irqaction *action;
29 int ok = 0, work = 0;
30 67
31 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
32 /* Already running on another processor */
33 if (desc->status & IRQ_INPROGRESS) {
34 /*
35 * Already running: If it is shared get the other
36 * CPU to go looking for our mystery interrupt too
37 */
38 if (desc->action && (desc->action->flags & IRQF_SHARED))
39 desc->status |= IRQ_PENDING;
40 raw_spin_unlock(&desc->lock);
41 return ok;
42 }
43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action;
46 raw_spin_unlock(&desc->lock);
47 69
48 while (action) { 70 /* PER_CPU and nested thread interrupts are never polled */
49 /* Only shared IRQ handlers are safe to call */ 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
50 if (action->flags & IRQF_SHARED) { 72 goto out;
51 if (action->handler(irq, action->dev_id) ==
52 IRQ_HANDLED)
53 ok = 1;
54 }
55 action = action->next;
56 }
57 local_irq_disable();
58 /* Now clean up the flags */
59 raw_spin_lock(&desc->lock);
60 action = desc->action;
61 73
62 /* 74 /*
63 * While we were looking for a fixup someone queued a real 75 * Do not poll disabled interrupts unless the spurious
64 * IRQ clashing with our walk: 76 * disabled poller asks explicitely.
65 */ 77 */
66 while ((desc->status & IRQ_PENDING) && action) { 78 if (irqd_irq_disabled(&desc->irq_data) && !force)
79 goto out;
80
81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well.
84 */
85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next)
88 goto out;
89
90 /* Already running on another processor */
91 if (irqd_irq_inprogress(&desc->irq_data)) {
67 /* 92 /*
68 * Perform real IRQ processing for the IRQ we deferred 93 * Already running: If it is shared get the other
94 * CPU to go looking for our mystery interrupt too
69 */ 95 */
70 work = 1; 96 desc->istate |= IRQS_PENDING;
71 raw_spin_unlock(&desc->lock); 97 goto out;
72 handle_IRQ_event(irq, action);
73 raw_spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING;
75 } 98 }
76 desc->status &= ~IRQ_INPROGRESS;
77 /*
78 * If we did actual work for the real IRQ line we must let the
79 * IRQ controller clean up too
80 */
81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq);
83 raw_spin_unlock(&desc->lock);
84 99
85 return ok; 100 /* Mark it poll in progress */
101 desc->istate |= IRQS_POLL_INPROGRESS;
102 do {
103 if (handle_irq_event(desc) == IRQ_HANDLED)
104 ret = IRQ_HANDLED;
105 action = desc->action;
106 } while ((desc->istate & IRQS_PENDING) && action);
107 desc->istate &= ~IRQS_POLL_INPROGRESS;
108out:
109 raw_spin_unlock(&desc->lock);
110 return ret == IRQ_HANDLED;
86} 111}
87 112
88static int misrouted_irq(int irq) 113static int misrouted_irq(int irq)
@@ -90,6 +115,11 @@ static int misrouted_irq(int irq)
90 struct irq_desc *desc; 115 struct irq_desc *desc;
91 int i, ok = 0; 116 int i, ok = 0;
92 117
118 if (atomic_inc_return(&irq_poll_active) == 1)
119 goto out;
120
121 irq_poll_cpu = smp_processor_id();
122
93 for_each_irq_desc(i, desc) { 123 for_each_irq_desc(i, desc) {
94 if (!i) 124 if (!i)
95 continue; 125 continue;
@@ -97,9 +127,11 @@ static int misrouted_irq(int irq)
97 if (i == irq) /* Already tried */ 127 if (i == irq) /* Already tried */
98 continue; 128 continue;
99 129
100 if (try_one_irq(i, desc)) 130 if (try_one_irq(i, desc, false))
101 ok = 1; 131 ok = 1;
102 } 132 }
133out:
134 atomic_dec(&irq_poll_active);
103 /* So the caller can adjust the irq error counts */ 135 /* So the caller can adjust the irq error counts */
104 return ok; 136 return ok;
105} 137}
@@ -109,27 +141,39 @@ static void poll_spurious_irqs(unsigned long dummy)
109 struct irq_desc *desc; 141 struct irq_desc *desc;
110 int i; 142 int i;
111 143
144 if (atomic_inc_return(&irq_poll_active) != 1)
145 goto out;
146 irq_poll_cpu = smp_processor_id();
147
112 for_each_irq_desc(i, desc) { 148 for_each_irq_desc(i, desc) {
113 unsigned int status; 149 unsigned int state;
114 150
115 if (!i) 151 if (!i)
116 continue; 152 continue;
117 153
118 /* Racy but it doesn't matter */ 154 /* Racy but it doesn't matter */
119 status = desc->status; 155 state = desc->istate;
120 barrier(); 156 barrier();
121 if (!(status & IRQ_SPURIOUS_DISABLED)) 157 if (!(state & IRQS_SPURIOUS_DISABLED))
122 continue; 158 continue;
123 159
124 local_irq_disable(); 160 local_irq_disable();
125 try_one_irq(i, desc); 161 try_one_irq(i, desc, true);
126 local_irq_enable(); 162 local_irq_enable();
127 } 163 }
128 164out:
165 atomic_dec(&irq_poll_active);
129 mod_timer(&poll_spurious_irq_timer, 166 mod_timer(&poll_spurious_irq_timer,
130 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
131} 168}
132 169
170static inline int bad_action_ret(irqreturn_t action_ret)
171{
172 if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
173 return 0;
174 return 1;
175}
176
133/* 177/*
134 * If 99,900 of the previous 100,000 interrupts have not been handled 178 * If 99,900 of the previous 100,000 interrupts have not been handled
135 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 179 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -137,17 +181,15 @@ static void poll_spurious_irqs(unsigned long dummy)
137 * 181 *
138 * (The other 100-of-100,000 interrupts may have been a correctly 182 * (The other 100-of-100,000 interrupts may have been a correctly
139 * functioning device sharing an IRQ with the failing one) 183 * functioning device sharing an IRQ with the failing one)
140 *
141 * Called under desc->lock
142 */ 184 */
143
144static void 185static void
145__report_bad_irq(unsigned int irq, struct irq_desc *desc, 186__report_bad_irq(unsigned int irq, struct irq_desc *desc,
146 irqreturn_t action_ret) 187 irqreturn_t action_ret)
147{ 188{
148 struct irqaction *action; 189 struct irqaction *action;
190 unsigned long flags;
149 191
150 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 192 if (bad_action_ret(action_ret)) {
151 printk(KERN_ERR "irq event %d: bogus return value %x\n", 193 printk(KERN_ERR "irq event %d: bogus return value %x\n",
152 irq, action_ret); 194 irq, action_ret);
153 } else { 195 } else {
@@ -157,14 +199,23 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
157 dump_stack(); 199 dump_stack();
158 printk(KERN_ERR "handlers:\n"); 200 printk(KERN_ERR "handlers:\n");
159 201
202 /*
203 * We need to take desc->lock here. note_interrupt() is called
204 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
205 * with something else removing an action. It's ok to take
206 * desc->lock here. See synchronize_irq().
207 */
208 raw_spin_lock_irqsave(&desc->lock, flags);
160 action = desc->action; 209 action = desc->action;
161 while (action) { 210 while (action) {
162 printk(KERN_ERR "[<%p>]", action->handler); 211 printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
163 print_symbol(" (%s)", 212 if (action->thread_fn)
164 (unsigned long)action->handler); 213 printk(KERN_CONT " threaded [<%p>] %pf",
165 printk("\n"); 214 action->thread_fn, action->thread_fn);
215 printk(KERN_CONT "\n");
166 action = action->next; 216 action = action->next;
167 } 217 }
218 raw_spin_unlock_irqrestore(&desc->lock, flags);
168} 219}
169 220
170static void 221static void
@@ -216,7 +267,19 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
216void note_interrupt(unsigned int irq, struct irq_desc *desc, 267void note_interrupt(unsigned int irq, struct irq_desc *desc,
217 irqreturn_t action_ret) 268 irqreturn_t action_ret)
218{ 269{
219 if (unlikely(action_ret != IRQ_HANDLED)) { 270 if (desc->istate & IRQS_POLL_INPROGRESS)
271 return;
272
273 /* we get here again via the threaded handler */
274 if (action_ret == IRQ_WAKE_THREAD)
275 return;
276
277 if (bad_action_ret(action_ret)) {
278 report_bad_irq(irq, desc, action_ret);
279 return;
280 }
281
282 if (unlikely(action_ret == IRQ_NONE)) {
220 /* 283 /*
221 * If we are seeing only the odd spurious IRQ caused by 284 * If we are seeing only the odd spurious IRQ caused by
222 * bus asynchronicity then don't eventually trigger an error, 285 * bus asynchronicity then don't eventually trigger an error,
@@ -228,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
228 else 291 else
229 desc->irqs_unhandled++; 292 desc->irqs_unhandled++;
230 desc->last_unhandled = jiffies; 293 desc->last_unhandled = jiffies;
231 if (unlikely(action_ret != IRQ_NONE))
232 report_bad_irq(irq, desc, action_ret);
233 } 294 }
234 295
235 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { 296 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
@@ -252,9 +313,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
252 * Now kill the IRQ 313 * Now kill the IRQ
253 */ 314 */
254 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 315 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
255 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 316 desc->istate |= IRQS_SPURIOUS_DISABLED;
256 desc->depth++; 317 desc->depth++;
257 desc->chip->disable(irq); 318 irq_disable(desc);
258 319
259 mod_timer(&poll_spurious_irq_timer, 320 mod_timer(&poll_spurious_irq_timer,
260 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 321 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
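
note_interrupt() above now distinguishes three cases: IRQ_WAKE_THREAD is ignored (the threaded handler reports later), return values outside IRQ_NONE/IRQ_HANDLED/IRQ_WAKE_THREAD are reported via bad_action_ret(), and IRQ_NONE counts toward the 99,900-in-100,000 unhandled threshold. A driver-side sketch of the contract this enforces; the device structure and status check are hypothetical:

#include <linux/interrupt.h>

struct my_dev { int id; };			/* hypothetical */

static bool my_dev_raised_irq(struct my_dev *dev)
{
	return false;				/* hypothetical status-register read */
}

static irqreturn_t my_shared_handler(int irq, void *dev_id)
{
	struct my_dev *dev = dev_id;

	if (!my_dev_raised_irq(dev))
		return IRQ_NONE;	/* counted in desc->irqs_unhandled */

	/* ... service the device ... */
	return IRQ_HANDLED;		/* anything other than IRQ_NONE,
					 * IRQ_HANDLED or IRQ_WAKE_THREAD is
					 * flagged as a bogus return value */
}
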
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 000000000000..c58fa7da8aef
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,166 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
3 *
4 * Provides a framework for enqueueing and running callbacks from hardirq
5 * context. The enqueueing is NMI-safe.
6 */
7
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/irq_work.h>
11#include <linux/hardirq.h>
12
13/*
14 * An entry can be in one of four states:
15 *
16 * free NULL, 0 -> {claimed} : free to be used
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */
24
25#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL
28
29static inline bool irq_work_is_set(struct irq_work *entry, int flags)
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49
50/*
51 * Claim the entry so that no one else will poke at it.
52 */
53static bool irq_work_claim(struct irq_work *entry)
54{
55 struct irq_work *next, *nflags;
56
57 do {
58 next = entry->next;
59 if ((unsigned long)next & IRQ_WORK_PENDING)
60 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS);
62 } while (cmpxchg(&entry->next, next, nflags) != next);
63
64 return true;
65}
66
67
68void __weak arch_irq_work_raise(void)
69{
70 /*
71 * Lame architectures will get the timer tick callback
72 */
73}
74
75/*
76 * Queue the entry and raise the IPI if needed.
77 */
78static void __irq_work_queue(struct irq_work *entry)
79{
80 struct irq_work *next;
81
82 preempt_disable();
83
84 do {
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry))
92 arch_irq_work_raise();
93
94 preempt_enable();
95}
96
97/*
98 * Enqueue the irq_work @entry, returns true on success, failure when the
99 * @entry was already enqueued by someone else.
100 *
101 * Can be re-enqueued while the callback is still in progress.
102 */
103bool irq_work_queue(struct irq_work *entry)
104{
105 if (!irq_work_claim(entry)) {
106 /*
107 * Already enqueued, can't do!
108 */
109 return false;
110 }
111
112 __irq_work_queue(entry);
113 return true;
114}
115EXPORT_SYMBOL_GPL(irq_work_queue);
116
117/*
118 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
119 * context with local IRQs disabled.
120 */
121void irq_work_run(void)
122{
123 struct irq_work *list;
124
125 if (this_cpu_read(irq_work_list) == NULL)
126 return;
127
128 BUG_ON(!in_irq());
129 BUG_ON(!irqs_disabled());
130
131 list = this_cpu_xchg(irq_work_list, NULL);
132
133 while (list != NULL) {
134 struct irq_work *entry = list;
135
136 list = irq_work_next(list);
137
138 /*
139 * Clear the PENDING bit, after this point the @entry
140 * can be re-used.
141 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY);
143 entry->func(entry);
144 /*
145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile.
147 */
148 (void)cmpxchg(&entry->next,
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
151 }
152}
153EXPORT_SYMBOL_GPL(irq_work_run);
154
155/*
156 * Synchronize against the irq_work @entry, ensures the entry is not
157 * currently in use.
158 */
159void irq_work_sync(struct irq_work *entry)
160{
161 WARN_ON_ONCE(irqs_disabled());
162
163 while (irq_work_is_set(entry, IRQ_WORK_BUSY))
164 cpu_relax();
165}
166EXPORT_SYMBOL_GPL(irq_work_sync);
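
A usage sketch for the new irq_work facility: the callback and the NMI-context caller are hypothetical, but the struct irq_work layout (func plus a flag-carrying next pointer) and irq_work_queue()/irq_work_sync() are the ones defined above.

#include <linux/kernel.h>
#include <linux/irq_work.h>

static void my_irq_work_func(struct irq_work *work)
{
	/* Runs later from hard-irq context, outside the NMI that queued it. */
	pr_info("deferred work ran\n");
}

/* .next left NULL: the "free" state in the table at the top of the file. */
static struct irq_work my_irq_work = {
	.func = my_irq_work_func,
};

static void called_from_nmi(void)
{
	/* NMI-safe enqueue; returns false if the entry is already pending. */
	if (!irq_work_queue(&my_irq_work))
		return;
}

static void my_teardown(void)
{
	/* Must not be called with IRQs disabled (see the WARN_ON_ONCE above). */
	irq_work_sync(&my_irq_work);
}
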
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000000..a8ce45097f3d
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,393 @@
1/*
2 * jump label support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
6 *
7 */
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/slab.h>
13#include <linux/sort.h>
14#include <linux/err.h>
15#include <linux/jump_label.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19/* mutex to protect coming/going of the the jump_label table */
20static DEFINE_MUTEX(jump_label_mutex);
21
22void jump_label_lock(void)
23{
24 mutex_lock(&jump_label_mutex);
25}
26
27void jump_label_unlock(void)
28{
29 mutex_unlock(&jump_label_mutex);
30}
31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
37static int jump_label_cmp(const void *a, const void *b)
38{
39 const struct jump_entry *jea = a;
40 const struct jump_entry *jeb = b;
41
42 if (jea->key < jeb->key)
43 return -1;
44
45 if (jea->key > jeb->key)
46 return 1;
47
48 return 0;
49}
50
51static void
52jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
53{
54 unsigned long size;
55
56 size = (((unsigned long)stop - (unsigned long)start)
57 / sizeof(struct jump_entry));
58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
59}
60
61static void jump_label_update(struct jump_label_key *key, int enable);
62
63void jump_label_inc(struct jump_label_key *key)
64{
65 if (atomic_inc_not_zero(&key->enabled))
66 return;
67
68 jump_label_lock();
69 if (atomic_add_return(1, &key->enabled) == 1)
70 jump_label_update(key, JUMP_LABEL_ENABLE);
71 jump_label_unlock();
72}
73
74void jump_label_dec(struct jump_label_key *key)
75{
76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
77 return;
78
79 jump_label_update(key, JUMP_LABEL_DISABLE);
80 jump_label_unlock();
81}
82
83static int addr_conflict(struct jump_entry *entry, void *start, void *end)
84{
85 if (entry->code <= (unsigned long)end &&
86 entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
87 return 1;
88
89 return 0;
90}
91
92static int __jump_label_text_reserved(struct jump_entry *iter_start,
93 struct jump_entry *iter_stop, void *start, void *end)
94{
95 struct jump_entry *iter;
96
97 iter = iter_start;
98 while (iter < iter_stop) {
99 if (addr_conflict(iter, start, end))
100 return 1;
101 iter++;
102 }
103
104 return 0;
105}
106
107static void __jump_label_update(struct jump_label_key *key,
108 struct jump_entry *entry,
109 struct jump_entry *stop, int enable)
110{
111 for (; (entry < stop) &&
112 (entry->key == (jump_label_t)(unsigned long)key);
113 entry++) {
114 /*
115 * entry->code set to 0 invalidates module init text sections
116 * kernel_text_address() verifies we are not in core kernel
117 * init code, see jump_label_invalidate_module_init().
118 */
119 if (entry->code && kernel_text_address(entry->code))
120 arch_jump_label_transform(entry, enable);
121 }
122}
123
124/*
125 * Not all archs need this.
126 */
127void __weak arch_jump_label_text_poke_early(jump_label_t addr)
128{
129}
130
131static __init int jump_label_init(void)
132{
133 struct jump_entry *iter_start = __start___jump_table;
134 struct jump_entry *iter_stop = __stop___jump_table;
135 struct jump_label_key *key = NULL;
136 struct jump_entry *iter;
137
138 jump_label_lock();
139 jump_label_sort_entries(iter_start, iter_stop);
140
141 for (iter = iter_start; iter < iter_stop; iter++) {
142 arch_jump_label_text_poke_early(iter->code);
143 if (iter->key == (jump_label_t)(unsigned long)key)
144 continue;
145
146 key = (struct jump_label_key *)(unsigned long)iter->key;
147 atomic_set(&key->enabled, 0);
148 key->entries = iter;
149#ifdef CONFIG_MODULES
150 key->next = NULL;
151#endif
152 }
153 jump_label_unlock();
154
155 return 0;
156}
157early_initcall(jump_label_init);
158
159#ifdef CONFIG_MODULES
160
161struct jump_label_mod {
162 struct jump_label_mod *next;
163 struct jump_entry *entries;
164 struct module *mod;
165};
166
167static int __jump_label_mod_text_reserved(void *start, void *end)
168{
169 struct module *mod;
170
171 mod = __module_text_address((unsigned long)start);
172 if (!mod)
173 return 0;
174
175 WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
176
177 return __jump_label_text_reserved(mod->jump_entries,
178 mod->jump_entries + mod->num_jump_entries,
179 start, end);
180}
181
182static void __jump_label_mod_update(struct jump_label_key *key, int enable)
183{
184 struct jump_label_mod *mod = key->next;
185
186 while (mod) {
187 struct module *m = mod->mod;
188
189 __jump_label_update(key, mod->entries,
190 m->jump_entries + m->num_jump_entries,
191 enable);
192 mod = mod->next;
193 }
194}
195
196/***
197 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
198 * @mod: module to patch
199 *
200 * Allow for run-time selection of the optimal nops. Before the module
201 * loads patch these with arch_get_jump_label_nop(), which is specified by
202 * the arch specific jump label code.
203 */
204void jump_label_apply_nops(struct module *mod)
205{
206 struct jump_entry *iter_start = mod->jump_entries;
207 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
208 struct jump_entry *iter;
209
210 /* if the module doesn't have jump label entries, just return */
211 if (iter_start == iter_stop)
212 return;
213
214 for (iter = iter_start; iter < iter_stop; iter++)
215 arch_jump_label_text_poke_early(iter->code);
216}
217
218static int jump_label_add_module(struct module *mod)
219{
220 struct jump_entry *iter_start = mod->jump_entries;
221 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
222 struct jump_entry *iter;
223 struct jump_label_key *key = NULL;
224 struct jump_label_mod *jlm;
225
226 /* if the module doesn't have jump label entries, just return */
227 if (iter_start == iter_stop)
228 return 0;
229
230 jump_label_sort_entries(iter_start, iter_stop);
231
232 for (iter = iter_start; iter < iter_stop; iter++) {
233 if (iter->key == (jump_label_t)(unsigned long)key)
234 continue;
235
236 key = (struct jump_label_key *)(unsigned long)iter->key;
237
238 if (__module_address(iter->key) == mod) {
239 atomic_set(&key->enabled, 0);
240 key->entries = iter;
241 key->next = NULL;
242 continue;
243 }
244
245 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
246 if (!jlm)
247 return -ENOMEM;
248
249 jlm->mod = mod;
250 jlm->entries = iter;
251 jlm->next = key->next;
252 key->next = jlm;
253
254 if (jump_label_enabled(key))
255 __jump_label_update(key, iter, iter_stop,
256 JUMP_LABEL_ENABLE);
257 }
258
259 return 0;
260}
261
262static void jump_label_del_module(struct module *mod)
263{
264 struct jump_entry *iter_start = mod->jump_entries;
265 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
266 struct jump_entry *iter;
267 struct jump_label_key *key = NULL;
268 struct jump_label_mod *jlm, **prev;
269
270 for (iter = iter_start; iter < iter_stop; iter++) {
271 if (iter->key == (jump_label_t)(unsigned long)key)
272 continue;
273
274 key = (struct jump_label_key *)(unsigned long)iter->key;
275
276 if (__module_address(iter->key) == mod)
277 continue;
278
279 prev = &key->next;
280 jlm = key->next;
281
282 while (jlm && jlm->mod != mod) {
283 prev = &jlm->next;
284 jlm = jlm->next;
285 }
286
287 if (jlm) {
288 *prev = jlm->next;
289 kfree(jlm);
290 }
291 }
292}
293
294static void jump_label_invalidate_module_init(struct module *mod)
295{
296 struct jump_entry *iter_start = mod->jump_entries;
297 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
298 struct jump_entry *iter;
299
300 for (iter = iter_start; iter < iter_stop; iter++) {
301 if (within_module_init(iter->code, mod))
302 iter->code = 0;
303 }
304}
305
306static int
307jump_label_module_notify(struct notifier_block *self, unsigned long val,
308 void *data)
309{
310 struct module *mod = data;
311 int ret = 0;
312
313 switch (val) {
314 case MODULE_STATE_COMING:
315 jump_label_lock();
316 ret = jump_label_add_module(mod);
317 if (ret)
318 jump_label_del_module(mod);
319 jump_label_unlock();
320 break;
321 case MODULE_STATE_GOING:
322 jump_label_lock();
323 jump_label_del_module(mod);
324 jump_label_unlock();
325 break;
326 case MODULE_STATE_LIVE:
327 jump_label_lock();
328 jump_label_invalidate_module_init(mod);
329 jump_label_unlock();
330 break;
331 }
332
333 return notifier_from_errno(ret);
334}
335
336struct notifier_block jump_label_module_nb = {
337 .notifier_call = jump_label_module_notify,
338 .priority = 1, /* higher than tracepoints */
339};
340
341static __init int jump_label_init_module(void)
342{
343 return register_module_notifier(&jump_label_module_nb);
344}
345early_initcall(jump_label_init_module);
346
347#endif /* CONFIG_MODULES */
348
349/***
350 * jump_label_text_reserved - check if addr range is reserved
351 * @start: start text addr
352 * @end: end text addr
353 *
354 * checks if the text addr located between @start and @end
355 * overlaps with any of the jump label patch addresses. Code
356 * that wants to modify kernel text should first verify that
357 * it does not overlap with any of the jump label addresses.
358 * Caller must hold jump_label_mutex.
359 *
360 * returns 1 if there is an overlap, 0 otherwise
361 */
362int jump_label_text_reserved(void *start, void *end)
363{
364 int ret = __jump_label_text_reserved(__start___jump_table,
365 __stop___jump_table, start, end);
366
367 if (ret)
368 return ret;
369
370#ifdef CONFIG_MODULES
371 ret = __jump_label_mod_text_reserved(start, end);
372#endif
373 return ret;
374}
375
376static void jump_label_update(struct jump_label_key *key, int enable)
377{
378 struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
379
380#ifdef CONFIG_MODULES
381 struct module *mod = __module_address((jump_label_t)key);
382
383 __jump_label_mod_update(key, enable);
384
385 if (mod)
386 stop = mod->jump_entries + mod->num_jump_entries;
387#endif
388 /* if there are no users, entry can be NULL */
389 if (entry)
390 __jump_label_update(key, entry, stop, enable);
391}
392
393#endif
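
For reference, a usage sketch against the jump-label API of this kernel generation, assuming the static_branch() test macro from <linux/jump_label.h> alongside the struct jump_label_key and jump_label_inc()/jump_label_dec() referenced above; the tracing hook and key name are hypothetical. With HAVE_JUMP_LABEL set, the branch compiles to a patchable no-op and enabling the key rewrites every site.

#include <linux/kernel.h>
#include <linux/jump_label.h>

static struct jump_label_key my_trace_key;	/* starts with enabled == 0 */

static void maybe_trace(void)
{
	/* No-op by default; live-patched to a jump once the key is enabled. */
	if (static_branch(&my_trace_key))
		pr_info("trace hook hit\n");
}

static void trace_enable(void)
{
	jump_label_inc(&my_trace_key);	/* first increment patches all sites */
}

static void trace_disable(void)
{
	jump_label_dec(&my_trace_key);	/* last decrement patches them back */
}
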
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b5757..079f1d39a8b8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || 64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
65 arch_is_kernel_text(addr)) 65 arch_is_kernel_text(addr))
66 return 1; 66 return 1;
67 return in_gate_area_no_task(addr); 67 return in_gate_area_no_mm(addr);
68} 68}
69 69
70static inline int is_kernel(unsigned long addr) 70static inline int is_kernel(unsigned long addr)
71{ 71{
72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) 72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
73 return 1; 73 return 1;
74 return in_gate_area_no_task(addr); 74 return in_gate_area_no_mm(addr);
75} 75}
76 76
77static int is_ksym_addr(unsigned long addr) 77static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
342} 342}
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345int sprint_symbol(char *buffer, unsigned long address) 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset)
346{ 347{
347 char *modname; 348 char *modname;
348 const char *name; 349 const char *name;
349 unsigned long offset, size; 350 unsigned long offset, size;
350 int len; 351 int len;
351 352
353 address += symbol_offset;
352 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 354 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
353 if (!name) 355 if (!name)
354 return sprintf(buffer, "0x%lx", address); 356 return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
357 strcpy(buffer, name); 359 strcpy(buffer, name);
358 len = strlen(buffer); 360 len = strlen(buffer);
359 buffer += len; 361 buffer += len;
362 offset -= symbol_offset;
360 363
361 if (modname) 364 if (modname)
362 len += sprintf(buffer, "+%#lx/%#lx [%s]", 365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
363 offset, size, modname);
364 else 366 else
365 len += sprintf(buffer, "+%#lx/%#lx", offset, size); 367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
366 368
367 return len; 369 return len;
368} 370}
371
372/**
373 * sprint_symbol - Look up a kernel symbol and return it in a text buffer
374 * @buffer: buffer to be stored
375 * @address: address to lookup
376 *
377 * This function looks up a kernel symbol with @address and stores its name,
378 * offset, size and module name to @buffer if possible. If no symbol was found,
379 * just saves its @address as is.
380 *
381 * This function returns the number of bytes stored in @buffer.
382 */
383int sprint_symbol(char *buffer, unsigned long address)
384{
385 return __sprint_symbol(buffer, address, 0);
386}
387
369EXPORT_SYMBOL_GPL(sprint_symbol); 388EXPORT_SYMBOL_GPL(sprint_symbol);
370 389
390/**
391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
392 * @buffer: buffer to be stored
393 * @address: address to lookup
394 *
395 * This function is for stack backtrace and does the same thing as
396 * sprint_symbol() but with modified/decreased @address. If there is a
397 * tail-call to the function marked "noreturn", gcc optimized out code after
398 * the call so that the stack-saved return address could point outside of the
399 * caller. This function ensures that kallsyms will find the original caller
400 * by decreasing @address.
401 *
402 * This function returns the number of bytes stored in @buffer.
403 */
404int sprint_backtrace(char *buffer, unsigned long address)
405{
406 return __sprint_symbol(buffer, address, -1);
407}
408
371/* Look up a kernel symbol and print it to the kernel messages. */ 409/* Look up a kernel symbol and print it to the kernel messages. */
372void __print_symbol(const char *fmt, unsigned long address) 410void __print_symbol(const char *fmt, unsigned long address)
373{ 411{
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
477 */ 515 */
478 type = iter->exported ? toupper(iter->type) : 516 type = iter->exported ? toupper(iter->type) :
479 tolower(iter->type); 517 tolower(iter->type);
480 seq_printf(m, "%0*lx %c %s\t[%s]\n", 518 seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
481 (int)(2 * sizeof(void *)), 519 type, iter->name, iter->module_name);
482 iter->value, type, iter->name, iter->module_name);
483 } else 520 } else
484 seq_printf(m, "%0*lx %c %s\n", 521 seq_printf(m, "%pK %c %s\n", (void *)iter->value,
485 (int)(2 * sizeof(void *)), 522 iter->type, iter->name);
486 iter->value, iter->type, iter->name);
487 return 0; 523 return 0;
488} 524}
489 525
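
A short sketch of where the new sprint_backtrace() helper is meant to be used: printing a stack-saved return address. The symbol is looked up at address - 1, so a tail call into a noreturn function still resolves to the caller, while the reported offset is adjusted back to the original address. KSYM_SYMBOL_LEN comes from <linux/kallsyms.h>; the caller shown is hypothetical.

#include <linux/kernel.h>
#include <linux/kallsyms.h>

static void print_return_address(unsigned long ret_addr)
{
	char buf[KSYM_SYMBOL_LEN];

	/* Lookup at ret_addr - 1; offset printed relative to ret_addr. */
	sprint_backtrace(buf, ret_addr);
	printk(KERN_INFO " [<%p>] %s\n", (void *)ret_addr, buf);
}
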
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c0613f7d6730..8d814cbc8109 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h> 35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h>
36 37
37#include <asm/page.h> 38#include <asm/page.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
144 /* Initialize the list of destination pages */ 145 /* Initialize the list of destination pages */
145 INIT_LIST_HEAD(&image->dest_pages); 146 INIT_LIST_HEAD(&image->dest_pages);
146 147
147 /* Initialize the list of unuseable pages */ 148 /* Initialize the list of unusable pages */
148 INIT_LIST_HEAD(&image->unuseable_pages); 149 INIT_LIST_HEAD(&image->unuseable_pages);
149 150
150 /* Read in the segments */ 151 /* Read in the segments */
@@ -163,7 +164,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
163 * just verifies it is an address we can use. 164 * just verifies it is an address we can use.
164 * 165 *
165 * Since the kernel does everything in page size chunks ensure 166 * Since the kernel does everything in page size chunks ensure
166 * the destination addreses are page aligned. Too many 167 * the destination addresses are page aligned. Too many
167 * special cases crop of when we don't do this. The most 168 * special cases crop of when we don't do this. The most
168 * insidious is getting overlapping destination addresses 169 * insidious is getting overlapping destination addresses
169 * simply because addresses are changed to page size 170 * simply because addresses are changed to page size
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
454 /* Deal with the destination pages I have inadvertently allocated. 455 /* Deal with the destination pages I have inadvertently allocated.
455 * 456 *
456 * Ideally I would convert multi-page allocations into single 457 * Ideally I would convert multi-page allocations into single
457 * page allocations, and add everyting to image->dest_pages. 458 * page allocations, and add everything to image->dest_pages.
458 * 459 *
459 * For now it is simpler to just free the pages. 460 * For now it is simpler to just free the pages.
460 */ 461 */
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image)
602 /* Walk through and free any extra destination pages I may have */ 603 /* Walk through and free any extra destination pages I may have */
603 kimage_free_page_list(&image->dest_pages); 604 kimage_free_page_list(&image->dest_pages);
604 605
605 /* Walk through and free any unuseable pages I have cached */ 606 /* Walk through and free any unusable pages I have cached */
606 kimage_free_page_list(&image->unuseable_pages); 607 kimage_free_page_list(&image->unuseable_pages);
607 608
608} 609}
@@ -816,7 +817,7 @@ static int kimage_load_normal_segment(struct kimage *image,
816 817
817 ptr = kmap(page); 818 ptr = kmap(page);
818 /* Start with a clear page */ 819 /* Start with a clear page */
819 memset(ptr, 0, PAGE_SIZE); 820 clear_page(ptr);
820 ptr += maddr & ~PAGE_MASK; 821 ptr += maddr & ~PAGE_MASK;
821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 822 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
822 if (mchunk > mbytes) 823 if (mchunk > mbytes)
@@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void)
1099 return size; 1100 return size;
1100} 1101}
1101 1102
1102static void free_reserved_phys_range(unsigned long begin, unsigned long end) 1103void __weak crash_free_reserved_phys_range(unsigned long begin,
1104 unsigned long end)
1103{ 1105{
1104 unsigned long addr; 1106 unsigned long addr;
1105 1107
@@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size)
1135 start = roundup(start, PAGE_SIZE); 1137 start = roundup(start, PAGE_SIZE);
1136 end = roundup(start + new_size, PAGE_SIZE); 1138 end = roundup(start + new_size, PAGE_SIZE);
1137 1139
1138 free_reserved_phys_range(end, crashk_res.end); 1140 crash_free_reserved_phys_range(end, crashk_res.end);
1139 1141
1140 if ((start == end) && (crashk_res.parent != NULL)) 1142 if ((start == end) && (crashk_res.parent != NULL))
1141 release_resource(&crashk_res); 1143 release_resource(&crashk_res);
@@ -1529,8 +1531,7 @@ int kernel_kexec(void)
1529 if (error) 1531 if (error)
1530 goto Enable_cpus; 1532 goto Enable_cpus;
1531 local_irq_disable(); 1533 local_irq_disable();
1532 /* Suspend system devices */ 1534 error = syscore_suspend();
1533 error = sysdev_suspend(PMSG_FREEZE);
1534 if (error) 1535 if (error)
1535 goto Enable_irqs; 1536 goto Enable_irqs;
1536 } else 1537 } else
@@ -1545,7 +1546,7 @@ int kernel_kexec(void)
1545 1546
1546#ifdef CONFIG_KEXEC_JUMP 1547#ifdef CONFIG_KEXEC_JUMP
1547 if (kexec_image->preserve_context) { 1548 if (kexec_image->preserve_context) {
1548 sysdev_resume(); 1549 syscore_resume();
1549 Enable_irqs: 1550 Enable_irqs:
1550 local_irq_enable(); 1551 local_irq_enable();
1551 Enable_cpus: 1552 Enable_cpus:
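
kernel_kexec() now suspends and resumes core system state through the syscore framework (syscore_suspend()/syscore_resume()) rather than the retired sysdev_suspend()/sysdev_resume() pair, and crash_free_reserved_phys_range() becomes a __weak symbol that architectures may override. A hedged sketch of the callback shape the kexec-jump path now exercises; the example_* names are invented, only struct syscore_ops and register_syscore_ops() come from <linux/syscore_ops.h>:

#include <linux/syscore_ops.h>

/* Runs late in suspend (and in kexec jump) with interrupts disabled
 * and only one CPU online; must not sleep. */
static int example_syscore_suspend(void)
{
	/* quiesce the subsystem's state here */
	return 0;	/* non-zero aborts the transition */
}

static void example_syscore_resume(void)
{
	/* restore whatever example_syscore_suspend() saved */
}

static struct syscore_ops example_syscore_ops = {
	.suspend = example_syscore_suspend,
	.resume  = example_syscore_resume,
};

/* register_syscore_ops(&example_syscore_ops) at init time is enough;
 * syscore_suspend() walks every registered entry. */
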
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9cd0591c96a2..47613dfb7b28 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/completion.h> 27#include <linux/completion.h>
28#include <linux/cred.h>
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/fdtable.h> 30#include <linux/fdtable.h>
30#include <linux/workqueue.h> 31#include <linux/workqueue.h>
@@ -43,6 +44,13 @@ extern int max_threads;
43 44
44static struct workqueue_struct *khelper_wq; 45static struct workqueue_struct *khelper_wq;
45 46
47#define CAP_BSET (void *)1
48#define CAP_PI (void *)2
49
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock);
53
46#ifdef CONFIG_MODULES 54#ifdef CONFIG_MODULES
47 55
48/* 56/*
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);
132static int ____call_usermodehelper(void *data) 140static int ____call_usermodehelper(void *data)
133{ 141{
134 struct subprocess_info *sub_info = data; 142 struct subprocess_info *sub_info = data;
143 struct cred *new;
135 int retval; 144 int retval;
136 145
137 spin_lock_irq(&current->sighand->siglock); 146 spin_lock_irq(&current->sighand->siglock);
@@ -147,12 +156,27 @@ static int ____call_usermodehelper(void *data)
147 */ 156 */
148 set_user_nice(current, 0); 157 set_user_nice(current, 0);
149 158
159 retval = -ENOMEM;
160 new = prepare_kernel_cred(current);
161 if (!new)
162 goto fail;
163
164 spin_lock(&umh_sysctl_lock);
165 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
166 new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
167 new->cap_inheritable);
168 spin_unlock(&umh_sysctl_lock);
169
150 if (sub_info->init) { 170 if (sub_info->init) {
151 retval = sub_info->init(sub_info); 171 retval = sub_info->init(sub_info, new);
152 if (retval) 172 if (retval) {
173 abort_creds(new);
153 goto fail; 174 goto fail;
175 }
154 } 176 }
155 177
178 commit_creds(new);
179
156 retval = kernel_execve(sub_info->path, 180 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv, 181 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp); 182 (const char *const *)sub_info->envp);
@@ -245,7 +269,6 @@ static void __call_usermodehelper(struct work_struct *work)
245 } 269 }
246} 270}
247 271
248#ifdef CONFIG_PM_SLEEP
249/* 272/*
250 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 273 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
251 * (used for preventing user land processes from being created after the user 274 * (used for preventing user land processes from being created after the user
@@ -301,6 +324,15 @@ void usermodehelper_enable(void)
301 usermodehelper_disabled = 0; 324 usermodehelper_disabled = 0;
302} 325}
303 326
327/**
328 * usermodehelper_is_disabled - check if new helpers are allowed to be started
329 */
330bool usermodehelper_is_disabled(void)
331{
332 return usermodehelper_disabled;
333}
334EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
335
304static void helper_lock(void) 336static void helper_lock(void)
305{ 337{
306 atomic_inc(&running_helpers); 338 atomic_inc(&running_helpers);
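
usermodehelper_is_disabled() lets callers detect the window (suspend/hibernation and similar transitions) in which no new helpers may start, instead of queuing work that can never complete. A sketch of the usual consumer, firmware loading on a resume path; apart from usermodehelper_is_disabled() and request_firmware(), the names are illustrative:

#include <linux/kmod.h>
#include <linux/firmware.h>
#include <linux/device.h>

static int example_load_firmware(struct device *dev,
				 const struct firmware **fw)
{
	/* Userspace helpers are frozen: fail fast rather than block */
	if (usermodehelper_is_disabled())
		return -EBUSY;

	return request_firmware(fw, "example/fw.bin", dev);
}
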
@@ -312,12 +344,6 @@ static void helper_unlock(void)
312 if (atomic_dec_and_test(&running_helpers)) 344 if (atomic_dec_and_test(&running_helpers))
313 wake_up(&running_helpers_waitq); 345 wake_up(&running_helpers_waitq);
314} 346}
315#else /* CONFIG_PM_SLEEP */
316#define usermodehelper_disabled 0
317
318static inline void helper_lock(void) {}
319static inline void helper_unlock(void) {}
320#endif /* CONFIG_PM_SLEEP */
321 347
322/** 348/**
323 * call_usermodehelper_setup - prepare to call a usermode helper 349 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -364,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
364 * context in which call_usermodehelper_exec is called. 390 * context in which call_usermodehelper_exec is called.
365 */ 391 */
366void call_usermodehelper_setfns(struct subprocess_info *info, 392void call_usermodehelper_setfns(struct subprocess_info *info,
367 int (*init)(struct subprocess_info *info), 393 int (*init)(struct subprocess_info *info, struct cred *new),
368 void (*cleanup)(struct subprocess_info *info), 394 void (*cleanup)(struct subprocess_info *info),
369 void *data) 395 void *data)
370{ 396{
@@ -418,6 +444,84 @@ unlock:
418} 444}
419EXPORT_SYMBOL(call_usermodehelper_exec); 445EXPORT_SYMBOL(call_usermodehelper_exec);
420 446
447static int proc_cap_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, loff_t *ppos)
449{
450 struct ctl_table t;
451 unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
452 kernel_cap_t new_cap;
453 int err, i;
454
455 if (write && (!capable(CAP_SETPCAP) ||
456 !capable(CAP_SYS_MODULE)))
457 return -EPERM;
458
459 /*
460 * convert from the global kernel_cap_t to the ulong array to print to
461 * userspace if this is a read.
462 */
463 spin_lock(&umh_sysctl_lock);
464 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
465 if (table->data == CAP_BSET)
466 cap_array[i] = usermodehelper_bset.cap[i];
467 else if (table->data == CAP_PI)
468 cap_array[i] = usermodehelper_inheritable.cap[i];
469 else
470 BUG();
471 }
472 spin_unlock(&umh_sysctl_lock);
473
474 t = *table;
475 t.data = &cap_array;
476
477 /*
478 * actually read or write and array of ulongs from userspace. Remember
479 * these are least significant 32 bits first
480 */
481 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
482 if (err < 0)
483 return err;
484
485 /*
486 * convert from the sysctl array of ulongs to the kernel_cap_t
487 * internal representation
488 */
489 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
490 new_cap.cap[i] = cap_array[i];
491
492 /*
493 * Drop everything not in the new_cap (but don't add things)
494 */
495 spin_lock(&umh_sysctl_lock);
496 if (write) {
497 if (table->data == CAP_BSET)
498 usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
499 if (table->data == CAP_PI)
500 usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
501 }
502 spin_unlock(&umh_sysctl_lock);
503
504 return 0;
505}
506
507struct ctl_table usermodehelper_table[] = {
508 {
509 .procname = "bset",
510 .data = CAP_BSET,
511 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
512 .mode = 0600,
513 .proc_handler = proc_cap_handler,
514 },
515 {
516 .procname = "inheritable",
517 .data = CAP_PI,
518 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
519 .mode = 0600,
520 .proc_handler = proc_cap_handler,
521 },
522 { }
523};
524
421void __init usermodehelper_init(void) 525void __init usermodehelper_init(void)
422{ 526{
423 khelper_wq = create_singlethread_workqueue("khelper"); 527 khelper_wq = create_singlethread_workqueue("khelper");
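
Two things change for usermode-helper callers in this kmod.c diff: helpers now run with credentials built by prepare_kernel_cred() and clamped by the new kernel.usermodehelper.{bset,inheritable} sysctls (which can only ever shrink the capability sets, never widen them), and the per-call init() hook gains a struct cred * argument so a caller may tighten the creds further before exec. A hedged sketch built from the functions visible in this file plus cap_lower() from <linux/capability.h>; the example_* names, helper path, and the UMH_WAIT_PROC wait mode are illustrative:

#include <linux/kmod.h>
#include <linux/cred.h>
#include <linux/capability.h>

/* Called in the helper child before exec: drop one more capability. */
static int example_umh_init(struct subprocess_info *info, struct cred *new)
{
	cap_lower(new->cap_effective, CAP_SYS_ADMIN);
	cap_lower(new->cap_permitted, CAP_SYS_ADMIN);
	return 0;
}

static int example_run_helper(void)
{
	char *argv[] = { "/sbin/example-helper", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	call_usermodehelper_setfns(info, example_umh_init, NULL, NULL);
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}

The sysctl side is exposed as /proc/sys/kernel/usermodehelper/bset and .../inheritable, writable only with both CAP_SETPCAP and CAP_SYS_MODULE, as proc_cap_handler() above enforces.
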
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..77981813a1e7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
47#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/jump_label.h>
50 51
51#include <asm-generic/sections.h> 52#include <asm-generic/sections.h>
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
@@ -73,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
73/* NOTE: change this value only with kprobe_mutex held */ 74/* NOTE: change this value only with kprobe_mutex held */
74static bool kprobes_all_disarmed; 75static bool kprobes_all_disarmed;
75 76
76static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 77/* This protects kprobe_table and optimizing_list */
78static DEFINE_MUTEX(kprobe_mutex);
77static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
78static struct { 80static struct {
79 spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -315,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
315/* We have preemption disabled.. so it is safe to use __ versions */ 317/* We have preemption disabled.. so it is safe to use __ versions */
316static inline void set_kprobe_instance(struct kprobe *kp) 318static inline void set_kprobe_instance(struct kprobe *kp)
317{ 319{
318 __get_cpu_var(kprobe_instance) = kp; 320 __this_cpu_write(kprobe_instance, kp);
319} 321}
320 322
321static inline void reset_kprobe_instance(void) 323static inline void reset_kprobe_instance(void)
322{ 324{
323 __get_cpu_var(kprobe_instance) = NULL; 325 __this_cpu_write(kprobe_instance, NULL);
324} 326}
325 327
326/* 328/*
@@ -352,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
352 return p->pre_handler == aggr_pre_handler; 354 return p->pre_handler == aggr_pre_handler;
353} 355}
354 356
357/* Return true(!0) if the kprobe is unused */
358static inline int kprobe_unused(struct kprobe *p)
359{
360 return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
361 list_empty(&p->list);
362}
363
355/* 364/*
356 * Keep all fields in the kprobe consistent 365 * Keep all fields in the kprobe consistent
357 */ 366 */
358static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 367static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
359{ 368{
360 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 369 memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
361 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 370 memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
362} 371}
363 372
364#ifdef CONFIG_OPTPROBES 373#ifdef CONFIG_OPTPROBES
@@ -382,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
382 } 391 }
383} 392}
384 393
394/* Free optimized instructions and optimized_kprobe */
395static __kprobes void free_aggr_kprobe(struct kprobe *p)
396{
397 struct optimized_kprobe *op;
398
399 op = container_of(p, struct optimized_kprobe, kp);
400 arch_remove_optimized_kprobe(op);
401 arch_remove_kprobe(p);
402 kfree(op);
403}
404
385/* Return true(!0) if the kprobe is ready for optimization. */ 405/* Return true(!0) if the kprobe is ready for optimization. */
386static inline int kprobe_optready(struct kprobe *p) 406static inline int kprobe_optready(struct kprobe *p)
387{ 407{
@@ -395,11 +415,38 @@ static inline int kprobe_optready(struct kprobe *p)
395 return 0; 415 return 0;
396} 416}
397 417
418/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
419static inline int kprobe_disarmed(struct kprobe *p)
420{
421 struct optimized_kprobe *op;
422
423 /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
424 if (!kprobe_aggrprobe(p))
425 return kprobe_disabled(p);
426
427 op = container_of(p, struct optimized_kprobe, kp);
428
429 return kprobe_disabled(p) && list_empty(&op->list);
430}
431
432/* Return true(!0) if the probe is queued on (un)optimizing lists */
433static int __kprobes kprobe_queued(struct kprobe *p)
434{
435 struct optimized_kprobe *op;
436
437 if (kprobe_aggrprobe(p)) {
438 op = container_of(p, struct optimized_kprobe, kp);
439 if (!list_empty(&op->list))
440 return 1;
441 }
442 return 0;
443}
444
398/* 445/*
399 * Return an optimized kprobe whose optimizing code replaces 446 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint). 447 * instructions including addr (exclude breakpoint).
401 */ 448 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 449static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{ 450{
404 int i; 451 int i;
405 struct kprobe *p = NULL; 452 struct kprobe *p = NULL;
@@ -420,30 +467,23 @@ struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
420 467
421/* Optimization staging list, protected by kprobe_mutex */ 468/* Optimization staging list, protected by kprobe_mutex */
422static LIST_HEAD(optimizing_list); 469static LIST_HEAD(optimizing_list);
470static LIST_HEAD(unoptimizing_list);
423 471
424static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
425static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
426#define OPTIMIZE_DELAY 5 475#define OPTIMIZE_DELAY 5
427 476
428/* Kprobe jump optimizer */ 477/*
429static __kprobes void kprobe_optimizer(struct work_struct *work) 478 * Optimize (replace a breakpoint with a jump) kprobes listed on
479 * optimizing_list.
480 */
481static __kprobes void do_optimize_kprobes(void)
430{ 482{
431 struct optimized_kprobe *op, *tmp; 483 /* Optimization never be done when disarmed */
432 484 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
433 /* Lock modules while optimizing kprobes */ 485 list_empty(&optimizing_list))
434 mutex_lock(&module_mutex); 486 return;
435 mutex_lock(&kprobe_mutex);
436 if (kprobes_all_disarmed || !kprobes_allow_optimization)
437 goto end;
438
439 /*
440 * Wait for quiesence period to ensure all running interrupts
441 * are done. Because optprobe may modify multiple instructions
442 * there is a chance that Nth instruction is interrupted. In that
443 * case, running interrupt can return to 2nd-Nth byte of jump
444 * instruction. This wait is for avoiding it.
445 */
446 synchronize_sched();
447 487
448 /* 488 /*
449 * The optimization/unoptimization refers online_cpus via 489 * The optimization/unoptimization refers online_cpus via
@@ -457,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
457 */ 497 */
458 get_online_cpus(); 498 get_online_cpus();
459 mutex_lock(&text_mutex); 499 mutex_lock(&text_mutex);
460 list_for_each_entry_safe(op, tmp, &optimizing_list, list) { 500 arch_optimize_kprobes(&optimizing_list);
461 WARN_ON(kprobe_disabled(&op->kp)); 501 mutex_unlock(&text_mutex);
462 if (arch_optimize_kprobe(op) < 0) 502 put_online_cpus();
463 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 503}
464 list_del_init(&op->list); 504
505/*
506 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
507 * if need) kprobes listed on unoptimizing_list.
508 */
509static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
510{
511 struct optimized_kprobe *op, *tmp;
512
513 /* Unoptimization must be done anytime */
514 if (list_empty(&unoptimizing_list))
515 return;
516
517 /* Ditto to do_optimize_kprobes */
518 get_online_cpus();
519 mutex_lock(&text_mutex);
520 arch_unoptimize_kprobes(&unoptimizing_list, free_list);
521 /* Loop free_list for disarming */
522 list_for_each_entry_safe(op, tmp, free_list, list) {
523 /* Disarm probes if marked disabled */
524 if (kprobe_disabled(&op->kp))
525 arch_disarm_kprobe(&op->kp);
526 if (kprobe_unused(&op->kp)) {
527 /*
528 * Remove unused probes from hash list. After waiting
529 * for synchronization, these probes are reclaimed.
530 * (reclaiming is done by do_free_cleaned_kprobes.)
531 */
532 hlist_del_rcu(&op->kp.hlist);
533 } else
534 list_del_init(&op->list);
465 } 535 }
466 mutex_unlock(&text_mutex); 536 mutex_unlock(&text_mutex);
467 put_online_cpus(); 537 put_online_cpus();
468end: 538}
539
540/* Reclaim all kprobes on the free_list */
541static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
542{
543 struct optimized_kprobe *op, *tmp;
544
545 list_for_each_entry_safe(op, tmp, free_list, list) {
546 BUG_ON(!kprobe_unused(&op->kp));
547 list_del_init(&op->list);
548 free_aggr_kprobe(&op->kp);
549 }
550}
551
552/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void)
554{
555 if (!delayed_work_pending(&optimizing_work))
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557}
558
559/* Kprobe jump optimizer */
560static __kprobes void kprobe_optimizer(struct work_struct *work)
561{
562 LIST_HEAD(free_list);
563
564 /* Lock modules while optimizing kprobes */
565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567
568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
570 * kprobes before waiting for quiesence period.
571 */
572 do_unoptimize_kprobes(&free_list);
573
574 /*
575 * Step 2: Wait for quiesence period to ensure all running interrupts
576 * are done. Because optprobe may modify multiple instructions
577 * there is a chance that Nth instruction is interrupted. In that
578 * case, running interrupt can return to 2nd-Nth byte of jump
579 * instruction. This wait is for avoiding it.
580 */
581 synchronize_sched();
582
583 /* Step 3: Optimize kprobes after quiesence period */
584 do_optimize_kprobes();
585
586 /* Step 4: Free cleaned kprobes after quiesence period */
587 do_free_cleaned_kprobes(&free_list);
588
469 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
470 mutex_unlock(&module_mutex); 590 mutex_unlock(&module_mutex);
591
592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598}
599
600/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void)
602{
603 if (delayed_work_pending(&optimizing_work))
604 wait_for_completion(&optimizer_comp);
471} 605}
472 606
473/* Optimize kprobe if p is ready to be optimized */ 607/* Optimize kprobe if p is ready to be optimized */
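
The rework above batches jump optimization and unoptimization: probes queue on optimizing_list/unoptimizing_list and a single delayed worker pays the synchronize_sched() quiescence cost once per batch instead of once per probe. None of this changes the registration API; a minimal sketch of a probe that would flow through this machinery (symbol and names are illustrative):

#include <linux/kprobes.h>

static int example_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	/* Runs from the breakpoint, or from the jump once optimized */
	return 0;
}

static struct kprobe example_kp = {
	.symbol_name	= "do_fork",
	.pre_handler	= example_pre_handler,
};

/*
 * register_kprobe(&example_kp) arms a breakpoint right away and queues
 * the probe on optimizing_list; roughly OPTIMIZE_DELAY jiffies later
 * kprobe_optimizer() swaps it for a jump.  unregister_kprobe(&example_kp)
 * queues the reverse transition on unoptimizing_list.
 */
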
@@ -493,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
493 /* Check if it is already optimized. */ 627 /* Check if it is already optimized. */
494 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) 628 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
495 return; 629 return;
496
497 op->kp.flags |= KPROBE_FLAG_OPTIMIZED; 630 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
498 list_add(&op->list, &optimizing_list); 631
499 if (!delayed_work_pending(&optimizing_work)) 632 if (!list_empty(&op->list))
500 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 633 /* This is under unoptimizing. Just dequeue the probe */
634 list_del_init(&op->list);
635 else {
636 list_add(&op->list, &optimizing_list);
637 kick_kprobe_optimizer();
638 }
639}
640
641/* Short cut to direct unoptimizing */
642static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
643{
644 get_online_cpus();
645 arch_unoptimize_kprobe(op);
646 put_online_cpus();
647 if (kprobe_disabled(&op->kp))
648 arch_disarm_kprobe(&op->kp);
501} 649}
502 650
503/* Unoptimize a kprobe if p is optimized */ 651/* Unoptimize a kprobe if p is optimized */
504static __kprobes void unoptimize_kprobe(struct kprobe *p) 652static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
505{ 653{
506 struct optimized_kprobe *op; 654 struct optimized_kprobe *op;
507 655
508 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { 656 if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
509 op = container_of(p, struct optimized_kprobe, kp); 657 return; /* This is not an optprobe nor optimized */
510 if (!list_empty(&op->list)) 658
511 /* Dequeue from the optimization queue */ 659 op = container_of(p, struct optimized_kprobe, kp);
660 if (!kprobe_optimized(p)) {
661 /* Unoptimized or unoptimizing case */
662 if (force && !list_empty(&op->list)) {
663 /*
664 * Only if this is unoptimizing kprobe and forced,
665 * forcibly unoptimize it. (No need to unoptimize
666 * unoptimized kprobe again :)
667 */
512 list_del_init(&op->list); 668 list_del_init(&op->list);
513 else 669 force_unoptimize_kprobe(op);
514 /* Replace jump with break */ 670 }
515 arch_unoptimize_kprobe(op); 671 return;
516 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 672 }
673
674 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
675 if (!list_empty(&op->list)) {
676 /* Dequeue from the optimization queue */
677 list_del_init(&op->list);
678 return;
679 }
680 /* Optimized kprobe case */
681 if (force)
682 /* Forcibly update the code: this is a special case */
683 force_unoptimize_kprobe(op);
684 else {
685 list_add(&op->list, &unoptimizing_list);
686 kick_kprobe_optimizer();
517 } 687 }
518} 688}
519 689
690/* Cancel unoptimizing for reusing */
691static void reuse_unused_kprobe(struct kprobe *ap)
692{
693 struct optimized_kprobe *op;
694
695 BUG_ON(!kprobe_unused(ap));
696 /*
697 * Unused kprobe MUST be on the way of delayed unoptimizing (means
698 * there is still a relative jump) and disabled.
699 */
700 op = container_of(ap, struct optimized_kprobe, kp);
701 if (unlikely(list_empty(&op->list)))
702 printk(KERN_WARNING "Warning: found a stray unused "
703 "aggrprobe@%p\n", ap->addr);
704 /* Enable the probe again */
705 ap->flags &= ~KPROBE_FLAG_DISABLED;
706 /* Optimize it again (remove from op->list) */
707 BUG_ON(!kprobe_optready(ap));
708 optimize_kprobe(ap);
709}
710
520/* Remove optimized instructions */ 711/* Remove optimized instructions */
521static void __kprobes kill_optimized_kprobe(struct kprobe *p) 712static void __kprobes kill_optimized_kprobe(struct kprobe *p)
522{ 713{
523 struct optimized_kprobe *op; 714 struct optimized_kprobe *op;
524 715
525 op = container_of(p, struct optimized_kprobe, kp); 716 op = container_of(p, struct optimized_kprobe, kp);
526 if (!list_empty(&op->list)) { 717 if (!list_empty(&op->list))
527 /* Dequeue from the optimization queue */ 718 /* Dequeue from the (un)optimization queue */
528 list_del_init(&op->list); 719 list_del_init(&op->list);
529 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 720
530 } 721 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
531 /* Don't unoptimize, because the target code will be freed. */ 722 /* Don't touch the code, because it is already freed. */
532 arch_remove_optimized_kprobe(op); 723 arch_remove_optimized_kprobe(op);
533} 724}
534 725
@@ -541,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
541 arch_prepare_optimized_kprobe(op); 732 arch_prepare_optimized_kprobe(op);
542} 733}
543 734
544/* Free optimized instructions and optimized_kprobe */
545static __kprobes void free_aggr_kprobe(struct kprobe *p)
546{
547 struct optimized_kprobe *op;
548
549 op = container_of(p, struct optimized_kprobe, kp);
550 arch_remove_optimized_kprobe(op);
551 kfree(op);
552}
553
554/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 735/* Allocate new optimized_kprobe and try to prepare optimized instructions */
555static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 736static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
556{ 737{
@@ -585,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
585 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
586 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
587 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
588 free_aggr_kprobe(ap); 769 arch_remove_optimized_kprobe(op);
770 kfree(op);
589 return; 771 return;
590 } 772 }
591 773
@@ -594,6 +776,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
594} 776}
595 777
596#ifdef CONFIG_SYSCTL 778#ifdef CONFIG_SYSCTL
779/* This should be called with kprobe_mutex locked */
597static void __kprobes optimize_all_kprobes(void) 780static void __kprobes optimize_all_kprobes(void)
598{ 781{
599 struct hlist_head *head; 782 struct hlist_head *head;
@@ -606,17 +789,16 @@ static void __kprobes optimize_all_kprobes(void)
606 return; 789 return;
607 790
608 kprobes_allow_optimization = true; 791 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 792 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i]; 793 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist) 794 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p)) 795 if (!kprobe_disabled(p))
614 optimize_kprobe(p); 796 optimize_kprobe(p);
615 } 797 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n"); 798 printk(KERN_INFO "Kprobes globally optimized\n");
618} 799}
619 800
801/* This should be called with kprobe_mutex locked */
620static void __kprobes unoptimize_all_kprobes(void) 802static void __kprobes unoptimize_all_kprobes(void)
621{ 803{
622 struct hlist_head *head; 804 struct hlist_head *head;
@@ -629,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
629 return; 811 return;
630 812
631 kprobes_allow_optimization = false; 813 kprobes_allow_optimization = false;
632 printk(KERN_INFO "Kprobes globally unoptimized\n");
633 get_online_cpus(); /* For avoiding text_mutex deadlock */
634 mutex_lock(&text_mutex);
635 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 814 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
636 head = &kprobe_table[i]; 815 head = &kprobe_table[i];
637 hlist_for_each_entry_rcu(p, node, head, hlist) { 816 hlist_for_each_entry_rcu(p, node, head, hlist) {
638 if (!kprobe_disabled(p)) 817 if (!kprobe_disabled(p))
639 unoptimize_kprobe(p); 818 unoptimize_kprobe(p, false);
640 } 819 }
641 } 820 }
642 821 /* Wait for unoptimizing completion */
643 mutex_unlock(&text_mutex); 822 wait_for_kprobe_optimizer();
644 put_online_cpus(); 823 printk(KERN_INFO "Kprobes globally unoptimized\n");
645 /* Allow all currently running kprobes to complete */
646 synchronize_sched();
647} 824}
648 825
649int sysctl_kprobes_optimization; 826int sysctl_kprobes_optimization;
@@ -667,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
667} 844}
668#endif /* CONFIG_SYSCTL */ 845#endif /* CONFIG_SYSCTL */
669 846
847/* Put a breakpoint for a probe. Must be called with text_mutex locked */
670static void __kprobes __arm_kprobe(struct kprobe *p) 848static void __kprobes __arm_kprobe(struct kprobe *p)
671{ 849{
672 struct kprobe *old_p; 850 struct kprobe *_p;
673 851
674 /* Check collision with other optimized kprobes */ 852 /* Check collision with other optimized kprobes */
675 old_p = get_optimized_kprobe((unsigned long)p->addr); 853 _p = get_optimized_kprobe((unsigned long)p->addr);
676 if (unlikely(old_p)) 854 if (unlikely(_p))
677 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ 855 /* Fallback to unoptimized kprobe */
856 unoptimize_kprobe(_p, true);
678 857
679 arch_arm_kprobe(p); 858 arch_arm_kprobe(p);
680 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ 859 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
681} 860}
682 861
683static void __kprobes __disarm_kprobe(struct kprobe *p) 862/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
863static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
684{ 864{
685 struct kprobe *old_p; 865 struct kprobe *_p;
686 866
687 unoptimize_kprobe(p); /* Try to unoptimize */ 867 unoptimize_kprobe(p, false); /* Try to unoptimize */
688 arch_disarm_kprobe(p);
689 868
690 /* If another kprobe was blocked, optimize it. */ 869 if (!kprobe_queued(p)) {
691 old_p = get_optimized_kprobe((unsigned long)p->addr); 870 arch_disarm_kprobe(p);
692 if (unlikely(old_p)) 871 /* If another kprobe was blocked, optimize it. */
693 optimize_kprobe(old_p); 872 _p = get_optimized_kprobe((unsigned long)p->addr);
873 if (unlikely(_p) && reopt)
874 optimize_kprobe(_p);
875 }
876 /* TODO: reoptimize others after unoptimized this probe */
694} 877}
695 878
696#else /* !CONFIG_OPTPROBES */ 879#else /* !CONFIG_OPTPROBES */
697 880
698#define optimize_kprobe(p) do {} while (0) 881#define optimize_kprobe(p) do {} while (0)
699#define unoptimize_kprobe(p) do {} while (0) 882#define unoptimize_kprobe(p, f) do {} while (0)
700#define kill_optimized_kprobe(p) do {} while (0) 883#define kill_optimized_kprobe(p) do {} while (0)
701#define prepare_optimized_kprobe(p) do {} while (0) 884#define prepare_optimized_kprobe(p) do {} while (0)
702#define try_to_optimize_kprobe(p) do {} while (0) 885#define try_to_optimize_kprobe(p) do {} while (0)
703#define __arm_kprobe(p) arch_arm_kprobe(p) 886#define __arm_kprobe(p) arch_arm_kprobe(p)
704#define __disarm_kprobe(p) arch_disarm_kprobe(p) 887#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
888#define kprobe_disarmed(p) kprobe_disabled(p)
889#define wait_for_kprobe_optimizer() do {} while (0)
890
891/* There should be no unused kprobes can be reused without optimization */
892static void reuse_unused_kprobe(struct kprobe *ap)
893{
894 printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
895 BUG_ON(kprobe_unused(ap));
896}
705 897
706static __kprobes void free_aggr_kprobe(struct kprobe *p) 898static __kprobes void free_aggr_kprobe(struct kprobe *p)
707{ 899{
900 arch_remove_kprobe(p);
708 kfree(p); 901 kfree(p);
709} 902}
710 903
@@ -730,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
730/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
731static void __kprobes disarm_kprobe(struct kprobe *kp) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
732{ 925{
733 get_online_cpus(); /* For avoiding text_mutex deadlock */ 926 /* Ditto */
734 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
735 __disarm_kprobe(kp); 928 __disarm_kprobe(kp, true);
736 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
737 put_online_cpus();
738} 930}
739 931
740/* 932/*
@@ -773,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
773static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 965static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
774 int trapnr) 966 int trapnr)
775{ 967{
776 struct kprobe *cur = __get_cpu_var(kprobe_instance); 968 struct kprobe *cur = __this_cpu_read(kprobe_instance);
777 969
778 /* 970 /*
779 * if we faulted "during" the execution of a user specified 971 * if we faulted "during" the execution of a user specified
@@ -788,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
788 980
789static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 981static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
790{ 982{
791 struct kprobe *cur = __get_cpu_var(kprobe_instance); 983 struct kprobe *cur = __this_cpu_read(kprobe_instance);
792 int ret = 0; 984 int ret = 0;
793 985
794 if (cur && cur->break_handler) { 986 if (cur && cur->break_handler) {
@@ -831,6 +1023,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
831 1023
832void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 1024void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
833 struct hlist_head **head, unsigned long *flags) 1025 struct hlist_head **head, unsigned long *flags)
1026__acquires(hlist_lock)
834{ 1027{
835 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
836 spinlock_t *hlist_lock; 1029 spinlock_t *hlist_lock;
@@ -842,6 +1035,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
842 1035
843static void __kprobes kretprobe_table_lock(unsigned long hash, 1036static void __kprobes kretprobe_table_lock(unsigned long hash,
844 unsigned long *flags) 1037 unsigned long *flags)
1038__acquires(hlist_lock)
845{ 1039{
846 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1040 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
847 spin_lock_irqsave(hlist_lock, *flags); 1041 spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +1043,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
849 1043
850void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
851 unsigned long *flags) 1045 unsigned long *flags)
1046__releases(hlist_lock)
852{ 1047{
853 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
854 spinlock_t *hlist_lock; 1049 spinlock_t *hlist_lock;
@@ -857,7 +1052,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
857 spin_unlock_irqrestore(hlist_lock, *flags); 1052 spin_unlock_irqrestore(hlist_lock, *flags);
858} 1053}
859 1054
860void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 1055static void __kprobes kretprobe_table_unlock(unsigned long hash,
1056 unsigned long *flags)
1057__releases(hlist_lock)
861{ 1058{
862 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1059 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
863 spin_unlock_irqrestore(hlist_lock, *flags); 1060 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -935,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
935 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1132 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
936 1133
937 if (p->break_handler || p->post_handler) 1134 if (p->break_handler || p->post_handler)
938 unoptimize_kprobe(ap); /* Fall back to normal kprobe */ 1135 unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
939 1136
940 if (p->break_handler) { 1137 if (p->break_handler) {
941 if (ap->break_handler) 1138 if (ap->break_handler)
@@ -986,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
986 * This is the second or subsequent kprobe at the address - handle 1183 * This is the second or subsequent kprobe at the address - handle
987 * the intricacies 1184 * the intricacies
988 */ 1185 */
989static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 1186static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
990 struct kprobe *p) 1187 struct kprobe *p)
991{ 1188{
992 int ret = 0; 1189 int ret = 0;
993 struct kprobe *ap = old_p; 1190 struct kprobe *ap = orig_p;
994 1191
995 if (!kprobe_aggrprobe(old_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
996 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
997 ap = alloc_aggr_kprobe(old_p); 1194 ap = alloc_aggr_kprobe(orig_p);
998 if (!ap) 1195 if (!ap)
999 return -ENOMEM; 1196 return -ENOMEM;
1000 init_aggr_kprobe(ap, old_p); 1197 init_aggr_kprobe(ap, orig_p);
1001 } 1198 } else if (kprobe_unused(ap))
1199 /* This probe is going to die. Rescue it */
1200 reuse_unused_kprobe(ap);
1002 1201
1003 if (kprobe_gone(ap)) { 1202 if (kprobe_gone(ap)) {
1004 /* 1203 /*
@@ -1032,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
1032 return add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1033} 1232}
1034 1233
1035/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
1036static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
1037{
1038 struct kprobe *kp;
1039
1040 list_for_each_entry_rcu(kp, &p->list, list) {
1041 if (!kprobe_disabled(kp))
1042 /*
1043 * There is an active probe on the list.
1044 * We can't disable aggr_kprobe.
1045 */
1046 return 0;
1047 }
1048 p->flags |= KPROBE_FLAG_DISABLED;
1049 return 1;
1050}
1051
1052static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
1053{ 1235{
1054 struct kprobe_blackpoint *kb; 1236 struct kprobe_blackpoint *kb;
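
An aggrprobe, which register_aggr_kprobe() and reuse_unused_kprobe() juggle above, is simply what a probe point becomes once more than one kprobe is registered at the same address: the aggrprobe owns the breakpoint/jump and fans out to the child handlers. A small illustration of how one comes into being (the symbol and names are hypothetical):

#include <linux/kprobes.h>

static int first_pre(struct kprobe *p, struct pt_regs *regs)  { return 0; }
static int second_pre(struct kprobe *p, struct pt_regs *regs) { return 0; }

static struct kprobe first_kp = {
	.symbol_name	= "vfs_read",
	.pre_handler	= first_pre,
};
static struct kprobe second_kp = {
	.symbol_name	= "vfs_read",
	.pre_handler	= second_pre,
};

/*
 * register_kprobe(&first_kp) installs an ordinary kprobe;
 * register_kprobe(&second_kp) reaches register_aggr_kprobe(), which wraps
 * the first probe in an aggrprobe and links both onto its ->list.
 * If that aggrprobe was sitting on unoptimizing_list waiting to die,
 * reuse_unused_kprobe() above rescues and re-optimizes it instead.
 */
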
@@ -1091,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1091/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1092static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1274static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
1093{ 1275{
1094 struct kprobe *old_p, *list_p; 1276 struct kprobe *ap, *list_p;
1095 1277
1096 old_p = get_kprobe(p->addr); 1278 ap = get_kprobe(p->addr);
1097 if (unlikely(!old_p)) 1279 if (unlikely(!ap))
1098 return NULL; 1280 return NULL;
1099 1281
1100 if (p != old_p) { 1282 if (p != ap) {
1101 list_for_each_entry_rcu(list_p, &old_p->list, list) 1283 list_for_each_entry_rcu(list_p, &ap->list, list)
1102 if (list_p == p) 1284 if (list_p == p)
1103 /* kprobe p is a valid probe */ 1285 /* kprobe p is a valid probe */
1104 goto valid; 1286 goto valid;
1105 return NULL; 1287 return NULL;
1106 } 1288 }
1107valid: 1289valid:
1108 return old_p; 1290 return ap;
1109} 1291}
1110 1292
1111/* Return error if the kprobe is being re-registered */ 1293/* Return error if the kprobe is being re-registered */
1112static inline int check_kprobe_rereg(struct kprobe *p) 1294static inline int check_kprobe_rereg(struct kprobe *p)
1113{ 1295{
1114 int ret = 0; 1296 int ret = 0;
1115 struct kprobe *old_p;
1116 1297
1117 mutex_lock(&kprobe_mutex); 1298 mutex_lock(&kprobe_mutex);
1118 old_p = __get_valid_kprobe(p); 1299 if (__get_valid_kprobe(p))
1119 if (old_p)
1120 ret = -EINVAL; 1300 ret = -EINVAL;
1121 mutex_unlock(&kprobe_mutex); 1301 mutex_unlock(&kprobe_mutex);
1302
1122 return ret; 1303 return ret;
1123} 1304}
1124 1305
@@ -1138,13 +1319,13 @@ int __kprobes register_kprobe(struct kprobe *p)
1138 if (ret) 1319 if (ret)
1139 return ret; 1320 return ret;
1140 1321
1322 jump_label_lock();
1141 preempt_disable(); 1323 preempt_disable();
1142 if (!kernel_text_address((unsigned long) p->addr) || 1324 if (!kernel_text_address((unsigned long) p->addr) ||
1143 in_kprobes_functions((unsigned long) p->addr) || 1325 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) { 1326 ftrace_text_reserved(p->addr, p->addr) ||
1145 preempt_enable(); 1327 jump_label_text_reserved(p->addr, p->addr))
1146 return -EINVAL; 1328 goto fail_with_jump_label;
1147 }
1148 1329
1149 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ 1330 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1150 p->flags &= KPROBE_FLAG_DISABLED; 1331 p->flags &= KPROBE_FLAG_DISABLED;
@@ -1158,10 +1339,9 @@ int __kprobes register_kprobe(struct kprobe *p)
1158 * We must hold a refcount of the probed module while updating 1339 * We must hold a refcount of the probed module while updating
1159 * its code to prohibit unexpected unloading. 1340 * its code to prohibit unexpected unloading.
1160 */ 1341 */
1161 if (unlikely(!try_module_get(probed_mod))) { 1342 if (unlikely(!try_module_get(probed_mod)))
1162 preempt_enable(); 1343 goto fail_with_jump_label;
1163 return -EINVAL; 1344
1164 }
1165 /* 1345 /*
1166 * If the module freed .init.text, we couldn't insert 1346 * If the module freed .init.text, we couldn't insert
1167 * kprobes in there. 1347 * kprobes in there.
@@ -1169,16 +1349,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1169 if (within_module_init((unsigned long)p->addr, probed_mod) && 1349 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1170 probed_mod->state != MODULE_STATE_COMING) { 1350 probed_mod->state != MODULE_STATE_COMING) {
1171 module_put(probed_mod); 1351 module_put(probed_mod);
1172 preempt_enable(); 1352 goto fail_with_jump_label;
1173 return -EINVAL;
1174 } 1353 }
1175 } 1354 }
1176 preempt_enable(); 1355 preempt_enable();
1356 jump_label_unlock();
1177 1357
1178 p->nmissed = 0; 1358 p->nmissed = 0;
1179 INIT_LIST_HEAD(&p->list); 1359 INIT_LIST_HEAD(&p->list);
1180 mutex_lock(&kprobe_mutex); 1360 mutex_lock(&kprobe_mutex);
1181 1361
1362 jump_label_lock(); /* needed to call jump_label_text_reserved() */
1363
1182 get_online_cpus(); /* For avoiding text_mutex deadlock. */ 1364 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1183 mutex_lock(&text_mutex); 1365 mutex_lock(&text_mutex);
1184 1366
@@ -1206,76 +1388,136 @@ int __kprobes register_kprobe(struct kprobe *p)
1206out: 1388out:
1207 mutex_unlock(&text_mutex); 1389 mutex_unlock(&text_mutex);
1208 put_online_cpus(); 1390 put_online_cpus();
1391 jump_label_unlock();
1209 mutex_unlock(&kprobe_mutex); 1392 mutex_unlock(&kprobe_mutex);
1210 1393
1211 if (probed_mod) 1394 if (probed_mod)
1212 module_put(probed_mod); 1395 module_put(probed_mod);
1213 1396
1214 return ret; 1397 return ret;
1398
1399fail_with_jump_label:
1400 preempt_enable();
1401 jump_label_unlock();
1402 return -EINVAL;
1215} 1403}
1216EXPORT_SYMBOL_GPL(register_kprobe); 1404EXPORT_SYMBOL_GPL(register_kprobe);
1217 1405
1406/* Check if all probes on the aggrprobe are disabled */
1407static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1408{
1409 struct kprobe *kp;
1410
1411 list_for_each_entry_rcu(kp, &ap->list, list)
1412 if (!kprobe_disabled(kp))
1413 /*
1414 * There is an active probe on the list.
1415 * We can't disable this ap.
1416 */
1417 return 0;
1418
1419 return 1;
1420}
1421
1422/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
1423static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1424{
1425 struct kprobe *orig_p;
1426
1427 /* Get an original kprobe for return */
1428 orig_p = __get_valid_kprobe(p);
1429 if (unlikely(orig_p == NULL))
1430 return NULL;
1431
1432 if (!kprobe_disabled(p)) {
1433 /* Disable probe if it is a child probe */
1434 if (p != orig_p)
1435 p->flags |= KPROBE_FLAG_DISABLED;
1436
1437 /* Try to disarm and disable this/parent probe */
1438 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1439 disarm_kprobe(orig_p);
1440 orig_p->flags |= KPROBE_FLAG_DISABLED;
1441 }
1442 }
1443
1444 return orig_p;
1445}
1446
1218/* 1447/*
1219 * Unregister a kprobe without a scheduler synchronization. 1448 * Unregister a kprobe without a scheduler synchronization.
1220 */ 1449 */
1221static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1450static int __kprobes __unregister_kprobe_top(struct kprobe *p)
1222{ 1451{
1223 struct kprobe *old_p, *list_p; 1452 struct kprobe *ap, *list_p;
1224 1453
1225 old_p = __get_valid_kprobe(p); 1454 /* Disable kprobe. This will disarm it if needed. */
1226 if (old_p == NULL) 1455 ap = __disable_kprobe(p);
1456 if (ap == NULL)
1227 return -EINVAL; 1457 return -EINVAL;
1228 1458
1229 if (old_p == p || 1459 if (ap == p)
1230 (kprobe_aggrprobe(old_p) &&
1231 list_is_singular(&old_p->list))) {
1232 /* 1460 /*
1233 * Only probe on the hash list. Disarm only if kprobes are 1461 * This probe is an independent(and non-optimized) kprobe
1234 * enabled and not gone - otherwise, the breakpoint would 1462 * (not an aggrprobe). Remove from the hash list.
1235 * already have been removed. We save on flushing icache.
1236 */ 1463 */
1237 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1464 goto disarmed;
1238 disarm_kprobe(old_p); 1465
1239 hlist_del_rcu(&old_p->hlist); 1466 /* Following process expects this probe is an aggrprobe */
1240 } else { 1467 WARN_ON(!kprobe_aggrprobe(ap));
1468
1469 if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
1470 /*
1471 * !disarmed could be happen if the probe is under delayed
1472 * unoptimizing.
1473 */
1474 goto disarmed;
1475 else {
1476 /* If disabling probe has special handlers, update aggrprobe */
1241 if (p->break_handler && !kprobe_gone(p)) 1477 if (p->break_handler && !kprobe_gone(p))
1242 old_p->break_handler = NULL; 1478 ap->break_handler = NULL;
1243 if (p->post_handler && !kprobe_gone(p)) { 1479 if (p->post_handler && !kprobe_gone(p)) {
1244 list_for_each_entry_rcu(list_p, &old_p->list, list) { 1480 list_for_each_entry_rcu(list_p, &ap->list, list) {
1245 if ((list_p != p) && (list_p->post_handler)) 1481 if ((list_p != p) && (list_p->post_handler))
1246 goto noclean; 1482 goto noclean;
1247 } 1483 }
1248 old_p->post_handler = NULL; 1484 ap->post_handler = NULL;
1249 } 1485 }
1250noclean: 1486noclean:
1487 /*
1488 * Remove from the aggrprobe: this path will do nothing in
1489 * __unregister_kprobe_bottom().
1490 */
1251 list_del_rcu(&p->list); 1491 list_del_rcu(&p->list);
1252 if (!kprobe_disabled(old_p)) { 1492 if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
1253 try_to_disable_aggr_kprobe(old_p); 1493 /*
1254 if (!kprobes_all_disarmed) { 1494 * Try to optimize this probe again, because post
1255 if (kprobe_disabled(old_p)) 1495 * handler may have been changed.
1256 disarm_kprobe(old_p); 1496 */
1257 else 1497 optimize_kprobe(ap);
1258 /* Try to optimize this probe again */
1259 optimize_kprobe(old_p);
1260 }
1261 }
1262 } 1498 }
1263 return 0; 1499 return 0;
1500
1501disarmed:
1502 BUG_ON(!kprobe_disarmed(ap));
1503 hlist_del_rcu(&ap->hlist);
1504 return 0;
1264} 1505}
1265 1506
1266static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1507static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1267{ 1508{
1268 struct kprobe *old_p; 1509 struct kprobe *ap;
1269 1510
1270 if (list_empty(&p->list)) 1511 if (list_empty(&p->list))
1512 /* This is an independent kprobe */
1271 arch_remove_kprobe(p); 1513 arch_remove_kprobe(p);
1272 else if (list_is_singular(&p->list)) { 1514 else if (list_is_singular(&p->list)) {
1273 /* "p" is the last child of an aggr_kprobe */ 1515 /* This is the last child of an aggrprobe */
1274 old_p = list_entry(p->list.next, struct kprobe, list); 1516 ap = list_entry(p->list.next, struct kprobe, list);
1275 list_del(&p->list); 1517 list_del(&p->list);
1276 arch_remove_kprobe(old_p); 1518 free_aggr_kprobe(ap);
1277 free_aggr_kprobe(old_p);
1278 } 1519 }
1520 /* Otherwise, do nothing. */
1279} 1521}
1280 1522
1281int __kprobes register_kprobes(struct kprobe **kps, int num) 1523int __kprobes register_kprobes(struct kprobe **kps, int num)
@@ -1339,18 +1581,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1339 if (num <= 0) 1581 if (num <= 0)
1340 return -EINVAL; 1582 return -EINVAL;
1341 for (i = 0; i < num; i++) { 1583 for (i = 0; i < num; i++) {
1342 unsigned long addr; 1584 unsigned long addr, offset;
1343 jp = jps[i]; 1585 jp = jps[i];
1344 addr = arch_deref_entry_point(jp->entry); 1586 addr = arch_deref_entry_point(jp->entry);
1345 1587
1346 if (!kernel_text_address(addr)) 1588 /* Verify probepoint is a function entry point */
1347 ret = -EINVAL; 1589 if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
1348 else { 1590 offset == 0) {
1349 /* Todo: Verify probepoint is a function entry point */
1350 jp->kp.pre_handler = setjmp_pre_handler; 1591 jp->kp.pre_handler = setjmp_pre_handler;
1351 jp->kp.break_handler = longjmp_break_handler; 1592 jp->kp.break_handler = longjmp_break_handler;
1352 ret = register_kprobe(&jp->kp); 1593 ret = register_kprobe(&jp->kp);
1353 } 1594 } else
1595 ret = -EINVAL;
1596
1354 if (ret < 0) { 1597 if (ret < 0) {
1355 if (i > 0) 1598 if (i > 0)
1356 unregister_jprobes(jps, i); 1599 unregister_jprobes(jps, i);
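
register_jprobes() now resolves the handler's target through kallsyms_lookup_size_offset() and insists on offset 0, i.e. a jprobe may only be planted on a real function entry point; the old TODO is resolved. For reference, a jprobe pairs a probe point with a handler that mirrors the probed function's signature, roughly as below (do_unlinkat and the names here are illustrative):

#include <linux/kprobes.h>
#include <linux/fs.h>

/* Handler signature must match the probed function exactly. */
static long jexample_do_unlinkat(int dfd, const char __user *pathname)
{
	/* inspect the arguments here */
	jprobe_return();	/* mandatory: never return normally */
	return 0;		/* unreachable, keeps the compiler happy */
}

static struct jprobe jexample = {
	.entry		= jexample_do_unlinkat,
	.kp.symbol_name	= "do_unlinkat",
};

/* register_jprobe(&jexample) succeeds only if the symbol resolves to
 * offset 0 of a function; probing mid-function now returns -EINVAL. */
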
@@ -1592,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1592int __kprobes disable_kprobe(struct kprobe *kp) 1835int __kprobes disable_kprobe(struct kprobe *kp)
1593{ 1836{
1594 int ret = 0; 1837 int ret = 0;
1595 struct kprobe *p;
1596 1838
1597 mutex_lock(&kprobe_mutex); 1839 mutex_lock(&kprobe_mutex);
1598 1840
1599 /* Check whether specified probe is valid. */ 1841 /* Disable this kprobe */
1600 p = __get_valid_kprobe(kp); 1842 if (__disable_kprobe(kp) == NULL)
1601 if (unlikely(p == NULL)) {
1602 ret = -EINVAL; 1843 ret = -EINVAL;
1603 goto out;
1604 }
1605 1844
1606 /* If the probe is already disabled (or gone), just return */
1607 if (kprobe_disabled(kp))
1608 goto out;
1609
1610 kp->flags |= KPROBE_FLAG_DISABLED;
1611 if (p != kp)
1612 /* When kp != p, p is always enabled. */
1613 try_to_disable_aggr_kprobe(p);
1614
1615 if (!kprobes_all_disarmed && kprobe_disabled(p))
1616 disarm_kprobe(p);
1617out:
1618 mutex_unlock(&kprobe_mutex); 1845 mutex_unlock(&kprobe_mutex);
1619 return ret; 1846 return ret;
1620} 1847}
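
disable_kprobe() is now a thin wrapper around the shared __disable_kprobe() helper introduced above, so direct callers and the unregister path disarm probes the same way. Usage is unchanged; a short reminder of the pattern (the names come from the earlier sketches and are illustrative):

#include <linux/kprobes.h>

static void example_pause_probe(struct kprobe *kp, bool pause)
{
	if (pause)
		disable_kprobe(kp);	/* handler stops firing; probe stays registered */
	else
		enable_kprobe(kp);	/* re-arms it (and re-queues optimization) */
}
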
@@ -1912,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
1912 mutex_lock(&kprobe_mutex); 2139 mutex_lock(&kprobe_mutex);
1913 2140
1914 /* If kprobes are already disarmed, just return */ 2141 /* If kprobes are already disarmed, just return */
1915 if (kprobes_all_disarmed) 2142 if (kprobes_all_disarmed) {
1916 goto already_disabled; 2143 mutex_unlock(&kprobe_mutex);
2144 return;
2145 }
1917 2146
1918 kprobes_all_disarmed = true; 2147 kprobes_all_disarmed = true;
1919 printk(KERN_INFO "Kprobes globally disabled\n"); 2148 printk(KERN_INFO "Kprobes globally disabled\n");
1920 2149
1921 /*
1922 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1923 * because disarming may also unoptimize kprobes.
1924 */
1925 get_online_cpus();
1926 mutex_lock(&text_mutex); 2150 mutex_lock(&text_mutex);
1927 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2151 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1928 head = &kprobe_table[i]; 2152 head = &kprobe_table[i];
1929 hlist_for_each_entry_rcu(p, node, head, hlist) { 2153 hlist_for_each_entry_rcu(p, node, head, hlist) {
1930 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2154 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1931 __disarm_kprobe(p); 2155 __disarm_kprobe(p, false);
1932 } 2156 }
1933 } 2157 }
1934
1935 mutex_unlock(&text_mutex); 2158 mutex_unlock(&text_mutex);
1936 put_online_cpus();
1937 mutex_unlock(&kprobe_mutex); 2159 mutex_unlock(&kprobe_mutex);
1938 /* Allow all currently running kprobes to complete */
1939 synchronize_sched();
1940 return;
1941 2160
1942already_disabled: 2161 /* Wait for disarming all kprobes by optimizer */
1943 mutex_unlock(&kprobe_mutex); 2162 wait_for_kprobe_optimizer();
1944 return;
1945} 2163}
1946 2164
1947/* 2165/*
@@ -1992,6 +2210,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1992static const struct file_operations fops_kp = { 2210static const struct file_operations fops_kp = {
1993 .read = read_enabled_file_bool, 2211 .read = read_enabled_file_bool,
1994 .write = write_enabled_file_bool, 2212 .write = write_enabled_file_bool,
2213 .llseek = default_llseek,
1995}; 2214};
1996 2215
1997static int __kprobes debugfs_kprobe_init(void) 2216static int __kprobes debugfs_kprobe_init(void)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0b624e791805..3b053c04dd86 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -16,6 +16,7 @@
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/capability.h>
19 20
20#define KERNEL_ATTR_RO(_name) \ 21#define KERNEL_ATTR_RO(_name) \
21static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 22static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo);
131 132
132#endif /* CONFIG_KEXEC */ 133#endif /* CONFIG_KEXEC */
133 134
135/* whether file capabilities are enabled */
136static ssize_t fscaps_show(struct kobject *kobj,
137 struct kobj_attribute *attr, char *buf)
138{
139 return sprintf(buf, "%d\n", file_caps_enabled);
140}
141KERNEL_ATTR_RO(fscaps);
142
134/* 143/*
135 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 144 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
136 */ 145 */
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj;
158EXPORT_SYMBOL_GPL(kernel_kobj); 167EXPORT_SYMBOL_GPL(kernel_kobj);
159 168
160static struct attribute * kernel_attrs[] = { 169static struct attribute * kernel_attrs[] = {
170 &fscaps_attr.attr,
161#if defined(CONFIG_HOTPLUG) 171#if defined(CONFIG_HOTPLUG)
162 &uevent_seqnum_attr.attr, 172 &uevent_seqnum_attr.attr,
163 &uevent_helper_attr.attr, 173 &uevent_helper_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786349d1..4ba7cccb4994 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
27 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
28 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
29 void *data; 29 void *data;
30 int node;
30 31
31 /* Result passed back to kthread_create() from kthreadd. */ 32 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 33 struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
98 do_exit(ret); 99 do_exit(ret);
99} 100}
100 101
102/* called from do_fork() to get node information for about to be created task */
103int tsk_fork_get_node(struct task_struct *tsk)
104{
105#ifdef CONFIG_NUMA
106 if (tsk == kthreadd_task)
107 return tsk->pref_node_fork;
108#endif
109 return numa_node_id();
110}
111
101static void create_kthread(struct kthread_create_info *create) 112static void create_kthread(struct kthread_create_info *create)
102{ 113{
103 int pid; 114 int pid;
104 115
116#ifdef CONFIG_NUMA
117 current->pref_node_fork = create->node;
118#endif
105 /* We want our own signal handler (we take no signals by default). */ 119 /* We want our own signal handler (we take no signals by default). */
106 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 120 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
107 if (pid < 0) { 121 if (pid < 0) {
@@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create)
111} 125}
112 126
113/** 127/**
114 * kthread_create - create a kthread. 128 * kthread_create_on_node - create a kthread.
115 * @threadfn: the function to run until signal_pending(current). 129 * @threadfn: the function to run until signal_pending(current).
116 * @data: data ptr for @threadfn. 130 * @data: data ptr for @threadfn.
131 * @node: memory node number.
117 * @namefmt: printf-style name for the thread. 132 * @namefmt: printf-style name for the thread.
118 * 133 *
119 * Description: This helper function creates and names a kernel 134 * Description: This helper function creates and names a kernel
120 * thread. The thread will be stopped: use wake_up_process() to start 135 * thread. The thread will be stopped: use wake_up_process() to start
121 * it. See also kthread_run(). 136 * it. See also kthread_run().
122 * 137 *
138 * If thread is going to be bound on a particular cpu, give its node
139 * in @node, to get NUMA affinity for kthread stack, or else give -1.
123 * When woken, the thread will run @threadfn() with @data as its 140 * When woken, the thread will run @threadfn() with @data as its
124 * argument. @threadfn() can either call do_exit() directly if it is a 141 * argument. @threadfn() can either call do_exit() directly if it is a
125 * standalone thread for which noone will call kthread_stop(), or 142 * standalone thread for which no one will call kthread_stop(), or
126 * return when 'kthread_should_stop()' is true (which means 143 * return when 'kthread_should_stop()' is true (which means
127 * kthread_stop() has been called). The return value should be zero 144 * kthread_stop() has been called). The return value should be zero
128 * or a negative error number; it will be passed to kthread_stop(). 145 * or a negative error number; it will be passed to kthread_stop().
129 * 146 *
130 * Returns a task_struct or ERR_PTR(-ENOMEM). 147 * Returns a task_struct or ERR_PTR(-ENOMEM).
131 */ 148 */
132struct task_struct *kthread_create(int (*threadfn)(void *data), 149struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
133 void *data, 150 void *data,
134 const char namefmt[], 151 int node,
135 ...) 152 const char namefmt[],
153 ...)
136{ 154{
137 struct kthread_create_info create; 155 struct kthread_create_info create;
138 156
139 create.threadfn = threadfn; 157 create.threadfn = threadfn;
140 create.data = data; 158 create.data = data;
159 create.node = node;
141 init_completion(&create.done); 160 init_completion(&create.done);
142 161
143 spin_lock(&kthread_create_lock); 162 spin_lock(&kthread_create_lock);
@@ -148,7 +167,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 167 wait_for_completion(&create.done);
149 168
150 if (!IS_ERR(create.result)) { 169 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 170 static const struct sched_param param = { .sched_priority = 0 };
152 va_list args; 171 va_list args;
153 172
154 va_start(args, namefmt); 173 va_start(args, namefmt);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
164 } 183 }
165 return create.result; 184 return create.result;
166} 185}
167EXPORT_SYMBOL(kthread_create); 186EXPORT_SYMBOL(kthread_create_on_node);
168 187
169/** 188/**
170 * kthread_bind - bind a just-created kthread to a cpu. 189 * kthread_bind - bind a just-created kthread to a cpu.
@@ -183,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
183 return; 202 return;
184 } 203 }
185 204
186 p->cpus_allowed = cpumask_of_cpu(cpu); 205 /* It's safe because the task is inactive. */
187 p->rt.nr_cpus_allowed = 1; 206 do_set_cpus_allowed(p, cpumask_of(cpu));
188 p->flags |= PF_THREAD_BOUND; 207 p->flags |= PF_THREAD_BOUND;
189} 208}
190EXPORT_SYMBOL(kthread_bind); 209EXPORT_SYMBOL(kthread_bind);
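The renamed kthread_create_on_node() takes the NUMA node up front so the thread's stack can be allocated close to the CPU it will later be bound to, and kthread_bind() now goes through do_set_cpus_allowed(). A hedged usage sketch follows; the worker body and names are illustrative, the calls themselves are the ones shown in these hunks.

#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>

static int my_worker(void *data)		/* hypothetical thread body */
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *start_worker_on(unsigned int cpu)
{
	struct task_struct *tsk;

	/* pass the CPU's node so the kthread stack gets NUMA-local placement */
	tsk = kthread_create_on_node(my_worker, NULL, cpu_to_node(cpu),
				     "my_worker/%u", cpu);
	if (IS_ERR(tsk))
		return tsk;

	kthread_bind(tsk, cpu);		/* safe: the task has not run yet */
	wake_up_process(tsk);
	return tsk;
}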
@@ -265,6 +284,17 @@ int kthreadd(void *unused)
265 return 0; 284 return 0;
266} 285}
267 286
287void __init_kthread_worker(struct kthread_worker *worker,
288 const char *name,
289 struct lock_class_key *key)
290{
291 spin_lock_init(&worker->lock);
292 lockdep_set_class_and_name(&worker->lock, key, name);
293 INIT_LIST_HEAD(&worker->work_list);
294 worker->task = NULL;
295}
296EXPORT_SYMBOL_GPL(__init_kthread_worker);
297
268/** 298/**
269 * kthread_worker_fn - kthread function to process kthread_worker 299 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker 300 * @worker_ptr: pointer to initialized kthread_worker
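__init_kthread_worker() above backs the worker initializers in <linux/kthread.h>. The sketch below shows how a kthread_worker is typically driven, assuming the companion helpers init_kthread_worker(), init_kthread_work(), queue_kthread_work() and flush_kthread_worker() from that header, which are not part of this hunk; the names of the demo work item and thread are made up.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>

static struct kthread_worker my_worker;
static struct kthread_work my_work;
static struct task_struct *my_worker_task;

static void my_work_fn(struct kthread_work *work)	/* hypothetical job */
{
	pr_info("kthread_worker job ran\n");
}

static int __init worker_demo_init(void)
{
	init_kthread_worker(&my_worker);	/* ends up in __init_kthread_worker() */
	init_kthread_work(&my_work, my_work_fn);

	/* kthread_worker_fn() loops over my_worker.work_list until stopped */
	my_worker_task = kthread_run(kthread_worker_fn, &my_worker, "demo_worker");
	if (IS_ERR(my_worker_task))
		return PTR_ERR(my_worker_task);

	queue_kthread_work(&my_worker, &my_work);
	return 0;
}
module_init(worker_demo_init);

static void __exit worker_demo_exit(void)
{
	flush_kthread_worker(&my_worker);
	kthread_stop(my_worker_task);
}
module_exit(worker_demo_exit);
MODULE_LICENSE("GPL");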
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..376066e10413 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk,
153} 153}
154 154
155/** 155/**
156 * __account_scheduler_latency - record an occured latency 156 * __account_scheduler_latency - record an occurred latency
157 * @tsk - the task struct of the task hitting the latency 157 * @tsk - the task struct of the task hitting the latency
158 * @usecs - the duration of the latency in microseconds 158 * @usecs - the duration of the latency in microseconds
159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible 159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
197 /* 197 for (i = 0; i < tsk->latency_record_count; i++) {
198 * short term hack; if we're > 32 we stop; future we recycle:
199 */
200 tsk->latency_record_count++;
201 if (tsk->latency_record_count >= LT_SAVECOUNT)
202 goto out_unlock;
203
204 for (i = 0; i < LT_SAVECOUNT; i++) {
205 struct latency_record *mylat; 198 struct latency_record *mylat;
206 int same = 1; 199 int same = 1;
207 200
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
227 } 220 }
228 } 221 }
229 222
223 /*
224 * short term hack; if we're > 32 we stop; future we recycle:
225 */
226 if (tsk->latency_record_count >= LT_SAVECOUNT)
227 goto out_unlock;
228
230 /* Allocated a new one: */ 229 /* Allocated a new one: */
231 i = tsk->latency_record_count; 230 i = tsk->latency_record_count++;
232 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
233 232
234out_unlock: 233out_unlock:
@@ -242,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
242 seq_puts(m, "Latency Top version : v0.1\n"); 241 seq_puts(m, "Latency Top version : v0.1\n");
243 242
244 for (i = 0; i < MAXLR; i++) { 243 for (i = 0; i < MAXLR; i++) {
245 if (latency_record[i].backtrace[0]) { 244 struct latency_record *lr = &latency_record[i];
245
246 if (lr->backtrace[0]) {
246 int q; 247 int q;
247 seq_printf(m, "%i %lu %lu ", 248 seq_printf(m, "%i %lu %lu",
248 latency_record[i].count, 249 lr->count, lr->time, lr->max);
249 latency_record[i].time,
250 latency_record[i].max);
251 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
252 char sym[KSYM_SYMBOL_LEN]; 251 unsigned long bt = lr->backtrace[q];
253 char *c; 252 if (!bt)
254 if (!latency_record[i].backtrace[q])
255 break; 253 break;
256 if (latency_record[i].backtrace[q] == ULONG_MAX) 254 if (bt == ULONG_MAX)
257 break; 255 break;
258 sprint_symbol(sym, latency_record[i].backtrace[q]); 256 seq_printf(m, " %ps", (void *)bt);
259 c = strchr(sym, '+');
260 if (c)
261 *c = 0;
262 seq_printf(m, "%s ", sym);
263 } 257 }
264 seq_printf(m, "\n"); 258 seq_printf(m, "\n");
265 } 259 }
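lstats_show() now leans on the %ps printk extension instead of sprint_symbol() plus manual stripping of the "+offset" suffix. A minimal sketch of the difference, with a made-up seq_file helper and address, assuming the standard vsprintf %ps/%pS conversions:

#include <linux/kernel.h>
#include <linux/seq_file.h>

static void show_backtrace_entry(struct seq_file *m, unsigned long bt)
{
	/* %ps prints just the symbol name, which is what the old code
	 * emulated by cutting the sprint_symbol() string at '+';
	 * %pS would also append the +offset/size suffix. */
	seq_printf(m, " %ps", (void *)bt);		/* e.g. " vfs_read" */
	pr_debug("full form: %pS\n", (void *)bt);	/* e.g. "vfs_read+0x10/0x130" */
}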
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f2852a510232..298c9276dfdb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
490 usage[i] = '\0'; 490 usage[i] = '\0';
491} 491}
492 492
493static int __print_lock_name(struct lock_class *class)
494{
495 char str[KSYM_NAME_LEN];
496 const char *name;
497
498 name = class->name;
499 if (!name)
500 name = __get_key_name(class->key, str);
501
502 return printk("%s", name);
503}
504
493static void print_lock_name(struct lock_class *class) 505static void print_lock_name(struct lock_class *class)
494{ 506{
495 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; 507 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
@@ -639,6 +651,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
639 } 651 }
640#endif 652#endif
641 653
654 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
655 debug_locks_off();
656 printk(KERN_ERR
657 "BUG: looking up invalid subclass: %u\n", subclass);
658 printk(KERN_ERR
659 "turning off the locking correctness validator.\n");
660 dump_stack();
661 return NULL;
662 }
663
642 /* 664 /*
643 * Static locks do not have their class-keys yet - for them the key 665 * Static locks do not have their class-keys yet - for them the key
644 * is the lock object itself: 666 * is the lock object itself:
@@ -774,7 +796,9 @@ out_unlock_set:
774 raw_local_irq_restore(flags); 796 raw_local_irq_restore(flags);
775 797
776 if (!subclass || force) 798 if (!subclass || force)
777 lock->class_cache = class; 799 lock->class_cache[0] = class;
800 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
801 lock->class_cache[subclass] = class;
778 802
779 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 803 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
780 return NULL; 804 return NULL;
@@ -1041,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth)
1041 return 0; 1065 return 0;
1042} 1066}
1043 1067
1068static void
1069print_circular_lock_scenario(struct held_lock *src,
1070 struct held_lock *tgt,
1071 struct lock_list *prt)
1072{
1073 struct lock_class *source = hlock_class(src);
1074 struct lock_class *target = hlock_class(tgt);
1075 struct lock_class *parent = prt->class;
1076
1077 /*
1078 * A direct locking problem where unsafe_class lock is taken
1079 * directly by safe_class lock, then all we need to show
1080 * is the deadlock scenario, as it is obvious that the
1081 * unsafe lock is taken under the safe lock.
1082 *
1083 * But if there is a chain instead, where the safe lock takes
1084 * an intermediate lock (middle_class) where this lock is
1085 * not the same as the safe lock, then the lock chain is
1086 * used to describe the problem. Otherwise we would need
1087 * to show a different CPU case for each link in the chain
1088 * from the safe_class lock to the unsafe_class lock.
1089 */
1090 if (parent != source) {
1091 printk("Chain exists of:\n ");
1092 __print_lock_name(source);
1093 printk(" --> ");
1094 __print_lock_name(parent);
1095 printk(" --> ");
1096 __print_lock_name(target);
1097 printk("\n\n");
1098 }
1099
1100 printk(" Possible unsafe locking scenario:\n\n");
1101 printk(" CPU0 CPU1\n");
1102 printk(" ---- ----\n");
1103 printk(" lock(");
1104 __print_lock_name(target);
1105 printk(");\n");
1106 printk(" lock(");
1107 __print_lock_name(parent);
1108 printk(");\n");
1109 printk(" lock(");
1110 __print_lock_name(target);
1111 printk(");\n");
1112 printk(" lock(");
1113 __print_lock_name(source);
1114 printk(");\n");
1115 printk("\n *** DEADLOCK ***\n\n");
1116}
1117
1044/* 1118/*
1045 * When a circular dependency is detected, print the 1119 * When a circular dependency is detected, print the
1046 * header first: 1120 * header first:
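print_circular_lock_scenario() above condenses an AB-BA inversion into a two-CPU table. For reference, a hedged sketch of the kind of code that produces such a report; the two mutexes and both call paths are made up.

#include <linux/mutex.h>

static DEFINE_MUTEX(lock_a);	/* hypothetical locks */
static DEFINE_MUTEX(lock_b);

static void path_one(void)		/* e.g. runs on CPU0 */
{
	mutex_lock(&lock_a);
	mutex_lock(&lock_b);		/* records the dependency A --> B */
	mutex_unlock(&lock_b);
	mutex_unlock(&lock_a);
}

static void path_two(void)		/* e.g. runs on CPU1 */
{
	mutex_lock(&lock_b);
	mutex_lock(&lock_a);		/* B --> A closes the cycle: lockdep
					 * emits the circular-bug report with
					 * the scenario table shown above */
	mutex_unlock(&lock_a);
	mutex_unlock(&lock_b);
}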
@@ -1084,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1084{ 1158{
1085 struct task_struct *curr = current; 1159 struct task_struct *curr = current;
1086 struct lock_list *parent; 1160 struct lock_list *parent;
1161 struct lock_list *first_parent;
1087 int depth; 1162 int depth;
1088 1163
1089 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1164 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
@@ -1097,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1097 print_circular_bug_header(target, depth, check_src, check_tgt); 1172 print_circular_bug_header(target, depth, check_src, check_tgt);
1098 1173
1099 parent = get_lock_parent(target); 1174 parent = get_lock_parent(target);
1175 first_parent = parent;
1100 1176
1101 while (parent) { 1177 while (parent) {
1102 print_circular_bug_entry(parent, --depth); 1178 print_circular_bug_entry(parent, --depth);
@@ -1104,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this,
1104 } 1180 }
1105 1181
1106 printk("\nother info that might help us debug this:\n\n"); 1182 printk("\nother info that might help us debug this:\n\n");
1183 print_circular_lock_scenario(check_src, check_tgt,
1184 first_parent);
1185
1107 lockdep_print_held_locks(curr); 1186 lockdep_print_held_locks(curr);
1108 1187
1109 printk("\nstack backtrace:\n"); 1188 printk("\nstack backtrace:\n");
@@ -1302,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1302 printk("\n"); 1381 printk("\n");
1303 1382
1304 if (depth == 0 && (entry != root)) { 1383 if (depth == 0 && (entry != root)) {
1305 printk("lockdep:%s bad BFS generated tree\n", __func__); 1384 printk("lockdep:%s bad path found in chain graph\n", __func__);
1306 break; 1385 break;
1307 } 1386 }
1308 1387
@@ -1313,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1313 return; 1392 return;
1314} 1393}
1315 1394
1395static void
1396print_irq_lock_scenario(struct lock_list *safe_entry,
1397 struct lock_list *unsafe_entry,
1398 struct lock_class *prev_class,
1399 struct lock_class *next_class)
1400{
1401 struct lock_class *safe_class = safe_entry->class;
1402 struct lock_class *unsafe_class = unsafe_entry->class;
1403 struct lock_class *middle_class = prev_class;
1404
1405 if (middle_class == safe_class)
1406 middle_class = next_class;
1407
1408 /*
1409 * A direct locking problem where unsafe_class lock is taken
1410 * directly by safe_class lock, then all we need to show
1411 * is the deadlock scenario, as it is obvious that the
1412 * unsafe lock is taken under the safe lock.
1413 *
1414 * But if there is a chain instead, where the safe lock takes
1415 * an intermediate lock (middle_class) where this lock is
1416 * not the same as the safe lock, then the lock chain is
1417 * used to describe the problem. Otherwise we would need
1418 * to show a different CPU case for each link in the chain
1419 * from the safe_class lock to the unsafe_class lock.
1420 */
1421 if (middle_class != unsafe_class) {
1422 printk("Chain exists of:\n ");
1423 __print_lock_name(safe_class);
1424 printk(" --> ");
1425 __print_lock_name(middle_class);
1426 printk(" --> ");
1427 __print_lock_name(unsafe_class);
1428 printk("\n\n");
1429 }
1430
1431 printk(" Possible interrupt unsafe locking scenario:\n\n");
1432 printk(" CPU0 CPU1\n");
1433 printk(" ---- ----\n");
1434 printk(" lock(");
1435 __print_lock_name(unsafe_class);
1436 printk(");\n");
1437 printk(" local_irq_disable();\n");
1438 printk(" lock(");
1439 __print_lock_name(safe_class);
1440 printk(");\n");
1441 printk(" lock(");
1442 __print_lock_name(middle_class);
1443 printk(");\n");
1444 printk(" <Interrupt>\n");
1445 printk(" lock(");
1446 __print_lock_name(safe_class);
1447 printk(");\n");
1448 printk("\n *** DEADLOCK ***\n\n");
1449}
1450
1316static int 1451static int
1317print_bad_irq_dependency(struct task_struct *curr, 1452print_bad_irq_dependency(struct task_struct *curr,
1318 struct lock_list *prev_root, 1453 struct lock_list *prev_root,
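print_irq_lock_scenario() describes a lock that is taken both with interrupts enabled and from interrupt context. A hedged sketch of a driver-style pattern that would trigger it, with a hypothetical spinlock and handler; the irq-saving variant at the end is the conventional fix.

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(dev_lock);	/* hypothetical device lock */

static irqreturn_t dev_irq(int irq, void *dev_id)
{
	spin_lock(&dev_lock);		/* the lock is now hardirq-safe ... */
	/* ... touch device state ... */
	spin_unlock(&dev_lock);
	return IRQ_HANDLED;
}

static void dev_ioctl_path(void)
{
	unsigned long flags;

	/* ... but taking it with interrupts enabled also makes it
	 * hardirq-unsafe: the interrupt can arrive while the lock is
	 * held, deadlocking exactly as in the scenario table. */
	spin_lock(&dev_lock);
	/* ... */
	spin_unlock(&dev_lock);

	/* Correct form for a lock shared with an interrupt handler: */
	spin_lock_irqsave(&dev_lock, flags);
	/* ... */
	spin_unlock_irqrestore(&dev_lock, flags);
}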
@@ -1364,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1364 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1499 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1365 1500
1366 printk("\nother info that might help us debug this:\n\n"); 1501 printk("\nother info that might help us debug this:\n\n");
1502 print_irq_lock_scenario(backwards_entry, forwards_entry,
1503 hlock_class(prev), hlock_class(next));
1504
1367 lockdep_print_held_locks(curr); 1505 lockdep_print_held_locks(curr);
1368 1506
1369 printk("\nthe dependencies between %s-irq-safe lock", irqclass); 1507 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
@@ -1527,6 +1665,26 @@ static inline void inc_chains(void)
1527 1665
1528#endif 1666#endif
1529 1667
1668static void
1669print_deadlock_scenario(struct held_lock *nxt,
1670 struct held_lock *prv)
1671{
1672 struct lock_class *next = hlock_class(nxt);
1673 struct lock_class *prev = hlock_class(prv);
1674
1675 printk(" Possible unsafe locking scenario:\n\n");
1676 printk(" CPU0\n");
1677 printk(" ----\n");
1678 printk(" lock(");
1679 __print_lock_name(prev);
1680 printk(");\n");
1681 printk(" lock(");
1682 __print_lock_name(next);
1683 printk(");\n");
1684 printk("\n *** DEADLOCK ***\n\n");
1685 printk(" May be due to missing lock nesting notation\n\n");
1686}
1687
1530static int 1688static int
1531print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 1689print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1532 struct held_lock *next) 1690 struct held_lock *next)
@@ -1545,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1545 print_lock(prev); 1703 print_lock(prev);
1546 1704
1547 printk("\nother info that might help us debug this:\n"); 1705 printk("\nother info that might help us debug this:\n");
1706 print_deadlock_scenario(next, prev);
1548 lockdep_print_held_locks(curr); 1707 lockdep_print_held_locks(curr);
1549 1708
1550 printk("\nstack backtrace:\n"); 1709 printk("\nstack backtrace:\n");
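print_deadlock_scenario() covers the single-CPU case where two locks of the same class are taken recursively, and its last line points at the usual remedy: nesting annotations. A hedged sketch with a hypothetical structure, assuming mutex_lock_nested() and SINGLE_DEPTH_NESTING from <linux/mutex.h>.

#include <linux/mutex.h>

struct node {				/* hypothetical object */
	struct mutex lock;
	struct node *parent;
};

static void lock_child_then_parent_bad(struct node *n)
{
	mutex_lock(&n->lock);
	mutex_lock(&n->parent->lock);	/* same class twice: lockdep prints
					 * the deadlock scenario above */
	mutex_unlock(&n->parent->lock);
	mutex_unlock(&n->lock);
}

static void lock_child_then_parent_annotated(struct node *n)
{
	mutex_lock(&n->lock);
	/* Tell lockdep this is an intentional, ordered second level. */
	mutex_lock_nested(&n->parent->lock, SINGLE_DEPTH_NESTING);
	mutex_unlock(&n->parent->lock);
	mutex_unlock(&n->lock);
}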
@@ -1814,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1814 struct list_head *hash_head = chainhashentry(chain_key); 1973 struct list_head *hash_head = chainhashentry(chain_key);
1815 struct lock_chain *chain; 1974 struct lock_chain *chain;
1816 struct held_lock *hlock_curr, *hlock_next; 1975 struct held_lock *hlock_curr, *hlock_next;
1817 int i, j, n, cn; 1976 int i, j;
1818 1977
1819 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1820 return 0; 1979 return 0;
@@ -1874,15 +2033,9 @@ cache_hit:
1874 } 2033 }
1875 i++; 2034 i++;
1876 chain->depth = curr->lockdep_depth + 1 - i; 2035 chain->depth = curr->lockdep_depth + 1 - i;
1877 cn = nr_chain_hlocks; 2036 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1878 while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { 2037 chain->base = nr_chain_hlocks;
1879 n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); 2038 nr_chain_hlocks += chain->depth;
1880 if (n == cn)
1881 break;
1882 cn = n;
1883 }
1884 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1885 chain->base = cn;
1886 for (j = 0; j < chain->depth - 1; j++, i++) { 2039 for (j = 0; j < chain->depth - 1; j++, i++) {
1887 int lock_id = curr->held_locks[i].class_idx - 1; 2040 int lock_id = curr->held_locks[i].class_idx - 1;
1888 chain_hlocks[chain->base + j] = lock_id; 2041 chain_hlocks[chain->base + j] = lock_id;
@@ -1999,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr)
1999#endif 2152#endif
2000} 2153}
2001 2154
2155static void
2156print_usage_bug_scenario(struct held_lock *lock)
2157{
2158 struct lock_class *class = hlock_class(lock);
2159
2160 printk(" Possible unsafe locking scenario:\n\n");
2161 printk(" CPU0\n");
2162 printk(" ----\n");
2163 printk(" lock(");
2164 __print_lock_name(class);
2165 printk(");\n");
2166 printk(" <Interrupt>\n");
2167 printk(" lock(");
2168 __print_lock_name(class);
2169 printk(");\n");
2170 printk("\n *** DEADLOCK ***\n\n");
2171}
2172
2002static int 2173static int
2003print_usage_bug(struct task_struct *curr, struct held_lock *this, 2174print_usage_bug(struct task_struct *curr, struct held_lock *this,
2004 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 2175 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
@@ -2027,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2027 2198
2028 print_irqtrace_events(curr); 2199 print_irqtrace_events(curr);
2029 printk("\nother info that might help us debug this:\n"); 2200 printk("\nother info that might help us debug this:\n");
2201 print_usage_bug_scenario(this);
2202
2030 lockdep_print_held_locks(curr); 2203 lockdep_print_held_locks(curr);
2031 2204
2032 printk("\nstack backtrace:\n"); 2205 printk("\nstack backtrace:\n");
@@ -2061,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2061 struct held_lock *this, int forwards, 2234 struct held_lock *this, int forwards,
2062 const char *irqclass) 2235 const char *irqclass)
2063{ 2236{
2237 struct lock_list *entry = other;
2238 struct lock_list *middle = NULL;
2239 int depth;
2240
2064 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2065 return 0; 2242 return 0;
2066 2243
@@ -2079,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr,
2079 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2256 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
2080 2257
2081 printk("\nother info that might help us debug this:\n"); 2258 printk("\nother info that might help us debug this:\n");
2259
2260 /* Find a middle lock (if one exists) */
2261 depth = get_lock_depth(other);
2262 do {
2263 if (depth == 0 && (entry != root)) {
2264 printk("lockdep:%s bad path found in chain graph\n", __func__);
2265 break;
2266 }
2267 middle = entry;
2268 entry = get_lock_parent(entry);
2269 depth--;
2270 } while (entry && entry != root && (depth >= 0));
2271 if (forwards)
2272 print_irq_lock_scenario(root, other,
2273 middle ? middle->class : root->class, other->class);
2274 else
2275 print_irq_lock_scenario(other, root,
2276 middle ? middle->class : other->class, root->class);
2277
2082 lockdep_print_held_locks(curr); 2278 lockdep_print_held_locks(curr);
2083 2279
2084 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 2280 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
@@ -2280,22 +2476,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2280} 2476}
2281 2477
2282/* 2478/*
2283 * Debugging helper: via this flag we know that we are in
2284 * 'early bootup code', and will warn about any invalid irqs-on event:
2285 */
2286static int early_boot_irqs_enabled;
2287
2288void early_boot_irqs_off(void)
2289{
2290 early_boot_irqs_enabled = 0;
2291}
2292
2293void early_boot_irqs_on(void)
2294{
2295 early_boot_irqs_enabled = 1;
2296}
2297
2298/*
2299 * Hardirqs will be enabled: 2479 * Hardirqs will be enabled:
2300 */ 2480 */
2301void trace_hardirqs_on_caller(unsigned long ip) 2481void trace_hardirqs_on_caller(unsigned long ip)
@@ -2307,13 +2487,13 @@ void trace_hardirqs_on_caller(unsigned long ip)
2307 if (unlikely(!debug_locks || current->lockdep_recursion)) 2487 if (unlikely(!debug_locks || current->lockdep_recursion))
2308 return; 2488 return;
2309 2489
2310 if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) 2490 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2311 return; 2491 return;
2312 2492
2313 if (unlikely(curr->hardirqs_enabled)) { 2493 if (unlikely(curr->hardirqs_enabled)) {
2314 /* 2494 /*
2315 * Neither irq nor preemption are disabled here 2495 * Neither irq nor preemption are disabled here
2316 * so this is racy by nature but loosing one hit 2496 * so this is racy by nature but losing one hit
2317 * in a stat is not a big deal. 2497 * in a stat is not a big deal.
2318 */ 2498 */
2319 __debug_atomic_inc(redundant_hardirqs_on); 2499 __debug_atomic_inc(redundant_hardirqs_on);
@@ -2624,7 +2804,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2624 if (!graph_lock()) 2804 if (!graph_lock())
2625 return 0; 2805 return 0;
2626 /* 2806 /*
2627 * Make sure we didnt race: 2807 * Make sure we didn't race:
2628 */ 2808 */
2629 if (unlikely(hlock_class(this)->usage_mask & new_mask)) { 2809 if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
2630 graph_unlock(); 2810 graph_unlock();
@@ -2679,7 +2859,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2679void lockdep_init_map(struct lockdep_map *lock, const char *name, 2859void lockdep_init_map(struct lockdep_map *lock, const char *name,
2680 struct lock_class_key *key, int subclass) 2860 struct lock_class_key *key, int subclass)
2681{ 2861{
2682 lock->class_cache = NULL; 2862 int i;
2863
2864 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2865 lock->class_cache[i] = NULL;
2866
2683#ifdef CONFIG_LOCK_STAT 2867#ifdef CONFIG_LOCK_STAT
2684 lock->cpu = raw_smp_processor_id(); 2868 lock->cpu = raw_smp_processor_id();
2685#endif 2869#endif
@@ -2739,21 +2923,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2739 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2923 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2740 return 0; 2924 return 0;
2741 2925
2742 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
2743 debug_locks_off();
2744 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2745 printk("turning off the locking correctness validator.\n");
2746 dump_stack();
2747 return 0;
2748 }
2749
2750 if (lock->key == &__lockdep_no_validate__) 2926 if (lock->key == &__lockdep_no_validate__)
2751 check = 1; 2927 check = 1;
2752 2928
2753 if (!subclass) 2929 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
2754 class = lock->class_cache; 2930 class = lock->class_cache[subclass];
2755 /* 2931 /*
2756 * Not cached yet or subclass? 2932 * Not cached?
2757 */ 2933 */
2758 if (unlikely(!class)) { 2934 if (unlikely(!class)) {
2759 class = register_lock_class(lock, subclass, 0); 2935 class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +3094,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2918 return 1; 3094 return 1;
2919 3095
2920 if (hlock->references) { 3096 if (hlock->references) {
2921 struct lock_class *class = lock->class_cache; 3097 struct lock_class *class = lock->class_cache[0];
2922 3098
2923 if (!class) 3099 if (!class)
2924 class = look_up_lock_class(lock, 0); 3100 class = look_up_lock_class(lock, 0);
@@ -3250,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock)
3250 int ret = 0; 3426 int ret = 0;
3251 3427
3252 if (unlikely(current->lockdep_recursion)) 3428 if (unlikely(current->lockdep_recursion))
3253 return ret; 3429 return 1; /* avoid false negative lockdep_assert_held() */
3254 3430
3255 raw_local_irq_save(flags); 3431 raw_local_irq_save(flags);
3256 check_flags(flags); 3432 check_flags(flags);
@@ -3559,7 +3735,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3559 if (list_empty(head)) 3735 if (list_empty(head))
3560 continue; 3736 continue;
3561 list_for_each_entry_safe(class, next, head, hash_entry) { 3737 list_for_each_entry_safe(class, next, head, hash_entry) {
3562 if (unlikely(class == lock->class_cache)) { 3738 int match = 0;
3739
3740 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
3741 match |= class == lock->class_cache[j];
3742
3743 if (unlikely(match)) {
3563 if (debug_locks_off_graph_unlock()) 3744 if (debug_locks_off_graph_unlock())
3564 WARN_ON(1); 3745 WARN_ON(1);
3565 goto out_restore; 3746 goto out_restore;
@@ -3775,7 +3956,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
3775 * Careful: only use this function if you are sure that 3956 * Careful: only use this function if you are sure that
3776 * the task cannot run in parallel! 3957 * the task cannot run in parallel!
3777 */ 3958 */
3778void __debug_show_held_locks(struct task_struct *task) 3959void debug_show_held_locks(struct task_struct *task)
3779{ 3960{
3780 if (unlikely(!debug_locks)) { 3961 if (unlikely(!debug_locks)) {
3781 printk("INFO: lockdep is turned off.\n"); 3962 printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3964,6 @@ void __debug_show_held_locks(struct task_struct *task)
3783 } 3964 }
3784 lockdep_print_held_locks(task); 3965 lockdep_print_held_locks(task);
3785} 3966}
3786EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3787
3788void debug_show_held_locks(struct task_struct *task)
3789{
3790 __debug_show_held_locks(task);
3791}
3792EXPORT_SYMBOL_GPL(debug_show_held_locks); 3967EXPORT_SYMBOL_GPL(debug_show_held_locks);
3793 3968
3794void lockdep_sys_exit(void) 3969void lockdep_sys_exit(void)
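The lock_is_held() change above returns true while lockdep is recursing, precisely so that lockdep_assert_held() cannot fire spuriously. A hedged sketch of the assertion pattern that relies on it; the lock, state variable and functions are illustrative.

#include <linux/lockdep.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(state_lock);	/* hypothetical lock */
static int shared_state;

/* Callers must hold state_lock; document and enforce that with lockdep. */
static void update_state_locked(int v)
{
	lockdep_assert_held(&state_lock);	/* WARNs if the lock is not held */
	shared_state = v;
}

static void update_state(int v)
{
	mutex_lock(&state_lock);
	update_state_locked(v);
	mutex_unlock(&state_lock);
}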
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 59b76c8ce9d7..71edd2f60c02 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, 225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, 226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, 227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
228 sum_forward_deps = 0, factor = 0; 228 sum_forward_deps = 0;
229 229
230 list_for_each_entry(class, &all_lock_classes, lock_entry) { 230 list_for_each_entry(class, &all_lock_classes, lock_entry) {
231 231
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
283 nr_hardirq_unsafe * nr_hardirq_safe + 283 nr_hardirq_unsafe * nr_hardirq_safe +
284 nr_list_entries); 284 nr_list_entries);
285 285
286 /*
287 * Estimated factor between direct and indirect
288 * dependencies:
289 */
290 if (nr_list_entries)
291 factor = sum_forward_deps / nr_list_entries;
292
293#ifdef CONFIG_PROVE_LOCKING 286#ifdef CONFIG_PROVE_LOCKING
294 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 287 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
295 nr_lock_chains, MAX_LOCKDEP_CHAINS); 288 nr_lock_chains, MAX_LOCKDEP_CHAINS);
@@ -494,7 +487,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
494 namelen += 2; 487 namelen += 2;
495 488
496 for (i = 0; i < LOCKSTAT_POINTS; i++) { 489 for (i = 0; i < LOCKSTAT_POINTS; i++) {
497 char sym[KSYM_SYMBOL_LEN];
498 char ip[32]; 490 char ip[32];
499 491
500 if (class->contention_point[i] == 0) 492 if (class->contention_point[i] == 0)
@@ -503,15 +495,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
503 if (!i) 495 if (!i)
504 seq_line(m, '-', 40-namelen, namelen); 496 seq_line(m, '-', 40-namelen, namelen);
505 497
506 sprint_symbol(sym, class->contention_point[i]);
507 snprintf(ip, sizeof(ip), "[<%p>]", 498 snprintf(ip, sizeof(ip), "[<%p>]",
508 (void *)class->contention_point[i]); 499 (void *)class->contention_point[i]);
509 seq_printf(m, "%40s %14lu %29s %s\n", name, 500 seq_printf(m, "%40s %14lu %29s %pS\n",
510 stats->contention_point[i], 501 name, stats->contention_point[i],
511 ip, sym); 502 ip, (void *)class->contention_point[i]);
512 } 503 }
513 for (i = 0; i < LOCKSTAT_POINTS; i++) { 504 for (i = 0; i < LOCKSTAT_POINTS; i++) {
514 char sym[KSYM_SYMBOL_LEN];
515 char ip[32]; 505 char ip[32];
516 506
517 if (class->contending_point[i] == 0) 507 if (class->contending_point[i] == 0)
@@ -520,12 +510,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
520 if (!i) 510 if (!i)
521 seq_line(m, '-', 40-namelen, namelen); 511 seq_line(m, '-', 40-namelen, namelen);
522 512
523 sprint_symbol(sym, class->contending_point[i]);
524 snprintf(ip, sizeof(ip), "[<%p>]", 513 snprintf(ip, sizeof(ip), "[<%p>]",
525 (void *)class->contending_point[i]); 514 (void *)class->contending_point[i]);
526 seq_printf(m, "%40s %14lu %29s %s\n", name, 515 seq_printf(m, "%40s %14lu %29s %pS\n",
527 stats->contending_point[i], 516 name, stats->contending_point[i],
528 ip, sym); 517 ip, (void *)class->contending_point[i]);
529 } 518 }
530 if (i) { 519 if (i) {
531 seq_puts(m, "\n"); 520 seq_puts(m, "\n");
diff --git a/kernel/module.c b/kernel/module.c
index ccd641991842..795bdc7f5c3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,9 @@
55#include <linux/async.h> 55#include <linux/async.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h>
59#include <linux/pfn.h>
60#include <linux/bsearch.h>
58 61
59#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 63#include <trace/events/module.h>
@@ -69,6 +72,26 @@
69#define ARCH_SHF_SMALL 0 72#define ARCH_SHF_SMALL 0
70#endif 73#endif
71 74
75/*
76 * Modules' sections will be aligned on page boundaries
77 * to ensure complete separation of code and data, but
78 * only when CONFIG_DEBUG_SET_MODULE_RONX=y
79 */
80#ifdef CONFIG_DEBUG_SET_MODULE_RONX
81# define debug_align(X) ALIGN(X, PAGE_SIZE)
82#else
83# define debug_align(X) (X)
84#endif
85
86/*
87 * Given BASE and SIZE this macro calculates the number of pages the
88 * memory regions occupies
89 */
90#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
91 (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
92 PFN_DOWN((unsigned long)BASE) + 1) \
93 : (0UL))
94
72/* If this is set, the section belongs in the init part of the module */ 95/* If this is set, the section belongs in the init part of the module */
73#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 96#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
74 97
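A small worked example of the two helpers introduced above, under the assumption of 4 KiB pages (PAGE_SHIFT == 12); the addresses and sizes are only illustrative.

/* debug_align(0x1234) == ALIGN(0x1234, 4096) == 0x2000
 *   (only when CONFIG_DEBUG_SET_MODULE_RONX=y; otherwise it stays 0x1234)
 *
 * MOD_NUMBER_OF_PAGES(0x1f00, 0x300)
 *   == PFN_DOWN(0x1f00 + 0x300 - 1) - PFN_DOWN(0x1f00) + 1
 *   == PFN_DOWN(0x21ff) - PFN_DOWN(0x1f00) + 1
 *   ==        2        -        1         + 1   == 2 pages,
 * i.e. a 0x300-byte region that straddles a page boundary occupies two pages.
 */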
@@ -218,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr,
218 struct module *owner, 241 struct module *owner,
219 bool (*fn)(const struct symsearch *syms, 242 bool (*fn)(const struct symsearch *syms,
220 struct module *owner, 243 struct module *owner,
221 unsigned int symnum, void *data), 244 void *data),
222 void *data) 245 void *data)
223{ 246{
224 unsigned int i, j; 247 unsigned int j;
225 248
226 for (j = 0; j < arrsize; j++) { 249 for (j = 0; j < arrsize; j++) {
227 for (i = 0; i < arr[j].stop - arr[j].start; i++) 250 if (fn(&arr[j], owner, data))
228 if (fn(&arr[j], owner, i, data)) 251 return true;
229 return true;
230 } 252 }
231 253
232 return false; 254 return false;
233} 255}
234 256
235/* Returns true as soon as fn returns true, otherwise false. */ 257/* Returns true as soon as fn returns true, otherwise false. */
236bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, 258bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
237 unsigned int symnum, void *data), void *data) 259 struct module *owner,
260 void *data),
261 void *data)
238{ 262{
239 struct module *mod; 263 struct module *mod;
240 static const struct symsearch arr[] = { 264 static const struct symsearch arr[] = {
@@ -287,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
287 } 311 }
288 return false; 312 return false;
289} 313}
290EXPORT_SYMBOL_GPL(each_symbol); 314EXPORT_SYMBOL_GPL(each_symbol_section);
291 315
292struct find_symbol_arg { 316struct find_symbol_arg {
293 /* Input */ 317 /* Input */
@@ -301,15 +325,12 @@ struct find_symbol_arg {
301 const struct kernel_symbol *sym; 325 const struct kernel_symbol *sym;
302}; 326};
303 327
304static bool find_symbol_in_section(const struct symsearch *syms, 328static bool check_symbol(const struct symsearch *syms,
305 struct module *owner, 329 struct module *owner,
306 unsigned int symnum, void *data) 330 unsigned int symnum, void *data)
307{ 331{
308 struct find_symbol_arg *fsa = data; 332 struct find_symbol_arg *fsa = data;
309 333
310 if (strcmp(syms->start[symnum].name, fsa->name) != 0)
311 return false;
312
313 if (!fsa->gplok) { 334 if (!fsa->gplok) {
314 if (syms->licence == GPL_ONLY) 335 if (syms->licence == GPL_ONLY)
315 return false; 336 return false;
@@ -343,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms,
343 return true; 364 return true;
344} 365}
345 366
367static int cmp_name(const void *va, const void *vb)
368{
369 const char *a;
370 const struct kernel_symbol *b;
371 a = va; b = vb;
372 return strcmp(a, b->name);
373}
374
375static bool find_symbol_in_section(const struct symsearch *syms,
376 struct module *owner,
377 void *data)
378{
379 struct find_symbol_arg *fsa = data;
380 struct kernel_symbol *sym;
381
382 sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
383 sizeof(struct kernel_symbol), cmp_name);
384
385 if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
386 return true;
387
388 return false;
389}
390
346/* Find a symbol and return it, along with, (optional) crc and 391/* Find a symbol and return it, along with, (optional) crc and
347 * (optional) module which owns it. Needs preempt disabled or module_mutex. */ 392 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
348const struct kernel_symbol *find_symbol(const char *name, 393const struct kernel_symbol *find_symbol(const char *name,
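find_symbol_in_section() now binary-searches each export table with bsearch() and cmp_name(), which presumes the per-section symbol arrays are sorted by name at build time. A hedged, self-contained illustration of the same key-vs-element comparator pattern, using the userspace bsearch() and a simplified stand-in for struct kernel_symbol; the table contents are made up.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct kernel_symbol {			/* simplified stand-in */
	unsigned long value;
	const char *name;
};

/* Key is a bare string, elements are struct kernel_symbol, as in cmp_name(). */
static int cmp_name(const void *va, const void *vb)
{
	const char *a = va;
	const struct kernel_symbol *b = vb;

	return strcmp(a, b->name);
}

int main(void)
{
	/* must already be sorted by name for bsearch() to be valid */
	static const struct kernel_symbol tab[] = {
		{ 0x1000, "alpha" },
		{ 0x2000, "bravo" },
		{ 0x3000, "charlie" },
	};
	const struct kernel_symbol *sym;

	sym = bsearch("bravo", tab, sizeof(tab) / sizeof(tab[0]),
		      sizeof(tab[0]), cmp_name);
	if (sym)
		printf("%s = 0x%lx\n", sym->name, sym->value);
	return 0;
}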
@@ -357,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name,
357 fsa.gplok = gplok; 402 fsa.gplok = gplok;
358 fsa.warn = warn; 403 fsa.warn = warn;
359 404
360 if (each_symbol(find_symbol_in_section, &fsa)) { 405 if (each_symbol_section(find_symbol_in_section, &fsa)) {
361 if (owner) 406 if (owner)
362 *owner = fsa.owner; 407 *owner = fsa.owner;
363 if (crc) 408 if (crc)
@@ -787,7 +832,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
787 wait_for_zero_refcount(mod); 832 wait_for_zero_refcount(mod);
788 833
789 mutex_unlock(&module_mutex); 834 mutex_unlock(&module_mutex);
790 /* Final destruction now noone is using it. */ 835 /* Final destruction now no one is using it. */
791 if (mod->exit != NULL) 836 if (mod->exit != NULL)
792 mod->exit(); 837 mod->exit();
793 blocking_notifier_call_chain(&module_notify_list, 838 blocking_notifier_call_chain(&module_notify_list,
@@ -1146,7 +1191,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1146{ 1191{
1147 struct module_sect_attr *sattr = 1192 struct module_sect_attr *sattr =
1148 container_of(mattr, struct module_sect_attr, mattr); 1193 container_of(mattr, struct module_sect_attr, mattr);
1149 return sprintf(buf, "0x%lx\n", sattr->address); 1194 return sprintf(buf, "0x%pK\n", (void *)sattr->address);
1150} 1195}
1151 1196
1152static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1197static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1541,6 +1586,117 @@ static int __unlink_module(void *_mod)
1541 return 0; 1586 return 0;
1542} 1587}
1543 1588
1589#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1590/*
1591 * LKM RO/NX protection: protect module's text/ro-data
1592 * from modification and any data from execution.
1593 */
1594void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
1595{
1596 unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
1597 unsigned long end_pfn = PFN_DOWN((unsigned long)end);
1598
1599 if (end_pfn > begin_pfn)
1600 set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1601}
1602
1603static void set_section_ro_nx(void *base,
1604 unsigned long text_size,
1605 unsigned long ro_size,
1606 unsigned long total_size)
1607{
1608 /* begin and end PFNs of the current subsection */
1609 unsigned long begin_pfn;
1610 unsigned long end_pfn;
1611
1612 /*
1613 * Set RO for module text and RO-data:
1614 * - Always protect first page.
1615 * - Do not protect last partial page.
1616 */
1617 if (ro_size > 0)
1618 set_page_attributes(base, base + ro_size, set_memory_ro);
1619
1620 /*
1621 * Set NX permissions for module data:
1622 * - Do not protect first partial page.
1623 * - Always protect last page.
1624 */
1625 if (total_size > text_size) {
1626 begin_pfn = PFN_UP((unsigned long)base + text_size);
1627 end_pfn = PFN_UP((unsigned long)base + total_size);
1628 if (end_pfn > begin_pfn)
1629 set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1630 }
1631}
1632
1633static void unset_module_core_ro_nx(struct module *mod)
1634{
1635 set_page_attributes(mod->module_core + mod->core_text_size,
1636 mod->module_core + mod->core_size,
1637 set_memory_x);
1638 set_page_attributes(mod->module_core,
1639 mod->module_core + mod->core_ro_size,
1640 set_memory_rw);
1641}
1642
1643static void unset_module_init_ro_nx(struct module *mod)
1644{
1645 set_page_attributes(mod->module_init + mod->init_text_size,
1646 mod->module_init + mod->init_size,
1647 set_memory_x);
1648 set_page_attributes(mod->module_init,
1649 mod->module_init + mod->init_ro_size,
1650 set_memory_rw);
1651}
1652
1653/* Iterate through all modules and set each module's text as RW */
1654void set_all_modules_text_rw(void)
1655{
1656 struct module *mod;
1657
1658 mutex_lock(&module_mutex);
1659 list_for_each_entry_rcu(mod, &modules, list) {
1660 if ((mod->module_core) && (mod->core_text_size)) {
1661 set_page_attributes(mod->module_core,
1662 mod->module_core + mod->core_text_size,
1663 set_memory_rw);
1664 }
1665 if ((mod->module_init) && (mod->init_text_size)) {
1666 set_page_attributes(mod->module_init,
1667 mod->module_init + mod->init_text_size,
1668 set_memory_rw);
1669 }
1670 }
1671 mutex_unlock(&module_mutex);
1672}
1673
1674/* Iterate through all modules and set each module's text as RO */
1675void set_all_modules_text_ro(void)
1676{
1677 struct module *mod;
1678
1679 mutex_lock(&module_mutex);
1680 list_for_each_entry_rcu(mod, &modules, list) {
1681 if ((mod->module_core) && (mod->core_text_size)) {
1682 set_page_attributes(mod->module_core,
1683 mod->module_core + mod->core_text_size,
1684 set_memory_ro);
1685 }
1686 if ((mod->module_init) && (mod->init_text_size)) {
1687 set_page_attributes(mod->module_init,
1688 mod->module_init + mod->init_text_size,
1689 set_memory_ro);
1690 }
1691 }
1692 mutex_unlock(&module_mutex);
1693}
1694#else
1695static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1696static void unset_module_core_ro_nx(struct module *mod) { }
1697static void unset_module_init_ro_nx(struct module *mod) { }
1698#endif
1699
1544/* Free a module, remove from lists, etc. */ 1700/* Free a module, remove from lists, etc. */
1545static void free_module(struct module *mod) 1701static void free_module(struct module *mod)
1546{ 1702{
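The RO/NX helpers above operate on whole pages, which is why the layout hunks further down pad each region with debug_align(). A short worked sketch of the page arithmetic, assuming 4 KiB pages and made-up section sizes:

/* Assume PAGE_SIZE == 4096 and a hypothetical module core at 0x10000 with
 * core_text_size = 0x1800, core_ro_size = 0x2800, core_size = 0x3400.
 *
 * set_page_attributes(core, core + 0x2800, set_memory_ro)
 *   begin_pfn = PFN_DOWN(0x10000) = 0x10
 *   end_pfn   = PFN_DOWN(0x12800) = 0x12
 *   -> pages 0x10..0x11 become read-only; the trailing partial page is
 *      skipped, which is why layout_sections() rounds core_ro_size up to
 *      a page boundary with debug_align().
 *
 * The NX path in set_section_ro_nx() uses PFN_UP() instead, so the first
 * partial page after the text stays executable and only whole data pages
 * are marked no-execute.
 */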
@@ -1565,6 +1721,7 @@ static void free_module(struct module *mod)
1565 destroy_params(mod->kp, mod->num_kp); 1721 destroy_params(mod->kp, mod->num_kp);
1566 1722
1567 /* This may be NULL, but that's OK */ 1723 /* This may be NULL, but that's OK */
1724 unset_module_init_ro_nx(mod);
1568 module_free(mod, mod->module_init); 1725 module_free(mod, mod->module_init);
1569 kfree(mod->args); 1726 kfree(mod->args);
1570 percpu_modfree(mod); 1727 percpu_modfree(mod);
@@ -1573,6 +1730,7 @@ static void free_module(struct module *mod)
1573 lockdep_free_key_range(mod->module_core, mod->core_size); 1730 lockdep_free_key_range(mod->module_core, mod->core_size);
1574 1731
1575 /* Finally, free the core (containing the module structure) */ 1732 /* Finally, free the core (containing the module structure) */
1733 unset_module_core_ro_nx(mod);
1576 module_free(mod, mod->module_core); 1734 module_free(mod, mod->module_core);
1577 1735
1578#ifdef CONFIG_MPU 1736#ifdef CONFIG_MPU
@@ -1776,8 +1934,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1776 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1934 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1777 DEBUGP("\t%s\n", name); 1935 DEBUGP("\t%s\n", name);
1778 } 1936 }
1779 if (m == 0) 1937 switch (m) {
1938 case 0: /* executable */
1939 mod->core_size = debug_align(mod->core_size);
1780 mod->core_text_size = mod->core_size; 1940 mod->core_text_size = mod->core_size;
1941 break;
1942 case 1: /* RO: text and ro-data */
1943 mod->core_size = debug_align(mod->core_size);
1944 mod->core_ro_size = mod->core_size;
1945 break;
1946 case 3: /* whole core */
1947 mod->core_size = debug_align(mod->core_size);
1948 break;
1949 }
1781 } 1950 }
1782 1951
1783 DEBUGP("Init section allocation order:\n"); 1952 DEBUGP("Init section allocation order:\n");
@@ -1795,8 +1964,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1795 | INIT_OFFSET_MASK); 1964 | INIT_OFFSET_MASK);
1796 DEBUGP("\t%s\n", sname); 1965 DEBUGP("\t%s\n", sname);
1797 } 1966 }
1798 if (m == 0) 1967 switch (m) {
1968 case 0: /* executable */
1969 mod->init_size = debug_align(mod->init_size);
1799 mod->init_text_size = mod->init_size; 1970 mod->init_text_size = mod->init_size;
1971 break;
1972 case 1: /* RO: text and ro-data */
1973 mod->init_size = debug_align(mod->init_size);
1974 mod->init_ro_size = mod->init_size;
1975 break;
1976 case 3: /* whole init */
1977 mod->init_size = debug_align(mod->init_size);
1978 break;
1979 }
1800 } 1980 }
1801} 1981}
1802 1982
@@ -1875,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
1875 const struct kernel_symbol *start, 2055 const struct kernel_symbol *start,
1876 const struct kernel_symbol *stop) 2056 const struct kernel_symbol *stop)
1877{ 2057{
1878 const struct kernel_symbol *ks = start; 2058 return bsearch(name, start, stop - start,
1879 for (; ks < stop; ks++) 2059 sizeof(struct kernel_symbol), cmp_name);
1880 if (strcmp(ks->name, name) == 0)
1881 return ks;
1882 return NULL;
1883} 2060}
1884 2061
1885static int is_exported(const char *name, unsigned long value, 2062static int is_exported(const char *name, unsigned long value,
@@ -2036,7 +2213,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info)
2036{ 2213{
2037} 2214}
2038 2215
2039static void add_kallsyms(struct module *mod, struct load_info *info) 2216static void add_kallsyms(struct module *mod, const struct load_info *info)
2040{ 2217{
2041} 2218}
2042#endif /* CONFIG_KALLSYMS */ 2219#endif /* CONFIG_KALLSYMS */
@@ -2305,9 +2482,14 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2305#endif 2482#endif
2306 2483
2307#ifdef CONFIG_TRACEPOINTS 2484#ifdef CONFIG_TRACEPOINTS
2308 mod->tracepoints = section_objs(info, "__tracepoints", 2485 mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
2309 sizeof(*mod->tracepoints), 2486 sizeof(*mod->tracepoints_ptrs),
2310 &mod->num_tracepoints); 2487 &mod->num_tracepoints);
2488#endif
2489#ifdef HAVE_JUMP_LABEL
2490 mod->jump_entries = section_objs(info, "__jump_table",
2491 sizeof(*mod->jump_entries),
2492 &mod->num_jump_entries);
2311#endif 2493#endif
2312#ifdef CONFIG_EVENT_TRACING 2494#ifdef CONFIG_EVENT_TRACING
2313 mod->trace_events = section_objs(info, "_ftrace_events", 2495 mod->trace_events = section_objs(info, "_ftrace_events",
@@ -2320,6 +2502,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2320 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * 2502 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2321 mod->num_trace_events, GFP_KERNEL); 2503 mod->num_trace_events, GFP_KERNEL);
2322#endif 2504#endif
2505#ifdef CONFIG_TRACING
2506 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
2507 sizeof(*mod->trace_bprintk_fmt_start),
2508 &mod->num_trace_bprintk_fmt);
2509 /*
2510 * This section contains pointers to allocated objects in the trace
2511 * code and not scanning it leads to false positives.
2512 */
2513 kmemleak_scan_area(mod->trace_bprintk_fmt_start,
2514 sizeof(*mod->trace_bprintk_fmt_start) *
2515 mod->num_trace_bprintk_fmt, GFP_KERNEL);
2516#endif
2323#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2517#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2324 /* sechdrs[0].sh_size is always zero */ 2518 /* sechdrs[0].sh_size is always zero */
2325 mod->ftrace_callsites = section_objs(info, "__mcount_loc", 2519 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
@@ -2605,7 +2799,7 @@ static struct module *load_module(void __user *umod,
2605 mod->state = MODULE_STATE_COMING; 2799 mod->state = MODULE_STATE_COMING;
2606 2800
2607 /* Now sew it into the lists so we can get lockdep and oops 2801 /* Now sew it into the lists so we can get lockdep and oops
2608 * info during argument parsing. Noone should access us, since 2802 * info during argument parsing. No one should access us, since
2609 * strong_try_module_get() will fail. 2803 * strong_try_module_get() will fail.
2610 * lockdep/oops can run asynchronous, so use the RCU list insertion 2804 * lockdep/oops can run asynchronous, so use the RCU list insertion
2611 * function to insert in a way safe to concurrent readers. 2805 * function to insert in a way safe to concurrent readers.
@@ -2618,7 +2812,7 @@ static struct module *load_module(void __user *umod,
2618 } 2812 }
2619 2813
2620 /* This has to be done once we're sure module name is unique. */ 2814 /* This has to be done once we're sure module name is unique. */
2621 if (!mod->taints) 2815 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2622 dynamic_debug_setup(info.debug, info.num_debug); 2816 dynamic_debug_setup(info.debug, info.num_debug);
2623 2817
2624 /* Find duplicate symbols */ 2818 /* Find duplicate symbols */
@@ -2655,7 +2849,7 @@ static struct module *load_module(void __user *umod,
2655 module_bug_cleanup(mod); 2849 module_bug_cleanup(mod);
2656 2850
2657 ddebug: 2851 ddebug:
2658 if (!mod->taints) 2852 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2659 dynamic_debug_remove(info.debug); 2853 dynamic_debug_remove(info.debug);
2660 unlock: 2854 unlock:
2661 mutex_unlock(&module_mutex); 2855 mutex_unlock(&module_mutex);
@@ -2704,6 +2898,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2704 blocking_notifier_call_chain(&module_notify_list, 2898 blocking_notifier_call_chain(&module_notify_list,
2705 MODULE_STATE_COMING, mod); 2899 MODULE_STATE_COMING, mod);
2706 2900
2901 /* Set RO and NX regions for core */
2902 set_section_ro_nx(mod->module_core,
2903 mod->core_text_size,
2904 mod->core_ro_size,
2905 mod->core_size);
2906
2907 /* Set RO and NX regions for init */
2908 set_section_ro_nx(mod->module_init,
2909 mod->init_text_size,
2910 mod->init_ro_size,
2911 mod->init_size);
2912
2707 do_mod_ctors(mod); 2913 do_mod_ctors(mod);
2708 /* Start the module */ 2914 /* Start the module */
2709 if (mod->init != NULL) 2915 if (mod->init != NULL)
@@ -2747,9 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2747 mod->symtab = mod->core_symtab; 2953 mod->symtab = mod->core_symtab;
2748 mod->strtab = mod->core_strtab; 2954 mod->strtab = mod->core_strtab;
2749#endif 2955#endif
2956 unset_module_init_ro_nx(mod);
2750 module_free(mod, mod->module_init); 2957 module_free(mod, mod->module_init);
2751 mod->module_init = NULL; 2958 mod->module_init = NULL;
2752 mod->init_size = 0; 2959 mod->init_size = 0;
2960 mod->init_ro_size = 0;
2753 mod->init_text_size = 0; 2961 mod->init_text_size = 0;
2754 mutex_unlock(&module_mutex); 2962 mutex_unlock(&module_mutex);
2755 2963
@@ -2786,7 +2994,7 @@ static const char *get_ksymbol(struct module *mod,
2786 else 2994 else
2787 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2995 nextval = (unsigned long)mod->module_core+mod->core_text_size;
2788 2996
2789 /* Scan for closest preceeding symbol, and next symbol. (ELF 2997 /* Scan for closest preceding symbol, and next symbol. (ELF
2790 starts real symbols at 1). */ 2998 starts real symbols at 1). */
2791 for (i = 1; i < mod->num_symtab; i++) { 2999 for (i = 1; i < mod->num_symtab; i++) {
2792 if (mod->symtab[i].st_shndx == SHN_UNDEF) 3000 if (mod->symtab[i].st_shndx == SHN_UNDEF)
@@ -3039,7 +3247,7 @@ static int m_show(struct seq_file *m, void *p)
3039 mod->state == MODULE_STATE_COMING ? "Loading": 3247 mod->state == MODULE_STATE_COMING ? "Loading":
3040 "Live"); 3248 "Live");
3041 /* Used by oprofile and other similar tools. */ 3249 /* Used by oprofile and other similar tools. */
3042 seq_printf(m, " 0x%p", mod->module_core); 3250 seq_printf(m, " 0x%pK", mod->module_core);
3043 3251
3044 /* Taints info */ 3252 /* Taints info */
3045 if (mod->taints) 3253 if (mod->taints)
@@ -3208,7 +3416,7 @@ void module_layout(struct module *mod,
3208 struct modversion_info *ver, 3416 struct modversion_info *ver,
3209 struct kernel_param *kp, 3417 struct kernel_param *kp,
3210 struct kernel_symbol *ks, 3418 struct kernel_symbol *ks,
3211 struct tracepoint *tp) 3419 struct tracepoint * const *tp)
3212{ 3420{
3213} 3421}
3214EXPORT_SYMBOL(module_layout); 3422EXPORT_SYMBOL(module_layout);
@@ -3222,8 +3430,8 @@ void module_update_tracepoints(void)
3222 mutex_lock(&module_mutex); 3430 mutex_lock(&module_mutex);
3223 list_for_each_entry(mod, &modules, list) 3431 list_for_each_entry(mod, &modules, list)
3224 if (!mod->taints) 3432 if (!mod->taints)
3225 tracepoint_update_probe_range(mod->tracepoints, 3433 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3226 mod->tracepoints + mod->num_tracepoints); 3434 mod->tracepoints_ptrs + mod->num_tracepoints);
3227 mutex_unlock(&module_mutex); 3435 mutex_unlock(&module_mutex);
3228} 3436}
3229 3437
@@ -3247,8 +3455,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3247 else if (iter_mod > iter->module) 3455 else if (iter_mod > iter->module)
3248 iter->tracepoint = NULL; 3456 iter->tracepoint = NULL;
3249 found = tracepoint_get_iter_range(&iter->tracepoint, 3457 found = tracepoint_get_iter_range(&iter->tracepoint,
3250 iter_mod->tracepoints, 3458 iter_mod->tracepoints_ptrs,
3251 iter_mod->tracepoints 3459 iter_mod->tracepoints_ptrs
3252 + iter_mod->num_tracepoints); 3460 + iter_mod->num_tracepoints);
3253 if (found) { 3461 if (found) {
3254 iter->module = iter_mod; 3462 iter->module = iter_mod;
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 DEBUG_LOCKS_WARN_ON(lock->owner != current);
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 80 mutex_clear_owner(lock);
81} 81}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current_thread_info(); 32 lock->owner = current;
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..d607ed5dd441 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);
131 */ 131 */
132static inline int __sched 132static inline int __sched
133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
134 unsigned long ip) 134 struct lockdep_map *nest_lock, unsigned long ip)
135{ 135{
136 struct task_struct *task = current; 136 struct task_struct *task = current;
137 struct mutex_waiter waiter; 137 struct mutex_waiter waiter;
138 unsigned long flags; 138 unsigned long flags;
139 139
140 preempt_disable(); 140 preempt_disable();
141 mutex_acquire(&lock->dep_map, subclass, 0, ip); 141 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
142 142
143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
144 /* 144 /*
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
160 */ 160 */
161 161
162 for (;;) { 162 for (;;) {
163 struct thread_info *owner; 163 struct task_struct *owner;
164
165 /*
166 * If we own the BKL, then don't spin. The owner of
167 * the mutex might be waiting on us to release the BKL.
168 */
169 if (unlikely(current->lock_depth >= 0))
170 break;
171 164
172 /* 165 /*
173 * If there's an owner, wait for it to either 166 * If there's an owner, wait for it to either
@@ -199,7 +192,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 192 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 193 * values at the cost of a few extra spins.
201 */ 194 */
202 cpu_relax(); 195 arch_mutex_cpu_relax();
203 } 196 }
204#endif 197#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 198 spin_lock_mutex(&lock->wait_lock, flags);
@@ -245,7 +238,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
245 } 238 }
246 __set_task_state(task, state); 239 __set_task_state(task, state);
247 240
248 /* didnt get the lock, go to sleep: */ 241 /* didn't get the lock, go to sleep: */
249 spin_unlock_mutex(&lock->wait_lock, flags); 242 spin_unlock_mutex(&lock->wait_lock, flags);
250 preempt_enable_no_resched(); 243 preempt_enable_no_resched();
251 schedule(); 244 schedule();
@@ -276,16 +269,25 @@ void __sched
276mutex_lock_nested(struct mutex *lock, unsigned int subclass) 269mutex_lock_nested(struct mutex *lock, unsigned int subclass)
277{ 270{
278 might_sleep(); 271 might_sleep();
279 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); 272 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
280} 273}
281 274
282EXPORT_SYMBOL_GPL(mutex_lock_nested); 275EXPORT_SYMBOL_GPL(mutex_lock_nested);
283 276
277void __sched
278_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
279{
280 might_sleep();
281 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
282}
283
284EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
285
284int __sched 286int __sched
285mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) 287mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
286{ 288{
287 might_sleep(); 289 might_sleep();
288 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); 290 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
289} 291}
290EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 292EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
291 293
@@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
294{ 296{
295 might_sleep(); 297 might_sleep();
296 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 298 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
297 subclass, _RET_IP_); 299 subclass, NULL, _RET_IP_);
298} 300}
299 301
300EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 302EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
400{ 402{
401 struct mutex *lock = container_of(lock_count, struct mutex, count); 403 struct mutex *lock = container_of(lock_count, struct mutex, count);
402 404
403 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); 405 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
404} 406}
405 407
406static noinline int __sched 408static noinline int __sched
@@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
408{ 410{
409 struct mutex *lock = container_of(lock_count, struct mutex, count); 411 struct mutex *lock = container_of(lock_count, struct mutex, count);
410 412
411 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); 413 return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
412} 414}
413 415
414static noinline int __sched 416static noinline int __sched
@@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)
416{ 418{
417 struct mutex *lock = container_of(lock_count, struct mutex, count); 419 struct mutex *lock = container_of(lock_count, struct mutex, count);
418 420
419 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); 421 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
420} 422}
421#endif 423#endif
422 424
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
19#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current_thread_info(); 22 lock->owner = current;
23} 23}
24 24
25static inline void mutex_clear_owner(struct mutex *lock) 25static inline void mutex_clear_owner(struct mutex *lock)
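Note: the owner field and the spin-wait loop above now track a struct task_struct * rather than a thread_info, and __mutex_lock_common() gained a nest_lock argument surfaced through _mutex_lock_nest_lock(). A minimal caller-side sketch, assuming the mutex_lock_nest_lock() wrapper that the matching include/linux/mutex.h change provides; struct obj, the list and the helper name are illustrative only:

#include <linux/list.h>
#include <linux/mutex.h>

struct obj {
    struct list_head node;
    struct mutex lock;
};

/*
 * Take every per-object mutex while holding one outer mutex.  The
 * nest_lock annotation tells lockdep that the whole batch nests under
 * "outer", so large lists do not exhaust lockdep's subclass limit.
 * (The matching unlock walk is omitted from this sketch.)
 */
static void lock_all_objs(struct mutex *outer, struct list_head *objs)
{
    struct obj *o;

    mutex_lock(outer);
    list_for_each_entry(o, objs, node)
        mutex_lock_nest_lock(&o->lock, outer);
}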
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2a5dfec8efe0..000000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,110 +0,0 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10#include <linux/proc_fs.h>
11#include <linux/slab.h>
12#include <linux/nsproxy.h>
13
14struct ns_cgroup {
15 struct cgroup_subsys_state css;
16};
17
18struct cgroup_subsys ns_subsys;
19
20static inline struct ns_cgroup *cgroup_to_ns(
21 struct cgroup *cgroup)
22{
23 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
24 struct ns_cgroup, css);
25}
26
27int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{
29 char name[PROC_NUMBUF];
30
31 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
32 return cgroup_clone(task, &ns_subsys, name);
33}
34
35/*
36 * Rules:
37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup
39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN
41 * b. your cgroup is an ancestor of task's destination cgroup
42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof)
44 */
45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct task_struct *task, bool threadgroup)
47{
48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN))
50 return -EPERM;
51
52 if (!cgroup_is_descendant(new_cgroup, current))
53 return -EPERM;
54 }
55
56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM;
58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
71 return 0;
72}
73
74/*
75 * Rules: you can only create a cgroup if
76 * 1. you are capable(CAP_SYS_ADMIN)
77 * 2. the target cgroup is a descendant of your own cgroup
78 */
79static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
80 struct cgroup *cgroup)
81{
82 struct ns_cgroup *ns_cgroup;
83
84 if (!capable(CAP_SYS_ADMIN))
85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM);
88
89 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
90 if (!ns_cgroup)
91 return ERR_PTR(-ENOMEM);
92 return &ns_cgroup->css;
93}
94
95static void ns_destroy(struct cgroup_subsys *ss,
96 struct cgroup *cgroup)
97{
98 struct ns_cgroup *ns_cgroup;
99
100 ns_cgroup = cgroup_to_ns(cgroup);
101 kfree(ns_cgroup);
102}
103
104struct cgroup_subsys ns_subsys = {
105 .name = "ns",
106 .can_attach = ns_can_attach,
107 .create = ns_create,
108 .destroy = ns_destroy,
109 .subsys_id = ns_subsys_id,
110};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..d6a00f3de15d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h>
26#include <linux/file.h>
27#include <linux/syscalls.h>
25 28
26static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
27 30
@@ -69,13 +72,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
69 goto out_ns; 72 goto out_ns;
70 } 73 }
71 74
72 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 75 new_nsp->uts_ns = copy_utsname(flags, tsk);
73 if (IS_ERR(new_nsp->uts_ns)) { 76 if (IS_ERR(new_nsp->uts_ns)) {
74 err = PTR_ERR(new_nsp->uts_ns); 77 err = PTR_ERR(new_nsp->uts_ns);
75 goto out_uts; 78 goto out_uts;
76 } 79 }
77 80
78 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 81 new_nsp->ipc_ns = copy_ipcs(flags, tsk);
79 if (IS_ERR(new_nsp->ipc_ns)) { 82 if (IS_ERR(new_nsp->ipc_ns)) {
80 err = PTR_ERR(new_nsp->ipc_ns); 83 err = PTR_ERR(new_nsp->ipc_ns);
81 goto out_ipc; 84 goto out_ipc;
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
198 goto out; 201 goto out;
199 } 202 }
200 203
201 err = ns_cgroup_clone(current, task_pid(current));
202 if (err)
203 put_nsproxy(*new_nsp);
204
205out: 204out:
206 return err; 205 return err;
207} 206}
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p)
233 switch_task_namespaces(p, NULL); 232 switch_task_namespaces(p, NULL);
234} 233}
235 234
235SYSCALL_DEFINE2(setns, int, fd, int, nstype)
236{
237 const struct proc_ns_operations *ops;
238 struct task_struct *tsk = current;
239 struct nsproxy *new_nsproxy;
240 struct proc_inode *ei;
241 struct file *file;
242 int err;
243
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd);
248 if (IS_ERR(file))
249 return PTR_ERR(file);
250
251 err = -EINVAL;
252 ei = PROC_I(file->f_dentry->d_inode);
253 ops = ei->ns_ops;
254 if (nstype && (ops->type != nstype))
255 goto out;
256
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
258 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy);
260 goto out;
261 }
262
263 err = ops->install(new_nsproxy, ei->ns);
264 if (err) {
265 free_nsproxy(new_nsproxy);
266 goto out;
267 }
268 switch_task_namespaces(tsk, new_nsproxy);
269out:
270 fput(file);
271 return err;
272}
273
236static int __init nsproxy_cache_init(void) 274static int __init nsproxy_cache_init(void)
237{ 275{
238 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
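Note: the new setns() system call above is driven from userspace through the /proc/<pid>/ns/* files. A hedged sketch, assuming a libc that already exposes a setns() wrapper (otherwise syscall(__NR_setns, fd, nstype) is equivalent); the PID is a placeholder and CAP_SYS_ADMIN is required:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* 1234 is a placeholder PID; any /proc/<pid>/ns/<type> file works */
    int fd = open("/proc/1234/ns/net", O_RDONLY);

    if (fd < 0 || setns(fd, CLONE_NEWNET) < 0) {
        perror("setns");
        return 1;
    }
    close(fd);
    /* from here on this task runs in the target network namespace */
    return 0;
}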
diff --git a/kernel/padata.c b/kernel/padata.c
index 751019415d23..b91941df5e63 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd)
262 /* 262 /*
263 * This cpu has to do the parallel processing of the next 263 * This cpu has to do the parallel processing of the next
264 * object. It's waiting in the cpu's parallelization queue, 264 * object. It's waiting in the cpu's parallelization queue,
265 * so exit imediately. 265 * so exit immediately.
266 */ 266 */
267 if (PTR_ERR(padata) == -ENODATA) { 267 if (PTR_ERR(padata) == -ENODATA) {
268 del_timer(&pd->timer); 268 del_timer(&pd->timer);
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd)
284 /* 284 /*
285 * The next object that needs serialization might have arrived to 285 * The next object that needs serialization might have arrived to
286 * the reorder queues in the meantime, we will be called again 286 * the reorder queues in the meantime, we will be called again
287 * from the timer function if noone else cares for it. 287 * from the timer function if no one else cares for it.
288 */ 288 */
289 if (atomic_read(&pd->reorder_objects) 289 if (atomic_read(&pd->reorder_objects)
290 && !(pinst->flags & PADATA_RESET)) 290 && !(pinst->flags & PADATA_RESET))
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst)
515 put_online_cpus(); 515 put_online_cpus();
516} 516}
517 517
518/* Replace the internal control stucture with a new one. */ 518/* Replace the internal control structure with a new one. */
519static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
520 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
521{ 521{
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
768} 768}
769 769
770 /** 770 /**
771 * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) 771 * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
772 * padata cpumasks. 772 * padata cpumasks.
773 * 773 *
774 * @pinst: padata instance 774 * @pinst: padata instance
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout;
37EXPORT_SYMBOL_GPL(panic_timeout);
37 38
38ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
39 40
@@ -432,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
432 433
433core_param(panic, panic_timeout, int, 0644); 434core_param(panic, panic_timeout, int, 0644);
434core_param(pause_on_oops, pause_on_oops, int, 0644); 435core_param(pause_on_oops, pause_on_oops, int, 0644);
436
437static int __init oops_setup(char *s)
438{
439 if (!s)
440 return -EINVAL;
441 if (!strcmp(s, "panic"))
442 panic_on_oops = 1;
443 return 0;
444}
445early_param("oops", oops_setup);
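Note: the oops= early parameter above is the boot-time counterpart of the long-standing panic_on_oops sysctl. A hedged sketch of the equivalent runtime toggle (the sysctl itself is pre-existing, not part of this hunk):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* same effect at runtime as booting with "oops=panic" */
    int fd = open("/proc/sys/kernel/panic_on_oops", O_WRONLY);

    if (fd < 0 || write(fd, "1\n", 2) != 2) {
        perror("panic_on_oops");
        return 1;
    }
    close(fd);
    return 0;
}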
diff --git a/kernel/params.c b/kernel/params.c
index 08107d181758..ed72e1330862 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -95,7 +95,7 @@ static int parse_one(char *param,
95 /* Find parameter */ 95 /* Find parameter */
96 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
97 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
98 /* Noone handled NULL, so do it here. */ 98 /* No one handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool) 99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL; 100 return -EINVAL;
101 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp);
297int param_set_bool(const char *val, const struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
298{ 298{
299 bool v; 299 bool v;
300 int ret;
300 301
301 /* No equals means "set"... */ 302 /* No equals means "set"... */
302 if (!val) val = "1"; 303 if (!val) val = "1";
303 304
304 /* One of =[yYnN01] */ 305 /* One of =[yYnN01] */
305 switch (val[0]) { 306 ret = strtobool(val, &v);
306 case 'y': case 'Y': case '1': 307 if (ret)
307 v = true; 308 return ret;
308 break;
309 case 'n': case 'N': case '0':
310 v = false;
311 break;
312 default:
313 return -EINVAL;
314 }
315 309
316 if (kp->flags & KPARAM_ISBOOL) 310 if (kp->flags & KPARAM_ISBOOL)
317 *(bool *)kp->arg = v; 311 *(bool *)kp->arg = v;
@@ -719,9 +713,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
719 params[i].ops->free(params[i].arg); 713 params[i].ops->free(params[i].arg);
720} 714}
721 715
722static void __init kernel_add_sysfs_param(const char *name, 716static struct module_kobject * __init locate_module_kobject(const char *name)
723 struct kernel_param *kparam,
724 unsigned int name_skip)
725{ 717{
726 struct module_kobject *mk; 718 struct module_kobject *mk;
727 struct kobject *kobj; 719 struct kobject *kobj;
@@ -729,10 +721,7 @@ static void __init kernel_add_sysfs_param(const char *name,
729 721
730 kobj = kset_find_obj(module_kset, name); 722 kobj = kset_find_obj(module_kset, name);
731 if (kobj) { 723 if (kobj) {
732 /* We already have one. Remove params so we can add more. */
733 mk = to_module_kobject(kobj); 724 mk = to_module_kobject(kobj);
734 /* We need to remove it before adding parameters. */
735 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
736 } else { 725 } else {
737 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); 726 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
738 BUG_ON(!mk); 727 BUG_ON(!mk);
@@ -743,15 +732,36 @@ static void __init kernel_add_sysfs_param(const char *name,
743 "%s", name); 732 "%s", name);
744 if (err) { 733 if (err) {
745 kobject_put(&mk->kobj); 734 kobject_put(&mk->kobj);
746 printk(KERN_ERR "Module '%s' failed add to sysfs, " 735 printk(KERN_ERR
747 "error number %d\n", name, err); 736 "Module '%s' failed add to sysfs, error number %d\n",
748 printk(KERN_ERR "The system will be unstable now.\n"); 737 name, err);
749 return; 738 printk(KERN_ERR
739 "The system will be unstable now.\n");
740 return NULL;
750 } 741 }
751 /* So that exit path is even. */ 742
743 /* So that we hold reference in both cases. */
752 kobject_get(&mk->kobj); 744 kobject_get(&mk->kobj);
753 } 745 }
754 746
747 return mk;
748}
749
750static void __init kernel_add_sysfs_param(const char *name,
751 struct kernel_param *kparam,
752 unsigned int name_skip)
753{
754 struct module_kobject *mk;
755 int err;
756
757 mk = locate_module_kobject(name);
758 if (!mk)
759 return;
760
761 /* We need to remove old parameters before adding more. */
762 if (mk->mp)
763 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
764
755 /* These should not fail at boot. */ 765 /* These should not fail at boot. */
756 err = add_sysfs_param(mk, kparam, kparam->name + name_skip); 766 err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
757 BUG_ON(err); 767 BUG_ON(err);
@@ -796,6 +806,35 @@ static void __init param_sysfs_builtin(void)
796 } 806 }
797} 807}
798 808
809ssize_t __modver_version_show(struct module_attribute *mattr,
810 struct module *mod, char *buf)
811{
812 struct module_version_attribute *vattr =
813 container_of(mattr, struct module_version_attribute, mattr);
814
815 return sprintf(buf, "%s\n", vattr->version);
816}
817
818extern const struct module_version_attribute *__start___modver[];
819extern const struct module_version_attribute *__stop___modver[];
820
821static void __init version_sysfs_builtin(void)
822{
823 const struct module_version_attribute **p;
824 struct module_kobject *mk;
825 int err;
826
827 for (p = __start___modver; p < __stop___modver; p++) {
828 const struct module_version_attribute *vattr = *p;
829
830 mk = locate_module_kobject(vattr->module_name);
831 if (mk) {
832 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
833 kobject_uevent(&mk->kobj, KOBJ_ADD);
834 kobject_put(&mk->kobj);
835 }
836 }
837}
799 838
800/* module-related sysfs stuff */ 839/* module-related sysfs stuff */
801 840
@@ -875,6 +914,7 @@ static int __init param_sysfs_init(void)
875 } 914 }
876 module_sysfs_initialized = 1; 915 module_sysfs_initialized = 1;
877 916
917 version_sysfs_builtin();
878 param_sysfs_builtin(); 918 param_sysfs_builtin();
879 919
880 return 0; 920 return 0;
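Note: two user-visible effects of the params.c changes above are that bool parameters are parsed with strtobool() (accepting y/Y/n/N/1/0) and that MODULE_VERSION() strings appear under /sys/module/<name>/version even for built-in code, via the new __modver section. A minimal hedged sketch; the "demo" name and its parameter are made up:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static bool demo_enabled = true;
module_param_named(enabled, demo_enabled, bool, 0644);  /* parsed via strtobool() */

MODULE_VERSION("1.0");          /* exposed as /sys/module/demo/version */
MODULE_LICENSE("GPL");

static int __init demo_init(void)
{
    return 0;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
}
module_exit(demo_exit);

When built in, the parameter is still set on the kernel command line as demo.enabled=y, and the version attribute is registered at boot by version_sysfs_builtin().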
diff --git a/kernel/pid.c b/kernel/pid.c
index d55c6fb8d087..57a8346a270e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
217 return -1; 217 return -1;
218} 218}
219 219
220int next_pidmap(struct pid_namespace *pid_ns, int last) 220int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
221{ 221{
222 int offset; 222 int offset;
223 struct pidmap *map, *end; 223 struct pidmap *map, *end;
224 224
225 if (last >= PID_MAX_LIMIT)
226 return -1;
227
225 offset = (last + 1) & BITS_PER_PAGE_MASK; 228 offset = (last + 1) & BITS_PER_PAGE_MASK;
226 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; 229 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
227 end = &pid_ns->pidmap[PIDMAP_ENTRIES]; 230 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
@@ -401,7 +404,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
401 struct task_struct *result = NULL; 404 struct task_struct *result = NULL;
402 if (pid) { 405 if (pid) {
403 struct hlist_node *first; 406 struct hlist_node *first;
404 first = rcu_dereference_check(pid->tasks[type].first, 407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
405 rcu_read_lock_held() || 408 rcu_read_lock_held() ||
406 lockdep_tasklist_lock_is_held()); 409 lockdep_tasklist_lock_is_held());
407 if (first) 410 if (first)
@@ -416,6 +419,7 @@ EXPORT_SYMBOL(pid_task);
416 */ 419 */
417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 420struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
418{ 421{
422 rcu_lockdep_assert(rcu_read_lock_held());
419 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 423 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
420} 424}
421 425
@@ -434,6 +438,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
434 rcu_read_unlock(); 438 rcu_read_unlock();
435 return pid; 439 return pid;
436} 440}
441EXPORT_SYMBOL_GPL(get_task_pid);
437 442
438struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) 443struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
439{ 444{
@@ -445,6 +450,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
445 rcu_read_unlock(); 450 rcu_read_unlock();
446 return result; 451 return result;
447} 452}
453EXPORT_SYMBOL_GPL(get_pid_task);
448 454
449struct pid *find_get_pid(pid_t nr) 455struct pid *find_get_pid(pid_t nr)
450{ 456{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17 18
18#define BITS_PER_PAGE (PAGE_SIZE*8) 19#define BITS_PER_PAGE (PAGE_SIZE*8)
19 20
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
72{ 73{
73 struct pid_namespace *ns; 74 struct pid_namespace *ns;
74 unsigned int level = parent_pid_ns->level + 1; 75 unsigned int level = parent_pid_ns->level + 1;
75 int i; 76 int i, err = -ENOMEM;
76 77
77 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 78 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
78 if (ns == NULL) 79 if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
96 for (i = 1; i < PIDMAP_ENTRIES; i++) 97 for (i = 1; i < PIDMAP_ENTRIES; i++)
97 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 98 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
98 99
100 err = pid_ns_prepare_proc(ns);
101 if (err)
102 goto out_put_parent_pid_ns;
103
99 return ns; 104 return ns;
100 105
106out_put_parent_pid_ns:
107 put_pid_ns(parent_pid_ns);
101out_free_map: 108out_free_map:
102 kfree(ns->pidmap[0].page); 109 kfree(ns->pidmap[0].page);
103out_free: 110out_free:
104 kmem_cache_free(pid_ns_cachep, ns); 111 kmem_cache_free(pid_ns_cachep, ns);
105out: 112out:
106 return ERR_PTR(-ENOMEM); 113 return ERR_PTR(err);
107} 114}
108 115
109static void destroy_pid_namespace(struct pid_namespace *ns) 116static void destroy_pid_namespace(struct pid_namespace *ns)
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 645e541a45f6..6824ca7d4d0c 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -40,6 +40,7 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h>
43 44
44#include <linux/uaccess.h> 45#include <linux/uaccess.h>
45 46
@@ -53,11 +54,17 @@ enum pm_qos_type {
53 PM_QOS_MIN /* return the smallest value */ 54 PM_QOS_MIN /* return the smallest value */
54}; 55};
55 56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
60 * types linux supports for 32 bit quantites
61 */
56struct pm_qos_object { 62struct pm_qos_object {
57 struct plist_head requests; 63 struct plist_head requests;
58 struct blocking_notifier_head *notifiers; 64 struct blocking_notifier_head *notifiers;
59 struct miscdevice pm_qos_power_miscdev; 65 struct miscdevice pm_qos_power_miscdev;
60 char *name; 66 char *name;
67 s32 target_value; /* Do not change to 64 bit */
61 s32 default_value; 68 s32 default_value;
62 enum pm_qos_type type; 69 enum pm_qos_type type;
63}; 70};
@@ -70,7 +77,8 @@ static struct pm_qos_object cpu_dma_pm_qos = {
70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
71 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
72 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
73 .default_value = 2000 * USEC_PER_SEC, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
74 .type = PM_QOS_MIN, 82 .type = PM_QOS_MIN,
75}; 83};
76 84
@@ -79,7 +87,8 @@ static struct pm_qos_object network_lat_pm_qos = {
79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
80 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
81 .name = "network_latency", 89 .name = "network_latency",
82 .default_value = 2000 * USEC_PER_SEC, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
83 .type = PM_QOS_MIN 92 .type = PM_QOS_MIN
84}; 93};
85 94
@@ -89,7 +98,8 @@ static struct pm_qos_object network_throughput_pm_qos = {
89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
90 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
91 .name = "network_throughput", 100 .name = "network_throughput",
92 .default_value = 0, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
93 .type = PM_QOS_MAX, 103 .type = PM_QOS_MAX,
94}; 104};
95 105
@@ -103,13 +113,17 @@ static struct pm_qos_object *pm_qos_array[] = {
103 113
104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 114static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
105 size_t count, loff_t *f_pos); 115 size_t count, loff_t *f_pos);
116static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
117 size_t count, loff_t *f_pos);
106static int pm_qos_power_open(struct inode *inode, struct file *filp); 118static int pm_qos_power_open(struct inode *inode, struct file *filp);
107static int pm_qos_power_release(struct inode *inode, struct file *filp); 119static int pm_qos_power_release(struct inode *inode, struct file *filp);
108 120
109static const struct file_operations pm_qos_power_fops = { 121static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 122 .write = pm_qos_power_write,
123 .read = pm_qos_power_read,
111 .open = pm_qos_power_open, 124 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 125 .release = pm_qos_power_release,
126 .llseek = noop_llseek,
113}; 127};
114 128
115/* unlocked internal variant */ 129/* unlocked internal variant */
@@ -120,10 +134,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
120 134
121 switch (o->type) { 135 switch (o->type) {
122 case PM_QOS_MIN: 136 case PM_QOS_MIN:
123 return plist_last(&o->requests)->prio; 137 return plist_first(&o->requests)->prio;
124 138
125 case PM_QOS_MAX: 139 case PM_QOS_MAX:
126 return plist_first(&o->requests)->prio; 140 return plist_last(&o->requests)->prio;
127 141
128 default: 142 default:
129 /* runtime check for not using enum */ 143 /* runtime check for not using enum */
@@ -131,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
131 } 145 }
132} 146}
133 147
148static inline s32 pm_qos_read_value(struct pm_qos_object *o)
149{
150 return o->target_value;
151}
152
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
154{
155 o->target_value = value;
156}
157
134static void update_target(struct pm_qos_object *o, struct plist_node *node, 158static void update_target(struct pm_qos_object *o, struct plist_node *node,
135 int del, int value) 159 int del, int value)
136{ 160{
@@ -155,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
155 plist_add(node, &o->requests); 179 plist_add(node, &o->requests);
156 } 180 }
157 curr_value = pm_qos_get_value(o); 181 curr_value = pm_qos_get_value(o);
182 pm_qos_set_value(o, curr_value);
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 183 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 184
160 if (prev_value != curr_value) 185 if (prev_value != curr_value)
@@ -189,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor)
189 * pm_qos_request - returns current system wide qos expectation 214 * pm_qos_request - returns current system wide qos expectation
190 * @pm_qos_class: identification of which qos value is requested 215 * @pm_qos_class: identification of which qos value is requested
191 * 216 *
192 * This function returns the current target value in an atomic manner. 217 * This function returns the current target value.
193 */ 218 */
194int pm_qos_request(int pm_qos_class) 219int pm_qos_request(int pm_qos_class)
195{ 220{
196 unsigned long flags; 221 return pm_qos_read_value(pm_qos_array[pm_qos_class]);
197 int value;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return value;
204} 222}
205EXPORT_SYMBOL_GPL(pm_qos_request); 223EXPORT_SYMBOL_GPL(pm_qos_request);
206 224
@@ -375,30 +393,63 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
375} 393}
376 394
377 395
396static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
397 size_t count, loff_t *f_pos)
398{
399 s32 value;
400 unsigned long flags;
401 struct pm_qos_object *o;
402 struct pm_qos_request_list *pm_qos_req = filp->private_data;
403
404 if (!pm_qos_req)
405 return -EINVAL;
406 if (!pm_qos_request_active(pm_qos_req))
407 return -EINVAL;
408
409 o = pm_qos_array[pm_qos_req->pm_qos_class];
410 spin_lock_irqsave(&pm_qos_lock, flags);
411 value = pm_qos_get_value(o);
412 spin_unlock_irqrestore(&pm_qos_lock, flags);
413
414 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
415}
416
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 417static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 418 size_t count, loff_t *f_pos)
380{ 419{
381 s32 value; 420 s32 value;
382 int x;
383 char ascii_value[11];
384 struct pm_qos_request_list *pm_qos_req; 421 struct pm_qos_request_list *pm_qos_req;
385 422
386 if (count == sizeof(s32)) { 423 if (count == sizeof(s32)) {
387 if (copy_from_user(&value, buf, sizeof(s32))) 424 if (copy_from_user(&value, buf, sizeof(s32)))
388 return -EFAULT; 425 return -EFAULT;
389 } else if (count == 11) { /* len('0x12345678/0') */ 426 } else if (count <= 11) { /* ASCII perhaps? */
390 if (copy_from_user(ascii_value, buf, 11)) 427 char ascii_value[11];
428 unsigned long int ulval;
429 int ret;
430
431 if (copy_from_user(ascii_value, buf, count))
391 return -EFAULT; 432 return -EFAULT;
392 if (strlen(ascii_value) != 10) 433
393 return -EINVAL; 434 if (count > 10) {
394 x = sscanf(ascii_value, "%x", &value); 435 if (ascii_value[10] == '\n')
395 if (x != 1) 436 ascii_value[10] = '\0';
437 else
438 return -EINVAL;
439 } else {
440 ascii_value[count] = '\0';
441 }
442 ret = strict_strtoul(ascii_value, 16, &ulval);
443 if (ret) {
444 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
396 return -EINVAL; 445 return -EINVAL;
397 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); 446 }
398 } else 447 value = (s32)lower_32_bits(ulval);
448 } else {
399 return -EINVAL; 449 return -EINVAL;
450 }
400 451
401 pm_qos_req = (struct pm_qos_request_list *)filp->private_data; 452 pm_qos_req = filp->private_data;
402 pm_qos_update_request(pm_qos_req, value); 453 pm_qos_update_request(pm_qos_req, value);
403 454
404 return count; 455 return count;
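Note: with the read() handler and the more forgiving ASCII write parsing added above, /dev/cpu_dma_latency can be driven from a few lines of userspace. A hedged sketch; the 20 microsecond figure is arbitrary, and the request only holds while the fd stays open:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int32_t req = 20, cur = 0;  /* request <= 20 us wakeup latency */
    int fd = open("/dev/cpu_dma_latency", O_RDWR);

    if (fd < 0 || write(fd, &req, sizeof(req)) != sizeof(req)) {
        perror("cpu_dma_latency");
        return 1;
    }
    if (read(fd, &cur, sizeof(cur)) == sizeof(cur))
        printf("current aggregate target: %d us\n", cur);

    pause();    /* the request is dropped when the fd is closed */
    return 0;
}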
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 6842eeba5879..58f405b581e7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock)
37 if (pid == 0) 37 if (pid == 0)
38 return 0; 38 return 0;
39 39
40 read_lock(&tasklist_lock); 40 rcu_read_lock();
41 p = find_task_by_vpid(pid); 41 p = find_task_by_vpid(pid);
42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
43 same_thread_group(p, current) : thread_group_leader(p))) { 43 same_thread_group(p, current) : has_group_leader_pid(p))) {
44 error = -EINVAL; 44 error = -EINVAL;
45 } 45 }
46 read_unlock(&tasklist_lock); 46 rcu_read_unlock();
47 47
48 return error; 48 return error;
49} 49}
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
176 return p->utime; 176 return p->utime;
177} 177}
178 178
179int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 179static int
180posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
180{ 181{
181 int error = check_clock(which_clock); 182 int error = check_clock(which_clock);
182 if (!error) { 183 if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
194 return error; 195 return error;
195} 196}
196 197
197int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 198static int
199posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
198{ 200{
199 /* 201 /*
200 * You can never reset a CPU clock, but we check for other errors 202 * You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
317} 319}
318 320
319 321
320int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 322static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
321{ 323{
322 const pid_t pid = CPUCLOCK_PID(which_clock); 324 const pid_t pid = CPUCLOCK_PID(which_clock);
323 int error = -EINVAL; 325 int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
379 * This is called from sys_timer_create() and do_cpu_nanosleep() with the 381 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
380 * new timer already all-zeros initialized. 382 * new timer already all-zeros initialized.
381 */ 383 */
382int posix_cpu_timer_create(struct k_itimer *new_timer) 384static int posix_cpu_timer_create(struct k_itimer *new_timer)
383{ 385{
384 int ret = 0; 386 int ret = 0;
385 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); 387 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -390,7 +392,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
390 392
391 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 393 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
392 394
393 read_lock(&tasklist_lock); 395 rcu_read_lock();
394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 396 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
395 if (pid == 0) { 397 if (pid == 0) {
396 p = current; 398 p = current;
@@ -404,7 +406,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
404 p = current->group_leader; 406 p = current->group_leader;
405 } else { 407 } else {
406 p = find_task_by_vpid(pid); 408 p = find_task_by_vpid(pid);
407 if (p && !thread_group_leader(p)) 409 if (p && !has_group_leader_pid(p))
408 p = NULL; 410 p = NULL;
409 } 411 }
410 } 412 }
@@ -414,7 +416,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
414 } else { 416 } else {
415 ret = -EINVAL; 417 ret = -EINVAL;
416 } 418 }
417 read_unlock(&tasklist_lock); 419 rcu_read_unlock();
418 420
419 return ret; 421 return ret;
420} 422}
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
425 * If we return TIMER_RETRY, it's necessary to release the timer's lock 427 * If we return TIMER_RETRY, it's necessary to release the timer's lock
426 * and try again. (This happens when the timer is in the middle of firing.) 428 * and try again. (This happens when the timer is in the middle of firing.)
427 */ 429 */
428int posix_cpu_timer_del(struct k_itimer *timer) 430static int posix_cpu_timer_del(struct k_itimer *timer)
429{ 431{
430 struct task_struct *p = timer->it.cpu.task; 432 struct task_struct *p = timer->it.cpu.task;
431 int ret = 0; 433 int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
665 * If we return TIMER_RETRY, it's necessary to release the timer's lock 667 * If we return TIMER_RETRY, it's necessary to release the timer's lock
666 * and try again. (This happens when the timer is in the middle of firing.) 668 * and try again. (This happens when the timer is in the middle of firing.)
667 */ 669 */
668int posix_cpu_timer_set(struct k_itimer *timer, int flags, 670static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
669 struct itimerspec *new, struct itimerspec *old) 671 struct itimerspec *new, struct itimerspec *old)
670{ 672{
671 struct task_struct *p = timer->it.cpu.task; 673 struct task_struct *p = timer->it.cpu.task;
672 union cpu_time_count old_expires, new_expires, old_incr, val; 674 union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
820 return ret; 822 return ret;
821} 823}
822 824
823void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 825static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
824{ 826{
825 union cpu_time_count now; 827 union cpu_time_count now;
826 struct task_struct *p = timer->it.cpu.task; 828 struct task_struct *p = timer->it.cpu.task;
@@ -1345,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1345 1347
1346 /* 1348 /*
1347 * Now that all the timers on our list have the firing flag, 1349 * Now that all the timers on our list have the firing flag,
1348 * noone will touch their list entries but us. We'll take 1350 * no one will touch their list entries but us. We'll take
1349 * each timer's lock before clearing its firing flag, so no 1351 * each timer's lock before clearing its firing flag, so no
1350 * timer call will interfere. 1352 * timer call will interfere.
1351 */ 1353 */
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1481 return error; 1483 return error;
1482} 1484}
1483 1485
1484int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1486static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1485 struct timespec *rqtp, struct timespec __user *rmtp) 1487
1488static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1489 struct timespec *rqtp, struct timespec __user *rmtp)
1486{ 1490{
1487 struct restart_block *restart_block = 1491 struct restart_block *restart_block =
1488 &current_thread_info()->restart_block; 1492 &current_thread_info()->restart_block;
1489 struct itimerspec it; 1493 struct itimerspec it;
1490 int error; 1494 int error;
1491 1495
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1501 1505
1502 if (error == -ERESTART_RESTARTBLOCK) { 1506 if (error == -ERESTART_RESTARTBLOCK) {
1503 1507
1504 if (flags & TIMER_ABSTIME) 1508 if (flags & TIMER_ABSTIME)
1505 return -ERESTARTNOHAND; 1509 return -ERESTARTNOHAND;
1506 /* 1510 /*
1507 * Report back to the user the time still remaining. 1511 * Report back to the user the time still remaining.
1508 */ 1512 */
1509 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1513 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1510 return -EFAULT; 1514 return -EFAULT;
1511 1515
1512 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1513 restart_block->arg0 = which_clock; 1517 restart_block->nanosleep.clockid = which_clock;
1514 restart_block->arg1 = (unsigned long) rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1515 restart_block->arg2 = rqtp->tv_sec; 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1516 restart_block->arg3 = rqtp->tv_nsec;
1517 } 1520 }
1518 return error; 1521 return error;
1519} 1522}
1520 1523
1521long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1522{ 1525{
1523 clockid_t which_clock = restart_block->arg0; 1526 clockid_t which_clock = restart_block->nanosleep.clockid;
1524 struct timespec __user *rmtp;
1525 struct timespec t; 1527 struct timespec t;
1526 struct itimerspec it; 1528 struct itimerspec it;
1527 int error; 1529 int error;
1528 1530
1529 rmtp = (struct timespec __user *) restart_block->arg1; 1531 t = ns_to_timespec(restart_block->nanosleep.expires);
1530 t.tv_sec = restart_block->arg2;
1531 t.tv_nsec = restart_block->arg3;
1532 1532
1533 restart_block->fn = do_no_restart_syscall;
1534 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1533 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1535 1534
1536 if (error == -ERESTART_RESTARTBLOCK) { 1535 if (error == -ERESTART_RESTARTBLOCK) {
1536 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1537 /* 1537 /*
1538 * Report back to the user the time still remaining. 1538 * Report back to the user the time still remaining.
1539 */ 1539 */
1540 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1540 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1541 return -EFAULT; 1541 return -EFAULT;
1542 1542
1543 restart_block->fn = posix_cpu_nsleep_restart; 1543 restart_block->nanosleep.expires = timespec_to_ns(&t);
1544 restart_block->arg0 = which_clock;
1545 restart_block->arg1 = (unsigned long) rmtp;
1546 restart_block->arg2 = t.tv_sec;
1547 restart_block->arg3 = t.tv_nsec;
1548 } 1544 }
1549 return error; 1545 return error;
1550 1546
1551} 1547}
1552 1548
1553
1554#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1549#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1555#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1550#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1556 1551
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1594 timer->it_clock = THREAD_CLOCK; 1589 timer->it_clock = THREAD_CLOCK;
1595 return posix_cpu_timer_create(timer); 1590 return posix_cpu_timer_create(timer);
1596} 1591}
1597static int thread_cpu_nsleep(const clockid_t which_clock, int flags, 1592
1598 struct timespec *rqtp, struct timespec __user *rmtp) 1593struct k_clock clock_posix_cpu = {
1599{ 1594 .clock_getres = posix_cpu_clock_getres,
1600 return -EINVAL; 1595 .clock_set = posix_cpu_clock_set,
1601} 1596 .clock_get = posix_cpu_clock_get,
1602static long thread_cpu_nsleep_restart(struct restart_block *restart_block) 1597 .timer_create = posix_cpu_timer_create,
1603{ 1598 .nsleep = posix_cpu_nsleep,
1604 return -EINVAL; 1599 .nsleep_restart = posix_cpu_nsleep_restart,
1605} 1600 .timer_set = posix_cpu_timer_set,
1601 .timer_del = posix_cpu_timer_del,
1602 .timer_get = posix_cpu_timer_get,
1603};
1606 1604
1607static __init int init_posix_cpu_timers(void) 1605static __init int init_posix_cpu_timers(void)
1608{ 1606{
1609 struct k_clock process = { 1607 struct k_clock process = {
1610 .clock_getres = process_cpu_clock_getres, 1608 .clock_getres = process_cpu_clock_getres,
1611 .clock_get = process_cpu_clock_get, 1609 .clock_get = process_cpu_clock_get,
1612 .clock_set = do_posix_clock_nosettime, 1610 .timer_create = process_cpu_timer_create,
1613 .timer_create = process_cpu_timer_create, 1611 .nsleep = process_cpu_nsleep,
1614 .nsleep = process_cpu_nsleep, 1612 .nsleep_restart = process_cpu_nsleep_restart,
1615 .nsleep_restart = process_cpu_nsleep_restart,
1616 }; 1613 };
1617 struct k_clock thread = { 1614 struct k_clock thread = {
1618 .clock_getres = thread_cpu_clock_getres, 1615 .clock_getres = thread_cpu_clock_getres,
1619 .clock_get = thread_cpu_clock_get, 1616 .clock_get = thread_cpu_clock_get,
1620 .clock_set = do_posix_clock_nosettime, 1617 .timer_create = thread_cpu_timer_create,
1621 .timer_create = thread_cpu_timer_create,
1622 .nsleep = thread_cpu_nsleep,
1623 .nsleep_restart = thread_cpu_nsleep_restart,
1624 }; 1618 };
1625 struct timespec ts; 1619 struct timespec ts;
1626 1620
1627 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1621 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1628 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1622 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1629 1623
1630 cputime_to_timespec(cputime_one_jiffy, &ts); 1624 cputime_to_timespec(cputime_one_jiffy, &ts);
1631 onecputick = ts.tv_nsec; 1625 onecputick = ts.tv_nsec;
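Note: the CPU-clock operations made static above are now reached only through the clock_posix_cpu k_clock, i.e. via the ordinary posix-timer syscalls. A hedged userspace sketch of a process CPU-time watchdog that ends up in posix_cpu_timer_create()/posix_cpu_timer_set(); the signal choice and the 2 second budget are arbitrary, and older glibc needs -lrt:

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct sigevent sev = {
        .sigev_notify = SIGEV_SIGNAL,
        .sigev_signo  = SIGXCPU,
    };
    struct itimerspec its = {
        .it_value = { .tv_sec = 2, .tv_nsec = 0 },  /* one-shot */
    };
    timer_t id;

    if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &id) ||
        timer_settime(id, 0, &its, NULL)) {
        perror("cpu timer");
        return 1;
    }
    for (;;)
        ;   /* burn CPU; SIGXCPU fires after ~2 s of it and terminates us */
}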
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736d..4556182527f3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/posix-clock.h>
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/wait.h> 47#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
81#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 82#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
82#endif 83#endif
83 84
85/*
86 * parisc wants ENOTSUP instead of EOPNOTSUPP
87 */
88#ifndef ENOTSUP
89# define ENANOSLEEP_NOTSUP EOPNOTSUPP
90#else
91# define ENANOSLEEP_NOTSUP ENOTSUP
92#endif
84 93
85/* 94/*
86 * The timer ID is turned into a timer address by idr_find(). 95 * The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
94/* 103/*
95 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us 104 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
96 * to implement others. This structure defines the various 105 * to implement others. This structure defines the various
97 * clocks and allows the possibility of adding others. We 106 * clocks.
98 * provide an interface to add clocks to the table and expect
99 * the "arch" code to add at least one clock that is high
100 * resolution. Here we define the standard CLOCK_REALTIME as a
101 * 1/HZ resolution clock.
102 * 107 *
103 * RESOLUTION: Clock resolution is used to round up timer and interval 108 * RESOLUTION: Clock resolution is used to round up timer and interval
104 * times, NOT to report clock times, which are reported with as 109 * times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
108 * necessary code is written. The standard says we should say 113 * necessary code is written. The standard says we should say
109 * something about this issue in the documentation... 114 * something about this issue in the documentation...
110 * 115 *
111 * FUNCTIONS: The CLOCKs structure defines possible functions to handle 116 * FUNCTIONS: The CLOCKs structure defines possible functions to
112 * various clock functions. For clocks that use the standard 117 * handle various clock functions.
113 * system timer code these entries should be NULL. This will
114 * allow dispatch without the overhead of indirect function
115 * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code.
121 * 118 *
122 * At this time all functions EXCEPT clock_nanosleep can be 119 * The standard POSIX timer management code assumes the
123 * redirected by the CLOCKS structure. Clock_nanosleep is in 120 * following: 1.) The k_itimer struct (sched.h) is used for
124 * there, but the code ignores it. 121 * the timer. 2.) The list, it_lock, it_clock, it_id and
122 * it_pid fields are not modified by timer code.
125 * 123 *
126 * Permissions: It is assumed that the clock_settime() function defined 124 * Permissions: It is assumed that the clock_settime() function defined
127 * for each clock will take care of permission checks. Some 125 * for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
138 */ 136 */
139static int common_nsleep(const clockid_t, int flags, struct timespec *t, 137static int common_nsleep(const clockid_t, int flags, struct timespec *t,
140 struct timespec __user *rmtp); 138 struct timespec __user *rmtp);
139static int common_timer_create(struct k_itimer *new_timer);
141static void common_timer_get(struct k_itimer *, struct itimerspec *); 140static void common_timer_get(struct k_itimer *, struct itimerspec *);
142static int common_timer_set(struct k_itimer *, int, 141static int common_timer_set(struct k_itimer *, int,
143 struct itimerspec *, struct itimerspec *); 142 struct itimerspec *, struct itimerspec *);
@@ -145,83 +144,37 @@ static int common_timer_del(struct k_itimer *timer);
145 144
146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 145static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
147 146
148static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 147static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
148
149#define lock_timer(tid, flags) \
150({ struct k_itimer *__timr; \
151 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
152 __timr; \
153})
149 154
150static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 155static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
151{ 156{
152 spin_unlock_irqrestore(&timr->it_lock, flags); 157 spin_unlock_irqrestore(&timr->it_lock, flags);
153} 158}
154 159
155/* 160/* Get clock_realtime */
156 * Call the k_clock hook function if non-null, or the default function. 161static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
157 */
158#define CLOCK_DISPATCH(clock, call, arglist) \
159 ((clock) < 0 ? posix_cpu_##call arglist : \
160 (posix_clocks[clock].call != NULL \
161 ? (*posix_clocks[clock].call) arglist : common_##call arglist))
162
163/*
164 * Default clock hook functions when the struct k_clock passed
165 * to register_posix_clock leaves a function pointer null.
166 *
167 * The function common_CALL is the default implementation for
168 * the function pointer CALL in struct k_clock.
169 */
170
171static inline int common_clock_getres(const clockid_t which_clock,
172 struct timespec *tp)
173{
174 tp->tv_sec = 0;
175 tp->tv_nsec = posix_clocks[which_clock].res;
176 return 0;
177}
178
179/*
180 * Get real time for posix timers
181 */
182static int common_clock_get(clockid_t which_clock, struct timespec *tp)
183{ 162{
184 ktime_get_real_ts(tp); 163 ktime_get_real_ts(tp);
185 return 0; 164 return 0;
186} 165}
187 166
188static inline int common_clock_set(const clockid_t which_clock, 167/* Set clock_realtime */
189 struct timespec *tp) 168static int posix_clock_realtime_set(const clockid_t which_clock,
169 const struct timespec *tp)
190{ 170{
191 return do_sys_settimeofday(tp, NULL); 171 return do_sys_settimeofday(tp, NULL);
192} 172}
193 173
194static int common_timer_create(struct k_itimer *new_timer) 174static int posix_clock_realtime_adj(const clockid_t which_clock,
195{ 175 struct timex *t)
196 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
197 return 0;
198}
199
200static int no_timer_create(struct k_itimer *new_timer)
201{
202 return -EOPNOTSUPP;
203}
204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{ 176{
208 return -EOPNOTSUPP; 177 return do_adjtimex(t);
209}
210
211/*
212 * Return nonzero if we know a priori this clockid_t value is bogus.
213 */
214static inline int invalid_clockid(const clockid_t which_clock)
215{
216 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */
217 return 0;
218 if ((unsigned) which_clock >= MAX_CLOCKS)
219 return 1;
220 if (posix_clocks[which_clock].clock_getres != NULL)
221 return 0;
222 if (posix_clocks[which_clock].res != 0)
223 return 0;
224 return 1;
225} 178}
226 179
227/* 180/*
@@ -234,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
234} 187}
235 188
236/* 189/*
237 * Get monotonic time for posix timers 190 * Get monotonic-raw time for posix timers
238 */ 191 */
239static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 192static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
240{ 193{
@@ -261,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 214 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 215 return 0;
263} 216}
217
218static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
219{
220 get_monotonic_boottime(tp);
221 return 0;
222}
223
224
264/* 225/*
265 * Initialize everything, well, just everything in Posix clocks/timers ;) 226 * Initialize everything, well, just everything in Posix clocks/timers ;)
266 */ 227 */
267static __init int init_posix_timers(void) 228static __init int init_posix_timers(void)
268{ 229{
269 struct k_clock clock_realtime = { 230 struct k_clock clock_realtime = {
270 .clock_getres = hrtimer_get_res, 231 .clock_getres = hrtimer_get_res,
232 .clock_get = posix_clock_realtime_get,
233 .clock_set = posix_clock_realtime_set,
234 .clock_adj = posix_clock_realtime_adj,
235 .nsleep = common_nsleep,
236 .nsleep_restart = hrtimer_nanosleep_restart,
237 .timer_create = common_timer_create,
238 .timer_set = common_timer_set,
239 .timer_get = common_timer_get,
240 .timer_del = common_timer_del,
271 }; 241 };
272 struct k_clock clock_monotonic = { 242 struct k_clock clock_monotonic = {
273 .clock_getres = hrtimer_get_res, 243 .clock_getres = hrtimer_get_res,
274 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
275 .clock_set = do_posix_clock_nosettime, 245 .nsleep = common_nsleep,
246 .nsleep_restart = hrtimer_nanosleep_restart,
247 .timer_create = common_timer_create,
248 .timer_set = common_timer_set,
249 .timer_get = common_timer_get,
250 .timer_del = common_timer_del,
276 }; 251 };
277 struct k_clock clock_monotonic_raw = { 252 struct k_clock clock_monotonic_raw = {
278 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
279 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
280 .clock_set = do_posix_clock_nosettime,
281 .timer_create = no_timer_create,
282 .nsleep = no_nsleep,
283 }; 255 };
284 struct k_clock clock_realtime_coarse = { 256 struct k_clock clock_realtime_coarse = {
285 .clock_getres = posix_get_coarse_res, 257 .clock_getres = posix_get_coarse_res,
286 .clock_get = posix_get_realtime_coarse, 258 .clock_get = posix_get_realtime_coarse,
287 .clock_set = do_posix_clock_nosettime,
288 .timer_create = no_timer_create,
289 .nsleep = no_nsleep,
290 }; 259 };
291 struct k_clock clock_monotonic_coarse = { 260 struct k_clock clock_monotonic_coarse = {
292 .clock_getres = posix_get_coarse_res, 261 .clock_getres = posix_get_coarse_res,
293 .clock_get = posix_get_monotonic_coarse, 262 .clock_get = posix_get_monotonic_coarse,
294 .clock_set = do_posix_clock_nosettime, 263 };
295 .timer_create = no_timer_create, 264 struct k_clock clock_boottime = {
296 .nsleep = no_nsleep, 265 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime,
267 .nsleep = common_nsleep,
268 .nsleep_restart = hrtimer_nanosleep_restart,
269 .timer_create = common_timer_create,
270 .timer_set = common_timer_set,
271 .timer_get = common_timer_get,
272 .timer_del = common_timer_del,
297 }; 273 };
298 274
299 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 275 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
300 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 276 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
301 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 277 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
302 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
303 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
304 281
305 posix_timers_cache = kmem_cache_create("posix_timers_cache", 282 posix_timers_cache = kmem_cache_create("posix_timers_cache",
306 sizeof (struct k_itimer), 0, SLAB_PANIC, 283 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -336,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr)
336 * restarted (i.e. we have flagged this in the sys_private entry of the 313 * restarted (i.e. we have flagged this in the sys_private entry of the
337 * info block). 314 * info block).
338 * 315 *
339 * To protect aginst the timer going away while the interrupt is queued, 316 * To protect against the timer going away while the interrupt is queued,
340 * we require that the it_requeue_pending flag be set. 317 * we require that the it_requeue_pending flag be set.
341 */ 318 */
342void do_schedule_next_timer(struct siginfo *info) 319void do_schedule_next_timer(struct siginfo *info)
@@ -476,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
476 return task_pid(rtn); 453 return task_pid(rtn);
477} 454}
478 455
479void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 456void posix_timers_register_clock(const clockid_t clock_id,
457 struct k_clock *new_clock)
480{ 458{
481 if ((unsigned) clock_id >= MAX_CLOCKS) { 459 if ((unsigned) clock_id >= MAX_CLOCKS) {
482 printk("POSIX clock register failed for clock_id %d\n", 460 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
461 clock_id);
462 return;
463 }
464
465 if (!new_clock->clock_get) {
466 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
467 clock_id);
468 return;
469 }
470 if (!new_clock->clock_getres) {
471 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
483 clock_id); 472 clock_id);
484 return; 473 return;
485 } 474 }
486 475
487 posix_clocks[clock_id] = *new_clock; 476 posix_clocks[clock_id] = *new_clock;
488} 477}
489EXPORT_SYMBOL_GPL(register_posix_clock); 478EXPORT_SYMBOL_GPL(posix_timers_register_clock);
490 479
491static struct k_itimer * alloc_posix_timer(void) 480static struct k_itimer * alloc_posix_timer(void)
492{ 481{
@@ -502,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
502 return tmr; 491 return tmr;
503} 492}
504 493
494static void k_itimer_rcu_free(struct rcu_head *head)
495{
496 struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
497
498 kmem_cache_free(posix_timers_cache, tmr);
499}
500
505#define IT_ID_SET 1 501#define IT_ID_SET 1
506#define IT_ID_NOT_SET 0 502#define IT_ID_NOT_SET 0
507static void release_posix_timer(struct k_itimer *tmr, int it_id_set) 503static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
@@ -514,7 +510,24 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
514 } 510 }
515 put_pid(tmr->it_pid); 511 put_pid(tmr->it_pid);
516 sigqueue_free(tmr->sigq); 512 sigqueue_free(tmr->sigq);
517 kmem_cache_free(posix_timers_cache, tmr); 513 call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
514}
515
516static struct k_clock *clockid_to_kclock(const clockid_t id)
517{
518 if (id < 0)
519 return (id & CLOCKFD_MASK) == CLOCKFD ?
520 &clock_posix_dynamic : &clock_posix_cpu;
521
522 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
523 return NULL;
524 return &posix_clocks[id];
525}
526
527static int common_timer_create(struct k_itimer *new_timer)
528{
529 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
530 return 0;
518} 531}
519 532
520/* Create a POSIX.1b interval timer. */ 533/* Create a POSIX.1b interval timer. */
@@ -523,13 +536,16 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
523 struct sigevent __user *, timer_event_spec, 536 struct sigevent __user *, timer_event_spec,
524 timer_t __user *, created_timer_id) 537 timer_t __user *, created_timer_id)
525{ 538{
539 struct k_clock *kc = clockid_to_kclock(which_clock);
526 struct k_itimer *new_timer; 540 struct k_itimer *new_timer;
527 int error, new_timer_id; 541 int error, new_timer_id;
528 sigevent_t event; 542 sigevent_t event;
529 int it_id_set = IT_ID_NOT_SET; 543 int it_id_set = IT_ID_NOT_SET;
530 544
531 if (invalid_clockid(which_clock)) 545 if (!kc)
532 return -EINVAL; 546 return -EINVAL;
547 if (!kc->timer_create)
548 return -EOPNOTSUPP;
533 549
534 new_timer = alloc_posix_timer(); 550 new_timer = alloc_posix_timer();
535 if (unlikely(!new_timer)) 551 if (unlikely(!new_timer))
@@ -591,7 +607,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
591 goto out; 607 goto out;
592 } 608 }
593 609
594 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 610 error = kc->timer_create(new_timer);
595 if (error) 611 if (error)
596 goto out; 612 goto out;
597 613
@@ -601,7 +617,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
601 spin_unlock_irq(&current->sighand->siglock); 617 spin_unlock_irq(&current->sighand->siglock);
602 618
603 return 0; 619 return 0;
604 /* 620 /*
605 * In the case of the timer belonging to another task, after 621 * In the case of the timer belonging to another task, after
606 * the task is unlocked, the timer is owned by the other task 622 * the task is unlocked, the timer is owned by the other task
607 * and may cease to exist at any time. Don't use or modify 623 * and may cease to exist at any time. Don't use or modify
@@ -619,25 +635,21 @@ out:
619 * the find to the timer lock. To avoid a dead lock, the timer id MUST 635 * the find to the timer lock. To avoid a dead lock, the timer id MUST
620 * be release with out holding the timer lock. 636 * be release with out holding the timer lock.
621 */ 637 */
622static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) 638static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
623{ 639{
624 struct k_itimer *timr; 640 struct k_itimer *timr;
625 /* 641
626 * Watch out here. We do a irqsave on the idr_lock and pass the 642 rcu_read_lock();
627 * flags part over to the timer lock. Must not let interrupts in
628 * while we are moving the lock.
629 */
630 spin_lock_irqsave(&idr_lock, *flags);
631 timr = idr_find(&posix_timers_id, (int)timer_id); 643 timr = idr_find(&posix_timers_id, (int)timer_id);
632 if (timr) { 644 if (timr) {
633 spin_lock(&timr->it_lock); 645 spin_lock_irqsave(&timr->it_lock, *flags);
634 if (timr->it_signal == current->signal) { 646 if (timr->it_signal == current->signal) {
635 spin_unlock(&idr_lock); 647 rcu_read_unlock();
636 return timr; 648 return timr;
637 } 649 }
638 spin_unlock(&timr->it_lock); 650 spin_unlock_irqrestore(&timr->it_lock, *flags);
639 } 651 }
640 spin_unlock_irqrestore(&idr_lock, *flags); 652 rcu_read_unlock();
641 653
642 return NULL; 654 return NULL;
643} 655}
@@ -703,22 +715,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
703SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 715SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
704 struct itimerspec __user *, setting) 716 struct itimerspec __user *, setting)
705{ 717{
706 struct k_itimer *timr;
707 struct itimerspec cur_setting; 718 struct itimerspec cur_setting;
719 struct k_itimer *timr;
720 struct k_clock *kc;
708 unsigned long flags; 721 unsigned long flags;
722 int ret = 0;
709 723
710 timr = lock_timer(timer_id, &flags); 724 timr = lock_timer(timer_id, &flags);
711 if (!timr) 725 if (!timr)
712 return -EINVAL; 726 return -EINVAL;
713 727
714 CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); 728 kc = clockid_to_kclock(timr->it_clock);
729 if (WARN_ON_ONCE(!kc || !kc->timer_get))
730 ret = -EINVAL;
731 else
732 kc->timer_get(timr, &cur_setting);
715 733
716 unlock_timer(timr, flags); 734 unlock_timer(timr, flags);
717 735
718 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 736 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
719 return -EFAULT; 737 return -EFAULT;
720 738
721 return 0; 739 return ret;
722} 740}
723 741
724/* 742/*
@@ -807,6 +825,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
807 int error = 0; 825 int error = 0;
808 unsigned long flag; 826 unsigned long flag;
809 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 827 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
828 struct k_clock *kc;
810 829
811 if (!new_setting) 830 if (!new_setting)
812 return -EINVAL; 831 return -EINVAL;
@@ -822,8 +841,11 @@ retry:
822 if (!timr) 841 if (!timr)
823 return -EINVAL; 842 return -EINVAL;
824 843
825 error = CLOCK_DISPATCH(timr->it_clock, timer_set, 844 kc = clockid_to_kclock(timr->it_clock);
826 (timr, flags, &new_spec, rtn)); 845 if (WARN_ON_ONCE(!kc || !kc->timer_set))
846 error = -EINVAL;
847 else
848 error = kc->timer_set(timr, flags, &new_spec, rtn);
827 849
828 unlock_timer(timr, flag); 850 unlock_timer(timr, flag);
829 if (error == TIMER_RETRY) { 851 if (error == TIMER_RETRY) {
@@ -838,7 +860,7 @@ retry:
838 return error; 860 return error;
839} 861}
840 862
841static inline int common_timer_del(struct k_itimer *timer) 863static int common_timer_del(struct k_itimer *timer)
842{ 864{
843 timer->it.real.interval.tv64 = 0; 865 timer->it.real.interval.tv64 = 0;
844 866
@@ -849,7 +871,11 @@ static inline int common_timer_del(struct k_itimer *timer)
849 871
850static inline int timer_delete_hook(struct k_itimer *timer) 872static inline int timer_delete_hook(struct k_itimer *timer)
851{ 873{
852 return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); 874 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
875
876 if (WARN_ON_ONCE(!kc || !kc->timer_del))
877 return -EINVAL;
878 return kc->timer_del(timer);
853} 879}
854 880
855/* Delete a POSIX.1b interval timer. */ 881/* Delete a POSIX.1b interval timer. */
@@ -921,69 +947,76 @@ void exit_itimers(struct signal_struct *sig)
921 } 947 }
922} 948}
923 949
924/* Not available / possible... functions */
925int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
926{
927 return -EINVAL;
928}
929EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
930
931int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
932 struct timespec *t, struct timespec __user *r)
933{
934#ifndef ENOTSUP
935 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
936#else /* parisc does define it separately. */
937 return -ENOTSUP;
938#endif
939}
940EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
941
942SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 950SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
943 const struct timespec __user *, tp) 951 const struct timespec __user *, tp)
944{ 952{
953 struct k_clock *kc = clockid_to_kclock(which_clock);
945 struct timespec new_tp; 954 struct timespec new_tp;
946 955
947 if (invalid_clockid(which_clock)) 956 if (!kc || !kc->clock_set)
948 return -EINVAL; 957 return -EINVAL;
958
949 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 959 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
950 return -EFAULT; 960 return -EFAULT;
951 961
952 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 962 return kc->clock_set(which_clock, &new_tp);
953} 963}
954 964
955SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 965SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
956 struct timespec __user *,tp) 966 struct timespec __user *,tp)
957{ 967{
968 struct k_clock *kc = clockid_to_kclock(which_clock);
958 struct timespec kernel_tp; 969 struct timespec kernel_tp;
959 int error; 970 int error;
960 971
961 if (invalid_clockid(which_clock)) 972 if (!kc)
962 return -EINVAL; 973 return -EINVAL;
963 error = CLOCK_DISPATCH(which_clock, clock_get, 974
964 (which_clock, &kernel_tp)); 975 error = kc->clock_get(which_clock, &kernel_tp);
976
965 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 977 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
966 error = -EFAULT; 978 error = -EFAULT;
967 979
968 return error; 980 return error;
981}
982
983SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
984 struct timex __user *, utx)
985{
986 struct k_clock *kc = clockid_to_kclock(which_clock);
987 struct timex ktx;
988 int err;
989
990 if (!kc)
991 return -EINVAL;
992 if (!kc->clock_adj)
993 return -EOPNOTSUPP;
969 994
995 if (copy_from_user(&ktx, utx, sizeof(ktx)))
996 return -EFAULT;
997
998 err = kc->clock_adj(which_clock, &ktx);
999
1000 if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
1001 return -EFAULT;
1002
1003 return err;
970} 1004}
971 1005
972SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 1006SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
973 struct timespec __user *, tp) 1007 struct timespec __user *, tp)
974{ 1008{
1009 struct k_clock *kc = clockid_to_kclock(which_clock);
975 struct timespec rtn_tp; 1010 struct timespec rtn_tp;
976 int error; 1011 int error;
977 1012
978 if (invalid_clockid(which_clock)) 1013 if (!kc)
979 return -EINVAL; 1014 return -EINVAL;
980 1015
981 error = CLOCK_DISPATCH(which_clock, clock_getres, 1016 error = kc->clock_getres(which_clock, &rtn_tp);
982 (which_clock, &rtn_tp));
983 1017
984 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { 1018 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
985 error = -EFAULT; 1019 error = -EFAULT;
986 }
987 1020
988 return error; 1021 return error;
989} 1022}
@@ -1003,10 +1036,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1003 const struct timespec __user *, rqtp, 1036 const struct timespec __user *, rqtp,
1004 struct timespec __user *, rmtp) 1037 struct timespec __user *, rmtp)
1005{ 1038{
1039 struct k_clock *kc = clockid_to_kclock(which_clock);
1006 struct timespec t; 1040 struct timespec t;
1007 1041
1008 if (invalid_clockid(which_clock)) 1042 if (!kc)
1009 return -EINVAL; 1043 return -EINVAL;
1044 if (!kc->nsleep)
1045 return -ENANOSLEEP_NOTSUP;
1010 1046
1011 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1047 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1012 return -EFAULT; 1048 return -EFAULT;
@@ -1014,27 +1050,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1014 if (!timespec_valid(&t)) 1050 if (!timespec_valid(&t))
1015 return -EINVAL; 1051 return -EINVAL;
1016 1052
1017 return CLOCK_DISPATCH(which_clock, nsleep, 1053 return kc->nsleep(which_clock, flags, &t, rmtp);
1018 (which_clock, flags, &t, rmtp));
1019}
1020
1021/*
1022 * nanosleep_restart for monotonic and realtime clocks
1023 */
1024static int common_nsleep_restart(struct restart_block *restart_block)
1025{
1026 return hrtimer_nanosleep_restart(restart_block);
1027} 1054}
1028 1055
1029/* 1056/*
1030 * This will restart clock_nanosleep. This is required only by 1057 * This will restart clock_nanosleep. This is required only by
1031 * compat_clock_nanosleep_restart for now. 1058 * compat_clock_nanosleep_restart for now.
1032 */ 1059 */
1033long 1060long clock_nanosleep_restart(struct restart_block *restart_block)
1034clock_nanosleep_restart(struct restart_block *restart_block)
1035{ 1061{
1036 clockid_t which_clock = restart_block->arg0; 1062 clockid_t which_clock = restart_block->nanosleep.clockid;
1063 struct k_clock *kc = clockid_to_kclock(which_clock);
1064
1065 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1066 return -EINVAL;
1037 1067
1038 return CLOCK_DISPATCH(which_clock, nsleep_restart, 1068 return kc->nsleep_restart(restart_block);
1039 (restart_block));
1040} 1069}
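
Note on the posix-timers hunks above: the CLOCK_DISPATCH() macro is replaced by per-clock method tables. Each clock supplies a struct k_clock, posix_timers_register_clock() refuses entries that lack clock_get() or clock_getres(), and the syscalls (including the new clock_adjtime()) resolve the table via clockid_to_kclock(), returning -EINVAL for unknown clocks, -EOPNOTSUPP when timer_create is absent and -ENANOSLEEP_NOTSUP when nsleep is absent. k_itimer objects are now freed through call_rcu(), which is what lets lock_timer() walk the idr under rcu_read_lock() instead of idr_lock. A minimal sketch of what a new clock would register under this scheme (CLOCK_EXAMPLE and posix_get_example() are placeholders, not part of the patch):

#include <linux/hrtimer.h>
#include <linux/posix-timers.h>
#include <linux/time.h>

/*
 * Sketch only, not part of the patch: CLOCK_EXAMPLE and posix_get_example()
 * are placeholders.  The k_clock fields, posix_timers_register_clock() and
 * the error codes are the ones introduced in the diff above.
 */
static int posix_get_example(const clockid_t which_clock, struct timespec *tp)
{
	ktime_get_ts(tp);	/* assumption: reuse the monotonic time source */
	return 0;
}

static __init int register_example_clock(void)
{
	struct k_clock clock_example = {
		.clock_getres	= hrtimer_get_res,	/* mandatory, or registration is refused */
		.clock_get	= posix_get_example,	/* mandatory as well */
		/* no .clock_set:    clock_settime()   returns -EINVAL            */
		/* no .timer_create: timer_create()    returns -EOPNOTSUPP        */
		/* no .nsleep:       clock_nanosleep() returns -ENANOSLEEP_NOTSUP */
	};

	/* would have to be called from init_posix_timers() or an initcall */
	posix_timers_register_clock(CLOCK_EXAMPLE, &clock_example);
	return 0;
}
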
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ca6066a6952e..87f4d24b55b0 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,128 +1,12 @@
1config PM
2 bool "Power Management support"
3 depends on !IA64_HP_SIM
4 ---help---
5 "Power Management" means that parts of your computer are shut
6 off or put into a power conserving "sleep" mode if they are not
7 being used. There are two competing standards for doing this: APM
8 and ACPI. If you want to use either one, say Y here and then also
9 to the requisite support below.
10
11 Power Management is most important for battery powered laptop
12 computers; if you have a laptop, check out the Linux Laptop home
13 page on the WWW at <http://www.linux-on-laptops.com/> or
14 Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
15 and the Battery Powered Linux mini-HOWTO, available from
16 <http://www.tldp.org/docs.html#howto>.
17
18 Note that, even if you say N here, Linux on the x86 architecture
19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power.
21
22config PM_DEBUG
23 bool "Power Management Debug Support"
24 depends on PM
25 ---help---
26 This option enables various debugging support in the Power Management
27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support.
29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
39config PM_VERBOSE
40 bool "Verbose Power Management debugging"
41 depends on PM_DEBUG
42 default n
43 ---help---
44 This option enables verbose messages from the Power Management code.
45
46config CAN_PM_TRACE
47 def_bool y
48 depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
49
50config PM_TRACE
51 bool
52 help
53 This enables code to save the last PM event point across
54 reboot. The architecture needs to support this, x86 for
55 example does by saving things in the RTC, see below.
56
57 The architecture specific code must provide the extern
58 functions from <linux/resume-trace.h> as well as the
59 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
60
61 The way the information is presented is architecture-
62 dependent, x86 will print the information during a
63 late_initcall.
64
65config PM_TRACE_RTC
66 bool "Suspend/resume event tracing"
67 depends on CAN_PM_TRACE
68 depends on X86
69 select PM_TRACE
70 default n
71 ---help---
72 This enables some cheesy code to save the last PM event point in the
73 RTC across reboots, so that you can debug a machine that just hangs
74 during suspend (or more commonly, during resume).
75
76 To use this debugging feature you should attempt to suspend the
77 machine, reboot it and then run
78
79 dmesg -s 1000000 | grep 'hash matches'
80
81 CAUTION: this option will cause your machine's real-time clock to be
82 set to an invalid time after a resume.
83
84config PM_SLEEP_SMP
85 bool
86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP
89 select HOTPLUG_CPU
90 default y
91
92config PM_SLEEP
93 bool
94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
95 default y
96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
102config SUSPEND_NVS
103 bool
104
105config SUSPEND 1config SUSPEND
106 bool "Suspend to RAM and standby" 2 bool "Suspend to RAM and standby"
107 depends on PM && ARCH_SUSPEND_POSSIBLE 3 depends on ARCH_SUSPEND_POSSIBLE
108 select SUSPEND_NVS if HAS_IOMEM
109 default y 4 default y
110 ---help--- 5 ---help---
111 Allow the system to enter sleep states in which main memory is 6 Allow the system to enter sleep states in which main memory is
112 powered and thus its contents are preserved, such as the 7 powered and thus its contents are preserved, such as the
113 suspend-to-RAM state (e.g. the ACPI S3 state). 8 suspend-to-RAM state (e.g. the ACPI S3 state).
114 9
115config PM_TEST_SUSPEND
116 bool "Test suspend/resume and wakealarm during bootup"
117 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
118 ---help---
119 This option will let you suspend your machine during bootup, and
120 make it wake up a few seconds later using an RTC wakeup alarm.
121 Enable this with a kernel parameter like "test_suspend=mem".
122
123 You probably want to have your system's RTC driver statically
124 linked, ensuring that it's available when this test runs.
125
126config SUSPEND_FREEZER 10config SUSPEND_FREEZER
127 bool "Enable freezer for suspend to RAM/standby" \ 11 bool "Enable freezer for suspend to RAM/standby" \
128 if ARCH_WANTS_FREEZER_CONTROL || BROKEN 12 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
@@ -134,10 +18,15 @@ config SUSPEND_FREEZER
134 18
135 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
136 20
21config HIBERNATE_CALLBACKS
22 bool
23
137config HIBERNATION 24config HIBERNATION
138 bool "Hibernation (aka 'suspend to disk')" 25 bool "Hibernation (aka 'suspend to disk')"
139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 26 depends on SWAP && ARCH_HIBERNATION_POSSIBLE
140 select SUSPEND_NVS if HAS_IOMEM 27 select HIBERNATE_CALLBACKS
28 select LZO_COMPRESS
29 select LZO_DECOMPRESS
141 ---help--- 30 ---help---
142 Enable the suspend to disk (STD) functionality, which is usually 31 Enable the suspend to disk (STD) functionality, which is usually
143 called "hibernation" in user interfaces. STD checkpoints the 32 called "hibernation" in user interfaces. STD checkpoints the
@@ -198,6 +87,100 @@ config PM_STD_PARTITION
198 suspended image to. It will simply pick the first available swap 87 suspended image to. It will simply pick the first available swap
199 device. 88 device.
200 89
90config PM_SLEEP
91 def_bool y
92 depends on SUSPEND || HIBERNATE_CALLBACKS
93
94config PM_SLEEP_SMP
95 def_bool y
96 depends on SMP
97 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
98 depends on PM_SLEEP
99 select HOTPLUG
100 select HOTPLUG_CPU
101
102config PM_RUNTIME
103 bool "Run-time PM core functionality"
104 depends on !IA64_HP_SIM
105 ---help---
106 Enable functionality allowing I/O devices to be put into energy-saving
107 (low power) states at run time (or autosuspended) after a specified
108 period of inactivity and woken up in response to a hardware-generated
109 wake-up event or a driver's request.
110
111 Hardware support is generally required for this functionality to work
112 and the bus type drivers of the buses the devices are on are
113 responsible for the actual handling of the autosuspend requests and
114 wake-up events.
115
116config PM
117 def_bool y
118 depends on PM_SLEEP || PM_RUNTIME
119
120config PM_DEBUG
121 bool "Power Management Debug Support"
122 depends on PM
123 ---help---
124 This option enables various debugging support in the Power Management
125 code. This is helpful when debugging and reporting PM bugs, like
126 suspend support.
127
128config PM_ADVANCED_DEBUG
129 bool "Extra PM attributes in sysfs for low-level debugging/testing"
130 depends on PM_DEBUG
131 ---help---
132 Add extra sysfs attributes allowing one to access some Power Management
133 fields of device objects from user space. If you are not a kernel
134 developer interested in debugging/testing Power Management, say "no".
135
136config PM_TEST_SUSPEND
137 bool "Test suspend/resume and wakealarm during bootup"
138 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
139 ---help---
140 This option will let you suspend your machine during bootup, and
141 make it wake up a few seconds later using an RTC wakeup alarm.
142 Enable this with a kernel parameter like "test_suspend=mem".
143
144 You probably want to have your system's RTC driver statically
145 linked, ensuring that it's available when this test runs.
146
147config CAN_PM_TRACE
148 def_bool y
149 depends on PM_DEBUG && PM_SLEEP
150
151config PM_TRACE
152 bool
153 help
154 This enables code to save the last PM event point across
155 reboot. The architecture needs to support this, x86 for
156 example does by saving things in the RTC, see below.
157
158 The architecture specific code must provide the extern
159 functions from <linux/resume-trace.h> as well as the
160 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
161
162 The way the information is presented is architecture-
163 dependent, x86 will print the information during a
164 late_initcall.
165
166config PM_TRACE_RTC
167 bool "Suspend/resume event tracing"
168 depends on CAN_PM_TRACE
169 depends on X86
170 select PM_TRACE
171 ---help---
172 This enables some cheesy code to save the last PM event point in the
173 RTC across reboots, so that you can debug a machine that just hangs
174 during suspend (or more commonly, during resume).
175
176 To use this debugging feature you should attempt to suspend the
177 machine, reboot it and then run
178
179 dmesg -s 1000000 | grep 'hash matches'
180
181 CAUTION: this option will cause your machine's real-time clock to be
182 set to an invalid time after a resume.
183
201config APM_EMULATION 184config APM_EMULATION
202 tristate "Advanced Power Management Emulation" 185 tristate "Advanced Power Management Emulation"
203 depends on PM && SYS_SUPPORTS_APM_EMULATION 186 depends on PM && SYS_SUPPORTS_APM_EMULATION
@@ -224,21 +207,23 @@ config APM_EMULATION
224 anything, try disabling/enabling this option (or disabling/enabling 207 anything, try disabling/enabling this option (or disabling/enabling
225 APM in your BIOS). 208 APM in your BIOS).
226 209
227config PM_RUNTIME 210config ARCH_HAS_OPP
228 bool "Run-time PM core functionality" 211 bool
229 depends on PM 212
213config PM_OPP
214 bool "Operating Performance Point (OPP) Layer library"
215 depends on ARCH_HAS_OPP
230 ---help--- 216 ---help---
231 Enable functionality allowing I/O devices to be put into energy-saving 217 SOCs have a standard set of tuples consisting of frequency and
232 (low power) states at run time (or autosuspended) after a specified 218 voltage pairs that the device will support per voltage domain. This
233 period of inactivity and woken up in response to a hardware-generated 219 is called Operating Performance Point or OPP. The actual definitions
234 wake-up event or a driver's request. 220 of OPP varies over silicon within the same family of devices.
235 221
236 Hardware support is generally required for this functionality to work 222 OPP layer organizes the data internally using device pointers
237 and the bus type drivers of the buses the devices are on are 223 representing individual voltage domains and provides SOC
238 responsible for the actual handling of the autosuspend requests and 224 implementations a ready to use framework to manage OPPs.
239 wake-up events. 225 For more information, read <file:Documentation/power/opp.txt>
240 226
241config PM_OPS 227config PM_RUNTIME_CLK
242 bool 228 def_bool y
243 depends on PM_SLEEP || PM_RUNTIME 229 depends on PM_RUNTIME && HAVE_CLK
244 default y
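
Note on the kernel/power/Kconfig rework above: the option hierarchy is inverted. SUSPEND and HIBERNATION no longer depend on a user-visible PM prompt; instead PM_SLEEP follows from SUSPEND || HIBERNATE_CALLBACKS, and PM itself becomes a hidden def_bool that is enabled whenever PM_SLEEP or PM_RUNTIME is set (PM_OPS disappears). Driver code keyed on these symbols needs no change; a minimal sketch, with a hypothetical foo device, of the usual guards that keep working under the new layout:

#include <linux/device.h>
#include <linux/pm.h>

#ifdef CONFIG_PM_SLEEP
static int foo_suspend(struct device *dev)
{
	/* quiesce the (hypothetical) foo device for system sleep */
	return 0;
}
#endif

#ifdef CONFIG_PM_RUNTIME
static int foo_runtime_suspend(struct device *dev)
{
	/* autosuspend path, only compiled when run-time PM is enabled */
	return 0;
}
#endif

/*
 * Code guarded by plain CONFIG_PM still builds whenever either block above
 * is enabled, because PM is now selected implicitly through
 * PM_SLEEP || PM_RUNTIME rather than being a separate user prompt.
 */
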
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..c5ebc6a90643 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,5 @@
1 1
2ifeq ($(CONFIG_PM_DEBUG),y) 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3EXTRA_CFLAGS += -DDEBUG
4endif
5 3
6obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_PM_SLEEP) += console.o
@@ -10,6 +8,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 9obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o 10 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
14 11
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 12obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; 31 const int bio_rw = rw | REQ_SYNC;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8dc31e02ae12..8f7b1db1ece1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -23,12 +23,13 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h>
26#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
27#include <asm/suspend.h>
28 28
29#include "power.h" 29#include "power.h"
30 30
31 31
32static int nocompress = 0;
32static int noresume = 0; 33static int noresume = 0;
33static char resume_file[256] = CONFIG_PM_STD_PARTITION; 34static char resume_file[256] = CONFIG_PM_STD_PARTITION;
34dev_t swsusp_resume_device; 35dev_t swsusp_resume_device;
@@ -50,18 +51,17 @@ enum {
50 51
51static int hibernation_mode = HIBERNATION_SHUTDOWN; 52static int hibernation_mode = HIBERNATION_SHUTDOWN;
52 53
53static struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
54 55
55/** 56/**
56 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - Set the global hibernate operations.
57 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: Hibernation operations to use in subsequent hibernation transitions.
58 */ 59 */
59 60void hibernation_set_ops(const struct platform_hibernation_ops *ops)
60void hibernation_set_ops(struct platform_hibernation_ops *ops)
61{ 61{
62 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 62 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
63 && ops->prepare && ops->finish && ops->enter && ops->pre_restore 63 && ops->prepare && ops->finish && ops->enter && ops->pre_restore
64 && ops->restore_cleanup)) { 64 && ops->restore_cleanup && ops->leave)) {
65 WARN_ON(1); 65 WARN_ON(1);
66 return; 66 return;
67 } 67 }
@@ -113,10 +113,9 @@ static int hibernation_test(int level) { return 0; }
113#endif /* !CONFIG_PM_DEBUG */ 113#endif /* !CONFIG_PM_DEBUG */
114 114
115/** 115/**
116 * platform_begin - tell the platform driver that we're starting 116 * platform_begin - Call platform to start hibernation.
117 * hibernation 117 * @platform_mode: Whether or not to use the platform driver.
118 */ 118 */
119
120static int platform_begin(int platform_mode) 119static int platform_begin(int platform_mode)
121{ 120{
122 return (platform_mode && hibernation_ops) ? 121 return (platform_mode && hibernation_ops) ?
@@ -124,10 +123,9 @@ static int platform_begin(int platform_mode)
124} 123}
125 124
126/** 125/**
127 * platform_end - tell the platform driver that we've entered the 126 * platform_end - Call platform to finish transition to the working state.
128 * working state 127 * @platform_mode: Whether or not to use the platform driver.
129 */ 128 */
130
131static void platform_end(int platform_mode) 129static void platform_end(int platform_mode)
132{ 130{
133 if (platform_mode && hibernation_ops) 131 if (platform_mode && hibernation_ops)
@@ -135,8 +133,11 @@ static void platform_end(int platform_mode)
135} 133}
136 134
137/** 135/**
138 * platform_pre_snapshot - prepare the machine for hibernation using the 136 * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
139 * platform driver if so configured and return an error code if it fails 137 * @platform_mode: Whether or not to use the platform driver.
138 *
139 * Use the platform driver to prepare the system for creating a hibernate image,
140 * if so configured, and return an error code if that fails.
140 */ 141 */
141 142
142static int platform_pre_snapshot(int platform_mode) 143static int platform_pre_snapshot(int platform_mode)
@@ -146,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode)
146} 147}
147 148
148/** 149/**
149 * platform_leave - prepare the machine for switching to the normal mode 150 * platform_leave - Call platform to prepare a transition to the working state.
150 * of operation using the platform driver (called with interrupts disabled) 151 * @platform_mode: Whether or not to use the platform driver.
152 *
153 * Use the platform driver prepare to prepare the machine for switching to the
154 * normal mode of operation.
155 *
156 * This routine is called on one CPU with interrupts disabled.
151 */ 157 */
152
153static void platform_leave(int platform_mode) 158static void platform_leave(int platform_mode)
154{ 159{
155 if (platform_mode && hibernation_ops) 160 if (platform_mode && hibernation_ops)
@@ -157,10 +162,14 @@ static void platform_leave(int platform_mode)
157} 162}
158 163
159/** 164/**
160 * platform_finish - switch the machine to the normal mode of operation 165 * platform_finish - Call platform to switch the system to the working state.
161 * using the platform driver (must be called after platform_prepare()) 166 * @platform_mode: Whether or not to use the platform driver.
167 *
168 * Use the platform driver to switch the machine to the normal mode of
169 * operation.
170 *
171 * This routine must be called after platform_prepare().
162 */ 172 */
163
164static void platform_finish(int platform_mode) 173static void platform_finish(int platform_mode)
165{ 174{
166 if (platform_mode && hibernation_ops) 175 if (platform_mode && hibernation_ops)
@@ -168,11 +177,15 @@ static void platform_finish(int platform_mode)
168} 177}
169 178
170/** 179/**
171 * platform_pre_restore - prepare the platform for the restoration from a 180 * platform_pre_restore - Prepare for hibernate image restoration.
172 * hibernation image. If the restore fails after this function has been 181 * @platform_mode: Whether or not to use the platform driver.
173 * called, platform_restore_cleanup() must be called. 182 *
183 * Use the platform driver to prepare the system for resume from a hibernation
184 * image.
185 *
186 * If the restore fails after this function has been called,
187 * platform_restore_cleanup() must be called.
174 */ 188 */
175
176static int platform_pre_restore(int platform_mode) 189static int platform_pre_restore(int platform_mode)
177{ 190{
178 return (platform_mode && hibernation_ops) ? 191 return (platform_mode && hibernation_ops) ?
@@ -180,12 +193,16 @@ static int platform_pre_restore(int platform_mode)
180} 193}
181 194
182/** 195/**
183 * platform_restore_cleanup - switch the platform to the normal mode of 196 * platform_restore_cleanup - Switch to the working state after failing restore.
184 * operation after a failing restore. If platform_pre_restore() has been 197 * @platform_mode: Whether or not to use the platform driver.
185 * called before the failing restore, this function must be called too, 198 *
186 * regardless of the result of platform_pre_restore(). 199 * Use the platform driver to switch the system to the normal mode of operation
200 * after a failing restore.
201 *
202 * If platform_pre_restore() has been called before the failing restore, this
203 * function must be called too, regardless of the result of
204 * platform_pre_restore().
187 */ 205 */
188
189static void platform_restore_cleanup(int platform_mode) 206static void platform_restore_cleanup(int platform_mode)
190{ 207{
191 if (platform_mode && hibernation_ops) 208 if (platform_mode && hibernation_ops)
@@ -193,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode)
193} 210}
194 211
195/** 212/**
196 * platform_recover - recover the platform from a failure to suspend 213 * platform_recover - Recover from a failure to suspend devices.
197 * devices. 214 * @platform_mode: Whether or not to use the platform driver.
198 */ 215 */
199
200static void platform_recover(int platform_mode) 216static void platform_recover(int platform_mode)
201{ 217{
202 if (platform_mode && hibernation_ops && hibernation_ops->recover) 218 if (platform_mode && hibernation_ops && hibernation_ops->recover)
@@ -204,13 +220,12 @@ static void platform_recover(int platform_mode)
204} 220}
205 221
206/** 222/**
207 * swsusp_show_speed - print the time elapsed between two events. 223 * swsusp_show_speed - Print time elapsed between two events during hibernation.
208 * @start: Starting event. 224 * @start: Starting event.
209 * @stop: Final event. 225 * @stop: Final event.
210 * @nr_pages - number of pages processed between @start and @stop 226 * @nr_pages: Number of memory pages processed between @start and @stop.
211 * @msg - introductory message to print 227 * @msg: Additional diagnostic message to print.
212 */ 228 */
213
214void swsusp_show_speed(struct timeval *start, struct timeval *stop, 229void swsusp_show_speed(struct timeval *start, struct timeval *stop,
215 unsigned nr_pages, char *msg) 230 unsigned nr_pages, char *msg)
216{ 231{
@@ -233,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
233} 248}
234 249
235/** 250/**
236 * create_image - freeze devices that need to be frozen with interrupts 251 * create_image - Create a hibernation image.
237 * off, create the hibernation image and thaw those devices. Control 252 * @platform_mode: Whether or not to use the platform driver.
238 * reappears in this routine after a restore. 253 *
254 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
255 * and execute the drivers' .thaw_noirq() callbacks.
256 *
257 * Control reappears in this routine after the subsequent restore.
239 */ 258 */
240
241static int create_image(int platform_mode) 259static int create_image(int platform_mode)
242{ 260{
243 int error; 261 int error;
244 262
245 error = arch_prepare_suspend();
246 if (error)
247 return error;
248
249 /* At this point, dpm_suspend_start() has been called, but *not*
250 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
251 * Otherwise, drivers for some devices (e.g. interrupt controllers)
252 * become desynchronized with the actual state of the hardware
253 * at resume time, and evil weirdness ensues.
254 */
255 error = dpm_suspend_noirq(PMSG_FREEZE); 263 error = dpm_suspend_noirq(PMSG_FREEZE);
256 if (error) { 264 if (error) {
257 printk(KERN_ERR "PM: Some devices failed to power down, " 265 printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -270,14 +278,14 @@ static int create_image(int platform_mode)
270 278
271 local_irq_disable(); 279 local_irq_disable();
272 280
273 error = sysdev_suspend(PMSG_FREEZE); 281 error = syscore_suspend();
274 if (error) { 282 if (error) {
275 printk(KERN_ERR "PM: Some system devices failed to power down, " 283 printk(KERN_ERR "PM: Some system devices failed to power down, "
276 "aborting hibernation\n"); 284 "aborting hibernation\n");
277 goto Enable_irqs; 285 goto Enable_irqs;
278 } 286 }
279 287
280 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) 288 if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
281 goto Power_up; 289 goto Power_up;
282 290
283 in_suspend = 1; 291 in_suspend = 1;
@@ -294,10 +302,7 @@ static int create_image(int platform_mode)
294 } 302 }
295 303
296 Power_up: 304 Power_up:
297 sysdev_resume(); 305 syscore_resume();
298 /* NOTE: dpm_resume_noirq() is just a resume() for devices
299 * that suspended with irqs off ... no overall powerup.
300 */
301 306
302 Enable_irqs: 307 Enable_irqs:
303 local_irq_enable(); 308 local_irq_enable();
@@ -315,31 +320,32 @@ static int create_image(int platform_mode)
315} 320}
316 321
317/** 322/**
318 * hibernation_snapshot - quiesce devices and create the hibernation 323 * hibernation_snapshot - Quiesce devices and create a hibernation image.
319 * snapshot image. 324 * @platform_mode: If set, use platform driver to prepare for the transition.
320 * @platform_mode - if set, use the platform driver, if available, to
321 * prepare the platform firmware for the power transition.
322 * 325 *
323 * Must be called with pm_mutex held 326 * This routine must be called with pm_mutex held.
324 */ 327 */
325
326int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
327{ 329{
330 pm_message_t msg = PMSG_RECOVER;
328 int error; 331 int error;
329 gfp_t saved_mask;
330 332
331 error = platform_begin(platform_mode); 333 error = platform_begin(platform_mode);
332 if (error) 334 if (error)
333 goto Close; 335 goto Close;
334 336
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
335 /* Preallocate image memory before shutting down devices. */ 341 /* Preallocate image memory before shutting down devices. */
336 error = hibernate_preallocate_memory(); 342 error = hibernate_preallocate_memory();
337 if (error) 343 if (error)
338 goto Close; 344 goto Complete_devices;
339 345
340 suspend_console(); 346 suspend_console();
341 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 347 pm_restrict_gfp_mask();
342 error = dpm_suspend_start(PMSG_FREEZE); 348 error = dpm_suspend(PMSG_FREEZE);
343 if (error) 349 if (error)
344 goto Recover_platform; 350 goto Recover_platform;
345 351
@@ -347,17 +353,27 @@ int hibernation_snapshot(int platform_mode)
347 goto Recover_platform; 353 goto Recover_platform;
348 354
349 error = create_image(platform_mode); 355 error = create_image(platform_mode);
350 /* Control returns here after successful restore */ 356 /*
357 * Control returns here (1) after the image has been created or the
358 * image creation has failed and (2) after a successful restore.
359 */
351 360
352 Resume_devices: 361 Resume_devices:
353 /* We may need to release the preallocated image pages here. */ 362 /* We may need to release the preallocated image pages here. */
354 if (error || !in_suspend) 363 if (error || !in_suspend)
355 swsusp_free(); 364 swsusp_free();
356 365
357 dpm_resume_end(in_suspend ? 366 msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
358 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 367 dpm_resume(msg);
359 set_gfp_allowed_mask(saved_mask); 368
369 if (error || !in_suspend)
370 pm_restore_gfp_mask();
371
360 resume_console(); 372 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg);
376
361 Close: 377 Close:
362 platform_end(platform_mode); 378 platform_end(platform_mode);
363 return error; 379 return error;
@@ -368,13 +384,14 @@ int hibernation_snapshot(int platform_mode)
368} 384}
369 385
370/** 386/**
371 * resume_target_kernel - prepare devices that need to be suspended with 387 * resume_target_kernel - Restore system state from a hibernation image.
372 * interrupts off, restore the contents of highmem that have not been 388 * @platform_mode: Whether or not to use the platform driver.
373 * restored yet from the image and run the low level code that will restore 389 *
374 * the remaining contents of memory and switch to the just restored target 390 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
375 * kernel. 391 * highmem that have not been restored yet from the image and run the low-level
392 * code that will restore the remaining contents of memory and switch to the
393 * just restored target kernel.
376 */ 394 */
377
378static int resume_target_kernel(bool platform_mode) 395static int resume_target_kernel(bool platform_mode)
379{ 396{
380 int error; 397 int error;
@@ -396,34 +413,36 @@ static int resume_target_kernel(bool platform_mode)
396 413
397 local_irq_disable(); 414 local_irq_disable();
398 415
399 error = sysdev_suspend(PMSG_QUIESCE); 416 error = syscore_suspend();
400 if (error) 417 if (error)
401 goto Enable_irqs; 418 goto Enable_irqs;
402 419
403 /* We'll ignore saved state, but this gets preempt count (etc) right */
404 save_processor_state(); 420 save_processor_state();
405 error = restore_highmem(); 421 error = restore_highmem();
406 if (!error) { 422 if (!error) {
407 error = swsusp_arch_resume(); 423 error = swsusp_arch_resume();
408 /* 424 /*
409 * The code below is only ever reached in case of a failure. 425 * The code below is only ever reached in case of a failure.
410 * Otherwise execution continues at place where 426 * Otherwise, execution continues at the place where
411 * swsusp_arch_suspend() was called 427 * swsusp_arch_suspend() was called.
412 */ 428 */
413 BUG_ON(!error); 429 BUG_ON(!error);
414 /* This call to restore_highmem() undos the previous one */ 430 /*
431 * This call to restore_highmem() reverts the changes made by
432 * the previous one.
433 */
415 restore_highmem(); 434 restore_highmem();
416 } 435 }
417 /* 436 /*
418 * The only reason why swsusp_arch_resume() can fail is memory being 437 * The only reason why swsusp_arch_resume() can fail is memory being
419 * very tight, so we have to free it as soon as we can to avoid 438 * very tight, so we have to free it as soon as we can to avoid
420 * subsequent failures 439 * subsequent failures.
421 */ 440 */
422 swsusp_free(); 441 swsusp_free();
423 restore_processor_state(); 442 restore_processor_state();
424 touch_softlockup_watchdog(); 443 touch_softlockup_watchdog();
425 444
426 sysdev_resume(); 445 syscore_resume();
427 446
428 Enable_irqs: 447 Enable_irqs:
429 local_irq_enable(); 448 local_irq_enable();
@@ -440,42 +459,36 @@ static int resume_target_kernel(bool platform_mode)
440} 459}
441 460
442/** 461/**
443 * hibernation_restore - quiesce devices and restore the hibernation 462 * hibernation_restore - Quiesce devices and restore from a hibernation image.
444 * snapshot image. If successful, control returns in hibernation_snaphot() 463 * @platform_mode: If set, use platform driver to prepare for the transition.
445 * @platform_mode - if set, use the platform driver, if available, to
446 * prepare the platform firmware for the transition.
447 * 464 *
448 * Must be called with pm_mutex held 465 * This routine must be called with pm_mutex held. If it is successful, control
466 * reappears in the restored target kernel in hibernation_snaphot().
449 */ 467 */
450
451int hibernation_restore(int platform_mode) 468int hibernation_restore(int platform_mode)
452{ 469{
453 int error; 470 int error;
454 gfp_t saved_mask;
455 471
456 pm_prepare_console(); 472 pm_prepare_console();
457 suspend_console(); 473 suspend_console();
458 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 474 pm_restrict_gfp_mask();
459 error = dpm_suspend_start(PMSG_QUIESCE); 475 error = dpm_suspend_start(PMSG_QUIESCE);
460 if (!error) { 476 if (!error) {
461 error = resume_target_kernel(platform_mode); 477 error = resume_target_kernel(platform_mode);
462 dpm_resume_end(PMSG_RECOVER); 478 dpm_resume_end(PMSG_RECOVER);
463 } 479 }
464 set_gfp_allowed_mask(saved_mask); 480 pm_restore_gfp_mask();
465 resume_console(); 481 resume_console();
466 pm_restore_console(); 482 pm_restore_console();
467 return error; 483 return error;
468} 484}
469 485
470/** 486/**
471 * hibernation_platform_enter - enter the hibernation state using the 487 * hibernation_platform_enter - Power off the system using the platform driver.
472 * platform driver (if available)
473 */ 488 */
474
475int hibernation_platform_enter(void) 489int hibernation_platform_enter(void)
476{ 490{
477 int error; 491 int error;
478 gfp_t saved_mask;
479 492
480 if (!hibernation_ops) 493 if (!hibernation_ops)
481 return -ENOSYS; 494 return -ENOSYS;
@@ -491,7 +504,6 @@ int hibernation_platform_enter(void)
491 504
492 entering_platform_hibernation = true; 505 entering_platform_hibernation = true;
493 suspend_console(); 506 suspend_console();
494 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
495 error = dpm_suspend_start(PMSG_HIBERNATE); 507 error = dpm_suspend_start(PMSG_HIBERNATE);
496 if (error) { 508 if (error) {
497 if (hibernation_ops->recover) 509 if (hibernation_ops->recover)
@@ -512,8 +524,8 @@ int hibernation_platform_enter(void)
512 goto Platform_finish; 524 goto Platform_finish;
513 525
514 local_irq_disable(); 526 local_irq_disable();
515 sysdev_suspend(PMSG_HIBERNATE); 527 syscore_suspend();
516 if (!pm_check_wakeup_events()) { 528 if (pm_wakeup_pending()) {
517 error = -EAGAIN; 529 error = -EAGAIN;
518 goto Power_up; 530 goto Power_up;
519 } 531 }
@@ -523,7 +535,7 @@ int hibernation_platform_enter(void)
523 while (1); 535 while (1);
524 536
525 Power_up: 537 Power_up:
526 sysdev_resume(); 538 syscore_resume();
527 local_irq_enable(); 539 local_irq_enable();
528 enable_nonboot_cpus(); 540 enable_nonboot_cpus();
529 541
@@ -535,7 +547,6 @@ int hibernation_platform_enter(void)
535 Resume_devices: 547 Resume_devices:
536 entering_platform_hibernation = false; 548 entering_platform_hibernation = false;
537 dpm_resume_end(PMSG_RESTORE); 549 dpm_resume_end(PMSG_RESTORE);
538 set_gfp_allowed_mask(saved_mask);
539 resume_console(); 550 resume_console();
540 551
541 Close: 552 Close:
@@ -545,12 +556,12 @@ int hibernation_platform_enter(void)
545} 556}
546 557
547/** 558/**
548 * power_down - Shut the machine down for hibernation. 559 * power_down - Shut the machine down for hibernation.
549 * 560 *
550 * Use the platform driver, if configured so; otherwise try 561 * Use the platform driver, if configured, to put the system into the sleep
551 * to power off or reboot. 562 * state corresponding to hibernation, or try to power it off or reboot,
563 * depending on the value of hibernation_mode.
552 */ 564 */
553
554static void power_down(void) 565static void power_down(void)
555{ 566{
556 switch (hibernation_mode) { 567 switch (hibernation_mode) {
@@ -587,9 +598,8 @@ static int prepare_processes(void)
587} 598}
588 599
589/** 600/**
590 * hibernate - The granpappy of the built-in hibernation management 601 * hibernate - Carry out system hibernation, including saving the image.
591 */ 602 */
592
593int hibernate(void) 603int hibernate(void)
594{ 604{
595 int error; 605 int error;
@@ -638,11 +648,15 @@ int hibernate(void)
638 648
639 if (hibernation_mode == HIBERNATION_PLATFORM) 649 if (hibernation_mode == HIBERNATION_PLATFORM)
640 flags |= SF_PLATFORM_MODE; 650 flags |= SF_PLATFORM_MODE;
651 if (nocompress)
652 flags |= SF_NOCOMPRESS_MODE;
641 pr_debug("PM: writing image.\n"); 653 pr_debug("PM: writing image.\n");
642 error = swsusp_write(flags); 654 error = swsusp_write(flags);
643 swsusp_free(); 655 swsusp_free();
644 if (!error) 656 if (!error)
645 power_down(); 657 power_down();
658 in_suspend = 0;
659 pm_restore_gfp_mask();
646 } else { 660 } else {
647 pr_debug("PM: Image restored successfully.\n"); 661 pr_debug("PM: Image restored successfully.\n");
648 } 662 }
@@ -663,17 +677,20 @@ int hibernate(void)
663 677
664 678
665/** 679/**
666 * software_resume - Resume from a saved image. 680 * software_resume - Resume from a saved hibernation image.
681 *
682 * This routine is called as a late initcall, when all devices have been
683 * discovered and initialized already.
667 * 684 *
668 * Called as a late_initcall (so all devices are discovered and 685 * The image reading code is called to see if there is a hibernation image
669 * initialized), we call swsusp to see if we have a saved image or not. 686 * available for reading. If that is the case, devices are quiesced and the
670 * If so, we quiesce devices, the restore the saved image. We will 687 * contents of memory is restored from the saved image.
671 * return above (in hibernate() ) if everything goes well.
672 * Otherwise, we fail gracefully and return to the normally
673 * scheduled program.
674 * 688 *
689 * If this is successful, control reappears in the restored target kernel in
690 * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine
691 * attempts to recover gracefully and make the kernel return to the normal mode
692 * of operation.
675 */ 693 */
676
677static int software_resume(void) 694static int software_resume(void)
678{ 695{
679 int error; 696 int error;
@@ -705,7 +722,7 @@ static int software_resume(void)
705 goto Unlock; 722 goto Unlock;
706 } 723 }
707 724
708 pr_debug("PM: Checking image partition %s\n", resume_file); 725 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
709 726
710 /* Check if the device is there */ 727 /* Check if the device is there */
711 swsusp_resume_device = name_to_dev_t(resume_file); 728 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -730,10 +747,10 @@ static int software_resume(void)
730 } 747 }
731 748
732 Check_image: 749 Check_image:
733 pr_debug("PM: Resume from partition %d:%d\n", 750 pr_debug("PM: Hibernation image partition %d:%d present\n",
734 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); 751 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
735 752
736 pr_debug("PM: Checking hibernation image.\n"); 753 pr_debug("PM: Looking for hibernation image.\n");
737 error = swsusp_check(); 754 error = swsusp_check();
738 if (error) 755 if (error)
739 goto Unlock; 756 goto Unlock;
@@ -765,14 +782,14 @@ static int software_resume(void)
765 goto Done; 782 goto Done;
766 } 783 }
767 784
768 pr_debug("PM: Reading hibernation image.\n"); 785 pr_debug("PM: Loading hibernation image.\n");
769 786
770 error = swsusp_read(&flags); 787 error = swsusp_read(&flags);
771 swsusp_close(FMODE_READ); 788 swsusp_close(FMODE_READ);
772 if (!error) 789 if (!error)
773 hibernation_restore(flags & SF_PLATFORM_MODE); 790 hibernation_restore(flags & SF_PLATFORM_MODE);
774 791
775 printk(KERN_ERR "PM: Restore failed, recovering.\n"); 792 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
776 swsusp_free(); 793 swsusp_free();
777 thaw_processes(); 794 thaw_processes();
778 Done: 795 Done:
@@ -785,7 +802,7 @@ static int software_resume(void)
785 /* For success case, the suspend path will release the lock */ 802 /* For success case, the suspend path will release the lock */
786 Unlock: 803 Unlock:
787 mutex_unlock(&pm_mutex); 804 mutex_unlock(&pm_mutex);
788 pr_debug("PM: Resume from disk failed.\n"); 805 pr_debug("PM: Hibernation image not present or could not be loaded.\n");
789 return error; 806 return error;
790close_finish: 807close_finish:
791 swsusp_close(FMODE_READ); 808 swsusp_close(FMODE_READ);
@@ -803,21 +820,17 @@ static const char * const hibernation_modes[] = {
803 [HIBERNATION_TESTPROC] = "testproc", 820 [HIBERNATION_TESTPROC] = "testproc",
804}; 821};
805 822
806/** 823/*
807 * disk - Control hibernation mode 824 * /sys/power/disk - Control hibernation mode.
808 *
809 * Suspend-to-disk can be handled in several ways. We have a few options
810 * for putting the system to sleep - using the platform driver (e.g. ACPI
811 * or other hibernation_ops), powering off the system or rebooting the
812 * system (for testing) as well as the two test modes.
813 * 825 *
814 * The system can support 'platform', and that is known a priori (and 826 * Hibernation can be handled in several ways. There are a few different ways
815 * encoded by the presence of hibernation_ops). However, the user may 827 * to put the system into the sleep state: using the platform driver (e.g. ACPI
816 * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the 828 * or other hibernation_ops), powering it off or rebooting it (for testing
817 * test modes, 'test' or 'testproc'. 829 * mostly), or using one of the two available test modes.
818 * 830 *
819 * show() will display what the mode is currently set to. 831 * The sysfs file /sys/power/disk provides an interface for selecting the
820 * store() will accept one of 832 * hibernation mode to use. Reading from this file causes the available modes
833 * to be printed. There are 5 modes that can be supported:
821 * 834 *
822 * 'platform' 835 * 'platform'
823 * 'shutdown' 836 * 'shutdown'
@@ -825,8 +838,14 @@ static const char * const hibernation_modes[] = {
825 * 'test' 838 * 'test'
826 * 'testproc' 839 * 'testproc'
827 * 840 *
828 * It will only change to 'platform' if the system 841 * If a platform hibernation driver is in use, 'platform' will be supported
829 * supports it (as determined by having hibernation_ops). 842 * and will be used by default. Otherwise, 'shutdown' will be used by default.
843 * The selected option (i.e. the one corresponding to the current value of
844 * hibernation_mode) is enclosed by a square bracket.
845 *
846 * To select a given hibernation mode it is necessary to write the mode's
847 * string representation (as returned by reading from /sys/power/disk) back
848 * into /sys/power/disk.
830 */ 849 */
831 850
832static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, 851static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -859,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
859 return buf-start; 878 return buf-start;
860} 879}
861 880
862
863static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, 881static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
864 const char *buf, size_t n) 882 const char *buf, size_t n)
865{ 883{
@@ -961,10 +979,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att
961 979
962power_attr(image_size); 980power_attr(image_size);
963 981
982static ssize_t reserved_size_show(struct kobject *kobj,
983 struct kobj_attribute *attr, char *buf)
984{
985 return sprintf(buf, "%lu\n", reserved_size);
986}
987
988static ssize_t reserved_size_store(struct kobject *kobj,
989 struct kobj_attribute *attr,
990 const char *buf, size_t n)
991{
992 unsigned long size;
993
994 if (sscanf(buf, "%lu", &size) == 1) {
995 reserved_size = size;
996 return n;
997 }
998
999 return -EINVAL;
1000}
1001
1002power_attr(reserved_size);
1003
964static struct attribute * g[] = { 1004static struct attribute * g[] = {
965 &disk_attr.attr, 1005 &disk_attr.attr,
966 &resume_attr.attr, 1006 &resume_attr.attr,
967 &image_size_attr.attr, 1007 &image_size_attr.attr,
1008 &reserved_size_attr.attr,
968 NULL, 1009 NULL,
969}; 1010};
970 1011
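reserved_size_store() above accepts a plain decimal byte count (parsed with sscanf("%lu")) and rejects anything else with -EINVAL. A minimal userspace sketch of driving the new /sys/power/reserved_size knob follows; the 32 MB figure is an arbitrary example, not a recommendation.

/* Userspace sketch: reserve extra memory for driver ->freeze() allocations. */
#include <stdio.h>

int main(void)
{
	unsigned long bytes = 32UL * 1024 * 1024;	/* example value only */
	FILE *f = fopen("/sys/power/reserved_size", "w");

	if (!f)
		return 1;
	/* The kernel later rounds this up to whole pages when sizing the image. */
	fprintf(f, "%lu\n", bytes);
	return fclose(f) ? 1 : 0;
}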
@@ -1004,6 +1045,15 @@ static int __init resume_offset_setup(char *str)
1004 return 1; 1045 return 1;
1005} 1046}
1006 1047
1048static int __init hibernate_setup(char *str)
1049{
1050 if (!strncmp(str, "noresume", 8))
1051 noresume = 1;
1052 else if (!strncmp(str, "nocompress", 10))
1053 nocompress = 1;
1054 return 1;
1055}
1056
1007static int __init noresume_setup(char *str) 1057static int __init noresume_setup(char *str)
1008{ 1058{
1009 noresume = 1; 1059 noresume = 1;
@@ -1013,3 +1063,4 @@ static int __init noresume_setup(char *str)
1013__setup("noresume", noresume_setup); 1063__setup("noresume", noresume_setup);
1014__setup("resume_offset=", resume_offset_setup); 1064__setup("resume_offset=", resume_offset_setup);
1015__setup("resume=", resume_setup); 1065__setup("resume=", resume_setup);
1066__setup("hibernate=", hibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 62b0bc6e4983..2981af4ce7cb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,9 +17,6 @@
17 17
18DEFINE_MUTEX(pm_mutex); 18DEFINE_MUTEX(pm_mutex);
19 19
20unsigned int pm_flags;
21EXPORT_SYMBOL(pm_flags);
22
23#ifdef CONFIG_PM_SLEEP 20#ifdef CONFIG_PM_SLEEP
24 21
25/* Routines for PM-transition notifications */ 22/* Routines for PM-transition notifications */
@@ -227,7 +224,7 @@ power_attr(state);
227 * writing to 'state'. It first should read from 'wakeup_count' and store 224 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system 225 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to 226 * transition to a sleep state, it should write the stored value to
230 * 'wakeup_count'. If that fails, at least one wakeup event has occured since 227 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it 228 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there 229 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to. 230 * are any wakeup events detected after 'wakeup_count' was written to.
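The comment above specifies the handshake only in prose. A minimal userspace sketch of it, assuming the caller goes on to write "mem" to /sys/power/state (file names are taken from the comment; error handling is trimmed):

/* Userspace sketch of the wakeup_count handshake before suspending. */
#include <stdio.h>

int main(void)
{
	unsigned int count;
	FILE *f = fopen("/sys/power/wakeup_count", "r");

	if (!f || fscanf(f, "%u", &count) != 1)
		return 1;
	fclose(f);

	/* ... finish this process's own suspend preparations here ... */

	/* Writing the stored value back fails if a wakeup event happened
	 * in the meantime; in that case the suspend must not be started. */
	f = fopen("/sys/power/wakeup_count", "w");
	if (!f || fprintf(f, "%u\n", count) < 0 || fclose(f))
		return 1;

	f = fopen("/sys/power/state", "w");
	if (!f)
		return 1;
	fprintf(f, "mem\n");
	return fclose(f) ? 1 : 0;
}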
@@ -237,18 +234,18 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr, 234 struct kobj_attribute *attr,
238 char *buf) 235 char *buf)
239{ 236{
240 unsigned long val; 237 unsigned int val;
241 238
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; 239 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
243} 240}
244 241
245static ssize_t wakeup_count_store(struct kobject *kobj, 242static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr, 243 struct kobj_attribute *attr,
247 const char *buf, size_t n) 244 const char *buf, size_t n)
248{ 245{
249 unsigned long val; 246 unsigned int val;
250 247
251 if (sscanf(buf, "%lu", &val) == 1) { 248 if (sscanf(buf, "%u", &val) == 1) {
252 if (pm_save_wakeup_count(val)) 249 if (pm_save_wakeup_count(val))
253 return n; 250 return n;
254 } 251 }
@@ -281,12 +278,30 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
281} 278}
282 279
283power_attr(pm_trace); 280power_attr(pm_trace);
281
282static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
283 struct kobj_attribute *attr,
284 char *buf)
285{
286 return show_trace_dev_match(buf, PAGE_SIZE);
287}
288
289static ssize_t
290pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
291 const char *buf, size_t n)
292{
293 return -EINVAL;
294}
295
296power_attr(pm_trace_dev_match);
297
284#endif /* CONFIG_PM_TRACE */ 298#endif /* CONFIG_PM_TRACE */
285 299
286static struct attribute * g[] = { 300static struct attribute * g[] = {
287 &state_attr.attr, 301 &state_attr.attr,
288#ifdef CONFIG_PM_TRACE 302#ifdef CONFIG_PM_TRACE
289 &pm_trace_attr.attr, 303 &pm_trace_attr.attr,
304 &pm_trace_dev_match_attr.attr,
290#endif 305#endif
291#ifdef CONFIG_PM_SLEEP 306#ifdef CONFIG_PM_SLEEP
292 &pm_async_attr.attr, 307 &pm_async_attr.attr,
@@ -308,7 +323,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
308 323
309static int __init pm_start_workqueue(void) 324static int __init pm_start_workqueue(void)
310{ 325{
311 pm_wq = create_freezeable_workqueue("pm"); 326 pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
312 327
313 return pm_wq ? 0 : -ENOMEM; 328 return pm_wq ? 0 : -ENOMEM;
314} 329}
@@ -321,6 +336,8 @@ static int __init pm_init(void)
321 int error = pm_start_workqueue(); 336 int error = pm_start_workqueue();
322 if (error) 337 if (error)
323 return error; 338 return error;
339 hibernate_image_size_init();
340 hibernate_reserved_size_init();
324 power_kobj = kobject_create_and_add("power", NULL); 341 power_kobj = kobject_create_and_add("power", NULL);
325 if (!power_kobj) 342 if (!power_kobj)
326 return -ENOMEM; 343 return -ENOMEM;
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/suspend.h>
15
16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that.
20 */
21
22struct nvs_page {
23 unsigned long phys_start;
24 unsigned int size;
25 void *kaddr;
26 void *data;
27 struct list_head node;
28};
29
30static LIST_HEAD(nvs_list);
31
32/**
33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region
35 * @size - size of the region
36 *
37 * The NVS region need not be page-aligned (both ends) and we arrange
38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages.
40 */
41int suspend_nvs_register(unsigned long start, unsigned long size)
42{
43 struct nvs_page *entry, *next;
44
45 while (size > 0) {
46 unsigned int nr_bytes;
47
48 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
49 if (!entry)
50 goto Error;
51
52 list_add_tail(&entry->node, &nvs_list);
53 entry->phys_start = start;
54 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
55 entry->size = (size < nr_bytes) ? size : nr_bytes;
56
57 start += entry->size;
58 size -= entry->size;
59 }
60 return 0;
61
62 Error:
63 list_for_each_entry_safe(entry, next, &nvs_list, node) {
64 list_del(&entry->node);
65 kfree(entry);
66 }
67 return -ENOMEM;
68}
69
70/**
71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */
73void suspend_nvs_free(void)
74{
75 struct nvs_page *entry;
76
77 list_for_each_entry(entry, &nvs_list, node)
78 if (entry->data) {
79 free_page((unsigned long)entry->data);
80 entry->data = NULL;
81 if (entry->kaddr) {
82 iounmap(entry->kaddr);
83 entry->kaddr = NULL;
84 }
85 }
86}
87
88/**
89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */
91int suspend_nvs_alloc(void)
92{
93 struct nvs_page *entry;
94
95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) {
98 suspend_nvs_free();
99 return -ENOMEM;
100 }
101 }
102 return 0;
103}
104
105/**
106 * suspend_nvs_save - save NVS memory regions
107 */
108void suspend_nvs_save(void)
109{
110 struct nvs_page *entry;
111
112 printk(KERN_INFO "PM: Saving platform NVS memory\n");
113
114 list_for_each_entry(entry, &nvs_list, node)
115 if (entry->data) {
116 entry->kaddr = ioremap(entry->phys_start, entry->size);
117 memcpy(entry->data, entry->kaddr, entry->size);
118 }
119}
120
121/**
122 * suspend_nvs_restore - restore NVS memory regions
123 *
124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */
127void suspend_nvs_restore(void)
128{
129 struct nvs_page *entry;
130
131 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
132
133 list_for_each_entry(entry, &nvs_list, node)
134 if (entry->data)
135 memcpy(entry->kaddr, entry->data, entry->size);
136}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 006270fe382d..9a00a0a26280 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,6 +14,10 @@ struct swsusp_info {
14} __attribute__((aligned(PAGE_SIZE))); 14} __attribute__((aligned(PAGE_SIZE)));
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */
18extern void __init hibernate_reserved_size_init(void);
19extern void __init hibernate_image_size_init(void);
20
17#ifdef CONFIG_ARCH_HIBERNATION_HEADER 21#ifdef CONFIG_ARCH_HIBERNATION_HEADER
18/* Maximum size of architecture specific data in a hibernation header */ 22/* Maximum size of architecture specific data in a hibernation header */
19#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) 23#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
@@ -49,7 +53,12 @@ static inline char *check_image_kernel(struct swsusp_info *info)
49extern int hibernation_snapshot(int platform_mode); 53extern int hibernation_snapshot(int platform_mode);
50extern int hibernation_restore(int platform_mode); 54extern int hibernation_restore(int platform_mode);
51extern int hibernation_platform_enter(void); 55extern int hibernation_platform_enter(void);
52#endif 56
57#else /* !CONFIG_HIBERNATION */
58
59static inline void hibernate_reserved_size_init(void) {}
60static inline void hibernate_image_size_init(void) {}
61#endif /* !CONFIG_HIBERNATION */
53 62
54extern int pfn_is_nosave(unsigned long); 63extern int pfn_is_nosave(unsigned long);
55 64
@@ -65,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \
65 74
66/* Preferred image size in bytes (default 500 MB) */ 75/* Preferred image size in bytes (default 500 MB) */
67extern unsigned long image_size; 76extern unsigned long image_size;
77/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
78extern unsigned long reserved_size;
68extern int in_suspend; 79extern int in_suspend;
69extern dev_t swsusp_resume_device; 80extern dev_t swsusp_resume_device;
70extern sector_t swsusp_resume_block; 81extern sector_t swsusp_resume_block;
@@ -134,6 +145,7 @@ extern int swsusp_swap_in_use(void);
134 * the image header. 145 * the image header.
135 */ 146 */
136#define SF_PLATFORM_MODE 1 147#define SF_PLATFORM_MODE 1
148#define SF_NOCOMPRESS_MODE 2
137 149
138/* kernel/power/hibernate.c */ 150/* kernel/power/hibernate.c */
139extern int swsusp_check(void); 151extern int swsusp_check(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 028a99598f49..0cf3a27a6c9d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,7 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezeable(struct task_struct * p) 25static inline int freezable(struct task_struct * p)
26{ 26{
27 if ((p == current) || 27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) || 28 (p->flags & PF_NOFREEZE) ||
@@ -40,6 +40,7 @@ static int try_to_freeze_tasks(bool sig_only)
40 struct timeval start, end; 40 struct timeval start, end;
41 u64 elapsed_csecs64; 41 u64 elapsed_csecs64;
42 unsigned int elapsed_csecs; 42 unsigned int elapsed_csecs;
43 bool wakeup = false;
43 44
44 do_gettimeofday(&start); 45 do_gettimeofday(&start);
45 46
@@ -52,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only)
52 todo = 0; 53 todo = 0;
53 read_lock(&tasklist_lock); 54 read_lock(&tasklist_lock);
54 do_each_thread(g, p) { 55 do_each_thread(g, p) {
55 if (frozen(p) || !freezeable(p)) 56 if (frozen(p) || !freezable(p))
56 continue; 57 continue;
57 58
58 if (!freeze_task(p, sig_only)) 59 if (!freeze_task(p, sig_only))
@@ -63,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
63 * perturb a task in TASK_STOPPED or TASK_TRACED. 64 * perturb a task in TASK_STOPPED or TASK_TRACED.
64 * It is "frozen enough". If the task does wake 65 * It is "frozen enough". If the task does wake
65 * up, it will immediately call try_to_freeze. 66 * up, it will immediately call try_to_freeze.
67 *
68 * Because freeze_task() goes through p's
69 * scheduler lock after setting TIF_FREEZE, it's
70 * guaranteed that either we see TASK_RUNNING or
71 * try_to_stop() after schedule() in ptrace/signal
72 * stop sees TIF_FREEZE.
66 */ 73 */
67 if (!task_is_stopped_or_traced(p) && 74 if (!task_is_stopped_or_traced(p) &&
68 !freezer_should_skip(p)) 75 !freezer_should_skip(p))
@@ -78,6 +85,11 @@ static int try_to_freeze_tasks(bool sig_only)
78 if (!todo || time_after(jiffies, end_time)) 85 if (!todo || time_after(jiffies, end_time))
79 break; 86 break;
80 87
88 if (pm_wakeup_pending()) {
89 wakeup = true;
90 break;
91 }
92
81 /* 93 /*
82 * We need to retry, but first give the freezing tasks some 94 * We need to retry, but first give the freezing tasks some
83 * time to enter the regrigerator. 95 * time to enter the regrigerator.
@@ -97,8 +109,9 @@ static int try_to_freeze_tasks(bool sig_only)
97 * but it cleans up leftover PF_FREEZE requests. 109 * but it cleans up leftover PF_FREEZE requests.
98 */ 110 */
99 printk("\n"); 111 printk("\n");
100 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 112 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
101 "(%d tasks refusing to freeze, wq_busy=%d):\n", 113 "(%d tasks refusing to freeze, wq_busy=%d):\n",
114 wakeup ? "aborted" : "failed",
102 elapsed_csecs / 100, elapsed_csecs % 100, 115 elapsed_csecs / 100, elapsed_csecs % 100,
103 todo - wq_busy, wq_busy); 116 todo - wq_busy, wq_busy);
104 117
@@ -107,7 +120,7 @@ static int try_to_freeze_tasks(bool sig_only)
107 read_lock(&tasklist_lock); 120 read_lock(&tasklist_lock);
108 do_each_thread(g, p) { 121 do_each_thread(g, p) {
109 task_lock(p); 122 task_lock(p);
110 if (freezing(p) && !freezer_should_skip(p)) 123 if (!wakeup && freezing(p) && !freezer_should_skip(p))
111 sched_show_task(p); 124 sched_show_task(p);
112 cancel_freezing(p); 125 cancel_freezing(p);
113 task_unlock(p); 126 task_unlock(p);
@@ -154,7 +167,7 @@ static void thaw_tasks(bool nosig_only)
154 167
155 read_lock(&tasklist_lock); 168 read_lock(&tasklist_lock);
156 do_each_thread(g, p) { 169 do_each_thread(g, p) {
157 if (!freezeable(p)) 170 if (!freezable(p))
158 continue; 171 continue;
159 172
160 if (nosig_only && should_send_signal(p)) 173 if (nosig_only && should_send_signal(p))
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d3f795f01bbc..06efa54f93d6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -41,12 +41,29 @@ static void swsusp_set_page_forbidden(struct page *);
41static void swsusp_unset_page_forbidden(struct page *); 41static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Number of bytes to reserve for memory allocations made by device drivers
45 * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
46 * cause image creation to fail (tunable via /sys/power/reserved_size).
47 */
48unsigned long reserved_size;
49
50void __init hibernate_reserved_size_init(void)
51{
52 reserved_size = SPARE_PAGES * PAGE_SIZE;
53}
54
55/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 56 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, swsusp will do its best to ensure the image 57 * When it is set to N, swsusp will do its best to ensure the image
46 * size will not exceed N bytes, but if that is impossible, it will 58 * size will not exceed N bytes, but if that is impossible, it will
47 * try to create the smallest image possible. 59 * try to create the smallest image possible.
48 */ 60 */
49unsigned long image_size = 500 * 1024 * 1024; 61unsigned long image_size;
62
63void __init hibernate_image_size_init(void)
64{
65 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
66}
50 67
51/* List of PBEs needed for restoring the pages that were allocated before 68/* List of PBEs needed for restoring the pages that were allocated before
52 * the suspend and included in the suspend image, but have also been 69 * the suspend and included in the suspend image, but have also been
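With this hunk the image_size default changes from a fixed 500 MB to two fifths of total RAM, computed once at boot by hibernate_image_size_init(). A small standalone calculation of the new default, assuming 4 KiB pages and 1 GiB of RAM (both example values):

/* Userspace sketch mirroring hibernate_image_size_init() with example numbers. */
#include <stdio.h>

int main(void)
{
	unsigned long totalram_pages = 262144;	/* 1 GiB of 4 KiB pages */
	unsigned long page_size = 4096;
	unsigned long image_size = ((totalram_pages * 2) / 5) * page_size;

	/* Prints 429494272, i.e. roughly 410 MiB instead of the old 500 MB. */
	printf("default image_size = %lu bytes\n", image_size);
	return 0;
}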
@@ -979,8 +996,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
979 src = kmap_atomic(s_page, KM_USER0); 996 src = kmap_atomic(s_page, KM_USER0);
980 dst = kmap_atomic(d_page, KM_USER1); 997 dst = kmap_atomic(d_page, KM_USER1);
981 do_copy_page(dst, src); 998 do_copy_page(dst, src);
982 kunmap_atomic(src, KM_USER0);
983 kunmap_atomic(dst, KM_USER1); 999 kunmap_atomic(dst, KM_USER1);
1000 kunmap_atomic(src, KM_USER0);
984 } else { 1001 } else {
985 if (PageHighMem(d_page)) { 1002 if (PageHighMem(d_page)) {
986 /* Page pointed to by src may contain some kernel 1003 /* Page pointed to by src may contain some kernel
@@ -988,7 +1005,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
988 */ 1005 */
989 safe_copy_page(buffer, s_page); 1006 safe_copy_page(buffer, s_page);
990 dst = kmap_atomic(d_page, KM_USER0); 1007 dst = kmap_atomic(d_page, KM_USER0);
991 memcpy(dst, buffer, PAGE_SIZE); 1008 copy_page(dst, buffer);
992 kunmap_atomic(dst, KM_USER0); 1009 kunmap_atomic(dst, KM_USER0);
993 } else { 1010 } else {
994 safe_copy_page(page_address(d_page), s_page); 1011 safe_copy_page(page_address(d_page), s_page);
@@ -1194,7 +1211,11 @@ static void free_unnecessary_pages(void)
1194 to_free_highmem = alloc_highmem - save; 1211 to_free_highmem = alloc_highmem - save;
1195 } else { 1212 } else {
1196 to_free_highmem = 0; 1213 to_free_highmem = 0;
1197 to_free_normal -= save - alloc_highmem; 1214 save -= alloc_highmem;
1215 if (to_free_normal > save)
1216 to_free_normal -= save;
1217 else
1218 to_free_normal = 0;
1198 } 1219 }
1199 1220
1200 memory_bm_position_reset(&copy_bm); 1221 memory_bm_position_reset(&copy_bm);
@@ -1258,11 +1279,13 @@ static unsigned long minimum_image_size(unsigned long saveable)
1258 * frame in use. We also need a number of page frames to be free during 1279 * frame in use. We also need a number of page frames to be free during
1259 * hibernation for allocations made while saving the image and for device 1280 * hibernation for allocations made while saving the image and for device
1260 * drivers, in case they need to allocate memory from their hibernation 1281 * drivers, in case they need to allocate memory from their hibernation
1261 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, 1282 * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
1262 * respectively, both of which are rough estimates). To make this happen, we 1283 * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through
1263 * compute the total number of available page frames and allocate at least 1284 * /sys/power/reserved_size, respectively). To make this happen, we compute the
1285 * total number of available page frames and allocate at least
1264 * 1286 *
1265 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES 1287 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
1288 * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
1266 * 1289 *
1267 * of them, which corresponds to the maximum size of a hibernation image. 1290 * of them, which corresponds to the maximum size of a hibernation image.
1268 * 1291 *
@@ -1317,13 +1340,16 @@ int hibernate_preallocate_memory(void)
1317 count -= totalreserve_pages; 1340 count -= totalreserve_pages;
1318 1341
1319 /* Compute the maximum number of saveable pages to leave in memory. */ 1342 /* Compute the maximum number of saveable pages to leave in memory. */
1320 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1343 max_size = (count - (size + PAGES_FOR_IO)) / 2
1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
1345 /* Compute the desired number of image pages specified by image_size. */
1321 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1346 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1322 if (size > max_size) 1347 if (size > max_size)
1323 size = max_size; 1348 size = max_size;
1324 /* 1349 /*
1325 * If the maximum is not less than the current number of saveable pages 1350 * If the desired number of image pages is at least as large as the
1326 * in memory, allocate page frames for the image and we're done. 1351 * current number of saveable pages in memory, allocate page frames for
1352 * the image and we're done.
1327 */ 1353 */
1328 if (size >= saveable) { 1354 if (size >= saveable) {
1329 pages = preallocate_image_highmem(save_highmem); 1355 pages = preallocate_image_highmem(save_highmem);
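The hunk above swaps the fixed SPARE_PAGES term in the max_size computation for one derived from reserved_size. A standalone sketch of the arithmetic with made-up numbers; the PAGES_FOR_IO value here is only a placeholder, the real constant lives in the kernel headers:

/* Userspace sketch of the max_size computation in hibernate_preallocate_memory(). */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long count = 250000;		/* page frames usable for the image */
	unsigned long size = 100;		/* metadata pages */
	unsigned long pages_for_io = 1024;	/* placeholder for PAGES_FOR_IO */
	unsigned long reserved_size = 2UL * 1024 * 1024; /* /sys/power/reserved_size */

	unsigned long max_size = (count - (size + pages_for_io)) / 2
				 - 2 * DIV_ROUND_UP(reserved_size, page_size);

	/* (250000 - 1124) / 2 - 2 * 512 = 124438 - 1024 = 123414 pages */
	printf("max image size: %lu pages\n", max_size);
	return 0;
}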
@@ -1512,11 +1538,8 @@ static int
1512swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1538swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1513 unsigned int nr_pages, unsigned int nr_highmem) 1539 unsigned int nr_pages, unsigned int nr_highmem)
1514{ 1540{
1515 int error = 0;
1516
1517 if (nr_highmem > 0) { 1541 if (nr_highmem > 0) {
1518 error = get_highmem_buffer(PG_ANY); 1542 if (get_highmem_buffer(PG_ANY))
1519 if (error)
1520 goto err_out; 1543 goto err_out;
1521 if (nr_highmem > alloc_highmem) { 1544 if (nr_highmem > alloc_highmem) {
1522 nr_highmem -= alloc_highmem; 1545 nr_highmem -= alloc_highmem;
@@ -1539,7 +1562,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1539 1562
1540 err_out: 1563 err_out:
1541 swsusp_free(); 1564 swsusp_free();
1542 return error; 1565 return -ENOMEM;
1543} 1566}
1544 1567
1545asmlinkage int swsusp_save(void) 1568asmlinkage int swsusp_save(void)
@@ -1680,7 +1703,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1680 memory_bm_position_reset(&orig_bm); 1703 memory_bm_position_reset(&orig_bm);
1681 memory_bm_position_reset(&copy_bm); 1704 memory_bm_position_reset(&copy_bm);
1682 } else if (handle->cur <= nr_meta_pages) { 1705 } else if (handle->cur <= nr_meta_pages) {
1683 memset(buffer, 0, PAGE_SIZE); 1706 clear_page(buffer);
1684 pack_pfns(buffer, &orig_bm); 1707 pack_pfns(buffer, &orig_bm);
1685 } else { 1708 } else {
1686 struct page *page; 1709 struct page *page;
@@ -1694,7 +1717,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1694 void *kaddr; 1717 void *kaddr;
1695 1718
1696 kaddr = kmap_atomic(page, KM_USER0); 1719 kaddr = kmap_atomic(page, KM_USER0);
1697 memcpy(buffer, kaddr, PAGE_SIZE); 1720 copy_page(buffer, kaddr);
1698 kunmap_atomic(kaddr, KM_USER0); 1721 kunmap_atomic(kaddr, KM_USER0);
1699 handle->buffer = buffer; 1722 handle->buffer = buffer;
1700 } else { 1723 } else {
@@ -1977,7 +2000,7 @@ static void copy_last_highmem_page(void)
1977 void *dst; 2000 void *dst;
1978 2001
1979 dst = kmap_atomic(last_highmem_page, KM_USER0); 2002 dst = kmap_atomic(last_highmem_page, KM_USER0);
1980 memcpy(dst, buffer, PAGE_SIZE); 2003 copy_page(dst, buffer);
1981 kunmap_atomic(dst, KM_USER0); 2004 kunmap_atomic(dst, KM_USER0);
1982 last_highmem_page = NULL; 2005 last_highmem_page = NULL;
1983 } 2006 }
@@ -2263,11 +2286,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2263 2286
2264 kaddr1 = kmap_atomic(p1, KM_USER0); 2287 kaddr1 = kmap_atomic(p1, KM_USER0);
2265 kaddr2 = kmap_atomic(p2, KM_USER1); 2288 kaddr2 = kmap_atomic(p2, KM_USER1);
2266 memcpy(buf, kaddr1, PAGE_SIZE); 2289 copy_page(buf, kaddr1);
2267 memcpy(kaddr1, kaddr2, PAGE_SIZE); 2290 copy_page(kaddr1, kaddr2);
2268 memcpy(kaddr2, buf, PAGE_SIZE); 2291 copy_page(kaddr2, buf);
2269 kunmap_atomic(kaddr1, KM_USER0);
2270 kunmap_atomic(kaddr2, KM_USER1); 2292 kunmap_atomic(kaddr2, KM_USER1);
2293 kunmap_atomic(kaddr1, KM_USER0);
2271} 2294}
2272 2295
2273/** 2296/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..1c41ba215419 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,8 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <linux/syscore_ops.h>
26#include <trace/events/power.h>
25 27
26#include "power.h" 28#include "power.h"
27 29
@@ -30,13 +32,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
30 [PM_SUSPEND_MEM] = "mem", 32 [PM_SUSPEND_MEM] = "mem",
31}; 33};
32 34
33static struct platform_suspend_ops *suspend_ops; 35static const struct platform_suspend_ops *suspend_ops;
34 36
35/** 37/**
36 * suspend_set_ops - Set the global suspend method table. 38 * suspend_set_ops - Set the global suspend method table.
37 * @ops: Pointer to ops structure. 39 * @ops: Pointer to ops structure.
38 */ 40 */
39void suspend_set_ops(struct platform_suspend_ops *ops) 41void suspend_set_ops(const struct platform_suspend_ops *ops)
40{ 42{
41 mutex_lock(&pm_mutex); 43 mutex_lock(&pm_mutex);
42 suspend_ops = ops; 44 suspend_ops = ops;
@@ -161,13 +163,13 @@ static int suspend_enter(suspend_state_t state)
161 arch_suspend_disable_irqs(); 163 arch_suspend_disable_irqs();
162 BUG_ON(!irqs_disabled()); 164 BUG_ON(!irqs_disabled());
163 165
164 error = sysdev_suspend(PMSG_SUSPEND); 166 error = syscore_suspend();
165 if (!error) { 167 if (!error) {
166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { 168 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
167 error = suspend_ops->enter(state); 169 error = suspend_ops->enter(state);
168 events_check_enabled = false; 170 events_check_enabled = false;
169 } 171 }
170 sysdev_resume(); 172 syscore_resume();
171 } 173 }
172 174
173 arch_suspend_enable_irqs(); 175 arch_suspend_enable_irqs();
@@ -197,18 +199,17 @@ static int suspend_enter(suspend_state_t state)
197int suspend_devices_and_enter(suspend_state_t state) 199int suspend_devices_and_enter(suspend_state_t state)
198{ 200{
199 int error; 201 int error;
200 gfp_t saved_mask;
201 202
202 if (!suspend_ops) 203 if (!suspend_ops)
203 return -ENOSYS; 204 return -ENOSYS;
204 205
206 trace_machine_suspend(state);
205 if (suspend_ops->begin) { 207 if (suspend_ops->begin) {
206 error = suspend_ops->begin(state); 208 error = suspend_ops->begin(state);
207 if (error) 209 if (error)
208 goto Close; 210 goto Close;
209 } 211 }
210 suspend_console(); 212 suspend_console();
211 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
212 suspend_test_start(); 213 suspend_test_start();
213 error = dpm_suspend_start(PMSG_SUSPEND); 214 error = dpm_suspend_start(PMSG_SUSPEND);
214 if (error) { 215 if (error) {
@@ -219,17 +220,17 @@ int suspend_devices_and_enter(suspend_state_t state)
219 if (suspend_test(TEST_DEVICES)) 220 if (suspend_test(TEST_DEVICES))
220 goto Recover_platform; 221 goto Recover_platform;
221 222
222 suspend_enter(state); 223 error = suspend_enter(state);
223 224
224 Resume_devices: 225 Resume_devices:
225 suspend_test_start(); 226 suspend_test_start();
226 dpm_resume_end(PMSG_RESUME); 227 dpm_resume_end(PMSG_RESUME);
227 suspend_test_finish("resume devices"); 228 suspend_test_finish("resume devices");
228 set_gfp_allowed_mask(saved_mask);
229 resume_console(); 229 resume_console();
230 Close: 230 Close:
231 if (suspend_ops->end) 231 if (suspend_ops->end)
232 suspend_ops->end(); 232 suspend_ops->end();
233 trace_machine_suspend(PWR_EVENT_EXIT);
233 return error; 234 return error;
234 235
235 Recover_platform: 236 Recover_platform:
@@ -285,7 +286,9 @@ int enter_state(suspend_state_t state)
285 goto Finish; 286 goto Finish;
286 287
287 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 288 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
289 pm_restrict_gfp_mask();
288 error = suspend_devices_and_enter(state); 290 error = suspend_devices_and_enter(state);
291 pm_restore_gfp_mask();
289 292
290 Finish: 293 Finish:
291 pr_debug("PM: Finishing wakeup.\n"); 294 pr_debug("PM: Finishing wakeup.\n");
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index e6a5bdf61a37..7c97c3a0eee3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,6 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
9 * 10 *
10 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
11 * 12 *
@@ -24,10 +25,12 @@
24#include <linux/swapops.h> 25#include <linux/swapops.h>
25#include <linux/pm.h> 26#include <linux/pm.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/lzo.h>
29#include <linux/vmalloc.h>
27 30
28#include "power.h" 31#include "power.h"
29 32
30#define SWSUSP_SIG "S1SUSPEND" 33#define HIBERNATE_SIG "S1SUSPEND"
31 34
32/* 35/*
33 * The swap map is a data structure used for keeping track of each page 36 * The swap map is a data structure used for keeping track of each page
@@ -193,7 +196,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 196 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 197 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 198 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 199 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
197 swsusp_header->image = handle->first_sector; 200 swsusp_header->image = handle->first_sector;
198 swsusp_header->flags = flags; 201 swsusp_header->flags = flags;
199 error = hib_bio_write_page(swsusp_resume_block, 202 error = hib_bio_write_page(swsusp_resume_block,
@@ -221,7 +224,7 @@ static int swsusp_swap_check(void)
221 return res; 224 return res;
222 225
223 root_swap = res; 226 root_swap = res;
224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE); 227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
225 if (res) 228 if (res)
226 return res; 229 return res;
227 230
@@ -249,7 +252,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
249 if (bio_chain) { 252 if (bio_chain) {
250 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 253 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
251 if (src) { 254 if (src) {
252 memcpy(src, buf, PAGE_SIZE); 255 copy_page(src, buf);
253 } else { 256 } else {
254 WARN_ON_ONCE(1); 257 WARN_ON_ONCE(1);
255 bio_chain = NULL; /* Go synchronous */ 258 bio_chain = NULL; /* Go synchronous */
@@ -323,7 +326,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
323 error = write_page(handle->cur, handle->cur_swap, NULL); 326 error = write_page(handle->cur, handle->cur_swap, NULL);
324 if (error) 327 if (error)
325 goto out; 328 goto out;
326 memset(handle->cur, 0, PAGE_SIZE); 329 clear_page(handle->cur);
327 handle->cur_swap = offset; 330 handle->cur_swap = offset;
328 handle->k = 0; 331 handle->k = 0;
329 } 332 }
@@ -357,6 +360,18 @@ static int swap_writer_finish(struct swap_map_handle *handle,
357 return error; 360 return error;
358} 361}
359 362
363/* We need to remember how much compressed data we need to read. */
364#define LZO_HEADER sizeof(size_t)
365
366/* Number of pages/bytes we'll compress at one time. */
367#define LZO_UNC_PAGES 32
368#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE)
369
370/* Number of pages/bytes we need for compressed data (worst case). */
371#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
372 LZO_HEADER, PAGE_SIZE)
373#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
374
360/** 375/**
361 * save_image - save the suspend image data 376 * save_image - save the suspend image data
362 */ 377 */
@@ -404,6 +419,137 @@ static int save_image(struct swap_map_handle *handle,
404 return ret; 419 return ret;
405} 420}
406 421
422
423/**
424 * save_image_lzo - Save the suspend image data compressed with LZO.
425 * @handle: Swap map handle to use for saving the image.
426 * @snapshot: Image to read data from.
427 * @nr_to_write: Number of pages to save.
428 */
429static int save_image_lzo(struct swap_map_handle *handle,
430 struct snapshot_handle *snapshot,
431 unsigned int nr_to_write)
432{
433 unsigned int m;
434 int ret = 0;
435 int nr_pages;
436 int err2;
437 struct bio *bio;
438 struct timeval start;
439 struct timeval stop;
440 size_t off, unc_len, cmp_len;
441 unsigned char *unc, *cmp, *wrk, *page;
442
443 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
444 if (!page) {
445 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
446 return -ENOMEM;
447 }
448
449 wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
450 if (!wrk) {
451 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
452 free_page((unsigned long)page);
453 return -ENOMEM;
454 }
455
456 unc = vmalloc(LZO_UNC_SIZE);
457 if (!unc) {
458 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
459 vfree(wrk);
460 free_page((unsigned long)page);
461 return -ENOMEM;
462 }
463
464 cmp = vmalloc(LZO_CMP_SIZE);
465 if (!cmp) {
466 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
467 vfree(unc);
468 vfree(wrk);
469 free_page((unsigned long)page);
470 return -ENOMEM;
471 }
472
473 printk(KERN_INFO
474 "PM: Compressing and saving image data (%u pages) ... ",
475 nr_to_write);
476 m = nr_to_write / 100;
477 if (!m)
478 m = 1;
479 nr_pages = 0;
480 bio = NULL;
481 do_gettimeofday(&start);
482 for (;;) {
483 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
484 ret = snapshot_read_next(snapshot);
485 if (ret < 0)
486 goto out_finish;
487
488 if (!ret)
489 break;
490
491 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
492
493 if (!(nr_pages % m))
494 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
495 nr_pages++;
496 }
497
498 if (!off)
499 break;
500
501 unc_len = off;
502 ret = lzo1x_1_compress(unc, unc_len,
503 cmp + LZO_HEADER, &cmp_len, wrk);
504 if (ret < 0) {
505 printk(KERN_ERR "PM: LZO compression failed\n");
506 break;
507 }
508
509 if (unlikely(!cmp_len ||
510 cmp_len > lzo1x_worst_compress(unc_len))) {
511 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
512 ret = -1;
513 break;
514 }
515
516 *(size_t *)cmp = cmp_len;
517
518 /*
519 * Given we are writing one page at a time to disk, we copy
520 * that much from the buffer, although the last bit will likely
521 * be smaller than full page. This is OK - we saved the length
522 * of the compressed data, so any garbage at the end will be
523 * discarded when we read it.
524 */
525 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
526 memcpy(page, cmp + off, PAGE_SIZE);
527
528 ret = swap_write_page(handle, page, &bio);
529 if (ret)
530 goto out_finish;
531 }
532 }
533
534out_finish:
535 err2 = hib_wait_on_bio_chain(&bio);
536 do_gettimeofday(&stop);
537 if (!ret)
538 ret = err2;
539 if (!ret)
540 printk(KERN_CONT "\b\b\b\bdone\n");
541 else
542 printk(KERN_CONT "\n");
543 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
544
545 vfree(cmp);
546 vfree(unc);
547 vfree(wrk);
548 free_page((unsigned long)page);
549
550 return ret;
551}
552
407/** 553/**
408 * enough_swap - Make sure we have enough swap to save the image. 554 * enough_swap - Make sure we have enough swap to save the image.
409 * 555 *
@@ -411,12 +557,16 @@ static int save_image(struct swap_map_handle *handle,
411 * space avaiable from the resume partition. 557 * space avaiable from the resume partition.
412 */ 558 */
413 559
414static int enough_swap(unsigned int nr_pages) 560static int enough_swap(unsigned int nr_pages, unsigned int flags)
415{ 561{
416 unsigned int free_swap = count_swap_pages(root_swap, 1); 562 unsigned int free_swap = count_swap_pages(root_swap, 1);
563 unsigned int required;
417 564
418 pr_debug("PM: Free swap pages: %u\n", free_swap); 565 pr_debug("PM: Free swap pages: %u\n", free_swap);
419 return free_swap > nr_pages + PAGES_FOR_IO; 566
567 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
568 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
569 return free_swap > required;
420} 570}
421 571
422/** 572/**
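The worst-case sizing behind the new LZO path is easier to see with numbers. Assuming 4 KiB pages and an 8-byte size_t, and taking lzo1x_worst_compress(x) as x + x/16 + 64 + 3 (its definition in <linux/lzo.h> at the time of this merge), LZO_UNC_SIZE is 131072 bytes, the worst-case compressed block is 139339 bytes including the header, and LZO_CMP_PAGES therefore comes out to 35. enough_swap() then budgets roughly 35/32 of the image page count, plus one page and the PAGES_FOR_IO estimate, when compression is in use. A small sketch of the same arithmetic (the 1024 stand-in for PAGES_FOR_IO is illustrative):

/* Userspace recomputation of the LZO sizing macros and the swap budget. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define lzo1x_worst_compress(x)	((x) + ((x) / 16) + 64 + 3)

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long lzo_header = 8;			/* sizeof(size_t) */
	unsigned long lzo_unc_pages = 32;
	unsigned long lzo_unc_size = lzo_unc_pages * page_size;	/* 131072 */
	unsigned long lzo_cmp_pages =
		DIV_ROUND_UP(lzo1x_worst_compress(lzo_unc_size) + lzo_header,
			     page_size);
	unsigned long nr_pages = 100000;		/* image pages to write */
	unsigned long required = 1024			/* stand-in for PAGES_FOR_IO */
		+ (nr_pages * lzo_cmp_pages) / lzo_unc_pages + 1;

	/* Prints: LZO_CMP_PAGES = 35, swap required for 100000 pages: 110400 */
	printf("LZO_CMP_PAGES = %lu, swap required for %lu pages: %lu\n",
	       lzo_cmp_pages, nr_pages, required);
	return 0;
}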
@@ -443,7 +593,7 @@ int swsusp_write(unsigned int flags)
443 printk(KERN_ERR "PM: Cannot get swap writer\n"); 593 printk(KERN_ERR "PM: Cannot get swap writer\n");
444 return error; 594 return error;
445 } 595 }
446 if (!enough_swap(pages)) { 596 if (!enough_swap(pages, flags)) {
447 printk(KERN_ERR "PM: Not enough free swap\n"); 597 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC; 598 error = -ENOSPC;
449 goto out_finish; 599 goto out_finish;
@@ -458,8 +608,11 @@ int swsusp_write(unsigned int flags)
458 } 608 }
459 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
460 error = swap_write_page(&handle, header, NULL); 610 error = swap_write_page(&handle, header, NULL);
461 if (!error) 611 if (!error) {
462 error = save_image(&handle, &snapshot, pages - 1); 612 error = (flags & SF_NOCOMPRESS_MODE) ?
613 save_image(&handle, &snapshot, pages - 1) :
614 save_image_lzo(&handle, &snapshot, pages - 1);
615 }
463out_finish: 616out_finish:
464 error = swap_writer_finish(&handle, flags, error); 617 error = swap_writer_finish(&handle, flags, error);
465 return error; 618 return error;
@@ -590,9 +743,152 @@ static int load_image(struct swap_map_handle *handle,
590} 743}
591 744
592/** 745/**
746 * load_image_lzo - Load compressed image data and decompress them with LZO.
747 * @handle: Swap map handle to use for loading data.
748 * @snapshot: Image to copy uncompressed data into.
749 * @nr_to_read: Number of pages to load.
750 */
751static int load_image_lzo(struct swap_map_handle *handle,
752 struct snapshot_handle *snapshot,
753 unsigned int nr_to_read)
754{
755 unsigned int m;
756 int error = 0;
757 struct bio *bio;
758 struct timeval start;
759 struct timeval stop;
760 unsigned nr_pages;
761 size_t i, off, unc_len, cmp_len;
762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
763
764 for (i = 0; i < LZO_CMP_PAGES; i++) {
765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
766 if (!page[i]) {
767 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
768
769 while (i)
770 free_page((unsigned long)page[--i]);
771
772 return -ENOMEM;
773 }
774 }
775
776 unc = vmalloc(LZO_UNC_SIZE);
777 if (!unc) {
778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
779
780 for (i = 0; i < LZO_CMP_PAGES; i++)
781 free_page((unsigned long)page[i]);
782
783 return -ENOMEM;
784 }
785
786 cmp = vmalloc(LZO_CMP_SIZE);
787 if (!cmp) {
788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
789
790 vfree(unc);
791 for (i = 0; i < LZO_CMP_PAGES; i++)
792 free_page((unsigned long)page[i]);
793
794 return -ENOMEM;
795 }
796
797 printk(KERN_INFO
798 "PM: Loading and decompressing image data (%u pages) ... ",
799 nr_to_read);
800 m = nr_to_read / 100;
801 if (!m)
802 m = 1;
803 nr_pages = 0;
804 bio = NULL;
805 do_gettimeofday(&start);
806
807 error = snapshot_write_next(snapshot);
808 if (error <= 0)
809 goto out_finish;
810
811 for (;;) {
812 error = swap_read_page(handle, page[0], NULL); /* sync */
813 if (error)
814 break;
815
816 cmp_len = *(size_t *)page[0];
817 if (unlikely(!cmp_len ||
818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
819 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
820 error = -1;
821 break;
822 }
823
824 for (off = PAGE_SIZE, i = 1;
825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
826 error = swap_read_page(handle, page[i], &bio);
827 if (error)
828 goto out_finish;
829 }
830
831 error = hib_wait_on_bio_chain(&bio); /* need all data now */
832 if (error)
833 goto out_finish;
834
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
838 }
839
840 unc_len = LZO_UNC_SIZE;
841 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
842 unc, &unc_len);
843 if (error < 0) {
844 printk(KERN_ERR "PM: LZO decompression failed\n");
845 break;
846 }
847
848 if (unlikely(!unc_len ||
849 unc_len > LZO_UNC_SIZE ||
850 unc_len & (PAGE_SIZE - 1))) {
851 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
852 error = -1;
853 break;
854 }
855
856 for (off = 0; off < unc_len; off += PAGE_SIZE) {
857 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
858
859 if (!(nr_pages % m))
860 printk("\b\b\b\b%3d%%", nr_pages / m);
861 nr_pages++;
862
863 error = snapshot_write_next(snapshot);
864 if (error <= 0)
865 goto out_finish;
866 }
867 }
868
869out_finish:
870 do_gettimeofday(&stop);
871 if (!error) {
872 printk("\b\b\b\bdone\n");
873 snapshot_write_finalize(snapshot);
874 if (!snapshot_image_loaded(snapshot))
875 error = -ENODATA;
876 } else
877 printk("\n");
878 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
879
880 vfree(cmp);
881 vfree(unc);
882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]);
884
885 return error;
886}
887
888/**
593 * swsusp_read - read the hibernation image. 889 * swsusp_read - read the hibernation image.
594 * @flags_p: flags passed by the "frozen" kernel in the image header should 890 * @flags_p: flags passed by the "frozen" kernel in the image header should
595 * be written into this memeory location 891 * be written into this memory location
596 */ 892 */
597 893
598int swsusp_read(unsigned int *flags_p) 894int swsusp_read(unsigned int *flags_p)
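load_image_lzo() above recovers each compressed block by reading one page synchronously, taking its leading size_t as the block length, and then reading the rest of the block before decompressing; any padding past the stored length is ignored. A small sketch of that record layout as both sides understand it (illustrative only; the helper is invented here):

/* Userspace sketch of one compressed record as framed by save_image_lzo(). */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096u

/* Each record: a size_t length header (LZO_HEADER), then that many
 * compressed bytes, padded with don't-care bytes to a page boundary. */
static size_t record_pages(const unsigned char *first_page)
{
	size_t cmp_len;

	memcpy(&cmp_len, first_page, sizeof(cmp_len));
	return (sizeof(cmp_len) + cmp_len + PAGE_SIZE - 1) / PAGE_SIZE;
}

int main(void)
{
	unsigned char page[PAGE_SIZE] = { 0 };
	size_t cmp_len = 10000;		/* pretend 10000 compressed bytes follow */

	memcpy(page, &cmp_len, sizeof(cmp_len));
	/* A 10000-byte block plus its 8-byte header spans 3 pages on disk. */
	printf("record spans %zu pages\n", record_pages(page));
	return 0;
}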
@@ -612,8 +908,11 @@ int swsusp_read(unsigned int *flags_p)
612 goto end; 908 goto end;
613 if (!error) 909 if (!error)
614 error = swap_read_page(&handle, header, NULL); 910 error = swap_read_page(&handle, header, NULL);
615 if (!error) 911 if (!error) {
616 error = load_image(&handle, &snapshot, header->pages - 1); 912 error = (*flags_p & SF_NOCOMPRESS_MODE) ?
913 load_image(&handle, &snapshot, header->pages - 1) :
914 load_image_lzo(&handle, &snapshot, header->pages - 1);
915 }
617 swap_reader_finish(&handle); 916 swap_reader_finish(&handle);
618end: 917end:
619 if (!error) 918 if (!error)
@@ -631,16 +930,17 @@ int swsusp_check(void)
631{ 930{
632 int error; 931 int error;
633 932
634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
934 FMODE_READ, NULL);
635 if (!IS_ERR(hib_resume_bdev)) { 935 if (!IS_ERR(hib_resume_bdev)) {
636 set_blocksize(hib_resume_bdev, PAGE_SIZE); 936 set_blocksize(hib_resume_bdev, PAGE_SIZE);
637 memset(swsusp_header, 0, PAGE_SIZE); 937 clear_page(swsusp_header);
638 error = hib_bio_read_page(swsusp_resume_block, 938 error = hib_bio_read_page(swsusp_resume_block,
639 swsusp_header, NULL); 939 swsusp_header, NULL);
640 if (error) 940 if (error)
641 goto put; 941 goto put;
642 942
643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 943 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 944 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
645 /* Reset swap signature now */ 945 /* Reset swap signature now */
646 error = hib_bio_write_page(swsusp_resume_block, 946 error = hib_bio_write_page(swsusp_resume_block,
@@ -653,13 +953,13 @@ put:
653 if (error) 953 if (error)
654 blkdev_put(hib_resume_bdev, FMODE_READ); 954 blkdev_put(hib_resume_bdev, FMODE_READ);
655 else 955 else
656 pr_debug("PM: Signature found, resuming\n"); 956 pr_debug("PM: Image signature found, resuming\n");
657 } else { 957 } else {
658 error = PTR_ERR(hib_resume_bdev); 958 error = PTR_ERR(hib_resume_bdev);
659 } 959 }
660 960
661 if (error) 961 if (error)
662 pr_debug("PM: Error %d checking image file\n", error); 962 pr_debug("PM: Image not found (code %d)\n", error);
663 963
664 return error; 964 return error;
665} 965}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..42ddbc6f0de6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
113 if (error) 113 if (error)
114 pm_notifier_call_chain(PM_POST_RESTORE); 114 pm_notifier_call_chain(PM_POST_RESTORE);
115 } 115 }
116 if (error) 116 if (error) {
117 free_basic_memory_bitmaps();
117 atomic_inc(&snapshot_device_available); 118 atomic_inc(&snapshot_device_available);
119 }
118 data->frozen = 0; 120 data->frozen = 0;
119 data->ready = 0; 121 data->ready = 0;
120 data->platform_support = 0; 122 data->platform_support = 0;
@@ -135,9 +137,11 @@ static int snapshot_release(struct inode *inode, struct file *filp)
135 free_basic_memory_bitmaps(); 137 free_basic_memory_bitmaps();
136 data = filp->private_data; 138 data = filp->private_data;
137 free_all_swap_pages(data->swap); 139 free_all_swap_pages(data->swap);
138 if (data->frozen) 140 if (data->frozen) {
141 pm_restore_gfp_mask();
139 thaw_processes(); 142 thaw_processes();
140 pm_notifier_call_chain(data->mode == O_WRONLY ? 143 }
144 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 145 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 146 atomic_inc(&snapshot_device_available);
143 147
@@ -263,6 +267,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
263 case SNAPSHOT_UNFREEZE: 267 case SNAPSHOT_UNFREEZE:
264 if (!data->frozen || data->ready) 268 if (!data->frozen || data->ready)
265 break; 269 break;
270 pm_restore_gfp_mask();
266 thaw_processes(); 271 thaw_processes();
267 usermodehelper_enable(); 272 usermodehelper_enable();
268 data->frozen = 0; 273 data->frozen = 0;
@@ -275,6 +280,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 error = -EPERM; 280 error = -EPERM;
276 break; 281 break;
277 } 282 }
283 pm_restore_gfp_mask();
278 error = hibernation_snapshot(data->platform_support); 284 error = hibernation_snapshot(data->platform_support);
279 if (!error) 285 if (!error)
280 error = put_user(in_suspend, (int __user *)arg); 286 error = put_user(in_suspend, (int __user *)arg);
@@ -377,6 +383,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
377 * PM_HIBERNATION_PREPARE 383 * PM_HIBERNATION_PREPARE
378 */ 384 */
379 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 385 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
386 data->ready = 0;
380 break; 387 break;
381 388
382 case SNAPSHOT_PLATFORM_SUPPORT: 389 case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/printk.c b/kernel/printk.c
index 9dc8ea140426..b799a2ee96e5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
31#include <linux/smp.h> 31#include <linux/smp.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
35#include <linux/kexec.h> 36#include <linux/kexec.h>
36#include <linux/kdb.h> 37#include <linux/kdb.h>
@@ -39,16 +40,11 @@
39#include <linux/syslog.h> 40#include <linux/syslog.h>
40#include <linux/cpu.h> 41#include <linux/cpu.h>
41#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h>
42 44
43#include <asm/uaccess.h> 45#include <asm/uaccess.h>
44 46
45/* 47/*
46 * for_each_console() allows you to iterate on each console
47 */
48#define for_each_console(con) \
49 for (con = console_drivers; con != NULL; con = con->next)
50
51/*
52 * Architectures can override it: 48 * Architectures can override it:
53 */ 49 */
54void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 50void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -58,7 +54,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
58#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 54#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
59 55
60/* printk's without a loglevel use this.. */ 56/* printk's without a loglevel use this.. */
61#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ 57#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
62 58
63/* We show everything that is MORE important than this.. */ 59/* We show everything that is MORE important than this.. */
64#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 60#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -92,7 +88,7 @@ EXPORT_SYMBOL(oops_in_progress);
92 * provides serialisation for access to the entire console 88 * provides serialisation for access to the entire console
93 * driver system. 89 * driver system.
94 */ 90 */
95static DECLARE_MUTEX(console_sem); 91static DEFINE_SEMAPHORE(console_sem);
96struct console *console_drivers; 92struct console *console_drivers;
97EXPORT_SYMBOL_GPL(console_drivers); 93EXPORT_SYMBOL_GPL(console_drivers);
98 94
@@ -109,7 +105,7 @@ static int console_locked, console_suspended;
109/* 105/*
110 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars 106 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
111 * It is also used in interesting ways to provide interlocking in 107 * It is also used in interesting ways to provide interlocking in
112 * release_console_sem(). 108 * console_unlock();.
113 */ 109 */
114static DEFINE_SPINLOCK(logbuf_lock); 110static DEFINE_SPINLOCK(logbuf_lock);
115 111
@@ -125,6 +121,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
125static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ 121static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
126 122
127/* 123/*
124 * If exclusive_console is non-NULL then only this console is to be printed to.
125 */
126static struct console *exclusive_console;
127
128/*
128 * Array of consoles built from command line options (console=) 129 * Array of consoles built from command line options (console=)
129 */ 130 */
130struct console_cmdline 131struct console_cmdline
@@ -174,50 +175,78 @@ void log_buf_kexec_setup(void)
174} 175}
175#endif 176#endif
176 177
178/* requested log_buf_len from kernel cmdline */
179static unsigned long __initdata new_log_buf_len;
180
181/* save requested log_buf_len since it's too early to process it */
177static int __init log_buf_len_setup(char *str) 182static int __init log_buf_len_setup(char *str)
178{ 183{
179 unsigned size = memparse(str, &str); 184 unsigned size = memparse(str, &str);
180 unsigned long flags;
181 185
182 if (size) 186 if (size)
183 size = roundup_pow_of_two(size); 187 size = roundup_pow_of_two(size);
184 if (size > log_buf_len) { 188 if (size > log_buf_len)
185 unsigned start, dest_idx, offset; 189 new_log_buf_len = size;
186 char *new_log_buf;
187 190
188 new_log_buf = alloc_bootmem(size); 191 return 0;
189 if (!new_log_buf) { 192}
190 printk(KERN_WARNING "log_buf_len: allocation failed\n"); 193early_param("log_buf_len", log_buf_len_setup);
191 goto out;
192 }
193 194
194 spin_lock_irqsave(&logbuf_lock, flags); 195void __init setup_log_buf(int early)
195 log_buf_len = size; 196{
196 log_buf = new_log_buf; 197 unsigned long flags;
197 198 unsigned start, dest_idx, offset;
198 offset = start = min(con_start, log_start); 199 char *new_log_buf;
199 dest_idx = 0; 200 int free;
200 while (start != log_end) { 201
201 log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; 202 if (!new_log_buf_len)
202 start++; 203 return;
203 dest_idx++; 204
204 } 205 if (early) {
205 log_start -= offset; 206 unsigned long mem;
206 con_start -= offset;
207 log_end -= offset;
208 spin_unlock_irqrestore(&logbuf_lock, flags);
209 207
210 printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); 208 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
209 if (mem == MEMBLOCK_ERROR)
210 return;
211 new_log_buf = __va(mem);
212 } else {
213 new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
211 } 214 }
212out:
213 return 1;
214}
215 215
216__setup("log_buf_len=", log_buf_len_setup); 216 if (unlikely(!new_log_buf)) {
217 pr_err("log_buf_len: %ld bytes not available\n",
218 new_log_buf_len);
219 return;
220 }
221
222 spin_lock_irqsave(&logbuf_lock, flags);
223 log_buf_len = new_log_buf_len;
224 log_buf = new_log_buf;
225 new_log_buf_len = 0;
226 free = __LOG_BUF_LEN - log_end;
227
228 offset = start = min(con_start, log_start);
229 dest_idx = 0;
230 while (start != log_end) {
231 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
232
233 log_buf[dest_idx] = __log_buf[log_idx_mask];
234 start++;
235 dest_idx++;
236 }
237 log_start -= offset;
238 con_start -= offset;
239 log_end -= offset;
240 spin_unlock_irqrestore(&logbuf_lock, flags);
241
242 pr_info("log_buf_len: %d\n", log_buf_len);
243 pr_info("early log buf free: %d(%d%%)\n",
244 free, (free * 100) / __LOG_BUF_LEN);
245}
217 246
218#ifdef CONFIG_BOOT_PRINTK_DELAY 247#ifdef CONFIG_BOOT_PRINTK_DELAY
219 248
220static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 249static int boot_delay; /* msecs delay after each printk during bootup */
221static unsigned long long loops_per_msec; /* based on boot_delay */ 250static unsigned long long loops_per_msec; /* based on boot_delay */
222 251
223static int __init boot_delay_setup(char *str) 252static int __init boot_delay_setup(char *str)
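The reworked log_buf_len handling above is split in two: the early_param handler only records the rounded-up request in new_log_buf_len, and setup_log_buf() performs the allocation later, via memblock when called early or bootmem otherwise, then copies the existing buffer contents across under logbuf_lock. In practice this is driven by booting with something like log_buf_len=1M. A minimal sketch of the same save-now/allocate-later pattern; this is a generic illustration, not the kernel's code path:

/* Userspace sketch of deferring a buffer resize until allocation is possible. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char static_buf[64];		/* stands in for __log_buf */
static char *buf = static_buf;
static size_t buf_len = sizeof(static_buf);
static size_t requested_len;		/* stands in for new_log_buf_len */

/* Parse step: only remember the request (cf. log_buf_len_setup()). */
static void parse_bootarg(size_t len)
{
	if (len > buf_len)
		requested_len = len;
}

/* Later step: allocate, copy what was logged so far, switch over
 * (cf. setup_log_buf()). */
static void setup_buf(void)
{
	char *new_buf;

	if (!requested_len)
		return;
	new_buf = malloc(requested_len);
	if (!new_buf)
		return;
	memcpy(new_buf, buf, buf_len);
	buf = new_buf;
	buf_len = requested_len;
	requested_len = 0;
}

int main(void)
{
	parse_bootarg(1 << 20);
	setup_buf();
	printf("log buffer is now %zu bytes\n", buf_len);
	return 0;
}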
@@ -268,14 +297,55 @@ static inline void boot_delay_msec(void)
268} 297}
269#endif 298#endif
270 299
300#ifdef CONFIG_SECURITY_DMESG_RESTRICT
301int dmesg_restrict = 1;
302#else
303int dmesg_restrict;
304#endif
305
306static int syslog_action_restricted(int type)
307{
308 if (dmesg_restrict)
309 return 1;
310 /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
311 return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
312}
313
314static int check_syslog_permissions(int type, bool from_file)
315{
316 /*
317 * If this is from /proc/kmsg and we've already opened it, then we've
318 * already done the capabilities checks at open time.
319 */
320 if (from_file && type != SYSLOG_ACTION_OPEN)
321 return 0;
322
323 if (syslog_action_restricted(type)) {
324 if (capable(CAP_SYSLOG))
325 return 0;
326 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
327 if (capable(CAP_SYS_ADMIN)) {
328 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
329 "but no CAP_SYSLOG (deprecated).\n");
330 return 0;
331 }
332 return -EPERM;
333 }
334 return 0;
335}
336
271int do_syslog(int type, char __user *buf, int len, bool from_file) 337int do_syslog(int type, char __user *buf, int len, bool from_file)
272{ 338{
273 unsigned i, j, limit, count; 339 unsigned i, j, limit, count;
274 int do_clear = 0; 340 int do_clear = 0;
275 char c; 341 char c;
276 int error = 0; 342 int error;
343
344 error = check_syslog_permissions(type, from_file);
345 if (error)
346 goto out;
277 347
278 error = security_syslog(type, from_file); 348 error = security_syslog(type);
279 if (error) 349 if (error)
280 return error; 350 return error;
281 351
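check_syslog_permissions() above gates the privileged syslog actions on CAP_SYSLOG, still accepting CAP_SYS_ADMIN with a deprecation warning, and once dmesg_restrict is set it also covers the otherwise unprivileged 'read all' and 'buffer size' actions. From userspace the refusal shows up as EPERM from syslog(2); a minimal probe using glibc's klogctl(), with action 3 (SYSLOG_ACTION_READ_ALL) taken from the syslog(2) man page:

/* Userspace probe: does the kernel let us read the log buffer? */
#include <errno.h>
#include <stdio.h>
#include <sys/klog.h>

int main(void)
{
	char buf[4096];
	int n = klogctl(3 /* SYSLOG_ACTION_READ_ALL */, buf, sizeof(buf));

	if (n < 0 && errno == EPERM)
		printf("dmesg restricted: need CAP_SYSLOG (or legacy CAP_SYS_ADMIN)\n");
	else if (n >= 0)
		printf("read %d bytes of kernel log\n", n);
	else
		perror("klogctl");
	return 0;
}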
@@ -447,6 +517,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
447 struct console *con; 517 struct console *con;
448 518
449 for_each_console(con) { 519 for_each_console(con) {
520 if (exclusive_console && con != exclusive_console)
521 continue;
450 if ((con->flags & CON_ENABLED) && con->write && 522 if ((con->flags & CON_ENABLED) && con->write &&
451 (cpu_online(smp_processor_id()) || 523 (cpu_online(smp_processor_id()) ||
452 (con->flags & CON_ANYTIME))) 524 (con->flags & CON_ANYTIME)))
@@ -486,9 +558,74 @@ static void _call_console_drivers(unsigned start,
486} 558}
487 559
488/* 560/*
561 * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit quantity:
562 * the lower 3 bits are the log level, the remaining bits are the log facility.
563 * In case userspace passes ordinary syslog messages to /dev/kmsg or
564 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
565 * to extract the correct log level for in-kernel processing, and must not
566 * mangle the original value.
567 *
568 * If a prefix is found, the length of the prefix is returned. If 'level' is
569 * passed, it will be filled in with the log level without a possible facility
570 * value. If 'special' is passed, the special printk prefix chars are accepted
571 * and returned. If no valid header is found, 0 is returned and the passed
572 * variables are not touched.
573 */
574static size_t log_prefix(const char *p, unsigned int *level, char *special)
575{
576 unsigned int lev = 0;
577 char sp = '\0';
578 size_t len;
579
580 if (p[0] != '<' || !p[1])
581 return 0;
582 if (p[2] == '>') {
583 /* usual single digit level number or special char */
584 switch (p[1]) {
585 case '0' ... '7':
586 lev = p[1] - '0';
587 break;
588 case 'c': /* KERN_CONT */
589 case 'd': /* KERN_DEFAULT */
590 sp = p[1];
591 break;
592 default:
593 return 0;
594 }
595 len = 3;
596 } else {
597 /* multi digit including the level and facility number */
598 char *endp = NULL;
599
600 if (p[1] < '0' || p[1] > '9')
601 return 0;
602
603 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
604 if (endp == NULL || endp[0] != '>')
605 return 0;
606 len = (endp + 1) - p;
607 }
608
609 /* do not accept special char if not asked for */
610 if (sp && !special)
611 return 0;
612
613 if (special) {
614 *special = sp;
615 /* return special char, do not touch level */
616 if (sp)
617 return len;
618 }
619
620 if (level)
621 *level = lev;
622 return len;
623}
624
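For comparison, the same parsing can be sketched in plain userspace C, with strtoul() standing in for simple_strtoul() and the KERN_CONT/KERN_DEFAULT special characters left out for brevity:

    /* Sketch only: parse a "<N>" syslog prefix as described for log_prefix() above. */
    #include <stdio.h>
    #include <stdlib.h>

    /* Returns the prefix length, or 0 if no valid "<...>" header was found. */
    static size_t parse_prefix(const char *p, unsigned int *level)
    {
        char *endp;
        unsigned long val;

        if (p[0] != '<' || p[1] == '\0')
            return 0;
        if (p[1] < '0' || p[1] > '9')
            return 0;

        val = strtoul(p + 1, &endp, 10);
        if (*endp != '>')
            return 0;
        if (level)
            *level = val & 7;        /* low 3 bits: level; the rest: facility */
        return (size_t)(endp + 1 - p);
    }

    int main(void)
    {
        const char *msgs[] = { "<6>kernel line", "<14>daemon line", "no prefix" };

        for (size_t i = 0; i < sizeof(msgs) / sizeof(msgs[0]); i++) {
            unsigned int level = 8;  /* 8 = "not set" sentinel */
            size_t len = parse_prefix(msgs[i], &level);
            printf("len=%zu level=%u body=\"%s\"\n", len, level, msgs[i] + len);
        }
        return 0;
    }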
625/*
489 * Call the console drivers, asking them to write out 626 * Call the console drivers, asking them to write out
490 * log_buf[start] to log_buf[end - 1]. 627 * log_buf[start] to log_buf[end - 1].
491 * The console_sem must be held. 628 * The console_lock must be held.
492 */ 629 */
493static void call_console_drivers(unsigned start, unsigned end) 630static void call_console_drivers(unsigned start, unsigned end)
494{ 631{
@@ -500,13 +637,9 @@ static void call_console_drivers(unsigned start, unsigned end)
500 cur_index = start; 637 cur_index = start;
501 start_print = start; 638 start_print = start;
502 while (cur_index != end) { 639 while (cur_index != end) {
503 if (msg_level < 0 && ((end - cur_index) > 2) && 640 if (msg_level < 0 && ((end - cur_index) > 2)) {
504 LOG_BUF(cur_index + 0) == '<' && 641 /* strip log prefix */
505 LOG_BUF(cur_index + 1) >= '0' && 642 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
506 LOG_BUF(cur_index + 1) <= '7' &&
507 LOG_BUF(cur_index + 2) == '>') {
508 msg_level = LOG_BUF(cur_index + 1) - '0';
509 cur_index += 3;
510 start_print = cur_index; 643 start_print = cur_index;
511 } 644 }
512 while (cur_index != end) { 645 while (cur_index != end) {
@@ -563,7 +696,7 @@ static void zap_locks(void)
563 /* If a crash is occurring, make sure we can't deadlock */ 696 /* If a crash is occurring, make sure we can't deadlock */
564 spin_lock_init(&logbuf_lock); 697 spin_lock_init(&logbuf_lock);
565 /* And make sure that we print immediately */ 698 /* And make sure that we print immediately */
566 init_MUTEX(&console_sem); 699 sema_init(&console_sem, 1);
567} 700}
568 701
569#if defined(CONFIG_PRINTK_TIME) 702#if defined(CONFIG_PRINTK_TIME)
@@ -591,11 +724,11 @@ static int have_callable_console(void)
591 * 724 *
592 * This is printk(). It can be called from any context. We want it to work. 725 * This is printk(). It can be called from any context. We want it to work.
593 * 726 *
594 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 727 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
595 * call the console drivers. If we fail to get the semaphore we place the output 728 * call the console drivers. If we fail to get the semaphore we place the output
596 * into the log buffer and return. The current holder of the console_sem will 729 * into the log buffer and return. The current holder of the console_sem will
597 * notice the new output in release_console_sem() and will send it to the 730 * notice the new output in console_unlock() and will send it to the
598 * consoles before releasing the semaphore. 731 * consoles before releasing the lock.
599 * 732 *
600 * One effect of this deferred printing is that code which calls printk() and 733 * One effect of this deferred printing is that code which calls printk() and
601 * then changes console_loglevel may break. This is because console_loglevel 734 * then changes console_loglevel may break. This is because console_loglevel
@@ -646,18 +779,19 @@ static inline int can_use_console(unsigned int cpu)
646/* 779/*
647 * Try to get console ownership to actually show the kernel 780 * Try to get console ownership to actually show the kernel
648 * messages from a 'printk'. Return true (and with the 781 * messages from a 'printk'. Return true (and with the
649 * console_semaphore held, and 'console_locked' set) if it 782 * console_lock held, and 'console_locked' set) if it
650 * is successful, false otherwise. 783 * is successful, false otherwise.
651 * 784 *
652 * This gets called with the 'logbuf_lock' spinlock held and 785 * This gets called with the 'logbuf_lock' spinlock held and
653 * interrupts disabled. It should return with 'lockbuf_lock' 786 * interrupts disabled. It should return with 'lockbuf_lock'
654 * released but interrupts still disabled. 787 * released but interrupts still disabled.
655 */ 788 */
656static int acquire_console_semaphore_for_printk(unsigned int cpu) 789static int console_trylock_for_printk(unsigned int cpu)
790 __releases(&logbuf_lock)
657{ 791{
658 int retval = 0; 792 int retval = 0;
659 793
660 if (!try_acquire_console_sem()) { 794 if (console_trylock()) {
661 retval = 1; 795 retval = 1;
662 796
663 /* 797 /*
@@ -703,6 +837,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
703 unsigned long flags; 837 unsigned long flags;
704 int this_cpu; 838 int this_cpu;
705 char *p; 839 char *p;
840 size_t plen;
841 char special;
706 842
707 boot_delay_msec(); 843 boot_delay_msec();
708 printk_delay(); 844 printk_delay();
@@ -746,45 +882,52 @@ asmlinkage int vprintk(const char *fmt, va_list args)
746 if (trace_override && !trace_recurse) 882 if (trace_override && !trace_recurse)
747 TRACE("%s", printk_buf); 883 TRACE("%s", printk_buf);
748 884
749
750 p = printk_buf; 885 p = printk_buf;
751 886
752 /* Do we have a loglevel in the string? */ 887 /* Read log level and handle special printk prefix */
753 if (p[0] == '<') { 888 plen = log_prefix(p, &current_log_level, &special);
754 unsigned char c = p[1]; 889 if (plen) {
755 if (c && p[2] == '>') { 890 p += plen;
756 switch (c) { 891
757 case '0' ... '7': /* loglevel */ 892 switch (special) {
758 current_log_level = c - '0'; 893 case 'c': /* Strip <c> KERN_CONT, continue line */
759 /* Fallthrough - make sure we're on a new line */ 894 plen = 0;
760 case 'd': /* KERN_DEFAULT */ 895 break;
761 if (!new_text_line) { 896 case 'd': /* Strip <d> KERN_DEFAULT, start new line */
762 emit_log_char('\n'); 897 plen = 0;
763 new_text_line = 1; 898 default:
764 } 899 if (!new_text_line) {
765 /* Fallthrough - skip the loglevel */ 900 emit_log_char('\n');
766 case 'c': /* KERN_CONT */ 901 new_text_line = 1;
767 p += 3;
768 break;
769 } 902 }
770 } 903 }
771 } 904 }
772 905
773 /* 906 /*
774 * Copy the output into log_buf. If the caller didn't provide 907 * Copy the output into log_buf. If the caller didn't provide
775 * appropriate log level tags, we insert them here 908 * the appropriate log prefix, we insert them here
776 */ 909 */
777 for ( ; *p; p++) { 910 for (; *p; p++) {
778 if (new_text_line) { 911 if (new_text_line) {
779 /* Always output the token */
780 emit_log_char('<');
781 emit_log_char(current_log_level + '0');
782 emit_log_char('>');
783 printed_len += 3;
784 new_text_line = 0; 912 new_text_line = 0;
785 913
914 if (plen) {
915 /* Copy original log prefix */
916 int i;
917
918 for (i = 0; i < plen; i++)
919 emit_log_char(printk_buf[i]);
920 printed_len += plen;
921 } else {
922 /* Add log prefix */
923 emit_log_char('<');
924 emit_log_char(current_log_level + '0');
925 emit_log_char('>');
926 printed_len += 3;
927 }
928
786 if (printk_time) { 929 if (printk_time) {
787 /* Follow the token with the time */ 930 /* Add the current time stamp */
788 char tbuf[50], *tp; 931 char tbuf[50], *tp;
789 unsigned tlen; 932 unsigned tlen;
790 unsigned long long t; 933 unsigned long long t;
@@ -816,12 +959,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
816 * actual magic (print out buffers, wake up klogd, 959 * actual magic (print out buffers, wake up klogd,
817 * etc). 960 * etc).
818 * 961 *
819 * The acquire_console_semaphore_for_printk() function 962 * The console_trylock_for_printk() function
820 * will release 'logbuf_lock' regardless of whether it 963 * will release 'logbuf_lock' regardless of whether it
821 * actually gets the semaphore or not. 964 * actually gets the semaphore or not.
822 */ 965 */
823 if (acquire_console_semaphore_for_printk(this_cpu)) 966 if (console_trylock_for_printk(this_cpu))
824 release_console_sem(); 967 console_unlock();
825 968
826 lockdep_on(); 969 lockdep_on();
827out_restore_irqs: 970out_restore_irqs:
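The hunk above changes how each stored line begins: a caller-supplied prefix is now copied through verbatim, otherwise a default "<level>" is synthesized, and the optional timestamp follows either way. A standalone sketch of that assembly step, with the formatting details invented for illustration (the kernel emits characters one at a time into log_buf rather than using snprintf):

    /* Sketch only: prefix-then-timestamp-then-body assembly of a log record. */
    #include <stdio.h>
    #include <string.h>

    static void emit_record(char *out, size_t outsz, const char *msg,
                            int default_level, double now, int add_time)
    {
        const char *body = msg;
        char prefix[16];

        if (msg[0] == '<' && msg[1] >= '0' && msg[1] <= '9' && msg[2] == '>') {
            /* Caller supplied a prefix: copy it through untouched. */
            memcpy(prefix, msg, 3);
            prefix[3] = '\0';
            body = msg + 3;
        } else {
            /* No prefix: synthesize one from the default log level. */
            snprintf(prefix, sizeof(prefix), "<%d>", default_level);
        }

        if (add_time)
            snprintf(out, outsz, "%s[%12.6f] %s", prefix, now, body);
        else
            snprintf(out, outsz, "%s%s", prefix, body);
    }

    int main(void)
    {
        char line[256];

        emit_record(line, sizeof(line), "<3>disk error", 4, 12.345678, 1);
        puts(line);
        emit_record(line, sizeof(line), "plain message", 4, 12.345700, 1);
        puts(line);
        return 0;
    }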
@@ -982,7 +1125,7 @@ void suspend_console(void)
982 if (!console_suspend_enabled) 1125 if (!console_suspend_enabled)
983 return; 1126 return;
984 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1127 printk("Suspending console(s) (use no_console_suspend to debug)\n");
985 acquire_console_sem(); 1128 console_lock();
986 console_suspended = 1; 1129 console_suspended = 1;
987 up(&console_sem); 1130 up(&console_sem);
988} 1131}
@@ -993,7 +1136,7 @@ void resume_console(void)
993 return; 1136 return;
994 down(&console_sem); 1137 down(&console_sem);
995 console_suspended = 0; 1138 console_suspended = 0;
996 release_console_sem(); 1139 console_unlock();
997} 1140}
998 1141
999/** 1142/**
@@ -1016,21 +1159,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1016 case CPU_DYING: 1159 case CPU_DYING:
1017 case CPU_DOWN_FAILED: 1160 case CPU_DOWN_FAILED:
1018 case CPU_UP_CANCELED: 1161 case CPU_UP_CANCELED:
1019 acquire_console_sem(); 1162 console_lock();
1020 release_console_sem(); 1163 console_unlock();
1021 } 1164 }
1022 return NOTIFY_OK; 1165 return NOTIFY_OK;
1023} 1166}
1024 1167
1025/** 1168/**
1026 * acquire_console_sem - lock the console system for exclusive use. 1169 * console_lock - lock the console system for exclusive use.
1027 * 1170 *
1028 * Acquires a semaphore which guarantees that the caller has 1171 * Acquires a lock which guarantees that the caller has
1029 * exclusive access to the console system and the console_drivers list. 1172 * exclusive access to the console system and the console_drivers list.
1030 * 1173 *
1031 * Can sleep, returns nothing. 1174 * Can sleep, returns nothing.
1032 */ 1175 */
1033void acquire_console_sem(void) 1176void console_lock(void)
1034{ 1177{
1035 BUG_ON(in_interrupt()); 1178 BUG_ON(in_interrupt());
1036 down(&console_sem); 1179 down(&console_sem);
@@ -1039,21 +1182,29 @@ void acquire_console_sem(void)
1039 console_locked = 1; 1182 console_locked = 1;
1040 console_may_schedule = 1; 1183 console_may_schedule = 1;
1041} 1184}
1042EXPORT_SYMBOL(acquire_console_sem); 1185EXPORT_SYMBOL(console_lock);
1043 1186
1044int try_acquire_console_sem(void) 1187/**
1188 * console_trylock - try to lock the console system for exclusive use.
1189 *
1190 * Tries to acquire a lock which guarantees that the caller has
1191 * exclusive access to the console system and the console_drivers list.
1192 *
1193 * returns 1 on success, and 0 on failure to acquire the lock.
1194 */
1195int console_trylock(void)
1045{ 1196{
1046 if (down_trylock(&console_sem)) 1197 if (down_trylock(&console_sem))
1047 return -1; 1198 return 0;
1048 if (console_suspended) { 1199 if (console_suspended) {
1049 up(&console_sem); 1200 up(&console_sem);
1050 return -1; 1201 return 0;
1051 } 1202 }
1052 console_locked = 1; 1203 console_locked = 1;
1053 console_may_schedule = 0; 1204 console_may_schedule = 0;
1054 return 0; 1205 return 1;
1055} 1206}
1056EXPORT_SYMBOL(try_acquire_console_sem); 1207EXPORT_SYMBOL(console_trylock);
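Note that the return convention flips with the rename: console_trylock() reports 1 on success and 0 on failure, where try_acquire_console_sem() reported 0 and -1. A standalone sketch of the new convention wrapped around pthread_mutex_trylock(), which itself keeps the opposite "0 means success" style:

    /* Sketch only: a trylock wrapper using the 1-on-success convention. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Returns 1 on success, 0 on failure to acquire the lock. */
    static int my_trylock(void)
    {
        return pthread_mutex_trylock(&lock) == 0;
    }

    int main(void)
    {
        if (my_trylock()) {
            printf("first trylock: got the lock\n");
            printf("second trylock while held: %s\n",
                   my_trylock() ? "got it (unexpected)" : "busy, as expected");
            pthread_mutex_unlock(&lock);
        }
        return 0;
    }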
1057 1208
1058int is_console_locked(void) 1209int is_console_locked(void)
1059{ 1210{
@@ -1064,38 +1215,40 @@ static DEFINE_PER_CPU(int, printk_pending);
1064 1215
1065void printk_tick(void) 1216void printk_tick(void)
1066{ 1217{
1067 if (__get_cpu_var(printk_pending)) { 1218 if (__this_cpu_read(printk_pending)) {
1068 __get_cpu_var(printk_pending) = 0; 1219 __this_cpu_write(printk_pending, 0);
1069 wake_up_interruptible(&log_wait); 1220 wake_up_interruptible(&log_wait);
1070 } 1221 }
1071} 1222}
1072 1223
1073int printk_needs_cpu(int cpu) 1224int printk_needs_cpu(int cpu)
1074{ 1225{
1075 return per_cpu(printk_pending, cpu); 1226 if (cpu_is_offline(cpu))
1227 printk_tick();
1228 return __this_cpu_read(printk_pending);
1076} 1229}
1077 1230
1078void wake_up_klogd(void) 1231void wake_up_klogd(void)
1079{ 1232{
1080 if (!trace_override && waitqueue_active(&log_wait)) 1233 if (!trace_override && waitqueue_active(&log_wait))
1081 __raw_get_cpu_var(printk_pending) = 1; 1234 this_cpu_write(printk_pending, 1);
1082} 1235}
1083 1236
1084/** 1237/**
1085 * release_console_sem - unlock the console system 1238 * console_unlock - unlock the console system
1086 * 1239 *
1087 * Releases the semaphore which the caller holds on the console system 1240 * Releases the console_lock which the caller holds on the console system
1088 * and the console driver list. 1241 * and the console driver list.
1089 * 1242 *
1090 * While the semaphore was held, console output may have been buffered 1243 * While the console_lock was held, console output may have been buffered
1091 * by printk(). If this is the case, release_console_sem() emits 1244 * by printk(). If this is the case, console_unlock() emits
1092 * the output prior to releasing the semaphore. 1245 * the output prior to releasing the lock.
1093 * 1246 *
1094 * If there is output waiting for klogd, we wake it up. 1247 * If there is output waiting for klogd, we wake it up.
1095 * 1248 *
1096 * release_console_sem() may be called from any context. 1249 * console_unlock() may be called from any context.
1097 */ 1250 */
1098void release_console_sem(void) 1251void console_unlock(void)
1099{ 1252{
1100 unsigned long flags; 1253 unsigned long flags;
1101 unsigned _con_start, _log_end; 1254 unsigned _con_start, _log_end;
@@ -1123,12 +1276,17 @@ void release_console_sem(void)
1123 local_irq_restore(flags); 1276 local_irq_restore(flags);
1124 } 1277 }
1125 console_locked = 0; 1278 console_locked = 0;
1279
1280 /* Release the exclusive_console once it is used */
1281 if (unlikely(exclusive_console))
1282 exclusive_console = NULL;
1283
1126 up(&console_sem); 1284 up(&console_sem);
1127 spin_unlock_irqrestore(&logbuf_lock, flags); 1285 spin_unlock_irqrestore(&logbuf_lock, flags);
1128 if (wake_klogd) 1286 if (wake_klogd)
1129 wake_up_klogd(); 1287 wake_up_klogd();
1130} 1288}
1131EXPORT_SYMBOL(release_console_sem); 1289EXPORT_SYMBOL(console_unlock);
1132 1290
1133/** 1291/**
1134 * console_conditional_schedule - yield the CPU if required 1292 * console_conditional_schedule - yield the CPU if required
@@ -1137,7 +1295,7 @@ EXPORT_SYMBOL(release_console_sem);
1137 * if this CPU should yield the CPU to another task, do 1295 * if this CPU should yield the CPU to another task, do
1138 * so here. 1296 * so here.
1139 * 1297 *
1140 * Must be called within acquire_console_sem(). 1298 * Must be called with the console lock held.
1141 */ 1299 */
1142void __sched console_conditional_schedule(void) 1300void __sched console_conditional_schedule(void)
1143{ 1301{
@@ -1158,14 +1316,14 @@ void console_unblank(void)
1158 if (down_trylock(&console_sem) != 0) 1316 if (down_trylock(&console_sem) != 0)
1159 return; 1317 return;
1160 } else 1318 } else
1161 acquire_console_sem(); 1319 console_lock();
1162 1320
1163 console_locked = 1; 1321 console_locked = 1;
1164 console_may_schedule = 0; 1322 console_may_schedule = 0;
1165 for_each_console(c) 1323 for_each_console(c)
1166 if ((c->flags & CON_ENABLED) && c->unblank) 1324 if ((c->flags & CON_ENABLED) && c->unblank)
1167 c->unblank(); 1325 c->unblank();
1168 release_console_sem(); 1326 console_unlock();
1169} 1327}
1170 1328
1171/* 1329/*
@@ -1176,7 +1334,7 @@ struct tty_driver *console_device(int *index)
1176 struct console *c; 1334 struct console *c;
1177 struct tty_driver *driver = NULL; 1335 struct tty_driver *driver = NULL;
1178 1336
1179 acquire_console_sem(); 1337 console_lock();
1180 for_each_console(c) { 1338 for_each_console(c) {
1181 if (!c->device) 1339 if (!c->device)
1182 continue; 1340 continue;
@@ -1184,7 +1342,7 @@ struct tty_driver *console_device(int *index)
1184 if (driver) 1342 if (driver)
1185 break; 1343 break;
1186 } 1344 }
1187 release_console_sem(); 1345 console_unlock();
1188 return driver; 1346 return driver;
1189} 1347}
1190 1348
@@ -1195,20 +1353,32 @@ struct tty_driver *console_device(int *index)
1195 */ 1353 */
1196void console_stop(struct console *console) 1354void console_stop(struct console *console)
1197{ 1355{
1198 acquire_console_sem(); 1356 console_lock();
1199 console->flags &= ~CON_ENABLED; 1357 console->flags &= ~CON_ENABLED;
1200 release_console_sem(); 1358 console_unlock();
1201} 1359}
1202EXPORT_SYMBOL(console_stop); 1360EXPORT_SYMBOL(console_stop);
1203 1361
1204void console_start(struct console *console) 1362void console_start(struct console *console)
1205{ 1363{
1206 acquire_console_sem(); 1364 console_lock();
1207 console->flags |= CON_ENABLED; 1365 console->flags |= CON_ENABLED;
1208 release_console_sem(); 1366 console_unlock();
1209} 1367}
1210EXPORT_SYMBOL(console_start); 1368EXPORT_SYMBOL(console_start);
1211 1369
1370static int __read_mostly keep_bootcon;
1371
1372static int __init keep_bootcon_setup(char *str)
1373{
1374 keep_bootcon = 1;
1375 printk(KERN_INFO "debug: skip boot console de-registration.\n");
1376
1377 return 0;
1378}
1379
1380early_param("keep_bootcon", keep_bootcon_setup);
1381
1212/* 1382/*
1213 * The console driver calls this routine during kernel initialization 1383 * The console driver calls this routine during kernel initialization
1214 * to register the console printing procedure with printk() and to 1384 * to register the console printing procedure with printk() and to
@@ -1327,7 +1497,7 @@ void register_console(struct console *newcon)
1327 * Put this console in the list - keep the 1497 * Put this console in the list - keep the
1328 * preferred driver at the head of the list. 1498 * preferred driver at the head of the list.
1329 */ 1499 */
1330 acquire_console_sem(); 1500 console_lock();
1331 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { 1501 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1332 newcon->next = console_drivers; 1502 newcon->next = console_drivers;
1333 console_drivers = newcon; 1503 console_drivers = newcon;
@@ -1339,14 +1509,21 @@ void register_console(struct console *newcon)
1339 } 1509 }
1340 if (newcon->flags & CON_PRINTBUFFER) { 1510 if (newcon->flags & CON_PRINTBUFFER) {
1341 /* 1511 /*
1342 * release_console_sem() will print out the buffered messages 1512 * console_unlock() will print out the buffered messages
1343 * for us. 1513 * for us.
1344 */ 1514 */
1345 spin_lock_irqsave(&logbuf_lock, flags); 1515 spin_lock_irqsave(&logbuf_lock, flags);
1346 con_start = log_start; 1516 con_start = log_start;
1347 spin_unlock_irqrestore(&logbuf_lock, flags); 1517 spin_unlock_irqrestore(&logbuf_lock, flags);
1518 /*
1519 * We're about to replay the log buffer. Only do this to the
1520 * just-registered console to avoid excessive message spam to
1521 * the already-registered consoles.
1522 */
1523 exclusive_console = newcon;
1348 } 1524 }
1349 release_console_sem(); 1525 console_unlock();
1526 console_sysfs_notify();
1350 1527
1351 /* 1528 /*
1352 * By unregistering the bootconsoles after we enable the real console 1529 * By unregistering the bootconsoles after we enable the real console
@@ -1355,7 +1532,9 @@ void register_console(struct console *newcon)
1355 * users know there might be something in the kernel's log buffer that 1532 * users know there might be something in the kernel's log buffer that
1356 * went to the bootconsole (that they do not see on the real console) 1533 * went to the bootconsole (that they do not see on the real console)
1357 */ 1534 */
1358 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { 1535 if (bcon &&
1536 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
1537 !keep_bootcon) {
1359 /* we need to iterate through twice, to make sure we print 1538 /* we need to iterate through twice, to make sure we print
1360 * everything out, before we unregister the console(s) 1539 * everything out, before we unregister the console(s)
1361 */ 1540 */
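The exclusive_console pointer set in the register_console() hunk above limits the replay of already-buffered messages to the console that just registered with CON_PRINTBUFFER, so the consoles that have already printed them are not spammed again. A standalone sketch of that idea, with the sink bookkeeping invented for illustration:

    /* Sketch only: replaying a backlog to one newly added sink. */
    #include <stdio.h>

    struct sink { const char *name; };

    static struct sink *sinks[4];
    static int nr_sinks;
    static struct sink *exclusive_sink;   /* non-NULL only while replaying */

    static void deliver(const char *msg)
    {
        for (int i = 0; i < nr_sinks; i++) {
            if (exclusive_sink && sinks[i] != exclusive_sink)
                continue;                 /* replay goes to the new sink only */
            printf("[%s] %s\n", sinks[i]->name, msg);
        }
    }

    int main(void)
    {
        struct sink serial = { "serial" }, vt = { "vt" };

        sinks[nr_sinks++] = &serial;
        deliver("early message");         /* only serial exists yet */

        sinks[nr_sinks++] = &vt;          /* new console wants the backlog */
        exclusive_sink = &vt;
        deliver("early message");         /* replay: vt only */
        exclusive_sink = NULL;

        deliver("new message");           /* normal delivery: everyone */
        return 0;
    }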
@@ -1382,7 +1561,7 @@ int unregister_console(struct console *console)
1382 return braille_unregister_console(console); 1561 return braille_unregister_console(console);
1383#endif 1562#endif
1384 1563
1385 acquire_console_sem(); 1564 console_lock();
1386 if (console_drivers == console) { 1565 if (console_drivers == console) {
1387 console_drivers=console->next; 1566 console_drivers=console->next;
1388 res = 0; 1567 res = 0;
@@ -1404,7 +1583,8 @@ int unregister_console(struct console *console)
1404 if (console_drivers != NULL && console->flags & CON_CONSDEV) 1583 if (console_drivers != NULL && console->flags & CON_CONSDEV)
1405 console_drivers->flags |= CON_CONSDEV; 1584 console_drivers->flags |= CON_CONSDEV;
1406 1585
1407 release_console_sem(); 1586 console_unlock();
1587 console_sysfs_notify();
1408 return res; 1588 return res;
1409} 1589}
1410EXPORT_SYMBOL(unregister_console); 1590EXPORT_SYMBOL(unregister_console);
@@ -1488,7 +1668,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
1488 /* Don't allow registering multiple times */ 1668 /* Don't allow registering multiple times */
1489 if (!dumper->registered) { 1669 if (!dumper->registered) {
1490 dumper->registered = 1; 1670 dumper->registered = 1;
1491 list_add_tail(&dumper->list, &dump_list); 1671 list_add_tail_rcu(&dumper->list, &dump_list);
1492 err = 0; 1672 err = 0;
1493 } 1673 }
1494 spin_unlock_irqrestore(&dump_list_lock, flags); 1674 spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1512,29 +1692,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1512 spin_lock_irqsave(&dump_list_lock, flags); 1692 spin_lock_irqsave(&dump_list_lock, flags);
1513 if (dumper->registered) { 1693 if (dumper->registered) {
1514 dumper->registered = 0; 1694 dumper->registered = 0;
1515 list_del(&dumper->list); 1695 list_del_rcu(&dumper->list);
1516 err = 0; 1696 err = 0;
1517 } 1697 }
1518 spin_unlock_irqrestore(&dump_list_lock, flags); 1698 spin_unlock_irqrestore(&dump_list_lock, flags);
1699 synchronize_rcu();
1519 1700
1520 return err; 1701 return err;
1521} 1702}
1522EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1703EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1523 1704
1524static const char const *kmsg_reasons[] = {
1525 [KMSG_DUMP_OOPS] = "oops",
1526 [KMSG_DUMP_PANIC] = "panic",
1527 [KMSG_DUMP_KEXEC] = "kexec",
1528};
1529
1530static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1531{
1532 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1533 return "unknown";
1534
1535 return kmsg_reasons[reason];
1536}
1537
1538/** 1705/**
1539 * kmsg_dump - dump kernel log to kernel message dumpers. 1706 * kmsg_dump - dump kernel log to kernel message dumpers.
1540 * @reason: the reason (oops, panic etc) for dumping 1707 * @reason: the reason (oops, panic etc) for dumping
@@ -1573,13 +1740,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1573 l2 = chars; 1740 l2 = chars;
1574 } 1741 }
1575 1742
1576 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1743 rcu_read_lock();
1577 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", 1744 list_for_each_entry_rcu(dumper, &dump_list, list)
1578 kmsg_to_str(reason));
1579 return;
1580 }
1581 list_for_each_entry(dumper, &dump_list, list)
1582 dumper->dump(dumper, reason, s1, l1, s2, l2); 1745 dumper->dump(dumper, reason, s1, l1, s2, l2);
1583 spin_unlock_irqrestore(&dump_list_lock, flags); 1746 rcu_read_unlock();
1584} 1747}
1585#endif 1748#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index b22a899934cc..961b389fe52f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -126,11 +126,9 @@ int __ref profile_init(void)
126 if (prof_buffer) 126 if (prof_buffer)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vzalloc(buffer_bytes);
130 if (prof_buffer) { 130 if (prof_buffer)
131 memset(prof_buffer, 0, buffer_bytes);
132 return 0; 131 return 0;
133 }
134 132
135 free_cpumask_var(prof_cpu_mask); 133 free_cpumask_var(prof_cpu_mask);
136 return -ENOMEM; 134 return -ENOMEM;
@@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void)
305 mutex_unlock(&profile_flip_mutex); 303 mutex_unlock(&profile_flip_mutex);
306} 304}
307 305
308void profile_hits(int type, void *__pc, unsigned int nr_hits) 306static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
309{ 307{
310 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 308 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
311 int i, j, cpu; 309 int i, j, cpu;
312 struct profile_hit *hits; 310 struct profile_hit *hits;
313 311
314 if (prof_on != type || !prof_buffer)
315 return;
316 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); 312 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
317 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 313 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
318 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 314 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -419,16 +415,20 @@ out_free:
419#define profile_discard_flip_buffers() do { } while (0) 415#define profile_discard_flip_buffers() do { } while (0)
420#define profile_cpu_callback NULL 416#define profile_cpu_callback NULL
421 417
422void profile_hits(int type, void *__pc, unsigned int nr_hits) 418static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
423{ 419{
424 unsigned long pc; 420 unsigned long pc;
425
426 if (prof_on != type || !prof_buffer)
427 return;
428 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 421 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
429 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 422 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
430} 423}
431#endif /* !CONFIG_SMP */ 424#endif /* !CONFIG_SMP */
425
426void profile_hits(int type, void *__pc, unsigned int nr_hits)
427{
428 if (prof_on != type || !prof_buffer)
429 return;
430 do_profile_hits(type, __pc, nr_hits);
431}
432EXPORT_SYMBOL_GPL(profile_hits); 432EXPORT_SYMBOL_GPL(profile_hits);
433 433
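The refactor above hoists the "is this profiler active at all?" guard out of the two build-specific implementations into one public wrapper. A standalone sketch of the same shape, with the profiler types and state invented for illustration:

    /* Sketch only: a common guard in a wrapper around build-specific workers. */
    #include <stdio.h>

    enum prof_type { PROF_NONE = 0, PROF_CPU = 1, PROF_SLEEP = 2 };

    static enum prof_type prof_on = PROF_CPU;   /* which profiler is active */
    static unsigned long hit_count;             /* stand-in for prof_buffer */

    /* Build-specific worker: relies on the wrapper having applied the guard. */
    static void do_profile_hits(enum prof_type type, void *pc, unsigned int nr_hits)
    {
        (void)type;
        (void)pc;
        hit_count += nr_hits;
    }

    /* Public entry point: the early-out lives in exactly one place. */
    static void profile_hits(enum prof_type type, void *pc, unsigned int nr_hits)
    {
        if (prof_on != type)
            return;
        do_profile_hits(type, pc, nr_hits);
    }

    int main(void)
    {
        profile_hits(PROF_SLEEP, (void *)0x1000, 5);  /* ignored: wrong type */
        profile_hits(PROF_CPU,   (void *)0x1000, 5);  /* counted */
        printf("hits recorded: %lu\n", hit_count);
        return 0;
    }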
434void profile_tick(int type) 434void profile_tick(int type)
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
555static const struct file_operations proc_profile_operations = { 555static const struct file_operations proc_profile_operations = {
556 .read = read_profile, 556 .read = read_profile,
557 .write = write_profile, 557 .write = write_profile,
558 .llseek = default_llseek,
558}; 559};
559 560
560#ifdef CONFIG_SMP 561#ifdef CONFIG_SMP
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798ef4a2..2df115790cd9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h>
25 26
26 27
27/* 28/*
@@ -37,35 +38,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
37 child->parent = new_parent; 38 child->parent = new_parent;
38} 39}
39 40
40/* 41/**
41 * Turn a tracing stop into a normal stop now, since with no tracer there 42 * __ptrace_unlink - unlink ptracee and restore its execution state
42 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a 43 * @child: ptracee to be unlinked
43 * signal sent that would resume the child, but didn't because it was in
44 * TASK_TRACED, resume it now.
45 * Requires that irqs be disabled.
46 */
47static void ptrace_untrace(struct task_struct *child)
48{
49 spin_lock(&child->sighand->siglock);
50 if (task_is_traced(child)) {
51 /*
52 * If the group stop is completed or in progress,
53 * this thread was already counted as stopped.
54 */
55 if (child->signal->flags & SIGNAL_STOP_STOPPED ||
56 child->signal->group_stop_count)
57 __set_task_state(child, TASK_STOPPED);
58 else
59 signal_wake_up(child, 1);
60 }
61 spin_unlock(&child->sighand->siglock);
62}
63
64/*
65 * unptrace a task: move it back to its original parent and
66 * remove it from the ptrace list.
67 * 44 *
68 * Must be called with the tasklist lock write-held. 45 * Remove @child from the ptrace list, move it back to the original parent,
46 * and restore the execution state so that it conforms to the group stop
47 * state.
48 *
49 * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
50 * exiting. For PTRACE_DETACH, unless the ptracee has been killed between
51 * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
52 * If the ptracer is exiting, the ptracee can be in any state.
53 *
54 * After detach, the ptracee should be in a state which conforms to the
55 * group stop. If the group is stopped or in the process of stopping, the
56 * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
57 * up from TASK_TRACED.
58 *
59 * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
60 * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
61 * to but in the opposite direction of what happens while attaching to a
62 * stopped task. However, in this direction, the intermediate RUNNING
63 * state is not hidden even from the current ptracer and if it immediately
64 * re-attaches and performs a WNOHANG wait(2), it may fail.
65 *
66 * CONTEXT:
67 * write_lock_irq(tasklist_lock)
69 */ 68 */
70void __ptrace_unlink(struct task_struct *child) 69void __ptrace_unlink(struct task_struct *child)
71{ 70{
@@ -75,8 +74,27 @@ void __ptrace_unlink(struct task_struct *child)
75 child->parent = child->real_parent; 74 child->parent = child->real_parent;
76 list_del_init(&child->ptrace_entry); 75 list_del_init(&child->ptrace_entry);
77 76
78 if (task_is_traced(child)) 77 spin_lock(&child->sighand->siglock);
79 ptrace_untrace(child); 78
79 /*
80 * Reinstate GROUP_STOP_PENDING if group stop is in effect and
81 * @child isn't dead.
82 */
83 if (!(child->flags & PF_EXITING) &&
84 (child->signal->flags & SIGNAL_STOP_STOPPED ||
85 child->signal->group_stop_count))
86 child->group_stop |= GROUP_STOP_PENDING;
87
88 /*
89 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
90 * @child in the butt. Note that @resume should be used iff @child
91 * is in TASK_TRACED; otherwise, we might unduly disrupt
92 * TASK_KILLABLE sleeps.
93 */
94 if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
95 signal_wake_up(child, task_is_traced(child));
96
97 spin_unlock(&child->sighand->siglock);
80} 98}
81 99
82/* 100/*
@@ -95,16 +113,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
95 */ 113 */
96 read_lock(&tasklist_lock); 114 read_lock(&tasklist_lock);
97 if ((child->ptrace & PT_PTRACED) && child->parent == current) { 115 if ((child->ptrace & PT_PTRACED) && child->parent == current) {
98 ret = 0;
99 /* 116 /*
100 * child->sighand can't be NULL, release_task() 117 * child->sighand can't be NULL, release_task()
101 * does ptrace_unlink() before __exit_signal(). 118 * does ptrace_unlink() before __exit_signal().
102 */ 119 */
103 spin_lock_irq(&child->sighand->siglock); 120 spin_lock_irq(&child->sighand->siglock);
104 if (task_is_stopped(child)) 121 WARN_ON_ONCE(task_is_stopped(child));
105 child->state = TASK_TRACED; 122 if (task_is_traced(child) || kill)
106 else if (!task_is_traced(child) && !kill) 123 ret = 0;
107 ret = -ESRCH;
108 spin_unlock_irq(&child->sighand->siglock); 124 spin_unlock_irq(&child->sighand->siglock);
109 } 125 }
110 read_unlock(&tasklist_lock); 126 read_unlock(&tasklist_lock);
@@ -134,21 +150,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
134 return 0; 150 return 0;
135 rcu_read_lock(); 151 rcu_read_lock();
136 tcred = __task_cred(task); 152 tcred = __task_cred(task);
137 if ((cred->uid != tcred->euid || 153 if (cred->user->user_ns == tcred->user->user_ns &&
138 cred->uid != tcred->suid || 154 (cred->uid == tcred->euid &&
139 cred->uid != tcred->uid || 155 cred->uid == tcred->suid &&
140 cred->gid != tcred->egid || 156 cred->uid == tcred->uid &&
141 cred->gid != tcred->sgid || 157 cred->gid == tcred->egid &&
142 cred->gid != tcred->gid) && 158 cred->gid == tcred->sgid &&
143 !capable(CAP_SYS_PTRACE)) { 159 cred->gid == tcred->gid))
144 rcu_read_unlock(); 160 goto ok;
145 return -EPERM; 161 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
146 } 162 goto ok;
163 rcu_read_unlock();
164 return -EPERM;
165ok:
147 rcu_read_unlock(); 166 rcu_read_unlock();
148 smp_rmb(); 167 smp_rmb();
149 if (task->mm) 168 if (task->mm)
150 dumpable = get_dumpable(task->mm); 169 dumpable = get_dumpable(task->mm);
151 if (!dumpable && !capable(CAP_SYS_PTRACE)) 170 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
152 return -EPERM; 171 return -EPERM;
153 172
154 return security_ptrace_access_check(task, mode); 173 return security_ptrace_access_check(task, mode);
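The new check reads: allow if tracer and tracee share a user namespace and every uid/gid field matches, otherwise fall back to CAP_SYS_PTRACE in the tracee's namespace. A standalone sketch of that comparison, where the cred structure and the capability test are invented stand-ins rather than the kernel's definitions:

    /* Sketch only: the uid/gid/namespace comparison behind the ptrace check. */
    #include <stdbool.h>
    #include <stdio.h>

    struct cred {
        int user_ns;                 /* stand-in for a user-namespace pointer */
        unsigned int uid, euid, suid;
        unsigned int gid, egid, sgid;
    };

    static bool ns_capable_ptrace(const struct cred *tracer, int target_ns)
    {
        /* Pretend only uid 0 in the target's namespace has CAP_SYS_PTRACE. */
        return tracer->euid == 0 && tracer->user_ns == target_ns;
    }

    static bool may_ptrace(const struct cred *cred, const struct cred *tcred)
    {
        if (cred->user_ns == tcred->user_ns &&
            cred->uid == tcred->euid && cred->uid == tcred->suid &&
            cred->uid == tcred->uid &&
            cred->gid == tcred->egid && cred->gid == tcred->sgid &&
            cred->gid == tcred->gid)
            return true;
        return ns_capable_ptrace(cred, tcred->user_ns);
    }

    int main(void)
    {
        struct cred alice  = { .user_ns = 1, .uid = 1000, .euid = 1000, .suid = 1000,
                               .gid = 100, .egid = 100, .sgid = 100 };
        struct cred same   = alice;
        struct cred setuid = { .user_ns = 1, .uid = 1000, .euid = 0, .suid = 0,
                               .gid = 100, .egid = 100, .sgid = 100 };

        printf("same creds:    %s\n", may_ptrace(&alice, &same)   ? "allow" : "deny");
        printf("setuid target: %s\n", may_ptrace(&alice, &setuid) ? "allow" : "deny");
        return 0;
    }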
@@ -163,8 +182,9 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
163 return !err; 182 return !err;
164} 183}
165 184
166int ptrace_attach(struct task_struct *task) 185static int ptrace_attach(struct task_struct *task)
167{ 186{
187 bool wait_trap = false;
168 int retval; 188 int retval;
169 189
170 audit_ptrace(task); 190 audit_ptrace(task);
@@ -181,7 +201,7 @@ int ptrace_attach(struct task_struct *task)
181 * under ptrace. 201 * under ptrace.
182 */ 202 */
183 retval = -ERESTARTNOINTR; 203 retval = -ERESTARTNOINTR;
184 if (mutex_lock_interruptible(&task->cred_guard_mutex)) 204 if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
185 goto out; 205 goto out;
186 206
187 task_lock(task); 207 task_lock(task);
@@ -198,18 +218,48 @@ int ptrace_attach(struct task_struct *task)
198 goto unlock_tasklist; 218 goto unlock_tasklist;
199 219
200 task->ptrace = PT_PTRACED; 220 task->ptrace = PT_PTRACED;
201 if (capable(CAP_SYS_PTRACE)) 221 if (task_ns_capable(task, CAP_SYS_PTRACE))
202 task->ptrace |= PT_PTRACE_CAP; 222 task->ptrace |= PT_PTRACE_CAP;
203 223
204 __ptrace_link(task, current); 224 __ptrace_link(task, current);
205 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 225 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
206 226
227 spin_lock(&task->sighand->siglock);
228
229 /*
230 * If the task is already STOPPED, set GROUP_STOP_PENDING and
231 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
232 * will be cleared if the child completes the transition or any
233 * event which clears the group stop states happens. We'll wait
234 * for the transition to complete before returning from this
235 * function.
236 *
237 * This hides STOPPED -> RUNNING -> TRACED transition from the
238 * attaching thread but a different thread in the same group can
239 * still observe the transient RUNNING state. IOW, if another
240 * thread's WNOHANG wait(2) on the stopped tracee races against
241 * ATTACH, the wait(2) may fail due to the transient RUNNING.
242 *
243 * The following task_is_stopped() test is safe as both transitions
244 * in and out of STOPPED are protected by siglock.
245 */
246 if (task_is_stopped(task)) {
247 task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
248 signal_wake_up(task, 1);
249 wait_trap = true;
250 }
251
252 spin_unlock(&task->sighand->siglock);
253
207 retval = 0; 254 retval = 0;
208unlock_tasklist: 255unlock_tasklist:
209 write_unlock_irq(&tasklist_lock); 256 write_unlock_irq(&tasklist_lock);
210unlock_creds: 257unlock_creds:
211 mutex_unlock(&task->cred_guard_mutex); 258 mutex_unlock(&task->signal->cred_guard_mutex);
212out: 259out:
260 if (wait_trap)
261 wait_event(current->signal->wait_chldexit,
262 !(task->group_stop & GROUP_STOP_TRAPPING));
213 return retval; 263 return retval;
214} 264}
215 265
@@ -219,7 +269,7 @@ out:
219 * Performs checks and sets PT_PTRACED. 269 * Performs checks and sets PT_PTRACED.
220 * Should be used by all ptrace implementations for PTRACE_TRACEME. 270 * Should be used by all ptrace implementations for PTRACE_TRACEME.
221 */ 271 */
222int ptrace_traceme(void) 272static int ptrace_traceme(void)
223{ 273{
224 int ret = -EPERM; 274 int ret = -EPERM;
225 275
@@ -293,7 +343,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
293 return false; 343 return false;
294} 344}
295 345
296int ptrace_detach(struct task_struct *child, unsigned int data) 346static int ptrace_detach(struct task_struct *child, unsigned int data)
297{ 347{
298 bool dead = false; 348 bool dead = false;
299 349
@@ -312,8 +362,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
312 if (child->ptrace) { 362 if (child->ptrace) {
313 child->exit_code = data; 363 child->exit_code = data;
314 dead = __ptrace_detach(current, child); 364 dead = __ptrace_detach(current, child);
315 if (!child->exit_state)
316 wake_up_process(child);
317 } 365 }
318 write_unlock_irq(&tasklist_lock); 366 write_unlock_irq(&tasklist_lock);
319 367
@@ -329,6 +377,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
329 * and reacquire the lock. 377 * and reacquire the lock.
330 */ 378 */
331void exit_ptrace(struct task_struct *tracer) 379void exit_ptrace(struct task_struct *tracer)
380 __releases(&tasklist_lock)
381 __acquires(&tasklist_lock)
332{ 382{
333 struct task_struct *p, *n; 383 struct task_struct *p, *n;
334 LIST_HEAD(ptrace_dead); 384 LIST_HEAD(ptrace_dead);
@@ -402,7 +452,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
402 return copied; 452 return copied;
403} 453}
404 454
405static int ptrace_setoptions(struct task_struct *child, long data) 455static int ptrace_setoptions(struct task_struct *child, unsigned long data)
406{ 456{
407 child->ptrace &= ~PT_TRACE_MASK; 457 child->ptrace &= ~PT_TRACE_MASK;
408 458
@@ -481,7 +531,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
481#define is_sysemu_singlestep(request) 0 531#define is_sysemu_singlestep(request) 0
482#endif 532#endif
483 533
484static int ptrace_resume(struct task_struct *child, long request, long data) 534static int ptrace_resume(struct task_struct *child, long request,
535 unsigned long data)
485{ 536{
486 if (!valid_signal(data)) 537 if (!valid_signal(data))
487 return -EIO; 538 return -EIO;
@@ -511,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 } 562 }
512 563
513 child->exit_code = data; 564 child->exit_code = data;
514 wake_up_process(child); 565 wake_up_state(child, __TASK_TRACED);
515 566
516 return 0; 567 return 0;
517} 568}
@@ -558,10 +609,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
558#endif 609#endif
559 610
560int ptrace_request(struct task_struct *child, long request, 611int ptrace_request(struct task_struct *child, long request,
561 long addr, long data) 612 unsigned long addr, unsigned long data)
562{ 613{
563 int ret = -EIO; 614 int ret = -EIO;
564 siginfo_t siginfo; 615 siginfo_t siginfo;
616 void __user *datavp = (void __user *) data;
617 unsigned long __user *datalp = datavp;
565 618
566 switch (request) { 619 switch (request) {
567 case PTRACE_PEEKTEXT: 620 case PTRACE_PEEKTEXT:
@@ -578,19 +631,17 @@ int ptrace_request(struct task_struct *child, long request,
578 ret = ptrace_setoptions(child, data); 631 ret = ptrace_setoptions(child, data);
579 break; 632 break;
580 case PTRACE_GETEVENTMSG: 633 case PTRACE_GETEVENTMSG:
581 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 634 ret = put_user(child->ptrace_message, datalp);
582 break; 635 break;
583 636
584 case PTRACE_GETSIGINFO: 637 case PTRACE_GETSIGINFO:
585 ret = ptrace_getsiginfo(child, &siginfo); 638 ret = ptrace_getsiginfo(child, &siginfo);
586 if (!ret) 639 if (!ret)
587 ret = copy_siginfo_to_user((siginfo_t __user *) data, 640 ret = copy_siginfo_to_user(datavp, &siginfo);
588 &siginfo);
589 break; 641 break;
590 642
591 case PTRACE_SETSIGINFO: 643 case PTRACE_SETSIGINFO:
592 if (copy_from_user(&siginfo, (siginfo_t __user *) data, 644 if (copy_from_user(&siginfo, datavp, sizeof siginfo))
593 sizeof siginfo))
594 ret = -EFAULT; 645 ret = -EFAULT;
595 else 646 else
596 ret = ptrace_setsiginfo(child, &siginfo); 647 ret = ptrace_setsiginfo(child, &siginfo);
@@ -621,7 +672,7 @@ int ptrace_request(struct task_struct *child, long request,
621 } 672 }
622 mmput(mm); 673 mmput(mm);
623 674
624 ret = put_user(tmp, (unsigned long __user *) data); 675 ret = put_user(tmp, datalp);
625 break; 676 break;
626 } 677 }
627#endif 678#endif
@@ -650,7 +701,7 @@ int ptrace_request(struct task_struct *child, long request,
650 case PTRACE_SETREGSET: 701 case PTRACE_SETREGSET:
651 { 702 {
652 struct iovec kiov; 703 struct iovec kiov;
653 struct iovec __user *uiov = (struct iovec __user *) data; 704 struct iovec __user *uiov = datavp;
654 705
655 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) 706 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
656 return -EFAULT; 707 return -EFAULT;
@@ -691,7 +742,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
691#define arch_ptrace_attach(child) do { } while (0) 742#define arch_ptrace_attach(child) do { } while (0)
692#endif 743#endif
693 744
694SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) 745SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
746 unsigned long, data)
695{ 747{
696 struct task_struct *child; 748 struct task_struct *child;
697 long ret; 749 long ret;
@@ -732,7 +784,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
732 return ret; 784 return ret;
733} 785}
734 786
735int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 787int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
788 unsigned long data)
736{ 789{
737 unsigned long tmp; 790 unsigned long tmp;
738 int copied; 791 int copied;
@@ -743,7 +796,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
743 return put_user(tmp, (unsigned long __user *)data); 796 return put_user(tmp, (unsigned long __user *)data);
744} 797}
745 798
746int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) 799int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
800 unsigned long data)
747{ 801{
748 int copied; 802 int copied;
749 803
@@ -870,3 +924,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
870 return ret; 924 return ret;
871} 925}
872#endif /* CONFIG_COMPAT */ 926#endif /* CONFIG_COMPAT */
927
928#ifdef CONFIG_HAVE_HW_BREAKPOINT
929int ptrace_get_breakpoints(struct task_struct *tsk)
930{
931 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
932 return 0;
933
934 return -1;
935}
936
937void ptrace_put_breakpoints(struct task_struct *tsk)
938{
939 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
940 flush_ptrace_hw_breakpoint(tsk);
941}
942#endif /* CONFIG_HAVE_HW_BREAKPOINT */
diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb5..37fa9b99ad58 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
119 119
120int clean_sort_range(struct range *range, int az) 120int clean_sort_range(struct range *range, int az)
121{ 121{
122 int i, j, k = az - 1, nr_range = 0; 122 int i, j, k = az - 1, nr_range = az;
123 123
124 for (i = 0; i < k; i++) { 124 for (i = 0; i < k; i++) {
125 if (range[i].end) 125 if (range[i].end)
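The one-character fix above matters when the array has no empty slots: the final count is found by scanning for the first hole after compaction, so the starting value has to be "all entries", not zero. A standalone sketch of that counting step, with the range data invented for illustration:

    /* Sketch only: count used entries by finding the first hole, default to all. */
    #include <stdio.h>

    struct range { unsigned long start, end; };   /* end == 0 means "empty" */

    static int count_used(struct range *range, int az)
    {
        int i, nr_range = az;                     /* the fix: assume all used */

        for (i = 0; i < az; i++) {
            if (!range[i].end) {                  /* first hole marks the count */
                nr_range = i;
                break;
            }
        }
        return nr_range;
    }

    int main(void)
    {
        struct range full[3]  = { {0, 10}, {20, 30}, {40, 50} };
        struct range holes[3] = { {0, 10}, {0, 0}, {0, 0} };

        printf("full array: %d used\n", count_used(full, 3));    /* 3, not 0 */
        printf("with holes: %d used\n", count_used(holes, 3));   /* 1 */
        return 0;
    }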
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4d169835fb36..7784bd216b6a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
74 74
75/** 75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? 76 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
77 * 77 *
78 * Check for bottom half being disabled, which covers both the 78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses 79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) 80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation. 81 * will show the situation. This is useful for debug checks in functions
82 * that require that they be called within an RCU read-side critical
83 * section.
82 * 84 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 85 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
84 */ 86 */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
86{ 88{
87 if (!debug_lockdep_rcu_enabled()) 89 if (!debug_lockdep_rcu_enabled())
88 return 1; 90 return 1;
89 return in_softirq(); 91 return in_softirq() || irqs_disabled();
90} 92}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92 94
@@ -140,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
140 * Ensure that queued callbacks are all executed. 142 * Ensure that queued callbacks are all executed.
141 * If we detect that we are nested in a RCU read-side critical 143 * If we detect that we are nested in a RCU read-side critical
142 * section, we should simply fail, otherwise we would deadlock. 144 * section, we should simply fail, otherwise we would deadlock.
145 * In !PREEMPT configurations, there is no way to tell if we are
146 * in a RCU read-side critical section or not, so we never
147 * attempt any fixup and just print a warning.
143 */ 148 */
149#ifndef CONFIG_PREEMPT
150 WARN_ON_ONCE(1);
151 return 0;
152#endif
144 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 153 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
145 irqs_disabled()) { 154 irqs_disabled()) {
146 WARN_ON(1); 155 WARN_ON_ONCE(1);
147 return 0; 156 return 0;
148 } 157 }
149 rcu_barrier(); 158 rcu_barrier();
@@ -182,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
182 * Ensure that queued callbacks are all executed. 191 * Ensure that queued callbacks are all executed.
183 * If we detect that we are nested in a RCU read-side critical 192 * If we detect that we are nested in a RCU read-side critical
184 * section, we should simply fail, otherwise we would deadlock. 193 * section, we should simply fail, otherwise we would deadlock.
194 * In !PREEMPT configurations, there is no way to tell if we are
195 * in a RCU read-side critical section or not, so we never
196 * attempt any fixup and just print a warning.
185 */ 197 */
198#ifndef CONFIG_PREEMPT
199 WARN_ON_ONCE(1);
200 return 0;
201#endif
186 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 202 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
187 irqs_disabled()) { 203 irqs_disabled()) {
188 WARN_ON(1); 204 WARN_ON_ONCE(1);
189 return 0; 205 return 0;
190 } 206 }
191 rcu_barrier(); 207 rcu_barrier();
@@ -212,14 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
212 * Ensure that queued callbacks are all executed. 228 * Ensure that queued callbacks are all executed.
213 * If we detect that we are nested in a RCU read-side critical 229 * If we detect that we are nested in a RCU read-side critical
214 * section, we should simply fail, otherwise we would deadlock. 230 * section, we should simply fail, otherwise we would deadlock.
231 * In !PREEMPT configurations, there is no way to tell if we are
232 * in a RCU read-side critical section or not, so we never
233 * attempt any fixup and just print a warning.
215 */ 234 */
216#ifndef CONFIG_PREEMPT 235#ifndef CONFIG_PREEMPT
217 WARN_ON(1); 236 WARN_ON_ONCE(1);
218 return 0; 237 return 0;
219#else 238#endif
220 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 239 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
221 irqs_disabled()) { 240 irqs_disabled()) {
222 WARN_ON(1); 241 WARN_ON_ONCE(1);
223 return 0; 242 return 0;
224 } 243 }
225 rcu_barrier(); 244 rcu_barrier();
@@ -227,7 +246,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
227 rcu_barrier_bh(); 246 rcu_barrier_bh();
228 debug_object_free(head, &rcuhead_debug_descr); 247 debug_object_free(head, &rcuhead_debug_descr);
229 return 1; 248 return 1;
230#endif
231 default: 249 default:
232 return 0; 250 return 0;
233 } 251 }
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 196ec02f8be0..7bbac7d0f5ab 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -35,29 +35,23 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/prefetch.h>
39
40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
41static struct task_struct *rcu_kthread_task;
42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
43static unsigned long have_rcu_kthread_work;
44
45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void);
48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static int rcu_kthread(void *arg);
50static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp);
38 53
39/* Global control variables for rcupdate callback mechanism. */ 54#include "rcutiny_plugin.h"
40struct rcu_ctrlblk {
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
43 struct rcu_head **curtail; /* ->next pointer of last CB. */
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 55
62#ifdef CONFIG_NO_HZ 56#ifdef CONFIG_NO_HZ
63 57
@@ -86,36 +80,45 @@ void rcu_exit_nohz(void)
86#endif /* #ifdef CONFIG_NO_HZ */ 80#endif /* #ifdef CONFIG_NO_HZ */
87 81
88/* 82/*
89 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). 83 * Helper function for rcu_sched_qs() and rcu_bh_qs().
90 * Also disable irqs to avoid confusion due to interrupt handlers 84 * Also irqs are disabled to avoid confusion due to interrupt handlers
91 * invoking call_rcu(). 85 * invoking call_rcu().
92 */ 86 */
93static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 87static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
94{ 88{
95 unsigned long flags;
96
97 local_irq_save(flags);
98 if (rcp->rcucblist != NULL && 89 if (rcp->rcucblist != NULL &&
99 rcp->donetail != rcp->curtail) { 90 rcp->donetail != rcp->curtail) {
100 rcp->donetail = rcp->curtail; 91 rcp->donetail = rcp->curtail;
101 local_irq_restore(flags);
102 return 1; 92 return 1;
103 } 93 }
104 local_irq_restore(flags);
105 94
106 return 0; 95 return 0;
107} 96}
108 97
109/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
110 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
111 * are at it, given that any rcu quiescent state is also an rcu_bh 110 * are at it, given that any rcu quiescent state is also an rcu_bh
112 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 111 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
113 */ 112 */
114void rcu_sched_qs(int cpu) 113void rcu_sched_qs(int cpu)
115{ 114{
115 unsigned long flags;
116
117 local_irq_save(flags);
116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 rcu_qsctr_help(&rcu_bh_ctrlblk))
118 raise_softirq(RCU_SOFTIRQ); 120 invoke_rcu_kthread();
121 local_irq_restore(flags);
119} 122}
120 123
121/* 124/*
@@ -123,8 +126,12 @@ void rcu_sched_qs(int cpu)
123 */ 126 */
124void rcu_bh_qs(int cpu) 127void rcu_bh_qs(int cpu)
125{ 128{
129 unsigned long flags;
130
131 local_irq_save(flags);
126 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 132 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
127 raise_softirq(RCU_SOFTIRQ); 133 invoke_rcu_kthread();
134 local_irq_restore(flags);
128} 135}
129 136
130/* 137/*
@@ -140,16 +147,18 @@ void rcu_check_callbacks(int cpu, int user)
140 rcu_sched_qs(cpu); 147 rcu_sched_qs(cpu);
141 else if (!in_softirq()) 148 else if (!in_softirq())
142 rcu_bh_qs(cpu); 149 rcu_bh_qs(cpu);
150 rcu_preempt_check_callbacks();
143} 151}
144 152
145/* 153/*
146 * Helper function for rcu_process_callbacks() that operates on the 154 * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
147 * specified rcu_ctrlkblk structure. 155 * whose grace period has elapsed.
148 */ 156 */
149static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 157static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
150{ 158{
151 struct rcu_head *next, *list; 159 struct rcu_head *next, *list;
152 unsigned long flags; 160 unsigned long flags;
161 RCU_TRACE(int cb_count = 0);
153 162
154 /* If no RCU callbacks ready to invoke, just return. */ 163 /* If no RCU callbacks ready to invoke, just return. */
155 if (&rcp->rcucblist == rcp->donetail) 164 if (&rcp->rcucblist == rcp->donetail)
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 *rcp->donetail = NULL; 171 *rcp->donetail = NULL;
163 if (rcp->curtail == rcp->donetail) 172 if (rcp->curtail == rcp->donetail)
164 rcp->curtail = &rcp->rcucblist; 173 rcp->curtail = &rcp->rcucblist;
174 rcu_preempt_remove_callbacks(rcp);
165 rcp->donetail = &rcp->rcucblist; 175 rcp->donetail = &rcp->rcucblist;
166 local_irq_restore(flags); 176 local_irq_restore(flags);
167 177
@@ -170,18 +180,45 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
170 next = list->next; 180 next = list->next;
171 prefetch(next); 181 prefetch(next);
172 debug_rcu_head_unqueue(list); 182 debug_rcu_head_unqueue(list);
173 list->func(list); 183 local_bh_disable();
184 __rcu_reclaim(list);
185 local_bh_enable();
174 list = next; 186 list = next;
187 RCU_TRACE(cb_count++);
175 } 188 }
189 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
176} 190}
177 191
178/* 192/*
179 * Invoke any callbacks whose grace period has completed. 193 * This kthread invokes RCU callbacks whose grace periods have
194 * elapsed. It is awakened as needed, and takes the place of the
195 * RCU_SOFTIRQ that was used previously for this purpose.
196 * This is a kthread, but it is never stopped, at least not until
197 * the system goes down.
180 */ 198 */
181static void rcu_process_callbacks(struct softirq_action *unused) 199static int rcu_kthread(void *arg)
182{ 200{
183 __rcu_process_callbacks(&rcu_sched_ctrlblk); 201 unsigned long work;
184 __rcu_process_callbacks(&rcu_bh_ctrlblk); 202 unsigned long morework;
203 unsigned long flags;
204
205 for (;;) {
206 wait_event_interruptible(rcu_kthread_wq,
207 have_rcu_kthread_work != 0);
208 morework = rcu_boost();
209 local_irq_save(flags);
210 work = have_rcu_kthread_work;
211 have_rcu_kthread_work = morework;
212 local_irq_restore(flags);
213 if (work) {
214 rcu_process_callbacks(&rcu_sched_ctrlblk);
215 rcu_process_callbacks(&rcu_bh_ctrlblk);
216 rcu_preempt_process_callbacks();
217 }
218 schedule_timeout_interruptible(1); /* Leave CPU for others. */
219 }
220
221 return 0; /* Not reached, but needed to shut gcc up. */
185} 222}
186 223
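The kthread above follows a common wake-a-worker shape: a flag records that work is pending, the waker signals, and the worker clears the flag before processing. A standalone sketch of the same pattern with POSIX threads, where the "processing" and the exit condition are invented for illustration:

    /* Sketch only: flag-plus-condvar worker, mirroring invoke_rcu_kthread()/rcu_kthread(). */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static int have_work;
    static int done;

    static void kick_worker(void)              /* ~ invoke_rcu_kthread() */
    {
        pthread_mutex_lock(&lock);
        have_work = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
    }

    static void *worker(void *arg)             /* ~ rcu_kthread() */
    {
        (void)arg;
        for (;;) {
            pthread_mutex_lock(&lock);
            while (!have_work && !done)        /* ~ wait_event_interruptible() */
                pthread_cond_wait(&cond, &lock);
            if (done && !have_work) {
                pthread_mutex_unlock(&lock);
                break;
            }
            have_work = 0;                     /* claim the pending work */
            pthread_mutex_unlock(&lock);

            printf("worker: processing callbacks\n");
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        kick_worker();

        pthread_mutex_lock(&lock);             /* tell the worker to stop */
        done = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);

        pthread_join(tid, NULL);
        return 0;
    }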
187/* 224/*
@@ -219,19 +256,20 @@ static void __call_rcu(struct rcu_head *head,
219 local_irq_save(flags); 256 local_irq_save(flags);
220 *rcp->curtail = head; 257 *rcp->curtail = head;
221 rcp->curtail = &head->next; 258 rcp->curtail = &head->next;
259 RCU_TRACE(rcp->qlen++);
222 local_irq_restore(flags); 260 local_irq_restore(flags);
223} 261}
224 262
225/* 263/*
226 * Post an RCU callback to be invoked after the end of an RCU grace 264 * Post an RCU callback to be invoked after the end of an RCU-sched grace
227 * period. But since we have but one CPU, that would be after any 265 * period. But since we have but one CPU, that would be after any
228 * quiescent state. 266 * quiescent state.
229 */ 267 */
230void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 268void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
231{ 269{
232 __call_rcu(head, func, &rcu_sched_ctrlblk); 270 __call_rcu(head, func, &rcu_sched_ctrlblk);
233} 271}
234EXPORT_SYMBOL_GPL(call_rcu); 272EXPORT_SYMBOL_GPL(call_rcu_sched);
235 273
236/* 274/*
237 * Post an RCU bottom-half callback to be invoked after any subsequent 275 * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +281,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
243} 281}
244EXPORT_SYMBOL_GPL(call_rcu_bh); 282EXPORT_SYMBOL_GPL(call_rcu_bh);
245 283
246void rcu_barrier(void)
247{
248 struct rcu_synchronize rcu;
249
250 init_rcu_head_on_stack(&rcu.head);
251 init_completion(&rcu.completion);
252 /* Will wake me after RCU finished. */
253 call_rcu(&rcu.head, wakeme_after_rcu);
254 /* Wait for it. */
255 wait_for_completion(&rcu.completion);
256 destroy_rcu_head_on_stack(&rcu.head);
257}
258EXPORT_SYMBOL_GPL(rcu_barrier);
259
260void rcu_barrier_bh(void) 284void rcu_barrier_bh(void)
261{ 285{
262 struct rcu_synchronize rcu; 286 struct rcu_synchronize rcu;
@@ -285,9 +309,16 @@ void rcu_barrier_sched(void)
285} 309}
286EXPORT_SYMBOL_GPL(rcu_barrier_sched); 310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
287 311
288void __init rcu_init(void) 312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
289{ 316{
290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 317 struct sched_param sp;
291}
292 318
293#include "rcutiny_plugin.h" 319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
323}
324early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc742..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -17,23 +17,991 @@
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 * 19 *
20 * Copyright IBM Corporation, 2009 20 * Copyright (c) 2010 Linaro
21 * 21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/debugfs.h>
27#include <linux/seq_file.h>
28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */
41};
42
43/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist,
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52};
53
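
Throughout rcutiny, a callback list is a NULL-terminated singly linked list whose ->donetail and ->curtail fields point at the ->next pointer of the last callback whose grace period has ended and of the last callback queued, so both enqueueing and marking a batch done are O(1) pointer updates with no list walking. Purely as an illustrative userspace sketch, with hypothetical names (cb, enqueue, advance_done, invoke_done) and none of the kernel's locking or tracing:

/* Userspace sketch of a tail-pointer callback list (hypothetical names). */
#include <stddef.h>
#include <stdio.h>

struct cb {
        struct cb *next;
        void (*func)(struct cb *);
};

static struct cb *list;                  /* like ->rcucblist */
static struct cb **donetail = &list;     /* ->next of last callback with GP done */
static struct cb **curtail = &list;      /* ->next of last callback queued */

static void enqueue(struct cb *c)        /* like __call_rcu() */
{
        c->next = NULL;
        *curtail = c;
        curtail = &c->next;
}

static void advance_done(void)           /* like the end of a grace period */
{
        donetail = curtail;
}

static void invoke_done(void)            /* like rcu_process_callbacks() */
{
        struct cb *done = list, *next;

        if (donetail == &list)
                return;                  /* no callbacks whose grace period ended */
        list = *donetail;                /* detach the "done" segment */
        *donetail = NULL;
        if (curtail == donetail)
                curtail = &list;
        donetail = &list;
        for (; done != NULL; done = next) {
                next = done->next;
                done->func(done);
        }
}

static void say(struct cb *c) { printf("callback %p invoked\n", (void *)c); }

int main(void)
{
        struct cb a = { .func = say }, b = { .func = say };

        enqueue(&a);
        advance_done();     /* a's grace period "ends" */
        enqueue(&b);        /* b is still waiting */
        invoke_done();      /* invokes a only */
        return 0;
}

Running the sketch invokes only the first callback, since the second was queued after the simulated grace period ended.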
25#ifdef CONFIG_DEBUG_LOCK_ALLOC 54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55int rcu_scheduler_active __read_mostly;
56EXPORT_SYMBOL_GPL(rcu_scheduler_active);
57#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58
59#ifdef CONFIG_TINY_PREEMPT_RCU
60
61#include <linux/delay.h>
62
63/* Global control variables for preemptible RCU. */
64struct rcu_preempt_ctrlblk {
65 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
66 struct rcu_head **nexttail;
67 /* Tasks blocked in a preemptible RCU */
68 /* read-side critical section while a */
69 /* preemptible-RCU grace period is in */
70 /* progress must wait for a later grace */
71 /* period. This pointer points to the */
72 /* ->next pointer of the last task that */
73 /* must wait for a later grace period, or */
74 /* to &->rcb.rcucblist if there is no */
75 /* such task. */
76 struct list_head blkd_tasks;
77 /* Tasks blocked in RCU read-side critical */
78 /* section. Tasks are placed at the head */
79 /* of this list and age towards the tail. */
80 struct list_head *gp_tasks;
81 /* Pointer to the first task blocking the */
82 /* current grace period, or NULL if there */
83 /* is no such task. */
84 struct list_head *exp_tasks;
85 /* Pointer to first task blocking the */
86 /* current expedited grace period, or NULL */
87 /* if there is no such task. If there */
88 /* is no current expedited grace period, */
89 /* then there cannot be any such task. */
90#ifdef CONFIG_RCU_BOOST
91 struct list_head *boost_tasks;
92 /* Pointer to first task that needs to be */
93 /* priority-boosted, or NULL if no priority */
94 /* boosting is needed. If there is no */
95 /* current or expedited grace period, there */
96 /* can be no such task. */
97#endif /* #ifdef CONFIG_RCU_BOOST */
98 u8 gpnum; /* Current grace period. */
99 u8 gpcpu; /* Last grace period blocked by the CPU. */
100 u8 completed; /* Last grace period completed. */
101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST
103 unsigned long boost_time; /* When to start boosting (jiffies) */
104#endif /* #ifdef CONFIG_RCU_BOOST */
105#ifdef CONFIG_RCU_TRACE
106 unsigned long n_grace_periods;
107#ifdef CONFIG_RCU_BOOST
108 unsigned long n_tasks_boosted;
109 /* Total number of tasks boosted. */
110 unsigned long n_exp_boosts;
111 /* Number of tasks boosted for expedited GP. */
112 unsigned long n_normal_boosts;
113 /* Number of tasks boosted for normal GP. */
114 unsigned long n_balk_blkd_tasks;
115 /* Refused to boost: no blocked tasks. */
116 unsigned long n_balk_exp_gp_tasks;
117 /* Refused to boost: nothing blocking GP. */
118 unsigned long n_balk_boost_tasks;
119 /* Refused to boost: already boosting. */
120 unsigned long n_balk_notyet;
121 /* Refused to boost: not yet time. */
122 unsigned long n_balk_nos;
123 /* Refused to boost: not sure why, though. */
124 /* This can happen due to race conditions. */
125#endif /* #ifdef CONFIG_RCU_BOOST */
126#endif /* #ifdef CONFIG_RCU_TRACE */
127};
128
129static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
130 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
131 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
132 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
133 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
134};
135
136static int rcu_preempted_readers_exp(void);
137static void rcu_report_exp_done(void);
138
139/*
140 * Return true if the CPU has not yet responded to the current grace period.
141 */
142static int rcu_cpu_blocking_cur_gp(void)
143{
144 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
145}
146
147/*
148 * Check for a running RCU reader. Because there is only one CPU,
149 * there can be but one running RCU reader at a time. ;-)
150 */
151static int rcu_preempt_running_reader(void)
152{
153 return current->rcu_read_lock_nesting;
154}
155
156/*
157 * Check for preempted RCU readers blocking any grace period.
158 * If the caller needs a reliable answer, it must disable hard irqs.
159 */
160static int rcu_preempt_blocked_readers_any(void)
161{
162 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
163}
164
165/*
166 * Check for preempted RCU readers blocking the current grace period.
167 * If the caller needs a reliable answer, it must disable hard irqs.
168 */
169static int rcu_preempt_blocked_readers_cgp(void)
170{
171 return rcu_preempt_ctrlblk.gp_tasks != NULL;
172}
173
174/*
175 * Return true if another preemptible-RCU grace period is needed.
176 */
177static int rcu_preempt_needs_another_gp(void)
178{
179 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
180}
181
182/*
183 * Return true if a preemptible-RCU grace period is in progress.
184 * The caller must disable hardirqs.
185 */
186static int rcu_preempt_gp_in_progress(void)
187{
188 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
189}
190
191/*
192 * Advance a ->blkd_tasks-list pointer to the next entry, instead
193 * returning NULL if at the end of the list.
194 */
195static struct list_head *rcu_next_node_entry(struct task_struct *t)
196{
197 struct list_head *np;
198
199 np = t->rcu_node_entry.next;
200 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
201 np = NULL;
202 return np;
203}
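
rcu_next_node_entry() exists so that code removing a task from ->blkd_tasks (see rcu_read_unlock_special() later in this file) can advance ->gp_tasks, ->exp_tasks and ->boost_tasks past the departing entry, falling back to NULL at the end of the list. A small userspace sketch of that next-or-NULL advance, using a simplified circular doubly linked list rather than the kernel's list_head; all names here are hypothetical:

/* Userspace sketch of advancing a pointer into ->blkd_tasks on removal. */
#include <stddef.h>
#include <stdio.h>

struct node {
        struct node *next, *prev;
};

static struct node blkd = { &blkd, &blkd };      /* circular list head */
static struct node *gp_tasks;                    /* first task blocking current GP */

static void add_head(struct node *n)             /* new readers go to the head */
{
        n->next = blkd.next;
        n->prev = &blkd;
        blkd.next->prev = n;
        blkd.next = n;
}

static struct node *next_or_null(struct node *n) /* like rcu_next_node_entry() */
{
        return n->next == &blkd ? NULL : n->next;
}

static void remove_node(struct node *n)          /* like the cleanup on unlock */
{
        if (n == gp_tasks)
                gp_tasks = next_or_null(n);      /* keep gp_tasks valid */
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

int main(void)
{
        struct node a, b;

        add_head(&a);          /* older reader, closer to the tail */
        add_head(&b);          /* newest reader sits at the head */
        gp_tasks = &a;         /* only 'a' blocks the current grace period */
        remove_node(&a);       /* 'a' exits its read-side critical section */
        printf("gp_tasks now %s\n", gp_tasks ? "non-NULL" : "NULL");
        return 0;
}

When the removed task was the last one blocking the grace period, the pointer becomes NULL, which is exactly the condition the callers test before reporting a quiescent state.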
204
205#ifdef CONFIG_RCU_TRACE
206
207#ifdef CONFIG_RCU_BOOST
208static void rcu_initiate_boost_trace(void);
209#endif /* #ifdef CONFIG_RCU_BOOST */
210
211/*
212 * Dump additional statistics for TINY_PREEMPT_RCU.
213 */
214static void show_tiny_preempt_stats(struct seq_file *m)
215{
216 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
217 rcu_preempt_ctrlblk.rcb.qlen,
218 rcu_preempt_ctrlblk.n_grace_periods,
219 rcu_preempt_ctrlblk.gpnum,
220 rcu_preempt_ctrlblk.gpcpu,
221 rcu_preempt_ctrlblk.completed,
222 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
223 "N."[!rcu_preempt_ctrlblk.gp_tasks],
224 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
225#ifdef CONFIG_RCU_BOOST
226 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
227 " ",
228 "B."[!rcu_preempt_ctrlblk.boost_tasks],
229 rcu_preempt_ctrlblk.n_tasks_boosted,
230 rcu_preempt_ctrlblk.n_exp_boosts,
231 rcu_preempt_ctrlblk.n_normal_boosts,
232 (int)(jiffies & 0xffff),
233 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
234 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
235 " balk",
236 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
237 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
238 rcu_preempt_ctrlblk.n_balk_boost_tasks,
239 rcu_preempt_ctrlblk.n_balk_notyet,
240 rcu_preempt_ctrlblk.n_balk_nos);
241#endif /* #ifdef CONFIG_RCU_BOOST */
242}
243
244#endif /* #ifdef CONFIG_RCU_TRACE */
245
246#ifdef CONFIG_RCU_BOOST
247
248#include "rtmutex_common.h"
249
250/*
251 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
252 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
253 */
254static int rcu_boost(void)
255{
256 unsigned long flags;
257 struct rt_mutex mtx;
258 struct task_struct *t;
259 struct list_head *tb;
260
261 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
262 rcu_preempt_ctrlblk.exp_tasks == NULL)
263 return 0; /* Nothing to boost. */
264
265 raw_local_irq_save(flags);
266
267 /*
268 * Recheck with irqs disabled: all tasks in need of boosting
269 * might exit their RCU read-side critical sections on their own
270 * if we are preempted just before disabling irqs.
271 */
272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
274 raw_local_irq_restore(flags);
275 return 0;
276 }
277
278 /*
279 * Preferentially boost tasks blocking expedited grace periods.
280 * This cannot starve the normal grace periods because a second
281 * expedited grace period must boost all blocked tasks, including
282 * those blocking the pre-existing normal grace period.
283 */
284 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
285 tb = rcu_preempt_ctrlblk.exp_tasks;
286 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
287 } else {
288 tb = rcu_preempt_ctrlblk.boost_tasks;
289 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
290 }
291 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
292
293 /*
294 * We boost task t by manufacturing an rt_mutex that appears to
295 * be held by task t. We leave a pointer to that rt_mutex where
296 * task t can find it, and task t will release the mutex when it
297 * exits its outermost RCU read-side critical section. Then
298 * simply acquiring this artificial rt_mutex will boost task
299 * t's priority. (Thanks to tglx for suggesting this approach!)
300 */
301 t = container_of(tb, struct task_struct, rcu_node_entry);
302 rt_mutex_init_proxy_locked(&mtx, t);
303 t->rcu_boost_mutex = &mtx;
304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
305 raw_local_irq_restore(flags);
306 rt_mutex_lock(&mtx);
307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
308
309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
310 rcu_preempt_ctrlblk.exp_tasks != NULL;
311}
312
313/*
314 * Check to see if it is now time to start boosting RCU readers blocking
315 * the current grace period, and, if so, tell the rcu_kthread_task to
316 * start boosting them. If there is an expedited boost in progress,
317 * we wait for it to complete.
318 *
319 * If there are no blocked readers blocking the current grace period,
320 * return 0 to let the caller know, otherwise return 1. Note that this
321 * return value is independent of whether or not boosting was done.
322 */
323static int rcu_initiate_boost(void)
324{
325 if (!rcu_preempt_blocked_readers_cgp() &&
326 rcu_preempt_ctrlblk.exp_tasks == NULL) {
327 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
328 return 0;
329 }
330 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
331 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
332 rcu_preempt_ctrlblk.boost_tasks == NULL &&
333 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks;
337 invoke_rcu_kthread();
338 } else
339 RCU_TRACE(rcu_initiate_boost_trace());
340 return 1;
341}
342
343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
344
345/*
346 * Do priority-boost accounting for the start of a new grace period.
347 */
348static void rcu_preempt_boost_start_gp(void)
349{
350 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
351}
352
353#else /* #ifdef CONFIG_RCU_BOOST */
354
355/*
356 * If there is no RCU priority boosting, we don't boost.
357 */
358static int rcu_boost(void)
359{
360 return 0;
361}
362
363/*
364 * If there is no RCU priority boosting, we don't initiate boosting,
365 * but we do indicate whether there are blocked readers blocking the
366 * current grace period.
367 */
368static int rcu_initiate_boost(void)
369{
370 return rcu_preempt_blocked_readers_cgp();
371}
372
373/*
374 * If there is no RCU priority boosting, nothing to do at grace-period start.
375 */
376static void rcu_preempt_boost_start_gp(void)
377{
378}
379
380#endif /* else #ifdef CONFIG_RCU_BOOST */
381
382/*
383 * Record a preemptible-RCU quiescent state for the specified CPU. Note
384 * that this just means that the task currently running on the CPU is
385 * in a quiescent state. There might be any number of tasks blocked
386 * while in an RCU read-side critical section.
387 *
388 * Unlike the other rcu_*_qs() functions, callers to this function
389 * must disable irqs in order to protect the assignment to
390 * ->rcu_read_unlock_special.
391 *
392 * Because this is a single-CPU implementation, the only way a grace
393 * period can end is if the CPU is in a quiescent state. The reason is
394 * that a blocked preemptible-RCU reader can exit its critical section
395 * only if the CPU is running it at the time. Therefore, when the
396 * last task blocking the current grace period exits its RCU read-side
397 * critical section, neither the CPU nor blocked tasks will be stopping
398 * the current grace period. (In contrast, SMP implementations
399 * might have CPUs running in RCU read-side critical sections that
400 * block later grace periods -- but this is not possible given only
401 * one CPU.)
402 */
403static void rcu_preempt_cpu_qs(void)
404{
405 /* Record both CPU and task as having responded to current GP. */
406 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
407 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
408
409 /* If there is no GP then there is nothing more to do. */
410 if (!rcu_preempt_gp_in_progress())
411 return;
412 /*
413 * Check up on boosting. If there are readers blocking the
414 * current grace period, leave.
415 */
416 if (rcu_initiate_boost())
417 return;
418
419 /* Advance callbacks. */
420 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
421 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
422 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
423
424 /* If there are no blocked readers, next GP is done instantly. */
425 if (!rcu_preempt_blocked_readers_any())
426 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
427
428 /* If there are done callbacks, cause them to be invoked. */
429 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
430 invoke_rcu_kthread();
431}
432
433/*
434 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
435 */
436static void rcu_preempt_start_gp(void)
437{
438 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
439
440 /* Official start of GP. */
441 rcu_preempt_ctrlblk.gpnum++;
442 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
443
444 /* Any blocked RCU readers block new GP. */
445 if (rcu_preempt_blocked_readers_any())
446 rcu_preempt_ctrlblk.gp_tasks =
447 rcu_preempt_ctrlblk.blkd_tasks.next;
448
449 /* Set up for RCU priority boosting. */
450 rcu_preempt_boost_start_gp();
451
452 /* If there is no running reader, CPU is done with GP. */
453 if (!rcu_preempt_running_reader())
454 rcu_preempt_cpu_qs();
455 }
456}
457
458/*
459 * We have entered the scheduler, and the current task might soon be
460 * context-switched away from. If this task is in an RCU read-side
461 * critical section, we will no longer be able to rely on the CPU to
462 * record that fact, so we enqueue the task on the blkd_tasks list.
463 * If the task started after the current grace period began, as recorded
464 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
465 * before the element referenced by ->gp_tasks (or at the tail if
466 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
467 * The task will dequeue itself when it exits the outermost enclosing
468 * RCU read-side critical section. Therefore, the current grace period
469 * cannot be permitted to complete until the ->gp_tasks pointer becomes
470 * NULL.
471 *
472 * Caller must disable preemption.
473 */
474void rcu_preempt_note_context_switch(void)
475{
476 struct task_struct *t = current;
477 unsigned long flags;
478
479 local_irq_save(flags); /* must exclude scheduler_tick(). */
480 if (rcu_preempt_running_reader() &&
481 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
482
483 /* Possibly blocking in an RCU read-side critical section. */
484 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
485
486 /*
487 * If this CPU has already checked in, then this task
488 * will hold up the next grace period rather than the
489 * current grace period. Queue the task accordingly.
490 * If the task is queued for the current grace period
491 * (i.e., this CPU has not yet passed through a quiescent
492 * state for the current grace period), then as long
493 * as that task remains queued, the current grace period
494 * cannot end.
495 */
496 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
497 if (rcu_cpu_blocking_cur_gp())
498 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
499 }
500
501 /*
502 * Either we were not in an RCU read-side critical section to
503 * begin with, or we have now recorded that critical section
504 * globally. Either way, we can now note a quiescent state
505 * for this CPU. Again, if we were in an RCU read-side critical
506 * section, and if that critical section was blocking the current
507 * grace period, then the fact that the task has been enqueued
508 * means that current grace period continues to be blocked.
509 */
510 rcu_preempt_cpu_qs();
511 local_irq_restore(flags);
512}
513
514/*
515 * Tiny-preemptible RCU implementation for rcu_read_lock().
516 * Just increment ->rcu_read_lock_nesting, shared state will be updated
517 * if we block.
518 */
519void __rcu_read_lock(void)
520{
521 current->rcu_read_lock_nesting++;
522 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
523}
524EXPORT_SYMBOL_GPL(__rcu_read_lock);
525
526/*
527 * Handle special cases during rcu_read_unlock(), such as needing to
528 * notify RCU core processing or task having blocked during the RCU
529 * read-side critical section.
530 */
531static void rcu_read_unlock_special(struct task_struct *t)
532{
533 int empty;
534 int empty_exp;
535 unsigned long flags;
536 struct list_head *np;
537 int special;
538
539 /*
540 * NMI handlers cannot block and cannot safely manipulate state.
541 * They therefore cannot possibly be special, so just leave.
542 */
543 if (in_nmi())
544 return;
545
546 local_irq_save(flags);
547
548 /*
549 * If RCU core is waiting for this CPU to exit critical section,
550 * let it know that we have done so.
551 */
552 special = t->rcu_read_unlock_special;
553 if (special & RCU_READ_UNLOCK_NEED_QS)
554 rcu_preempt_cpu_qs();
555
556 /* Hardware IRQ handlers cannot block. */
557 if (in_irq()) {
558 local_irq_restore(flags);
559 return;
560 }
561
562 /* Clean up if blocked during RCU read-side critical section. */
563 if (special & RCU_READ_UNLOCK_BLOCKED) {
564 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
565
566 /*
567 * Remove this task from the ->blkd_tasks list and adjust
568 * any pointers that might have been referencing it.
569 */
570 empty = !rcu_preempt_blocked_readers_cgp();
571 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
572 np = rcu_next_node_entry(t);
573 list_del_init(&t->rcu_node_entry);
574 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
575 rcu_preempt_ctrlblk.gp_tasks = np;
576 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
577 rcu_preempt_ctrlblk.exp_tasks = np;
578#ifdef CONFIG_RCU_BOOST
579 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
580 rcu_preempt_ctrlblk.boost_tasks = np;
581#endif /* #ifdef CONFIG_RCU_BOOST */
582
583 /*
584 * If this was the last task on the current list, and if
585 * we aren't waiting on the CPU, report the quiescent state
586 * and start a new grace period if needed.
587 */
588 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
589 rcu_preempt_cpu_qs();
590 rcu_preempt_start_gp();
591 }
26 592
593 /*
594 * If this was the last task on the expedited lists,
595 * then we need to wake up the waiting task.
596 */
597 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
598 rcu_report_exp_done();
599 }
600#ifdef CONFIG_RCU_BOOST
601 /* Unboost self if was boosted. */
602 if (special & RCU_READ_UNLOCK_BOOSTED) {
603 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
604 rt_mutex_unlock(t->rcu_boost_mutex);
605 t->rcu_boost_mutex = NULL;
606 }
607#endif /* #ifdef CONFIG_RCU_BOOST */
608 local_irq_restore(flags);
609}
610
611/*
612 * Tiny-preemptible RCU implementation for rcu_read_unlock().
613 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
614 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
615 * invoke rcu_read_unlock_special() to clean up after a context switch
616 * in an RCU read-side critical section and other special cases.
617 */
618void __rcu_read_unlock(void)
619{
620 struct task_struct *t = current;
621
622 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
623 --t->rcu_read_lock_nesting;
624 barrier(); /* decrement before load of ->rcu_read_unlock_special */
625 if (t->rcu_read_lock_nesting == 0 &&
626 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
627 rcu_read_unlock_special(t);
628#ifdef CONFIG_PROVE_LOCKING
629 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
630#endif /* #ifdef CONFIG_PROVE_LOCKING */
631}
632EXPORT_SYMBOL_GPL(__rcu_read_unlock);
633
634/*
635 * Check for a quiescent state from the current CPU. When a task blocks,
636 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
637 * checked elsewhere. This is called from the scheduling-clock interrupt.
638 *
639 * Caller must disable hard irqs.
640 */
641static void rcu_preempt_check_callbacks(void)
642{
643 struct task_struct *t = current;
644
645 if (rcu_preempt_gp_in_progress() &&
646 (!rcu_preempt_running_reader() ||
647 !rcu_cpu_blocking_cur_gp()))
648 rcu_preempt_cpu_qs();
649 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
650 rcu_preempt_ctrlblk.rcb.donetail)
651 invoke_rcu_kthread();
652 if (rcu_preempt_gp_in_progress() &&
653 rcu_cpu_blocking_cur_gp() &&
654 rcu_preempt_running_reader())
655 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
656}
657
658/*
659 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
660 * update, so this is invoked from rcu_process_callbacks() to
661 * handle that case. Of course, it is invoked for all flavors of
662 * RCU, but RCU callbacks can appear only on one of the lists, and
663 * neither ->nexttail nor ->donetail can possibly be NULL, so there
664 * is no need for an explicit check.
665 */
666static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
667{
668 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
669 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
670}
671
672/*
673 * Process callbacks for preemptible RCU.
674 */
675static void rcu_preempt_process_callbacks(void)
676{
677 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
678}
679
680/*
681 * Queue a preemptible-RCU callback for invocation after a grace period.
682 */
683void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
684{
685 unsigned long flags;
686
687 debug_rcu_head_queue(head);
688 head->func = func;
689 head->next = NULL;
690
691 local_irq_save(flags);
692 *rcu_preempt_ctrlblk.nexttail = head;
693 rcu_preempt_ctrlblk.nexttail = &head->next;
694 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
695 rcu_preempt_start_gp(); /* checks to see if GP needed. */
696 local_irq_restore(flags);
697}
698EXPORT_SYMBOL_GPL(call_rcu);
699
700void rcu_barrier(void)
701{
702 struct rcu_synchronize rcu;
703
704 init_rcu_head_on_stack(&rcu.head);
705 init_completion(&rcu.completion);
706 /* Will wake me after RCU finished. */
707 call_rcu(&rcu.head, wakeme_after_rcu);
708 /* Wait for it. */
709 wait_for_completion(&rcu.completion);
710 destroy_rcu_head_on_stack(&rcu.head);
711}
712EXPORT_SYMBOL_GPL(rcu_barrier);
713
714/*
715 * synchronize_rcu - wait until a grace period has elapsed.
716 *
717 * Control will return to the caller some time after a full grace
718 * period has elapsed, in other words after all currently executing RCU
719 * read-side critical sections have completed. RCU read-side critical
720 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
721 * and may be nested.
722 */
723void synchronize_rcu(void)
724{
725#ifdef CONFIG_DEBUG_LOCK_ALLOC
726 if (!rcu_scheduler_active)
727 return;
728#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
729
730 WARN_ON_ONCE(rcu_preempt_running_reader());
731 if (!rcu_preempt_blocked_readers_any())
732 return;
733
734 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
735 rcu_barrier();
736}
737EXPORT_SYMBOL_GPL(synchronize_rcu);
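
Both rcu_barrier() and synchronize_rcu() above wait for a grace period by posting a callback (wakeme_after_rcu) that signals an on-stack completion, then sleeping on that completion. The following is a userspace model of that completion pattern built from a mutex and a condition variable; the names mirror the kernel API for readability, but this is only a sketch, not the kernel's struct completion.

/* Userspace model of the completion-based wait used by rcu_barrier(). */
#include <pthread.h>
#include <stdio.h>

struct completion {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
};

static void init_completion(struct completion *c)
{
        pthread_mutex_init(&c->lock, NULL);
        pthread_cond_init(&c->cond, NULL);
        c->done = 0;
}

static void complete(struct completion *c)            /* the callback side */
{
        pthread_mutex_lock(&c->lock);
        c->done = 1;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c) /* the waiting side */
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

static void *callback_thread(void *arg)  /* stands in for the grace-period machinery */
{
        complete(arg);                   /* like wakeme_after_rcu() */
        return NULL;
}

int main(void)
{
        struct completion done;
        pthread_t tid;

        init_completion(&done);
        pthread_create(&tid, NULL, callback_thread, &done);
        wait_for_completion(&done);      /* like rcu_barrier() waiting */
        pthread_join(tid, NULL);
        printf("grace period 'elapsed'\n");
        return 0;
}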
738
739static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
740static unsigned long sync_rcu_preempt_exp_count;
741static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
742
743/*
744 * Return non-zero if there are any tasks in RCU read-side critical
745 * sections blocking the current preemptible-RCU expedited grace period.
746 * If there is no preemptible-RCU expedited grace period currently in
747 * progress, returns zero unconditionally.
748 */
749static int rcu_preempted_readers_exp(void)
750{
751 return rcu_preempt_ctrlblk.exp_tasks != NULL;
752}
753
754/*
755 * Report the exit from RCU read-side critical section for the last task
756 * that queued itself during or before the current expedited preemptible-RCU
757 * grace period.
758 */
759static void rcu_report_exp_done(void)
760{
761 wake_up(&sync_rcu_preempt_exp_wq);
762}
763
764/*
765 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
766 * is to rely on the fact that there is but one CPU, and that it is
767 * illegal for a task to invoke synchronize_rcu_expedited() while in a
768 * preemptible-RCU read-side critical section. Therefore, any such
769 * critical sections must correspond to blocked tasks, which must therefore
770 * be on the ->blkd_tasks list. So just record the current head of the
771 * list in the ->exp_tasks pointer, and wait for all tasks including and
772 * after the task pointed to by ->exp_tasks to drain.
773 */
774void synchronize_rcu_expedited(void)
775{
776 unsigned long flags;
777 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
778 unsigned long snap;
779
780 barrier(); /* ensure prior action seen before grace period. */
781
782 WARN_ON_ONCE(rcu_preempt_running_reader());
783
784 /*
785 * Acquire lock so that there is only one preemptible RCU grace
786 * period in flight. Of course, if someone does the expedited
787 * grace period for us while we are acquiring the lock, just leave.
788 */
789 snap = sync_rcu_preempt_exp_count + 1;
790 mutex_lock(&sync_rcu_preempt_exp_mutex);
791 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
792 goto unlock_mb_ret; /* Others did our work for us. */
793
794 local_irq_save(flags);
795
796 /*
797 * All RCU readers have to already be on blkd_tasks because
798 * we cannot legally be executing in an RCU read-side critical
799 * section.
800 */
801
802 /* Snapshot current head of ->blkd_tasks list. */
803 rpcp->exp_tasks = rpcp->blkd_tasks.next;
804 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
805 rpcp->exp_tasks = NULL;
806
807 /* Wait for tail of ->blkd_tasks list to drain. */
808 if (!rcu_preempted_readers_exp())
809 local_irq_restore(flags);
810 else {
811 rcu_initiate_boost();
812 local_irq_restore(flags);
813 wait_event(sync_rcu_preempt_exp_wq,
814 !rcu_preempted_readers_exp());
815 }
816
817 /* Clean up and exit. */
818 barrier(); /* ensure expedited GP seen before counter increment. */
819 sync_rcu_preempt_exp_count++;
820unlock_mb_ret:
821 mutex_unlock(&sync_rcu_preempt_exp_mutex);
822 barrier(); /* ensure subsequent action seen after grace period. */
823}
824EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
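
synchronize_rcu_expedited() snapshots sync_rcu_preempt_exp_count before taking the mutex and, once the mutex is held, uses a wrap-safe comparison to detect that concurrent callers have already completed an expedited grace period that covers it, so it can return without doing any work. Below is a standalone userspace sketch of that snapshot-and-recheck pattern with hypothetical names; the kernel version adds the memory barriers shown above, which the sketch omits.

/* Userspace sketch of the snapshot/recheck pattern (hypothetical names). */
#include <pthread.h>
#include <limits.h>
#include <stdio.h>

static pthread_mutex_t op_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned long op_count;    /* bumped at the end of each completed operation */

static void serialized_op(void)
{
        unsigned long snap = op_count + 1;   /* sample before taking the lock */

        pthread_mutex_lock(&op_mutex);
        /*
         * Wrap-safe test for snap < op_count: two completions since our
         * snapshot mean at least one whole operation began after it, so
         * that operation already did our work for us.
         */
        if (ULONG_MAX / 2 < snap - op_count) {
                pthread_mutex_unlock(&op_mutex);
                return;
        }
        printf("performing the operation\n");
        op_count++;
        pthread_mutex_unlock(&op_mutex);
}

int main(void)
{
        serialized_op();
        serialized_op();
        return 0;
}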
825
826/*
827 * Does preemptible RCU need the CPU to stay out of dynticks mode?
828 */
829int rcu_preempt_needs_cpu(void)
830{
831 if (!rcu_preempt_running_reader())
832 rcu_preempt_cpu_qs();
833 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
834}
835
836/*
837 * Check for a task exiting while in a preemptible-RCU read-side
838 * critical section, clean up if so. No need to issue warnings,
839 * as debug_check_no_locks_held() already does this if lockdep
840 * is enabled.
841 */
842void exit_rcu(void)
843{
844 struct task_struct *t = current;
845
846 if (t->rcu_read_lock_nesting == 0)
847 return;
848 t->rcu_read_lock_nesting = 1;
849 __rcu_read_unlock();
850}
851
852#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
853
854#ifdef CONFIG_RCU_TRACE
855
856/*
857 * Because preemptible RCU does not exist, it is not necessary to
858 * dump out its statistics.
859 */
860static void show_tiny_preempt_stats(struct seq_file *m)
861{
862}
863
864#endif /* #ifdef CONFIG_RCU_TRACE */
865
866/*
867 * Because preemptible RCU does not exist, it is never necessary to
868 * boost preempted RCU readers.
869 */
870static int rcu_boost(void)
871{
872 return 0;
873}
874
875/*
876 * Because preemptible RCU does not exist, it never has any callbacks
877 * to check.
878 */
879static void rcu_preempt_check_callbacks(void)
880{
881}
882
883/*
884 * Because preemptible RCU does not exist, it never has any callbacks
885 * to remove.
886 */
887static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
888{
889}
890
891/*
892 * Because preemptible RCU does not exist, it never has any callbacks
893 * to process.
894 */
895static void rcu_preempt_process_callbacks(void)
896{
897}
898
899#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
900
901#ifdef CONFIG_DEBUG_LOCK_ALLOC
27#include <linux/kernel_stat.h> 902#include <linux/kernel_stat.h>
28 903
29/* 904/*
30 * During boot, we forgive RCU lockdep issues. After this function is 905 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously. 906 * invoked, we start taking RCU lockdep issues seriously.
32 */ 907 */
33void rcu_scheduler_starting(void) 908void __init rcu_scheduler_starting(void)
34{ 909{
35 WARN_ON(nr_context_switches() > 0); 910 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1; 911 rcu_scheduler_active = 1;
37} 912}
38 913
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 914#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
915
916#ifdef CONFIG_RCU_BOOST
917#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
918#else /* #ifdef CONFIG_RCU_BOOST */
919#define RCU_BOOST_PRIO 1
920#endif /* #else #ifdef CONFIG_RCU_BOOST */
921
922#ifdef CONFIG_RCU_TRACE
923
924#ifdef CONFIG_RCU_BOOST
925
926static void rcu_initiate_boost_trace(void)
927{
928 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
929 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
930 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
931 rcu_preempt_ctrlblk.exp_tasks == NULL)
932 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
933 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
934 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
935 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
936 rcu_preempt_ctrlblk.n_balk_notyet++;
937 else
938 rcu_preempt_ctrlblk.n_balk_nos++;
939}
940
941#endif /* #ifdef CONFIG_RCU_BOOST */
942
943static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
944{
945 unsigned long flags;
946
947 raw_local_irq_save(flags);
948 rcp->qlen -= n;
949 raw_local_irq_restore(flags);
950}
951
952/*
953 * Dump statistics for TINY_RCU, such as they are.
954 */
955static int show_tiny_stats(struct seq_file *m, void *unused)
956{
957 show_tiny_preempt_stats(m);
958 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
959 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
960 return 0;
961}
962
963static int show_tiny_stats_open(struct inode *inode, struct file *file)
964{
965 return single_open(file, show_tiny_stats, NULL);
966}
967
968static const struct file_operations show_tiny_stats_fops = {
969 .owner = THIS_MODULE,
970 .open = show_tiny_stats_open,
971 .read = seq_read,
972 .llseek = seq_lseek,
973 .release = single_release,
974};
975
976static struct dentry *rcudir;
977
978static int __init rcutiny_trace_init(void)
979{
980 struct dentry *retval;
981
982 rcudir = debugfs_create_dir("rcu", NULL);
983 if (!rcudir)
984 goto free_out;
985 retval = debugfs_create_file("rcudata", 0444, rcudir,
986 NULL, &show_tiny_stats_fops);
987 if (!retval)
988 goto free_out;
989 return 0;
990free_out:
991 debugfs_remove_recursive(rcudir);
992 return 1;
993}
994
995static void __exit rcutiny_trace_cleanup(void)
996{
997 debugfs_remove_recursive(rcudir);
998}
999
1000module_init(rcutiny_trace_init);
1001module_exit(rcutiny_trace_cleanup);
1002
1003MODULE_AUTHOR("Paul E. McKenney");
1004MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1005MODULE_LICENSE("GPL");
1006
1007#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e2726d790b9..2e138db03382 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 70static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 71
69module_param(nreaders, int, 0444); 72module_param(nreaders, int, 0444);
@@ -88,6 +91,12 @@ module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444); 92module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
94module_param(test_boost, int, 0444);
95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96module_param(test_boost_interval, int, 0444);
97MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
98module_param(test_boost_duration, int, 0444);
99MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91module_param(torture_type, charp, 0444); 100module_param(torture_type, charp, 0444);
92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 101MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 102
@@ -109,6 +118,7 @@ static struct task_struct *stats_task;
109static struct task_struct *shuffler_task; 118static struct task_struct *shuffler_task;
110static struct task_struct *stutter_task; 119static struct task_struct *stutter_task;
111static struct task_struct *fqs_task; 120static struct task_struct *fqs_task;
121static struct task_struct *boost_tasks[NR_CPUS];
112 122
113#define RCU_TORTURE_PIPE_LEN 10 123#define RCU_TORTURE_PIPE_LEN 10
114 124
@@ -120,8 +130,8 @@ struct rcu_torture {
120}; 130};
121 131
122static LIST_HEAD(rcu_torture_freelist); 132static LIST_HEAD(rcu_torture_freelist);
123static struct rcu_torture *rcu_torture_current; 133static struct rcu_torture __rcu *rcu_torture_current;
124static long rcu_torture_current_version; 134static unsigned long rcu_torture_current_version;
125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
126static DEFINE_SPINLOCK(rcu_torture_lock); 136static DEFINE_SPINLOCK(rcu_torture_lock);
127static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -134,6 +144,10 @@ static atomic_t n_rcu_torture_alloc_fail;
134static atomic_t n_rcu_torture_free; 144static atomic_t n_rcu_torture_free;
135static atomic_t n_rcu_torture_mberror; 145static atomic_t n_rcu_torture_mberror;
136static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
147static long n_rcu_torture_boost_ktrerror;
148static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_failure;
150static long n_rcu_torture_boosts;
137static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
138static struct list_head rcu_torture_removed; 152static struct list_head rcu_torture_removed;
139static cpumask_var_t shuffle_tmp_mask; 153static cpumask_var_t shuffle_tmp_mask;
@@ -147,14 +161,26 @@ static int stutter_pause_test;
147#endif 161#endif
148int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 163
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1
166#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
167#define rcu_can_boost() 0
168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169
170static unsigned long boost_starttime; /* jiffies of next boost test start. */
171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */
173
150/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 174/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 175
152#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 176#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ 177#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ 178#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
155static int fullstop = FULLSTOP_RMMOD; 179static int fullstop = FULLSTOP_RMMOD;
156DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ 180/*
157 /* of kthreads. */ 181 * Protect fullstop transitions and spawning of kthreads.
182 */
183static DEFINE_MUTEX(fullstop_mutex);
158 184
159/* 185/*
160 * Detect and respond to a system shutdown. 186 * Detect and respond to a system shutdown.
@@ -275,6 +301,7 @@ struct rcu_torture_ops {
275 void (*fqs)(void); 301 void (*fqs)(void);
276 int (*stats)(char *page); 302 int (*stats)(char *page);
277 int irq_capable; 303 int irq_capable;
304 int can_boost;
278 char *name; 305 char *name;
279}; 306};
280 307
@@ -303,6 +330,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
303 mdelay(longdelay_ms); 330 mdelay(longdelay_ms);
304 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 331 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
305 udelay(shortdelay_us); 332 udelay(shortdelay_us);
333#ifdef CONFIG_PREEMPT
334 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
335 preempt_schedule(); /* No QS if preempt_disable() in effect */
336#endif
306} 337}
307 338
308static void rcu_torture_read_unlock(int idx) __releases(RCU) 339static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -360,6 +391,7 @@ static struct rcu_torture_ops rcu_ops = {
360 .fqs = rcu_force_quiescent_state, 391 .fqs = rcu_force_quiescent_state,
361 .stats = NULL, 392 .stats = NULL,
362 .irq_capable = 1, 393 .irq_capable = 1,
394 .can_boost = rcu_can_boost(),
363 .name = "rcu" 395 .name = "rcu"
364}; 396};
365 397
@@ -402,6 +434,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
402 .fqs = rcu_force_quiescent_state, 434 .fqs = rcu_force_quiescent_state,
403 .stats = NULL, 435 .stats = NULL,
404 .irq_capable = 1, 436 .irq_capable = 1,
437 .can_boost = rcu_can_boost(),
405 .name = "rcu_sync" 438 .name = "rcu_sync"
406}; 439};
407 440
@@ -418,6 +451,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
418 .fqs = rcu_force_quiescent_state, 451 .fqs = rcu_force_quiescent_state,
419 .stats = NULL, 452 .stats = NULL,
420 .irq_capable = 1, 453 .irq_capable = 1,
454 .can_boost = rcu_can_boost(),
421 .name = "rcu_expedited" 455 .name = "rcu_expedited"
422}; 456};
423 457
@@ -536,6 +570,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
536 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 570 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
537 if (!delay) 571 if (!delay)
538 schedule_timeout_interruptible(longdelay); 572 schedule_timeout_interruptible(longdelay);
573 else
574 rcu_read_delay(rrsp);
539} 575}
540 576
541static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) 577static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -676,6 +712,112 @@ static struct rcu_torture_ops sched_expedited_ops = {
676}; 712};
677 713
678/* 714/*
715 * RCU torture priority-boost testing. Runs one real-time thread per
716 * CPU for moderate bursts, repeatedly registering RCU callbacks and
717 * spinning waiting for them to be invoked. If a given callback takes
718 * too long to be invoked, we assume that priority inversion has occurred.
719 */
720
721struct rcu_boost_inflight {
722 struct rcu_head rcu;
723 int inflight;
724};
725
726static void rcu_torture_boost_cb(struct rcu_head *head)
727{
728 struct rcu_boost_inflight *rbip =
729 container_of(head, struct rcu_boost_inflight, rcu);
730
731 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
732 rbip->inflight = 0;
733}
734
735static int rcu_torture_boost(void *arg)
736{
737 unsigned long call_rcu_time;
738 unsigned long endtime;
739 unsigned long oldstarttime;
740 struct rcu_boost_inflight rbi = { .inflight = 0 };
741 struct sched_param sp;
742
743 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
744
745 /* Set real-time priority. */
746 sp.sched_priority = 1;
747 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
748 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
749 n_rcu_torture_boost_rterror++;
750 }
751
752 init_rcu_head_on_stack(&rbi.rcu);
753 /* Each pass through the following loop does one boost-test cycle. */
754 do {
755 /* Wait for the next test interval. */
756 oldstarttime = boost_starttime;
757 while (jiffies - oldstarttime > ULONG_MAX / 2) {
758 schedule_timeout_uninterruptible(1);
759 rcu_stutter_wait("rcu_torture_boost");
760 if (kthread_should_stop() ||
761 fullstop != FULLSTOP_DONTSTOP)
762 goto checkwait;
763 }
764
765 /* Do one boost-test interval. */
766 endtime = oldstarttime + test_boost_duration * HZ;
767 call_rcu_time = jiffies;
768 while (jiffies - endtime > ULONG_MAX / 2) {
769 /* If we don't have a callback in flight, post one. */
770 if (!rbi.inflight) {
771 smp_mb(); /* RCU core before ->inflight = 1. */
772 rbi.inflight = 1;
773 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
774 if (jiffies - call_rcu_time >
775 test_boost_duration * HZ - HZ / 2) {
776 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
777 n_rcu_torture_boost_failure++;
778 }
779 call_rcu_time = jiffies;
780 }
781 cond_resched();
782 rcu_stutter_wait("rcu_torture_boost");
783 if (kthread_should_stop() ||
784 fullstop != FULLSTOP_DONTSTOP)
785 goto checkwait;
786 }
787
788 /*
789 * Set the start time of the next test interval.
790 * Yes, this is vulnerable to long delays, but such
791 * delays simply cause a false negative for the next
792 * interval. Besides, we are running at RT priority,
793 * so delays should be relatively rare.
794 */
795 while (oldstarttime == boost_starttime) {
796 if (mutex_trylock(&boost_mutex)) {
797 boost_starttime = jiffies +
798 test_boost_interval * HZ;
799 n_rcu_torture_boosts++;
800 mutex_unlock(&boost_mutex);
801 break;
802 }
803 schedule_timeout_uninterruptible(1);
804 }
805
806 /* Go do the stutter. */
807checkwait: rcu_stutter_wait("rcu_torture_boost");
808 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
809
810 /* Clean up and exit. */
811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1);
816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
817 return 0;
818}
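
The wait loops in rcu_torture_boost() use expressions such as jiffies - oldstarttime > ULONG_MAX / 2 as a wraparound-safe "time is still before the target" test: unsigned subtraction yields a huge value exactly when the counter has not yet reached the target, even if it wraps past zero on the way. A tiny standalone illustration with made-up counter values, not kernel code:

/* Demonstration of the wrap-safe "before the deadline" test used above. */
#include <limits.h>
#include <stdio.h>

static int before(unsigned long now, unsigned long deadline)
{
        /* True while 'now' has not yet reached 'deadline', even across wrap. */
        return now - deadline > ULONG_MAX / 2;
}

int main(void)
{
        unsigned long deadline = 5;                       /* counter wrapped past 0 */

        printf("%d\n", before(ULONG_MAX - 2, deadline));  /* 1: still before */
        printf("%d\n", before(7, deadline));              /* 0: deadline passed */
        return 0;
}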
819
820/*
679 * RCU torture force-quiescent-state kthread. Repeatedly induces 821 * RCU torture force-quiescent-state kthread. Repeatedly induces
680 * bursts of calls to force_quiescent_state(), increasing the probability 822 * bursts of calls to force_quiescent_state(), increasing the probability
681 * of occurrence of some important types of race conditions. 823 * of occurrence of some important types of race conditions.
@@ -731,7 +873,8 @@ rcu_torture_writer(void *arg)
731 continue; 873 continue;
732 rp->rtort_pipe_count = 0; 874 rp->rtort_pipe_count = 0;
733 udelay(rcu_random(&rand) & 0x3ff); 875 udelay(rcu_random(&rand) & 0x3ff);
734 old_rp = rcu_torture_current; 876 old_rp = rcu_dereference_check(rcu_torture_current,
877 current == writer_task);
735 rp->rtort_mbtest = 1; 878 rp->rtort_mbtest = 1;
736 rcu_assign_pointer(rcu_torture_current, rp); 879 rcu_assign_pointer(rcu_torture_current, rp);
737 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ 880 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
@@ -743,7 +886,7 @@ rcu_torture_writer(void *arg)
743 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
744 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
745 } 888 }
746 rcu_torture_current_version++; 889 rcutorture_record_progress(++rcu_torture_current_version);
747 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
748 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
749 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -923,8 +1066,9 @@ rcu_torture_printk(char *page)
923 } 1066 }
924 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
925 cnt += sprintf(&page[cnt], 1068 cnt += sprintf(&page[cnt],
926 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1069 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
927 "rtmbe: %d nt: %ld", 1070 "rtmbe: %d rtbke: %ld rtbre: %ld "
1071 "rtbf: %ld rtb: %ld nt: %ld",
928 rcu_torture_current, 1072 rcu_torture_current,
929 rcu_torture_current_version, 1073 rcu_torture_current_version,
930 list_empty(&rcu_torture_freelist), 1074 list_empty(&rcu_torture_freelist),
@@ -932,8 +1076,15 @@ rcu_torture_printk(char *page)
932 atomic_read(&n_rcu_torture_alloc_fail), 1076 atomic_read(&n_rcu_torture_alloc_fail),
933 atomic_read(&n_rcu_torture_free), 1077 atomic_read(&n_rcu_torture_free),
934 atomic_read(&n_rcu_torture_mberror), 1078 atomic_read(&n_rcu_torture_mberror),
1079 n_rcu_torture_boost_ktrerror,
1080 n_rcu_torture_boost_rterror,
1081 n_rcu_torture_boost_failure,
1082 n_rcu_torture_boosts,
935 n_rcu_torture_timers); 1083 n_rcu_torture_timers);
936 if (atomic_read(&n_rcu_torture_mberror) != 0) 1084 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1085 n_rcu_torture_boost_ktrerror != 0 ||
1086 n_rcu_torture_boost_rterror != 0 ||
1087 n_rcu_torture_boost_failure != 0)
937 cnt += sprintf(&page[cnt], " !!!"); 1088 cnt += sprintf(&page[cnt], " !!!");
938 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1089 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
939 if (i > 1) { 1090 if (i > 1) {
@@ -1085,28 +1236,98 @@ rcu_torture_stutter(void *arg)
1085} 1236}
1086 1237
1087static inline void 1238static inline void
1088rcu_torture_print_module_parms(char *tag) 1239rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1089{ 1240{
1090 printk(KERN_ALERT "%s" TORTURE_FLAG 1241 printk(KERN_ALERT "%s" TORTURE_FLAG
1091 "--- %s: nreaders=%d nfakewriters=%d " 1242 "--- %s: nreaders=%d nfakewriters=%d "
1092 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1243 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1093 "shuffle_interval=%d stutter=%d irqreader=%d " 1244 "shuffle_interval=%d stutter=%d irqreader=%d "
1094 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", 1245 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1246 "test_boost=%d/%d test_boost_interval=%d "
1247 "test_boost_duration=%d\n",
1095 torture_type, tag, nrealreaders, nfakewriters, 1248 torture_type, tag, nrealreaders, nfakewriters,
1096 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1249 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1097 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); 1250 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1251 test_boost, cur_ops->can_boost,
1252 test_boost_interval, test_boost_duration);
1098} 1253}
1099 1254
1100static struct notifier_block rcutorture_nb = { 1255static struct notifier_block rcutorture_shutdown_nb = {
1101 .notifier_call = rcutorture_shutdown_notify, 1256 .notifier_call = rcutorture_shutdown_notify,
1102}; 1257};
1103 1258
1259static void rcutorture_booster_cleanup(int cpu)
1260{
1261 struct task_struct *t;
1262
1263 if (boost_tasks[cpu] == NULL)
1264 return;
1265 mutex_lock(&boost_mutex);
1266 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1267 t = boost_tasks[cpu];
1268 boost_tasks[cpu] = NULL;
1269 mutex_unlock(&boost_mutex);
1270
1271 /* This must be outside of the mutex, otherwise deadlock! */
1272 kthread_stop(t);
1273}
1274
1275static int rcutorture_booster_init(int cpu)
1276{
1277 int retval;
1278
1279 if (boost_tasks[cpu] != NULL)
1280 return 0; /* Already created, nothing more to do. */
1281
1282 /* Don't allow time recalculation while creating a new task. */
1283 mutex_lock(&boost_mutex);
1284 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1285 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1286 "rcu_torture_boost");
1287 if (IS_ERR(boost_tasks[cpu])) {
1288 retval = PTR_ERR(boost_tasks[cpu]);
1289 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1290 n_rcu_torture_boost_ktrerror++;
1291 boost_tasks[cpu] = NULL;
1292 mutex_unlock(&boost_mutex);
1293 return retval;
1294 }
1295 kthread_bind(boost_tasks[cpu], cpu);
1296 wake_up_process(boost_tasks[cpu]);
1297 mutex_unlock(&boost_mutex);
1298 return 0;
1299}
1300
1301static int rcutorture_cpu_notify(struct notifier_block *self,
1302 unsigned long action, void *hcpu)
1303{
1304 long cpu = (long)hcpu;
1305
1306 switch (action) {
1307 case CPU_ONLINE:
1308 case CPU_DOWN_FAILED:
1309 (void)rcutorture_booster_init(cpu);
1310 break;
1311 case CPU_DOWN_PREPARE:
1312 rcutorture_booster_cleanup(cpu);
1313 break;
1314 default:
1315 break;
1316 }
1317 return NOTIFY_OK;
1318}
1319
1320static struct notifier_block rcutorture_cpu_nb = {
1321 .notifier_call = rcutorture_cpu_notify,
1322};
1323
1104static void 1324static void
1105rcu_torture_cleanup(void) 1325rcu_torture_cleanup(void)
1106{ 1326{
1107 int i; 1327 int i;
1108 1328
1109 mutex_lock(&fullstop_mutex); 1329 mutex_lock(&fullstop_mutex);
1330 rcutorture_record_test_transition();
1110 if (fullstop == FULLSTOP_SHUTDOWN) { 1331 if (fullstop == FULLSTOP_SHUTDOWN) {
1111 printk(KERN_WARNING /* but going down anyway, so... */ 1332 printk(KERN_WARNING /* but going down anyway, so... */
1112 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1333 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1118,7 +1339,7 @@ rcu_torture_cleanup(void)
1118 } 1339 }
1119 fullstop = FULLSTOP_RMMOD; 1340 fullstop = FULLSTOP_RMMOD;
1120 mutex_unlock(&fullstop_mutex); 1341 mutex_unlock(&fullstop_mutex);
1121 unregister_reboot_notifier(&rcutorture_nb); 1342 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1122 if (stutter_task) { 1343 if (stutter_task) {
1123 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1344 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1124 kthread_stop(stutter_task); 1345 kthread_stop(stutter_task);
@@ -1175,6 +1396,12 @@ rcu_torture_cleanup(void)
1175 kthread_stop(fqs_task); 1396 kthread_stop(fqs_task);
1176 } 1397 }
1177 fqs_task = NULL; 1398 fqs_task = NULL;
1399 if ((test_boost == 1 && cur_ops->can_boost) ||
1400 test_boost == 2) {
1401 unregister_cpu_notifier(&rcutorture_cpu_nb);
1402 for_each_possible_cpu(i)
1403 rcutorture_booster_cleanup(i);
1404 }
1178 1405
1179 /* Wait for all RCU callbacks to fire. */ 1406 /* Wait for all RCU callbacks to fire. */
1180 1407
@@ -1186,9 +1413,9 @@ rcu_torture_cleanup(void)
1186 if (cur_ops->cleanup) 1413 if (cur_ops->cleanup)
1187 cur_ops->cleanup(); 1414 cur_ops->cleanup();
1188 if (atomic_read(&n_rcu_torture_error)) 1415 if (atomic_read(&n_rcu_torture_error))
1189 rcu_torture_print_module_parms("End of test: FAILURE"); 1416 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1190 else 1417 else
1191 rcu_torture_print_module_parms("End of test: SUCCESS"); 1418 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1192} 1419}
1193 1420
1194static int __init 1421static int __init
@@ -1233,7 +1460,7 @@ rcu_torture_init(void)
1233 nrealreaders = nreaders; 1460 nrealreaders = nreaders;
1234 else 1461 else
1235 nrealreaders = 2 * num_online_cpus(); 1462 nrealreaders = 2 * num_online_cpus();
1236 rcu_torture_print_module_parms("Start of test"); 1463 rcu_torture_print_module_parms(cur_ops, "Start of test");
1237 fullstop = FULLSTOP_DONTSTOP; 1464 fullstop = FULLSTOP_DONTSTOP;
1238 1465
1239 /* Set up the freelist. */ 1466 /* Set up the freelist. */
@@ -1254,6 +1481,10 @@ rcu_torture_init(void)
1254 atomic_set(&n_rcu_torture_free, 0); 1481 atomic_set(&n_rcu_torture_free, 0);
1255 atomic_set(&n_rcu_torture_mberror, 0); 1482 atomic_set(&n_rcu_torture_mberror, 0);
1256 atomic_set(&n_rcu_torture_error, 0); 1483 atomic_set(&n_rcu_torture_error, 0);
1484 n_rcu_torture_boost_ktrerror = 0;
1485 n_rcu_torture_boost_rterror = 0;
1486 n_rcu_torture_boost_failure = 0;
1487 n_rcu_torture_boosts = 0;
1257 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1488 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1258 atomic_set(&rcu_torture_wcount[i], 0); 1489 atomic_set(&rcu_torture_wcount[i], 0);
1259 for_each_possible_cpu(cpu) { 1490 for_each_possible_cpu(cpu) {
@@ -1367,7 +1598,28 @@ rcu_torture_init(void)
1367 goto unwind; 1598 goto unwind;
1368 } 1599 }
1369 } 1600 }
1370 register_reboot_notifier(&rcutorture_nb); 1601 if (test_boost_interval < 1)
1602 test_boost_interval = 1;
1603 if (test_boost_duration < 2)
1604 test_boost_duration = 2;
1605 if ((test_boost == 1 && cur_ops->can_boost) ||
1606 test_boost == 2) {
1607 int retval;
1608
1609 boost_starttime = jiffies + test_boost_interval * HZ;
1610 register_cpu_notifier(&rcutorture_cpu_nb);
1611 for_each_possible_cpu(i) {
1612 if (cpu_is_offline(i))
1613 continue; /* Heuristic: CPU can go offline. */
1614 retval = rcutorture_booster_init(i);
1615 if (retval < 0) {
1616 firsterr = retval;
1617 goto unwind;
1618 }
1619 }
1620 }
1621 register_reboot_notifier(&rcutorture_shutdown_nb);
1622 rcutorture_record_test_transition();
1371 mutex_unlock(&fullstop_mutex); 1623 mutex_unlock(&fullstop_mutex);
1372 return 0; 1624 return 0;
1373 1625
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5bc43976c5a..ba06207b1dd3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <asm/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
@@ -47,6 +47,9 @@
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h> 49#include <linux/kernel_stat.h>
50#include <linux/wait.h>
51#include <linux/kthread.h>
52#include <linux/prefetch.h>
50 53
51#include "rcutree.h" 54#include "rcutree.h"
52 55
@@ -67,9 +70,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
67 .gpnum = -300, \ 70 .gpnum = -300, \
68 .completed = -300, \ 71 .completed = -300, \
69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 72 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
@@ -82,10 +82,67 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
84 84
85static struct rcu_state *rcu_state;
86
87/*
88 * The rcu_scheduler_active variable transitions from zero to one just
89 * before the first task is spawned. So when this variable is zero, RCU
90 * can assume that there is but one task, allowing RCU to (for example)
91 * optimized synchronize_sched() to a simple barrier(). When this variable
92 * is one, RCU must actually do all the hard work required to detect real
93 * grace periods. This variable is also used to suppress boot-time false
94 * positives from lockdep-RCU error checking.
95 */
85int rcu_scheduler_active __read_mostly; 96int rcu_scheduler_active __read_mostly;
86EXPORT_SYMBOL_GPL(rcu_scheduler_active); 97EXPORT_SYMBOL_GPL(rcu_scheduler_active);
87 98
88/* 99/*
100 * The rcu_scheduler_fully_active variable transitions from zero to one
101 * during the early_initcall() processing, which is after the scheduler
102 * is capable of creating new tasks. So RCU processing (for example,
103 * creating tasks for RCU priority boosting) must be delayed until after
104 * rcu_scheduler_fully_active transitions from zero to one. We also
105 * currently delay invocation of any RCU callbacks until after this point.
106 *
107 * It might later prove better for people registering RCU callbacks during
108 * early boot to take responsibility for these callbacks, but one step at
109 * a time.
110 */
111static int rcu_scheduler_fully_active __read_mostly;
112
113#ifdef CONFIG_RCU_BOOST
114
115/*
116 * Control variables for per-CPU and per-rcu_node kthreads. These
117 * handle all flavors of RCU.
118 */
119static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
120DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
121DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
122DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
123DEFINE_PER_CPU(char, rcu_cpu_has_work);
124
125#endif /* #ifdef CONFIG_RCU_BOOST */
126
127static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
128static void invoke_rcu_core(void);
129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
130
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
133/*
134 * Track the rcutorture test sequence number and the update version
135 * number within a given test. The rcutorture_testseq is incremented
136 * on every rcutorture module load and unload, so has an odd value
137 * when a test is running. The rcutorture_vernum is set to zero
138 * when rcutorture starts and is incremented on each rcutorture update.
139 * These variables enable correlating rcutorture output with the
140 * RCU tracing information.
141 */
142unsigned long rcutorture_testseq;
143unsigned long rcutorture_vernum;
144
145/*
89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 146 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
90 * permit this function to be invoked without holding the root rcu_node 147 * permit this function to be invoked without holding the root rcu_node
91 * structure's ->lock, but of course results can be subject to change. 148 * structure's ->lock, but of course results can be subject to change.
@@ -127,11 +184,12 @@ void rcu_note_context_switch(int cpu)
127 rcu_sched_qs(cpu); 184 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu); 185 rcu_preempt_note_context_switch(cpu);
129} 186}
187EXPORT_SYMBOL_GPL(rcu_note_context_switch);
130 188
131#ifdef CONFIG_NO_HZ 189#ifdef CONFIG_NO_HZ
132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 190DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
133 .dynticks_nesting = 1, 191 .dynticks_nesting = 1,
134 .dynticks = 1, 192 .dynticks = ATOMIC_INIT(1),
135}; 193};
136#endif /* #ifdef CONFIG_NO_HZ */ 194#endif /* #ifdef CONFIG_NO_HZ */
137 195
@@ -143,6 +201,9 @@ module_param(blimit, int, 0);
143module_param(qhimark, int, 0); 201module_param(qhimark, int, 0);
144module_param(qlowmark, int, 0); 202module_param(qlowmark, int, 0);
145 203
204int rcu_cpu_stall_suppress __read_mostly;
205module_param(rcu_cpu_stall_suppress, int, 0644);
206
146static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 207static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
147static int rcu_pending(int cpu); 208static int rcu_pending(int cpu);
148 209
@@ -174,6 +235,31 @@ void rcu_bh_force_quiescent_state(void)
174EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 235EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
175 236
176/* 237/*
238 * Record the number of times rcutorture tests have been initiated and
239 * terminated. This information allows the debugfs tracing stats to be
240 * correlated to the rcutorture messages, even when the rcutorture module
241 * is being repeatedly loaded and unloaded. In other words, we cannot
242 * store this state in rcutorture itself.
243 */
244void rcutorture_record_test_transition(void)
245{
246 rcutorture_testseq++;
247 rcutorture_vernum = 0;
248}
249EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
250
251/*
252 * Record the number of writer passes through the current rcutorture test.
253 * This is also used to correlate debugfs tracing stats with the rcutorture
254 * messages.
255 */
256void rcutorture_record_progress(unsigned long vernum)
257{
258 rcutorture_vernum++;
259}
260EXPORT_SYMBOL_GPL(rcutorture_record_progress);
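
The two counters above rely on a simple invariant: the test sequence number is bumped on every rcutorture load and every unload, so it is odd exactly while a test is running, and the version number restarts at zero for each test. A tiny standalone sketch of that invariant (names are illustrative):

#include <stdio.h>

static unsigned long testseq, vernum;

static void test_transition(void) { testseq++; vernum = 0; }   /* load or unload   */
static void test_progress(void)   { vernum++; }                /* one writer pass  */

int main(void)
{
        test_transition();                       /* modprobe rcutorture */
        test_progress();
        test_progress();
        printf("seq=%lu (odd: test running), ver=%lu\n", testseq, vernum);
        test_transition();                       /* rmmod rcutorture */
        printf("seq=%lu (even: no test running)\n", testseq);
        return 0;
}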
261
262/*
177 * Force a quiescent state for RCU-sched. 263 * Force a quiescent state for RCU-sched.
178 */ 264 */
179void rcu_sched_force_quiescent_state(void) 265void rcu_sched_force_quiescent_state(void)
@@ -232,8 +318,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
232 return 1; 318 return 1;
233 } 319 }
234 320
235 /* If preemptable RCU, no point in sending reschedule IPI. */ 321 /* If preemptible RCU, no point in sending reschedule IPI. */
236 if (rdp->preemptable) 322 if (rdp->preemptible)
237 return 0; 323 return 0;
238 324
239 /* The CPU is online, so send it a reschedule IPI. */ 325 /* The CPU is online, so send it a reschedule IPI. */
@@ -262,13 +348,25 @@ void rcu_enter_nohz(void)
262 unsigned long flags; 348 unsigned long flags;
263 struct rcu_dynticks *rdtp; 349 struct rcu_dynticks *rdtp;
264 350
265 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
266 local_irq_save(flags); 351 local_irq_save(flags);
267 rdtp = &__get_cpu_var(rcu_dynticks); 352 rdtp = &__get_cpu_var(rcu_dynticks);
268 rdtp->dynticks++; 353 if (--rdtp->dynticks_nesting) {
269 rdtp->dynticks_nesting--; 354 local_irq_restore(flags);
270 WARN_ON_ONCE(rdtp->dynticks & 0x1); 355 return;
356 }
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
271 local_irq_restore(flags); 362 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
272} 370}
273 371
274/* 372/*
@@ -284,11 +382,16 @@ void rcu_exit_nohz(void)
284 382
285 local_irq_save(flags); 383 local_irq_save(flags);
286 rdtp = &__get_cpu_var(rcu_dynticks); 384 rdtp = &__get_cpu_var(rcu_dynticks);
287 rdtp->dynticks++; 385 if (rdtp->dynticks_nesting++) {
288 rdtp->dynticks_nesting++; 386 local_irq_restore(flags);
289 WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); 387 return;
388 }
389 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
390 atomic_inc(&rdtp->dynticks);
391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
392 smp_mb__after_atomic_inc(); /* See above. */
393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
290 local_irq_restore(flags); 394 local_irq_restore(flags);
291 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
292} 395}
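
The rewrite above collapses the per-CPU dyntick state to a single atomic counter that is even while the CPU is dyntick-idle and odd otherwise, with a plain nesting count so that only the outermost enter/exit flips the counter. Below is a minimal standalone model of that protocol; it is single-threaded, so the memory-ordering barriers the kernel code pairs with the increments are deliberately omitted.

/*
 * Standalone model of the even/odd dynticks protocol: one atomic
 * counter (even = idle, odd = not idle) guarded by a plain nesting
 * count so only the outermost transition touches the atomic.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

struct dynticks {
        int nesting;            /* irq/process nesting, plain int       */
        atomic_long counter;    /* even while idle, odd otherwise       */
};

static struct dynticks dt = { .nesting = 1, .counter = 1 };  /* boots non-idle */

static void enter_idle(struct dynticks *d)
{
        if (--d->nesting)
                return;                          /* still nested: stay non-idle */
        atomic_fetch_add(&d->counter, 1);        /* odd -> even                 */
        assert((atomic_load(&d->counter) & 0x1) == 0);
}

static void exit_idle(struct dynticks *d)
{
        if (d->nesting++)
                return;                          /* was already non-idle        */
        atomic_fetch_add(&d->counter, 1);        /* even -> odd                 */
        assert(atomic_load(&d->counter) & 0x1);
}

int main(void)
{
        enter_idle(&dt);                         /* CPU goes to sleep    */
        exit_idle(&dt);                          /* interrupt arrives    */
        enter_idle(&dt);                         /* interrupt returns    */
        printf("counter=%ld nesting=%d\n",
               (long)atomic_load(&dt.counter), dt.nesting);
        return 0;
}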
293 396
294/** 397/**
@@ -302,11 +405,15 @@ void rcu_nmi_enter(void)
302{ 405{
303 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 406 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
304 407
305 if (rdtp->dynticks & 0x1) 408 if (rdtp->dynticks_nmi_nesting == 0 &&
409 (atomic_read(&rdtp->dynticks) & 0x1))
306 return; 410 return;
307 rdtp->dynticks_nmi++; 411 rdtp->dynticks_nmi_nesting++;
308 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); 412 smp_mb__before_atomic_inc(); /* Force delay from prior write. */
309 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 413 atomic_inc(&rdtp->dynticks);
414 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
415 smp_mb__after_atomic_inc(); /* See above. */
416 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
310} 417}
311 418
312/** 419/**
@@ -320,11 +427,14 @@ void rcu_nmi_exit(void)
320{ 427{
321 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 428 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
322 429
323 if (rdtp->dynticks & 0x1) 430 if (rdtp->dynticks_nmi_nesting == 0 ||
431 --rdtp->dynticks_nmi_nesting != 0)
324 return; 432 return;
325 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 433 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
326 rdtp->dynticks_nmi++; 434 smp_mb__before_atomic_inc(); /* See above. */
327 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); 435 atomic_inc(&rdtp->dynticks);
436 smp_mb__after_atomic_inc(); /* Force delay to next write. */
437 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
328} 438}
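
The NMI handlers only have work to do when an NMI lands on a CPU that is actually idle; if the counter is already odd the NMI simply rides on the existing non-idle state, and the separate NMI nesting count ensures that only the NMI entry that flipped the counter flips it back. A standalone sketch of those two rules (ordering barriers again omitted):

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_long dynticks = 2;        /* even: this CPU is dyntick-idle  */
static int nmi_nesting;

static void nmi_enter(void)
{
        /* If the CPU is already counted as non-idle, nothing to record. */
        if (nmi_nesting == 0 && (atomic_load(&dynticks) & 0x1))
                return;
        nmi_nesting++;
        atomic_fetch_add(&dynticks, 1);          /* even -> odd             */
        assert(atomic_load(&dynticks) & 0x1);
}

static void nmi_exit(void)
{
        /* Undo only what the matching nmi_enter() actually did. */
        if (nmi_nesting == 0 || --nmi_nesting != 0)
                return;
        atomic_fetch_add(&dynticks, 1);          /* odd -> even             */
        assert((atomic_load(&dynticks) & 0x1) == 0);
}

int main(void)
{
        nmi_enter();            /* NMI interrupts an idle CPU: counter goes odd */
        nmi_exit();             /* back to idle: counter even again             */

        atomic_fetch_add(&dynticks, 1);          /* pretend CPU is now non-idle */
        nmi_enter();            /* no-op: already counted as non-idle           */
        nmi_exit();             /* no-op too, nesting never left zero           */
        printf("dynticks=%ld nmi_nesting=%d\n",
               (long)atomic_load(&dynticks), nmi_nesting);
        return 0;
}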
329 439
330/** 440/**
@@ -335,13 +445,7 @@ void rcu_nmi_exit(void)
335 */ 445 */
336void rcu_irq_enter(void) 446void rcu_irq_enter(void)
337{ 447{
338 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 448 rcu_exit_nohz();
339
340 if (rdtp->dynticks_nesting++)
341 return;
342 rdtp->dynticks++;
343 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
344 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
345} 449}
346 450
347/** 451/**
@@ -353,18 +457,7 @@ void rcu_irq_enter(void)
353 */ 457 */
354void rcu_irq_exit(void) 458void rcu_irq_exit(void)
355{ 459{
356 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 460 rcu_enter_nohz();
357
358 if (--rdtp->dynticks_nesting)
359 return;
360 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
361 rdtp->dynticks++;
362 WARN_ON_ONCE(rdtp->dynticks & 0x1);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (__get_cpu_var(rcu_sched_data).nxtlist ||
366 __get_cpu_var(rcu_bh_data).nxtlist)
367 set_need_resched();
368} 461}
369 462
370#ifdef CONFIG_SMP 463#ifdef CONFIG_SMP
@@ -376,19 +469,8 @@ void rcu_irq_exit(void)
376 */ 469 */
377static int dyntick_save_progress_counter(struct rcu_data *rdp) 470static int dyntick_save_progress_counter(struct rcu_data *rdp)
378{ 471{
379 int ret; 472 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
380 int snap; 473 return 0;
381 int snap_nmi;
382
383 snap = rdp->dynticks->dynticks;
384 snap_nmi = rdp->dynticks->dynticks_nmi;
385 smp_mb(); /* Order sampling of snap with end of grace period. */
386 rdp->dynticks_snap = snap;
387 rdp->dynticks_nmi_snap = snap_nmi;
388 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
389 if (ret)
390 rdp->dynticks_fqs++;
391 return ret;
392} 474}
393 475
394/* 476/*
@@ -399,16 +481,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
399 */ 481 */
400static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
401{ 483{
402 long curr; 484 unsigned long curr;
403 long curr_nmi; 485 unsigned long snap;
404 long snap;
405 long snap_nmi;
406 486
407 curr = rdp->dynticks->dynticks; 487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
408 snap = rdp->dynticks_snap; 488 snap = (unsigned long)rdp->dynticks_snap;
409 curr_nmi = rdp->dynticks->dynticks_nmi;
410 snap_nmi = rdp->dynticks_nmi_snap;
411 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
412 489
413 /* 490 /*
414 * If the CPU passed through or entered a dynticks idle phase with 491 * If the CPU passed through or entered a dynticks idle phase with
@@ -418,8 +495,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
418 * read-side critical section that started before the beginning 495 * read-side critical section that started before the beginning
419 * of the current RCU grace period. 496 * of the current RCU grace period.
420 */ 497 */
421 if ((curr != snap || (curr & 0x1) == 0) && 498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
422 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
423 rdp->dynticks_fqs++; 499 rdp->dynticks_fqs++;
424 return 1; 500 return 1;
425 } 501 }
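
With the atomic counter, force_quiescent_state() needs only one snapshot per CPU: the CPU has passed through a quiescent state if its counter is now even (it is idle right now) or has advanced by at least two since the snapshot (it went idle at some point in between). A sketch of that check follows, assuming ULONG_CMP_GE() is the usual wraparound-tolerant comparison on unsigned longs.

#include <limits.h>
#include <stdio.h>

/* Wrap-tolerant "a >= b" on free-running counters (models ULONG_CMP_GE). */
#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))

static int dyntick_qs(unsigned long snap, unsigned long curr)
{
        /*
         * Quiescent if the CPU is idle right now (counter even) or has
         * taken at least two counter steps since the snapshot, meaning
         * it passed through idle in the meantime.
         */
        return (curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2);
}

int main(void)
{
        printf("%d\n", dyntick_qs(5, 5));  /* busy, unchanged: 0, keep waiting */
        printf("%d\n", dyntick_qs(5, 6));  /* idle now:        1               */
        printf("%d\n", dyntick_qs(5, 7));  /* idled and woke:  1               */
        return 0;
}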
@@ -448,9 +524,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
448 524
449#endif /* #else #ifdef CONFIG_NO_HZ */ 525#endif /* #else #ifdef CONFIG_NO_HZ */
450 526
451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 527int rcu_cpu_stall_suppress __read_mostly;
452
453int rcu_cpu_stall_panicking __read_mostly;
454 528
455static void record_gp_stall_check_time(struct rcu_state *rsp) 529static void record_gp_stall_check_time(struct rcu_state *rsp)
456{ 530{
@@ -482,8 +556,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
482 rcu_print_task_stall(rnp); 556 rcu_print_task_stall(rnp);
483 raw_spin_unlock_irqrestore(&rnp->lock, flags); 557 raw_spin_unlock_irqrestore(&rnp->lock, flags);
484 558
485 /* OK, time to rat on our buddy... */ 559 /*
486 560 * OK, time to rat on our buddy...
561 * See Documentation/RCU/stallwarn.txt for info on how to debug
562 * RCU CPU stall warnings.
563 */
487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 564 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name); 565 rsp->name);
489 rcu_for_each_leaf_node(rsp, rnp) { 566 rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +589,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
512 unsigned long flags; 589 unsigned long flags;
513 struct rcu_node *rnp = rcu_get_root(rsp); 590 struct rcu_node *rnp = rcu_get_root(rsp);
514 591
592 /*
593 * OK, time to rat on ourselves...
594 * See Documentation/RCU/stallwarn.txt for info on how to debug
595 * RCU CPU stall warnings.
596 */
515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 597 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 598 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
517 trigger_all_cpu_backtrace(); 599 trigger_all_cpu_backtrace();
@@ -527,31 +609,50 @@ static void print_cpu_stall(struct rcu_state *rsp)
527 609
528static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 610static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
529{ 611{
530 long delta; 612 unsigned long j;
613 unsigned long js;
531 struct rcu_node *rnp; 614 struct rcu_node *rnp;
532 615
533 if (rcu_cpu_stall_panicking) 616 if (rcu_cpu_stall_suppress)
534 return; 617 return;
535 delta = jiffies - rsp->jiffies_stall; 618 j = ACCESS_ONCE(jiffies);
619 js = ACCESS_ONCE(rsp->jiffies_stall);
536 rnp = rdp->mynode; 620 rnp = rdp->mynode;
537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 621 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
538 622
539 /* We haven't checked in, so go dump stack. */ 623 /* We haven't checked in, so go dump stack. */
540 print_cpu_stall(rsp); 624 print_cpu_stall(rsp);
541 625
542 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { 626 } else if (rcu_gp_in_progress(rsp) &&
627 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
543 628
544 /* They had two time units to dump stack, so complain. */ 629 /* They had a few time units to dump stack, so complain. */
545 print_other_cpu_stall(rsp); 630 print_other_cpu_stall(rsp);
546 } 631 }
547} 632}
548 633
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 634static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{ 635{
551 rcu_cpu_stall_panicking = 1; 636 rcu_cpu_stall_suppress = 1;
552 return NOTIFY_DONE; 637 return NOTIFY_DONE;
553} 638}
554 639
640/**
641 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
642 *
643 * Set the stall-warning timeout way off into the future, thus preventing
644 * any RCU CPU stall-warning messages from appearing in the current set of
645 * RCU grace periods.
646 *
647 * The caller must disable hard irqs.
648 */
649void rcu_cpu_stall_reset(void)
650{
651 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
652 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
653 rcu_preempt_stall_reset();
654}
655
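
Both the stall check above and rcu_cpu_stall_reset() depend on comparing free-running jiffies values in a wraparound-safe way; adding ULONG_MAX / 2 pushes the stall deadline as far into the future as such a comparison can express. A small demonstration, again assuming the usual half-range definition of ULONG_CMP_GE():

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
        unsigned long j = ULONG_MAX - 10;       /* jiffies just before wrapping      */
        unsigned long deadline = j + 30;        /* wraps around to a small value     */

        printf("expired now?   %d\n", ULONG_CMP_GE(j, deadline));       /* 0 */
        printf("expired later? %d\n", ULONG_CMP_GE(j + 40, deadline));  /* 1 */

        /* The rcu_cpu_stall_reset() trick: a maximally distant deadline. */
        deadline = j + ULONG_MAX / 2;
        printf("suppressed?    %d\n", !ULONG_CMP_GE(j, deadline));      /* 1 */
        return 0;
}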
555static struct notifier_block rcu_panic_block = { 656static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic, 657 .notifier_call = rcu_panic,
557}; 658};
@@ -561,22 +662,6 @@ static void __init check_cpu_stall_init(void)
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); 662 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562} 663}
563 664
564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
565
566static void record_gp_stall_check_time(struct rcu_state *rsp)
567{
568}
569
570static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
571{
572}
573
574static void __init check_cpu_stall_init(void)
575{
576}
577
578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
579
580/* 665/*
581 * Update CPU-local rcu_data state to record the newly noticed grace period. 666 * Update CPU-local rcu_data state to record the newly noticed grace period.
582 * This is used both when we started the grace period and when we notice 667 * This is used both when we started the grace period and when we notice
@@ -587,9 +672,17 @@ static void __init check_cpu_stall_init(void)
587static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 672static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
588{ 673{
589 if (rdp->gpnum != rnp->gpnum) { 674 if (rdp->gpnum != rnp->gpnum) {
590 rdp->qs_pending = 1; 675 /*
591 rdp->passed_quiesc = 0; 676 * If the current grace period is waiting for this CPU,
677 * set up to detect a quiescent state, otherwise don't
678 * go looking for one.
679 */
592 rdp->gpnum = rnp->gpnum; 680 rdp->gpnum = rnp->gpnum;
681 if (rnp->qsmask & rdp->grpmask) {
682 rdp->qs_pending = 1;
683 rdp->passed_quiesc = 0;
684 } else
685 rdp->qs_pending = 0;
593 } 686 }
594} 687}
595 688
@@ -648,6 +741,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
648 741
649 /* Remember that we saw this grace-period completion. */ 742 /* Remember that we saw this grace-period completion. */
650 rdp->completed = rnp->completed; 743 rdp->completed = rnp->completed;
744
745 /*
746 * If we were in an extended quiescent state, we may have
                                 747	 * missed some grace periods that other CPUs handled on
748 * our behalf. Catch up with this state to avoid noting
749 * spurious new grace periods. If another grace period
750 * has started, then rnp->gpnum will have advanced, so
751 * we will detect this later on.
752 */
753 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
754 rdp->gpnum = rdp->completed;
755
756 /*
757 * If RCU does not need a quiescent state from this CPU,
758 * then make sure that this CPU doesn't go looking for one.
759 */
760 if ((rnp->qsmask & rdp->grpmask) == 0)
761 rdp->qs_pending = 0;
651 } 762 }
652} 763}
653 764
@@ -712,7 +823,7 @@ static void
712rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 823rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
713 __releases(rcu_get_root(rsp)->lock) 824 __releases(rcu_get_root(rsp)->lock)
714{ 825{
715 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 826 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
716 struct rcu_node *rnp = rcu_get_root(rsp); 827 struct rcu_node *rnp = rcu_get_root(rsp);
717 828
718 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 829 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -753,6 +864,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
753 rnp->completed = rsp->completed; 864 rnp->completed = rsp->completed;
754 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
755 rcu_start_gp_per_cpu(rsp, rnp, rdp); 866 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp);
756 raw_spin_unlock_irqrestore(&rnp->lock, flags); 868 raw_spin_unlock_irqrestore(&rnp->lock, flags);
757 return; 869 return;
758 } 870 }
@@ -788,6 +900,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
788 rnp->completed = rsp->completed; 900 rnp->completed = rsp->completed;
789 if (rnp == rdp->mynode) 901 if (rnp == rdp->mynode)
790 rcu_start_gp_per_cpu(rsp, rnp, rdp); 902 rcu_start_gp_per_cpu(rsp, rnp, rdp);
903 rcu_preempt_boost_start_gp(rnp);
791 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
792 } 905 }
793 906
@@ -808,7 +921,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
808static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 921static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
809 __releases(rcu_get_root(rsp)->lock) 922 __releases(rcu_get_root(rsp)->lock)
810{ 923{
924 unsigned long gp_duration;
925
811 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
927
928 /*
929 * Ensure that all grace-period and pre-grace-period activity
930 * is seen before the assignment to rsp->completed.
931 */
932 smp_mb(); /* See above block comment. */
933 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration;
812 rsp->completed = rsp->gpnum; 936 rsp->completed = rsp->gpnum;
813 rsp->signaled = RCU_GP_IDLE; 937 rsp->signaled = RCU_GP_IDLE;
814 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -838,7 +962,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
838 return; 962 return;
839 } 963 }
840 rnp->qsmask &= ~mask; 964 rnp->qsmask &= ~mask;
841 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
842 966
843 /* Other bits still set at this level, so done. */ 967 /* Other bits still set at this level, so done. */
844 raw_spin_unlock_irqrestore(&rnp->lock, flags); 968 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -951,65 +1075,49 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
951#ifdef CONFIG_HOTPLUG_CPU 1075#ifdef CONFIG_HOTPLUG_CPU
952 1076
953/* 1077/*
954 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the 1078 * Move a dying CPU's RCU callbacks to online CPU's callback list.
955 * specified flavor of RCU. The callbacks will be adopted by the next 1079 * Synchronization is not required because this function executes
956 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever 1080 * in stop_machine() context.
957 * comes first. Because this is invoked from the CPU_DYING notifier,
958 * irqs are already disabled.
959 */ 1081 */
960static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1082static void rcu_send_cbs_to_online(struct rcu_state *rsp)
961{ 1083{
962 int i; 1084 int i;
963 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 1085 /* current DYING CPU is cleared in the cpu_online_mask */
1086 int receive_cpu = cpumask_any(cpu_online_mask);
1087 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1088 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
964 1089
965 if (rdp->nxtlist == NULL) 1090 if (rdp->nxtlist == NULL)
966 return; /* irqs disabled, so comparison is stable. */ 1091 return; /* irqs disabled, so comparison is stable. */
967 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1092
968 *rsp->orphan_cbs_tail = rdp->nxtlist; 1093 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
969 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 1094 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1095 receive_rdp->qlen += rdp->qlen;
1096 receive_rdp->n_cbs_adopted += rdp->qlen;
1097 rdp->n_cbs_orphaned += rdp->qlen;
1098
970 rdp->nxtlist = NULL; 1099 rdp->nxtlist = NULL;
971 for (i = 0; i < RCU_NEXT_SIZE; i++) 1100 for (i = 0; i < RCU_NEXT_SIZE; i++)
972 rdp->nxttail[i] = &rdp->nxtlist; 1101 rdp->nxttail[i] = &rdp->nxtlist;
973 rsp->orphan_qlen += rdp->qlen;
974 rdp->qlen = 0; 1102 rdp->qlen = 0;
975 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
976}
977
978/*
979 * Adopt previously orphaned RCU callbacks.
980 */
981static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
982{
983 unsigned long flags;
984 struct rcu_data *rdp;
985
986 raw_spin_lock_irqsave(&rsp->onofflock, flags);
987 rdp = rsp->rda[smp_processor_id()];
988 if (rsp->orphan_cbs_list == NULL) {
989 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
990 return;
991 }
992 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
993 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
994 rdp->qlen += rsp->orphan_qlen;
995 rsp->orphan_cbs_list = NULL;
996 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
997 rsp->orphan_qlen = 0;
998 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
999} 1103}
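
rcu_send_cbs_to_online() works because each per-CPU callback queue is a singly linked list kept together with tail pointers, so one CPU's entire backlog can be appended to another CPU's list with a couple of pointer assignments and a length update. A standalone sketch of that splice, modeling only a single tail pointer rather than the kernel's nxttail[] array:

#include <stdio.h>

struct cb {
        struct cb *next;
        int id;
};

struct cblist {
        struct cb *head;
        struct cb **tail;       /* points at head, or at the last cb's ->next */
        long qlen;
};

static void cblist_init(struct cblist *l)
{
        l->head = NULL;
        l->tail = &l->head;
        l->qlen = 0;
}

static void cblist_enqueue(struct cblist *l, struct cb *cb)
{
        cb->next = NULL;
        *l->tail = cb;          /* link after the current tail */
        l->tail = &cb->next;    /* remember the new tail slot  */
        l->qlen++;
}

/* Move every callback from @from onto the end of @to, in O(1). */
static void cblist_splice(struct cblist *to, struct cblist *from)
{
        if (!from->head)
                return;
        *to->tail = from->head;
        to->tail = from->tail;
        to->qlen += from->qlen;
        cblist_init(from);      /* leave the donor empty but valid */
}

int main(void)
{
        struct cblist dying, online;
        struct cb cbs[4];
        struct cb *p;
        int i;

        cblist_init(&dying);
        cblist_init(&online);
        for (i = 0; i < 4; i++) {
                cbs[i].id = i;
                cblist_enqueue(i < 2 ? &dying : &online, &cbs[i]);
        }
        cblist_splice(&online, &dying);          /* "dying CPU" hands over its work */
        for (p = online.head; p; p = p->next)
                printf("cb %d\n", p->id);
        printf("qlen=%ld\n", online.qlen);
        return 0;
}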
1000 1104
1001/* 1105/*
1002 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1106 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1003 * and move all callbacks from the outgoing CPU to the current one. 1107 * and move all callbacks from the outgoing CPU to the current one.
1108 * There can only be one CPU hotplug operation at a time, so no other
1109 * CPU can be attempting to update rcu_cpu_kthread_task.
1004 */ 1110 */
1005static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1111static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1006{ 1112{
1007 unsigned long flags; 1113 unsigned long flags;
1008 unsigned long mask; 1114 unsigned long mask;
1009 int need_report = 0; 1115 int need_report = 0;
1010 struct rcu_data *rdp = rsp->rda[cpu]; 1116 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1011 struct rcu_node *rnp; 1117 struct rcu_node *rnp;
1012 1118
1119 rcu_stop_cpu_kthread(cpu);
1120
1013 /* Exclude any attempts to start a new grace period. */ 1121 /* Exclude any attempts to start a new grace period. */
1014 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1122 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1015 1123
@@ -1046,8 +1154,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1046 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1154 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1047 if (need_report & RCU_OFL_TASKS_EXP_GP) 1155 if (need_report & RCU_OFL_TASKS_EXP_GP)
1048 rcu_report_exp_rnp(rsp, rnp); 1156 rcu_report_exp_rnp(rsp, rnp);
1049 1157 rcu_node_kthread_setaffinity(rnp, -1);
1050 rcu_adopt_orphan_cbs(rsp);
1051} 1158}
1052 1159
1053/* 1160/*
@@ -1065,11 +1172,7 @@ static void rcu_offline_cpu(int cpu)
1065 1172
1066#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1173#else /* #ifdef CONFIG_HOTPLUG_CPU */
1067 1174
1068static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1175static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1069{
1070}
1071
1072static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1073{ 1176{
1074} 1177}
1075 1178
@@ -1113,7 +1216,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1113 next = list->next; 1216 next = list->next;
1114 prefetch(next); 1217 prefetch(next);
1115 debug_rcu_head_unqueue(list); 1218 debug_rcu_head_unqueue(list);
1116 list->func(list); 1219 __rcu_reclaim(list);
1117 list = next; 1220 list = next;
1118 if (++count >= rdp->blimit) 1221 if (++count >= rdp->blimit)
1119 break; 1222 break;
@@ -1123,6 +1226,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1123 1226
1124 /* Update count, and requeue any remaining callbacks. */ 1227 /* Update count, and requeue any remaining callbacks. */
1125 rdp->qlen -= count; 1228 rdp->qlen -= count;
1229 rdp->n_cbs_invoked += count;
1126 if (list != NULL) { 1230 if (list != NULL) {
1127 *tail = rdp->nxtlist; 1231 *tail = rdp->nxtlist;
1128 rdp->nxtlist = list; 1232 rdp->nxtlist = list;
@@ -1148,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1148 1252
1149 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1253 /* Re-raise the RCU softirq if there are callbacks remaining. */
1150 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1254 if (cpu_has_callbacks_ready_to_invoke(rdp))
1151 raise_softirq(RCU_SOFTIRQ); 1255 invoke_rcu_core();
1152} 1256}
1153 1257
1154/* 1258/*
@@ -1194,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user)
1194 } 1298 }
1195 rcu_preempt_check_callbacks(cpu); 1299 rcu_preempt_check_callbacks(cpu);
1196 if (rcu_pending(cpu)) 1300 if (rcu_pending(cpu))
1197 raise_softirq(RCU_SOFTIRQ); 1301 invoke_rcu_core();
1198} 1302}
1199 1303
1200#ifdef CONFIG_SMP 1304#ifdef CONFIG_SMP
@@ -1202,6 +1306,8 @@ void rcu_check_callbacks(int cpu, int user)
1202/* 1306/*
1203 * Scan the leaf rcu_node structures, processing dyntick state for any that 1307 * Scan the leaf rcu_node structures, processing dyntick state for any that
1204 * have not yet encountered a quiescent state, using the function specified. 1308 * have not yet encountered a quiescent state, using the function specified.
1309 * Also initiate boosting for any threads blocked on the root rcu_node.
1310 *
1205 * The caller must have suppressed start of new grace periods. 1311 * The caller must have suppressed start of new grace periods.
1206 */ 1312 */
1207static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 1313static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1220,13 +1326,14 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1220 return; 1326 return;
1221 } 1327 }
1222 if (rnp->qsmask == 0) { 1328 if (rnp->qsmask == 0) {
1223 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1329 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
1224 continue; 1330 continue;
1225 } 1331 }
1226 cpu = rnp->grplo; 1332 cpu = rnp->grplo;
1227 bit = 1; 1333 bit = 1;
1228 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 1334 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1229 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1335 if ((rnp->qsmask & bit) != 0 &&
1336 f(per_cpu_ptr(rsp->rda, cpu)))
1230 mask |= bit; 1337 mask |= bit;
1231 } 1338 }
1232 if (mask != 0) { 1339 if (mask != 0) {
@@ -1237,6 +1344,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1237 } 1344 }
1238 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1345 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1239 } 1346 }
1347 rnp = rcu_get_root(rsp);
1348 if (rnp->qsmask == 0) {
1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1351 }
1240} 1352}
1241 1353
1242/* 1354/*
@@ -1351,7 +1463,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1351 } 1463 }
1352 1464
1353 /* If there are callbacks ready, invoke them. */ 1465 /* If there are callbacks ready, invoke them. */
1354 rcu_do_batch(rsp, rdp); 1466 if (cpu_has_callbacks_ready_to_invoke(rdp))
1467 invoke_rcu_callbacks(rsp, rdp);
1355} 1468}
1356 1469
1357/* 1470/*
@@ -1359,29 +1472,37 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1359 */ 1472 */
1360static void rcu_process_callbacks(struct softirq_action *unused) 1473static void rcu_process_callbacks(struct softirq_action *unused)
1361{ 1474{
1362 /*
1363 * Memory references from any prior RCU read-side critical sections
1364 * executed by the interrupted code must be seen before any RCU
1365 * grace-period manipulations below.
1366 */
1367 smp_mb(); /* See above block comment. */
1368
1369 __rcu_process_callbacks(&rcu_sched_state, 1475 __rcu_process_callbacks(&rcu_sched_state,
1370 &__get_cpu_var(rcu_sched_data)); 1476 &__get_cpu_var(rcu_sched_data));
1371 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1372 rcu_preempt_process_callbacks(); 1478 rcu_preempt_process_callbacks();
1373 1479
1374 /*
1375 * Memory references from any later RCU read-side critical sections
1376 * executed by the interrupted code must be seen after any RCU
1377 * grace-period manipulations above.
1378 */
1379 smp_mb(); /* See above block comment. */
1380
1381 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ 1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1382 rcu_needs_cpu_flush(); 1481 rcu_needs_cpu_flush();
1383} 1482}
1384 1483
1484/*
1485 * Wake up the current CPU's kthread. This replaces raise_softirq()
1486 * in earlier versions of RCU. Note that because we are running on
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1488 * cannot disappear out from under us.
1489 */
1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1491{
1492 if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
1493 return;
1494 if (likely(!rsp->boost)) {
1495 rcu_do_batch(rsp, rdp);
1496 return;
1497 }
1498 invoke_rcu_callbacks_kthread();
1499}
1500
1501static void invoke_rcu_core(void)
1502{
1503 raise_softirq(RCU_SOFTIRQ);
1504}
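
invoke_rcu_callbacks() above separates "callbacks are ready" from "who runs them": without boosting they still run from softirq context, with boosting they are handed to a per-CPU kthread that can be priority-boosted. A toy model of that dispatch split, with plain functions standing in for the softirq and the kthread wakeup:

#include <stdbool.h>
#include <stdio.h>

static bool boost_enabled;

static void do_batch(void)        { printf("running callbacks inline\n"); }
static void wake_cb_kthread(void) { printf("waking per-CPU callback kthread\n"); }

static void invoke_callbacks(void)
{
        if (!boost_enabled) {
                do_batch();             /* cheap path: run right here        */
                return;
        }
        wake_cb_kthread();              /* boosted path: defer to a kthread  */
}

int main(void)
{
        invoke_callbacks();
        boost_enabled = true;
        invoke_callbacks();
        return 0;
}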
1505
1385static void 1506static void
1386__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1507__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 struct rcu_state *rsp) 1508 struct rcu_state *rsp)
@@ -1402,21 +1523,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1402 * a quiescent state betweentimes. 1523 * a quiescent state betweentimes.
1403 */ 1524 */
1404 local_irq_save(flags); 1525 local_irq_save(flags);
1405 rdp = rsp->rda[smp_processor_id()]; 1526 rdp = this_cpu_ptr(rsp->rda);
1406 rcu_process_gp_end(rsp, rdp);
1407 check_for_new_grace_period(rsp, rdp);
1408 1527
1409 /* Add the callback to our list. */ 1528 /* Add the callback to our list. */
1410 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1529 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1411 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++;
1412 1532
1413 /* Start a new grace period if one not already started. */ 1533 /* If interrupts were disabled, don't dive into RCU core. */
1414 if (!rcu_gp_in_progress(rsp)) { 1534 if (irqs_disabled_flags(flags)) {
1415 unsigned long nestflag; 1535 local_irq_restore(flags);
1416 struct rcu_node *rnp_root = rcu_get_root(rsp); 1536 return;
1417
1418 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1419 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1420 } 1537 }
1421 1538
1422 /* 1539 /*
@@ -1426,13 +1543,28 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1426 * invoking force_quiescent_state() if the newly enqueued callback 1543 * invoking force_quiescent_state() if the newly enqueued callback
1427 * is the only one waiting for a grace period to complete. 1544 * is the only one waiting for a grace period to complete.
1428 */ 1545 */
1429 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1546 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1430 rdp->blimit = LONG_MAX; 1547
1431 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1548 /* Are we ignoring a completed grace period? */
1432 *rdp->nxttail[RCU_DONE_TAIL] != head) 1549 rcu_process_gp_end(rsp, rdp);
1433 force_quiescent_state(rsp, 0); 1550 check_for_new_grace_period(rsp, rdp);
1434 rdp->n_force_qs_snap = rsp->n_force_qs; 1551
1435 rdp->qlen_last_fqs_check = rdp->qlen; 1552 /* Start a new grace period if one not already started. */
1553 if (!rcu_gp_in_progress(rsp)) {
1554 unsigned long nestflag;
1555 struct rcu_node *rnp_root = rcu_get_root(rsp);
1556
1557 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1558 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1559 } else {
1560 /* Give the grace period a kick. */
1561 rdp->blimit = LONG_MAX;
1562 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1563 *rdp->nxttail[RCU_DONE_TAIL] != head)
1564 force_quiescent_state(rsp, 0);
1565 rdp->n_force_qs_snap = rsp->n_force_qs;
1566 rdp->qlen_last_fqs_check = rdp->qlen;
1567 }
1436 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1568 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1437 force_quiescent_state(rsp, 1); 1569 force_quiescent_state(rsp, 1);
1438 local_irq_restore(flags); 1570 local_irq_restore(flags);
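
The reworked __call_rcu() keeps the enqueue path cheap: it only counts, and the expensive grace-period work runs at most once per qhimark enqueues because the watermark is advanced whenever the check fires. A sketch of that high-water-mark throttle (threshold and names are illustrative):

#include <stdio.h>

#define QHIMARK 10000

static long qlen;                       /* callbacks currently queued    */
static long qlen_last_check;            /* qlen when we last intervened  */
static int expensive_checks;

static void enqueue_callback(void)
{
        qlen++;                                         /* fast path            */
        if (qlen > qlen_last_check + QHIMARK) {         /* rare slow path       */
                expensive_checks++;                     /* e.g. kick a GP along */
                qlen_last_check = qlen;
        }
}

int main(void)
{
        long i;

        for (i = 0; i < 35000; i++)
                enqueue_callback();
        printf("enqueued %ld callbacks, expensive checks: %d\n",
               i, expensive_checks);
        return 0;
}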
@@ -1547,7 +1679,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1547 * or RCU-bh, force a local reschedule. 1679 * or RCU-bh, force a local reschedule.
1548 */ 1680 */
1549 rdp->n_rp_qs_pending++; 1681 rdp->n_rp_qs_pending++;
1550 if (!rdp->preemptable && 1682 if (!rdp->preemptible &&
1551 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1552 jiffies)) 1684 jiffies))
1553 set_need_resched(); 1685 set_need_resched();
@@ -1662,13 +1794,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
1662 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1794 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1663 * might complete its grace period before all of the other CPUs 1795 * might complete its grace period before all of the other CPUs
1664 * did their increment, causing this function to return too 1796 * did their increment, causing this function to return too
1665 * early. 1797 * early. Note that on_each_cpu() disables irqs, which prevents
1798 * any CPUs from coming online or going offline until each online
1799 * CPU has queued its RCU-barrier callback.
1666 */ 1800 */
1667 atomic_set(&rcu_barrier_cpu_count, 1); 1801 atomic_set(&rcu_barrier_cpu_count, 1);
1668 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1669 rcu_adopt_orphan_cbs(rsp);
1670 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1802 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1671 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1672 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1803 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1673 complete(&rcu_barrier_completion); 1804 complete(&rcu_barrier_completion);
1674 wait_for_completion(&rcu_barrier_completion); 1805 wait_for_completion(&rcu_barrier_completion);
@@ -1701,7 +1832,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1701{ 1832{
1702 unsigned long flags; 1833 unsigned long flags;
1703 int i; 1834 int i;
1704 struct rcu_data *rdp = rsp->rda[cpu]; 1835 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1705 struct rcu_node *rnp = rcu_get_root(rsp); 1836 struct rcu_node *rnp = rcu_get_root(rsp);
1706 1837
1707 /* Set up local state, ensuring consistent view of global state. */ 1838 /* Set up local state, ensuring consistent view of global state. */
@@ -1725,11 +1856,11 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1725 * that this CPU cannot possibly have any RCU callbacks in flight yet. 1856 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1726 */ 1857 */
1727static void __cpuinit 1858static void __cpuinit
1728rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1859rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1729{ 1860{
1730 unsigned long flags; 1861 unsigned long flags;
1731 unsigned long mask; 1862 unsigned long mask;
1732 struct rcu_data *rdp = rsp->rda[cpu]; 1863 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1733 struct rcu_node *rnp = rcu_get_root(rsp); 1864 struct rcu_node *rnp = rcu_get_root(rsp);
1734 1865
1735 /* Set up local state, ensuring consistent view of global state. */ 1866 /* Set up local state, ensuring consistent view of global state. */
@@ -1737,7 +1868,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1737 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1738 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1739 rdp->beenonline = 1; /* We have now been online. */ 1870 rdp->beenonline = 1; /* We have now been online. */
1740 rdp->preemptable = preemptable; 1871 rdp->preemptible = preemptible;
1741 rdp->qlen_last_fqs_check = 0; 1872 rdp->qlen_last_fqs_check = 0;
1742 rdp->n_force_qs_snap = rsp->n_force_qs; 1873 rdp->n_force_qs_snap = rsp->n_force_qs;
1743 rdp->blimit = blimit; 1874 rdp->blimit = blimit;
@@ -1771,7 +1902,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1771 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1902 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1772} 1903}
1773 1904
1774static void __cpuinit rcu_online_cpu(int cpu) 1905static void __cpuinit rcu_prepare_cpu(int cpu)
1775{ 1906{
1776 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 1907 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1777 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 1908 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
@@ -1785,27 +1916,34 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1785 unsigned long action, void *hcpu) 1916 unsigned long action, void *hcpu)
1786{ 1917{
1787 long cpu = (long)hcpu; 1918 long cpu = (long)hcpu;
1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1920 struct rcu_node *rnp = rdp->mynode;
1788 1921
1789 switch (action) { 1922 switch (action) {
1790 case CPU_UP_PREPARE: 1923 case CPU_UP_PREPARE:
1791 case CPU_UP_PREPARE_FROZEN: 1924 case CPU_UP_PREPARE_FROZEN:
1792 rcu_online_cpu(cpu); 1925 rcu_prepare_cpu(cpu);
1926 rcu_prepare_kthreads(cpu);
1927 break;
1928 case CPU_ONLINE:
1929 case CPU_DOWN_FAILED:
1930 rcu_node_kthread_setaffinity(rnp, -1);
1931 rcu_cpu_kthread_setrt(cpu, 1);
1932 break;
1933 case CPU_DOWN_PREPARE:
1934 rcu_node_kthread_setaffinity(rnp, cpu);
1935 rcu_cpu_kthread_setrt(cpu, 0);
1793 break; 1936 break;
1794 case CPU_DYING: 1937 case CPU_DYING:
1795 case CPU_DYING_FROZEN: 1938 case CPU_DYING_FROZEN:
1796 /* 1939 /*
1797 * preempt_disable() in _rcu_barrier() prevents stop_machine(), 1940 * The whole machine is "stopped" except this CPU, so we can
1798 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" 1941 * touch any data without introducing corruption. We send the
1799 * returns, all online cpus have queued rcu_barrier_func(). 1942 * dying CPU's callbacks to an arbitrarily chosen online CPU.
1800 * The dying CPU clears its cpu_online_mask bit and
1801 * moves all of its RCU callbacks to ->orphan_cbs_list
1802 * in the context of stop_machine(), so subsequent calls
1803 * to _rcu_barrier() will adopt these callbacks and only
1804 * then queue rcu_barrier_func() on all remaining CPUs.
1805 */ 1943 */
1806 rcu_send_cbs_to_orphanage(&rcu_bh_state); 1944 rcu_send_cbs_to_online(&rcu_bh_state);
1807 rcu_send_cbs_to_orphanage(&rcu_sched_state); 1945 rcu_send_cbs_to_online(&rcu_sched_state);
1808 rcu_preempt_send_cbs_to_orphanage(); 1946 rcu_preempt_send_cbs_to_online();
1809 break; 1947 break;
1810 case CPU_DEAD: 1948 case CPU_DEAD:
1811 case CPU_DEAD_FROZEN: 1949 case CPU_DEAD_FROZEN:
@@ -1843,8 +1981,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1843{ 1981{
1844 int i; 1982 int i;
1845 1983
1846 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) 1984 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1847 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1985 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1986 rsp->levelspread[0] = RCU_FANOUT_LEAF;
1848} 1987}
1849#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1988#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1850static void __init rcu_init_levelspread(struct rcu_state *rsp) 1989static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -1865,7 +2004,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1865/* 2004/*
1866 * Helper function for rcu_init() that initializes one rcu_state structure. 2005 * Helper function for rcu_init() that initializes one rcu_state structure.
1867 */ 2006 */
1868static void __init rcu_init_one(struct rcu_state *rsp) 2007static void __init rcu_init_one(struct rcu_state *rsp,
2008 struct rcu_data __percpu *rda)
1869{ 2009{
1870 static char *buf[] = { "rcu_node_level_0", 2010 static char *buf[] = { "rcu_node_level_0",
1871 "rcu_node_level_1", 2011 "rcu_node_level_1",
@@ -1911,46 +2051,29 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1911 j / rsp->levelspread[i - 1]; 2051 j / rsp->levelspread[i - 1];
1912 } 2052 }
1913 rnp->level = i; 2053 rnp->level = i;
1914 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 2054 INIT_LIST_HEAD(&rnp->blkd_tasks);
1915 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1916 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1917 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1918 } 2055 }
1919 } 2056 }
1920 2057
2058 rsp->rda = rda;
1921 rnp = rsp->level[NUM_RCU_LVLS - 1]; 2059 rnp = rsp->level[NUM_RCU_LVLS - 1];
1922 for_each_possible_cpu(i) { 2060 for_each_possible_cpu(i) {
1923 while (i > rnp->grphi) 2061 while (i > rnp->grphi)
1924 rnp++; 2062 rnp++;
1925 rsp->rda[i]->mynode = rnp; 2063 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
1926 rcu_boot_init_percpu_data(i, rsp); 2064 rcu_boot_init_percpu_data(i, rsp);
1927 } 2065 }
1928} 2066}
1929 2067
1930/*
1931 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1932 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1933 * structure.
1934 */
1935#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1936do { \
1937 int i; \
1938 \
1939 for_each_possible_cpu(i) { \
1940 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1941 } \
1942 rcu_init_one(rsp); \
1943} while (0)
1944
1945void __init rcu_init(void) 2068void __init rcu_init(void)
1946{ 2069{
1947 int cpu; 2070 int cpu;
1948 2071
1949 rcu_bootup_announce(); 2072 rcu_bootup_announce();
1950 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 2073 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1951 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 2074 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1952 __rcu_init_preempt(); 2075 __rcu_init_preempt();
1953 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 2076 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1954 2077
1955 /* 2078 /*
1956 * We don't need protection against CPU-hotplug here because 2079 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed0..01b2ccda26fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this did work well going from three levels to four.
35 * bug somewhere. 35 * Of course, your mileage may vary.
36 */ 36 */
37#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_LEAF 16
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) 41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#if NR_CPUS <= RCU_FANOUT 43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47
48#if NR_CPUS <= RCU_FANOUT_1
44# define NUM_RCU_LVLS 1 49# define NUM_RCU_LVLS 1
45# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
46# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
47# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
48# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
50#elif NR_CPUS <= RCU_FANOUT_SQ 55#elif NR_CPUS <= RCU_FANOUT_2
51# define NUM_RCU_LVLS 2 56# define NUM_RCU_LVLS 2
52# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
55# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
57#elif NR_CPUS <= RCU_FANOUT_CUBE 62#elif NR_CPUS <= RCU_FANOUT_3
58# define NUM_RCU_LVLS 3 63# define NUM_RCU_LVLS 3
59# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
62# define NUM_RCU_LVL_3 NR_CPUS 67# define NUM_RCU_LVL_3 (NR_CPUS)
63# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH 69#elif NR_CPUS <= RCU_FANOUT_4
65# define NUM_RCU_LVLS 4 70# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 74# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
70# define NUM_RCU_LVL_4 NR_CPUS 75# define NUM_RCU_LVL_4 (NR_CPUS)
71#else 76#else
72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 78#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79
75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
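
The new macros cap the leaf fanout at 16 while interior levels keep CONFIG_RCU_FANOUT, and the number of levels is simply how many times NR_CPUS must be divided down before a single root node suffices. A small program that reproduces the arithmetic for a few illustrative configurations:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

static void shape(long nr_cpus, long fanout, long fanout_leaf)
{
        long width = DIV_ROUND_UP(nr_cpus, fanout_leaf);  /* leaf rcu_node count */
        long nodes = width;
        int levels = 1;

        while (width > 1) {                               /* add interior levels */
                width = DIV_ROUND_UP(width, fanout);
                nodes += width;
                levels++;
        }
        printf("NR_CPUS=%-5ld FANOUT=%-3ld LEAF=%-3ld -> %d levels, %ld rcu_node structures\n",
               nr_cpus, fanout, fanout_leaf, levels, nodes);
}

int main(void)
{
        shape(16, 64, 16);      /* one leaf node doubles as the root            */
        shape(256, 64, 16);     /* two levels: 16 leaves under one root         */
        shape(4096, 64, 16);    /* three levels: 256 leaves, 4 interior, 1 root */
        return 0;
}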
@@ -79,13 +84,19 @@
79 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
80 */ 85 */
81struct rcu_dynticks { 86struct rcu_dynticks {
82 int dynticks_nesting; /* Track nesting level, sort of. */ 87 int dynticks_nesting; /* Track irq/process nesting level. */
83 int dynticks; /* Even value for dynticks-idle, else odd. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
84 int dynticks_nmi; /* Even value for either dynticks-idle or */ 89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
85 /* not in nmi handler, else odd. So this */
86 /* remains even for nmi from irq handler. */
87}; 90};
88 91
92/* RCU's kthread states for tracing. */
93#define RCU_KTHREAD_STOPPED 0
94#define RCU_KTHREAD_RUNNING 1
95#define RCU_KTHREAD_WAITING 2
96#define RCU_KTHREAD_OFFCPU 3
97#define RCU_KTHREAD_YIELDING 4
98#define RCU_KTHREAD_MAX 4
99
89/* 100/*
90 * Definition for node within the RCU grace-period-detection hierarchy. 101 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 102 */
@@ -104,10 +115,13 @@ struct rcu_node {
104 /* an rcu_data structure, otherwise, each */ 115 /* an rcu_data structure, otherwise, each */
105 /* bit corresponds to a child rcu_node */ 116 /* bit corresponds to a child rcu_node */
106 /* structure. */ 117 /* structure. */
107 unsigned long expmask; /* Groups that have ->blocked_tasks[] */ 118 unsigned long expmask; /* Groups that have ->blkd_tasks */
108 /* elements that need to drain to allow the */ 119 /* elements that need to drain to allow the */
109 /* current expedited grace period to */ 120 /* current expedited grace period to */
110 /* complete (only for TREE_PREEMPT_RCU). */ 121 /* complete (only for TREE_PREEMPT_RCU). */
122 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
123 /* Since this has meaning only for leaf */
124 /* rcu_node structures, 32 bits suffices. */
111 unsigned long qsmaskinit; 125 unsigned long qsmaskinit;
112 /* Per-GP initial value for qsmask & expmask. */ 126 /* Per-GP initial value for qsmask & expmask. */
113 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 127 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -117,11 +131,62 @@ struct rcu_node {
117 u8 grpnum; /* CPU/group number for next level up. */ 131 u8 grpnum; /* CPU/group number for next level up. */
118 u8 level; /* root is at level 0. */ 132 u8 level; /* root is at level 0. */
119 struct rcu_node *parent; 133 struct rcu_node *parent;
120 struct list_head blocked_tasks[4]; 134 struct list_head blkd_tasks;
121 /* Tasks blocked in RCU read-side critsect. */ 135 /* Tasks blocked in RCU read-side critical */
122 /* Grace period number (->gpnum) x blocked */ 136 /* section. Tasks are placed at the head */
123 /* by tasks on the (x & 0x1) element of the */ 137 /* of this list and age towards the tail. */
124 /* blocked_tasks[] array. */ 138 struct list_head *gp_tasks;
139 /* Pointer to the first task blocking the */
140 /* current grace period, or NULL if there */
141 /* is no such task. */
142 struct list_head *exp_tasks;
143 /* Pointer to the first task blocking the */
144 /* current expedited grace period, or NULL */
145 /* if there is no such task. If there */
146 /* is no current expedited grace period, */
147 /* then there cannot be any such task. */
148#ifdef CONFIG_RCU_BOOST
149 struct list_head *boost_tasks;
150 /* Pointer to first task that needs to be */
151 /* priority boosted, or NULL if no priority */
152 /* boosting is needed for this rcu_node */
153 /* structure. If there are no tasks */
154 /* queued on this rcu_node structure that */
155 /* are blocking the current grace period, */
156 /* there can be no such task. */
157 unsigned long boost_time;
158 /* When to start boosting (jiffies). */
159 struct task_struct *boost_kthread_task;
160 /* kthread that takes care of priority */
161 /* boosting for this rcu_node structure. */
162 unsigned int boost_kthread_status;
163 /* State of boost_kthread_task for tracing. */
164 unsigned long n_tasks_boosted;
165 /* Total number of tasks boosted. */
166 unsigned long n_exp_boosts;
167 /* Number of tasks boosted for expedited GP. */
168 unsigned long n_normal_boosts;
169 /* Number of tasks boosted for normal GP. */
170 unsigned long n_balk_blkd_tasks;
171 /* Refused to boost: no blocked tasks. */
172 unsigned long n_balk_exp_gp_tasks;
173 /* Refused to boost: nothing blocking GP. */
174 unsigned long n_balk_boost_tasks;
175 /* Refused to boost: already boosting. */
176 unsigned long n_balk_notblocked;
177 /* Refused to boost: RCU RS CS still running. */
178 unsigned long n_balk_notyet;
179 /* Refused to boost: not yet time. */
180 unsigned long n_balk_nos;
181 /* Refused to boost: not sure why, though. */
182 /* This can happen due to race conditions. */
183#endif /* #ifdef CONFIG_RCU_BOOST */
184 struct task_struct *node_kthread_task;
185 /* kthread that takes care of this rcu_node */
186 /* structure, for example, awakening the */
187 /* per-CPU kthreads as needed. */
188 unsigned int node_kthread_status;
189 /* State of node_kthread_task for tracing. */
125} ____cacheline_internodealigned_in_smp; 190} ____cacheline_internodealigned_in_smp;
126 191
127/* 192/*
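The single ->blkd_tasks list above replaces the old four-way ->blocked_tasks[] array: newly preempted readers go in at the head, entries age toward the tail, and ->gp_tasks / ->exp_tasks simply point at the first entry still blocking the current normal or expedited grace period (or are NULL). A simplified sketch of that one-list-plus-cursors shape, with hypothetical types for illustration:

#include <linux/list.h>

/* Hypothetical, stripped-down mirror of the relevant rcu_node fields. */
struct blkd_node_sketch {
	struct list_head blkd_tasks;	/* all readers preempted on this node   */
	struct list_head *gp_tasks;	/* first entry blocking the current GP  */
	struct list_head *exp_tasks;	/* first entry blocking an expedited GP */
};

/* "Do queued readers still block the current grace period?" becomes a
 * single pointer test instead of scanning two of the old per-phase
 * sublists.
 */
static inline int blocked_readers_cgp(struct blkd_node_sketch *rnp)
{
	return rnp->gp_tasks != NULL;
}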
@@ -170,7 +235,7 @@ struct rcu_data {
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 235 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 236 bool qs_pending; /* Core waits for quiesc state. */
172 bool beenonline; /* CPU online at least once. */ 237 bool beenonline; /* CPU online at least once. */
173 bool preemptable; /* Preemptable RCU? */ 238 bool preemptible; /* Preemptible RCU? */
174 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 239 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
175 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 240 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
176 241
@@ -202,6 +267,9 @@ struct rcu_data {
202 long qlen; /* # of queued callbacks */ 267 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check; 268 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 269 /* qlen at last check for QS forcing */
270 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
271 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
272 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
205 unsigned long n_force_qs_snap; 273 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */ 274 /* did other CPU force QS recently? */
207 long blimit; /* Upper limit on a processed batch */ 275 long blimit; /* Upper limit on a processed batch */
@@ -210,7 +278,6 @@ struct rcu_data {
210 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
211 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
212 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
213 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
214#endif /* #ifdef CONFIG_NO_HZ */ 281#endif /* #ifdef CONFIG_NO_HZ */
215 282
216 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -246,7 +313,6 @@ struct rcu_data {
246#endif /* #else #ifdef CONFIG_NO_HZ */ 313#endif /* #else #ifdef CONFIG_NO_HZ */
247 314
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 315#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
249#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
250 316
251#ifdef CONFIG_PROVE_RCU 317#ifdef CONFIG_PROVE_RCU
252#define RCU_STALL_DELAY_DELTA (5 * HZ) 318#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -254,19 +320,26 @@ struct rcu_data {
254#define RCU_STALL_DELAY_DELTA 0 320#define RCU_STALL_DELAY_DELTA 0
255#endif 321#endif
256 322
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) 323#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
324 RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */ 325 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) 326#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
260 /* for rsp->jiffies_stall */ 327 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 328#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */ 329 /* to take at least one */
263 /* scheduling clock irq */ 330 /* scheduling clock irq */
264 /* before ratting on them. */ 331 /* before ratting on them. */
265 332
266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 333#define rcu_wait(cond) \
267 334do { \
268#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 335 for (;;) { \
269#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) 336 set_current_state(TASK_INTERRUPTIBLE); \
337 if (cond) \
338 break; \
339 schedule(); \
340 } \
341 __set_current_state(TASK_RUNNING); \
342} while (0)
270 343
271/* 344/*
272 * RCU global state, including node hierarchy. This hierarchy is 345 * RCU global state, including node hierarchy. This hierarchy is
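The rcu_wait() macro above is the familiar set-state / test / schedule() kthread wait loop, and the stall macros now scale with CONFIG_RCU_CPU_STALL_TIMEOUT (a timeout of 60, for instance, yields 60 * HZ + RCU_STALL_DELAY_DELTA for the first check and three times that plus 30 jiffies for the recheck). A hedged usage sketch of the wait loop in a hypothetical kthread, in the same style as the per-node and boost kthreads added by this merge, which wait with rcu_wait(rnp->boost_tasks || rnp->exp_tasks):

/* Hypothetical kthread body; example_state and do_example_work() are
 * illustrative stand-ins, not kernel symbols.
 */
static int example_kthread(void *arg)
{
	struct example_state *sp = arg;

	for (;;) {
		rcu_wait(sp->work_pending);	/* sleeps in TASK_INTERRUPTIBLE */
		sp->work_pending = 0;
		do_example_work(sp);
	}
	return 0;	/* NOTREACHED */
}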
@@ -283,7 +356,7 @@ struct rcu_state {
283 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 356 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
284 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 357 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
285 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 358 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
286 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ 359 struct rcu_data __percpu *rda; /* pointer to percpu rcu_data. */
287 360
288 /* The following fields are guarded by the root rcu_node's lock. */ 361 /* The following fields are guarded by the root rcu_node's lock. */
289 362
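Replacing the per-state array of NR_CPUS rcu_data pointers with one __percpu pointer means accesses now go through per_cpu_ptr(), as the rcu_preempt_note_context_switch() change later in this diff shows (rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu)). A minimal sketch of that access pattern, using a hypothetical dynamically allocated percpu object rather than the kernel's static per-CPU data:

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/init.h>

struct example_pcpu {			/* hypothetical per-CPU payload */
	unsigned long count;
};

static struct example_pcpu __percpu *example_data;

static int __init example_init(void)
{
	int cpu;

	example_data = alloc_percpu(struct example_pcpu);
	if (!example_data)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		per_cpu_ptr(example_data, cpu)->count = 0;
	return 0;
}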
@@ -296,21 +369,14 @@ struct rcu_state {
296 /* period because */ 369 /* period because */
297 /* force_quiescent_state() */ 370 /* force_quiescent_state() */
298 /* was running. */ 371 /* was running. */
372 u8 boost; /* Subject to priority boost. */
299 unsigned long gpnum; /* Current gp number. */ 373 unsigned long gpnum; /* Current gp number. */
300 unsigned long completed; /* # of last completed gp. */ 374 unsigned long completed; /* # of last completed gp. */
301 375
302 /* End of fields guarded by root rcu_node's lock. */ 376 /* End of fields guarded by root rcu_node's lock. */
303 377
304 raw_spinlock_t onofflock; /* exclude on/offline and */ 378 raw_spinlock_t onofflock; /* exclude on/offline and */
305 /* starting new GP. Also */ 379 /* starting new GP. */
306 /* protects the following */
307 /* orphan_cbs fields. */
308 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
309 /* orphaned by all CPUs in */
310 /* a given leaf rcu_node */
311 /* going offline. */
312 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
313 long orphan_qlen; /* Number of orphaned cbs. */
314 raw_spinlock_t fqslock; /* Only one task forcing */ 380 raw_spinlock_t fqslock; /* Only one task forcing */
315 /* quiescent states. */ 381 /* quiescent states. */
316 unsigned long jiffies_force_qs; /* Time at which to invoke */ 382 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -321,12 +387,12 @@ struct rcu_state {
321 /* due to lock unavailable. */ 387 /* due to lock unavailable. */
322 unsigned long n_force_qs_ngp; /* Number of calls leaving */ 388 unsigned long n_force_qs_ngp; /* Number of calls leaving */
323 /* due to no GP active. */ 389 /* due to no GP active. */
324#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
325 unsigned long gp_start; /* Time at which GP started, */ 390 unsigned long gp_start; /* Time at which GP started, */
326 /* but in jiffies. */ 391 /* but in jiffies. */
327 unsigned long jiffies_stall; /* Time at which to check */ 392 unsigned long jiffies_stall; /* Time at which to check */
328 /* for CPU stalls. */ 393 /* for CPU stalls. */
329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 394 unsigned long gp_max; /* Maximum GP duration in */
395 /* jiffies. */
330 char *name; /* Name of structure. */ 396 char *name; /* Name of structure. */
331}; 397};
332 398
@@ -357,15 +423,15 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
357static void rcu_bootup_announce(void); 423static void rcu_bootup_announce(void);
358long rcu_batches_completed(void); 424long rcu_batches_completed(void);
359static void rcu_preempt_note_context_switch(int cpu); 425static void rcu_preempt_note_context_switch(int cpu);
360static int rcu_preempted_readers(struct rcu_node *rnp); 426static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
361#ifdef CONFIG_HOTPLUG_CPU 427#ifdef CONFIG_HOTPLUG_CPU
362static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 428static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
363 unsigned long flags); 429 unsigned long flags);
430static void rcu_stop_cpu_kthread(int cpu);
364#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 431#endif /* #ifdef CONFIG_HOTPLUG_CPU */
365#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
366static void rcu_print_detail_task_stall(struct rcu_state *rsp); 432static void rcu_print_detail_task_stall(struct rcu_state *rsp);
367static void rcu_print_task_stall(struct rcu_node *rnp); 433static void rcu_print_task_stall(struct rcu_node *rnp);
368#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 434static void rcu_preempt_stall_reset(void);
369static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
370#ifdef CONFIG_HOTPLUG_CPU 436#ifdef CONFIG_HOTPLUG_CPU
371static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 437static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -382,8 +448,23 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
382static int rcu_preempt_pending(int cpu); 448static int rcu_preempt_pending(int cpu);
383static int rcu_preempt_needs_cpu(int cpu); 449static int rcu_preempt_needs_cpu(int cpu);
384static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 450static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
385static void rcu_preempt_send_cbs_to_orphanage(void); 451static void rcu_preempt_send_cbs_to_online(void);
386static void __init __rcu_init_preempt(void); 452static void __init __rcu_init_preempt(void);
387static void rcu_needs_cpu_flush(void); 453static void rcu_needs_cpu_flush(void);
454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
456static void invoke_rcu_callbacks_kthread(void);
457#ifdef CONFIG_RCU_BOOST
458static void rcu_preempt_do_callbacks(void);
459static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
460 cpumask_var_t cm);
461static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
462 struct rcu_node *rnp,
463 int rnp_index);
464static void invoke_rcu_node_kthread(struct rcu_node *rnp);
465static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
466#endif /* #ifdef CONFIG_RCU_BOOST */
467static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
468static void __cpuinit rcu_prepare_kthreads(int cpu);
388 469
389#endif /* #ifndef RCU_TREE_NONCORE */ 470#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d9..8aafbb80b8b0 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
28 29
29/* 30/*
30 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -53,11 +54,7 @@ static void __init rcu_bootup_announce_oddness(void)
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif 56#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif 59#endif
63#if NUM_RCU_LVL_4 != 0 60#if NUM_RCU_LVL_4 != 0
@@ -69,7 +66,9 @@ static void __init rcu_bootup_announce_oddness(void)
69 66
70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
71DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state;
72 70
71static void rcu_read_unlock_special(struct task_struct *t);
73static int rcu_preempted_readers_exp(struct rcu_node *rnp); 72static int rcu_preempted_readers_exp(struct rcu_node *rnp);
74 73
75/* 74/*
@@ -77,7 +76,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
77 */ 76 */
78static void __init rcu_bootup_announce(void) 77static void __init rcu_bootup_announce(void)
79{ 78{
80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); 79 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
81 rcu_bootup_announce_oddness(); 80 rcu_bootup_announce_oddness();
82} 81}
83 82
@@ -110,7 +109,7 @@ void rcu_force_quiescent_state(void)
110EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 109EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
111 110
112/* 111/*
113 * Record a preemptable-RCU quiescent state for the specified CPU. Note 112 * Record a preemptible-RCU quiescent state for the specified CPU. Note
114 * that this just means that the task currently running on the CPU is 113 * that this just means that the task currently running on the CPU is
115 * not in a quiescent state. There might be any number of tasks blocked 114 * not in a quiescent state. There might be any number of tasks blocked
116 * while in an RCU read-side critical section. 115 * while in an RCU read-side critical section.
@@ -133,12 +132,12 @@ static void rcu_preempt_qs(int cpu)
133 * We have entered the scheduler, and the current task might soon be 132 * We have entered the scheduler, and the current task might soon be
134 * context-switched away from. If this task is in an RCU read-side 133 * context-switched away from. If this task is in an RCU read-side
135 * critical section, we will no longer be able to rely on the CPU to 134 * critical section, we will no longer be able to rely on the CPU to
136 * record that fact, so we enqueue the task on the appropriate entry 135 * record that fact, so we enqueue the task on the blkd_tasks list.
137 * of the blocked_tasks[] array. The task will dequeue itself when 136 * The task will dequeue itself when it exits the outermost enclosing
138 * it exits the outermost enclosing RCU read-side critical section. 137 * RCU read-side critical section. Therefore, the current grace period
139 * Therefore, the current grace period cannot be permitted to complete 138 * cannot be permitted to complete until the blkd_tasks list entries
140 * until the blocked_tasks[] entry indexed by the low-order bit of 139 * predating the current grace period drain, in other words, until
141 * rnp->gpnum empties. 140 * rnp->gp_tasks becomes NULL.
142 * 141 *
143 * Caller must disable preemption. 142 * Caller must disable preemption.
144 */ 143 */
@@ -146,15 +145,14 @@ static void rcu_preempt_note_context_switch(int cpu)
146{ 145{
147 struct task_struct *t = current; 146 struct task_struct *t = current;
148 unsigned long flags; 147 unsigned long flags;
149 int phase;
150 struct rcu_data *rdp; 148 struct rcu_data *rdp;
151 struct rcu_node *rnp; 149 struct rcu_node *rnp;
152 150
153 if (t->rcu_read_lock_nesting && 151 if (t->rcu_read_lock_nesting > 0 &&
154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 152 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
155 153
156 /* Possibly blocking in an RCU read-side critical section. */ 154 /* Possibly blocking in an RCU read-side critical section. */
157 rdp = rcu_preempt_state.rda[cpu]; 155 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
158 rnp = rdp->mynode; 156 rnp = rdp->mynode;
159 raw_spin_lock_irqsave(&rnp->lock, flags); 157 raw_spin_lock_irqsave(&rnp->lock, flags);
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 158 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -168,16 +166,39 @@ static void rcu_preempt_note_context_switch(int cpu)
168 * (i.e., this CPU has not yet passed through a quiescent 166 * (i.e., this CPU has not yet passed through a quiescent
169 * state for the current grace period), then as long 167 * state for the current grace period), then as long
170 * as that task remains queued, the current grace period 168 * as that task remains queued, the current grace period
171 * cannot end. 169 * cannot end. Note that there is some uncertainty as
170 * to exactly when the current grace period started.
171 * We take a conservative approach, which can result
172 * in unnecessarily waiting on tasks that started very
173 * slightly after the current grace period began. C'est
174 * la vie!!!
172 * 175 *
173 * But first, note that the current CPU must still be 176 * But first, note that the current CPU must still be
174 * on line! 177 * on line!
175 */ 178 */
176 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 179 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
177 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 180 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
178 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 181 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
179 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 182 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
183 rnp->gp_tasks = &t->rcu_node_entry;
184#ifdef CONFIG_RCU_BOOST
185 if (rnp->boost_tasks != NULL)
186 rnp->boost_tasks = rnp->gp_tasks;
187#endif /* #ifdef CONFIG_RCU_BOOST */
188 } else {
189 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
190 if (rnp->qsmask & rdp->grpmask)
191 rnp->gp_tasks = &t->rcu_node_entry;
192 }
180 raw_spin_unlock_irqrestore(&rnp->lock, flags); 193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
194 } else if (t->rcu_read_lock_nesting < 0 &&
195 t->rcu_read_unlock_special) {
196
197 /*
198 * Complete exit from RCU read-side critical section on
199 * behalf of preempted instance of __rcu_read_unlock().
200 */
201 rcu_read_unlock_special(t);
181 } 202 }
182 203
183 /* 204 /*
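The enqueue logic above keeps ->blkd_tasks ordered so that everything from ->gp_tasks to the tail blocks the current grace period: a reader whose CPU still owes the grace period a quiescent state is inserted just before the current *gp_tasks entry and becomes the new ->gp_tasks, while any other reader is added at the head, ahead of the blocking region. A condensed sketch of that placement decision (hypothetical simplified types, with locking and the boost_tasks handling elided):

#include <linux/list.h>
#include <linux/types.h>

struct rnp_sketch {			/* hypothetical, simplified */
	struct list_head blkd_tasks;
	struct list_head *gp_tasks;
};

/* blocks_gp: this reader's CPU has not yet reported a quiescent state
 * for the current grace period (rnp->qsmask & rdp->grpmask in the
 * kernel code).
 */
static void enqueue_blocked_reader(struct rnp_sketch *rnp,
				   struct list_head *entry, bool blocks_gp)
{
	if (blocks_gp && rnp->gp_tasks != NULL) {
		/* Insert immediately before the current first GP blocker... */
		list_add(entry, rnp->gp_tasks->prev);
		/* ...and this entry becomes the new first GP blocker. */
		rnp->gp_tasks = entry;
	} else {
		list_add(entry, &rnp->blkd_tasks);	/* head of the list */
		if (blocks_gp)
			rnp->gp_tasks = entry;	/* previously no GP blockers */
	}
}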
@@ -195,13 +216,13 @@ static void rcu_preempt_note_context_switch(int cpu)
195} 216}
196 217
197/* 218/*
198 * Tree-preemptable RCU implementation for rcu_read_lock(). 219 * Tree-preemptible RCU implementation for rcu_read_lock().
199 * Just increment ->rcu_read_lock_nesting, shared state will be updated 220 * Just increment ->rcu_read_lock_nesting, shared state will be updated
200 * if we block. 221 * if we block.
201 */ 222 */
202void __rcu_read_lock(void) 223void __rcu_read_lock(void)
203{ 224{
204 ACCESS_ONCE(current->rcu_read_lock_nesting)++; 225 current->rcu_read_lock_nesting++;
205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ 226 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
206} 227}
207EXPORT_SYMBOL_GPL(__rcu_read_lock); 228EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -211,12 +232,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
211 * for the specified rcu_node structure. If the caller needs a reliable 232 * for the specified rcu_node structure. If the caller needs a reliable
212 * answer, it must hold the rcu_node's ->lock. 233 * answer, it must hold the rcu_node's ->lock.
213 */ 234 */
214static int rcu_preempted_readers(struct rcu_node *rnp) 235static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
215{ 236{
216 int phase = rnp->gpnum & 0x1; 237 return rnp->gp_tasks != NULL;
217
218 return !list_empty(&rnp->blocked_tasks[phase]) ||
219 !list_empty(&rnp->blocked_tasks[phase + 2]);
220} 238}
221 239
222/* 240/*
@@ -232,7 +250,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
232 unsigned long mask; 250 unsigned long mask;
233 struct rcu_node *rnp_p; 251 struct rcu_node *rnp_p;
234 252
235 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 253 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
236 raw_spin_unlock_irqrestore(&rnp->lock, flags); 254 raw_spin_unlock_irqrestore(&rnp->lock, flags);
237 return; /* Still need more quiescent states! */ 255 return; /* Still need more quiescent states! */
238 } 256 }
@@ -256,15 +274,31 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
256} 274}
257 275
258/* 276/*
277 * Advance a ->blkd_tasks-list pointer to the next entry, instead
278 * returning NULL if at the end of the list.
279 */
280static struct list_head *rcu_next_node_entry(struct task_struct *t,
281 struct rcu_node *rnp)
282{
283 struct list_head *np;
284
285 np = t->rcu_node_entry.next;
286 if (np == &rnp->blkd_tasks)
287 np = NULL;
288 return np;
289}
290
291/*
259 * Handle special cases during rcu_read_unlock(), such as needing to 292 * Handle special cases during rcu_read_unlock(), such as needing to
260 * notify RCU core processing or task having blocked during the RCU 293 * notify RCU core processing or task having blocked during the RCU
261 * read-side critical section. 294 * read-side critical section.
262 */ 295 */
263static void rcu_read_unlock_special(struct task_struct *t) 296static noinline void rcu_read_unlock_special(struct task_struct *t)
264{ 297{
265 int empty; 298 int empty;
266 int empty_exp; 299 int empty_exp;
267 unsigned long flags; 300 unsigned long flags;
301 struct list_head *np;
268 struct rcu_node *rnp; 302 struct rcu_node *rnp;
269 int special; 303 int special;
270 304
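rcu_next_node_entry() above exists because kernel list_heads are circular: the entry "after the last one" is the list head itself, and the ->gp_tasks / ->exp_tasks cursors want NULL there instead. The same idiom in a generic sketch with a hypothetical element type:

#include <linux/list.h>

struct item {				/* hypothetical element type */
	struct list_head link;
};

/* Return the entry following *it, or NULL if *it is the last entry on
 * the list headed by *head.
 */
static struct list_head *next_or_null(struct item *it, struct list_head *head)
{
	struct list_head *np = it->link.next;

	return (np == head) ? NULL : np;
}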
@@ -284,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
284 } 318 }
285 319
286 /* Hardware IRQ handlers cannot block. */ 320 /* Hardware IRQ handlers cannot block. */
287 if (in_irq()) { 321 if (in_irq() || in_serving_softirq()) {
288 local_irq_restore(flags); 322 local_irq_restore(flags);
289 return; 323 return;
290 } 324 }
@@ -305,10 +339,24 @@ static void rcu_read_unlock_special(struct task_struct *t)
305 break; 339 break;
306 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 340 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
307 } 341 }
308 empty = !rcu_preempted_readers(rnp); 342 empty = !rcu_preempt_blocked_readers_cgp(rnp);
309 empty_exp = !rcu_preempted_readers_exp(rnp); 343 empty_exp = !rcu_preempted_readers_exp(rnp);
310 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
345 np = rcu_next_node_entry(t, rnp);
311 list_del_init(&t->rcu_node_entry); 346 list_del_init(&t->rcu_node_entry);
347 if (&t->rcu_node_entry == rnp->gp_tasks)
348 rnp->gp_tasks = np;
349 if (&t->rcu_node_entry == rnp->exp_tasks)
350 rnp->exp_tasks = np;
351#ifdef CONFIG_RCU_BOOST
352 if (&t->rcu_node_entry == rnp->boost_tasks)
353 rnp->boost_tasks = np;
354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
355 if (t->rcu_boosted) {
356 special |= RCU_READ_UNLOCK_BOOSTED;
357 t->rcu_boosted = 0;
358 }
359#endif /* #ifdef CONFIG_RCU_BOOST */
312 t->rcu_blocked_node = NULL; 360 t->rcu_blocked_node = NULL;
313 361
314 /* 362 /*
@@ -321,6 +369,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
321 else 369 else
322 rcu_report_unblock_qs_rnp(rnp, flags); 370 rcu_report_unblock_qs_rnp(rnp, flags);
323 371
372#ifdef CONFIG_RCU_BOOST
373 /* Unboost if we were boosted. */
374 if (special & RCU_READ_UNLOCK_BOOSTED) {
375 rt_mutex_unlock(t->rcu_boost_mutex);
376 t->rcu_boost_mutex = NULL;
377 }
378#endif /* #ifdef CONFIG_RCU_BOOST */
379
324 /* 380 /*
325 * If this was the last task on the expedited lists, 381 * If this was the last task on the expedited lists,
326 * then we need to report up the rcu_node hierarchy. 382 * then we need to report up the rcu_node hierarchy.
@@ -333,7 +389,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
333} 389}
334 390
335/* 391/*
336 * Tree-preemptable RCU implementation for rcu_read_unlock(). 392 * Tree-preemptible RCU implementation for rcu_read_unlock().
337 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 393 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
338 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 394 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
339 * invoke rcu_read_unlock_special() to clean up after a context switch 395 * invoke rcu_read_unlock_special() to clean up after a context switch
@@ -344,17 +400,26 @@ void __rcu_read_unlock(void)
344 struct task_struct *t = current; 400 struct task_struct *t = current;
345 401
346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
347 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 403 if (t->rcu_read_lock_nesting != 1)
348 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 404 --t->rcu_read_lock_nesting;
349 rcu_read_unlock_special(t); 405 else {
406 t->rcu_read_lock_nesting = INT_MIN;
407 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
409 rcu_read_unlock_special(t);
410 barrier(); /* ->rcu_read_unlock_special load before assign */
411 t->rcu_read_lock_nesting = 0;
412 }
350#ifdef CONFIG_PROVE_LOCKING 413#ifdef CONFIG_PROVE_LOCKING
351 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); 414 {
415 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
416
417 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
418 }
352#endif /* #ifdef CONFIG_PROVE_LOCKING */ 419#endif /* #ifdef CONFIG_PROVE_LOCKING */
353} 420}
354EXPORT_SYMBOL_GPL(__rcu_read_unlock); 421EXPORT_SYMBOL_GPL(__rcu_read_unlock);
355 422
356#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
357
358#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 423#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
359 424
360/* 425/*
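The reworked __rcu_read_unlock() above parks ->rcu_read_lock_nesting at INT_MIN while the special-case processing runs, so the new nesting < 0 branch in rcu_preempt_note_context_switch() can tell "leaving the outermost critical section" apart from both "inside one" (positive) and "fully outside" (zero); the PROVE_LOCKING check correspondingly warns only about negative values that are not near INT_MIN. A user-space rendering of the counter protocol, for illustration only (no barriers or concurrency shown):

#include <limits.h>
#include <stdio.h>

/* Hypothetical, simplified task state: the nesting counter plus a flag
 * standing in for ->rcu_read_unlock_special.
 */
struct task_sketch {
	int nesting;
	int unlock_special;
};

static void sketch_read_unlock(struct task_sketch *t)
{
	if (t->nesting != 1) {
		--t->nesting;			/* still nested, nothing special */
	} else {
		t->nesting = INT_MIN;		/* sentinel: outermost unlock in flight */
		if (t->unlock_special)
			t->unlock_special = 0;	/* kernel: rcu_read_unlock_special(t) */
		t->nesting = 0;			/* fully outside the critical section */
	}
}

int main(void)
{
	struct task_sketch t = { .nesting = 2, .unlock_special = 1 };

	sketch_read_unlock(&t);		/* 2 -> 1 */
	sketch_read_unlock(&t);		/* 1 -> INT_MIN -> 0, special case handled */
	printf("%d %d\n", t.nesting, t.unlock_special);	/* 0 0 */
	return 0;
}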
@@ -364,18 +429,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
364static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 429static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
365{ 430{
366 unsigned long flags; 431 unsigned long flags;
367 struct list_head *lp;
368 int phase;
369 struct task_struct *t; 432 struct task_struct *t;
370 433
371 if (rcu_preempted_readers(rnp)) { 434 if (!rcu_preempt_blocked_readers_cgp(rnp))
372 raw_spin_lock_irqsave(&rnp->lock, flags); 435 return;
373 phase = rnp->gpnum & 0x1; 436 raw_spin_lock_irqsave(&rnp->lock, flags);
374 lp = &rnp->blocked_tasks[phase]; 437 t = list_entry(rnp->gp_tasks,
375 list_for_each_entry(t, lp, rcu_node_entry) 438 struct task_struct, rcu_node_entry);
376 sched_show_task(t); 439 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
377 raw_spin_unlock_irqrestore(&rnp->lock, flags); 440 sched_show_task(t);
378 } 441 raw_spin_unlock_irqrestore(&rnp->lock, flags);
379} 442}
380 443
381/* 444/*
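The stall-dump loops above no longer walk per-phase sublists: they convert ->gp_tasks back into a task_struct with list_entry() and then continue down the remainder of ->blkd_tasks, since everything from there to the tail blocks the current grace period. A generic sketch of resuming an iteration from a saved cursor with list_for_each_entry_continue(), using hypothetical types; note that the _continue variant starts at the entry after the one passed in:

#include <linux/list.h>
#include <linux/kernel.h>

struct blocked_item {			/* hypothetical element type */
	struct list_head link;
	int id;
};

/* Visit every entry after *cursor, to the end of the list. */
static void walk_after_cursor(struct list_head *head, struct list_head *cursor)
{
	struct blocked_item *it;

	if (cursor == NULL)
		return;				/* nothing is blocked */
	it = list_entry(cursor, struct blocked_item, link);
	list_for_each_entry_continue(it, head, link)	/* starts at cursor->next */
		printk(KERN_INFO "blocked: %d\n", it->id);
}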
@@ -405,19 +468,25 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
405 */ 468 */
406static void rcu_print_task_stall(struct rcu_node *rnp) 469static void rcu_print_task_stall(struct rcu_node *rnp)
407{ 470{
408 struct list_head *lp;
409 int phase;
410 struct task_struct *t; 471 struct task_struct *t;
411 472
412 if (rcu_preempted_readers(rnp)) { 473 if (!rcu_preempt_blocked_readers_cgp(rnp))
413 phase = rnp->gpnum & 0x1; 474 return;
414 lp = &rnp->blocked_tasks[phase]; 475 t = list_entry(rnp->gp_tasks,
415 list_for_each_entry(t, lp, rcu_node_entry) 476 struct task_struct, rcu_node_entry);
416 printk(" P%d", t->pid); 477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
417 } 478 printk(" P%d", t->pid);
418} 479}
419 480
420#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 481/*
482 * Suppress preemptible RCU's CPU stall warnings by pushing the
483 * time of the next stall-warning message comfortably far into the
484 * future.
485 */
486static void rcu_preempt_stall_reset(void)
487{
488 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
489}
421 490
422/* 491/*
423 * Check that the list of blocked tasks for the newly completed grace 492 * Check that the list of blocked tasks for the newly completed grace
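rcu_preempt_stall_reset() above works because stall checks compare jiffies against ->jiffies_stall with the wraparound-safe ULONG_CMP_GE()/ULONG_CMP_LT() macros from rcutree.h, so adding ULONG_MAX / 2 pushes the deadline as far into the future as that comparison can express. A standalone sketch of the modular comparison, with made-up counter values:

#include <stdio.h>
#include <limits.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long now = ULONG_MAX - 5;	/* counter about to wrap */
	unsigned long deadline = now + 10;	/* wraps around to 4     */

	/* Despite the wrap, the deadline still compares as "in the future". */
	printf("deadline still ahead? %d\n", ULONG_CMP_LT(now, deadline));	/* 1 */
	printf("deadline reached?     %d\n", ULONG_CMP_GE(now, deadline));	/* 0 */

	/* Pushing a deadline out by ULONG_MAX / 2 keeps it ahead for as long
	 * as the unsigned comparison can represent.
	 */
	deadline = now + ULONG_MAX / 2;
	printf("far-future deadline reached? %d\n", ULONG_CMP_GE(now, deadline)); /* 0 */
	return 0;
}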
@@ -425,10 +494,15 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
425 * period that still has RCU readers blocked! This function must be 494 * period that still has RCU readers blocked! This function must be
426 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 495 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
427 * must be held by the caller. 496 * must be held by the caller.
497 *
498 * Also, if there are blocked tasks on the list, they automatically
499 * block the newly created grace period, so set up ->gp_tasks accordingly.
428 */ 500 */
429static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 501static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
430{ 502{
431 WARN_ON_ONCE(rcu_preempted_readers(rnp)); 503 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
504 if (!list_empty(&rnp->blkd_tasks))
505 rnp->gp_tasks = rnp->blkd_tasks.next;
432 WARN_ON_ONCE(rnp->qsmask); 506 WARN_ON_ONCE(rnp->qsmask);
433} 507}
434 508
@@ -452,50 +526,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
452 struct rcu_node *rnp, 526 struct rcu_node *rnp,
453 struct rcu_data *rdp) 527 struct rcu_data *rdp)
454{ 528{
455 int i;
456 struct list_head *lp; 529 struct list_head *lp;
457 struct list_head *lp_root; 530 struct list_head *lp_root;
458 int retval = 0; 531 int retval = 0;
459 struct rcu_node *rnp_root = rcu_get_root(rsp); 532 struct rcu_node *rnp_root = rcu_get_root(rsp);
460 struct task_struct *tp; 533 struct task_struct *t;
461 534
462 if (rnp == rnp_root) { 535 if (rnp == rnp_root) {
463 WARN_ONCE(1, "Last CPU thought to be offlined?"); 536 WARN_ONCE(1, "Last CPU thought to be offlined?");
464 return 0; /* Shouldn't happen: at least one CPU online. */ 537 return 0; /* Shouldn't happen: at least one CPU online. */
465 } 538 }
466 WARN_ON_ONCE(rnp != rdp->mynode && 539
467 (!list_empty(&rnp->blocked_tasks[0]) || 540 /* If we are on an internal node, complain bitterly. */
468 !list_empty(&rnp->blocked_tasks[1]) || 541 WARN_ON_ONCE(rnp != rdp->mynode);
469 !list_empty(&rnp->blocked_tasks[2]) ||
470 !list_empty(&rnp->blocked_tasks[3])));
471 542
472 /* 543 /*
473 * Move tasks up to root rcu_node. Rely on the fact that the 544 * Move tasks up to root rcu_node. Don't try to get fancy for
474 * root rcu_node can be at most one ahead of the rest of the 545 * this corner-case operation -- just put this node's tasks
475 * rcu_nodes in terms of gp_num value. This fact allows us to 546 * at the head of the root node's list, and update the root node's
476 * move the blocked_tasks[] array directly, element by element. 547 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
548 * if non-NULL. This might result in waiting for more tasks than
549 * absolutely necessary, but this is a good performance/complexity
550 * tradeoff.
477 */ 551 */
478 if (rcu_preempted_readers(rnp)) 552 if (rcu_preempt_blocked_readers_cgp(rnp))
479 retval |= RCU_OFL_TASKS_NORM_GP; 553 retval |= RCU_OFL_TASKS_NORM_GP;
480 if (rcu_preempted_readers_exp(rnp)) 554 if (rcu_preempted_readers_exp(rnp))
481 retval |= RCU_OFL_TASKS_EXP_GP; 555 retval |= RCU_OFL_TASKS_EXP_GP;
482 for (i = 0; i < 4; i++) { 556 lp = &rnp->blkd_tasks;
483 lp = &rnp->blocked_tasks[i]; 557 lp_root = &rnp_root->blkd_tasks;
484 lp_root = &rnp_root->blocked_tasks[i]; 558 while (!list_empty(lp)) {
485 while (!list_empty(lp)) { 559 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
486 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 560 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
487 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 561 list_del(&t->rcu_node_entry);
488 list_del(&tp->rcu_node_entry); 562 t->rcu_blocked_node = rnp_root;
489 tp->rcu_blocked_node = rnp_root; 563 list_add(&t->rcu_node_entry, lp_root);
490 list_add(&tp->rcu_node_entry, lp_root); 564 if (&t->rcu_node_entry == rnp->gp_tasks)
491 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 565 rnp_root->gp_tasks = rnp->gp_tasks;
492 } 566 if (&t->rcu_node_entry == rnp->exp_tasks)
567 rnp_root->exp_tasks = rnp->exp_tasks;
568#ifdef CONFIG_RCU_BOOST
569 if (&t->rcu_node_entry == rnp->boost_tasks)
570 rnp_root->boost_tasks = rnp->boost_tasks;
571#endif /* #ifdef CONFIG_RCU_BOOST */
572 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
493 } 573 }
574
575#ifdef CONFIG_RCU_BOOST
576 /* In case root is being boosted and leaf is not. */
577 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
578 if (rnp_root->boost_tasks != NULL &&
579 rnp_root->boost_tasks != rnp_root->gp_tasks)
580 rnp_root->boost_tasks = rnp_root->gp_tasks;
581 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
582#endif /* #ifdef CONFIG_RCU_BOOST */
583
584 rnp->gp_tasks = NULL;
585 rnp->exp_tasks = NULL;
494 return retval; 586 return retval;
495} 587}
496 588
497/* 589/*
498 * Do CPU-offline processing for preemptable RCU. 590 * Do CPU-offline processing for preemptible RCU.
499 */ 591 */
500static void rcu_preempt_offline_cpu(int cpu) 592static void rcu_preempt_offline_cpu(int cpu)
501{ 593{
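rcu_preempt_offline_tasks() above drains the outgoing leaf's readers one entry at a time onto the root's ->blkd_tasks and, whenever the entry being moved is the one a leaf cursor points at, hands that cursor to the root, so the root ends up waiting for at least the readers the leaf was waiting for. A condensed sketch of the per-entry move (hypothetical simplified types; locking, rcu_blocked_node updates, and the boost_tasks handling elided):

#include <linux/list.h>

struct node_sketch {			/* hypothetical, simplified */
	struct list_head blkd_tasks;
	struct list_head *gp_tasks;
	struct list_head *exp_tasks;
};

/* Move one blocked-reader entry from the dying leaf to the root,
 * carrying the grace-period cursors along when they point at it.
 */
static void move_one_blocked_entry(struct node_sketch *leaf,
				   struct node_sketch *root,
				   struct list_head *entry)
{
	list_del(entry);
	list_add(entry, &root->blkd_tasks);
	if (entry == leaf->gp_tasks)
		root->gp_tasks = entry;
	if (entry == leaf->exp_tasks)
		root->exp_tasks = entry;
}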
@@ -519,12 +611,13 @@ static void rcu_preempt_check_callbacks(int cpu)
519 rcu_preempt_qs(cpu); 611 rcu_preempt_qs(cpu);
520 return; 612 return;
521 } 613 }
522 if (per_cpu(rcu_preempt_data, cpu).qs_pending) 614 if (t->rcu_read_lock_nesting > 0 &&
615 per_cpu(rcu_preempt_data, cpu).qs_pending)
523 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 616 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
524} 617}
525 618
526/* 619/*
527 * Process callbacks for preemptable RCU. 620 * Process callbacks for preemptible RCU.
528 */ 621 */
529static void rcu_preempt_process_callbacks(void) 622static void rcu_preempt_process_callbacks(void)
530{ 623{
@@ -532,8 +625,17 @@ static void rcu_preempt_process_callbacks(void)
532 &__get_cpu_var(rcu_preempt_data)); 625 &__get_cpu_var(rcu_preempt_data));
533} 626}
534 627
628#ifdef CONFIG_RCU_BOOST
629
630static void rcu_preempt_do_callbacks(void)
631{
632 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
633}
634
635#endif /* #ifdef CONFIG_RCU_BOOST */
636
535/* 637/*
536 * Queue a preemptable-RCU callback for invocation after a grace period. 638 * Queue a preemptible-RCU callback for invocation after a grace period.
537 */ 639 */
538void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 640void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
539{ 641{
@@ -546,9 +648,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
546 * 648 *
547 * Control will return to the caller some time after a full grace 649 * Control will return to the caller some time after a full grace
548 * period has elapsed, in other words after all currently executing RCU 650 * period has elapsed, in other words after all currently executing RCU
549 * read-side critical sections have completed. RCU read-side critical 651 * read-side critical sections have completed. Note, however, that
550 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 652 * upon return from synchronize_rcu(), the caller might well be executing
551 * and may be nested. 653 * concurrently with new RCU read-side critical sections that began while
654 * synchronize_rcu() was waiting. RCU read-side critical sections are
655 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
552 */ 656 */
553void synchronize_rcu(void) 657void synchronize_rcu(void)
554{ 658{
@@ -579,8 +683,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
579 */ 683 */
580static int rcu_preempted_readers_exp(struct rcu_node *rnp) 684static int rcu_preempted_readers_exp(struct rcu_node *rnp)
581{ 685{
582 return !list_empty(&rnp->blocked_tasks[2]) || 686 return rnp->exp_tasks != NULL;
583 !list_empty(&rnp->blocked_tasks[3]);
584} 687}
585 688
586/* 689/*
@@ -615,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
615 718
616 raw_spin_lock_irqsave(&rnp->lock, flags); 719 raw_spin_lock_irqsave(&rnp->lock, flags);
617 for (;;) { 720 for (;;) {
618 if (!sync_rcu_preempt_exp_done(rnp)) 721 if (!sync_rcu_preempt_exp_done(rnp)) {
722 raw_spin_unlock_irqrestore(&rnp->lock, flags);
619 break; 723 break;
724 }
620 if (rnp->parent == NULL) { 725 if (rnp->parent == NULL) {
726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
621 wake_up(&sync_rcu_preempt_exp_wq); 727 wake_up(&sync_rcu_preempt_exp_wq);
622 break; 728 break;
623 } 729 }
@@ -627,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
627 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 733 raw_spin_lock(&rnp->lock); /* irqs already disabled */
628 rnp->expmask &= ~mask; 734 rnp->expmask &= ~mask;
629 } 735 }
630 raw_spin_unlock_irqrestore(&rnp->lock, flags);
631} 736}
632 737
633/* 738/*
@@ -640,13 +745,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
640static void 745static void
641sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 746sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
642{ 747{
643 int must_wait; 748 unsigned long flags;
749 int must_wait = 0;
644 750
645 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 751 raw_spin_lock_irqsave(&rnp->lock, flags);
646 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 752 if (list_empty(&rnp->blkd_tasks))
647 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
648 must_wait = rcu_preempted_readers_exp(rnp); 754 else {
649 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 755 rnp->exp_tasks = rnp->blkd_tasks.next;
756 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
757 must_wait = 1;
758 }
650 if (!must_wait) 759 if (!must_wait)
651 rcu_report_exp_rnp(rsp, rnp); 760 rcu_report_exp_rnp(rsp, rnp);
652} 761}
@@ -654,9 +763,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
654/* 763/*
655 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 764 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
656 * is to invoke synchronize_sched_expedited() to push all the tasks to 765 * is to invoke synchronize_sched_expedited() to push all the tasks to
657 * the ->blocked_tasks[] lists, move all entries from the first set of 766 * the ->blkd_tasks lists and wait for this list to drain.
658 * ->blocked_tasks[] lists to the second set, and finally wait for this
659 * second set to drain.
660 */ 767 */
661void synchronize_rcu_expedited(void) 768void synchronize_rcu_expedited(void)
662{ 769{
@@ -688,7 +795,7 @@ void synchronize_rcu_expedited(void)
688 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 795 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
689 goto unlock_mb_ret; /* Others did our work for us. */ 796 goto unlock_mb_ret; /* Others did our work for us. */
690 797
691 /* force all RCU readers onto blocked_tasks[]. */ 798 /* force all RCU readers onto ->blkd_tasks lists. */
692 synchronize_sched_expedited(); 799 synchronize_sched_expedited();
693 800
694 raw_spin_lock_irqsave(&rsp->onofflock, flags); 801 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -700,7 +807,7 @@ void synchronize_rcu_expedited(void)
700 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 807 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
701 } 808 }
702 809
703 /* Snapshot current state of ->blocked_tasks[] lists. */ 810 /* Snapshot current state of ->blkd_tasks lists. */
704 rcu_for_each_leaf_node(rsp, rnp) 811 rcu_for_each_leaf_node(rsp, rnp)
705 sync_rcu_preempt_exp_init(rsp, rnp); 812 sync_rcu_preempt_exp_init(rsp, rnp);
706 if (NUM_RCU_NODES > 1) 813 if (NUM_RCU_NODES > 1)
@@ -708,7 +815,7 @@ void synchronize_rcu_expedited(void)
708 815
709 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 816 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
710 817
711 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 818 /* Wait for snapshotted ->blkd_tasks lists to drain. */
712 rnp = rcu_get_root(rsp); 819 rnp = rcu_get_root(rsp);
713 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
714 sync_rcu_preempt_exp_done(rnp)); 821 sync_rcu_preempt_exp_done(rnp));
@@ -724,7 +831,7 @@ mb_ret:
724EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 831EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
725 832
726/* 833/*
727 * Check to see if there is any immediate preemptable-RCU-related work 834 * Check to see if there is any immediate preemptible-RCU-related work
728 * to be done. 835 * to be done.
729 */ 836 */
730static int rcu_preempt_pending(int cpu) 837static int rcu_preempt_pending(int cpu)
@@ -734,7 +841,7 @@ static int rcu_preempt_pending(int cpu)
734} 841}
735 842
736/* 843/*
737 * Does preemptable RCU need the CPU to stay out of dynticks mode? 844 * Does preemptible RCU need the CPU to stay out of dynticks mode?
738 */ 845 */
739static int rcu_preempt_needs_cpu(int cpu) 846static int rcu_preempt_needs_cpu(int cpu)
740{ 847{
@@ -751,7 +858,7 @@ void rcu_barrier(void)
751EXPORT_SYMBOL_GPL(rcu_barrier); 858EXPORT_SYMBOL_GPL(rcu_barrier);
752 859
753/* 860/*
754 * Initialize preemptable RCU's per-CPU data. 861 * Initialize preemptible RCU's per-CPU data.
755 */ 862 */
756static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 863static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
757{ 864{
@@ -759,23 +866,23 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
759} 866}
760 867
761/* 868/*
762 * Move preemptable RCU's callbacks to ->orphan_cbs_list. 869 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
763 */ 870 */
764static void rcu_preempt_send_cbs_to_orphanage(void) 871static void rcu_preempt_send_cbs_to_online(void)
765{ 872{
766 rcu_send_cbs_to_orphanage(&rcu_preempt_state); 873 rcu_send_cbs_to_online(&rcu_preempt_state);
767} 874}
768 875
769/* 876/*
770 * Initialize preemptable RCU's state structures. 877 * Initialize preemptible RCU's state structures.
771 */ 878 */
772static void __init __rcu_init_preempt(void) 879static void __init __rcu_init_preempt(void)
773{ 880{
774 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); 881 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
775} 882}
776 883
777/* 884/*
778 * Check for a task exiting while in a preemptable-RCU read-side 885 * Check for a task exiting while in a preemptible-RCU read-side
779 * critical section, clean up if so. No need to issue warnings, 886 * critical section, clean up if so. No need to issue warnings,
780 * as debug_check_no_locks_held() already does this if lockdep 887 * as debug_check_no_locks_held() already does this if lockdep
781 * is enabled. 888 * is enabled.
@@ -787,11 +894,13 @@ void exit_rcu(void)
787 if (t->rcu_read_lock_nesting == 0) 894 if (t->rcu_read_lock_nesting == 0)
788 return; 895 return;
789 t->rcu_read_lock_nesting = 1; 896 t->rcu_read_lock_nesting = 1;
790 rcu_read_unlock(); 897 __rcu_read_unlock();
791} 898}
792 899
793#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 900#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
794 901
902static struct rcu_state *rcu_state = &rcu_sched_state;
903
795/* 904/*
796 * Tell them what RCU they are running. 905 * Tell them what RCU they are running.
797 */ 906 */
@@ -821,7 +930,7 @@ void rcu_force_quiescent_state(void)
821EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 930EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
822 931
823/* 932/*
824 * Because preemptable RCU does not exist, we never have to check for 933 * Because preemptible RCU does not exist, we never have to check for
825 * CPUs being in quiescent states. 934 * CPUs being in quiescent states.
826 */ 935 */
827static void rcu_preempt_note_context_switch(int cpu) 936static void rcu_preempt_note_context_switch(int cpu)
@@ -829,10 +938,10 @@ static void rcu_preempt_note_context_switch(int cpu)
829} 938}
830 939
831/* 940/*
832 * Because preemptable RCU does not exist, there are never any preempted 941 * Because preemptible RCU does not exist, there are never any preempted
833 * RCU readers. 942 * RCU readers.
834 */ 943 */
835static int rcu_preempted_readers(struct rcu_node *rnp) 944static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
836{ 945{
837 return 0; 946 return 0;
838} 947}
@@ -847,10 +956,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
847 956
848#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 957#endif /* #ifdef CONFIG_HOTPLUG_CPU */
849 958
850#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
851
852/* 959/*
853 * Because preemptable RCU does not exist, we never have to check for 960 * Because preemptible RCU does not exist, we never have to check for
854 * tasks blocked within RCU read-side critical sections. 961 * tasks blocked within RCU read-side critical sections.
855 */ 962 */
856static void rcu_print_detail_task_stall(struct rcu_state *rsp) 963static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -858,17 +965,23 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
858} 965}
859 966
860/* 967/*
861 * Because preemptable RCU does not exist, we never have to check for 968 * Because preemptible RCU does not exist, we never have to check for
862 * tasks blocked within RCU read-side critical sections. 969 * tasks blocked within RCU read-side critical sections.
863 */ 970 */
864static void rcu_print_task_stall(struct rcu_node *rnp) 971static void rcu_print_task_stall(struct rcu_node *rnp)
865{ 972{
866} 973}
867 974
868#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 975/*
976 * Because preemptible RCU does not exist, there is no need to suppress
977 * its CPU stall warnings.
978 */
979static void rcu_preempt_stall_reset(void)
980{
981}
869 982
870/* 983/*
871 * Because there is no preemptable RCU, there can be no readers blocked, 984 * Because there is no preemptible RCU, there can be no readers blocked,
872 * so there is no need to check for blocked tasks. So check only for 985 * so there is no need to check for blocked tasks. So check only for
873 * bogus qsmask values. 986 * bogus qsmask values.
874 */ 987 */
@@ -880,7 +993,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
880#ifdef CONFIG_HOTPLUG_CPU 993#ifdef CONFIG_HOTPLUG_CPU
881 994
882/* 995/*
883 * Because preemptable RCU does not exist, it never needs to migrate 996 * Because preemptible RCU does not exist, it never needs to migrate
884 * tasks that were blocked within RCU read-side critical sections, and 997 * tasks that were blocked within RCU read-side critical sections, and
885 * such non-existent tasks cannot possibly have been blocking the current 998 * such non-existent tasks cannot possibly have been blocking the current
886 * grace period. 999 * grace period.
@@ -893,7 +1006,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
893} 1006}
894 1007
895/* 1008/*
896 * Because preemptable RCU does not exist, it never needs CPU-offline 1009 * Because preemptible RCU does not exist, it never needs CPU-offline
897 * processing. 1010 * processing.
898 */ 1011 */
899static void rcu_preempt_offline_cpu(int cpu) 1012static void rcu_preempt_offline_cpu(int cpu)
@@ -903,7 +1016,7 @@ static void rcu_preempt_offline_cpu(int cpu)
903#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1016#endif /* #ifdef CONFIG_HOTPLUG_CPU */
904 1017
905/* 1018/*
906 * Because preemptable RCU does not exist, it never has any callbacks 1019 * Because preemptible RCU does not exist, it never has any callbacks
907 * to check. 1020 * to check.
908 */ 1021 */
909static void rcu_preempt_check_callbacks(int cpu) 1022static void rcu_preempt_check_callbacks(int cpu)
@@ -911,7 +1024,7 @@ static void rcu_preempt_check_callbacks(int cpu)
911} 1024}
912 1025
913/* 1026/*
914 * Because preemptable RCU does not exist, it never has any callbacks 1027 * Because preemptible RCU does not exist, it never has any callbacks
915 * to process. 1028 * to process.
916 */ 1029 */
917static void rcu_preempt_process_callbacks(void) 1030static void rcu_preempt_process_callbacks(void)
@@ -919,17 +1032,8 @@ static void rcu_preempt_process_callbacks(void)
919} 1032}
920 1033
921/* 1034/*
922 * In classic RCU, call_rcu() is just call_rcu_sched().
923 */
924void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
925{
926 call_rcu_sched(head, func);
927}
928EXPORT_SYMBOL_GPL(call_rcu);
929
930/*
931 * Wait for an rcu-preempt grace period, but make it happen quickly. 1035 * Wait for an rcu-preempt grace period, but make it happen quickly.
932 * But because preemptable RCU does not exist, map to rcu-sched. 1036 * But because preemptible RCU does not exist, map to rcu-sched.
933 */ 1037 */
934void synchronize_rcu_expedited(void) 1038void synchronize_rcu_expedited(void)
935{ 1039{
@@ -940,7 +1044,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
940#ifdef CONFIG_HOTPLUG_CPU 1044#ifdef CONFIG_HOTPLUG_CPU
941 1045
942/* 1046/*
943 * Because preemptable RCU does not exist, there is never any need to 1047 * Because preemptible RCU does not exist, there is never any need to
944 * report on tasks preempted in RCU read-side critical sections during 1048 * report on tasks preempted in RCU read-side critical sections during
945 * expedited RCU grace periods. 1049 * expedited RCU grace periods.
946 */ 1050 */
@@ -952,7 +1056,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
952#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1056#endif /* #ifdef CONFIG_HOTPLUG_CPU */
953 1057
954/* 1058/*
955 * Because preemptable RCU does not exist, it never has any work to do. 1059 * Because preemptible RCU does not exist, it never has any work to do.
956 */ 1060 */
957static int rcu_preempt_pending(int cpu) 1061static int rcu_preempt_pending(int cpu)
958{ 1062{
@@ -960,7 +1064,7 @@ static int rcu_preempt_pending(int cpu)
960} 1064}
961 1065
962/* 1066/*
963 * Because preemptable RCU does not exist, it never needs any CPU. 1067 * Because preemptible RCU does not exist, it never needs any CPU.
964 */ 1068 */
965static int rcu_preempt_needs_cpu(int cpu) 1069static int rcu_preempt_needs_cpu(int cpu)
966{ 1070{
@@ -968,7 +1072,7 @@ static int rcu_preempt_needs_cpu(int cpu)
968} 1072}
969 1073
970/* 1074/*
971 * Because preemptable RCU does not exist, rcu_barrier() is just 1075 * Because preemptible RCU does not exist, rcu_barrier() is just
972 * another name for rcu_barrier_sched(). 1076 * another name for rcu_barrier_sched().
973 */ 1077 */
974void rcu_barrier(void) 1078void rcu_barrier(void)
@@ -978,7 +1082,7 @@ void rcu_barrier(void)
978EXPORT_SYMBOL_GPL(rcu_barrier); 1082EXPORT_SYMBOL_GPL(rcu_barrier);
979 1083
980/* 1084/*
981 * Because preemptable RCU does not exist, there is no per-CPU 1085 * Because preemptible RCU does not exist, there is no per-CPU
982 * data to initialize. 1086 * data to initialize.
983 */ 1087 */
984static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 1088static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -986,14 +1090,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
986} 1090}
987 1091
988/* 1092/*
989 * Because there is no preemptable RCU, there are no callbacks to move. 1093 * Because there is no preemptible RCU, there are no callbacks to move.
990 */ 1094 */
991static void rcu_preempt_send_cbs_to_orphanage(void) 1095static void rcu_preempt_send_cbs_to_online(void)
992{ 1096{
993} 1097}
994 1098
995/* 1099/*
996 * Because preemptable RCU does not exist, it need not be initialized. 1100 * Because preemptible RCU does not exist, it need not be initialized.
997 */ 1101 */
998static void __init __rcu_init_preempt(void) 1102static void __init __rcu_init_preempt(void)
999{ 1103{
@@ -1001,6 +1105,791 @@ static void __init __rcu_init_preempt(void)
1001 1105
1002#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1106#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1003 1107
1108#ifdef CONFIG_RCU_BOOST
1109
1110#include "rtmutex_common.h"
1111
1112#ifdef CONFIG_RCU_TRACE
1113
1114static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1115{
1116 if (list_empty(&rnp->blkd_tasks))
1117 rnp->n_balk_blkd_tasks++;
1118 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1119 rnp->n_balk_exp_gp_tasks++;
1120 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1121 rnp->n_balk_boost_tasks++;
1122 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1123 rnp->n_balk_notblocked++;
1124 else if (rnp->gp_tasks != NULL &&
1125 ULONG_CMP_LT(jiffies, rnp->boost_time))
1126 rnp->n_balk_notyet++;
1127 else
1128 rnp->n_balk_nos++;
1129}
1130
1131#else /* #ifdef CONFIG_RCU_TRACE */
1132
1133static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1134{
1135}
1136
1137#endif /* #else #ifdef CONFIG_RCU_TRACE */
1138
1139/*
1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1141 * or ->boost_tasks, advancing the pointer to the next task in the
1142 * ->blkd_tasks list.
1143 *
1144 * Note that irqs must be enabled: boosting the task can block.
1145 * Returns 1 if there are more tasks needing to be boosted.
1146 */
1147static int rcu_boost(struct rcu_node *rnp)
1148{
1149 unsigned long flags;
1150 struct rt_mutex mtx;
1151 struct task_struct *t;
1152 struct list_head *tb;
1153
1154 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1155 return 0; /* Nothing left to boost. */
1156
1157 raw_spin_lock_irqsave(&rnp->lock, flags);
1158
1159 /*
1160 * Recheck under the lock: all tasks in need of boosting
1161 * might exit their RCU read-side critical sections on their own.
1162 */
1163 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1164 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1165 return 0;
1166 }
1167
1168 /*
1169 * Preferentially boost tasks blocking expedited grace periods.
1170 * This cannot starve the normal grace periods because a second
1171 * expedited grace period must boost all blocked tasks, including
1172 * those blocking the pre-existing normal grace period.
1173 */
1174 if (rnp->exp_tasks != NULL) {
1175 tb = rnp->exp_tasks;
1176 rnp->n_exp_boosts++;
1177 } else {
1178 tb = rnp->boost_tasks;
1179 rnp->n_normal_boosts++;
1180 }
1181 rnp->n_tasks_boosted++;
1182
1183 /*
1184 * We boost task t by manufacturing an rt_mutex that appears to
1185 * be held by task t. We leave a pointer to that rt_mutex where
1186 * task t can find it, and task t will release the mutex when it
1187 * exits its outermost RCU read-side critical section. Then
1188 * simply acquiring this artificial rt_mutex will boost task
1189 * t's priority. (Thanks to tglx for suggesting this approach!)
1190 *
1191 * Note that task t must acquire rnp->lock to remove itself from
1192 * the ->blkd_tasks list, which it will do from exit() if from
1193 * nowhere else. We therefore are guaranteed that task t will
1194 * stay around at least until we drop rnp->lock. Note that
1195 * rnp->lock also resolves races between our priority boosting
1196 * and task t's exiting its outermost RCU read-side critical
1197 * section.
1198 */
1199 t = container_of(tb, struct task_struct, rcu_node_entry);
1200 rt_mutex_init_proxy_locked(&mtx, t);
1201 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1203 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1206
1207 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1208}
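The boosting step above works purely through priority inheritance: the blocked reader is made to appear to hold an rt_mutex, so when the high-priority booster blocks on that mutex, the reader temporarily runs at the booster's priority until it leaves its read-side critical section and releases the proxy lock. The same mechanism can be sketched in user space with a POSIX priority-inheritance mutex; this is only an analogue of the kernel code, and the SCHED_FIFO priorities that would make the boost visible are omitted because setting them needs RT privileges.

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t pi_lock;

    static void *holder(void *arg)
    {
        pthread_mutex_lock(&pi_lock);
        /* While a higher-priority thread waits on pi_lock, this thread
         * inherits that waiter's priority, the user-space counterpart of
         * boosting the RCU reader through the proxy rt_mutex. */
        sleep(1);
        pthread_mutex_unlock(&pi_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_mutexattr_t attr;
        pthread_t t;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
        pthread_mutex_init(&pi_lock, &attr);

        pthread_create(&t, NULL, holder, NULL);
        usleep(100 * 1000);            /* let the holder take the lock first */
        pthread_mutex_lock(&pi_lock);  /* would boost the holder while we wait */
        pthread_mutex_unlock(&pi_lock);
        pthread_join(t, NULL);
        return 0;
    }

Build with -lpthread; the sketch shows the locking shape only, not the priority arithmetic.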
1209
1210/*
1211 * Timer handler to initiate waking up of boost kthreads that
1212 * have yielded the CPU due to excessive numbers of tasks to
1213 * boost. We wake up the per-rcu_node kthread, which in turn
1214 * will wake up the booster kthread.
1215 */
1216static void rcu_boost_kthread_timer(unsigned long arg)
1217{
1218 invoke_rcu_node_kthread((struct rcu_node *)arg);
1219}
1220
1221/*
1222 * Priority-boosting kthread. One per leaf rcu_node and one for the
1223 * root rcu_node.
1224 */
1225static int rcu_boost_kthread(void *arg)
1226{
1227 struct rcu_node *rnp = (struct rcu_node *)arg;
1228 int spincnt = 0;
1229 int more2boost;
1230
1231 for (;;) {
1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235 more2boost = rcu_boost(rnp);
1236 if (more2boost)
1237 spincnt++;
1238 else
1239 spincnt = 0;
1240 if (spincnt > 10) {
1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1242 spincnt = 0;
1243 }
1244 }
1245 /* NOTREACHED */
1246 return 0;
1247}
1248
1249/*
1250 * Check to see if it is time to start boosting RCU readers that are
1251 * blocking the current grace period, and, if so, tell the per-rcu_node
1252 * kthread to start boosting them. If there is an expedited grace
1253 * period in progress, it is always time to boost.
1254 *
1255 * The caller must hold rnp->lock, which this function releases,
1256 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1257 * so we don't need to worry about it going away.
1258 */
1259static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1260{
1261 struct task_struct *t;
1262
1263 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1264 rnp->n_balk_exp_gp_tasks++;
1265 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1266 return;
1267 }
1268 if (rnp->exp_tasks != NULL ||
1269 (rnp->gp_tasks != NULL &&
1270 rnp->boost_tasks == NULL &&
1271 rnp->qsmask == 0 &&
1272 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1273 if (rnp->exp_tasks == NULL)
1274 rnp->boost_tasks = rnp->gp_tasks;
1275 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1276 t = rnp->boost_kthread_task;
1277 if (t != NULL)
1278 wake_up_process(t);
1279 } else {
1280 rcu_initiate_boost_trace(rnp);
1281 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1282 }
1283}
1284
1285/*
1286 * Wake up the per-CPU kthread to invoke RCU callbacks.
1287 */
1288static void invoke_rcu_callbacks_kthread(void)
1289{
1290 unsigned long flags;
1291
1292 local_irq_save(flags);
1293 __this_cpu_write(rcu_cpu_has_work, 1);
1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1295 local_irq_restore(flags);
1296 return;
1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299 local_irq_restore(flags);
1300}
1301
1302/*
1303 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1304 * held, so no one should be messing with the existence of the boost
1305 * kthread.
1306 */
1307static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1308 cpumask_var_t cm)
1309{
1310 struct task_struct *t;
1311
1312 t = rnp->boost_kthread_task;
1313 if (t != NULL)
1314 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1315}
1316
1317#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1318
1319/*
1320 * Do priority-boost accounting for the start of a new grace period.
1321 */
1322static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1323{
1324 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1325}
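RCU_BOOST_DELAY_JIFFIES converts the CONFIG_RCU_BOOST_DELAY value, given in milliseconds, into jiffies, rounding up so that a small but non-zero delay never truncates to zero ticks. A stand-alone check of that arithmetic, assuming a 250 Hz tick rate and a 500 ms delay (both values are illustrative):

    #include <assert.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    int main(void)
    {
        const unsigned long hz = 250;        /* assumed tick rate */
        const unsigned long delay_ms = 500;  /* stands in for CONFIG_RCU_BOOST_DELAY */

        /* 500 ms at 250 Hz is exactly 125 ticks... */
        assert(DIV_ROUND_UP(delay_ms * hz, 1000) == 125);
        /* ...and even 1 ms rounds up to a full tick instead of zero. */
        assert(DIV_ROUND_UP(1 * hz, 1000) == 1);
        return 0;
    }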
1326
1327/*
1328 * Create an RCU-boost kthread for the specified node if one does not
1329 * already exist. We only create this kthread for preemptible RCU.
1330 * Returns zero if all is well, a negated errno otherwise.
1331 */
1332static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1333 struct rcu_node *rnp,
1334 int rnp_index)
1335{
1336 unsigned long flags;
1337 struct sched_param sp;
1338 struct task_struct *t;
1339
1340 if (&rcu_preempt_state != rsp)
1341 return 0;
1342 rsp->boost = 1;
1343 if (rnp->boost_kthread_task != NULL)
1344 return 0;
1345 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346 "rcub%d", rnp_index);
1347 if (IS_ERR(t))
1348 return PTR_ERR(t);
1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rnp->boost_kthread_task = t;
1351 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352 sp.sched_priority = RCU_KTHREAD_PRIO;
1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355 return 0;
1356}
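The spawn path publishes the kthread under rnp->lock before raising it to SCHED_FIFO and waking it. The user-space equivalent of the priority step is sched_setscheduler(); the sketch below uses an arbitrary priority of 1 and simply reports an error unless it is run with RT privileges.

    #include <errno.h>
    #include <sched.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        struct sched_param sp = { .sched_priority = 1 };  /* illustrative RT priority */

        /* Promote the calling task to SCHED_FIFO, as the kernel does for the
         * boost kthread via sched_setscheduler_nocheck(). */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0)
            fprintf(stderr, "sched_setscheduler: %s (needs RT privileges)\n",
                    strerror(errno));
        return 0;
    }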
1357
1358#ifdef CONFIG_HOTPLUG_CPU
1359
1360/*
1361 * Stop the RCU's per-CPU kthread when its CPU goes offline.
1362 */
1363static void rcu_stop_cpu_kthread(int cpu)
1364{
1365 struct task_struct *t;
1366
1367 /* Stop the CPU's kthread. */
1368 t = per_cpu(rcu_cpu_kthread_task, cpu);
1369 if (t != NULL) {
1370 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1371 kthread_stop(t);
1372 }
1373}
1374
1375#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1376
1377static void rcu_kthread_do_work(void)
1378{
1379 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
1380 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1381 rcu_preempt_do_callbacks();
1382}
1383
1384/*
1385 * Wake up the specified per-rcu_node-structure kthread.
1386 * Because the per-rcu_node kthreads are immortal, we don't need
1387 * to do anything to keep them alive.
1388 */
1389static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1390{
1391 struct task_struct *t;
1392
1393 t = rnp->node_kthread_task;
1394 if (t != NULL)
1395 wake_up_process(t);
1396}
1397
1398/*
1399 * Set the specified CPU's kthread to run RT or not, as specified by
1400 * the to_rt argument. The CPU-hotplug locks are held, so the task
1401 * is not going away.
1402 */
1403static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1404{
1405 int policy;
1406 struct sched_param sp;
1407 struct task_struct *t;
1408
1409 t = per_cpu(rcu_cpu_kthread_task, cpu);
1410 if (t == NULL)
1411 return;
1412 if (to_rt) {
1413 policy = SCHED_FIFO;
1414 sp.sched_priority = RCU_KTHREAD_PRIO;
1415 } else {
1416 policy = SCHED_NORMAL;
1417 sp.sched_priority = 0;
1418 }
1419 sched_setscheduler_nocheck(t, policy, &sp);
1420}
1421
1422/*
1423 * Timer handler to initiate the waking up of per-CPU kthreads that
1424 * have yielded the CPU due to excess numbers of RCU callbacks.
1425 * We wake up the per-rcu_node kthread, which in turn will wake up
1426 * the booster kthread.
1427 */
1428static void rcu_cpu_kthread_timer(unsigned long arg)
1429{
1430 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1431 struct rcu_node *rnp = rdp->mynode;
1432
1433 atomic_or(rdp->grpmask, &rnp->wakemask);
1434 invoke_rcu_node_kthread(rnp);
1435}
1436
1437/*
1438 * Drop to non-real-time priority and yield, but only after posting a
1439 * timer that will cause us to regain our real-time priority if we
1440 * remain preempted. Either way, we restore our real-time priority
1441 * before returning.
1442 */
1443static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{
1445 struct sched_param sp;
1446 struct timer_list yield_timer;
1447
1448 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2);
1450 sp.sched_priority = 0;
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19);
1453 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer);
1457}
1458
1459/*
1460 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1461 * This can happen while the corresponding CPU is either coming online
1462 * or going offline. We cannot wait until the CPU is fully online
1463 * before starting the kthread, because the various notifier functions
1464 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1465 * the corresponding CPU is online.
1466 *
1467 * Return 1 if the kthread needs to stop, 0 otherwise.
1468 *
1469 * Caller must disable bh. This function can momentarily enable it.
1470 */
1471static int rcu_cpu_kthread_should_stop(int cpu)
1472{
1473 while (cpu_is_offline(cpu) ||
1474 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1475 smp_processor_id() != cpu) {
1476 if (kthread_should_stop())
1477 return 1;
1478 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1479 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1480 local_bh_enable();
1481 schedule_timeout_uninterruptible(1);
1482 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1483 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1484 local_bh_disable();
1485 }
1486 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1487 return 0;
1488}
1489
1490/*
1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1492 * earlier RCU softirq.
1493 */
1494static int rcu_cpu_kthread(void *arg)
1495{
1496 int cpu = (int)(long)arg;
1497 unsigned long flags;
1498 int spincnt = 0;
1499 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1500 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502
1503 for (;;) {
1504 *statusp = RCU_KTHREAD_WAITING;
1505 rcu_wait(*workp != 0 || kthread_should_stop());
1506 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable();
1509 break;
1510 }
1511 *statusp = RCU_KTHREAD_RUNNING;
1512 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1513 local_irq_save(flags);
1514 work = *workp;
1515 *workp = 0;
1516 local_irq_restore(flags);
1517 if (work)
1518 rcu_kthread_do_work();
1519 local_bh_enable();
1520 if (*workp != 0)
1521 spincnt++;
1522 else
1523 spincnt = 0;
1524 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING;
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1527 spincnt = 0;
1528 }
1529 }
1530 *statusp = RCU_KTHREAD_STOPPED;
1531 return 0;
1532}
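The loop above snapshots the per-CPU work flag with interrupts disabled, clears it, and only then runs the callbacks with interrupts enabled, so any work posted while callbacks run is noticed on the next pass. A stand-alone rendering of that "snapshot and clear, then work outside the critical section" pattern with a C11 atomic exchange; the flag name mirrors rcu_cpu_has_work, everything else is illustrative.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool has_work;

    /* Producer side: post work, as invoke_rcu_callbacks_kthread() does. */
    static void post_work(void)
    {
        atomic_store(&has_work, true);
    }

    /* Consumer side: one pass of the kthread loop. */
    static bool service_once(void)
    {
        /* Snapshot and clear in one step; the kernel gets the same effect
         * with a plain load/store pair under local_irq_save(). */
        bool work = atomic_exchange(&has_work, false);

        if (work)
            printf("processing callbacks\n");
        return work;
    }

    int main(void)
    {
        post_work();
        service_once();   /* handles the posted work */
        service_once();   /* nothing pending on this pass */
        return 0;
    }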
1533
1534/*
1535 * Spawn a per-CPU kthread, setting up affinity and priority.
1536 * Because the CPU hotplug lock is held, no other CPU will be attempting
1537 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1538 * attempting to access it during boot, but the locking in kthread_bind()
1539 * will enforce sufficient ordering.
1540 *
1541 * Please note that we cannot simply refuse to wake up the per-CPU
1542 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1543 * which can result in softlockup complaints if the task ends up being
1544 * idle for more than a couple of minutes.
1545 *
1546 * However, please note also that we cannot bind the per-CPU kthread to its
1547 * CPU until that CPU is fully online. We also cannot wait until the
1548 * CPU is fully online before we create its per-CPU kthread, as this would
1549 * deadlock the system when CPU notifiers tried waiting for grace
1550 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1551 * is online. If its CPU is not yet fully online, then the code in
1552 * rcu_cpu_kthread() will wait until it is fully online, and then do
1553 * the binding.
1554 */
1555static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1556{
1557 struct sched_param sp;
1558 struct task_struct *t;
1559
1560 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1564 if (IS_ERR(t))
1565 return PTR_ERR(t);
1566 if (cpu_online(cpu))
1567 kthread_bind(t, cpu);
1568 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1569 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1570 sp.sched_priority = RCU_KTHREAD_PRIO;
1571 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1572 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1573 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1574 return 0;
1575}
1576
1577/*
1578 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1579 * kthreads when needed. We ignore requests to wake up kthreads
1580 * for offline CPUs, which is OK because force_quiescent_state()
1581 * takes care of this case.
1582 */
1583static int rcu_node_kthread(void *arg)
1584{
1585 int cpu;
1586 unsigned long flags;
1587 unsigned long mask;
1588 struct rcu_node *rnp = (struct rcu_node *)arg;
1589 struct sched_param sp;
1590 struct task_struct *t;
1591
1592 for (;;) {
1593 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1594 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1595 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1596 raw_spin_lock_irqsave(&rnp->lock, flags);
1597 mask = atomic_xchg(&rnp->wakemask, 0);
1598 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1599 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1600 if ((mask & 0x1) == 0)
1601 continue;
1602 preempt_disable();
1603 t = per_cpu(rcu_cpu_kthread_task, cpu);
1604 if (!cpu_online(cpu) || t == NULL) {
1605 preempt_enable();
1606 continue;
1607 }
1608 per_cpu(rcu_cpu_has_work, cpu) = 1;
1609 sp.sched_priority = RCU_KTHREAD_PRIO;
1610 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1611 preempt_enable();
1612 }
1613 }
1614 /* NOTREACHED */
1615 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1616 return 0;
1617}
1618
1619/*
1620 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1621 * served by the rcu_node in question. The CPU hotplug lock is still
1622 * held, so the value of rnp->qsmaskinit will be stable.
1623 *
1624 * We don't include outgoingcpu in the affinity set; use -1 if there is
1625 * no outgoing CPU. If there are no CPUs left in the affinity set,
1626 * this function allows the kthread to execute on any CPU.
1627 */
1628static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1629{
1630 cpumask_var_t cm;
1631 int cpu;
1632 unsigned long mask = rnp->qsmaskinit;
1633
1634 if (rnp->node_kthread_task == NULL)
1635 return;
1636 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1637 return;
1638 cpumask_clear(cm);
1639 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1640 if ((mask & 0x1) && cpu != outgoingcpu)
1641 cpumask_set_cpu(cpu, cm);
1642 if (cpumask_weight(cm) == 0) {
1643 cpumask_setall(cm);
1644 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1645 cpumask_clear_cpu(cpu, cm);
1646 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1647 }
1648 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1649 rcu_boost_kthread_setaffinity(rnp, cm);
1650 free_cpumask_var(cm);
1651}
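The affinity helper above builds a mask of the node's CPUs minus the one going offline, and widens the mask rather than leave the kthread with an empty CPU set. The same mask construction in user space with cpu_set_t; the grplo/grphi span and the outgoing CPU are made-up values, and the kernel's qsmaskinit filtering and fallback are only noted in a comment.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        cpu_set_t cm;
        int grplo = 0, grphi = 3;   /* illustrative rcu_node CPU span */
        int outgoingcpu = 2;        /* CPU being offlined; -1 means none */
        int cpu;

        CPU_ZERO(&cm);
        for (cpu = grplo; cpu <= grphi; cpu++)
            if (cpu != outgoingcpu)
                CPU_SET(cpu, &cm);

        /* The kernel additionally masks by qsmaskinit and, if nothing is
         * left, widens the set instead of leaving it empty. */
        printf("mask weight = %d\n", CPU_COUNT(&cm));
        return 0;
    }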
1652
1653/*
1654 * Spawn a per-rcu_node kthread, setting priority and affinity.
1655 * Called during boot before online/offline can happen, or, if
1656 * during runtime, with the main CPU-hotplug locks held. So only
1657 * one of these can be executing at a time.
1658 */
1659static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1660 struct rcu_node *rnp)
1661{
1662 unsigned long flags;
1663 int rnp_index = rnp - &rsp->node[0];
1664 struct sched_param sp;
1665 struct task_struct *t;
1666
1667 if (!rcu_scheduler_fully_active ||
1668 rnp->qsmaskinit == 0)
1669 return 0;
1670 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index);
1673 if (IS_ERR(t))
1674 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags);
1676 rnp->node_kthread_task = t;
1677 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1678 sp.sched_priority = 99;
1679 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1680 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1681 }
1682 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1683}
1684
1685/*
1686 * Spawn all kthreads -- called as soon as the scheduler is running.
1687 */
1688static int __init rcu_spawn_kthreads(void)
1689{
1690 int cpu;
1691 struct rcu_node *rnp;
1692
1693 rcu_scheduler_fully_active = 1;
1694 for_each_possible_cpu(cpu) {
1695 per_cpu(rcu_cpu_has_work, cpu) = 0;
1696 if (cpu_online(cpu))
1697 (void)rcu_spawn_one_cpu_kthread(cpu);
1698 }
1699 rnp = rcu_get_root(rcu_state);
1700 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1701 if (NUM_RCU_NODES > 1) {
1702 rcu_for_each_leaf_node(rcu_state, rnp)
1703 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1704 }
1705 return 0;
1706}
1707early_initcall(rcu_spawn_kthreads);
1708
1709static void __cpuinit rcu_prepare_kthreads(int cpu)
1710{
1711 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1712 struct rcu_node *rnp = rdp->mynode;
1713
1714 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1715 if (rcu_scheduler_fully_active) {
1716 (void)rcu_spawn_one_cpu_kthread(cpu);
1717 if (rnp->node_kthread_task == NULL)
1718 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1719 }
1720}
1721
1722#else /* #ifdef CONFIG_RCU_BOOST */
1723
1724static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1725{
1726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1727}
1728
1729static void invoke_rcu_callbacks_kthread(void)
1730{
1731 WARN_ON_ONCE(1);
1732}
1733
1734static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1735{
1736}
1737
1738#ifdef CONFIG_HOTPLUG_CPU
1739
1740static void rcu_stop_cpu_kthread(int cpu)
1741{
1742}
1743
1744#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1745
1746static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1747{
1748}
1749
1750static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1751{
1752}
1753
1754static int __init rcu_scheduler_really_started(void)
1755{
1756 rcu_scheduler_fully_active = 1;
1757 return 0;
1758}
1759early_initcall(rcu_scheduler_really_started);
1760
1761static void __cpuinit rcu_prepare_kthreads(int cpu)
1762{
1763}
1764
1765#endif /* #else #ifdef CONFIG_RCU_BOOST */
1766
1767#ifndef CONFIG_SMP
1768
1769void synchronize_sched_expedited(void)
1770{
1771 cond_resched();
1772}
1773EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1774
1775#else /* #ifndef CONFIG_SMP */
1776
1777static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1778static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1779
1780static int synchronize_sched_expedited_cpu_stop(void *data)
1781{
1782 /*
1783 * There must be a full memory barrier on each affected CPU
1784 * between the time that try_stop_cpus() is called and the
1785 * time that it returns.
1786 *
1787 * In the current initial implementation of cpu_stop, the
1788 * above condition is already met when the control reaches
1789 * this point and the following smp_mb() is not strictly
1790 * necessary. Do smp_mb() anyway for documentation and
1791 * robustness against future implementation changes.
1792 */
1793 smp_mb(); /* See above comment block. */
1794 return 0;
1795}
1796
1797/*
1798 * Wait for an rcu-sched grace period to elapse, but use a "big hammer"
1799 * approach to force the grace period to end quickly. This consumes
1800 * significant time on all CPUs, and is thus not recommended for
1801 * any sort of common-case code.
1802 *
1803 * Note that it is illegal to call this function while holding any
1804 * lock that is acquired by a CPU-hotplug notifier. Failing to
1805 * observe this restriction will result in deadlock.
1806 *
1807 * This implementation can be thought of as an application of ticket
1808 * locking to RCU, with sync_sched_expedited_started and
1809 * sync_sched_expedited_done taking on the roles of the halves
1810 * of the ticket-lock word. Each task atomically increments
1811 * sync_sched_expedited_started upon entry, snapshotting the old value,
1812 * then attempts to stop all the CPUs. If this succeeds, then each
1813 * CPU will have executed a context switch, resulting in an RCU-sched
1814 * grace period. We are then done, so we use atomic_cmpxchg() to
1815 * update sync_sched_expedited_done to match our snapshot -- but
1816 * only if someone else has not already advanced past our snapshot.
1817 *
1818 * On the other hand, if try_stop_cpus() fails, we check the value
1819 * of sync_sched_expedited_done. If it has advanced past our
1820 * initial snapshot, then someone else must have forced a grace period
1821 * some time after we took our snapshot. In this case, our work is
1822 * done for us, and we can simply return. Otherwise, we try again,
1823 * but keep our initial snapshot for purposes of checking for someone
1824 * doing our work for us.
1825 *
1826 * If we fail too many times in a row, we fall back to synchronize_sched().
1827 */
1828void synchronize_sched_expedited(void)
1829{
1830 int firstsnap, s, snap, trycount = 0;
1831
1832 /* Note that atomic_inc_return() implies full memory barrier. */
1833 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1834 get_online_cpus();
1835
1836 /*
1837 * Each pass through the following loop attempts to force a
1838 * context switch on each CPU.
1839 */
1840 while (try_stop_cpus(cpu_online_mask,
1841 synchronize_sched_expedited_cpu_stop,
1842 NULL) == -EAGAIN) {
1843 put_online_cpus();
1844
1845 /* No joy, try again later. Or just synchronize_sched(). */
1846 if (trycount++ < 10)
1847 udelay(trycount * num_online_cpus());
1848 else {
1849 synchronize_sched();
1850 return;
1851 }
1852
1853 /* Check to see if someone else did our work for us. */
1854 s = atomic_read(&sync_sched_expedited_done);
1855 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1856 smp_mb(); /* ensure test happens before caller kfree */
1857 return;
1858 }
1859
1860 /*
1861 * Refetching sync_sched_expedited_started allows later
1862 * callers to piggyback on our grace period. We subtract
1863 * 1 to get the same token that the last incrementer got.
1864 * We retry after they started, so our grace period works
1865 * for them, and they started after our first try, so their
1866 * grace period works for us.
1867 */
1868 get_online_cpus();
1869 snap = atomic_read(&sync_sched_expedited_started) - 1;
1870 smp_mb(); /* ensure read is before try_stop_cpus(). */
1871 }
1872
1873 /*
1874 * Everyone up to our most recent fetch is covered by our grace
1875 * period. Update the counter, but only if our work is still
1876 * relevant -- which it won't be if someone who started later
1877 * than we did beat us to the punch.
1878 */
1879 do {
1880 s = atomic_read(&sync_sched_expedited_done);
1881 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1882 smp_mb(); /* ensure test happens before caller kfree */
1883 break;
1884 }
1885 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1886
1887 put_online_cpus();
1888}
1889EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
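The comment block above describes the expedited grace period as a ticket lock built from two counters. Stripped of try_stop_cpus() and the CPU-hotplug locking, the counter protocol alone can be modelled in stand-alone C11; the variable names echo the kernel's, but this is a sketch of the idea rather than the kernel algorithm itself.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int expedited_started;
    static atomic_int expedited_done;

    /* Wrap-safe "a >= b" on free-running counters, in the spirit of UINT_CMP_GE(). */
    static bool counter_ge(int a, int b)
    {
        return ((unsigned)a - (unsigned)b) < (1u << 31);
    }

    static void expedited(void)
    {
        int firstsnap = atomic_fetch_add(&expedited_started, 1) + 1;
        int snap, s;

        /* In the kernel this check runs between failed try_stop_cpus() calls:
         * if "done" already passed our ticket, someone else's grace period
         * covers us and we can return immediately. */
        s = atomic_load(&expedited_done);
        if (counter_ge(s, firstsnap)) {
            printf("piggybacked on an earlier caller\n");
            return;
        }

        /* Grace period "forced" here; refetch so later tickets are covered too. */
        snap = atomic_load(&expedited_started);

        /* Advance "done" to our snapshot unless a later caller got there first. */
        s = atomic_load(&expedited_done);
        while (!counter_ge(s, snap) &&
               !atomic_compare_exchange_weak(&expedited_done, &s, snap))
            ;   /* CAS failure reloads s; recheck and retry */
        printf("done counter is now %d\n", atomic_load(&expedited_done));
    }

    int main(void)
    {
        expedited();   /* advances done to 1 */
        expedited();   /* advances done to 2 */
        return 0;
    }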
1890
1891#endif /* #else #ifndef CONFIG_SMP */
1892
1004#if !defined(CONFIG_RCU_FAST_NO_HZ) 1893#if !defined(CONFIG_RCU_FAST_NO_HZ)
1005 1894
1006/* 1895/*
@@ -1047,14 +1936,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1047 * 1936 *
1048 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1937 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1049 * disabled, we do one pass of force_quiescent_state(), then do a 1938 * disabled, we do one pass of force_quiescent_state(), then do a
1050 * raise_softirq() to cause rcu_process_callbacks() to be invoked later. 1939 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1051 * The per-cpu rcu_dyntick_drain variable controls the sequencing. 1940 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1052 */ 1941 */
1053int rcu_needs_cpu(int cpu) 1942int rcu_needs_cpu(int cpu)
1054{ 1943{
1055 int c = 0; 1944 int c = 0;
1056 int snap; 1945 int snap;
1057 int snap_nmi;
1058 int thatcpu; 1946 int thatcpu;
1059 1947
1060 /* Check for being in the holdoff period. */ 1948 /* Check for being in the holdoff period. */
@@ -1065,10 +1953,10 @@ int rcu_needs_cpu(int cpu)
1065 for_each_online_cpu(thatcpu) { 1953 for_each_online_cpu(thatcpu) {
1066 if (thatcpu == cpu) 1954 if (thatcpu == cpu)
1067 continue; 1955 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks; 1956 snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; 1957 thatcpu).dynticks);
1070 smp_mb(); /* Order sampling of snap with end of grace period. */ 1958 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { 1959 if ((snap & 0x1) != 0) {
1072 per_cpu(rcu_dyntick_drain, cpu) = 0; 1960 per_cpu(rcu_dyntick_drain, cpu) = 0;
1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1961 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1074 return rcu_needs_cpu_quick_check(cpu); 1962 return rcu_needs_cpu_quick_check(cpu);
@@ -1099,7 +1987,7 @@ int rcu_needs_cpu(int cpu)
1099 1987
1100 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1988 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1101 if (c) 1989 if (c)
1102 raise_softirq(RCU_SOFTIRQ); 1990 invoke_rcu_core();
1103 return c; 1991 return c;
1104} 1992}
1105 1993
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738e..4e144876dc68 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,22 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#ifdef CONFIG_RCU_BOOST
50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
56static char convert_kthread_status(unsigned int kthread_status)
57{
58 if (kthread_status > RCU_KTHREAD_MAX)
59 return '?';
60 return "SRWOY"[kthread_status];
61}
62
63#endif /* #ifdef CONFIG_RCU_BOOST */
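convert_kthread_status() above, like the ".N"[...] expressions further down, relies on the fact that a string literal is an array and can be indexed directly, turning an enum or a boolean into a single status character. The idiom in isolation, with made-up states:

    #include <stdio.h>

    int main(void)
    {
        enum { IDLE, BUSY, DONE } state = BUSY;   /* illustrative states */
        int has_work = 1;

        /* "IBD"[state] maps each enum value to one status character. */
        printf("state = %c\n", "IBD"[state]);

        /* The two-character form prints '.' for false and a letter for true,
         * just like the ".N"/".R"/".W"/".D" flags in the trace output. */
        printf("work  = %c\n", ".W"[!!has_work]);
        return 0;
    }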
64
49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 65static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 66{
51 if (!rdp->beenonline) 67 if (!rdp->beenonline)
@@ -57,14 +73,33 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
57 rdp->passed_quiesc, rdp->passed_quiesc_completed, 73 rdp->passed_quiesc, rdp->passed_quiesc_completed,
58 rdp->qs_pending); 74 rdp->qs_pending);
59#ifdef CONFIG_NO_HZ 75#ifdef CONFIG_NO_HZ
60 seq_printf(m, " dt=%d/%d dn=%d df=%lu", 76 seq_printf(m, " dt=%d/%d/%d df=%lu",
61 rdp->dynticks->dynticks, 77 atomic_read(&rdp->dynticks->dynticks),
62 rdp->dynticks->dynticks_nesting, 78 rdp->dynticks->dynticks_nesting,
63 rdp->dynticks->dynticks_nmi, 79 rdp->dynticks->dynticks_nmi_nesting,
64 rdp->dynticks_fqs); 80 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 81#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 82 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); 83 seq_printf(m, " ql=%ld qs=%c%c%c%c",
84 rdp->qlen,
85 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
86 rdp->nxttail[RCU_NEXT_TAIL]],
87 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
88 rdp->nxttail[RCU_NEXT_READY_TAIL]],
89 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
90 rdp->nxttail[RCU_WAIT_TAIL]],
91 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
92#ifdef CONFIG_RCU_BOOST
93 seq_printf(m, " kt=%d/%c/%d ktl=%x",
94 per_cpu(rcu_cpu_has_work, rdp->cpu),
95 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
96 rdp->cpu)),
97 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
98 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
99#endif /* #ifdef CONFIG_RCU_BOOST */
100 seq_printf(m, " b=%ld", rdp->blimit);
101 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
102 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
68} 103}
69 104
70#define PRINT_RCU_DATA(name, func, m) \ 105#define PRINT_RCU_DATA(name, func, m) \
@@ -113,22 +148,42 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
113 rdp->qs_pending); 148 rdp->qs_pending);
114#ifdef CONFIG_NO_HZ 149#ifdef CONFIG_NO_HZ
115 seq_printf(m, ",%d,%d,%d,%lu", 150 seq_printf(m, ",%d,%d,%d,%lu",
116 rdp->dynticks->dynticks, 151 atomic_read(&rdp->dynticks->dynticks),
117 rdp->dynticks->dynticks_nesting, 152 rdp->dynticks->dynticks_nesting,
118 rdp->dynticks->dynticks_nmi, 153 rdp->dynticks->dynticks_nmi_nesting,
119 rdp->dynticks_fqs); 154 rdp->dynticks_fqs);
120#endif /* #ifdef CONFIG_NO_HZ */ 155#endif /* #ifdef CONFIG_NO_HZ */
121 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 156 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
122 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); 157 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
158 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
159 rdp->nxttail[RCU_NEXT_TAIL]],
160 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
161 rdp->nxttail[RCU_NEXT_READY_TAIL]],
162 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
163 rdp->nxttail[RCU_WAIT_TAIL]],
164 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
165#ifdef CONFIG_RCU_BOOST
166 seq_printf(m, ",%d,\"%c\"",
167 per_cpu(rcu_cpu_has_work, rdp->cpu),
168 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
169 rdp->cpu)));
170#endif /* #ifdef CONFIG_RCU_BOOST */
171 seq_printf(m, ",%ld", rdp->blimit);
172 seq_printf(m, ",%lu,%lu,%lu\n",
173 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
123} 174}
124 175
125static int show_rcudata_csv(struct seq_file *m, void *unused) 176static int show_rcudata_csv(struct seq_file *m, void *unused)
126{ 177{
127 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
128#ifdef CONFIG_NO_HZ 179#ifdef CONFIG_NO_HZ
129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
130#endif /* #ifdef CONFIG_NO_HZ */ 181#endif /* #ifdef CONFIG_NO_HZ */
131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 182 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
183#ifdef CONFIG_RCU_BOOST
184 seq_puts(m, "\"kt\",\"ktl\"");
185#endif /* #ifdef CONFIG_RCU_BOOST */
186 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
132#ifdef CONFIG_TREE_PREEMPT_RCU 187#ifdef CONFIG_TREE_PREEMPT_RCU
133 seq_puts(m, "\"rcu_preempt:\"\n"); 188 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 189 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -153,34 +208,97 @@ static const struct file_operations rcudata_csv_fops = {
153 .release = single_release, 208 .release = single_release,
154}; 209};
155 210
211#ifdef CONFIG_RCU_BOOST
212
213static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
214{
215 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
216 "j=%04x bt=%04x\n",
217 rnp->grplo, rnp->grphi,
218 "T."[list_empty(&rnp->blkd_tasks)],
219 "N."[!rnp->gp_tasks],
220 "E."[!rnp->exp_tasks],
221 "B."[!rnp->boost_tasks],
222 convert_kthread_status(rnp->boost_kthread_status),
223 rnp->n_tasks_boosted, rnp->n_exp_boosts,
224 rnp->n_normal_boosts,
225 (int)(jiffies & 0xffff),
226 (int)(rnp->boost_time & 0xffff));
227 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
228 " balk",
229 rnp->n_balk_blkd_tasks,
230 rnp->n_balk_exp_gp_tasks,
231 rnp->n_balk_boost_tasks,
232 rnp->n_balk_notblocked,
233 rnp->n_balk_notyet,
234 rnp->n_balk_nos);
235}
236
237static int show_rcu_node_boost(struct seq_file *m, void *unused)
238{
239 struct rcu_node *rnp;
240
241 rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
242 print_one_rcu_node_boost(m, rnp);
243 return 0;
244}
245
246static int rcu_node_boost_open(struct inode *inode, struct file *file)
247{
248 return single_open(file, show_rcu_node_boost, NULL);
249}
250
251static const struct file_operations rcu_node_boost_fops = {
252 .owner = THIS_MODULE,
253 .open = rcu_node_boost_open,
254 .read = seq_read,
255 .llseek = seq_lseek,
256 .release = single_release,
257};
258
259/*
260 * Create the rcuboost debugfs entry. Standard error return.
261 */
262static int rcu_boost_trace_create_file(struct dentry *rcudir)
263{
264 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
265 &rcu_node_boost_fops);
266}
267
268#else /* #ifdef CONFIG_RCU_BOOST */
269
270static int rcu_boost_trace_create_file(struct dentry *rcudir)
271{
272 return 0; /* There cannot be an error if we didn't create it! */
273}
274
275#endif /* #else #ifdef CONFIG_RCU_BOOST */
276
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 277static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 278{
158 unsigned long gpnum; 279 unsigned long gpnum;
159 int level = 0; 280 int level = 0;
160 int phase;
161 struct rcu_node *rnp; 281 struct rcu_node *rnp;
162 282
163 gpnum = rsp->gpnum; 283 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 284 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 285 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
166 rsp->completed, gpnum, rsp->signaled, 286 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 287 (long)(rsp->jiffies_force_qs - jiffies),
168 (int)(jiffies & 0xffff), 288 (int)(jiffies & 0xffff),
169 rsp->n_force_qs, rsp->n_force_qs_ngp, 289 rsp->n_force_qs, rsp->n_force_qs_ngp,
170 rsp->n_force_qs - rsp->n_force_qs_ngp, 290 rsp->n_force_qs - rsp->n_force_qs_ngp,
171 rsp->n_force_qs_lh, rsp->orphan_qlen); 291 rsp->n_force_qs_lh);
172 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 292 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
173 if (rnp->level != level) { 293 if (rnp->level != level) {
174 seq_puts(m, "\n"); 294 seq_puts(m, "\n");
175 level = rnp->level; 295 level = rnp->level;
176 } 296 }
177 phase = gpnum & 0x1; 297 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
178 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
179 rnp->qsmask, rnp->qsmaskinit, 298 rnp->qsmask, rnp->qsmaskinit,
180 "T."[list_empty(&rnp->blocked_tasks[phase])], 299 ".G"[rnp->gp_tasks != NULL],
181 "E."[list_empty(&rnp->blocked_tasks[phase + 2])], 300 ".E"[rnp->exp_tasks != NULL],
182 "T."[list_empty(&rnp->blocked_tasks[!phase])], 301 ".T"[!list_empty(&rnp->blkd_tasks)],
183 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
184 rnp->grplo, rnp->grphi, rnp->grpnum); 302 rnp->grplo, rnp->grphi, rnp->grpnum);
185 } 303 }
186 seq_puts(m, "\n"); 304 seq_puts(m, "\n");
@@ -212,16 +330,35 @@ static const struct file_operations rcuhier_fops = {
212 .release = single_release, 330 .release = single_release,
213}; 331};
214 332
333static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
334{
335 unsigned long flags;
336 unsigned long completed;
337 unsigned long gpnum;
338 unsigned long gpage;
339 unsigned long gpmax;
340 struct rcu_node *rnp = &rsp->node[0];
341
342 raw_spin_lock_irqsave(&rnp->lock, flags);
343 completed = rsp->completed;
344 gpnum = rsp->gpnum;
345 if (rsp->completed == rsp->gpnum)
346 gpage = 0;
347 else
348 gpage = jiffies - rsp->gp_start;
349 gpmax = rsp->gp_max;
350 raw_spin_unlock_irqrestore(&rnp->lock, flags);
351 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
352 rsp->name, completed, gpnum, gpage, gpmax);
353}
354
215static int show_rcugp(struct seq_file *m, void *unused) 355static int show_rcugp(struct seq_file *m, void *unused)
216{ 356{
217#ifdef CONFIG_TREE_PREEMPT_RCU 357#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", 358 show_one_rcugp(m, &rcu_preempt_state);
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 359#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", 360 show_one_rcugp(m, &rcu_sched_state);
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 361 show_one_rcugp(m, &rcu_bh_state);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 362 return 0;
226} 363}
227 364
@@ -262,7 +399,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
262 struct rcu_data *rdp; 399 struct rcu_data *rdp;
263 400
264 for_each_possible_cpu(cpu) { 401 for_each_possible_cpu(cpu) {
265 rdp = rsp->rda[cpu]; 402 rdp = per_cpu_ptr(rsp->rda, cpu);
266 if (rdp->beenonline) 403 if (rdp->beenonline)
267 print_one_rcu_pending(m, rdp); 404 print_one_rcu_pending(m, rdp);
268 } 405 }
@@ -294,9 +431,32 @@ static const struct file_operations rcu_pending_fops = {
294 .release = single_release, 431 .release = single_release,
295}; 432};
296 433
434static int show_rcutorture(struct seq_file *m, void *unused)
435{
436 seq_printf(m, "rcutorture test sequence: %lu %s\n",
437 rcutorture_testseq >> 1,
438 (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
439 seq_printf(m, "rcutorture update version number: %lu\n",
440 rcutorture_vernum);
441 return 0;
442}
443
444static int rcutorture_open(struct inode *inode, struct file *file)
445{
446 return single_open(file, show_rcutorture, NULL);
447}
448
449static const struct file_operations rcutorture_fops = {
450 .owner = THIS_MODULE,
451 .open = rcutorture_open,
452 .read = seq_read,
453 .llseek = seq_lseek,
454 .release = single_release,
455};
456
297static struct dentry *rcudir; 457static struct dentry *rcudir;
298 458
299static int __init rcuclassic_trace_init(void) 459static int __init rcutree_trace_init(void)
300{ 460{
301 struct dentry *retval; 461 struct dentry *retval;
302 462
@@ -314,6 +474,9 @@ static int __init rcuclassic_trace_init(void)
314 if (!retval) 474 if (!retval)
315 goto free_out; 475 goto free_out;
316 476
477 if (rcu_boost_trace_create_file(rcudir))
478 goto free_out;
479
317 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 480 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
318 if (!retval) 481 if (!retval)
319 goto free_out; 482 goto free_out;
@@ -327,20 +490,25 @@ static int __init rcuclassic_trace_init(void)
327 NULL, &rcu_pending_fops); 490 NULL, &rcu_pending_fops);
328 if (!retval) 491 if (!retval)
329 goto free_out; 492 goto free_out;
493
494 retval = debugfs_create_file("rcutorture", 0444, rcudir,
495 NULL, &rcutorture_fops);
496 if (!retval)
497 goto free_out;
330 return 0; 498 return 0;
331free_out: 499free_out:
332 debugfs_remove_recursive(rcudir); 500 debugfs_remove_recursive(rcudir);
333 return 1; 501 return 1;
334} 502}
335 503
336static void __exit rcuclassic_trace_cleanup(void) 504static void __exit rcutree_trace_cleanup(void)
337{ 505{
338 debugfs_remove_recursive(rcudir); 506 debugfs_remove_recursive(rcudir);
339} 507}
340 508
341 509
342module_init(rcuclassic_trace_init); 510module_init(rcutree_trace_init);
343module_exit(rcuclassic_trace_cleanup); 511module_exit(rcutree_trace_cleanup);
344 512
345MODULE_AUTHOR("Paul E. McKenney"); 513MODULE_AUTHOR("Paul E. McKenney");
346MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); 514MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/relay.c b/kernel/relay.c
index c7cf397fb929..859ea5a9605f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
70 */ 70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages) 71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{ 72{
73 struct page **array; 73 const size_t pa_size = n_pages * sizeof(struct page *);
74 size_t pa_size = n_pages * sizeof(struct page *); 74 if (pa_size > PAGE_SIZE)
75 75 return vzalloc(pa_size);
76 if (pa_size > PAGE_SIZE) { 76 return kzalloc(pa_size, GFP_KERNEL);
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84} 77}
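The rewritten relay_alloc_page_array() picks its allocator by size: arrays that fit in a page come from the slab via kzalloc(), larger ones from vmalloc space via vzalloc(), and both paths now hand back zeroed memory. A user-space analogue of that size-based split, with 4096 standing in for PAGE_SIZE and an anonymous mmap() playing the role of vmalloc:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>

    #define PAGE_SIZE 4096   /* assumed page size, for illustration only */

    /* Zeroed array of n pointers: heap for small arrays, anonymous mmap
     * (already zero-filled) for large ones.  The caller must free with the
     * matching call, just as the kernel frees this array differently
     * depending on which allocator produced it. */
    static void **alloc_ptr_array(size_t n)
    {
        size_t sz = n * sizeof(void *);

        if (sz > PAGE_SIZE) {
            void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            return p == MAP_FAILED ? NULL : p;
        }
        return calloc(n, sizeof(void *));
    }

    int main(void)
    {
        void **small = alloc_ptr_array(16);      /* calloc() path */
        void **large = alloc_ptr_array(4096);    /* mmap() path */

        printf("small=%p large=%p\n", (void *)small, (void *)large);
        free(small);
        if (large)
            munmap(large, 4096 * sizeof(void *));
        return 0;
    }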
85 78
86/* 79/*
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
126 pos, buf, s - buf); 126 pos, buf, s - buf);
127} 127}
128 128
129#if BITS_PER_LONG == 32
130u64 res_counter_read_u64(struct res_counter *counter, int member)
131{
132 unsigned long flags;
133 u64 ret;
134
135 spin_lock_irqsave(&counter->lock, flags);
136 ret = *res_counter_member(counter, member);
137 spin_unlock_irqrestore(&counter->lock, flags);
138
139 return ret;
140}
141#else
129u64 res_counter_read_u64(struct res_counter *counter, int member) 142u64 res_counter_read_u64(struct res_counter *counter, int member)
130{ 143{
131 return *res_counter_member(counter, member); 144 return *res_counter_member(counter, member);
132} 145}
146#endif
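The new 32-bit branch exists because a 64-bit counter is loaded in two halves on a 32-bit machine, so a lockless read can observe a torn value; taking counter->lock for the read makes it atomic with respect to concurrent updates. The same guard, sketched with a pthread mutex around a uint64_t:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct counter {
        pthread_mutex_t lock;
        uint64_t usage;
    };

    /* On a 32-bit target a plain load of c->usage may tear; reading under
     * the same lock that writers hold rules that out, as the 32-bit
     * res_counter_read_u64() does with counter->lock. */
    static uint64_t counter_read(struct counter *c)
    {
        uint64_t v;

        pthread_mutex_lock(&c->lock);
        v = c->usage;
        pthread_mutex_unlock(&c->lock);
        return v;
    }

    int main(void)
    {
        struct counter c = { PTHREAD_MUTEX_INITIALIZER, 0x100000000ULL };

        printf("usage = %llu\n", (unsigned long long)counter_read(&c));
        return 0;
    }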
133 147
134int res_counter_memparse_write_strategy(const char *buf, 148int res_counter_memparse_write_strategy(const char *buf,
135 unsigned long long *res) 149 unsigned long long *res)
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..3ff40178dce7 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,6 +38,14 @@ struct resource iomem_resource = {
38}; 38};
39EXPORT_SYMBOL(iomem_resource); 39EXPORT_SYMBOL(iomem_resource);
40 40
41/* constraints to be met while allocating resources */
42struct resource_constraint {
43 resource_size_t min, max, align;
44 resource_size_t (*alignf)(void *, const struct resource *,
45 resource_size_t, resource_size_t);
46 void *alignf_data;
47};
48
41static DEFINE_RWLOCK(resource_lock); 49static DEFINE_RWLOCK(resource_lock);
42 50
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 51static void *r_next(struct seq_file *m, void *v, loff_t *pos)
@@ -357,57 +365,148 @@ int __weak page_is_ram(unsigned long pfn)
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 365 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358} 366}
359 367
368void __weak arch_remove_reservations(struct resource *avail)
369{
370}
371
372static resource_size_t simple_align_resource(void *data,
373 const struct resource *avail,
374 resource_size_t size,
375 resource_size_t align)
376{
377 return avail->start;
378}
379
380static void resource_clip(struct resource *res, resource_size_t min,
381 resource_size_t max)
382{
383 if (res->start < min)
384 res->start = min;
385 if (res->end > max)
386 res->end = max;
387}
388
389static bool resource_contains(struct resource *res1, struct resource *res2)
390{
391 return res1->start <= res2->start && res1->end >= res2->end;
392}
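resource_clip() trims a candidate window to the caller's [min, max] constraint, and resource_contains() then verifies that the aligned allocation still fits in what remains. The two helpers restated on plain integer ranges, with arbitrary example values:

    #include <assert.h>
    #include <stdint.h>

    struct range { uint64_t start, end; };   /* inclusive, like struct resource */

    static void range_clip(struct range *r, uint64_t min, uint64_t max)
    {
        if (r->start < min)
            r->start = min;
        if (r->end > max)
            r->end = max;
    }

    static int range_contains(const struct range *a, const struct range *b)
    {
        return a->start <= b->start && a->end >= b->end;
    }

    int main(void)
    {
        struct range avail = { 0x0000, 0xffff };
        struct range alloc = { 0x2000, 0x2fff };

        range_clip(&avail, 0x1000, 0x7fff);       /* avail is now [0x1000, 0x7fff] */
        assert(avail.start == 0x1000 && avail.end == 0x7fff);
        assert(range_contains(&avail, &alloc));   /* the allocation still fits */
        return 0;
    }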
393
360/* 394/*
361 * Find empty slot in the resource tree given range and alignment. 395 * Find empty slot in the resource tree with the given range and
396 * alignment constraints
362 */ 397 */
363static int find_resource(struct resource *root, struct resource *new, 398static int __find_resource(struct resource *root, struct resource *old,
364 resource_size_t size, resource_size_t min, 399 struct resource *new,
365 resource_size_t max, resource_size_t align, 400 resource_size_t size,
366 resource_size_t (*alignf)(void *, 401 struct resource_constraint *constraint)
367 const struct resource *,
368 resource_size_t,
369 resource_size_t),
370 void *alignf_data)
371{ 402{
372 struct resource *this = root->child; 403 struct resource *this = root->child;
373 struct resource tmp = *new; 404 struct resource tmp = *new, avail, alloc;
374 405
406 tmp.flags = new->flags;
375 tmp.start = root->start; 407 tmp.start = root->start;
376 /* 408 /*
377 * Skip past an allocated resource that starts at 0, since the assignment 409 * Skip past an allocated resource that starts at 0, since the assignment
378 * of this->start - 1 to tmp->end below would cause an underflow. 410 * of this->start - 1 to tmp->end below would cause an underflow.
379 */ 411 */
380 if (this && this->start == 0) { 412 if (this && this->start == root->start) {
381 tmp.start = this->end + 1; 413 tmp.start = (this == old) ? old->start : this->end + 1;
382 this = this->sibling; 414 this = this->sibling;
383 } 415 }
384 for(;;) { 416 for(;;) {
385 if (this) 417 if (this)
386 tmp.end = this->start - 1; 418 tmp.end = (this == old) ? this->end : this->start - 1;
387 else 419 else
388 tmp.end = root->end; 420 tmp.end = root->end;
389 if (tmp.start < min) 421
390 tmp.start = min; 422 resource_clip(&tmp, constraint->min, constraint->max);
391 if (tmp.end > max) 423 arch_remove_reservations(&tmp);
392 tmp.end = max; 424
393 tmp.start = ALIGN(tmp.start, align); 425 /* Check for overflow after ALIGN() */
394 if (alignf) 426 avail = *new;
395 tmp.start = alignf(alignf_data, &tmp, size, align); 427 avail.start = ALIGN(tmp.start, constraint->align);
396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 428 avail.end = tmp.end;
397 new->start = tmp.start; 429 if (avail.start >= tmp.start) {
398 new->end = tmp.start + size - 1; 430 alloc.start = constraint->alignf(constraint->alignf_data, &avail,
399 return 0; 431 size, constraint->align);
432 alloc.end = alloc.start + size - 1;
433 if (resource_contains(&avail, &alloc)) {
434 new->start = alloc.start;
435 new->end = alloc.end;
436 return 0;
437 }
400 } 438 }
401 if (!this) 439 if (!this)
402 break; 440 break;
403 tmp.start = this->end + 1; 441 if (this != old)
442 tmp.start = this->end + 1;
404 this = this->sibling; 443 this = this->sibling;
405 } 444 }
406 return -EBUSY; 445 return -EBUSY;
407} 446}
408 447
448/*
449 * Find empty slot in the resource tree given range and alignment.
450 */
451static int find_resource(struct resource *root, struct resource *new,
452 resource_size_t size,
453 struct resource_constraint *constraint)
454{
455 return __find_resource(root, NULL, new, size, constraint);
456}
457
458/**
459 * reallocate_resource - allocate a slot in the resource tree given range & alignment.
460 * The resource will be relocated if it cannot be reallocated with the
461 * new size at its current location.
462 *
463 * @root: root resource descriptor
464 * @old: resource descriptor desired by caller
465 * @newsize: new size of the resource descriptor
466 * @constraint: the size and alignment constraints to be met.
467 */
468int reallocate_resource(struct resource *root, struct resource *old,
469 resource_size_t newsize,
470 struct resource_constraint *constraint)
471{
472 int err=0;
473 struct resource new = *old;
474 struct resource *conflict;
475
476 write_lock(&resource_lock);
477
478 if ((err = __find_resource(root, old, &new, newsize, constraint)))
479 goto out;
480
481 if (resource_contains(&new, old)) {
482 old->start = new.start;
483 old->end = new.end;
484 goto out;
485 }
486
487 if (old->child) {
488 err = -EBUSY;
489 goto out;
490 }
491
492 if (resource_contains(old, &new)) {
493 old->start = new.start;
494 old->end = new.end;
495 } else {
496 __release_resource(old);
497 *old = new;
498 conflict = __request_resource(root, old);
499 BUG_ON(conflict);
500 }
501out:
502 write_unlock(&resource_lock);
503 return err;
504}
505
506
409/** 507/**
410 * allocate_resource - allocate empty slot in the resource tree given range & alignment 508 * allocate_resource - allocate empty slot in the resource tree given range & alignment.
509 * The resource will be reallocated with a new size if it was already allocated
411 * @root: root resource descriptor 510 * @root: root resource descriptor
412 * @new: resource descriptor desired by caller 511 * @new: resource descriptor desired by caller
413 * @size: requested resource region size 512 * @size: requested resource region size
@@ -427,9 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new,
427 void *alignf_data) 526 void *alignf_data)
428{ 527{
429 int err; 528 int err;
529 struct resource_constraint constraint;
530
531 if (!alignf)
532 alignf = simple_align_resource;
533
534 constraint.min = min;
535 constraint.max = max;
536 constraint.align = align;
537 constraint.alignf = alignf;
538 constraint.alignf_data = alignf_data;
539
540 if ( new->parent ) {
541 /* resource is already allocated, try reallocating with
542 the new constraints */
543 return reallocate_resource(root, new, size, &constraint);
544 }
430 545
431 write_lock(&resource_lock); 546 write_lock(&resource_lock);
432 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 547 err = find_resource(root, new, size, &constraint);
433 if (err >= 0 && __request_resource(root, new)) 548 if (err >= 0 && __request_resource(root, new))
434 err = -EBUSY; 549 err = -EBUSY;
435 write_unlock(&resource_lock); 550 write_unlock(&resource_lock);
@@ -453,6 +568,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
453 568
454 if (first == parent) 569 if (first == parent)
455 return first; 570 return first;
571 if (WARN_ON(first == new)) /* duplicated insertion */
572 return first;
456 573
457 if ((first->start > new->start) || (first->end < new->end)) 574 if ((first->start > new->start) || (first->end < new->end))
458 break; 575 break;
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c8..3c7cbc2c33be 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
215 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 TRACE_WARN_ON(waiter->task);
219 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
220} 219}
221 220
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index a56f629b057a..5c9ccd380966 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
14#include <linux/sysdev.h> 13#include <linux/sysdev.h>
15#include <linux/timer.h> 14#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
27 int opcode; 26 int opcode;
28 int opdata; 27 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event; 29 int event;
32 struct sys_device sysdev; 30 struct sys_device sysdev;
33}; 31};
@@ -46,9 +44,8 @@ enum test_opcodes {
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ 44 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ 45 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ 46 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */ 47 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ 48 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ 49 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */ 50 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54}; 51};
@@ -74,11 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
74 td->mutexes[i] = 0; 71 td->mutexes[i] = 0;
75 } 72 }
76 } 73 }
77
78 if (!lockwakeup && td->bkl == 4) {
79 unlock_kernel();
80 td->bkl = 0;
81 }
82 return 0; 74 return 0;
83 75
84 case RTTEST_RESETEVENT: 76 case RTTEST_RESETEVENT:
@@ -129,21 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
129 td->mutexes[id] = 0; 121 td->mutexes[id] = 0;
130 return 0; 122 return 0;
131 123
132 case RTTEST_LOCKBKL:
133 if (td->bkl)
134 return 0;
135 td->bkl = 1;
136 lock_kernel();
137 td->bkl = 4;
138 return 0;
139
140 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4)
142 break;
143 unlock_kernel();
144 td->bkl = 0;
145 return 0;
146
147 default: 124 default:
148 break; 125 break;
149 } 126 }
@@ -190,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
190 td->event = atomic_add_return(1, &rttest_event); 167 td->event = atomic_add_return(1, &rttest_event);
191 break; 168 break;
192 169
193 case RTTEST_LOCKBKL:
194 default: 170 default:
195 break; 171 break;
196 } 172 }
@@ -223,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
223 td->event = atomic_add_return(1, &rttest_event); 199 td->event = atomic_add_return(1, &rttest_event);
224 return; 200 return;
225 201
226 case RTTEST_LOCKBKL:
227 return;
228 default: 202 default:
229 return; 203 return;
230 } 204 }
@@ -374,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
374 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
375 349
376 curr += sprintf(curr, 350 curr += sprintf(curr,
377 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", 351 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
378 td->opcode, td->event, tsk->state, 352 td->opcode, td->event, tsk->state,
379 (MAX_RT_PRIO - 1) - tsk->prio, 353 (MAX_RT_PRIO - 1) - tsk->prio,
380 (MAX_RT_PRIO - 1) - tsk->normal_prio, 354 (MAX_RT_PRIO - 1) - tsk->normal_prio,
381 tsk->pi_blocked_on, td->bkl); 355 tsk->pi_blocked_on);
382 356
383 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) 357 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
384 curr += sprintf(curr, "%d", td->mutexes[i]); 358 curr += sprintf(curr, "%d", td->mutexes[i]);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786a..ab449117aaf2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
20/* 20/*
21 * lock->owner state tracking: 21 * lock->owner state tracking:
22 * 22 *
23 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 23 * lock->owner holds the task_struct pointer of the owner. Bit 0
24 * are used to keep track of the "owner is pending" and "lock has 24 * is used to keep track of the "lock has waiters" state.
25 * waiters" state.
26 * 25 *
27 * owner bit1 bit0 26 * owner bit0
28 * NULL 0 0 lock is free (fast acquire possible) 27 * NULL 0 lock is free (fast acquire possible)
29 * NULL 0 1 invalid state 28 * NULL 1 lock is free and has waiters and the top waiter
30 * NULL 1 0 Transitional State* 29 * is going to take the lock*
31 * NULL 1 1 invalid state 30 * taskpointer 0 lock is held (fast release possible)
32 * taskpointer 0 0 lock is held (fast release possible) 31 * taskpointer 1 lock is held and has waiters**
33 * taskpointer 0 1 task is pending owner
34 * taskpointer 1 0 lock is held and has waiters
35 * taskpointer 1 1 task is pending owner and lock has more waiters
36 *
37 * Pending ownership is assigned to the top (highest priority)
38 * waiter of the lock, when the lock is released. The thread is woken
39 * up and can now take the lock. Until the lock is taken (bit 0
40 * cleared) a competing higher priority thread can steal the lock
41 * which puts the woken up thread back on the waiters list.
42 * 32 *
43 * The fast atomic compare exchange based acquire and release is only 33 * The fast atomic compare exchange based acquire and release is only
44 * possible when bit 0 and 1 of lock->owner are 0. 34 * possible when bit 0 of lock->owner is 0.
35 *
36 * (*) It also can be a transitional state when grabbing the lock
37 * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
38 * we need to set the bit0 before looking at the lock, and the owner may be
39 * NULL in this small time, hence this can be a transitional state.
45 * 40 *
46 * (*) There's a small time where the owner can be NULL and the 41 * (**) There is a small time when bit 0 is set but there are no
47 * "lock has waiters" bit is set. This can happen when grabbing the lock. 42 * waiters. This can happen when grabbing the lock in the slow path.
48 * To prevent a cmpxchg of the owner releasing the lock, we need to set this 43 * To prevent a cmpxchg of the owner releasing the lock, we need to
49 * bit before looking at the lock, hence the reason this is a transitional 44 * set this bit before looking at the lock.
50 * state.
51 */ 45 */
52 46
53static void 47static void
54rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 48rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
55 unsigned long mask)
56{ 49{
57 unsigned long val = (unsigned long)owner | mask; 50 unsigned long val = (unsigned long)owner;
58 51
59 if (rt_mutex_has_waiters(lock)) 52 if (rt_mutex_has_waiters(lock))
60 val |= RT_MUTEX_HAS_WAITERS; 53 val |= RT_MUTEX_HAS_WAITERS;
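
The rewritten state-tracking comment above collapses the old two-bit pending-owner encoding into a single flag: bit 0 of lock->owner now only records "lock has waiters". As a rough userspace sketch of that pointer-tagging idea (not the kernel code; the struct and helper names are invented here, and it is the pointer's alignment that keeps bit 0 free):

#include <stdint.h>
#include <stdio.h>

#define HAS_WAITERS   1UL   /* bit 0 of the owner word                */
#define OWNER_MASKALL 1UL   /* all state bits kept inside the pointer */

struct fake_task { int id; const char *name; };  /* alignment >= 4, so bit 0 is free */
struct fake_lock { uintptr_t owner; };           /* task pointer | state bit         */

static struct fake_task *lock_owner(const struct fake_lock *l)
{
        return (struct fake_task *)(l->owner & ~OWNER_MASKALL);
}

static void set_owner(struct fake_lock *l, struct fake_task *t, int has_waiters)
{
        l->owner = (uintptr_t)t | (has_waiters ? HAS_WAITERS : 0);
}

int main(void)
{
        static struct fake_task task = { 1, "worker" };
        struct fake_lock lock = { 0 };

        set_owner(&lock, &task, 1);   /* taskpointer | 1: held and has waiters */
        printf("owner=%s waiters=%lu\n", lock_owner(&lock)->name,
               (unsigned long)(lock.owner & HAS_WAITERS));

        set_owner(&lock, NULL, 1);    /* NULL | 1: free, top waiter about to take it */
        printf("owner=%p waiters=%lu\n", (void *)lock_owner(&lock),
               (unsigned long)(lock.owner & HAS_WAITERS));
        return 0;
}

The fast cmpxchg-based acquire and release only work when the whole word is a bare pointer, i.e. when bit 0 is clear, which is exactly the restriction the new comment states.
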
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
203 * reached or the state of the chain has changed while we 196 * reached or the state of the chain has changed while we
204 * dropped the locks. 197 * dropped the locks.
205 */ 198 */
206 if (!waiter || !waiter->task) 199 if (!waiter)
207 goto out_unlock_pi; 200 goto out_unlock_pi;
208 201
209 /* 202 /*
210 * Check the orig_waiter state. After we dropped the locks, 203 * Check the orig_waiter state. After we dropped the locks,
211 * the previous owner of the lock might have released the lock 204 * the previous owner of the lock might have released the lock.
212 * and made us the pending owner:
213 */ 205 */
214 if (orig_waiter && !orig_waiter->task) 206 if (orig_waiter && !rt_mutex_owner(orig_lock))
215 goto out_unlock_pi; 207 goto out_unlock_pi;
216 208
217 /* 209 /*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 246
255 /* Release the task */ 247 /* Release the task */
256 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 248 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
249 if (!rt_mutex_owner(lock)) {
250 /*
251 * If the requeue above changed the top waiter, then we need
252 * to wake the new top waiter up to try to get the lock.
253 */
254
255 if (top_waiter != rt_mutex_top_waiter(lock))
256 wake_up_process(rt_mutex_top_waiter(lock)->task);
257 raw_spin_unlock(&lock->wait_lock);
258 goto out_put_task;
259 }
257 put_task_struct(task); 260 put_task_struct(task);
258 261
259 /* Grab the next task */ 262 /* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
296} 299}
297 300
298/* 301/*
299 * Optimization: check if we can steal the lock from the
300 * assigned pending owner [which might not have taken the
301 * lock yet]:
302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
305{
306 struct task_struct *pendowner = rt_mutex_owner(lock);
307 struct rt_mutex_waiter *next;
308 unsigned long flags;
309
310 if (!rt_mutex_owner_pending(lock))
311 return 0;
312
313 if (pendowner == task)
314 return 1;
315
316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) {
318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0;
320 }
321
322 /*
323 * Check if a waiter is enqueued on the pending owners
324 * pi_waiters list. Remove it and readjust pending owners
325 * priority.
326 */
327 if (likely(!rt_mutex_has_waiters(lock))) {
328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1;
330 }
331
332 /* No chain handling, pending owner is not blocked on anything: */
333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner);
336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337
338 /*
339 * We are going to steal the lock and a waiter was
340 * enqueued on the pending owners pi_waiters queue. So
341 * we have to enqueue this waiter into
342 * task->pi_waiters list. This covers the case,
343 * where task is boosted because it holds another
344 * lock and gets unboosted because the booster is
345 * interrupted, so we would delay a waiter with higher
346 * priority as task->normal_prio.
347 *
348 * Note: in the rare case of a SCHED_OTHER task changing
349 * its priority and thus stealing the lock, next->task
350 * might be task:
351 */
352 if (likely(next->task != task)) {
353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task);
356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 }
358 return 1;
359}
360
361/*
362 * Try to take an rt-mutex 302 * Try to take an rt-mutex
363 * 303 *
364 * This fails
365 * - when the lock has a real owner
366 * - when a different pending owner exists and has higher priority than current
367 *
368 * Must be called with lock->wait_lock held. 304 * Must be called with lock->wait_lock held.
305 *
306 * @lock: the lock to be acquired.
307 * @task: the task which wants to acquire the lock
308 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
369 */ 309 */
370static int try_to_take_rt_mutex(struct rt_mutex *lock) 310static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
311 struct rt_mutex_waiter *waiter)
371{ 312{
372 /* 313 /*
373 * We have to be careful here if the atomic speedups are 314 * We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
390 */ 331 */
391 mark_rt_mutex_waiters(lock); 332 mark_rt_mutex_waiters(lock);
392 333
393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) 334 if (rt_mutex_owner(lock))
394 return 0; 335 return 0;
395 336
337 /*
338 * It will get the lock because of one of these conditions:
339 * 1) there is no waiter
340 * 2) higher priority than waiters
341 * 3) it is top waiter
342 */
343 if (rt_mutex_has_waiters(lock)) {
344 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
345 if (!waiter || waiter != rt_mutex_top_waiter(lock))
346 return 0;
347 }
348 }
349
350 if (waiter || rt_mutex_has_waiters(lock)) {
351 unsigned long flags;
352 struct rt_mutex_waiter *top;
353
354 raw_spin_lock_irqsave(&task->pi_lock, flags);
355
356 /* remove the queued waiter. */
357 if (waiter) {
358 plist_del(&waiter->list_entry, &lock->wait_list);
359 task->pi_blocked_on = NULL;
360 }
361
362 /*
363 * We have to enqueue the top waiter(if it exists) into
364 * task->pi_waiters list.
365 */
366 if (rt_mutex_has_waiters(lock)) {
367 top = rt_mutex_top_waiter(lock);
368 top->pi_list_entry.prio = top->list_entry.prio;
369 plist_add(&top->pi_list_entry, &task->pi_waiters);
370 }
371 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
372 }
373
396 /* We got the lock. */ 374 /* We got the lock. */
397 debug_rt_mutex_lock(lock); 375 debug_rt_mutex_lock(lock);
398 376
399 rt_mutex_set_owner(lock, current, 0); 377 rt_mutex_set_owner(lock, task);
400 378
401 rt_mutex_deadlock_account_lock(lock, current); 379 rt_mutex_deadlock_account_lock(lock, task);
402 380
403 return 1; 381 return 1;
404} 382}
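
The new try_to_take_rt_mutex() above replaces stealing from a pending owner with a direct check against the wait list. A condensed sketch of just that decision, with invented names and the kernel convention that a numerically lower prio value means higher priority:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct waiter { int prio; };

/* May a task of priority @task_prio take a lock that currently has no
 * owner?  @top is the highest-priority queued waiter (or NULL), @me is
 * our own waiter entry if we are already queued (or NULL). */
static bool can_take(int task_prio, const struct waiter *top,
                     const struct waiter *me)
{
        if (!top)
                return true;          /* 1) there is no waiter            */
        if (task_prio < top->prio)
                return true;          /* 2) higher priority than waiters  */
        return me == top;             /* 3) we are the top waiter         */
}

int main(void)
{
        struct waiter top = { .prio = 10 };

        printf("%d %d %d\n",
               can_take(5, &top, NULL),    /* outranks the top waiter: 1    */
               can_take(20, &top, NULL),   /* lower prio, not queued: 0     */
               can_take(10, &top, &top));  /* is the top waiter itself: 1   */
        return 0;
}
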
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
436 414
437 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 415 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 416
417 if (!owner)
418 return 0;
419
439 if (waiter == rt_mutex_top_waiter(lock)) { 420 if (waiter == rt_mutex_top_waiter(lock)) {
440 raw_spin_lock_irqsave(&owner->pi_lock, flags); 421 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 422 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
472/* 453/*
473 * Wake up the next waiter on the lock. 454 * Wake up the next waiter on the lock.
474 * 455 *
475 * Remove the top waiter from the current tasks waiter list and from 456 * Remove the top waiter from the current tasks waiter list and wake it up.
476 * the lock waiter list. Set it as pending owner. Then wake it up.
477 * 457 *
478 * Called with lock->wait_lock held. 458 * Called with lock->wait_lock held.
479 */ 459 */
480static void wakeup_next_waiter(struct rt_mutex *lock) 460static void wakeup_next_waiter(struct rt_mutex *lock)
481{ 461{
482 struct rt_mutex_waiter *waiter; 462 struct rt_mutex_waiter *waiter;
483 struct task_struct *pendowner;
484 unsigned long flags; 463 unsigned long flags;
485 464
486 raw_spin_lock_irqsave(&current->pi_lock, flags); 465 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 466
488 waiter = rt_mutex_top_waiter(lock); 467 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list);
490 468
491 /* 469 /*
492 * Remove it from current->pi_waiters. We do not adjust a 470 * Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
495 * lock->wait_lock. 473 * lock->wait_lock.
496 */ 474 */
497 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 475 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
498 pendowner = waiter->task;
499 waiter->task = NULL;
500 476
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 477 rt_mutex_set_owner(lock, NULL);
502 478
503 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 479 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 480
505 /* 481 wake_up_process(waiter->task);
506 * Clear the pi_blocked_on variable and enqueue a possible
507 * waiter into the pi_waiters list of the pending owner. This
508 * prevents that in case the pending owner gets unboosted a
509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner.
511 */
512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513
514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter);
516 WARN_ON(pendowner->pi_blocked_on->lock != lock);
517
518 pendowner->pi_blocked_on = NULL;
519
520 if (rt_mutex_has_waiters(lock)) {
521 struct rt_mutex_waiter *next;
522
523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 }
526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527
528 wake_up_process(pendowner);
529} 482}
530 483
531/* 484/*
532 * Remove a waiter from a lock 485 * Remove a waiter from a lock and give up
533 * 486 *
534 * Must be called with lock->wait_lock held 487 * Must be called with lock->wait_lock held and
488 * have just failed to try_to_take_rt_mutex().
535 */ 489 */
536static void remove_waiter(struct rt_mutex *lock, 490static void remove_waiter(struct rt_mutex *lock,
537 struct rt_mutex_waiter *waiter) 491 struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
543 497
544 raw_spin_lock_irqsave(&current->pi_lock, flags); 498 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 499 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 500 current->pi_blocked_on = NULL;
548 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 501 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 502
550 if (first && owner != current) { 503 if (!owner)
504 return;
505
506 if (first) {
551 507
552 raw_spin_lock_irqsave(&owner->pi_lock, flags); 508 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 509
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
614 * or TASK_UNINTERRUPTIBLE) 570 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none 571 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter 572 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 * 573 *
619 * lock->wait_lock must be held by the caller. 574 * lock->wait_lock must be held by the caller.
620 */ 575 */
621static int __sched 576static int __sched
622__rt_mutex_slowlock(struct rt_mutex *lock, int state, 577__rt_mutex_slowlock(struct rt_mutex *lock, int state,
623 struct hrtimer_sleeper *timeout, 578 struct hrtimer_sleeper *timeout,
624 struct rt_mutex_waiter *waiter, 579 struct rt_mutex_waiter *waiter)
625 int detect_deadlock)
626{ 580{
627 int ret = 0; 581 int ret = 0;
628 582
629 for (;;) { 583 for (;;) {
630 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
631 if (try_to_take_rt_mutex(lock)) 585 if (try_to_take_rt_mutex(lock, current, waiter))
632 break; 586 break;
633 587
634 /* 588 /*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
645 break; 599 break;
646 } 600 }
647 601
648 /*
649 * waiter->task is NULL the first time we come here and
650 * when we have been woken up by the previous owner
651 * but the lock got stolen by a higher prio task.
652 */
653 if (!waiter->task) {
654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
655 detect_deadlock);
656 /*
657 * If we got woken up by the owner then start loop
658 * all over without going into schedule to try
659 * to get the lock now:
660 */
661 if (unlikely(!waiter->task)) {
662 /*
663 * Reset the return value. We might
664 * have returned with -EDEADLK and the
665 * owner released the lock while we
666 * were walking the pi chain.
667 */
668 ret = 0;
669 continue;
670 }
671 if (unlikely(ret))
672 break;
673 }
674
675 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
676 603
677 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
678 605
679 if (waiter->task) 606 schedule_rt_mutex(lock);
680 schedule_rt_mutex(lock);
681 607
682 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 609 set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
698 int ret = 0; 624 int ret = 0;
699 625
700 debug_rt_mutex_init_waiter(&waiter); 626 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702 627
703 raw_spin_lock(&lock->wait_lock); 628 raw_spin_lock(&lock->wait_lock);
704 629
705 /* Try to acquire the lock again: */ 630 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 631 if (try_to_take_rt_mutex(lock, current, NULL)) {
707 raw_spin_unlock(&lock->wait_lock); 632 raw_spin_unlock(&lock->wait_lock);
708 return 0; 633 return 0;
709 } 634 }
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
717 timeout->task = NULL; 642 timeout->task = NULL;
718 } 643 }
719 644
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, 645 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
721 detect_deadlock); 646
647 if (likely(!ret))
648 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
722 649
723 set_current_state(TASK_RUNNING); 650 set_current_state(TASK_RUNNING);
724 651
725 if (unlikely(waiter.task)) 652 if (unlikely(ret))
726 remove_waiter(lock, &waiter); 653 remove_waiter(lock, &waiter);
727 654
728 /* 655 /*
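
With the pending-owner handoff gone, the slow path above takes a simpler shape: the task queues itself once via task_blocks_on_rt_mutex(), and the __rt_mutex_slowlock() loop only retries and sleeps until try_to_take_rt_mutex() succeeds. The userspace toy below models that structure with a pthread mutex and condition variable; it is purely illustrative and nothing like the real rt_mutex internals (which never use a condvar):

#include <pthread.h>
#include <stdbool.h>

struct toy_lock {
        pthread_mutex_t wait_lock;   /* plays the role of lock->wait_lock   */
        pthread_cond_t  wakeup;
        bool            held;
        int             nwaiters;
};

static void toy_lock_slow(struct toy_lock *l)
{
        pthread_mutex_lock(&l->wait_lock);
        l->nwaiters++;                         /* ~ task_blocks_on_rt_mutex()   */
        while (l->held)                        /* ~ the __rt_mutex_slowlock loop */
                pthread_cond_wait(&l->wakeup, &l->wait_lock);
        l->held = true;                        /* ~ try_to_take_rt_mutex() wins  */
        l->nwaiters--;                         /* waiter dequeued on success     */
        pthread_mutex_unlock(&l->wait_lock);
}

static void toy_unlock(struct toy_lock *l)
{
        pthread_mutex_lock(&l->wait_lock);
        l->held = false;                       /* owner cleared...               */
        if (l->nwaiters)
                pthread_cond_signal(&l->wakeup);   /* ...and one waiter woken,   */
        pthread_mutex_unlock(&l->wait_lock);       /* ~ wakeup_next_waiter()     */
}

int main(void)
{
        struct toy_lock l = { PTHREAD_MUTEX_INITIALIZER,
                              PTHREAD_COND_INITIALIZER, false, 0 };

        toy_lock_slow(&l);   /* uncontended: takes the lock immediately */
        toy_unlock(&l);
        return 0;
}

The error path in the hunk above mirrors this shape: if task_blocks_on_rt_mutex() fails, remove_waiter() undoes the single enqueue; there is no longer a waiter->task flag to consult.
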
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
737 if (unlikely(timeout)) 664 if (unlikely(timeout))
738 hrtimer_cancel(&timeout->timer); 665 hrtimer_cancel(&timeout->timer);
739 666
740 /*
741 * Readjust priority, when we did not get the lock. We might
742 * have been the pending owner and boosted. Since we did not
743 * take the lock, the PI boost has to go.
744 */
745 if (unlikely(ret))
746 rt_mutex_adjust_prio(current);
747
748 debug_rt_mutex_free_waiter(&waiter); 667 debug_rt_mutex_free_waiter(&waiter);
749 668
750 return ret; 669 return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
762 681
763 if (likely(rt_mutex_owner(lock) != current)) { 682 if (likely(rt_mutex_owner(lock) != current)) {
764 683
765 ret = try_to_take_rt_mutex(lock); 684 ret = try_to_take_rt_mutex(lock, current, NULL);
766 /* 685 /*
767 * try_to_take_rt_mutex() sets the lock waiters 686 * try_to_take_rt_mutex() sets the lock waiters
768 * bit unconditionally. Clean this up. 687 * bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
992{ 911{
993 __rt_mutex_init(lock, NULL); 912 __rt_mutex_init(lock, NULL);
994 debug_rt_mutex_proxy_lock(lock, proxy_owner); 913 debug_rt_mutex_proxy_lock(lock, proxy_owner);
995 rt_mutex_set_owner(lock, proxy_owner, 0); 914 rt_mutex_set_owner(lock, proxy_owner);
996 rt_mutex_deadlock_account_lock(lock, proxy_owner); 915 rt_mutex_deadlock_account_lock(lock, proxy_owner);
997} 916}
998 917
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1008 struct task_struct *proxy_owner) 927 struct task_struct *proxy_owner)
1009{ 928{
1010 debug_rt_mutex_proxy_unlock(lock); 929 debug_rt_mutex_proxy_unlock(lock);
1011 rt_mutex_set_owner(lock, NULL, 0); 930 rt_mutex_set_owner(lock, NULL);
1012 rt_mutex_deadlock_account_unlock(proxy_owner); 931 rt_mutex_deadlock_account_unlock(proxy_owner);
1013} 932}
1014 933
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1034 953
1035 raw_spin_lock(&lock->wait_lock); 954 raw_spin_lock(&lock->wait_lock);
1036 955
1037 mark_rt_mutex_waiters(lock); 956 if (try_to_take_rt_mutex(lock, task, NULL)) {
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 raw_spin_unlock(&lock->wait_lock); 957 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 958 return 1;
1046 } 959 }
1047 960
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 961 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049 962
1050 if (ret && !waiter->task) { 963 if (ret && !rt_mutex_owner(lock)) {
1051 /* 964 /*
1052 * Reset the return value. We might have 965 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner 966 * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 969 */
1057 ret = 0; 970 ret = 0;
1058 } 971 }
972
973 if (unlikely(ret))
974 remove_waiter(lock, waiter);
975
1059 raw_spin_unlock(&lock->wait_lock); 976 raw_spin_unlock(&lock->wait_lock);
1060 977
1061 debug_rt_mutex_print_deadlock(waiter); 978 debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1110 1027
1111 set_current_state(TASK_INTERRUPTIBLE); 1028 set_current_state(TASK_INTERRUPTIBLE);
1112 1029
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, 1030 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1114 detect_deadlock);
1115 1031
1116 set_current_state(TASK_RUNNING); 1032 set_current_state(TASK_RUNNING);
1117 1033
1118 if (unlikely(waiter->task)) 1034 if (unlikely(ret))
1119 remove_waiter(lock, waiter); 1035 remove_waiter(lock, waiter);
1120 1036
1121 /* 1037 /*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1126 1042
1127 raw_spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1128 1044
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret; 1045 return ret;
1138} 1046}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
91/* 91/*
92 * lock->owner state tracking: 92 * lock->owner state tracking:
93 */ 93 */
94#define RT_MUTEX_OWNER_PENDING 1UL 94#define RT_MUTEX_HAS_WAITERS 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL 95#define RT_MUTEX_OWNER_MASKALL 1UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97 96
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) 97static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{ 98{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); 100 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102} 101}
103 102
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/* 103/*
116 * PI-futex support (proxy locking functions, etc.): 104 * PI-futex support (proxy locking functions, etc.):
117 */ 105 */
diff --git a/kernel/sched.c b/kernel/sched.c
index c5d775079027..935f8e8e6160 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h> 35#include <asm/mmu_context.h>
37#include <linux/interrupt.h> 36#include <linux/interrupt.h>
38#include <linux/capability.h> 37#include <linux/capability.h>
@@ -75,9 +74,11 @@
75 74
76#include <asm/tlb.h> 75#include <asm/tlb.h>
77#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h" 80#include "workqueue_sched.h"
81#include "sched_autogroup.h"
81 82
82#include <litmus/sched_trace.h> 83#include <litmus/sched_trace.h>
83#include <litmus/trace.h> 84#include <litmus/trace.h>
@@ -235,7 +236,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235#endif 236#endif
236 237
237/* 238/*
238 * sched_domains_mutex serializes calls to arch_init_sched_domains, 239 * sched_domains_mutex serializes calls to init_sched_domains,
239 * detach_destroy_domains and partition_sched_domains. 240 * detach_destroy_domains and partition_sched_domains.
240 */ 241 */
241static DEFINE_MUTEX(sched_domains_mutex); 242static DEFINE_MUTEX(sched_domains_mutex);
@@ -258,6 +259,8 @@ struct task_group {
258 /* runqueue "owned" by this group on each cpu */ 259 /* runqueue "owned" by this group on each cpu */
259 struct cfs_rq **cfs_rq; 260 struct cfs_rq **cfs_rq;
260 unsigned long shares; 261 unsigned long shares;
262
263 atomic_t load_weight;
261#endif 264#endif
262 265
263#ifdef CONFIG_RT_GROUP_SCHED 266#ifdef CONFIG_RT_GROUP_SCHED
@@ -273,25 +276,18 @@ struct task_group {
273 struct task_group *parent; 276 struct task_group *parent;
274 struct list_head siblings; 277 struct list_head siblings;
275 struct list_head children; 278 struct list_head children;
276};
277 279
278#define root_task_group init_task_group 280#ifdef CONFIG_SCHED_AUTOGROUP
281 struct autogroup *autogroup;
282#endif
283};
279 284
280/* task_group_lock serializes add/remove of task groups and also changes to 285/* task_group_lock serializes the addition/removal of task groups */
281 * a task group's cpu shares.
282 */
283static DEFINE_SPINLOCK(task_group_lock); 286static DEFINE_SPINLOCK(task_group_lock);
284 287
285#ifdef CONFIG_FAIR_GROUP_SCHED 288#ifdef CONFIG_FAIR_GROUP_SCHED
286 289
287#ifdef CONFIG_SMP 290# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
288static int root_task_group_empty(void)
289{
290 return list_empty(&root_task_group.children);
291}
292#endif
293
294# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
295 291
296/* 292/*
297 * A weight of 0 or 1 can cause arithmetics problems. 293 * A weight of 0 or 1 can cause arithmetics problems.
@@ -301,16 +297,16 @@ static int root_task_group_empty(void)
301 * (The default weight is 1024 - so there's no practical 297 * (The default weight is 1024 - so there's no practical
302 * limitation from this.) 298 * limitation from this.)
303 */ 299 */
304#define MIN_SHARES 2 300#define MIN_SHARES (1UL << 1)
305#define MAX_SHARES (1UL << 18) 301#define MAX_SHARES (1UL << 18)
306 302
307static int init_task_group_load = INIT_TASK_GROUP_LOAD; 303static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
308#endif 304#endif
309 305
310/* Default task group. 306/* Default task group.
311 * Every task in system belong to this group at bootup. 307 * Every task in system belong to this group at bootup.
312 */ 308 */
313struct task_group init_task_group; 309struct task_group root_task_group;
314 310
315#endif /* CONFIG_CGROUP_SCHED */ 311#endif /* CONFIG_CGROUP_SCHED */
316 312
@@ -321,6 +317,9 @@ struct cfs_rq {
321 317
322 u64 exec_clock; 318 u64 exec_clock;
323 u64 min_vruntime; 319 u64 min_vruntime;
320#ifndef CONFIG_64BIT
321 u64 min_vruntime_copy;
322#endif
324 323
325 struct rb_root tasks_timeline; 324 struct rb_root tasks_timeline;
326 struct rb_node *rb_leftmost; 325 struct rb_node *rb_leftmost;
@@ -332,9 +331,11 @@ struct cfs_rq {
332 * 'curr' points to currently running entity on this cfs_rq. 331 * 'curr' points to currently running entity on this cfs_rq.
333 * It is set to NULL otherwise (i.e when none are currently running). 332 * It is set to NULL otherwise (i.e when none are currently running).
334 */ 333 */
335 struct sched_entity *curr, *next, *last; 334 struct sched_entity *curr, *next, *last, *skip;
336 335
336#ifdef CONFIG_SCHED_DEBUG
337 unsigned int nr_spread_over; 337 unsigned int nr_spread_over;
338#endif
338 339
339#ifdef CONFIG_FAIR_GROUP_SCHED 340#ifdef CONFIG_FAIR_GROUP_SCHED
340 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 341 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -347,6 +348,7 @@ struct cfs_rq {
347 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 348 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
348 * list is used during load balance. 349 * list is used during load balance.
349 */ 350 */
351 int on_list;
350 struct list_head leaf_cfs_rq_list; 352 struct list_head leaf_cfs_rq_list;
351 struct task_group *tg; /* group that "owns" this runqueue */ 353 struct task_group *tg; /* group that "owns" this runqueue */
352 354
@@ -365,14 +367,17 @@ struct cfs_rq {
365 unsigned long h_load; 367 unsigned long h_load;
366 368
367 /* 369 /*
368 * this cpu's part of tg->shares 370 * Maintaining per-cpu shares distribution for group scheduling
371 *
372 * load_stamp is the last time we updated the load average
373 * load_last is the last time we updated the load average and saw load
374 * load_unacc_exec_time is currently unaccounted execution time
369 */ 375 */
370 unsigned long shares; 376 u64 load_avg;
377 u64 load_period;
378 u64 load_stamp, load_last, load_unacc_exec_time;
371 379
372 /* 380 unsigned long load_contribution;
373 * load.weight at the time we set shares
374 */
375 unsigned long rq_weight;
376#endif 381#endif
377#endif 382#endif
378}; 383};
@@ -428,6 +433,7 @@ struct litmus_rq {
428 */ 433 */
429struct root_domain { 434struct root_domain {
430 atomic_t refcount; 435 atomic_t refcount;
436 struct rcu_head rcu;
431 cpumask_var_t span; 437 cpumask_var_t span;
432 cpumask_var_t online; 438 cpumask_var_t online;
433 439
@@ -437,9 +443,7 @@ struct root_domain {
437 */ 443 */
438 cpumask_var_t rto_mask; 444 cpumask_var_t rto_mask;
439 atomic_t rto_count; 445 atomic_t rto_count;
440#ifdef CONFIG_SMP
441 struct cpupri cpupri; 446 struct cpupri cpupri;
442#endif
443}; 447};
444 448
445/* 449/*
@@ -448,7 +452,7 @@ struct root_domain {
448 */ 452 */
449static struct root_domain def_root_domain; 453static struct root_domain def_root_domain;
450 454
451#endif 455#endif /* CONFIG_SMP */
452 456
453/* 457/*
454 * This is the main, per-CPU runqueue data structure. 458 * This is the main, per-CPU runqueue data structure.
@@ -473,7 +477,7 @@ struct rq {
473 u64 nohz_stamp; 477 u64 nohz_stamp;
474 unsigned char nohz_balance_kick; 478 unsigned char nohz_balance_kick;
475#endif 479#endif
476 unsigned int skip_clock_update; 480 int skip_clock_update;
477 481
478 /* capture load from *all* tasks on this cpu: */ 482 /* capture load from *all* tasks on this cpu: */
479 struct load_weight load; 483 struct load_weight load;
@@ -500,11 +504,12 @@ struct rq {
500 */ 504 */
501 unsigned long nr_uninterruptible; 505 unsigned long nr_uninterruptible;
502 506
503 struct task_struct *curr, *idle; 507 struct task_struct *curr, *idle, *stop;
504 unsigned long next_balance; 508 unsigned long next_balance;
505 struct mm_struct *prev_mm; 509 struct mm_struct *prev_mm;
506 510
507 u64 clock; 511 u64 clock;
512 u64 clock_task;
508 513
509 atomic_t nr_iowait; 514 atomic_t nr_iowait;
510 515
@@ -532,6 +537,10 @@ struct rq {
532 u64 avg_idle; 537 u64 avg_idle;
533#endif 538#endif
534 539
540#ifdef CONFIG_IRQ_TIME_ACCOUNTING
541 u64 prev_irq_time;
542#endif
543
535 /* calc_load related fields */ 544 /* calc_load related fields */
536 unsigned long calc_load_update; 545 unsigned long calc_load_update;
537 long calc_load_active; 546 long calc_load_active;
@@ -561,32 +570,17 @@ struct rq {
561 /* try_to_wake_up() stats */ 570 /* try_to_wake_up() stats */
562 unsigned int ttwu_count; 571 unsigned int ttwu_count;
563 unsigned int ttwu_local; 572 unsigned int ttwu_local;
573#endif
564 574
565 /* BKL stats */ 575#ifdef CONFIG_SMP
566 unsigned int bkl_count; 576 struct task_struct *wake_list;
567#endif 577#endif
568}; 578};
569 579
570static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 580static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
571 581
572static inline
573void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
574{
575 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
576 582
577 /* 583static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
578 * A queue event has occurred, and we're going to schedule. In
579 * this case, we can save a useless back to back clock update.
580 */
581 /* LITMUS^RT: turning off the clock update is buggy in Linux 2.6.36;
582 * the scheduler can "forget" to renable the runqueue clock in some
583 * cases. LITMUS^RT amplifies the effects of this problem. Hence, we
584 * turn it off to avoid stalling clocks. */
585 /*
586 if (test_tsk_need_resched(p))
587 rq->skip_clock_update = 1;
588 */
589}
590 584
591static inline int cpu_of(struct rq *rq) 585static inline int cpu_of(struct rq *rq)
592{ 586{
@@ -599,7 +593,7 @@ static inline int cpu_of(struct rq *rq)
599 593
600#define rcu_dereference_check_sched_domain(p) \ 594#define rcu_dereference_check_sched_domain(p) \
601 rcu_dereference_check((p), \ 595 rcu_dereference_check((p), \
602 rcu_read_lock_sched_held() || \ 596 rcu_read_lock_held() || \
603 lockdep_is_held(&sched_domains_mutex)) 597 lockdep_is_held(&sched_domains_mutex))
604 598
605/* 599/*
@@ -623,18 +617,22 @@ static inline int cpu_of(struct rq *rq)
623/* 617/*
624 * Return the group to which this tasks belongs. 618 * Return the group to which this tasks belongs.
625 * 619 *
626 * We use task_subsys_state_check() and extend the RCU verification 620 * We use task_subsys_state_check() and extend the RCU verification with
627 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 621 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
628 * holds that lock for each task it moves into the cgroup. Therefore 622 * task it moves into the cgroup. Therefore by holding either of those locks,
629 * by holding that lock, we pin the task to the current cgroup. 623 * we pin the task to the current cgroup.
630 */ 624 */
631static inline struct task_group *task_group(struct task_struct *p) 625static inline struct task_group *task_group(struct task_struct *p)
632{ 626{
627 struct task_group *tg;
633 struct cgroup_subsys_state *css; 628 struct cgroup_subsys_state *css;
634 629
635 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 630 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
631 lockdep_is_held(&p->pi_lock) ||
636 lockdep_is_held(&task_rq(p)->lock)); 632 lockdep_is_held(&task_rq(p)->lock));
637 return container_of(css, struct task_group, css); 633 tg = container_of(css, struct task_group, css);
634
635 return autogroup_task_group(p, tg);
638} 636}
639 637
640/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 638/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -661,10 +659,18 @@ static inline struct task_group *task_group(struct task_struct *p)
661 659
662#endif /* CONFIG_CGROUP_SCHED */ 660#endif /* CONFIG_CGROUP_SCHED */
663 661
664inline void update_rq_clock(struct rq *rq) 662static void update_rq_clock_task(struct rq *rq, s64 delta);
663
664static void update_rq_clock(struct rq *rq)
665{ 665{
666 if (!rq->skip_clock_update) 666 s64 delta;
667 rq->clock = sched_clock_cpu(cpu_of(rq)); 667
668 if (rq->skip_clock_update > 0)
669 return;
670
671 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
672 rq->clock += delta;
673 update_rq_clock_task(rq, delta);
668} 674}
669 675
670/* 676/*
@@ -677,10 +683,9 @@ inline void update_rq_clock(struct rq *rq)
677#endif 683#endif
678 684
679/** 685/**
680 * runqueue_is_locked 686 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
681 * @cpu: the processor in question. 687 * @cpu: the processor in question.
682 * 688 *
683 * Returns true if the current cpu runqueue is locked.
684 * This interface allows printk to be called with the runqueue lock 689 * This interface allows printk to be called with the runqueue lock
685 * held and know whether or not it is OK to wake up the klogd. 690 * held and know whether or not it is OK to wake up the klogd.
686 */ 691 */
@@ -741,7 +746,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
741 size_t cnt, loff_t *ppos) 746 size_t cnt, loff_t *ppos)
742{ 747{
743 char buf[64]; 748 char buf[64];
744 char *cmp = buf; 749 char *cmp;
745 int neg = 0; 750 int neg = 0;
746 int i; 751 int i;
747 752
@@ -752,16 +757,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
752 return -EFAULT; 757 return -EFAULT;
753 758
754 buf[cnt] = 0; 759 buf[cnt] = 0;
760 cmp = strstrip(buf);
755 761
756 if (strncmp(buf, "NO_", 3) == 0) { 762 if (strncmp(cmp, "NO_", 3) == 0) {
757 neg = 1; 763 neg = 1;
758 cmp += 3; 764 cmp += 3;
759 } 765 }
760 766
761 for (i = 0; sched_feat_names[i]; i++) { 767 for (i = 0; sched_feat_names[i]; i++) {
762 int len = strlen(sched_feat_names[i]); 768 if (strcmp(cmp, sched_feat_names[i]) == 0) {
763
764 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
765 if (neg) 769 if (neg)
766 sysctl_sched_features &= ~(1UL << i); 770 sysctl_sched_features &= ~(1UL << i);
767 else 771 else
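
The sched_feat_write() change above swaps a prefix strncmp() for strstrip() plus an exact strcmp(), so a name only toggles its own bit and trailing whitespace no longer breaks matching. A small userspace rendition of that parsing (the feature names and mask below are just examples):

#include <stdio.h>
#include <string.h>

static const char *feat_names[] = { "GENTLE_FAIR_SLEEPERS", "START_DEBIT", NULL };
static unsigned long feat_mask;

static int set_feature(char *buf)
{
        char *cmp = buf + strspn(buf, " \t\n");   /* crude strstrip(): skip leading ws */
        buf[strcspn(buf, "\n")] = '\0';           /* drop a trailing newline           */
        int neg = 0, i;

        if (strncmp(cmp, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }
        for (i = 0; feat_names[i]; i++) {
                if (strcmp(cmp, feat_names[i]) == 0) {   /* exact match only */
                        if (neg)
                                feat_mask &= ~(1UL << i);
                        else
                                feat_mask |= 1UL << i;
                        return 0;
                }
        }
        return -1;
}

int main(void)
{
        char a[] = "START_DEBIT", b[] = "NO_START_DEBIT\n";

        set_feature(a);
        printf("mask=%#lx\n", feat_mask);
        set_feature(b);
        printf("mask=%#lx\n", feat_mask);
        return 0;
}

The old code compared only strlen(feature) characters, so any string that merely started with a valid feature name would have matched; the exact strcmp() closes that hole.
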
@@ -811,20 +815,6 @@ late_initcall(sched_init_debug);
811const_debug unsigned int sysctl_sched_nr_migrate = 32; 815const_debug unsigned int sysctl_sched_nr_migrate = 32;
812 816
813/* 817/*
814 * ratelimit for updating the group shares.
815 * default: 0.25ms
816 */
817unsigned int sysctl_sched_shares_ratelimit = 250000;
818unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
819
820/*
821 * Inject some fuzzyness into changing the per-cpu group shares
822 * this avoids remote rq-locks at the expense of fairness.
823 * default: 4
824 */
825unsigned int sysctl_sched_shares_thresh = 4;
826
827/*
828 * period over which we average the RT time consumption, measured 818 * period over which we average the RT time consumption, measured
829 * in ms. 819 * in ms.
830 * 820 *
@@ -871,18 +861,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
871 return rq->curr == p; 861 return rq->curr == p;
872} 862}
873 863
874#ifndef __ARCH_WANT_UNLOCKED_CTXSW
875static inline int task_running(struct rq *rq, struct task_struct *p) 864static inline int task_running(struct rq *rq, struct task_struct *p)
876{ 865{
866#ifdef CONFIG_SMP
867 return p->on_cpu;
868#else
877 return task_current(rq, p); 869 return task_current(rq, p);
870#endif
878} 871}
879 872
873#ifndef __ARCH_WANT_UNLOCKED_CTXSW
880static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 874static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
881{ 875{
876#ifdef CONFIG_SMP
877 /*
878 * We can optimise this out completely for !SMP, because the
879 * SMP rebalancing from interrupt is the only thing that cares
880 * here.
881 */
882 next->on_cpu = 1;
883#endif
882} 884}
883 885
884static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 886static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
885{ 887{
888#ifdef CONFIG_SMP
889 /*
890 * After ->on_cpu is cleared, the task can be moved to a different CPU.
891 * We must ensure this doesn't happen until the switch is completely
892 * finished.
893 */
894 smp_wmb();
895 prev->on_cpu = 0;
896#endif
886#ifdef CONFIG_DEBUG_SPINLOCK 897#ifdef CONFIG_DEBUG_SPINLOCK
887 /* this is a valid case when another task releases the spinlock */ 898 /* this is a valid case when another task releases the spinlock */
888 rq->lock.owner = current; 899 rq->lock.owner = current;
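
task_running() above now tests p->on_cpu in all configurations: the flag goes up before the task starts executing on a CPU and is dropped only after the context switch has fully finished, with a write barrier so nothing from the switch can appear to happen after the flag clears. A userspace sketch of that handshake using C11 atomics in place of smp_wmb() (not the scheduler's code, and the ordering is deliberately simplified):

#include <stdatomic.h>
#include <stdbool.h>

struct toy_task {
        atomic_bool on_cpu;
};

static void prepare_switch(struct toy_task *next)
{
        /* raised before @next starts executing on this CPU */
        atomic_store_explicit(&next->on_cpu, true, memory_order_relaxed);
}

static void finish_switch(struct toy_task *prev)
{
        /* release: everything done while running @prev becomes visible
         * before a remote CPU can observe on_cpu == false and migrate it */
        atomic_store_explicit(&prev->on_cpu, false, memory_order_release);
}

static bool task_running(struct toy_task *t)
{
        return atomic_load_explicit(&t->on_cpu, memory_order_acquire);
}

int main(void)
{
        struct toy_task t;
        bool running;

        atomic_init(&t.on_cpu, false);
        prepare_switch(&t);
        running = task_running(&t);   /* true while "on cpu" */
        finish_switch(&t);
        return running ? 0 : 1;
}
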
@@ -898,15 +909,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
898} 909}
899 910
900#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 911#else /* __ARCH_WANT_UNLOCKED_CTXSW */
901static inline int task_running(struct rq *rq, struct task_struct *p)
902{
903#ifdef CONFIG_SMP
904 return p->oncpu;
905#else
906 return task_current(rq, p);
907#endif
908}
909
910static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 912static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
911{ 913{
912#ifdef CONFIG_SMP 914#ifdef CONFIG_SMP
@@ -915,7 +917,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
915 * SMP rebalancing from interrupt is the only thing that cares 917 * SMP rebalancing from interrupt is the only thing that cares
916 * here. 918 * here.
917 */ 919 */
918 next->oncpu = 1; 920 next->on_cpu = 1;
919#endif 921#endif
920#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 922#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
921 raw_spin_unlock_irq(&rq->lock); 923 raw_spin_unlock_irq(&rq->lock);
@@ -928,12 +930,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
928{ 930{
929#ifdef CONFIG_SMP 931#ifdef CONFIG_SMP
930 /* 932 /*
931 * After ->oncpu is cleared, the task can be moved to a different CPU. 933 * After ->on_cpu is cleared, the task can be moved to a different CPU.
932 * We must ensure this doesn't happen until the switch is completely 934 * We must ensure this doesn't happen until the switch is completely
933 * finished. 935 * finished.
934 */ 936 */
935 smp_wmb(); 937 smp_wmb();
936 prev->oncpu = 0; 938 prev->on_cpu = 0;
937#endif 939#endif
938#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 940#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
939 local_irq_enable(); 941 local_irq_enable();
@@ -942,23 +944,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
942#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 944#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
943 945
944/* 946/*
945 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 947 * __task_rq_lock - lock the rq @p resides on.
946 * against ttwu().
947 */
948static inline int task_is_waking(struct task_struct *p)
949{
950 return unlikely(p->state == TASK_WAKING);
951}
952
953/*
954 * __task_rq_lock - lock the runqueue a given task resides on.
955 * Must be called interrupts disabled.
956 */ 948 */
957static inline struct rq *__task_rq_lock(struct task_struct *p) 949static inline struct rq *__task_rq_lock(struct task_struct *p)
958 __acquires(rq->lock) 950 __acquires(rq->lock)
959{ 951{
960 struct rq *rq; 952 struct rq *rq;
961 953
954 lockdep_assert_held(&p->pi_lock);
955
962 for (;;) { 956 for (;;) {
963 rq = task_rq(p); 957 rq = task_rq(p);
964 raw_spin_lock(&rq->lock); 958 raw_spin_lock(&rq->lock);
@@ -969,22 +963,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
969} 963}
970 964
971/* 965/*
972 * task_rq_lock - lock the runqueue a given task resides on and disable 966 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
973 * interrupts. Note the ordering: we can safely lookup the task_rq without
974 * explicitly disabling preemption.
975 */ 967 */
976static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 968static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
969 __acquires(p->pi_lock)
977 __acquires(rq->lock) 970 __acquires(rq->lock)
978{ 971{
979 struct rq *rq; 972 struct rq *rq;
980 973
981 for (;;) { 974 for (;;) {
982 local_irq_save(*flags); 975 raw_spin_lock_irqsave(&p->pi_lock, *flags);
983 rq = task_rq(p); 976 rq = task_rq(p);
984 raw_spin_lock(&rq->lock); 977 raw_spin_lock(&rq->lock);
985 if (likely(rq == task_rq(p))) 978 if (likely(rq == task_rq(p)))
986 return rq; 979 return rq;
987 raw_spin_unlock_irqrestore(&rq->lock, *flags); 980 raw_spin_unlock(&rq->lock);
981 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
988 } 982 }
989} 983}
990 984
@@ -994,10 +988,13 @@ static void __task_rq_unlock(struct rq *rq)
994 raw_spin_unlock(&rq->lock); 988 raw_spin_unlock(&rq->lock);
995} 989}
996 990
997static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 991static inline void
992task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
998 __releases(rq->lock) 993 __releases(rq->lock)
994 __releases(p->pi_lock)
999{ 995{
1000 raw_spin_unlock_irqrestore(&rq->lock, *flags); 996 raw_spin_unlock(&rq->lock);
997 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1001} 998}
1002 999
1003/* 1000/*
@@ -1227,11 +1224,17 @@ int get_nohz_timer_target(void)
1227 int i; 1224 int i;
1228 struct sched_domain *sd; 1225 struct sched_domain *sd;
1229 1226
1227 rcu_read_lock();
1230 for_each_domain(cpu, sd) { 1228 for_each_domain(cpu, sd) {
1231 for_each_cpu(i, sched_domain_span(sd)) 1229 for_each_cpu(i, sched_domain_span(sd)) {
1232 if (!idle_cpu(i)) 1230 if (!idle_cpu(i)) {
1233 return i; 1231 cpu = i;
1232 goto unlock;
1233 }
1234 }
1234 } 1235 }
1236unlock:
1237 rcu_read_unlock();
1235 return cpu; 1238 return cpu;
1236} 1239}
1237/* 1240/*
@@ -1341,15 +1344,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341{ 1344{
1342 u64 tmp; 1345 u64 tmp;
1343 1346
1347 /*
1348 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1349 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1350 * 2^SCHED_LOAD_RESOLUTION.
1351 */
1352 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1353 tmp = (u64)delta_exec * scale_load_down(weight);
1354 else
1355 tmp = (u64)delta_exec;
1356
1344 if (!lw->inv_weight) { 1357 if (!lw->inv_weight) {
1345 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1358 unsigned long w = scale_load_down(lw->weight);
1359
1360 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1346 lw->inv_weight = 1; 1361 lw->inv_weight = 1;
1362 else if (unlikely(!w))
1363 lw->inv_weight = WMULT_CONST;
1347 else 1364 else
1348 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1365 lw->inv_weight = WMULT_CONST / w;
1349 / (lw->weight+1);
1350 } 1366 }
1351 1367
1352 tmp = (u64)delta_exec * weight;
1353 /* 1368 /*
1354 * Check whether we'd overflow the 64-bit multiplication: 1369 * Check whether we'd overflow the 64-bit multiplication:
1355 */ 1370 */
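
calc_delta_mine() above is, at heart, "delta * weight / lw->weight" computed without a division per call: the reciprocal of the total weight is cached as a fixed-point inv_weight and applied with a multiply and shift. A simplified userspace version of the idea (the shift and constant are illustrative 32.32 fixed point, not necessarily the kernel's WMULT_CONST/WMULT_SHIFT, and the overflow guard of the real code is omitted):

#include <stdint.h>
#include <stdio.h>

#define INV_SHIFT 32
#define INV_CONST (1ULL << INV_SHIFT)

struct load {
        unsigned long weight;
        uint64_t inv_weight;      /* ~ INV_CONST / weight, cached lazily */
};

static uint64_t weighted_delta(uint64_t delta, unsigned long weight,
                               struct load *lw)
{
        if (!lw->inv_weight)
                lw->inv_weight = INV_CONST / (lw->weight ? lw->weight : 1);

        /* delta * weight / lw->weight, via the cached reciprocal */
        return (delta * weight * lw->inv_weight) >> INV_SHIFT;
}

int main(void)
{
        struct load lw = { .weight = 3072 };    /* e.g. three nice-0 entities */

        /* a weight-1024 entity gets roughly a third of a 6 ms slice */
        printf("%llu ns\n",
               (unsigned long long)weighted_delta(6000000, 1024, &lw));
        return 0;
}
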
@@ -1374,6 +1389,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1374 lw->inv_weight = 0; 1389 lw->inv_weight = 0;
1375} 1390}
1376 1391
1392static inline void update_load_set(struct load_weight *lw, unsigned long w)
1393{
1394 lw->weight = w;
1395 lw->inv_weight = 0;
1396}
1397
1377/* 1398/*
1378 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1399 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1379 * of tasks with abnormal "nice" values across CPUs the contribution that 1400 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1562,101 +1583,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1562 1583
1563#ifdef CONFIG_FAIR_GROUP_SCHED 1584#ifdef CONFIG_FAIR_GROUP_SCHED
1564 1585
1565static __read_mostly unsigned long __percpu *update_shares_data;
1566
1567static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1568
1569/*
1570 * Calculate and set the cpu's group shares.
1571 */
1572static void update_group_shares_cpu(struct task_group *tg, int cpu,
1573 unsigned long sd_shares,
1574 unsigned long sd_rq_weight,
1575 unsigned long *usd_rq_weight)
1576{
1577 unsigned long shares, rq_weight;
1578 int boost = 0;
1579
1580 rq_weight = usd_rq_weight[cpu];
1581 if (!rq_weight) {
1582 boost = 1;
1583 rq_weight = NICE_0_LOAD;
1584 }
1585
1586 /*
1587 * \Sum_j shares_j * rq_weight_i
1588 * shares_i = -----------------------------
1589 * \Sum_j rq_weight_j
1590 */
1591 shares = (sd_shares * rq_weight) / sd_rq_weight;
1592 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1593
1594 if (abs(shares - tg->se[cpu]->load.weight) >
1595 sysctl_sched_shares_thresh) {
1596 struct rq *rq = cpu_rq(cpu);
1597 unsigned long flags;
1598
1599 raw_spin_lock_irqsave(&rq->lock, flags);
1600 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1601 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1602 __set_se_shares(tg->se[cpu], shares);
1603 raw_spin_unlock_irqrestore(&rq->lock, flags);
1604 }
1605}
1606
1607/*
1608 * Re-compute the task group their per cpu shares over the given domain.
1609 * This needs to be done in a bottom-up fashion because the rq weight of a
1610 * parent group depends on the shares of its child groups.
1611 */
1612static int tg_shares_up(struct task_group *tg, void *data)
1613{
1614 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1615 unsigned long *usd_rq_weight;
1616 struct sched_domain *sd = data;
1617 unsigned long flags;
1618 int i;
1619
1620 if (!tg->se[0])
1621 return 0;
1622
1623 local_irq_save(flags);
1624 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1625
1626 for_each_cpu(i, sched_domain_span(sd)) {
1627 weight = tg->cfs_rq[i]->load.weight;
1628 usd_rq_weight[i] = weight;
1629
1630 rq_weight += weight;
1631 /*
1632 * If there are currently no tasks on the cpu pretend there
1633 * is one of average load so that when a new task gets to
1634 * run here it will not get delayed by group starvation.
1635 */
1636 if (!weight)
1637 weight = NICE_0_LOAD;
1638
1639 sum_weight += weight;
1640 shares += tg->cfs_rq[i]->shares;
1641 }
1642
1643 if (!rq_weight)
1644 rq_weight = sum_weight;
1645
1646 if ((!shares && rq_weight) || shares > tg->shares)
1647 shares = tg->shares;
1648
1649 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1650 shares = tg->shares;
1651
1652 for_each_cpu(i, sched_domain_span(sd))
1653 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1654
1655 local_irq_restore(flags);
1656
1657 return 0;
1658}
1659
1660/* 1586/*
1661 * Compute the cpu's hierarchical load factor for each task group. 1587 * Compute the cpu's hierarchical load factor for each task group.
1662 * This needs to be done in a top-down fashion because the load of a child 1588 * This needs to be done in a top-down fashion because the load of a child
@@ -1671,7 +1597,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1671 load = cpu_rq(cpu)->load.weight; 1597 load = cpu_rq(cpu)->load.weight;
1672 } else { 1598 } else {
1673 load = tg->parent->cfs_rq[cpu]->h_load; 1599 load = tg->parent->cfs_rq[cpu]->h_load;
1674 load *= tg->cfs_rq[cpu]->shares; 1600 load *= tg->se[cpu]->load.weight;
1675 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1601 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1676 } 1602 }
1677 1603
@@ -1680,34 +1606,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1680 return 0; 1606 return 0;
1681} 1607}
1682 1608
1683static void update_shares(struct sched_domain *sd)
1684{
1685 s64 elapsed;
1686 u64 now;
1687
1688 if (root_task_group_empty())
1689 return;
1690
1691 now = local_clock();
1692 elapsed = now - sd->last_update;
1693
1694 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1695 sd->last_update = now;
1696 walk_tg_tree(tg_nop, tg_shares_up, sd);
1697 }
1698}
1699
1700static void update_h_load(long cpu) 1609static void update_h_load(long cpu)
1701{ 1610{
1702 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1611 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1703} 1612}
1704 1613
1705#else
1706
1707static inline void update_shares(struct sched_domain *sd)
1708{
1709}
1710
1711#endif 1614#endif
1712 1615
1713#ifdef CONFIG_PREEMPT 1616#ifdef CONFIG_PREEMPT
@@ -1827,15 +1730,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1827 __release(rq2->lock); 1730 __release(rq2->lock);
1828} 1731}
1829 1732
1830#endif 1733#else /* CONFIG_SMP */
1831 1734
1832#ifdef CONFIG_FAIR_GROUP_SCHED 1735/*
1833static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1736 * double_rq_lock - safely lock two runqueues
1737 *
1738 * Note this does not disable interrupts like task_rq_lock,
1739 * you need to do so manually before calling.
1740 */
1741static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1742 __acquires(rq1->lock)
1743 __acquires(rq2->lock)
1834{ 1744{
1835#ifdef CONFIG_SMP 1745 BUG_ON(!irqs_disabled());
1836 cfs_rq->shares = shares; 1746 BUG_ON(rq1 != rq2);
1837#endif 1747 raw_spin_lock(&rq1->lock);
1748 __acquire(rq2->lock); /* Fake it out ;) */
1838} 1749}
1750
1751/*
1752 * double_rq_unlock - safely unlock two runqueues
1753 *
1754 * Note this does not restore interrupts like task_rq_unlock,
1755 * you need to do so manually after calling.
1756 */
1757static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1758 __releases(rq1->lock)
1759 __releases(rq2->lock)
1760{
1761 BUG_ON(rq1 != rq2);
1762 raw_spin_unlock(&rq1->lock);
1763 __release(rq2->lock);
1764}
1765
1839#endif 1766#endif
1840 1767
1841static void calc_load_account_idle(struct rq *this_rq); 1768static void calc_load_account_idle(struct rq *this_rq);
@@ -1877,23 +1804,20 @@ static void dec_nr_running(struct rq *rq)
1877 1804
1878static void set_load_weight(struct task_struct *p) 1805static void set_load_weight(struct task_struct *p)
1879{ 1806{
1880 if (task_has_rt_policy(p)) { 1807 int prio = p->static_prio - MAX_RT_PRIO;
1881 p->se.load.weight = 0; 1808 struct load_weight *load = &p->se.load;
1882 p->se.load.inv_weight = WMULT_CONST;
1883 return;
1884 }
1885 1809
1886 /* 1810 /*
1887 * SCHED_IDLE tasks get minimal weight: 1811 * SCHED_IDLE tasks get minimal weight:
1888 */ 1812 */
1889 if (p->policy == SCHED_IDLE) { 1813 if (p->policy == SCHED_IDLE) {
1890 p->se.load.weight = WEIGHT_IDLEPRIO; 1814 load->weight = scale_load(WEIGHT_IDLEPRIO);
1891 p->se.load.inv_weight = WMULT_IDLEPRIO; 1815 load->inv_weight = WMULT_IDLEPRIO;
1892 return; 1816 return;
1893 } 1817 }
1894 1818
1895 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1819 load->weight = scale_load(prio_to_weight[prio]);
1896 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1820 load->inv_weight = prio_to_wmult[prio];
1897} 1821}
1898 1822
1899static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1823static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
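
set_load_weight() above now scales the table entries with scale_load(); the table itself maps nice levels to weights such that nice 0 is 1024 and each nice step changes the weight by roughly 25%, i.e. about 10% of CPU relative to a peer. The precomputed prio_to_weight[] values are authoritative; the formula below is only an approximation of them, shown for intuition:

#include <math.h>
#include <stdio.h>

/* Rough stand-in for prio_to_weight[]: weight(nice) ~ 1024 / 1.25^nice. */
static long approx_weight(int nice)
{
        return lround(1024.0 / pow(1.25, nice));
}

int main(void)
{
        for (int nice = -5; nice <= 5; nice += 5)
                printf("nice %+d  ~weight %ld\n", nice, approx_weight(nice));
        return 0;          /* build with -lm */
}
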
@@ -1901,7 +1825,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1901 update_rq_clock(rq); 1825 update_rq_clock(rq);
1902 sched_info_queued(p); 1826 sched_info_queued(p);
1903 p->sched_class->enqueue_task(rq, p, flags); 1827 p->sched_class->enqueue_task(rq, p, flags);
1904 p->se.on_rq = 1;
1905} 1828}
1906 1829
1907static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1830static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1909,7 +1832,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1909 update_rq_clock(rq); 1832 update_rq_clock(rq);
1910 sched_info_dequeued(p); 1833 sched_info_dequeued(p);
1911 p->sched_class->dequeue_task(rq, p, flags); 1834 p->sched_class->dequeue_task(rq, p, flags);
1912 p->se.on_rq = 0;
1913} 1835}
1914 1836
1915/* 1837/*
@@ -1936,14 +1858,227 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1936 dec_nr_running(rq); 1858 dec_nr_running(rq);
1937} 1859}
1938 1860
1861#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1862
1863/*
1864 * There are no locks covering percpu hardirq/softirq time.
1865 * They are only modified in account_system_vtime, on corresponding CPU
1866 * with interrupts disabled. So, writes are safe.
1867 * They are read and saved off onto struct rq in update_rq_clock().
1868 * This may result in other CPU reading this CPU's irq time and can
1869 * race with irq/account_system_vtime on this CPU. We would either get old
1870 * or new value with a side effect of accounting a slice of irq time to wrong
1871 * task when irq is in progress while we read rq->clock. That is a worthy
1872 * compromise in place of having locks on each irq in account_system_time.
1873 */
1874static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1875static DEFINE_PER_CPU(u64, cpu_softirq_time);
1876
1877static DEFINE_PER_CPU(u64, irq_start_time);
1878static int sched_clock_irqtime;
1879
1880void enable_sched_clock_irqtime(void)
1881{
1882 sched_clock_irqtime = 1;
1883}
1884
1885void disable_sched_clock_irqtime(void)
1886{
1887 sched_clock_irqtime = 0;
1888}
1889
1890#ifndef CONFIG_64BIT
1891static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1892
1893static inline void irq_time_write_begin(void)
1894{
1895 __this_cpu_inc(irq_time_seq.sequence);
1896 smp_wmb();
1897}
1898
1899static inline void irq_time_write_end(void)
1900{
1901 smp_wmb();
1902 __this_cpu_inc(irq_time_seq.sequence);
1903}
1904
1905static inline u64 irq_time_read(int cpu)
1906{
1907 u64 irq_time;
1908 unsigned seq;
1909
1910 do {
1911 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1912 irq_time = per_cpu(cpu_softirq_time, cpu) +
1913 per_cpu(cpu_hardirq_time, cpu);
1914 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1915
1916 return irq_time;
1917}
1918#else /* CONFIG_64BIT */
1919static inline void irq_time_write_begin(void)
1920{
1921}
1922
1923static inline void irq_time_write_end(void)
1924{
1925}
1926
1927static inline u64 irq_time_read(int cpu)
1928{
1929 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1930}
1931#endif /* CONFIG_64BIT */
1932
1933/*
1934 * Called before incrementing preempt_count on {soft,}irq_enter
1935 * and before decrementing preempt_count on {soft,}irq_exit.
1936 */
1937void account_system_vtime(struct task_struct *curr)
1938{
1939 unsigned long flags;
1940 s64 delta;
1941 int cpu;
1942
1943 if (!sched_clock_irqtime)
1944 return;
1945
1946 local_irq_save(flags);
1947
1948 cpu = smp_processor_id();
1949 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1950 __this_cpu_add(irq_start_time, delta);
1951
1952 irq_time_write_begin();
1953 /*
1954 * We do not account for softirq time from ksoftirqd here.
1955 * We want to continue accounting softirq time to ksoftirqd thread
1956 * in that case, so as not to confuse scheduler with a special task
1957 * that do not consume any time, but still wants to run.
1958 */
1959 if (hardirq_count())
1960 __this_cpu_add(cpu_hardirq_time, delta);
1961 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1962 __this_cpu_add(cpu_softirq_time, delta);
1963
1964 irq_time_write_end();
1965 local_irq_restore(flags);
1966}
1967EXPORT_SYMBOL_GPL(account_system_vtime);
1968
1969static void update_rq_clock_task(struct rq *rq, s64 delta)
1970{
1971 s64 irq_delta;
1972
1973 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1974
1975 /*
1976 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1977 * this case when a previous update_rq_clock() happened inside a
1978 * {soft,}irq region.
1979 *
1980 * When this happens, we stop ->clock_task and only update the
1981 * prev_irq_time stamp to account for the part that fit, so that a next
1982 * update will consume the rest. This ensures ->clock_task is
1983 * monotonic.
1984 *
1985 * It does however cause some slight miss-attribution of {soft,}irq
1986 * time, a more accurate solution would be to update the irq_time using
1987 * the current rq->clock timestamp, except that would require using
1988 * atomic ops.
1989 */
1990 if (irq_delta > delta)
1991 irq_delta = delta;
1992
1993 rq->prev_irq_time += irq_delta;
1994 delta -= irq_delta;
1995 rq->clock_task += delta;
1996
1997 if (irq_delta && sched_feat(NONIRQ_POWER))
1998 sched_rt_avg_update(rq, irq_delta);
1999}
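
Aside, not part of the diff: a tiny standalone illustration of the clamping above. If 7 ms of irq time accumulated while rq->clock only advanced 5 ms, ->clock_task does not advance at all and the remaining 2 ms is consumed by the next update (the numbers are made up):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        int64_t clock_task = 0, prev_irq_time = 0;
        int64_t delta = 5000000, irq_delta = 7000000;   /* ns since last update */

        if (irq_delta > delta)          /* clamp, as in update_rq_clock_task() */
                irq_delta = delta;

        prev_irq_time += irq_delta;
        delta -= irq_delta;
        clock_task += delta;

        assert(clock_task == 0);            /* task clock did not advance */
        assert(prev_irq_time == 5000000);   /* 2 ms of irq time left for the next update */
        return 0;
}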
2000
2001static int irqtime_account_hi_update(void)
2002{
2003 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2004 unsigned long flags;
2005 u64 latest_ns;
2006 int ret = 0;
2007
2008 local_irq_save(flags);
2009 latest_ns = this_cpu_read(cpu_hardirq_time);
2010 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
2011 ret = 1;
2012 local_irq_restore(flags);
2013 return ret;
2014}
2015
2016static int irqtime_account_si_update(void)
2017{
2018 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2019 unsigned long flags;
2020 u64 latest_ns;
2021 int ret = 0;
2022
2023 local_irq_save(flags);
2024 latest_ns = this_cpu_read(cpu_softirq_time);
2025 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
2026 ret = 1;
2027 local_irq_restore(flags);
2028 return ret;
2029}
2030
2031#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2032
2033#define sched_clock_irqtime (0)
2034
2035static void update_rq_clock_task(struct rq *rq, s64 delta)
2036{
2037 rq->clock_task += delta;
2038}
2039
2040#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2041
1939#include "sched_idletask.c" 2042#include "sched_idletask.c"
1940#include "sched_fair.c" 2043#include "sched_fair.c"
1941#include "sched_rt.c" 2044#include "sched_rt.c"
2045#include "sched_autogroup.c"
2046#include "sched_stoptask.c"
1942#include "../litmus/sched_litmus.c" 2047#include "../litmus/sched_litmus.c"
1943#ifdef CONFIG_SCHED_DEBUG 2048#ifdef CONFIG_SCHED_DEBUG
1944# include "sched_debug.c" 2049# include "sched_debug.c"
1945#endif 2050#endif
1946 2051
2052void sched_set_stop_task(int cpu, struct task_struct *stop)
2053{
2054 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2055 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2056
2057 if (stop) {
2058 /*
2059 * Make it appear like a SCHED_FIFO task; it's something
2060 * userspace knows about and won't get confused about.
2061 *
2062 * Also, it will make PI more or less work without too
2063 * much confusion -- but then, stop work should not
2064 * rely on PI working anyway.
2065 */
2066 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2067
2068 stop->sched_class = &stop_sched_class;
2069 }
2070
2071 cpu_rq(cpu)->stop = stop;
2072
2073 if (old_stop) {
2074 /*
2075 * Reset it back to a normal scheduling class so that
2076 * it can die in pieces.
2077 */
2078 old_stop->sched_class = &rt_sched_class;
2079 }
2080}
2081
1947/* 2082/*
1948 * __normal_prio - return the priority that is based on the static prio 2083 * __normal_prio - return the priority that is based on the static prio
1949 */ 2084 */
@@ -2001,14 +2136,43 @@ inline int task_curr(const struct task_struct *p)
2001 2136
2002static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2137static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2003 const struct sched_class *prev_class, 2138 const struct sched_class *prev_class,
2004 int oldprio, int running) 2139 int oldprio)
2005{ 2140{
2006 if (prev_class != p->sched_class) { 2141 if (prev_class != p->sched_class) {
2007 if (prev_class->switched_from) 2142 if (prev_class->switched_from)
2008 prev_class->switched_from(rq, p, running); 2143 prev_class->switched_from(rq, p);
2009 p->sched_class->switched_to(rq, p, running); 2144 p->sched_class->switched_to(rq, p);
2010 } else 2145 } else if (oldprio != p->prio)
2011 p->sched_class->prio_changed(rq, p, oldprio, running); 2146 p->sched_class->prio_changed(rq, p, oldprio);
2147}
2148
2149static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2150{
2151 const struct sched_class *class;
2152
2153 if (p->sched_class == rq->curr->sched_class) {
2154 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2155 } else {
2156 for_each_class(class) {
2157 if (class == rq->curr->sched_class)
2158 break;
2159 if (class == p->sched_class) {
2160 resched_task(rq->curr);
2161 break;
2162 }
2163 }
2164 }
2165
2166 /*
2167 * A queue event has occurred, and we're going to schedule. In
2168 * this case, we can save a useless back-to-back clock update.
2169 */
2170 /* LITMUS^RT:
2171 * The "disable-clock-update" approach was buggy in Linux 2.6.36.
2172 * The issue has been solved in 2.6.37.
2173 */
2174 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2175 rq->skip_clock_update = 1;
2012} 2176}
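
Aside, not part of the diff: the cross-class branch of check_preempt_curr() reduces to "whichever class appears first in the priority-ordered class list wins". A hypothetical standalone sketch; the class names and their order are illustrative only:

#include <stdio.h>

/* indices ordered from highest to lowest scheduling-class priority */
enum demo_class { DEMO_STOP, DEMO_RT, DEMO_FAIR, DEMO_IDLE, DEMO_NR_CLASSES };

/* returns 1 if a waking task of class 'waking' should preempt a running
 * task of a *different* class 'curr' -- mirrors the for_each_class() walk */
static int demo_cross_class_preempts(enum demo_class waking, enum demo_class curr)
{
        for (int c = 0; c < DEMO_NR_CLASSES; c++) {
                if (c == curr)
                        return 0;       /* met the current class first: no resched */
                if (c == waking)
                        return 1;       /* met the waking class first: resched */
        }
        return 0;
}

int main(void)
{
        printf("%d\n", demo_cross_class_preempts(DEMO_RT, DEMO_FAIR));  /* prints 1 */
        printf("%d\n", demo_cross_class_preempts(DEMO_FAIR, DEMO_RT));  /* prints 0 */
        return 0;
}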
2013 2177
2014#ifdef CONFIG_SMP 2178#ifdef CONFIG_SMP
@@ -2023,6 +2187,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2023 if (p->sched_class != &fair_sched_class) 2187 if (p->sched_class != &fair_sched_class)
2024 return 0; 2188 return 0;
2025 2189
2190 if (unlikely(p->policy == SCHED_IDLE))
2191 return 0;
2192
2026 /* 2193 /*
2027 * Buddy candidates are cache hot: 2194 * Buddy candidates are cache hot:
2028 */ 2195 */
@@ -2050,6 +2217,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2050 */ 2217 */
2051 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2218 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2052 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2219 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2220
2221#ifdef CONFIG_LOCKDEP
2222 /*
2223 * The caller should hold either p->pi_lock or rq->lock, when changing
2224 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2225 *
2226 * sched_move_task() holds both and thus holding either pins the cgroup,
2227 * see set_task_rq().
2228 *
2229 * Furthermore, all task_rq users should acquire both locks, see
2230 * task_rq_lock().
2231 */
2232 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2233 lockdep_is_held(&task_rq(p)->lock)));
2234#endif
2053#endif 2235#endif
2054 2236
2055 trace_sched_migrate_task(p, new_cpu); 2237 trace_sched_migrate_task(p, new_cpu);
@@ -2070,21 +2252,6 @@ struct migration_arg {
2070static int migration_cpu_stop(void *data); 2252static int migration_cpu_stop(void *data);
2071 2253
2072/* 2254/*
2073 * The task's runqueue lock must be held.
2074 * Returns true if you have to wait for migration thread.
2075 */
2076static bool migrate_task(struct task_struct *p, int dest_cpu)
2077{
2078 struct rq *rq = task_rq(p);
2079
2080 /*
2081 * If the task is not on a runqueue (and not running), then
2082 * the next wake-up will properly place the task.
2083 */
2084 return p->se.on_rq || task_running(rq, p);
2085}
2086
2087/*
2088 * wait_task_inactive - wait for a thread to unschedule. 2255 * wait_task_inactive - wait for a thread to unschedule.
2089 * 2256 *
2090 * If @match_state is nonzero, it's the @p->state value just checked and 2257 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2141,11 +2308,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2141 rq = task_rq_lock(p, &flags); 2308 rq = task_rq_lock(p, &flags);
2142 trace_sched_wait_task(p); 2309 trace_sched_wait_task(p);
2143 running = task_running(rq, p); 2310 running = task_running(rq, p);
2144 on_rq = p->se.on_rq; 2311 on_rq = p->on_rq;
2145 ncsw = 0; 2312 ncsw = 0;
2146 if (!match_state || p->state == match_state) 2313 if (!match_state || p->state == match_state)
2147 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2314 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2148 task_rq_unlock(rq, &flags); 2315 task_rq_unlock(rq, p, &flags);
2149 2316
2150 /* 2317 /*
2151 * If it changed from the expected state, bail out now. 2318 * If it changed from the expected state, bail out now.
@@ -2174,7 +2341,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2174 * yield - it could be a while. 2341 * yield - it could be a while.
2175 */ 2342 */
2176 if (unlikely(on_rq)) { 2343 if (unlikely(on_rq)) {
2177 schedule_timeout_uninterruptible(1); 2344 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2345
2346 set_current_state(TASK_UNINTERRUPTIBLE);
2347 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2178 continue; 2348 continue;
2179 } 2349 }
2180 2350
@@ -2196,7 +2366,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2196 * Cause a process which is running on another CPU to enter 2366 * Cause a process which is running on another CPU to enter
2197 * kernel-mode, without any delay. (to get signals handled.) 2367 * kernel-mode, without any delay. (to get signals handled.)
2198 * 2368 *
2199 * NOTE: this function doesnt have to take the runqueue lock, 2369 * NOTE: this function doesn't have to take the runqueue lock,
2200 * because all it wants to ensure is that the remote task enters 2370 * because all it wants to ensure is that the remote task enters
2201 * the kernel. If the IPI races and the task has been migrated 2371 * the kernel. If the IPI races and the task has been migrated
2202 * to another CPU then no harm is done and the purpose has been 2372 * to another CPU then no harm is done and the purpose has been
@@ -2215,30 +2385,9 @@ void kick_process(struct task_struct *p)
2215EXPORT_SYMBOL_GPL(kick_process); 2385EXPORT_SYMBOL_GPL(kick_process);
2216#endif /* CONFIG_SMP */ 2386#endif /* CONFIG_SMP */
2217 2387
2218/**
2219 * task_oncpu_function_call - call a function on the cpu on which a task runs
2220 * @p: the task to evaluate
2221 * @func: the function to be called
2222 * @info: the function call argument
2223 *
2224 * Calls the function @func when the task is currently running. This might
2225 * be on the current CPU, which just calls the function directly
2226 */
2227void task_oncpu_function_call(struct task_struct *p,
2228 void (*func) (void *info), void *info)
2229{
2230 int cpu;
2231
2232 preempt_disable();
2233 cpu = task_cpu(p);
2234 if (task_curr(p))
2235 smp_call_function_single(cpu, func, info, 1);
2236 preempt_enable();
2237}
2238
2239#ifdef CONFIG_SMP 2388#ifdef CONFIG_SMP
2240/* 2389/*
2241 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2390 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2242 */ 2391 */
2243static int select_fallback_rq(int cpu, struct task_struct *p) 2392static int select_fallback_rq(int cpu, struct task_struct *p)
2244{ 2393{
@@ -2256,30 +2405,27 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2256 return dest_cpu; 2405 return dest_cpu;
2257 2406
2258 /* No more Mr. Nice Guy. */ 2407 /* No more Mr. Nice Guy. */
2259 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2408 dest_cpu = cpuset_cpus_allowed_fallback(p);
2260 dest_cpu = cpuset_cpus_allowed_fallback(p); 2409 /*
2261 /* 2410 * Don't tell them about moving exiting tasks or
2262 * Don't tell them about moving exiting tasks or 2411 * kernel threads (both mm NULL), since they never
2263 * kernel threads (both mm NULL), since they never 2412 * leave kernel.
2264 * leave kernel. 2413 */
2265 */ 2414 if (p->mm && printk_ratelimit()) {
2266 if (p->mm && printk_ratelimit()) { 2415 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2267 printk(KERN_INFO "process %d (%s) no " 2416 task_pid_nr(p), p->comm, cpu);
2268 "longer affine to cpu%d\n",
2269 task_pid_nr(p), p->comm, cpu);
2270 }
2271 } 2417 }
2272 2418
2273 return dest_cpu; 2419 return dest_cpu;
2274} 2420}
2275 2421
2276/* 2422/*
2277 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2423 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2278 */ 2424 */
2279static inline 2425static inline
2280int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2426int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2281{ 2427{
2282 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2428 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2283 2429
2284 /* 2430 /*
2285 * In order not to call set_task_cpu() on a blocking task we need 2431 * In order not to call set_task_cpu() on a blocking task we need
@@ -2305,27 +2451,63 @@ static void update_avg(u64 *avg, u64 sample)
2305} 2451}
2306#endif 2452#endif
2307 2453
2308static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2454static void
2309 bool is_sync, bool is_migrate, bool is_local, 2455ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2310 unsigned long en_flags)
2311{ 2456{
2312 schedstat_inc(p, se.statistics.nr_wakeups); 2457#ifdef CONFIG_SCHEDSTATS
2313 if (is_sync) 2458 struct rq *rq = this_rq();
2314 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2459
2315 if (is_migrate) 2460#ifdef CONFIG_SMP
2316 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2461 int this_cpu = smp_processor_id();
2317 if (is_local) 2462
2463 if (cpu == this_cpu) {
2464 schedstat_inc(rq, ttwu_local);
2318 schedstat_inc(p, se.statistics.nr_wakeups_local); 2465 schedstat_inc(p, se.statistics.nr_wakeups_local);
2319 else 2466 } else {
2467 struct sched_domain *sd;
2468
2320 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2469 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2470 rcu_read_lock();
2471 for_each_domain(this_cpu, sd) {
2472 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2473 schedstat_inc(sd, ttwu_wake_remote);
2474 break;
2475 }
2476 }
2477 rcu_read_unlock();
2478 }
2479
2480 if (wake_flags & WF_MIGRATED)
2481 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2482
2483#endif /* CONFIG_SMP */
2484
2485 schedstat_inc(rq, ttwu_count);
2486 schedstat_inc(p, se.statistics.nr_wakeups);
2487
2488 if (wake_flags & WF_SYNC)
2489 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2321 2490
2491#endif /* CONFIG_SCHEDSTATS */
2492}
2493
2494static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2495{
2322 activate_task(rq, p, en_flags); 2496 activate_task(rq, p, en_flags);
2497 p->on_rq = 1;
2498
2499 /* if a worker is waking up, notify workqueue */
2500 if (p->flags & PF_WQ_WORKER)
2501 wq_worker_waking_up(p, cpu_of(rq));
2323} 2502}
2324 2503
2325static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2504/*
2326 int wake_flags, bool success) 2505 * Mark the task runnable and perform wakeup-preemption.
2506 */
2507static void
2508ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2327{ 2509{
2328 trace_sched_wakeup(p, success); 2510 trace_sched_wakeup(p, true);
2329 check_preempt_curr(rq, p, wake_flags); 2511 check_preempt_curr(rq, p, wake_flags);
2330 2512
2331 p->state = TASK_RUNNING; 2513 p->state = TASK_RUNNING;
@@ -2344,9 +2526,151 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2344 rq->idle_stamp = 0; 2526 rq->idle_stamp = 0;
2345 } 2527 }
2346#endif 2528#endif
2347 /* if a worker is waking up, notify workqueue */ 2529}
2348 if ((p->flags & PF_WQ_WORKER) && success) 2530
2349 wq_worker_waking_up(p, cpu_of(rq)); 2531static void
2532ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2533{
2534#ifdef CONFIG_SMP
2535 if (p->sched_contributes_to_load)
2536 rq->nr_uninterruptible--;
2537#endif
2538
2539 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2540 ttwu_do_wakeup(rq, p, wake_flags);
2541}
2542
2543/*
2544 * Called in case the task @p isn't fully descheduled from its runqueue;
2545 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2546 * since all we need to do is flip p->state to TASK_RUNNING, as
2547 * the task is still ->on_rq.
2548 */
2549static int ttwu_remote(struct task_struct *p, int wake_flags)
2550{
2551 struct rq *rq;
2552 int ret = 0;
2553
2554 rq = __task_rq_lock(p);
2555 if (p->on_rq) {
2556 ttwu_do_wakeup(rq, p, wake_flags);
2557 ret = 1;
2558 }
2559 __task_rq_unlock(rq);
2560
2561 return ret;
2562}
2563
2564#ifdef CONFIG_SMP
2565static void sched_ttwu_do_pending(struct task_struct *list)
2566{
2567 struct rq *rq = this_rq();
2568
2569 raw_spin_lock(&rq->lock);
2570
2571 while (list) {
2572 struct task_struct *p = list;
2573 list = list->wake_entry;
2574 ttwu_do_activate(rq, p, 0);
2575 }
2576
2577 raw_spin_unlock(&rq->lock);
2578}
2579
2580#ifdef CONFIG_HOTPLUG_CPU
2581
2582static void sched_ttwu_pending(void)
2583{
2584 struct rq *rq = this_rq();
2585 struct task_struct *list = xchg(&rq->wake_list, NULL);
2586
2587 if (!list)
2588 return;
2589
2590 sched_ttwu_do_pending(list);
2591}
2592
2593#endif /* CONFIG_HOTPLUG_CPU */
2594
2595void scheduler_ipi(void)
2596{
2597 struct rq *rq = this_rq();
2598 struct task_struct *list = xchg(&rq->wake_list, NULL);
2599
2600 if (!list)
2601 return;
2602
2603 /*
2604 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2605 * traditionally all their work was done from the interrupt return
2606 * path. Now that we actually do some work, we need to make sure
2607 * we do call them.
2608 *
2609 * Some archs already do call them, luckily irq_enter/exit nest
2610 * properly.
2611 *
2612 * Arguably we should visit all archs and update all handlers,
2613 * however a fair share of IPIs are still resched only so this would
2614 * somewhat pessimize the simple resched case.
2615 */
2616 irq_enter();
2617 sched_ttwu_do_pending(list);
2618 irq_exit();
2619}
2620
2621static void ttwu_queue_remote(struct task_struct *p, int cpu)
2622{
2623 struct rq *rq = cpu_rq(cpu);
2624 struct task_struct *next = rq->wake_list;
2625
2626 for (;;) {
2627 struct task_struct *old = next;
2628
2629 p->wake_entry = next;
2630 next = cmpxchg(&rq->wake_list, old, p);
2631 if (next == old)
2632 break;
2633 }
2634
2635 if (!next)
2636 smp_send_reschedule(cpu);
2637}
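
Aside, not part of the diff: ttwu_queue_remote() is a lock-free push onto rq->wake_list, with the reschedule IPI sent only when the list goes from empty to non-empty; scheduler_ipi() later drains the whole list with one xchg(). A user-space analogue with C11 atomics (the demo_* names are invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct demo_task {
        struct demo_task *wake_entry;
        int id;
};

static _Atomic(struct demo_task *) demo_wake_list;

/* push p; return true if the caller should send the "IPI" because the
 * list was empty before (mirrors the cmpxchg loop in ttwu_queue_remote) */
static bool demo_queue_remote(struct demo_task *p)
{
        struct demo_task *head = atomic_load(&demo_wake_list);

        do {
                p->wake_entry = head;   /* head is refreshed on CAS failure */
        } while (!atomic_compare_exchange_weak(&demo_wake_list, &head, p));

        return head == NULL;
}

/* drain everything at once, as scheduler_ipi()/sched_ttwu_pending() do */
static struct demo_task *demo_drain(void)
{
        return atomic_exchange(&demo_wake_list, NULL);
}

Sending the IPI only on the empty-to-non-empty transition keeps the IPI rate bounded: concurrent wakers that find a non-empty list simply piggyback on the IPI already in flight.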
2638
2639#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2640static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2641{
2642 struct rq *rq;
2643 int ret = 0;
2644
2645 rq = __task_rq_lock(p);
2646 if (p->on_cpu) {
2647 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2648 ttwu_do_wakeup(rq, p, wake_flags);
2649 ret = 1;
2650 }
2651 __task_rq_unlock(rq);
2652
2653 return ret;
2654
2655}
2656#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2657#endif /* CONFIG_SMP */
2658
2659static void ttwu_queue(struct task_struct *p, int cpu)
2660{
2661 struct rq *rq = cpu_rq(cpu);
2662
2663#if defined(CONFIG_SMP)
2664 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2665 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2666 ttwu_queue_remote(p, cpu);
2667 return;
2668 }
2669#endif
2670
2671 raw_spin_lock(&rq->lock);
2672 ttwu_do_activate(rq, p, 0);
2673 raw_spin_unlock(&rq->lock);
2350} 2674}
2351 2675
2352/** 2676/**
@@ -2364,97 +2688,79 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2364 * Returns %true if @p was woken up, %false if it was already running 2688 * Returns %true if @p was woken up, %false if it was already running
2365 * or @state didn't match @p's state. 2689 * or @state didn't match @p's state.
2366 */ 2690 */
2367static int try_to_wake_up(struct task_struct *p, unsigned int state, 2691static int
2368 int wake_flags) 2692try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2369{ 2693{
2370 int cpu, orig_cpu, this_cpu, success = 0;
2371 unsigned long flags; 2694 unsigned long flags;
2372 unsigned long en_flags = ENQUEUE_WAKEUP; 2695 int cpu, success = 0;
2373 struct rq *rq;
2374 2696
2375 if (is_realtime(p)) 2697 if (is_realtime(p))
2376 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); 2698 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
2377 2699
2378 this_cpu = get_cpu();
2379
2380 smp_wmb(); 2700 smp_wmb();
2381 rq = task_rq_lock(p, &flags); 2701 raw_spin_lock_irqsave(&p->pi_lock, flags);
2382 if (!(p->state & state)) 2702 if (!(p->state & state))
2383 goto out; 2703 goto out;
2384 2704
2385 if (p->se.on_rq) 2705 success = 1; /* we're going to change ->state */
2386 goto out_running;
2387
2388 cpu = task_cpu(p); 2706 cpu = task_cpu(p);
2389 orig_cpu = cpu;
2390 2707
2391#ifdef CONFIG_SMP 2708 if (p->on_rq && ttwu_remote(p, wake_flags))
2392 if (unlikely(task_running(rq, p)) || is_realtime(p)) 2709 goto stat;
2393 goto out_activate;
2394 2710
2711#ifdef CONFIG_SMP
2395 /* 2712 /*
2396 * In order to handle concurrent wakeups and release the rq->lock 2713 * If the owning (remote) cpu is still in the middle of schedule() with
2397 * we put the task in TASK_WAKING state. 2714 * this task as prev, wait until it's done referencing the task.
2398 *
2399 * First fix up the nr_uninterruptible count:
2400 */ 2715 */
2401 if (task_contributes_to_load(p)) { 2716 while (p->on_cpu) {
2402 if (likely(cpu_online(orig_cpu))) 2717#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2403 rq->nr_uninterruptible--; 2718 /*
2404 else 2719 * In case the architecture enables interrupts in
2405 this_rq()->nr_uninterruptible--; 2720 * context_switch(), we cannot busy wait, since that
2406 } 2721 * would lead to deadlocks when an interrupt hits and
2407 p->state = TASK_WAKING; 2722 * tries to wake up @prev. So bail and do a complete
2408 2723 * remote wakeup.
2409 if (p->sched_class->task_waking) { 2724 */
2410 p->sched_class->task_waking(rq, p); 2725 if (ttwu_activate_remote(p, wake_flags))
2411 en_flags |= ENQUEUE_WAKING; 2726 goto stat;
2727#else
2728 cpu_relax();
2729#endif
2412 } 2730 }
2731 /*
2732 * Pairs with the smp_wmb() in finish_lock_switch().
2733 */
2734 smp_rmb();
2413 2735
2414 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2736 /* LITMUS^RT: once the task can be safely referenced by this
2415 if (cpu != orig_cpu) 2737 * CPU, don't mess with the Linux load-balancing code.
2416 set_task_cpu(p, cpu); 2738 */
2417 __task_rq_unlock(rq); 2739 if (is_realtime(p))
2740 goto litmus_out_activate;
2418 2741
2419 rq = cpu_rq(cpu); 2742 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2420 raw_spin_lock(&rq->lock); 2743 p->state = TASK_WAKING;
2421 2744
2422 /* 2745 if (p->sched_class->task_waking)
2423 * We migrated the task without holding either rq->lock, however 2746 p->sched_class->task_waking(p);
2424 * since the task is not on the task list itself, nobody else
2425 * will try and migrate the task, hence the rq should match the
2426 * cpu we just moved it to.
2427 */
2428 WARN_ON(task_cpu(p) != cpu);
2429 WARN_ON(p->state != TASK_WAKING);
2430 2747
2431#ifdef CONFIG_SCHEDSTATS 2748 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2432 schedstat_inc(rq, ttwu_count); 2749 if (task_cpu(p) != cpu) {
2433 if (cpu == this_cpu) 2750 wake_flags |= WF_MIGRATED;
2434 schedstat_inc(rq, ttwu_local); 2751 set_task_cpu(p, cpu);
2435 else {
2436 struct sched_domain *sd;
2437 for_each_domain(this_cpu, sd) {
2438 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2439 schedstat_inc(sd, ttwu_wake_remote);
2440 break;
2441 }
2442 }
2443 } 2752 }
2444#endif /* CONFIG_SCHEDSTATS */
2445 2753
2446out_activate: 2754litmus_out_activate:
2447#endif /* CONFIG_SMP */ 2755#endif /* CONFIG_SMP */
2448 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2756
2449 cpu == this_cpu, en_flags); 2757 ttwu_queue(p, cpu);
2450 success = 1; 2758stat:
2451out_running: 2759 ttwu_stat(p, cpu, wake_flags);
2452 ttwu_post_activation(p, rq, wake_flags, success);
2453out: 2760out:
2454 if (is_realtime(p)) 2761 if (is_realtime(p))
2455 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); 2762 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
2456 task_rq_unlock(rq, &flags); 2763 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2457 put_cpu();
2458 2764
2459 return success; 2765 return success;
2460} 2766}
@@ -2463,31 +2769,34 @@ out:
2463 * try_to_wake_up_local - try to wake up a local task with rq lock held 2769 * try_to_wake_up_local - try to wake up a local task with rq lock held
2464 * @p: the thread to be awakened 2770 * @p: the thread to be awakened
2465 * 2771 *
2466 * Put @p on the run-queue if it's not alredy there. The caller must 2772 * Put @p on the run-queue if it's not already there. The caller must
2467 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2773 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2468 * the current task. this_rq() stays locked over invocation. 2774 * the current task.
2469 */ 2775 */
2470static void try_to_wake_up_local(struct task_struct *p) 2776static void try_to_wake_up_local(struct task_struct *p)
2471{ 2777{
2472 struct rq *rq = task_rq(p); 2778 struct rq *rq = task_rq(p);
2473 bool success = false;
2474 2779
2475 BUG_ON(rq != this_rq()); 2780 BUG_ON(rq != this_rq());
2476 BUG_ON(p == current); 2781 BUG_ON(p == current);
2477 lockdep_assert_held(&rq->lock); 2782 lockdep_assert_held(&rq->lock);
2478 2783
2784 if (!raw_spin_trylock(&p->pi_lock)) {
2785 raw_spin_unlock(&rq->lock);
2786 raw_spin_lock(&p->pi_lock);
2787 raw_spin_lock(&rq->lock);
2788 }
2789
2479 if (!(p->state & TASK_NORMAL)) 2790 if (!(p->state & TASK_NORMAL))
2480 return; 2791 goto out;
2481 2792
2482 if (!p->se.on_rq) { 2793 if (!p->on_rq)
2483 if (likely(!task_running(rq, p))) { 2794 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2484 schedstat_inc(rq, ttwu_count); 2795
2485 schedstat_inc(rq, ttwu_local); 2796 ttwu_do_wakeup(rq, p, 0);
2486 } 2797 ttwu_stat(p, smp_processor_id(), 0);
2487 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2798out:
2488 success = true; 2799 raw_spin_unlock(&p->pi_lock);
2489 }
2490 ttwu_post_activation(p, rq, 0, success);
2491} 2800}
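
Aside, not part of the diff: the trylock dance at the top of try_to_wake_up_local() exists because the nesting order is p->pi_lock outside rq->lock, yet the caller already holds rq->lock. A generic sketch of that pattern with POSIX mutexes (the names are hypothetical); note that after dropping the inner lock the state it protects must be revalidated, which is why the function re-checks p->state afterwards:

#include <pthread.h>

/* We already hold 'inner', but the global ordering is outer-then-inner.
 * Try the cheap path first; if it fails, drop and re-take in order. */
static void demo_lock_outer_while_holding_inner(pthread_mutex_t *outer,
                                                pthread_mutex_t *inner)
{
        if (pthread_mutex_trylock(outer) != 0) {
                pthread_mutex_unlock(inner);
                pthread_mutex_lock(outer);
                pthread_mutex_lock(inner);
        }
}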
2492 2801
2493/** 2802/**
@@ -2520,18 +2829,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2520 */ 2829 */
2521static void __sched_fork(struct task_struct *p) 2830static void __sched_fork(struct task_struct *p)
2522{ 2831{
2832 p->on_rq = 0;
2833
2834 p->se.on_rq = 0;
2523 p->se.exec_start = 0; 2835 p->se.exec_start = 0;
2524 p->se.sum_exec_runtime = 0; 2836 p->se.sum_exec_runtime = 0;
2525 p->se.prev_sum_exec_runtime = 0; 2837 p->se.prev_sum_exec_runtime = 0;
2526 p->se.nr_migrations = 0; 2838 p->se.nr_migrations = 0;
2839 p->se.vruntime = 0;
2840 INIT_LIST_HEAD(&p->se.group_node);
2527 2841
2528#ifdef CONFIG_SCHEDSTATS 2842#ifdef CONFIG_SCHEDSTATS
2529 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2843 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2530#endif 2844#endif
2531 2845
2532 INIT_LIST_HEAD(&p->rt.run_list); 2846 INIT_LIST_HEAD(&p->rt.run_list);
2533 p->se.on_rq = 0;
2534 INIT_LIST_HEAD(&p->se.group_node);
2535 2847
2536#ifdef CONFIG_PREEMPT_NOTIFIERS 2848#ifdef CONFIG_PREEMPT_NOTIFIERS
2537 INIT_HLIST_HEAD(&p->preempt_notifiers); 2849 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2541,8 +2853,9 @@ static void __sched_fork(struct task_struct *p)
2541/* 2853/*
2542 * fork()/clone()-time setup: 2854 * fork()/clone()-time setup:
2543 */ 2855 */
2544void sched_fork(struct task_struct *p, int clone_flags) 2856void sched_fork(struct task_struct *p)
2545{ 2857{
2858 unsigned long flags;
2546 int cpu = get_cpu(); 2859 int cpu = get_cpu();
2547 2860
2548 __sched_fork(p); 2861 __sched_fork(p);
@@ -2594,22 +2907,24 @@ void sched_fork(struct task_struct *p, int clone_flags)
2594 * 2907 *
2595 * Silence PROVE_RCU. 2908 * Silence PROVE_RCU.
2596 */ 2909 */
2597 rcu_read_lock(); 2910 raw_spin_lock_irqsave(&p->pi_lock, flags);
2598 set_task_cpu(p, cpu); 2911 set_task_cpu(p, cpu);
2599 rcu_read_unlock(); 2912 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2600 2913
2601#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2914#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2602 if (likely(sched_info_on())) 2915 if (likely(sched_info_on()))
2603 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2916 memset(&p->sched_info, 0, sizeof(p->sched_info));
2604#endif 2917#endif
2605#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2918#if defined(CONFIG_SMP)
2606 p->oncpu = 0; 2919 p->on_cpu = 0;
2607#endif 2920#endif
2608#ifdef CONFIG_PREEMPT 2921#ifdef CONFIG_PREEMPT
2609 /* Want to start with kernel preemption disabled. */ 2922 /* Want to start with kernel preemption disabled. */
2610 task_thread_info(p)->preempt_count = 1; 2923 task_thread_info(p)->preempt_count = 1;
2611#endif 2924#endif
2925#ifdef CONFIG_SMP
2612 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2926 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2927#endif
2613 2928
2614 put_cpu(); 2929 put_cpu();
2615} 2930}
@@ -2621,41 +2936,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2621 * that must be done for every newly created context, then puts the task 2936 * that must be done for every newly created context, then puts the task
2622 * on the runqueue and wakes it. 2937 * on the runqueue and wakes it.
2623 */ 2938 */
2624void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2939void wake_up_new_task(struct task_struct *p)
2625{ 2940{
2626 unsigned long flags; 2941 unsigned long flags;
2627 struct rq *rq; 2942 struct rq *rq;
2628 int cpu __maybe_unused = get_cpu();
2629 2943
2944 raw_spin_lock_irqsave(&p->pi_lock, flags);
2630#ifdef CONFIG_SMP 2945#ifdef CONFIG_SMP
2631 rq = task_rq_lock(p, &flags);
2632 p->state = TASK_WAKING;
2633
2634 /* 2946 /*
2635 * Fork balancing, do it here and not earlier because: 2947 * Fork balancing, do it here and not earlier because:
2636 * - cpus_allowed can change in the fork path 2948 * - cpus_allowed can change in the fork path
2637 * - any previously selected cpu might disappear through hotplug 2949 * - any previously selected cpu might disappear through hotplug
2638 *
2639 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2640 * without people poking at ->cpus_allowed.
2641 */ 2950 */
2642 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2951 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2643 set_task_cpu(p, cpu);
2644
2645 p->state = TASK_RUNNING;
2646 task_rq_unlock(rq, &flags);
2647#endif 2952#endif
2648 2953
2649 rq = task_rq_lock(p, &flags); 2954 rq = __task_rq_lock(p);
2650 activate_task(rq, p, 0); 2955 activate_task(rq, p, 0);
2651 trace_sched_wakeup_new(p, 1); 2956 p->on_rq = 1;
2957 trace_sched_wakeup_new(p, true);
2652 check_preempt_curr(rq, p, WF_FORK); 2958 check_preempt_curr(rq, p, WF_FORK);
2653#ifdef CONFIG_SMP 2959#ifdef CONFIG_SMP
2654 if (p->sched_class->task_woken) 2960 if (p->sched_class->task_woken)
2655 p->sched_class->task_woken(rq, p); 2961 p->sched_class->task_woken(rq, p);
2656#endif 2962#endif
2657 task_rq_unlock(rq, &flags); 2963 task_rq_unlock(rq, p, &flags);
2658 put_cpu();
2659} 2964}
2660 2965
2661#ifdef CONFIG_PREEMPT_NOTIFIERS 2966#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2733,9 +3038,12 @@ static inline void
2733prepare_task_switch(struct rq *rq, struct task_struct *prev, 3038prepare_task_switch(struct rq *rq, struct task_struct *prev,
2734 struct task_struct *next) 3039 struct task_struct *next)
2735{ 3040{
3041 sched_info_switch(prev, next);
3042 perf_event_task_sched_out(prev, next);
2736 fire_sched_out_preempt_notifiers(prev, next); 3043 fire_sched_out_preempt_notifiers(prev, next);
2737 prepare_lock_switch(rq, next); 3044 prepare_lock_switch(rq, next);
2738 prepare_arch_switch(next); 3045 prepare_arch_switch(next);
3046 trace_sched_switch(prev, next);
2739} 3047}
2740 3048
2741/** 3049/**
@@ -2879,7 +3187,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2879 struct mm_struct *mm, *oldmm; 3187 struct mm_struct *mm, *oldmm;
2880 3188
2881 prepare_task_switch(rq, prev, next); 3189 prepare_task_switch(rq, prev, next);
2882 trace_sched_switch(prev, next); 3190
2883 mm = next->mm; 3191 mm = next->mm;
2884 oldmm = prev->active_mm; 3192 oldmm = prev->active_mm;
2885 /* 3193 /*
@@ -2889,14 +3197,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2889 */ 3197 */
2890 arch_start_context_switch(prev); 3198 arch_start_context_switch(prev);
2891 3199
2892 if (likely(!mm)) { 3200 if (!mm) {
2893 next->active_mm = oldmm; 3201 next->active_mm = oldmm;
2894 atomic_inc(&oldmm->mm_count); 3202 atomic_inc(&oldmm->mm_count);
2895 enter_lazy_tlb(oldmm, next); 3203 enter_lazy_tlb(oldmm, next);
2896 } else 3204 } else
2897 switch_mm(oldmm, mm, next); 3205 switch_mm(oldmm, mm, next);
2898 3206
2899 if (likely(!prev->mm)) { 3207 if (!prev->mm) {
2900 prev->active_mm = NULL; 3208 prev->active_mm = NULL;
2901 rq->prev_mm = oldmm; 3209 rq->prev_mm = oldmm;
2902 } 3210 }
@@ -3011,6 +3319,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3011 return delta; 3319 return delta;
3012} 3320}
3013 3321
3322static unsigned long
3323calc_load(unsigned long load, unsigned long exp, unsigned long active)
3324{
3325 load *= exp;
3326 load += active * (FIXED_1 - exp);
3327 load += 1UL << (FSHIFT - 1);
3328 return load >> FSHIFT;
3329}
3330
3014#ifdef CONFIG_NO_HZ 3331#ifdef CONFIG_NO_HZ
3015/* 3332/*
3016 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3333 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3040,6 +3357,128 @@ static long calc_load_fold_idle(void)
3040 3357
3041 return delta; 3358 return delta;
3042} 3359}
3360
3361/**
3362 * fixed_power_int - compute: x^n, in O(log n) time
3363 *
3364 * @x: base of the power
3365 * @frac_bits: fractional bits of @x
3366 * @n: power to raise @x to.
3367 *
3368 * By exploiting the relation between the definition of the natural power
3369 * function: x^n := x*x*...*x (x multiplied by itself n times), and
3370 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3371 * (where: n_i \elem {0, 1}, the binary vector representing n),
3372 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3373 * of course trivially computable in O(log_2 n), the length of our binary
3374 * vector.
3375 */
3376static unsigned long
3377fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3378{
3379 unsigned long result = 1UL << frac_bits;
3380
3381 if (n) for (;;) {
3382 if (n & 1) {
3383 result *= x;
3384 result += 1UL << (frac_bits - 1);
3385 result >>= frac_bits;
3386 }
3387 n >>= 1;
3388 if (!n)
3389 break;
3390 x *= x;
3391 x += 1UL << (frac_bits - 1);
3392 x >>= frac_bits;
3393 }
3394
3395 return result;
3396}
3397
3398/*
3399 * a1 = a0 * e + a * (1 - e)
3400 *
3401 * a2 = a1 * e + a * (1 - e)
3402 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3403 * = a0 * e^2 + a * (1 - e) * (1 + e)
3404 *
3405 * a3 = a2 * e + a * (1 - e)
3406 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3407 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3408 *
3409 * ...
3410 *
3411 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3412 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3413 * = a0 * e^n + a * (1 - e^n)
3414 *
3415 * [1] application of the geometric series:
3416 *
3417 * n 1 - x^(n+1)
3418 * S_n := \Sum x^i = -------------
3419 * i=0 1 - x
3420 */
3421static unsigned long
3422calc_load_n(unsigned long load, unsigned long exp,
3423 unsigned long active, unsigned int n)
3424{
3425
3426 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3427}
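
Aside, not part of the diff: a self-contained fixed-point example of the math above, decaying a 1-minute load of 2.00 toward an active count of 0 for one cycle and then for four missed cycles via the O(log n) power helper. FSHIFT, FIXED_1 and EXP_1 are assumed to match their include/linux/sched.h values of the era (11, 2048 and 1884):

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884UL          /* ~ FIXED_1 / exp(5 s / 1 min), assumed value */

/* fixed-point x^n, same idea as fixed_power_int() above */
static unsigned long fp_pow(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        while (n) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

/* one step of the exponential moving average, as in calc_load() above */
static unsigned long load_step(unsigned long load, unsigned long exp, unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long load = 2 * FIXED_1;   /* 1-min average of 2.00, CPU now idle */

        printf("start          = %lu\n", load);                                        /* 4096 */
        printf("after 1 cycle  = %lu\n", load_step(load, EXP_1, 0));                    /* ~3768 */
        printf("after 4 cycles = %lu\n", load_step(load, fp_pow(EXP_1, FSHIFT, 4), 0)); /* ~2932 */
        return 0;
}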
3428
3429/*
3430 * NO_HZ can leave us missing all per-cpu ticks calling
3431 * calc_load_account_active(), but since an idle CPU folds its delta into
3432 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3433 * in the pending idle delta if our idle period crossed a load cycle boundary.
3434 *
3435 * Once we've updated the global active value, we need to apply the exponential
3436 * weights adjusted to the number of cycles missed.
3437 */
3438static void calc_global_nohz(unsigned long ticks)
3439{
3440 long delta, active, n;
3441
3442 if (time_before(jiffies, calc_load_update))
3443 return;
3444
3445 /*
3446 * If we crossed a calc_load_update boundary, make sure to fold
3447 * any pending idle changes, the respective CPUs might have
3448 * missed the tick driven calc_load_account_active() update
3449 * due to NO_HZ.
3450 */
3451 delta = calc_load_fold_idle();
3452 if (delta)
3453 atomic_long_add(delta, &calc_load_tasks);
3454
3455 /*
3456 * If we were idle for multiple load cycles, apply them.
3457 */
3458 if (ticks >= LOAD_FREQ) {
3459 n = ticks / LOAD_FREQ;
3460
3461 active = atomic_long_read(&calc_load_tasks);
3462 active = active > 0 ? active * FIXED_1 : 0;
3463
3464 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3465 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3466 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3467
3468 calc_load_update += n * LOAD_FREQ;
3469 }
3470
3471 /*
3472 * It's possible the remainder of the above division also crosses
3473 * a LOAD_FREQ period; the regular check in calc_global_load(),
3474 * which comes after this, will take care of that.
3475 *
3476 * Consider us being 11 ticks before a cycle completion and
3477 * sleeping for 4*LOAD_FREQ + 22 ticks; then the above code will
3478 * age us 4 cycles, and the test in calc_global_load() will
3479 * pick up the final one.
3480 */
3481}
3043#else 3482#else
3044static void calc_load_account_idle(struct rq *this_rq) 3483static void calc_load_account_idle(struct rq *this_rq)
3045{ 3484{
@@ -3049,6 +3488,10 @@ static inline long calc_load_fold_idle(void)
3049{ 3488{
3050 return 0; 3489 return 0;
3051} 3490}
3491
3492static void calc_global_nohz(unsigned long ticks)
3493{
3494}
3052#endif 3495#endif
3053 3496
3054/** 3497/**
@@ -3066,24 +3509,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3066 loads[2] = (avenrun[2] + offset) << shift; 3509 loads[2] = (avenrun[2] + offset) << shift;
3067} 3510}
3068 3511
3069static unsigned long
3070calc_load(unsigned long load, unsigned long exp, unsigned long active)
3071{
3072 load *= exp;
3073 load += active * (FIXED_1 - exp);
3074 return load >> FSHIFT;
3075}
3076
3077/* 3512/*
3078 * calc_load - update the avenrun load estimates 10 ticks after the 3513 * calc_load - update the avenrun load estimates 10 ticks after the
3079 * CPUs have updated calc_load_tasks. 3514 * CPUs have updated calc_load_tasks.
3080 */ 3515 */
3081void calc_global_load(void) 3516void calc_global_load(unsigned long ticks)
3082{ 3517{
3083 unsigned long upd = calc_load_update + 10;
3084 long active; 3518 long active;
3085 3519
3086 if (time_before(jiffies, upd)) 3520 calc_global_nohz(ticks);
3521
3522 if (time_before(jiffies, calc_load_update + 10))
3087 return; 3523 return;
3088 3524
3089 active = atomic_long_read(&calc_load_tasks); 3525 active = atomic_long_read(&calc_load_tasks);
@@ -3244,27 +3680,22 @@ void sched_exec(void)
3244{ 3680{
3245 struct task_struct *p = current; 3681 struct task_struct *p = current;
3246 unsigned long flags; 3682 unsigned long flags;
3247 struct rq *rq;
3248 int dest_cpu; 3683 int dest_cpu;
3249 3684
3250 rq = task_rq_lock(p, &flags); 3685 raw_spin_lock_irqsave(&p->pi_lock, flags);
3251 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3686 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3252 if (dest_cpu == smp_processor_id()) 3687 if (dest_cpu == smp_processor_id())
3253 goto unlock; 3688 goto unlock;
3254 3689
3255 /* 3690 if (likely(cpu_active(dest_cpu))) {
3256 * select_task_rq() can race against ->cpus_allowed
3257 */
3258 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3259 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3260 struct migration_arg arg = { p, dest_cpu }; 3691 struct migration_arg arg = { p, dest_cpu };
3261 3692
3262 task_rq_unlock(rq, &flags); 3693 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3263 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3694 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3264 return; 3695 return;
3265 } 3696 }
3266unlock: 3697unlock:
3267 task_rq_unlock(rq, &flags); 3698 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3268} 3699}
3269 3700
3270#endif 3701#endif
@@ -3285,7 +3716,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3285 3716
3286 if (task_current(rq, p)) { 3717 if (task_current(rq, p)) {
3287 update_rq_clock(rq); 3718 update_rq_clock(rq);
3288 ns = rq->clock - p->se.exec_start; 3719 ns = rq->clock_task - p->se.exec_start;
3289 if ((s64)ns < 0) 3720 if ((s64)ns < 0)
3290 ns = 0; 3721 ns = 0;
3291 } 3722 }
@@ -3301,7 +3732,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3301 3732
3302 rq = task_rq_lock(p, &flags); 3733 rq = task_rq_lock(p, &flags);
3303 ns = do_task_delta_exec(p, rq); 3734 ns = do_task_delta_exec(p, rq);
3304 task_rq_unlock(rq, &flags); 3735 task_rq_unlock(rq, p, &flags);
3305 3736
3306 return ns; 3737 return ns;
3307} 3738}
@@ -3319,7 +3750,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3319 3750
3320 rq = task_rq_lock(p, &flags); 3751 rq = task_rq_lock(p, &flags);
3321 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3752 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3322 task_rq_unlock(rq, &flags); 3753 task_rq_unlock(rq, p, &flags);
3323 3754
3324 return ns; 3755 return ns;
3325} 3756}
@@ -3343,7 +3774,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3343 rq = task_rq_lock(p, &flags); 3774 rq = task_rq_lock(p, &flags);
3344 thread_group_cputime(p, &totals); 3775 thread_group_cputime(p, &totals);
3345 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3776 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3346 task_rq_unlock(rq, &flags); 3777 task_rq_unlock(rq, p, &flags);
3347 3778
3348 return ns; 3779 return ns;
3349} 3780}
@@ -3408,6 +3839,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3408} 3839}
3409 3840
3410/* 3841/*
3842 * Account system cpu time to a process and desired cpustat field
3843 * @p: the process that the cpu time gets accounted to
3844 * @cputime: the cpu time spent in kernel space since the last update
3845 * @cputime_scaled: cputime scaled by cpu frequency
3846 * @target_cputime64: pointer to cpustat field that has to be updated
3847 */
3848static inline
3849void __account_system_time(struct task_struct *p, cputime_t cputime,
3850 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3851{
3852 cputime64_t tmp = cputime_to_cputime64(cputime);
3853
3854 /* Add system time to process. */
3855 p->stime = cputime_add(p->stime, cputime);
3856 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3857 account_group_system_time(p, cputime);
3858
3859 /* Add system time to cpustat. */
3860 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3861 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3862
3863 /* Account for system time used */
3864 acct_update_integrals(p);
3865}
3866
3867/*
3411 * Account system cpu time to a process. 3868 * Account system cpu time to a process.
3412 * @p: the process that the cpu time gets accounted to 3869 * @p: the process that the cpu time gets accounted to
3413 * @hardirq_offset: the offset to subtract from hardirq_count() 3870 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3418,36 +3875,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3418 cputime_t cputime, cputime_t cputime_scaled) 3875 cputime_t cputime, cputime_t cputime_scaled)
3419{ 3876{
3420 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3877 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3421 cputime64_t tmp; 3878 cputime64_t *target_cputime64;
3422 3879
3423 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3880 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3424 account_guest_time(p, cputime, cputime_scaled); 3881 account_guest_time(p, cputime, cputime_scaled);
3425 return; 3882 return;
3426 } 3883 }
3427 3884
3428 /* Add system time to process. */
3429 p->stime = cputime_add(p->stime, cputime);
3430 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3431 account_group_system_time(p, cputime);
3432
3433 /* Add system time to cpustat. */
3434 tmp = cputime_to_cputime64(cputime);
3435 if (hardirq_count() - hardirq_offset) 3885 if (hardirq_count() - hardirq_offset)
3436 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3886 target_cputime64 = &cpustat->irq;
3437 else if (softirq_count()) 3887 else if (in_serving_softirq())
3438 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3888 target_cputime64 = &cpustat->softirq;
3439 else 3889 else
3440 cpustat->system = cputime64_add(cpustat->system, tmp); 3890 target_cputime64 = &cpustat->system;
3441
3442 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3443 3891
3444 /* Account for system time used */ 3892 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3445 acct_update_integrals(p);
3446} 3893}
3447 3894
3448/* 3895/*
3449 * Account for involuntary wait time. 3896 * Account for involuntary wait time.
3450 * @steal: the cpu time spent in involuntary wait 3897 * @cputime: the cpu time spent in involuntary wait
3451 */ 3898 */
3452void account_steal_time(cputime_t cputime) 3899void account_steal_time(cputime_t cputime)
3453{ 3900{
@@ -3475,6 +3922,73 @@ void account_idle_time(cputime_t cputime)
3475 3922
3476#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3923#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3477 3924
3925#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3926/*
3927 * Account a tick to a process and cpustat
3928 * @p: the process that the cpu time gets accounted to
3929 * @user_tick: whether the tick is from userspace
3930 * @rq: the pointer to rq
3931 *
3932 * Tick demultiplexing follows the order
3933 * - pending hardirq update
3934 * - pending softirq update
3935 * - user_time
3936 * - idle_time
3937 * - system time
3938 * - check for guest_time
3939 * - else account as system_time
3940 *
3941 * The check for hardirq is done for both system and user time, as there is
3942 * no timer going off while we are in a hardirq and hence we may never get an
3943 * opportunity to update it solely on a system-time tick.
3944 * p->stime and friends are updated only on system time, not on irq or
3945 * softirq time, as those no longer count toward task exec_runtime.
3946 */
3947static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3948 struct rq *rq)
3949{
3950 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3951 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3952 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3953
3954 if (irqtime_account_hi_update()) {
3955 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3956 } else if (irqtime_account_si_update()) {
3957 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3958 } else if (this_cpu_ksoftirqd() == p) {
3959 /*
3960 * ksoftirqd time does not get accounted in cpu_softirq_time,
3961 * so we have to handle it separately here.
3962 * Also, p->stime needs to be updated for ksoftirqd.
3963 */
3964 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3965 &cpustat->softirq);
3966 } else if (user_tick) {
3967 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3968 } else if (p == rq->idle) {
3969 account_idle_time(cputime_one_jiffy);
3970 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3971 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3972 } else {
3973 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3974 &cpustat->system);
3975 }
3976}
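
Aside, not part of the diff: the demultiplexing order above restated as a small pure function. The enum and parameter names are made up; the real code consults per-cpu state instead of taking flags:

enum demo_tick_target {
        DEMO_TICK_HARDIRQ, DEMO_TICK_SOFTIRQ, DEMO_TICK_KSOFTIRQD,
        DEMO_TICK_USER, DEMO_TICK_IDLE, DEMO_TICK_GUEST, DEMO_TICK_SYSTEM,
};

static enum demo_tick_target demo_classify_tick(int hi_pending, int si_pending,
                                                int is_ksoftirqd, int user_tick,
                                                int is_idle, int is_guest)
{
        if (hi_pending)   return DEMO_TICK_HARDIRQ;   /* pending hardirq update */
        if (si_pending)   return DEMO_TICK_SOFTIRQ;   /* pending softirq update */
        if (is_ksoftirqd) return DEMO_TICK_KSOFTIRQD; /* charged via __account_system_time() */
        if (user_tick)    return DEMO_TICK_USER;
        if (is_idle)      return DEMO_TICK_IDLE;
        if (is_guest)     return DEMO_TICK_GUEST;
        return DEMO_TICK_SYSTEM;
}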
3977
3978static void irqtime_account_idle_ticks(int ticks)
3979{
3980 int i;
3981 struct rq *rq = this_rq();
3982
3983 for (i = 0; i < ticks; i++)
3984 irqtime_account_process_tick(current, 0, rq);
3985}
3986#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3987static void irqtime_account_idle_ticks(int ticks) {}
3988static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3989 struct rq *rq) {}
3990#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3991
3478/* 3992/*
3479 * Account a single tick of cpu time. 3993 * Account a single tick of cpu time.
3480 * @p: the process that the cpu time gets accounted to 3994 * @p: the process that the cpu time gets accounted to
@@ -3485,6 +3999,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3485 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3999 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3486 struct rq *rq = this_rq(); 4000 struct rq *rq = this_rq();
3487 4001
4002 if (sched_clock_irqtime) {
4003 irqtime_account_process_tick(p, user_tick, rq);
4004 return;
4005 }
4006
3488 if (user_tick) 4007 if (user_tick)
3489 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4008 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3490 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4009 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3510,6 +4029,12 @@ void account_steal_ticks(unsigned long ticks)
3510 */ 4029 */
3511void account_idle_ticks(unsigned long ticks) 4030void account_idle_ticks(unsigned long ticks)
3512{ 4031{
4032
4033 if (sched_clock_irqtime) {
4034 irqtime_account_idle_ticks(ticks);
4035 return;
4036 }
4037
3513 account_idle_time(jiffies_to_cputime(ticks)); 4038 account_idle_time(jiffies_to_cputime(ticks));
3514} 4039}
3515 4040
@@ -3603,9 +4128,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3603/* 4128/*
3604 * This function gets called by the timer code, with HZ frequency. 4129 * This function gets called by the timer code, with HZ frequency.
3605 * We call it with interrupts disabled. 4130 * We call it with interrupts disabled.
3606 *
3607 * It also gets called by the fork code, when changing the parent's
3608 * timeslices.
3609 */ 4131 */
3610void scheduler_tick(void) 4132void scheduler_tick(void)
3611{ 4133{
@@ -3627,7 +4149,7 @@ void scheduler_tick(void)
3627 4149
3628 raw_spin_unlock(&rq->lock); 4150 raw_spin_unlock(&rq->lock);
3629 4151
3630 perf_event_task_tick(curr); 4152 perf_event_task_tick();
3631 4153
3632#ifdef CONFIG_SMP 4154#ifdef CONFIG_SMP
3633 rq->idle_at_tick = idle_cpu(cpu); 4155 rq->idle_at_tick = idle_cpu(cpu);
@@ -3733,19 +4255,12 @@ static inline void schedule_debug(struct task_struct *prev)
3733 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4255 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3734 4256
3735 schedstat_inc(this_rq(), sched_count); 4257 schedstat_inc(this_rq(), sched_count);
3736#ifdef CONFIG_SCHEDSTATS
3737 if (unlikely(prev->lock_depth >= 0)) {
3738 schedstat_inc(this_rq(), bkl_count);
3739 schedstat_inc(prev, sched_info.bkl_count);
3740 }
3741#endif
3742} 4258}
3743 4259
3744static void put_prev_task(struct rq *rq, struct task_struct *prev) 4260static void put_prev_task(struct rq *rq, struct task_struct *prev)
3745{ 4261{
3746 if (prev->se.on_rq) 4262 if (prev->on_rq || rq->skip_clock_update < 0)
3747 update_rq_clock(rq); 4263 update_rq_clock(rq);
3748 rq->skip_clock_update = 0;
3749 prev->sched_class->put_prev_task(rq, prev); 4264 prev->sched_class->put_prev_task(rq, prev);
3750} 4265}
3751 4266
@@ -3776,17 +4291,13 @@ pick_next_task(struct rq *rq)
3776 } 4291 }
3777 */ 4292 */
3778 4293
3779 class = sched_class_highest; 4294 for_each_class(class) {
3780 for ( ; ; ) {
3781 p = class->pick_next_task(rq); 4295 p = class->pick_next_task(rq);
3782 if (p) 4296 if (p)
3783 return p; 4297 return p;
3784 /*
3785 * Will never be NULL as the idle class always
3786 * returns a non-NULL p:
3787 */
3788 class = class->next;
3789 } 4298 }
4299
4300 BUG(); /* the idle class will always have a runnable task */
3790} 4301}
3791 4302
3792/* 4303/*
@@ -3807,8 +4318,10 @@ need_resched:
3807 rcu_note_context_switch(cpu); 4318 rcu_note_context_switch(cpu);
3808 prev = rq->curr; 4319 prev = rq->curr;
3809 4320
3810 release_kernel_lock(prev); 4321 /* LITMUS^RT: quickly re-evaluate the scheduling decision
3811need_resched_nonpreemptible: 4322 * if the previous one is no longer valid after the context switch.
4323 */
4324litmus_need_resched_nonpreemptible:
3812 TS_SCHED_START; 4325 TS_SCHED_START;
3813 sched_trace_task_switch_away(prev); 4326 sched_trace_task_switch_away(prev);
3814 4327
@@ -3818,18 +4331,19 @@ need_resched_nonpreemptible:
3818 hrtick_clear(rq); 4331 hrtick_clear(rq);
3819 4332
3820 raw_spin_lock_irq(&rq->lock); 4333 raw_spin_lock_irq(&rq->lock);
3821 clear_tsk_need_resched(prev);
3822 4334
3823 switch_count = &prev->nivcsw; 4335 switch_count = &prev->nivcsw;
3824 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4336 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3825 if (unlikely(signal_pending_state(prev->state, prev))) { 4337 if (unlikely(signal_pending_state(prev->state, prev))) {
3826 prev->state = TASK_RUNNING; 4338 prev->state = TASK_RUNNING;
3827 } else { 4339 } else {
4340 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4341 prev->on_rq = 0;
4342
3828 /* 4343 /*
3829 * If a worker is going to sleep, notify and 4344 * If a worker went to sleep, notify and ask workqueue
3830 * ask workqueue whether it wants to wake up a 4345 * whether it wants to wake up a task to maintain
3831 * task to maintain concurrency. If so, wake 4346 * concurrency.
3832 * up the task.
3833 */ 4347 */
3834 if (prev->flags & PF_WQ_WORKER) { 4348 if (prev->flags & PF_WQ_WORKER) {
3835 struct task_struct *to_wakeup; 4349 struct task_struct *to_wakeup;
@@ -3838,7 +4352,16 @@ need_resched_nonpreemptible:
3838 if (to_wakeup) 4352 if (to_wakeup)
3839 try_to_wake_up_local(to_wakeup); 4353 try_to_wake_up_local(to_wakeup);
3840 } 4354 }
3841 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4355
4356 /*
4357 * If we are going to sleep and we have plugged IO
4358 * queued, make sure to submit it to avoid deadlocks.
4359 */
4360 if (blk_needs_flush_plug(prev)) {
4361 raw_spin_unlock(&rq->lock);
4362 blk_schedule_flush_plug(prev);
4363 raw_spin_lock(&rq->lock);
4364 }
3842 } 4365 }
3843 switch_count = &prev->nvcsw; 4366 switch_count = &prev->nvcsw;
3844 } 4367 }
@@ -3850,11 +4373,10 @@ need_resched_nonpreemptible:
3850 4373
3851 put_prev_task(rq, prev); 4374 put_prev_task(rq, prev);
3852 next = pick_next_task(rq); 4375 next = pick_next_task(rq);
4376 clear_tsk_need_resched(prev);
4377 rq->skip_clock_update = 0;
3853 4378
3854 if (likely(prev != next)) { 4379 if (likely(prev != next)) {
3855 sched_info_switch(prev, next);
3856 perf_event_task_sched_out(prev, next);
3857
3858 rq->nr_switches++; 4380 rq->nr_switches++;
3859 rq->curr = next; 4381 rq->curr = next;
3860 ++*switch_count; 4382 ++*switch_count;
@@ -3880,8 +4402,8 @@ need_resched_nonpreemptible:
3880 4402
3881 post_schedule(rq); 4403 post_schedule(rq);
3882 4404
3883 if (sched_state_validate_switch() || unlikely(reacquire_kernel_lock(prev))) 4405 if (sched_state_validate_switch())
3884 goto need_resched_nonpreemptible; 4406 goto litmus_need_resched_nonpreemptible;
3885 4407
3886 preempt_enable_no_resched(); 4408 preempt_enable_no_resched();
3887 if (need_resched()) 4409 if (need_resched())
@@ -3892,70 +4414,53 @@ need_resched_nonpreemptible:
3892EXPORT_SYMBOL(schedule); 4414EXPORT_SYMBOL(schedule);
3893 4415
3894#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4416#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4417
4418static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4419{
4420 bool ret = false;
4421
4422 rcu_read_lock();
4423 if (lock->owner != owner)
4424 goto fail;
4425
4426 /*
4427 * Ensure we emit the owner->on_cpu dereference _after_ checking that
4428 * lock->owner still matches owner. If that fails, owner might
4429 * point to free()d memory; if it still matches, the rcu_read_lock()
4430 * ensures the memory stays valid.
4431 */
4432 barrier();
4433
4434 ret = owner->on_cpu;
4435fail:
4436 rcu_read_unlock();
4437
4438 return ret;
4439}
4440
3895/* 4441/*
3896 * Look out! "owner" is an entirely speculative pointer 4442 * Look out! "owner" is an entirely speculative pointer
3897 * access and not reliable. 4443 * access and not reliable.
3898 */ 4444 */
3899int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 4445int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3900{ 4446{
3901 unsigned int cpu;
3902 struct rq *rq;
3903
3904 if (!sched_feat(OWNER_SPIN)) 4447 if (!sched_feat(OWNER_SPIN))
3905 return 0; 4448 return 0;
3906 4449
3907#ifdef CONFIG_DEBUG_PAGEALLOC 4450 while (owner_running(lock, owner)) {
3908 /* 4451 if (need_resched())
3909 * Need to access the cpu field knowing that 4452 return 0;
3910 * DEBUG_PAGEALLOC could have unmapped it if
3911 * the mutex owner just released it and exited.
3912 */
3913 if (probe_kernel_address(&owner->cpu, cpu))
3914 return 0;
3915#else
3916 cpu = owner->cpu;
3917#endif
3918 4453
3919 /* 4454 arch_mutex_cpu_relax();
3920 * Even if the access succeeded (likely case), 4455 }
3921 * the cpu field may no longer be valid.
3922 */
3923 if (cpu >= nr_cpumask_bits)
3924 return 0;
3925 4456
3926 /* 4457 /*
3927 * We need to validate that we can do a 4458 * If the owner changed to another task, there is likely
3928 * get_cpu() and that we have the percpu area. 4459 * heavy contention; stop spinning.
3929 */ 4460 */
3930 if (!cpu_online(cpu)) 4461 if (lock->owner)
3931 return 0; 4462 return 0;
3932 4463
3933 rq = cpu_rq(cpu);
3934
3935 for (;;) {
3936 /*
3937 * Owner changed, break to re-assess state.
3938 */
3939 if (lock->owner != owner) {
3940 /*
3941 * If the lock has switched to a different owner,
3942 * we likely have heavy contention. Return 0 to quit
3943 * optimistic spinning and not contend further:
3944 */
3945 if (lock->owner)
3946 return 0;
3947 break;
3948 }
3949
3950 /*
3951 * Is that owner really running on that cpu?
3952 */
3953 if (task_thread_info(rq->curr) != owner || need_resched())
3954 return 0;
3955
3956 cpu_relax();
3957 }
3958
3959 return 1; 4464 return 1;
3960} 4465}
3961#endif 4466#endif
@@ -4085,6 +4590,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4085{ 4590{
4086 __wake_up_common(q, mode, 1, 0, key); 4591 __wake_up_common(q, mode, 1, 0, key);
4087} 4592}
4593EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4088 4594
4089/** 4595/**
4090 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4596 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4276,7 +4782,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4276 * This waits for either a completion of a specific task to be signaled or for a 4782 * This waits for either a completion of a specific task to be signaled or for a
4277 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4783 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4278 */ 4784 */
4279unsigned long __sched 4785long __sched
4280wait_for_completion_interruptible_timeout(struct completion *x, 4786wait_for_completion_interruptible_timeout(struct completion *x,
4281 unsigned long timeout) 4787 unsigned long timeout)
4282{ 4788{
@@ -4309,7 +4815,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4309 * signaled or for a specified timeout to expire. It can be 4815 * signaled or for a specified timeout to expire. It can be
4310 * interrupted by a kill signal. The timeout is in jiffies. 4816 * interrupted by a kill signal. The timeout is in jiffies.
4311 */ 4817 */
4312unsigned long __sched 4818long __sched
4313wait_for_completion_killable_timeout(struct completion *x, 4819wait_for_completion_killable_timeout(struct completion *x,
4314 unsigned long timeout) 4820 unsigned long timeout)
4315{ 4821{
@@ -4425,18 +4931,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4425 */ 4931 */
4426void rt_mutex_setprio(struct task_struct *p, int prio) 4932void rt_mutex_setprio(struct task_struct *p, int prio)
4427{ 4933{
4428 unsigned long flags;
4429 int oldprio, on_rq, running; 4934 int oldprio, on_rq, running;
4430 struct rq *rq; 4935 struct rq *rq;
4431 const struct sched_class *prev_class; 4936 const struct sched_class *prev_class;
4432 4937
4433 BUG_ON(prio < 0 || prio > MAX_PRIO); 4938 BUG_ON(prio < 0 || prio > MAX_PRIO);
4434 4939
4435 rq = task_rq_lock(p, &flags); 4940 rq = __task_rq_lock(p);
4436 4941
4942 trace_sched_pi_setprio(p, prio);
4437 oldprio = p->prio; 4943 oldprio = p->prio;
4438 prev_class = p->sched_class; 4944 prev_class = p->sched_class;
4439 on_rq = p->se.on_rq; 4945 on_rq = p->on_rq;
4440 running = task_current(rq, p); 4946 running = task_current(rq, p);
4441 if (on_rq) 4947 if (on_rq)
4442 dequeue_task(rq, p, 0); 4948 dequeue_task(rq, p, 0);
@@ -4452,12 +4958,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4452 4958
4453 if (running) 4959 if (running)
4454 p->sched_class->set_curr_task(rq); 4960 p->sched_class->set_curr_task(rq);
4455 if (on_rq) { 4961 if (on_rq)
4456 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4962 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4457 4963
4458 check_class_changed(rq, p, prev_class, oldprio, running); 4964 check_class_changed(rq, p, prev_class, oldprio);
4459 } 4965 __task_rq_unlock(rq);
4460 task_rq_unlock(rq, &flags);
4461} 4966}
4462 4967
4463#endif 4968#endif
@@ -4485,7 +4990,7 @@ void set_user_nice(struct task_struct *p, long nice)
4485 p->static_prio = NICE_TO_PRIO(nice); 4990 p->static_prio = NICE_TO_PRIO(nice);
4486 goto out_unlock; 4991 goto out_unlock;
4487 } 4992 }
4488 on_rq = p->se.on_rq; 4993 on_rq = p->on_rq;
4489 if (on_rq) 4994 if (on_rq)
4490 dequeue_task(rq, p, 0); 4995 dequeue_task(rq, p, 0);
4491 4996
@@ -4505,7 +5010,7 @@ void set_user_nice(struct task_struct *p, long nice)
4505 resched_task(rq->curr); 5010 resched_task(rq->curr);
4506 } 5011 }
4507out_unlock: 5012out_unlock:
4508 task_rq_unlock(rq, &flags); 5013 task_rq_unlock(rq, p, &flags);
4509} 5014}
4510EXPORT_SYMBOL(set_user_nice); 5015EXPORT_SYMBOL(set_user_nice);
4511 5016
@@ -4619,8 +5124,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4619static void 5124static void
4620__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 5125__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4621{ 5126{
4622 BUG_ON(p->se.on_rq);
4623
4624 p->policy = policy; 5127 p->policy = policy;
4625 p->rt_priority = prio; 5128 p->rt_priority = prio;
4626 p->normal_prio = normal_prio(p); 5129 p->normal_prio = normal_prio(p);
@@ -4645,14 +5148,17 @@ static bool check_same_owner(struct task_struct *p)
4645 5148
4646 rcu_read_lock(); 5149 rcu_read_lock();
4647 pcred = __task_cred(p); 5150 pcred = __task_cred(p);
4648 match = (cred->euid == pcred->euid || 5151 if (cred->user->user_ns == pcred->user->user_ns)
4649 cred->euid == pcred->uid); 5152 match = (cred->euid == pcred->euid ||
5153 cred->euid == pcred->uid);
5154 else
5155 match = false;
4650 rcu_read_unlock(); 5156 rcu_read_unlock();
4651 return match; 5157 return match;
4652} 5158}
4653 5159
4654static int __sched_setscheduler(struct task_struct *p, int policy, 5160static int __sched_setscheduler(struct task_struct *p, int policy,
4655 struct sched_param *param, bool user) 5161 const struct sched_param *param, bool user)
4656{ 5162{
4657 int retval, oldprio, oldpolicy = -1, on_rq, running; 5163 int retval, oldprio, oldpolicy = -1, on_rq, running;
4658 unsigned long flags; 5164 unsigned long flags;
@@ -4708,12 +5214,15 @@ recheck:
4708 param->sched_priority > rlim_rtprio) 5214 param->sched_priority > rlim_rtprio)
4709 return -EPERM; 5215 return -EPERM;
4710 } 5216 }
5217
4711 /* 5218 /*
4712 * Like positive nice levels, dont allow tasks to 5219 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4713 * move out of SCHED_IDLE either: 5220 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4714 */ 5221 */
4715 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 5222 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4716 return -EPERM; 5223 if (!can_nice(p, TASK_NICE(p)))
5224 return -EPERM;
5225 }
4717 5226
4718 /* can't change other user's priorities */ 5227 /* can't change other user's priorities */
4719 if (!check_same_owner(p)) 5228 if (!check_same_owner(p))
@@ -4725,7 +5234,7 @@ recheck:
4725 } 5234 }
4726 5235
4727 if (user) { 5236 if (user) {
4728 retval = security_task_setscheduler(p, policy, param); 5237 retval = security_task_setscheduler(p);
4729 if (retval) 5238 if (retval)
4730 return retval; 5239 return retval;
4731 } 5240 }
@@ -4739,13 +5248,30 @@ recheck:
4739 /* 5248 /*
4740 * make sure no PI-waiters arrive (or leave) while we are 5249 * make sure no PI-waiters arrive (or leave) while we are
4741 * changing the priority of the task: 5250 * changing the priority of the task:
5251 *
5252 * To be able to change p->policy safely, the appropriate
5253 * runqueue lock must be held.
4742 */ 5254 */
4743 raw_spin_lock_irqsave(&p->pi_lock, flags); 5255 rq = task_rq_lock(p, &flags);
5256
4744 /* 5257 /*
4745 * To be able to change p->policy safely, the apropriate 5258 * Changing the policy of the stop threads is a very bad idea
4746 * runqueue lock must be held.
4747 */ 5259 */
4748 rq = __task_rq_lock(p); 5260 if (p == rq->stop) {
5261 task_rq_unlock(rq, p, &flags);
5262 return -EINVAL;
5263 }
5264
5265 /*
5266 * If not changing anything there's no need to proceed further:
5267 */
5268 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5269 param->sched_priority == p->rt_priority))) {
5270
5271 __task_rq_unlock(rq);
5272 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5273 return 0;
5274 }
4749 5275
4750#ifdef CONFIG_RT_GROUP_SCHED 5276#ifdef CONFIG_RT_GROUP_SCHED
4751 if (user) { 5277 if (user) {
@@ -4754,9 +5280,9 @@ recheck:
4754 * assigned. 5280 * assigned.
4755 */ 5281 */
4756 if (rt_bandwidth_enabled() && rt_policy(policy) && 5282 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4757 task_group(p)->rt_bandwidth.rt_runtime == 0) { 5283 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4758 __task_rq_unlock(rq); 5284 !task_group_is_autogroup(task_group(p))) {
4759 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5285 task_rq_unlock(rq, p, &flags);
4760 return -EPERM; 5286 return -EPERM;
4761 } 5287 }
4762 } 5288 }
@@ -4765,11 +5291,10 @@ recheck:
4765 /* recheck policy now with rq lock held */ 5291 /* recheck policy now with rq lock held */
4766 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5292 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4767 policy = oldpolicy = -1; 5293 policy = oldpolicy = -1;
4768 __task_rq_unlock(rq); 5294 task_rq_unlock(rq, p, &flags);
4769 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4770 goto recheck; 5295 goto recheck;
4771 } 5296 }
4772 on_rq = p->se.on_rq; 5297 on_rq = p->on_rq;
4773 running = task_current(rq, p); 5298 running = task_current(rq, p);
4774 if (on_rq) 5299 if (on_rq)
4775 deactivate_task(rq, p, 0); 5300 deactivate_task(rq, p, 0);
@@ -4793,13 +5318,11 @@ recheck:
4793 5318
4794 if (running) 5319 if (running)
4795 p->sched_class->set_curr_task(rq); 5320 p->sched_class->set_curr_task(rq);
4796 if (on_rq) { 5321 if (on_rq)
4797 activate_task(rq, p, 0); 5322 activate_task(rq, p, 0);
4798 5323
4799 check_class_changed(rq, p, prev_class, oldprio, running); 5324 check_class_changed(rq, p, prev_class, oldprio);
4800 } 5325 task_rq_unlock(rq, p, &flags);
4801 __task_rq_unlock(rq);
4802 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4803 5326
4804 rt_mutex_adjust_pi(p); 5327 rt_mutex_adjust_pi(p);
4805 5328
@@ -4815,7 +5338,7 @@ recheck:
4815 * NOTE that the task may be already dead. 5338 * NOTE that the task may be already dead.
4816 */ 5339 */
4817int sched_setscheduler(struct task_struct *p, int policy, 5340int sched_setscheduler(struct task_struct *p, int policy,
4818 struct sched_param *param) 5341 const struct sched_param *param)
4819{ 5342{
4820 return __sched_setscheduler(p, policy, param, true); 5343 return __sched_setscheduler(p, policy, param, true);
4821} 5344}
@@ -4833,7 +5356,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4833 * but our caller might not have that capability. 5356 * but our caller might not have that capability.
4834 */ 5357 */
4835int sched_setscheduler_nocheck(struct task_struct *p, int policy, 5358int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4836 struct sched_param *param) 5359 const struct sched_param *param)
4837{ 5360{
4838 return __sched_setscheduler(p, policy, param, false); 5361 return __sched_setscheduler(p, policy, param, false);
4839} 5362}
@@ -4980,16 +5503,16 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4980 goto out_free_cpus_allowed; 5503 goto out_free_cpus_allowed;
4981 } 5504 }
4982 retval = -EPERM; 5505 retval = -EPERM;
4983 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5506 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
4984 goto out_unlock; 5507 goto out_unlock;
4985 5508
4986 retval = security_task_setscheduler(p, 0, NULL); 5509 retval = security_task_setscheduler(p);
4987 if (retval) 5510 if (retval)
4988 goto out_unlock; 5511 goto out_unlock;
4989 5512
4990 cpuset_cpus_allowed(p, cpus_allowed); 5513 cpuset_cpus_allowed(p, cpus_allowed);
4991 cpumask_and(new_mask, in_mask, cpus_allowed); 5514 cpumask_and(new_mask, in_mask, cpus_allowed);
4992 again: 5515again:
4993 retval = set_cpus_allowed_ptr(p, new_mask); 5516 retval = set_cpus_allowed_ptr(p, new_mask);
4994 5517
4995 if (!retval) { 5518 if (!retval) {
@@ -5051,7 +5574,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5051{ 5574{
5052 struct task_struct *p; 5575 struct task_struct *p;
5053 unsigned long flags; 5576 unsigned long flags;
5054 struct rq *rq;
5055 int retval; 5577 int retval;
5056 5578
5057 get_online_cpus(); 5579 get_online_cpus();
@@ -5066,9 +5588,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5066 if (retval) 5588 if (retval)
5067 goto out_unlock; 5589 goto out_unlock;
5068 5590
5069 rq = task_rq_lock(p, &flags); 5591 raw_spin_lock_irqsave(&p->pi_lock, flags);
5070 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5592 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5071 task_rq_unlock(rq, &flags); 5593 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5072 5594
5073out_unlock: 5595out_unlock:
5074 rcu_read_unlock(); 5596 rcu_read_unlock();
@@ -5215,6 +5737,67 @@ void __sched yield(void)
5215} 5737}
5216EXPORT_SYMBOL(yield); 5738EXPORT_SYMBOL(yield);
5217 5739
5740/**
5741 * yield_to - yield the current processor to another thread in
5742 * your thread group, or accelerate that thread toward the
5743 * processor it's on.
5744 * @p: target task
5745 * @preempt: whether task preemption is allowed or not
5746 *
5747 * It's the caller's job to ensure that the target task struct
5748 * can't go away on us before we can do any checks.
5749 *
5750 * Returns true if we indeed boosted the target task.
5751 */
5752bool __sched yield_to(struct task_struct *p, bool preempt)
5753{
5754 struct task_struct *curr = current;
5755 struct rq *rq, *p_rq;
5756 unsigned long flags;
5757 bool yielded = 0;
5758
5759 local_irq_save(flags);
5760 rq = this_rq();
5761
5762again:
5763 p_rq = task_rq(p);
5764 double_rq_lock(rq, p_rq);
5765 while (task_rq(p) != p_rq) {
5766 double_rq_unlock(rq, p_rq);
5767 goto again;
5768 }
5769
5770 if (!curr->sched_class->yield_to_task)
5771 goto out;
5772
5773 if (curr->sched_class != p->sched_class)
5774 goto out;
5775
5776 if (task_running(p_rq, p) || p->state)
5777 goto out;
5778
5779 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5780 if (yielded) {
5781 schedstat_inc(rq, yld_count);
5782 /*
5783 * Make p's CPU reschedule; pick_next_entity takes care of
5784 * fairness.
5785 */
5786 if (preempt && rq != p_rq)
5787 resched_task(p_rq->curr);
5788 }
5789
5790out:
5791 double_rq_unlock(rq, p_rq);
5792 local_irq_restore(flags);
5793
5794 if (yielded)
5795 schedule();
5796
5797 return yielded;
5798}
5799EXPORT_SYMBOL_GPL(yield_to);
5800
5218/* 5801/*
5219 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5802 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5220 * that process accounting knows that this is a task in IO wait state. 5803 * that process accounting knows that this is a task in IO wait state.
@@ -5225,6 +5808,7 @@ void __sched io_schedule(void)
5225 5808
5226 delayacct_blkio_start(); 5809 delayacct_blkio_start();
5227 atomic_inc(&rq->nr_iowait); 5810 atomic_inc(&rq->nr_iowait);
5811 blk_flush_plug(current);
5228 current->in_iowait = 1; 5812 current->in_iowait = 1;
5229 schedule(); 5813 schedule();
5230 current->in_iowait = 0; 5814 current->in_iowait = 0;
@@ -5240,6 +5824,7 @@ long __sched io_schedule_timeout(long timeout)
5240 5824
5241 delayacct_blkio_start(); 5825 delayacct_blkio_start();
5242 atomic_inc(&rq->nr_iowait); 5826 atomic_inc(&rq->nr_iowait);
5827 blk_flush_plug(current);
5243 current->in_iowait = 1; 5828 current->in_iowait = 1;
5244 ret = schedule_timeout(timeout); 5829 ret = schedule_timeout(timeout);
5245 current->in_iowait = 0; 5830 current->in_iowait = 0;
@@ -5330,7 +5915,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5330 5915
5331 rq = task_rq_lock(p, &flags); 5916 rq = task_rq_lock(p, &flags);
5332 time_slice = p->sched_class->get_rr_interval(rq, p); 5917 time_slice = p->sched_class->get_rr_interval(rq, p);
5333 task_rq_unlock(rq, &flags); 5918 task_rq_unlock(rq, p, &flags);
5334 5919
5335 rcu_read_unlock(); 5920 rcu_read_unlock();
5336 jiffies_to_timespec(time_slice, &t); 5921 jiffies_to_timespec(time_slice, &t);
@@ -5350,7 +5935,7 @@ void sched_show_task(struct task_struct *p)
5350 unsigned state; 5935 unsigned state;
5351 5936
5352 state = p->state ? __ffs(p->state) + 1 : 0; 5937 state = p->state ? __ffs(p->state) + 1 : 0;
5353 printk(KERN_INFO "%-13.13s %c", p->comm, 5938 printk(KERN_INFO "%-15.15s %c", p->comm,
5354 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5939 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5355#if BITS_PER_LONG == 32 5940#if BITS_PER_LONG == 32
5356 if (state == TASK_RUNNING) 5941 if (state == TASK_RUNNING)
@@ -5388,7 +5973,7 @@ void show_state_filter(unsigned long state_filter)
5388 do_each_thread(g, p) { 5973 do_each_thread(g, p) {
5389 /* 5974 /*
5390 * reset the NMI-timeout, listing all files on a slow 5975 * reset the NMI-timeout, listing all files on a slow
5391 * console might take alot of time: 5976 * console might take a lot of time:
5392 */ 5977 */
5393 touch_nmi_watchdog(); 5978 touch_nmi_watchdog();
5394 if (!state_filter || (p->state & state_filter)) 5979 if (!state_filter || (p->state & state_filter))
@@ -5432,26 +6017,35 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5432 idle->state = TASK_RUNNING; 6017 idle->state = TASK_RUNNING;
5433 idle->se.exec_start = sched_clock(); 6018 idle->se.exec_start = sched_clock();
5434 6019
5435 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6020 do_set_cpus_allowed(idle, cpumask_of(cpu));
6021 /*
6022 * We're having a chicken and egg problem, even though we are
6023 * holding rq->lock, the cpu isn't yet set to this cpu so the
6024 * lockdep check in task_group() will fail.
6025 *
6026 * Similar case to sched_fork(). / Alternatively we could
6027 * use task_rq_lock() here and obtain the other rq->lock.
6028 *
6029 * Silence PROVE_RCU
6030 */
6031 rcu_read_lock();
5436 __set_task_cpu(idle, cpu); 6032 __set_task_cpu(idle, cpu);
6033 rcu_read_unlock();
5437 6034
5438 rq->curr = rq->idle = idle; 6035 rq->curr = rq->idle = idle;
5439#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 6036#if defined(CONFIG_SMP)
5440 idle->oncpu = 1; 6037 idle->on_cpu = 1;
5441#endif 6038#endif
5442 raw_spin_unlock_irqrestore(&rq->lock, flags); 6039 raw_spin_unlock_irqrestore(&rq->lock, flags);
5443 6040
5444 /* Set the preempt count _outside_ the spinlocks! */ 6041 /* Set the preempt count _outside_ the spinlocks! */
5445#if defined(CONFIG_PREEMPT)
5446 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5447#else
5448 task_thread_info(idle)->preempt_count = 0; 6042 task_thread_info(idle)->preempt_count = 0;
5449#endif 6043
5450 /* 6044 /*
5451 * The idle tasks have their own, simple scheduling class: 6045 * The idle tasks have their own, simple scheduling class:
5452 */ 6046 */
5453 idle->sched_class = &idle_sched_class; 6047 idle->sched_class = &idle_sched_class;
5454 ftrace_graph_init_task(idle); 6048 ftrace_graph_init_idle_task(idle, cpu);
5455} 6049}
5456 6050
5457/* 6051/*
@@ -5502,7 +6096,6 @@ static void update_sysctl(void)
5502 SET_SYSCTL(sched_min_granularity); 6096 SET_SYSCTL(sched_min_granularity);
5503 SET_SYSCTL(sched_latency); 6097 SET_SYSCTL(sched_latency);
5504 SET_SYSCTL(sched_wakeup_granularity); 6098 SET_SYSCTL(sched_wakeup_granularity);
5505 SET_SYSCTL(sched_shares_ratelimit);
5506#undef SET_SYSCTL 6099#undef SET_SYSCTL
5507} 6100}
5508 6101
@@ -5512,6 +6105,16 @@ static inline void sched_init_granularity(void)
5512} 6105}
5513 6106
5514#ifdef CONFIG_SMP 6107#ifdef CONFIG_SMP
6108void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6109{
6110 if (p->sched_class && p->sched_class->set_cpus_allowed)
6111 p->sched_class->set_cpus_allowed(p, new_mask);
6112 else {
6113 cpumask_copy(&p->cpus_allowed, new_mask);
6114 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6115 }
6116}
6117
5515/* 6118/*
5516 * This is how migration works: 6119 * This is how migration works:
5517 * 6120 *
@@ -5542,52 +6145,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5542 unsigned int dest_cpu; 6145 unsigned int dest_cpu;
5543 int ret = 0; 6146 int ret = 0;
5544 6147
5545 /*
5546 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5547 * drop the rq->lock and still rely on ->cpus_allowed.
5548 */
5549again:
5550 while (task_is_waking(p))
5551 cpu_relax();
5552 rq = task_rq_lock(p, &flags); 6148 rq = task_rq_lock(p, &flags);
5553 if (task_is_waking(p)) { 6149
5554 task_rq_unlock(rq, &flags); 6150 if (cpumask_equal(&p->cpus_allowed, new_mask))
5555 goto again; 6151 goto out;
5556 }
5557 6152
5558 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 6153 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5559 ret = -EINVAL; 6154 ret = -EINVAL;
5560 goto out; 6155 goto out;
5561 } 6156 }
5562 6157
5563 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6158 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5564 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5565 ret = -EINVAL; 6159 ret = -EINVAL;
5566 goto out; 6160 goto out;
5567 } 6161 }
5568 6162
5569 if (p->sched_class->set_cpus_allowed) 6163 do_set_cpus_allowed(p, new_mask);
5570 p->sched_class->set_cpus_allowed(p, new_mask);
5571 else {
5572 cpumask_copy(&p->cpus_allowed, new_mask);
5573 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5574 }
5575 6164
5576 /* Can the task run on the task's current CPU? If so, we're done */ 6165 /* Can the task run on the task's current CPU? If so, we're done */
5577 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6166 if (cpumask_test_cpu(task_cpu(p), new_mask))
5578 goto out; 6167 goto out;
5579 6168
5580 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 6169 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5581 if (migrate_task(p, dest_cpu)) { 6170 if (p->on_rq) {
5582 struct migration_arg arg = { p, dest_cpu }; 6171 struct migration_arg arg = { p, dest_cpu };
5583 /* Need help from migration thread: drop lock and wait. */ 6172 /* Need help from migration thread: drop lock and wait. */
5584 task_rq_unlock(rq, &flags); 6173 task_rq_unlock(rq, p, &flags);
5585 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 6174 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5586 tlb_migrate_finish(p->mm); 6175 tlb_migrate_finish(p->mm);
5587 return 0; 6176 return 0;
5588 } 6177 }
5589out: 6178out:
5590 task_rq_unlock(rq, &flags); 6179 task_rq_unlock(rq, p, &flags);
5591 6180
5592 return ret; 6181 return ret;
5593} 6182}
@@ -5615,6 +6204,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5615 rq_src = cpu_rq(src_cpu); 6204 rq_src = cpu_rq(src_cpu);
5616 rq_dest = cpu_rq(dest_cpu); 6205 rq_dest = cpu_rq(dest_cpu);
5617 6206
6207 raw_spin_lock(&p->pi_lock);
5618 double_rq_lock(rq_src, rq_dest); 6208 double_rq_lock(rq_src, rq_dest);
5619 /* Already moved. */ 6209 /* Already moved. */
5620 if (task_cpu(p) != src_cpu) 6210 if (task_cpu(p) != src_cpu)
@@ -5627,7 +6217,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5627 * If we're not on a rq, the next wake-up will ensure we're 6217 * If we're not on a rq, the next wake-up will ensure we're
5628 * placed properly. 6218 * placed properly.
5629 */ 6219 */
5630 if (p->se.on_rq) { 6220 if (p->on_rq) {
5631 deactivate_task(rq_src, p, 0); 6221 deactivate_task(rq_src, p, 0);
5632 set_task_cpu(p, dest_cpu); 6222 set_task_cpu(p, dest_cpu);
5633 activate_task(rq_dest, p, 0); 6223 activate_task(rq_dest, p, 0);
@@ -5637,6 +6227,7 @@ done:
5637 ret = 1; 6227 ret = 1;
5638fail: 6228fail:
5639 double_rq_unlock(rq_src, rq_dest); 6229 double_rq_unlock(rq_src, rq_dest);
6230 raw_spin_unlock(&p->pi_lock);
5640 return ret; 6231 return ret;
5641} 6232}
5642 6233
@@ -5660,29 +6251,20 @@ static int migration_cpu_stop(void *data)
5660} 6251}
5661 6252
5662#ifdef CONFIG_HOTPLUG_CPU 6253#ifdef CONFIG_HOTPLUG_CPU
6254
5663/* 6255/*
5664 * Figure out where task on dead CPU should go, use force if necessary. 6256 * Ensures that the idle task is using init_mm right before its cpu goes
6257 * offline.
5665 */ 6258 */
5666void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6259void idle_task_exit(void)
5667{ 6260{
5668 struct rq *rq = cpu_rq(dead_cpu); 6261 struct mm_struct *mm = current->active_mm;
5669 int needs_cpu, uninitialized_var(dest_cpu);
5670 unsigned long flags;
5671 6262
5672 local_irq_save(flags); 6263 BUG_ON(cpu_online(smp_processor_id()));
5673 6264
5674 raw_spin_lock(&rq->lock); 6265 if (mm != &init_mm)
5675 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 6266 switch_mm(mm, &init_mm, current);
5676 if (needs_cpu) 6267 mmdrop(mm);
5677 dest_cpu = select_fallback_rq(dead_cpu, p);
5678 raw_spin_unlock(&rq->lock);
5679 /*
5680 * It can only fail if we race with set_cpus_allowed(),
5681 * in the racer should migrate the task anyway.
5682 */
5683 if (needs_cpu)
5684 __migrate_task(p, dead_cpu, dest_cpu);
5685 local_irq_restore(flags);
5686} 6268}
5687 6269
5688/* 6270/*
@@ -5695,128 +6277,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5695static void migrate_nr_uninterruptible(struct rq *rq_src) 6277static void migrate_nr_uninterruptible(struct rq *rq_src)
5696{ 6278{
5697 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 6279 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5698 unsigned long flags;
5699 6280
5700 local_irq_save(flags);
5701 double_rq_lock(rq_src, rq_dest);
5702 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 6281 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5703 rq_src->nr_uninterruptible = 0; 6282 rq_src->nr_uninterruptible = 0;
5704 double_rq_unlock(rq_src, rq_dest);
5705 local_irq_restore(flags);
5706}
5707
5708/* Run through task list and migrate tasks from the dead cpu. */
5709static void migrate_live_tasks(int src_cpu)
5710{
5711 struct task_struct *p, *t;
5712
5713 read_lock(&tasklist_lock);
5714
5715 do_each_thread(t, p) {
5716 if (p == current)
5717 continue;
5718
5719 if (task_cpu(p) == src_cpu)
5720 move_task_off_dead_cpu(src_cpu, p);
5721 } while_each_thread(t, p);
5722
5723 read_unlock(&tasklist_lock);
5724} 6283}
5725 6284
5726/* 6285/*
5727 * Schedules idle task to be the next runnable task on current CPU. 6286 * remove the tasks which were accounted by rq from calc_load_tasks.
5728 * It does so by boosting its priority to highest possible.
5729 * Used by CPU offline code.
5730 */ 6287 */
5731void sched_idle_next(void) 6288static void calc_global_load_remove(struct rq *rq)
5732{ 6289{
5733 int this_cpu = smp_processor_id(); 6290 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5734 struct rq *rq = cpu_rq(this_cpu); 6291 rq->calc_load_active = 0;
5735 struct task_struct *p = rq->idle;
5736 unsigned long flags;
5737
5738 /* cpu has to be offline */
5739 BUG_ON(cpu_online(this_cpu));
5740
5741 /*
5742 * Strictly not necessary since rest of the CPUs are stopped by now
5743 * and interrupts disabled on the current cpu.
5744 */
5745 raw_spin_lock_irqsave(&rq->lock, flags);
5746
5747 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5748
5749 activate_task(rq, p, 0);
5750
5751 raw_spin_unlock_irqrestore(&rq->lock, flags);
5752} 6292}
5753 6293
5754/* 6294/*
5755 * Ensures that the idle task is using init_mm right before its cpu goes 6295 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5756 * offline. 6296 * try_to_wake_up()->select_task_rq().
6297 *
 6298 * Called with rq->lock held even though we're in stop_machine() and
6299 * there's no concurrency possible, we hold the required locks anyway
6300 * because of lock validation efforts.
5757 */ 6301 */
5758void idle_task_exit(void) 6302static void migrate_tasks(unsigned int dead_cpu)
5759{
5760 struct mm_struct *mm = current->active_mm;
5761
5762 BUG_ON(cpu_online(smp_processor_id()));
5763
5764 if (mm != &init_mm)
5765 switch_mm(mm, &init_mm, current);
5766 mmdrop(mm);
5767}
5768
5769/* called under rq->lock with disabled interrupts */
5770static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5771{ 6303{
5772 struct rq *rq = cpu_rq(dead_cpu); 6304 struct rq *rq = cpu_rq(dead_cpu);
5773 6305 struct task_struct *next, *stop = rq->stop;
5774 /* Must be exiting, otherwise would be on tasklist. */ 6306 int dest_cpu;
5775 BUG_ON(!p->exit_state);
5776
5777 /* Cannot have done final schedule yet: would have vanished. */
5778 BUG_ON(p->state == TASK_DEAD);
5779
5780 get_task_struct(p);
5781 6307
5782 /* 6308 /*
5783 * Drop lock around migration; if someone else moves it, 6309 * Fudge the rq selection such that the below task selection loop
5784 * that's OK. No task can be added to this CPU, so iteration is 6310 * doesn't get stuck on the currently eligible stop task.
5785 * fine. 6311 *
6312 * We're currently inside stop_machine() and the rq is either stuck
6313 * in the stop_machine_cpu_stop() loop, or we're executing this code,
6314 * either way we should never end up calling schedule() until we're
6315 * done here.
5786 */ 6316 */
5787 raw_spin_unlock_irq(&rq->lock); 6317 rq->stop = NULL;
5788 move_task_off_dead_cpu(dead_cpu, p);
5789 raw_spin_lock_irq(&rq->lock);
5790
5791 put_task_struct(p);
5792}
5793
5794/* release_task() removes task from tasklist, so we won't find dead tasks. */
5795static void migrate_dead_tasks(unsigned int dead_cpu)
5796{
5797 struct rq *rq = cpu_rq(dead_cpu);
5798 struct task_struct *next;
5799 6318
5800 for ( ; ; ) { 6319 for ( ; ; ) {
5801 if (!rq->nr_running) 6320 /*
6321 * There's this thread running, bail when that's the only
6322 * remaining thread.
6323 */
6324 if (rq->nr_running == 1)
5802 break; 6325 break;
6326
5803 next = pick_next_task(rq); 6327 next = pick_next_task(rq);
5804 if (!next) 6328 BUG_ON(!next);
5805 break;
5806 next->sched_class->put_prev_task(rq, next); 6329 next->sched_class->put_prev_task(rq, next);
5807 migrate_dead(dead_cpu, next);
5808 6330
6331 /* Find suitable destination for @next, with force if needed. */
6332 dest_cpu = select_fallback_rq(dead_cpu, next);
6333 raw_spin_unlock(&rq->lock);
6334
6335 __migrate_task(next, dead_cpu, dest_cpu);
6336
6337 raw_spin_lock(&rq->lock);
5809 } 6338 }
5810}
5811 6339
5812/* 6340 rq->stop = stop;
5813 * remove the tasks which were accounted by rq from calc_load_tasks.
5814 */
5815static void calc_global_load_remove(struct rq *rq)
5816{
5817 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5818 rq->calc_load_active = 0;
5819} 6341}
6342
5820#endif /* CONFIG_HOTPLUG_CPU */ 6343#endif /* CONFIG_HOTPLUG_CPU */
5821 6344
5822#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 6345#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6026,15 +6549,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6026 unsigned long flags; 6549 unsigned long flags;
6027 struct rq *rq = cpu_rq(cpu); 6550 struct rq *rq = cpu_rq(cpu);
6028 6551
6029 switch (action) { 6552 switch (action & ~CPU_TASKS_FROZEN) {
6030 6553
6031 case CPU_UP_PREPARE: 6554 case CPU_UP_PREPARE:
6032 case CPU_UP_PREPARE_FROZEN:
6033 rq->calc_load_update = calc_load_update; 6555 rq->calc_load_update = calc_load_update;
6034 break; 6556 break;
6035 6557
6036 case CPU_ONLINE: 6558 case CPU_ONLINE:
6037 case CPU_ONLINE_FROZEN:
6038 /* Update our root-domain */ 6559 /* Update our root-domain */
6039 raw_spin_lock_irqsave(&rq->lock, flags); 6560 raw_spin_lock_irqsave(&rq->lock, flags);
6040 if (rq->rd) { 6561 if (rq->rd) {
@@ -6046,33 +6567,26 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6046 break; 6567 break;
6047 6568
6048#ifdef CONFIG_HOTPLUG_CPU 6569#ifdef CONFIG_HOTPLUG_CPU
6049 case CPU_DEAD:
6050 case CPU_DEAD_FROZEN:
6051 migrate_live_tasks(cpu);
6052 /* Idle task back to normal (off runqueue, low prio) */
6053 raw_spin_lock_irq(&rq->lock);
6054 deactivate_task(rq, rq->idle, 0);
6055 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6056 rq->idle->sched_class = &idle_sched_class;
6057 migrate_dead_tasks(cpu);
6058 raw_spin_unlock_irq(&rq->lock);
6059 migrate_nr_uninterruptible(rq);
6060 BUG_ON(rq->nr_running != 0);
6061 calc_global_load_remove(rq);
6062 break;
6063
6064 case CPU_DYING: 6570 case CPU_DYING:
6065 case CPU_DYING_FROZEN: 6571 sched_ttwu_pending();
6066 /* Update our root-domain */ 6572 /* Update our root-domain */
6067 raw_spin_lock_irqsave(&rq->lock, flags); 6573 raw_spin_lock_irqsave(&rq->lock, flags);
6068 if (rq->rd) { 6574 if (rq->rd) {
6069 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6575 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6070 set_rq_offline(rq); 6576 set_rq_offline(rq);
6071 } 6577 }
6578 migrate_tasks(cpu);
6579 BUG_ON(rq->nr_running != 1); /* the migration thread */
6072 raw_spin_unlock_irqrestore(&rq->lock, flags); 6580 raw_spin_unlock_irqrestore(&rq->lock, flags);
6581
6582 migrate_nr_uninterruptible(rq);
6583 calc_global_load_remove(rq);
6073 break; 6584 break;
6074#endif 6585#endif
6075 } 6586 }
6587
6588 update_max_interval();
6589
6076 return NOTIFY_OK; 6590 return NOTIFY_OK;
6077} 6591}
6078 6592
@@ -6133,6 +6647,8 @@ early_initcall(migration_init);
6133 6647
6134#ifdef CONFIG_SMP 6648#ifdef CONFIG_SMP
6135 6649
6650static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6651
6136#ifdef CONFIG_SCHED_DEBUG 6652#ifdef CONFIG_SCHED_DEBUG
6137 6653
6138static __read_mostly int sched_domain_debug_enabled; 6654static __read_mostly int sched_domain_debug_enabled;
@@ -6183,7 +6699,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6183 break; 6699 break;
6184 } 6700 }
6185 6701
6186 if (!group->cpu_power) { 6702 if (!group->sgp->power) {
6187 printk(KERN_CONT "\n"); 6703 printk(KERN_CONT "\n");
6188 printk(KERN_ERR "ERROR: domain->cpu_power not " 6704 printk(KERN_ERR "ERROR: domain->cpu_power not "
6189 "set\n"); 6705 "set\n");
@@ -6207,9 +6723,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6207 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6723 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6208 6724
6209 printk(KERN_CONT " %s", str); 6725 printk(KERN_CONT " %s", str);
6210 if (group->cpu_power != SCHED_LOAD_SCALE) { 6726 if (group->sgp->power != SCHED_POWER_SCALE) {
6211 printk(KERN_CONT " (cpu_power = %d)", 6727 printk(KERN_CONT " (cpu_power = %d)",
6212 group->cpu_power); 6728 group->sgp->power);
6213 } 6729 }
6214 6730
6215 group = group->next; 6731 group = group->next;
@@ -6228,7 +6744,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6228 6744
6229static void sched_domain_debug(struct sched_domain *sd, int cpu) 6745static void sched_domain_debug(struct sched_domain *sd, int cpu)
6230{ 6746{
6231 cpumask_var_t groupmask;
6232 int level = 0; 6747 int level = 0;
6233 6748
6234 if (!sched_domain_debug_enabled) 6749 if (!sched_domain_debug_enabled)
@@ -6241,20 +6756,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6241 6756
6242 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6757 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6243 6758
6244 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6245 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6246 return;
6247 }
6248
6249 for (;;) { 6759 for (;;) {
6250 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6760 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6251 break; 6761 break;
6252 level++; 6762 level++;
6253 sd = sd->parent; 6763 sd = sd->parent;
6254 if (!sd) 6764 if (!sd)
6255 break; 6765 break;
6256 } 6766 }
6257 free_cpumask_var(groupmask);
6258} 6767}
6259#else /* !CONFIG_SCHED_DEBUG */ 6768#else /* !CONFIG_SCHED_DEBUG */
6260# define sched_domain_debug(sd, cpu) do { } while (0) 6769# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6311,12 +6820,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6311 return 1; 6820 return 1;
6312} 6821}
6313 6822
6314static void free_rootdomain(struct root_domain *rd) 6823static void free_rootdomain(struct rcu_head *rcu)
6315{ 6824{
6316 synchronize_sched(); 6825 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6317 6826
6318 cpupri_cleanup(&rd->cpupri); 6827 cpupri_cleanup(&rd->cpupri);
6319
6320 free_cpumask_var(rd->rto_mask); 6828 free_cpumask_var(rd->rto_mask);
6321 free_cpumask_var(rd->online); 6829 free_cpumask_var(rd->online);
6322 free_cpumask_var(rd->span); 6830 free_cpumask_var(rd->span);
@@ -6357,7 +6865,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6357 raw_spin_unlock_irqrestore(&rq->lock, flags); 6865 raw_spin_unlock_irqrestore(&rq->lock, flags);
6358 6866
6359 if (old_rd) 6867 if (old_rd)
6360 free_rootdomain(old_rd); 6868 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6361} 6869}
6362 6870
6363static int init_rootdomain(struct root_domain *rd) 6871static int init_rootdomain(struct root_domain *rd)
@@ -6408,6 +6916,53 @@ static struct root_domain *alloc_rootdomain(void)
6408 return rd; 6916 return rd;
6409} 6917}
6410 6918
6919static void free_sched_groups(struct sched_group *sg, int free_sgp)
6920{
6921 struct sched_group *tmp, *first;
6922
6923 if (!sg)
6924 return;
6925
6926 first = sg;
6927 do {
6928 tmp = sg->next;
6929
6930 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
6931 kfree(sg->sgp);
6932
6933 kfree(sg);
6934 sg = tmp;
6935 } while (sg != first);
6936}
6937
6938static void free_sched_domain(struct rcu_head *rcu)
6939{
6940 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6941
6942 /*
 6943 * If it's an overlapping domain it has private groups, iterate and
6944 * nuke them all.
6945 */
6946 if (sd->flags & SD_OVERLAP) {
6947 free_sched_groups(sd->groups, 1);
6948 } else if (atomic_dec_and_test(&sd->groups->ref)) {
6949 kfree(sd->groups->sgp);
6950 kfree(sd->groups);
6951 }
6952 kfree(sd);
6953}
6954
6955static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6956{
6957 call_rcu(&sd->rcu, free_sched_domain);
6958}
6959
6960static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6961{
6962 for (; sd; sd = sd->parent)
6963 destroy_sched_domain(sd, cpu);
6964}
6965
6411/* 6966/*
6412 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6967 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6413 * hold the hotplug lock. 6968 * hold the hotplug lock.
@@ -6418,9 +6973,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6418 struct rq *rq = cpu_rq(cpu); 6973 struct rq *rq = cpu_rq(cpu);
6419 struct sched_domain *tmp; 6974 struct sched_domain *tmp;
6420 6975
6421 for (tmp = sd; tmp; tmp = tmp->parent)
6422 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6423
6424 /* Remove the sched domains which do not contribute to scheduling. */ 6976 /* Remove the sched domains which do not contribute to scheduling. */
6425 for (tmp = sd; tmp; ) { 6977 for (tmp = sd; tmp; ) {
6426 struct sched_domain *parent = tmp->parent; 6978 struct sched_domain *parent = tmp->parent;
@@ -6431,12 +6983,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6431 tmp->parent = parent->parent; 6983 tmp->parent = parent->parent;
6432 if (parent->parent) 6984 if (parent->parent)
6433 parent->parent->child = tmp; 6985 parent->parent->child = tmp;
6986 destroy_sched_domain(parent, cpu);
6434 } else 6987 } else
6435 tmp = tmp->parent; 6988 tmp = tmp->parent;
6436 } 6989 }
6437 6990
6438 if (sd && sd_degenerate(sd)) { 6991 if (sd && sd_degenerate(sd)) {
6992 tmp = sd;
6439 sd = sd->parent; 6993 sd = sd->parent;
6994 destroy_sched_domain(tmp, cpu);
6440 if (sd) 6995 if (sd)
6441 sd->child = NULL; 6996 sd->child = NULL;
6442 } 6997 }
@@ -6444,7 +6999,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6444 sched_domain_debug(sd, cpu); 6999 sched_domain_debug(sd, cpu);
6445 7000
6446 rq_attach_root(rq, rd); 7001 rq_attach_root(rq, rd);
7002 tmp = rq->sd;
6447 rcu_assign_pointer(rq->sd, sd); 7003 rcu_assign_pointer(rq->sd, sd);
7004 destroy_sched_domains(tmp, cpu);
6448} 7005}
6449 7006
6450/* cpus with isolated domains */ 7007/* cpus with isolated domains */
@@ -6460,56 +7017,6 @@ static int __init isolated_cpu_setup(char *str)
6460 7017
6461__setup("isolcpus=", isolated_cpu_setup); 7018__setup("isolcpus=", isolated_cpu_setup);
6462 7019
6463/*
6464 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6465 * to a function which identifies what group(along with sched group) a CPU
6466 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6467 * (due to the fact that we keep track of groups covered with a struct cpumask).
6468 *
6469 * init_sched_build_groups will build a circular linked list of the groups
6470 * covered by the given span, and will set each group's ->cpumask correctly,
6471 * and ->cpu_power to 0.
6472 */
6473static void
6474init_sched_build_groups(const struct cpumask *span,
6475 const struct cpumask *cpu_map,
6476 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6477 struct sched_group **sg,
6478 struct cpumask *tmpmask),
6479 struct cpumask *covered, struct cpumask *tmpmask)
6480{
6481 struct sched_group *first = NULL, *last = NULL;
6482 int i;
6483
6484 cpumask_clear(covered);
6485
6486 for_each_cpu(i, span) {
6487 struct sched_group *sg;
6488 int group = group_fn(i, cpu_map, &sg, tmpmask);
6489 int j;
6490
6491 if (cpumask_test_cpu(i, covered))
6492 continue;
6493
6494 cpumask_clear(sched_group_cpus(sg));
6495 sg->cpu_power = 0;
6496
6497 for_each_cpu(j, span) {
6498 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6499 continue;
6500
6501 cpumask_set_cpu(j, covered);
6502 cpumask_set_cpu(j, sched_group_cpus(sg));
6503 }
6504 if (!first)
6505 first = sg;
6506 if (last)
6507 last->next = sg;
6508 last = sg;
6509 }
6510 last->next = first;
6511}
6512
6513#define SD_NODES_PER_DOMAIN 16 7020#define SD_NODES_PER_DOMAIN 16
6514 7021
6515#ifdef CONFIG_NUMA 7022#ifdef CONFIG_NUMA
@@ -6526,7 +7033,7 @@ init_sched_build_groups(const struct cpumask *span,
6526 */ 7033 */
6527static int find_next_best_node(int node, nodemask_t *used_nodes) 7034static int find_next_best_node(int node, nodemask_t *used_nodes)
6528{ 7035{
6529 int i, n, val, min_val, best_node = 0; 7036 int i, n, val, min_val, best_node = -1;
6530 7037
6531 min_val = INT_MAX; 7038 min_val = INT_MAX;
6532 7039
@@ -6550,7 +7057,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6550 } 7057 }
6551 } 7058 }
6552 7059
6553 node_set(best_node, *used_nodes); 7060 if (best_node != -1)
7061 node_set(best_node, *used_nodes);
6554 return best_node; 7062 return best_node;
6555} 7063}
6556 7064
@@ -6576,293 +7084,197 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6576 7084
6577 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7085 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6578 int next_node = find_next_best_node(node, &used_nodes); 7086 int next_node = find_next_best_node(node, &used_nodes);
6579 7087 if (next_node < 0)
7088 break;
6580 cpumask_or(span, span, cpumask_of_node(next_node)); 7089 cpumask_or(span, span, cpumask_of_node(next_node));
6581 } 7090 }
6582} 7091}
7092
7093static const struct cpumask *cpu_node_mask(int cpu)
7094{
7095 lockdep_assert_held(&sched_domains_mutex);
7096
7097 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
7098
7099 return sched_domains_tmpmask;
7100}
7101
7102static const struct cpumask *cpu_allnodes_mask(int cpu)
7103{
7104 return cpu_possible_mask;
7105}
6583#endif /* CONFIG_NUMA */ 7106#endif /* CONFIG_NUMA */
6584 7107
6585int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7108static const struct cpumask *cpu_cpu_mask(int cpu)
7109{
7110 return cpumask_of_node(cpu_to_node(cpu));
7111}
6586 7112
6587/* 7113int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6588 * The cpus mask in sched_group and sched_domain hangs off the end.
6589 *
6590 * ( See the the comments in include/linux/sched.h:struct sched_group
6591 * and struct sched_domain. )
6592 */
6593struct static_sched_group {
6594 struct sched_group sg;
6595 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6596};
6597 7114
6598struct static_sched_domain { 7115struct sd_data {
6599 struct sched_domain sd; 7116 struct sched_domain **__percpu sd;
6600 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 7117 struct sched_group **__percpu sg;
7118 struct sched_group_power **__percpu sgp;
6601}; 7119};
6602 7120
6603struct s_data { 7121struct s_data {
6604#ifdef CONFIG_NUMA 7122 struct sched_domain ** __percpu sd;
6605 int sd_allnodes;
6606 cpumask_var_t domainspan;
6607 cpumask_var_t covered;
6608 cpumask_var_t notcovered;
6609#endif
6610 cpumask_var_t nodemask;
6611 cpumask_var_t this_sibling_map;
6612 cpumask_var_t this_core_map;
6613 cpumask_var_t send_covered;
6614 cpumask_var_t tmpmask;
6615 struct sched_group **sched_group_nodes;
6616 struct root_domain *rd; 7123 struct root_domain *rd;
6617}; 7124};
6618 7125
6619enum s_alloc { 7126enum s_alloc {
6620 sa_sched_groups = 0,
6621 sa_rootdomain, 7127 sa_rootdomain,
6622 sa_tmpmask, 7128 sa_sd,
6623 sa_send_covered, 7129 sa_sd_storage,
6624 sa_this_core_map,
6625 sa_this_sibling_map,
6626 sa_nodemask,
6627 sa_sched_group_nodes,
6628#ifdef CONFIG_NUMA
6629 sa_notcovered,
6630 sa_covered,
6631 sa_domainspan,
6632#endif
6633 sa_none, 7130 sa_none,
6634}; 7131};
6635 7132
6636/* 7133struct sched_domain_topology_level;
6637 * SMT sched-domains:
6638 */
6639#ifdef CONFIG_SCHED_SMT
6640static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6641static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6642 7134
6643static int 7135typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6644cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 7136typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6645 struct sched_group **sg, struct cpumask *unused)
6646{
6647 if (sg)
6648 *sg = &per_cpu(sched_groups, cpu).sg;
6649 return cpu;
6650}
6651#endif /* CONFIG_SCHED_SMT */
6652 7137
6653/* 7138#define SDTL_OVERLAP 0x01
6654 * multi-core sched-domains:
6655 */
6656#ifdef CONFIG_SCHED_MC
6657static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6658static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6659#endif /* CONFIG_SCHED_MC */
6660 7139
6661#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7140struct sched_domain_topology_level {
6662static int 7141 sched_domain_init_f init;
6663cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 7142 sched_domain_mask_f mask;
6664 struct sched_group **sg, struct cpumask *mask) 7143 int flags;
6665{ 7144 struct sd_data data;
6666 int group; 7145};
6667 7146
6668 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6669 group = cpumask_first(mask);
6670 if (sg)
6671 *sg = &per_cpu(sched_group_core, group).sg;
6672 return group;
6673}
6674#elif defined(CONFIG_SCHED_MC)
6675static int 7147static int
6676cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 7148build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6677 struct sched_group **sg, struct cpumask *unused)
6678{ 7149{
6679 if (sg) 7150 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6680 *sg = &per_cpu(sched_group_core, cpu).sg; 7151 const struct cpumask *span = sched_domain_span(sd);
6681 return cpu; 7152 struct cpumask *covered = sched_domains_tmpmask;
6682} 7153 struct sd_data *sdd = sd->private;
6683#endif 7154 struct sched_domain *child;
7155 int i;
6684 7156
6685static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 7157 cpumask_clear(covered);
6686static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
6687 7158
6688static int 7159 for_each_cpu(i, span) {
6689cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 7160 struct cpumask *sg_span;
6690 struct sched_group **sg, struct cpumask *mask)
6691{
6692 int group;
6693#ifdef CONFIG_SCHED_MC
6694 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6695 group = cpumask_first(mask);
6696#elif defined(CONFIG_SCHED_SMT)
6697 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6698 group = cpumask_first(mask);
6699#else
6700 group = cpu;
6701#endif
6702 if (sg)
6703 *sg = &per_cpu(sched_group_phys, group).sg;
6704 return group;
6705}
6706 7161
6707#ifdef CONFIG_NUMA 7162 if (cpumask_test_cpu(i, covered))
6708/* 7163 continue;
6709 * The init_sched_build_groups can't handle what we want to do with node
6710 * groups, so roll our own. Now each node has its own list of groups which
6711 * gets dynamically allocated.
6712 */
6713static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
6714static struct sched_group ***sched_group_nodes_bycpu;
6715 7164
6716static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 7165 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6717static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 7166 GFP_KERNEL, cpu_to_node(i));
6718 7167
6719static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 7168 if (!sg)
6720 struct sched_group **sg, 7169 goto fail;
6721 struct cpumask *nodemask)
6722{
6723 int group;
6724 7170
6725 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 7171 sg_span = sched_group_cpus(sg);
6726 group = cpumask_first(nodemask);
6727 7172
6728 if (sg) 7173 child = *per_cpu_ptr(sdd->sd, i);
6729 *sg = &per_cpu(sched_group_allnodes, group).sg; 7174 if (child->child) {
6730 return group; 7175 child = child->child;
6731} 7176 cpumask_copy(sg_span, sched_domain_span(child));
7177 } else
7178 cpumask_set_cpu(i, sg_span);
6732 7179
6733static void init_numa_sched_groups_power(struct sched_group *group_head) 7180 cpumask_or(covered, covered, sg_span);
6734{
6735 struct sched_group *sg = group_head;
6736 int j;
6737 7181
6738 if (!sg) 7182 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
6739 return; 7183 atomic_inc(&sg->sgp->ref);
6740 do {
6741 for_each_cpu(j, sched_group_cpus(sg)) {
6742 struct sched_domain *sd;
6743 7184
6744 sd = &per_cpu(phys_domains, j).sd; 7185 if (cpumask_test_cpu(cpu, sg_span))
6745 if (j != group_first_cpu(sd->groups)) { 7186 groups = sg;
6746 /*
6747 * Only add "power" once for each
6748 * physical package.
6749 */
6750 continue;
6751 }
6752 7187
6753 sg->cpu_power += sd->groups->cpu_power; 7188 if (!first)
6754 } 7189 first = sg;
6755 sg = sg->next; 7190 if (last)
6756 } while (sg != group_head); 7191 last->next = sg;
7192 last = sg;
7193 last->next = first;
7194 }
7195 sd->groups = groups;
7196
7197 return 0;
7198
7199fail:
7200 free_sched_groups(first, 0);
7201
7202 return -ENOMEM;
6757} 7203}
6758 7204
6759static int build_numa_sched_groups(struct s_data *d, 7205static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6760 const struct cpumask *cpu_map, int num)
6761{ 7206{
6762 struct sched_domain *sd; 7207 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6763 struct sched_group *sg, *prev; 7208 struct sched_domain *child = sd->child;
6764 int n, j;
6765 7209
6766 cpumask_clear(d->covered); 7210 if (child)
6767 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 7211 cpu = cpumask_first(sched_domain_span(child));
6768 if (cpumask_empty(d->nodemask)) { 7212
6769 d->sched_group_nodes[num] = NULL; 7213 if (sg) {
6770 goto out; 7214 *sg = *per_cpu_ptr(sdd->sg, cpu);
7215 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7216 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
6771 } 7217 }
6772 7218
6773 sched_domain_node_span(num, d->domainspan); 7219 return cpu;
6774 cpumask_and(d->domainspan, d->domainspan, cpu_map); 7220}
6775 7221
6776 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7222/*
6777 GFP_KERNEL, num); 7223 * build_sched_groups will build a circular linked list of the groups
6778 if (!sg) { 7224 * covered by the given span, and will set each group's ->cpumask correctly,
6779 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 7225 * and ->cpu_power to 0.
6780 num); 7226 *
6781 return -ENOMEM; 7227 * Assumes the sched_domain tree is fully constructed
6782 } 7228 */
6783 d->sched_group_nodes[num] = sg; 7229static int
7230build_sched_groups(struct sched_domain *sd, int cpu)
7231{
7232 struct sched_group *first = NULL, *last = NULL;
7233 struct sd_data *sdd = sd->private;
7234 const struct cpumask *span = sched_domain_span(sd);
7235 struct cpumask *covered;
7236 int i;
6784 7237
6785 for_each_cpu(j, d->nodemask) { 7238 get_group(cpu, sdd, &sd->groups);
6786 sd = &per_cpu(node_domains, j).sd; 7239 atomic_inc(&sd->groups->ref);
6787 sd->groups = sg;
6788 }
6789 7240
6790 sg->cpu_power = 0; 7241 if (cpu != cpumask_first(sched_domain_span(sd)))
6791 cpumask_copy(sched_group_cpus(sg), d->nodemask); 7242 return 0;
6792 sg->next = sg;
6793 cpumask_or(d->covered, d->covered, d->nodemask);
6794 7243
6795 prev = sg; 7244 lockdep_assert_held(&sched_domains_mutex);
6796 for (j = 0; j < nr_node_ids; j++) { 7245 covered = sched_domains_tmpmask;
6797 n = (num + j) % nr_node_ids;
6798 cpumask_complement(d->notcovered, d->covered);
6799 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6800 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6801 if (cpumask_empty(d->tmpmask))
6802 break;
6803 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6804 if (cpumask_empty(d->tmpmask))
6805 continue;
6806 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6807 GFP_KERNEL, num);
6808 if (!sg) {
6809 printk(KERN_WARNING
6810 "Can not alloc domain group for node %d\n", j);
6811 return -ENOMEM;
6812 }
6813 sg->cpu_power = 0;
6814 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6815 sg->next = prev->next;
6816 cpumask_or(d->covered, d->covered, d->tmpmask);
6817 prev->next = sg;
6818 prev = sg;
6819 }
6820out:
6821 return 0;
6822}
6823#endif /* CONFIG_NUMA */
6824 7246
6825#ifdef CONFIG_NUMA 7247 cpumask_clear(covered);
6826/* Free memory allocated for various sched_group structures */
6827static void free_sched_groups(const struct cpumask *cpu_map,
6828 struct cpumask *nodemask)
6829{
6830 int cpu, i;
6831 7248
6832 for_each_cpu(cpu, cpu_map) { 7249 for_each_cpu(i, span) {
6833 struct sched_group **sched_group_nodes 7250 struct sched_group *sg;
6834 = sched_group_nodes_bycpu[cpu]; 7251 int group = get_group(i, sdd, &sg);
7252 int j;
6835 7253
6836 if (!sched_group_nodes) 7254 if (cpumask_test_cpu(i, covered))
6837 continue; 7255 continue;
6838 7256
6839 for (i = 0; i < nr_node_ids; i++) { 7257 cpumask_clear(sched_group_cpus(sg));
6840 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7258 sg->sgp->power = 0;
6841 7259
6842 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7260 for_each_cpu(j, span) {
6843 if (cpumask_empty(nodemask)) 7261 if (get_group(j, sdd, NULL) != group)
6844 continue; 7262 continue;
6845 7263
6846 if (sg == NULL) 7264 cpumask_set_cpu(j, covered);
6847 continue; 7265 cpumask_set_cpu(j, sched_group_cpus(sg));
6848 sg = sg->next;
6849next_sg:
6850 oldsg = sg;
6851 sg = sg->next;
6852 kfree(oldsg);
6853 if (oldsg != sched_group_nodes[i])
6854 goto next_sg;
6855 } 7266 }
6856 kfree(sched_group_nodes); 7267
6857 sched_group_nodes_bycpu[cpu] = NULL; 7268 if (!first)
7269 first = sg;
7270 if (last)
7271 last->next = sg;
7272 last = sg;
6858 } 7273 }
7274 last->next = first;
7275
7276 return 0;
6859} 7277}
6860#else /* !CONFIG_NUMA */
6861static void free_sched_groups(const struct cpumask *cpu_map,
6862 struct cpumask *nodemask)
6863{
6864}
6865#endif /* CONFIG_NUMA */
6866 7278
6867/* 7279/*
6868 * Initialize sched groups cpu_power. 7280 * Initialize sched groups cpu_power.
@@ -6876,46 +7288,19 @@ static void free_sched_groups(const struct cpumask *cpu_map,
6876 */ 7288 */
6877static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7289static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6878{ 7290{
6879 struct sched_domain *child; 7291 struct sched_group *sg = sd->groups;
6880 struct sched_group *group;
6881 long power;
6882 int weight;
6883
6884 WARN_ON(!sd || !sd->groups);
6885
6886 if (cpu != group_first_cpu(sd->groups))
6887 return;
6888 7292
6889 child = sd->child; 7293 WARN_ON(!sd || !sg);
6890 7294
6891 sd->groups->cpu_power = 0; 7295 do {
7296 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7297 sg = sg->next;
7298 } while (sg != sd->groups);
6892 7299
6893 if (!child) { 7300 if (cpu != group_first_cpu(sg))
6894 power = SCHED_LOAD_SCALE;
6895 weight = cpumask_weight(sched_domain_span(sd));
6896 /*
6897 * SMT siblings share the power of a single core.
6898 * Usually multiple threads get a better yield out of
6899 * that one core than a single thread would have,
6900 * reflect that in sd->smt_gain.
6901 */
6902 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6903 power *= sd->smt_gain;
6904 power /= weight;
6905 power >>= SCHED_LOAD_SHIFT;
6906 }
6907 sd->groups->cpu_power += power;
6908 return; 7301 return;
6909 }
6910 7302
6911 /* 7303 update_group_power(sd, cpu);
6912 * Add cpu_power of each child group to this groups cpu_power.
6913 */
6914 group = child->groups;
6915 do {
6916 sd->groups->cpu_power += group->cpu_power;
6917 group = group->next;
6918 } while (group != child->groups);
6919} 7304}
6920 7305
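
The reworked init_sched_groups_power() above now walks sd->groups as a circular singly-linked list, using a do/while that stops once it returns to the starting group. A minimal standalone sketch of that traversal shape; struct group, make_ring() and the weights are made-up stand-ins, and allocation checks are omitted.

#include <stdio.h>
#include <stdlib.h>

struct group {
        int weight;                     /* stands in for sg->group_weight */
        struct group *next;             /* circular: last->next == first */
};

static struct group *make_ring(int n)
{
        struct group *first = NULL, *last = NULL;

        for (int i = 0; i < n; i++) {
                struct group *g = malloc(sizeof(*g));

                g->weight = i + 1;
                g->next = NULL;
                if (!first)
                        first = g;
                else
                        last->next = g;
                last = g;
        }
        last->next = first;             /* close the ring */
        return first;
}

int main(void)
{
        struct group *start = make_ring(4);
        struct group *g = start;

        /* Same do/while shape as the weight pass added above. */
        do {
                printf("group weight %d\n", g->weight);
                g = g->next;
        } while (g != start);

        g = start->next;                /* tear the ring down */
        while (g != start) {
                struct group *next = g->next;

                free(g);
                g = next;
        }
        free(start);
        return 0;
}
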
6921/* 7306/*
@@ -6929,15 +7314,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6929# define SD_INIT_NAME(sd, type) do { } while (0) 7314# define SD_INIT_NAME(sd, type) do { } while (0)
6930#endif 7315#endif
6931 7316
6932#define SD_INIT(sd, type) sd_init_##type(sd) 7317#define SD_INIT_FUNC(type) \
6933 7318static noinline struct sched_domain * \
6934#define SD_INIT_FUNC(type) \ 7319sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6935static noinline void sd_init_##type(struct sched_domain *sd) \ 7320{ \
6936{ \ 7321 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6937 memset(sd, 0, sizeof(*sd)); \ 7322 *sd = SD_##type##_INIT; \
6938 *sd = SD_##type##_INIT; \ 7323 SD_INIT_NAME(sd, type); \
6939 sd->level = SD_LV_##type; \ 7324 sd->private = &tl->data; \
6940 SD_INIT_NAME(sd, type); \ 7325 return sd; \
6941} 7326}
6942 7327
6943SD_INIT_FUNC(CPU) 7328SD_INIT_FUNC(CPU)
@@ -6951,15 +7336,19 @@ SD_INIT_FUNC(CPU)
6951#ifdef CONFIG_SCHED_MC 7336#ifdef CONFIG_SCHED_MC
6952 SD_INIT_FUNC(MC) 7337 SD_INIT_FUNC(MC)
6953#endif 7338#endif
7339#ifdef CONFIG_SCHED_BOOK
7340 SD_INIT_FUNC(BOOK)
7341#endif
6954 7342
6955static int default_relax_domain_level = -1; 7343static int default_relax_domain_level = -1;
7344int sched_domain_level_max;
6956 7345
6957static int __init setup_relax_domain_level(char *str) 7346static int __init setup_relax_domain_level(char *str)
6958{ 7347{
6959 unsigned long val; 7348 unsigned long val;
6960 7349
6961 val = simple_strtoul(str, NULL, 0); 7350 val = simple_strtoul(str, NULL, 0);
6962 if (val < SD_LV_MAX) 7351 if (val < sched_domain_level_max)
6963 default_relax_domain_level = val; 7352 default_relax_domain_level = val;
6964 7353
6965 return 1; 7354 return 1;
@@ -6987,35 +7376,20 @@ static void set_domain_attribute(struct sched_domain *sd,
6987 } 7376 }
6988} 7377}
6989 7378
7379static void __sdt_free(const struct cpumask *cpu_map);
7380static int __sdt_alloc(const struct cpumask *cpu_map);
7381
6990static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7382static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6991 const struct cpumask *cpu_map) 7383 const struct cpumask *cpu_map)
6992{ 7384{
6993 switch (what) { 7385 switch (what) {
6994 case sa_sched_groups:
6995 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
6996 d->sched_group_nodes = NULL;
6997 case sa_rootdomain: 7386 case sa_rootdomain:
6998 free_rootdomain(d->rd); /* fall through */ 7387 if (!atomic_read(&d->rd->refcount))
6999 case sa_tmpmask: 7388 free_rootdomain(&d->rd->rcu); /* fall through */
7000 free_cpumask_var(d->tmpmask); /* fall through */ 7389 case sa_sd:
7001 case sa_send_covered: 7390 free_percpu(d->sd); /* fall through */
7002 free_cpumask_var(d->send_covered); /* fall through */ 7391 case sa_sd_storage:
7003 case sa_this_core_map: 7392 __sdt_free(cpu_map); /* fall through */
7004 free_cpumask_var(d->this_core_map); /* fall through */
7005 case sa_this_sibling_map:
7006 free_cpumask_var(d->this_sibling_map); /* fall through */
7007 case sa_nodemask:
7008 free_cpumask_var(d->nodemask); /* fall through */
7009 case sa_sched_group_nodes:
7010#ifdef CONFIG_NUMA
7011 kfree(d->sched_group_nodes); /* fall through */
7012 case sa_notcovered:
7013 free_cpumask_var(d->notcovered); /* fall through */
7014 case sa_covered:
7015 free_cpumask_var(d->covered); /* fall through */
7016 case sa_domainspan:
7017 free_cpumask_var(d->domainspan); /* fall through */
7018#endif
7019 case sa_none: 7393 case sa_none:
7020 break; 7394 break;
7021 } 7395 }
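
The slimmed-down __free_domain_allocs() above keeps the long-standing idiom of an enum naming how far allocation got, with switch fall-through so each later stage also releases the earlier ones. A standalone sketch of that staged-unwind pattern; the enum values, struct state and do_allocs() here are illustrative, not the kernel's.

#include <stdio.h>
#include <stdlib.h>

enum s_alloc { sa_none, sa_a, sa_b, sa_all };

struct state { void *a, *b; };

static void free_allocs(struct state *s, enum s_alloc what)
{
        switch (what) {
        case sa_all:
        case sa_b:
                free(s->b);             /* fall through */
        case sa_a:
                free(s->a);             /* fall through */
        case sa_none:
                break;
        }
}

/* Returns the last stage that succeeded. */
static enum s_alloc do_allocs(struct state *s)
{
        s->a = malloc(16);
        if (!s->a)
                return sa_none;
        s->b = malloc(16);
        if (!s->b)
                return sa_a;
        return sa_all;
}

int main(void)
{
        struct state s = { 0 };
        enum s_alloc got = do_allocs(&s);

        if (got != sa_all)
                printf("partial allocation, unwinding\n");
        free_allocs(&s, got);
        return 0;
}

The pay-off is a single error path: callers just hand back whatever stage they reached, as __visit_domain_allocation_hell() does above.
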
@@ -7024,270 +7398,233 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7024static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7398static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7025 const struct cpumask *cpu_map) 7399 const struct cpumask *cpu_map)
7026{ 7400{
7027#ifdef CONFIG_NUMA 7401 memset(d, 0, sizeof(*d));
7028 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7402
7029 return sa_none; 7403 if (__sdt_alloc(cpu_map))
7030 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7404 return sa_sd_storage;
7031 return sa_domainspan; 7405 d->sd = alloc_percpu(struct sched_domain *);
7032 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7406 if (!d->sd)
7033 return sa_covered; 7407 return sa_sd_storage;
7034 /* Allocate the per-node list of sched groups */
7035 d->sched_group_nodes = kcalloc(nr_node_ids,
7036 sizeof(struct sched_group *), GFP_KERNEL);
7037 if (!d->sched_group_nodes) {
7038 printk(KERN_WARNING "Can not alloc sched group node list\n");
7039 return sa_notcovered;
7040 }
7041 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7042#endif
7043 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7044 return sa_sched_group_nodes;
7045 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7046 return sa_nodemask;
7047 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7048 return sa_this_sibling_map;
7049 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7050 return sa_this_core_map;
7051 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7052 return sa_send_covered;
7053 d->rd = alloc_rootdomain(); 7408 d->rd = alloc_rootdomain();
7054 if (!d->rd) { 7409 if (!d->rd)
7055 printk(KERN_WARNING "Cannot alloc root domain\n"); 7410 return sa_sd;
7056 return sa_tmpmask;
7057 }
7058 return sa_rootdomain; 7411 return sa_rootdomain;
7059} 7412}
7060 7413
7061static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7414/*
7062 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7415 * NULL the sd_data elements we've used to build the sched_domain and
7416 * sched_group structure so that the subsequent __free_domain_allocs()
7417 * will not free the data we're using.
7418 */
7419static void claim_allocations(int cpu, struct sched_domain *sd)
7063{ 7420{
7064 struct sched_domain *sd = NULL; 7421 struct sd_data *sdd = sd->private;
7065#ifdef CONFIG_NUMA
7066 struct sched_domain *parent;
7067
7068 d->sd_allnodes = 0;
7069 if (cpumask_weight(cpu_map) >
7070 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7071 sd = &per_cpu(allnodes_domains, i).sd;
7072 SD_INIT(sd, ALLNODES);
7073 set_domain_attribute(sd, attr);
7074 cpumask_copy(sched_domain_span(sd), cpu_map);
7075 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7076 d->sd_allnodes = 1;
7077 }
7078 parent = sd;
7079
7080 sd = &per_cpu(node_domains, i).sd;
7081 SD_INIT(sd, NODE);
7082 set_domain_attribute(sd, attr);
7083 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7084 sd->parent = parent;
7085 if (parent)
7086 parent->child = sd;
7087 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7088#endif
7089 return sd;
7090}
7091 7422
7092static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7423 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7093 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7424 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7094 struct sched_domain *parent, int i)
7095{
7096 struct sched_domain *sd;
7097 sd = &per_cpu(phys_domains, i).sd;
7098 SD_INIT(sd, CPU);
7099 set_domain_attribute(sd, attr);
7100 cpumask_copy(sched_domain_span(sd), d->nodemask);
7101 sd->parent = parent;
7102 if (parent)
7103 parent->child = sd;
7104 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7105 return sd;
7106}
7107 7425
7108static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7426 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7109 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7427 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7110 struct sched_domain *parent, int i) 7428
7111{ 7429 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7112 struct sched_domain *sd = parent; 7430 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7113#ifdef CONFIG_SCHED_MC
7114 sd = &per_cpu(core_domains, i).sd;
7115 SD_INIT(sd, MC);
7116 set_domain_attribute(sd, attr);
7117 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7118 sd->parent = parent;
7119 parent->child = sd;
7120 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7121#endif
7122 return sd;
7123} 7431}
7124 7432
7125static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7126 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7127 struct sched_domain *parent, int i)
7128{
7129 struct sched_domain *sd = parent;
7130#ifdef CONFIG_SCHED_SMT 7433#ifdef CONFIG_SCHED_SMT
7131 sd = &per_cpu(cpu_domains, i).sd; 7434static const struct cpumask *cpu_smt_mask(int cpu)
7132 SD_INIT(sd, SIBLING); 7435{
7133 set_domain_attribute(sd, attr); 7436 return topology_thread_cpumask(cpu);
7134 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7135 sd->parent = parent;
7136 parent->child = sd;
7137 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7138#endif
7139 return sd;
7140} 7437}
7438#endif
7141 7439
7142static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7440/*
7143 const struct cpumask *cpu_map, int cpu) 7441 * Topology list, bottom-up.
7144{ 7442 */
7145 switch (l) { 7443static struct sched_domain_topology_level default_topology[] = {
7146#ifdef CONFIG_SCHED_SMT 7444#ifdef CONFIG_SCHED_SMT
7147 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7445 { sd_init_SIBLING, cpu_smt_mask, },
7148 cpumask_and(d->this_sibling_map, cpu_map,
7149 topology_thread_cpumask(cpu));
7150 if (cpu == cpumask_first(d->this_sibling_map))
7151 init_sched_build_groups(d->this_sibling_map, cpu_map,
7152 &cpu_to_cpu_group,
7153 d->send_covered, d->tmpmask);
7154 break;
7155#endif 7446#endif
7156#ifdef CONFIG_SCHED_MC 7447#ifdef CONFIG_SCHED_MC
7157 case SD_LV_MC: /* set up multi-core groups */ 7448 { sd_init_MC, cpu_coregroup_mask, },
7158 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7159 if (cpu == cpumask_first(d->this_core_map))
7160 init_sched_build_groups(d->this_core_map, cpu_map,
7161 &cpu_to_core_group,
7162 d->send_covered, d->tmpmask);
7163 break;
7164#endif 7449#endif
7165 case SD_LV_CPU: /* set up physical groups */ 7450#ifdef CONFIG_SCHED_BOOK
7166 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7451 { sd_init_BOOK, cpu_book_mask, },
7167 if (!cpumask_empty(d->nodemask)) 7452#endif
7168 init_sched_build_groups(d->nodemask, cpu_map, 7453 { sd_init_CPU, cpu_cpu_mask, },
7169 &cpu_to_phys_group,
7170 d->send_covered, d->tmpmask);
7171 break;
7172#ifdef CONFIG_NUMA 7454#ifdef CONFIG_NUMA
7173 case SD_LV_ALLNODES: 7455 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
7174 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7456 { sd_init_ALLNODES, cpu_allnodes_mask, },
7175 d->send_covered, d->tmpmask);
7176 break;
7177#endif 7457#endif
7178 default: 7458 { NULL, },
7179 break; 7459};
7460
7461static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7462
7463static int __sdt_alloc(const struct cpumask *cpu_map)
7464{
7465 struct sched_domain_topology_level *tl;
7466 int j;
7467
7468 for (tl = sched_domain_topology; tl->init; tl++) {
7469 struct sd_data *sdd = &tl->data;
7470
7471 sdd->sd = alloc_percpu(struct sched_domain *);
7472 if (!sdd->sd)
7473 return -ENOMEM;
7474
7475 sdd->sg = alloc_percpu(struct sched_group *);
7476 if (!sdd->sg)
7477 return -ENOMEM;
7478
7479 sdd->sgp = alloc_percpu(struct sched_group_power *);
7480 if (!sdd->sgp)
7481 return -ENOMEM;
7482
7483 for_each_cpu(j, cpu_map) {
7484 struct sched_domain *sd;
7485 struct sched_group *sg;
7486 struct sched_group_power *sgp;
7487
7488 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7489 GFP_KERNEL, cpu_to_node(j));
7490 if (!sd)
7491 return -ENOMEM;
7492
7493 *per_cpu_ptr(sdd->sd, j) = sd;
7494
7495 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7496 GFP_KERNEL, cpu_to_node(j));
7497 if (!sg)
7498 return -ENOMEM;
7499
7500 *per_cpu_ptr(sdd->sg, j) = sg;
7501
7502 sgp = kzalloc_node(sizeof(struct sched_group_power),
7503 GFP_KERNEL, cpu_to_node(j));
7504 if (!sgp)
7505 return -ENOMEM;
7506
7507 *per_cpu_ptr(sdd->sgp, j) = sgp;
7508 }
7509 }
7510
7511 return 0;
7512}
7513
7514static void __sdt_free(const struct cpumask *cpu_map)
7515{
7516 struct sched_domain_topology_level *tl;
7517 int j;
7518
7519 for (tl = sched_domain_topology; tl->init; tl++) {
7520 struct sd_data *sdd = &tl->data;
7521
7522 for_each_cpu(j, cpu_map) {
7523 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7524 if (sd && (sd->flags & SD_OVERLAP))
7525 free_sched_groups(sd->groups, 0);
7526 kfree(*per_cpu_ptr(sdd->sg, j));
7527 kfree(*per_cpu_ptr(sdd->sgp, j));
7528 }
7529 free_percpu(sdd->sd);
7530 free_percpu(sdd->sg);
7531 free_percpu(sdd->sgp);
7532 }
7533}
7534
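
__sdt_alloc()/__sdt_free() above keep one sched_domain, sched_group and sched_group_power pointer per CPU for every topology level. A standalone sketch with a plain array standing in for the alloc_percpu() slots; NCPUS, struct level_data and level_alloc()/level_free() are assumptions for illustration only.

#include <stdlib.h>
#include <string.h>

#define NCPUS 4

struct domain { int level; };

struct level_data {
        struct domain *sd[NCPUS];       /* stands in for the per-cpu sdd->sd slots */
};

static int level_alloc(struct level_data *d)
{
        memset(d, 0, sizeof(*d));
        for (int cpu = 0; cpu < NCPUS; cpu++) {
                d->sd[cpu] = calloc(1, sizeof(struct domain));
                if (!d->sd[cpu])
                        return -1;      /* caller unwinds via level_free() */
        }
        return 0;
}

static void level_free(struct level_data *d)
{
        for (int cpu = 0; cpu < NCPUS; cpu++)
                free(d->sd[cpu]);       /* free(NULL) is a no-op */
}

int main(void)
{
        struct level_data d;

        if (level_alloc(&d) == 0)
                d.sd[0]->level = 1;
        level_free(&d);
        return 0;
}
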
7535struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7536 struct s_data *d, const struct cpumask *cpu_map,
7537 struct sched_domain_attr *attr, struct sched_domain *child,
7538 int cpu)
7539{
7540 struct sched_domain *sd = tl->init(tl, cpu);
7541 if (!sd)
7542 return child;
7543
7544 set_domain_attribute(sd, attr);
7545 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7546 if (child) {
7547 sd->level = child->level + 1;
7548 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7549 child->parent = sd;
7180 } 7550 }
7551 sd->child = child;
7552
7553 return sd;
7181} 7554}
7182 7555
7183/* 7556/*
7184 * Build sched domains for a given set of cpus and attach the sched domains 7557 * Build sched domains for a given set of cpus and attach the sched domains
7185 * to the individual cpus 7558 * to the individual cpus
7186 */ 7559 */
7187static int __build_sched_domains(const struct cpumask *cpu_map, 7560static int build_sched_domains(const struct cpumask *cpu_map,
7188 struct sched_domain_attr *attr) 7561 struct sched_domain_attr *attr)
7189{ 7562{
7190 enum s_alloc alloc_state = sa_none; 7563 enum s_alloc alloc_state = sa_none;
7191 struct s_data d;
7192 struct sched_domain *sd; 7564 struct sched_domain *sd;
7193 int i; 7565 struct s_data d;
7194#ifdef CONFIG_NUMA 7566 int i, ret = -ENOMEM;
7195 d.sd_allnodes = 0;
7196#endif
7197 7567
7198 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7568 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7199 if (alloc_state != sa_rootdomain) 7569 if (alloc_state != sa_rootdomain)
7200 goto error; 7570 goto error;
7201 alloc_state = sa_sched_groups;
7202
7203 /*
7204 * Set up domains for cpus specified by the cpu_map.
7205 */
7206 for_each_cpu(i, cpu_map) {
7207 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7208 cpu_map);
7209
7210 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7211 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7212 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7213 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7214 }
7215 7571
7572 /* Set up domains for cpus specified by the cpu_map. */
7216 for_each_cpu(i, cpu_map) { 7573 for_each_cpu(i, cpu_map) {
7217 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7574 struct sched_domain_topology_level *tl;
7218 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7575
7219 } 7576 sd = NULL;
7220 7577 for (tl = sched_domain_topology; tl->init; tl++) {
7221 /* Set up physical groups */ 7578 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7222 for (i = 0; i < nr_node_ids; i++) 7579 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7223 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7580 sd->flags |= SD_OVERLAP;
7224 7581 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7225#ifdef CONFIG_NUMA 7582 break;
7226 /* Set up node groups */ 7583 }
7227 if (d.sd_allnodes)
7228 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7229 7584
7230 for (i = 0; i < nr_node_ids; i++) 7585 while (sd->child)
7231 if (build_numa_sched_groups(&d, cpu_map, i)) 7586 sd = sd->child;
7232 goto error;
7233#endif
7234 7587
7235 /* Calculate CPU power for physical packages and nodes */ 7588 *per_cpu_ptr(d.sd, i) = sd;
7236#ifdef CONFIG_SCHED_SMT
7237 for_each_cpu(i, cpu_map) {
7238 sd = &per_cpu(cpu_domains, i).sd;
7239 init_sched_groups_power(i, sd);
7240 }
7241#endif
7242#ifdef CONFIG_SCHED_MC
7243 for_each_cpu(i, cpu_map) {
7244 sd = &per_cpu(core_domains, i).sd;
7245 init_sched_groups_power(i, sd);
7246 } 7589 }
7247#endif
7248 7590
7591 /* Build the groups for the domains */
7249 for_each_cpu(i, cpu_map) { 7592 for_each_cpu(i, cpu_map) {
7250 sd = &per_cpu(phys_domains, i).sd; 7593 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7251 init_sched_groups_power(i, sd); 7594 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7595 if (sd->flags & SD_OVERLAP) {
7596 if (build_overlap_sched_groups(sd, i))
7597 goto error;
7598 } else {
7599 if (build_sched_groups(sd, i))
7600 goto error;
7601 }
7602 }
7252 } 7603 }
7253 7604
7254#ifdef CONFIG_NUMA 7605 /* Calculate CPU power for physical packages and nodes */
7255 for (i = 0; i < nr_node_ids; i++) 7606 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7256 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7607 if (!cpumask_test_cpu(i, cpu_map))
7257 7608 continue;
7258 if (d.sd_allnodes) {
7259 struct sched_group *sg;
7260 7609
7261 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7610 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7262 d.tmpmask); 7611 claim_allocations(i, sd);
7263 init_numa_sched_groups_power(sg); 7612 init_sched_groups_power(i, sd);
7613 }
7264 } 7614 }
7265#endif
7266 7615
7267 /* Attach the domains */ 7616 /* Attach the domains */
7617 rcu_read_lock();
7268 for_each_cpu(i, cpu_map) { 7618 for_each_cpu(i, cpu_map) {
7269#ifdef CONFIG_SCHED_SMT 7619 sd = *per_cpu_ptr(d.sd, i);
7270 sd = &per_cpu(cpu_domains, i).sd;
7271#elif defined(CONFIG_SCHED_MC)
7272 sd = &per_cpu(core_domains, i).sd;
7273#else
7274 sd = &per_cpu(phys_domains, i).sd;
7275#endif
7276 cpu_attach_domain(sd, d.rd, i); 7620 cpu_attach_domain(sd, d.rd, i);
7277 } 7621 }
7622 rcu_read_unlock();
7278 7623
7279 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7624 ret = 0;
7280 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7281 return 0;
7282
7283error: 7625error:
7284 __free_domain_allocs(&d, alloc_state, cpu_map); 7626 __free_domain_allocs(&d, alloc_state, cpu_map);
7285 return -ENOMEM; 7627 return ret;
7286}
7287
7288static int build_sched_domains(const struct cpumask *cpu_map)
7289{
7290 return __build_sched_domains(cpu_map, NULL);
7291} 7628}
7292 7629
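
The heart of the rewrite: build_sched_domains() above no longer hard-codes SMT/MC/CPU/NUMA helper functions but walks the NULL-terminated, bottom-up sched_domain_topology table, chaining one domain per level through child/parent pointers. A standalone C sketch of that table walk; struct domain, struct topo_level and the level names are simplified stand-ins, not the kernel types.

#include <stdio.h>
#include <stdlib.h>

struct domain {
        const char *name;
        struct domain *parent, *child;
        int level;
};

struct topo_level {
        const char *name;               /* NULL entry terminates the table */
};

static const struct topo_level topology[] = {
        { "SMT" }, { "MC" }, { "CPU" }, { NULL },
};

int main(void)
{
        struct domain *child = NULL, *sd = NULL;

        /* Bottom-up walk: each level's domain becomes the next one's child. */
        for (const struct topo_level *tl = topology; tl->name; tl++) {
                sd = calloc(1, sizeof(*sd));
                sd->name = tl->name;
                sd->child = child;
                if (child) {
                        sd->level = child->level + 1;
                        child->parent = sd;
                }
                child = sd;
        }

        /* Walk back down, the way cpu_attach_domain() would see the chain. */
        for (struct domain *d = sd; d; d = d->child)
                printf("level %d: %s\n", d->level, d->name);

        while (sd) {                    /* tear down */
                struct domain *c = sd->child;

                free(sd);
                sd = c;
        }
        return 0;
}

Adding a level, as the new CONFIG_SCHED_BOOK entry in default_topology does, then only means adding a row to the table.
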
7293static cpumask_var_t *doms_cur; /* current sched domains */ 7630static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7342,7 +7679,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7342 * For now this just excludes isolated cpus, but could be used to 7679 * For now this just excludes isolated cpus, but could be used to
7343 * exclude other special cases in the future. 7680 * exclude other special cases in the future.
7344 */ 7681 */
7345static int arch_init_sched_domains(const struct cpumask *cpu_map) 7682static int init_sched_domains(const struct cpumask *cpu_map)
7346{ 7683{
7347 int err; 7684 int err;
7348 7685
@@ -7353,32 +7690,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7353 doms_cur = &fallback_doms; 7690 doms_cur = &fallback_doms;
7354 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7691 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7355 dattr_cur = NULL; 7692 dattr_cur = NULL;
7356 err = build_sched_domains(doms_cur[0]); 7693 err = build_sched_domains(doms_cur[0], NULL);
7357 register_sched_domain_sysctl(); 7694 register_sched_domain_sysctl();
7358 7695
7359 return err; 7696 return err;
7360} 7697}
7361 7698
7362static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7363 struct cpumask *tmpmask)
7364{
7365 free_sched_groups(cpu_map, tmpmask);
7366}
7367
7368/* 7699/*
7369 * Detach sched domains from a group of cpus specified in cpu_map 7700 * Detach sched domains from a group of cpus specified in cpu_map
7370 * These cpus will now be attached to the NULL domain 7701 * These cpus will now be attached to the NULL domain
7371 */ 7702 */
7372static void detach_destroy_domains(const struct cpumask *cpu_map) 7703static void detach_destroy_domains(const struct cpumask *cpu_map)
7373{ 7704{
7374 /* Save because hotplug lock held. */
7375 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7376 int i; 7705 int i;
7377 7706
7707 rcu_read_lock();
7378 for_each_cpu(i, cpu_map) 7708 for_each_cpu(i, cpu_map)
7379 cpu_attach_domain(NULL, &def_root_domain, i); 7709 cpu_attach_domain(NULL, &def_root_domain, i);
7380 synchronize_sched(); 7710 rcu_read_unlock();
7381 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7382} 7711}
7383 7712
7384/* handle null as "default" */ 7713/* handle null as "default" */
@@ -7467,8 +7796,7 @@ match1:
7467 goto match2; 7796 goto match2;
7468 } 7797 }
7469 /* no match - add a new doms_new */ 7798 /* no match - add a new doms_new */
7470 __build_sched_domains(doms_new[i], 7799 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7471 dattr_new ? dattr_new + i : NULL);
7472match2: 7800match2:
7473 ; 7801 ;
7474 } 7802 }
@@ -7487,7 +7815,7 @@ match2:
7487} 7815}
7488 7816
7489#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7817#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7490static void arch_reinit_sched_domains(void) 7818static void reinit_sched_domains(void)
7491{ 7819{
7492 get_online_cpus(); 7820 get_online_cpus();
7493 7821
@@ -7520,7 +7848,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7520 else 7848 else
7521 sched_mc_power_savings = level; 7849 sched_mc_power_savings = level;
7522 7850
7523 arch_reinit_sched_domains(); 7851 reinit_sched_domains();
7524 7852
7525 return count; 7853 return count;
7526} 7854}
@@ -7639,14 +7967,9 @@ void __init sched_init_smp(void)
7639 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7640 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7641 7969
7642#if defined(CONFIG_NUMA)
7643 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7644 GFP_KERNEL);
7645 BUG_ON(sched_group_nodes_bycpu == NULL);
7646#endif
7647 get_online_cpus(); 7970 get_online_cpus();
7648 mutex_lock(&sched_domains_mutex); 7971 mutex_lock(&sched_domains_mutex);
7649 arch_init_sched_domains(cpu_active_mask); 7972 init_sched_domains(cpu_active_mask);
7650 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7973 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7651 if (cpumask_empty(non_isolated_cpus)) 7974 if (cpumask_empty(non_isolated_cpus))
7652 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7975 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -7691,8 +8014,15 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7691 INIT_LIST_HEAD(&cfs_rq->tasks); 8014 INIT_LIST_HEAD(&cfs_rq->tasks);
7692#ifdef CONFIG_FAIR_GROUP_SCHED 8015#ifdef CONFIG_FAIR_GROUP_SCHED
7693 cfs_rq->rq = rq; 8016 cfs_rq->rq = rq;
8017 /* allow initial update_cfs_load() to truncate */
8018#ifdef CONFIG_SMP
8019 cfs_rq->load_stamp = 1;
8020#endif
7694#endif 8021#endif
7695 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8022 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8023#ifndef CONFIG_64BIT
8024 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8025#endif
7696} 8026}
7697 8027
7698static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 8028static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -7733,18 +8063,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7733 8063
7734#ifdef CONFIG_FAIR_GROUP_SCHED 8064#ifdef CONFIG_FAIR_GROUP_SCHED
7735static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 8065static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7736 struct sched_entity *se, int cpu, int add, 8066 struct sched_entity *se, int cpu,
7737 struct sched_entity *parent) 8067 struct sched_entity *parent)
7738{ 8068{
7739 struct rq *rq = cpu_rq(cpu); 8069 struct rq *rq = cpu_rq(cpu);
7740 tg->cfs_rq[cpu] = cfs_rq; 8070 tg->cfs_rq[cpu] = cfs_rq;
7741 init_cfs_rq(cfs_rq, rq); 8071 init_cfs_rq(cfs_rq, rq);
7742 cfs_rq->tg = tg; 8072 cfs_rq->tg = tg;
7743 if (add)
7744 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7745 8073
7746 tg->se[cpu] = se; 8074 tg->se[cpu] = se;
7747 /* se could be NULL for init_task_group */ 8075 /* se could be NULL for root_task_group */
7748 if (!se) 8076 if (!se)
7749 return; 8077 return;
7750 8078
@@ -7754,15 +8082,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7754 se->cfs_rq = parent->my_q; 8082 se->cfs_rq = parent->my_q;
7755 8083
7756 se->my_q = cfs_rq; 8084 se->my_q = cfs_rq;
7757 se->load.weight = tg->shares; 8085 update_load_set(&se->load, 0);
7758 se->load.inv_weight = 0;
7759 se->parent = parent; 8086 se->parent = parent;
7760} 8087}
7761#endif 8088#endif
7762 8089
7763#ifdef CONFIG_RT_GROUP_SCHED 8090#ifdef CONFIG_RT_GROUP_SCHED
7764static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 8091static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7765 struct sched_rt_entity *rt_se, int cpu, int add, 8092 struct sched_rt_entity *rt_se, int cpu,
7766 struct sched_rt_entity *parent) 8093 struct sched_rt_entity *parent)
7767{ 8094{
7768 struct rq *rq = cpu_rq(cpu); 8095 struct rq *rq = cpu_rq(cpu);
@@ -7771,8 +8098,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7771 init_rt_rq(rt_rq, rq); 8098 init_rt_rq(rt_rq, rq);
7772 rt_rq->tg = tg; 8099 rt_rq->tg = tg;
7773 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 8100 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7774 if (add)
7775 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7776 8101
7777 tg->rt_se[cpu] = rt_se; 8102 tg->rt_se[cpu] = rt_se;
7778 if (!rt_se) 8103 if (!rt_se)
@@ -7807,18 +8132,18 @@ void __init sched_init(void)
7807 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 8132 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7808 8133
7809#ifdef CONFIG_FAIR_GROUP_SCHED 8134#ifdef CONFIG_FAIR_GROUP_SCHED
7810 init_task_group.se = (struct sched_entity **)ptr; 8135 root_task_group.se = (struct sched_entity **)ptr;
7811 ptr += nr_cpu_ids * sizeof(void **); 8136 ptr += nr_cpu_ids * sizeof(void **);
7812 8137
7813 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 8138 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7814 ptr += nr_cpu_ids * sizeof(void **); 8139 ptr += nr_cpu_ids * sizeof(void **);
7815 8140
7816#endif /* CONFIG_FAIR_GROUP_SCHED */ 8141#endif /* CONFIG_FAIR_GROUP_SCHED */
7817#ifdef CONFIG_RT_GROUP_SCHED 8142#ifdef CONFIG_RT_GROUP_SCHED
7818 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8143 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7819 ptr += nr_cpu_ids * sizeof(void **); 8144 ptr += nr_cpu_ids * sizeof(void **);
7820 8145
7821 init_task_group.rt_rq = (struct rt_rq **)ptr; 8146 root_task_group.rt_rq = (struct rt_rq **)ptr;
7822 ptr += nr_cpu_ids * sizeof(void **); 8147 ptr += nr_cpu_ids * sizeof(void **);
7823 8148
7824#endif /* CONFIG_RT_GROUP_SCHED */ 8149#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7838,20 +8163,16 @@ void __init sched_init(void)
7838 global_rt_period(), global_rt_runtime()); 8163 global_rt_period(), global_rt_runtime());
7839 8164
7840#ifdef CONFIG_RT_GROUP_SCHED 8165#ifdef CONFIG_RT_GROUP_SCHED
7841 init_rt_bandwidth(&init_task_group.rt_bandwidth, 8166 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7842 global_rt_period(), global_rt_runtime()); 8167 global_rt_period(), global_rt_runtime());
7843#endif /* CONFIG_RT_GROUP_SCHED */ 8168#endif /* CONFIG_RT_GROUP_SCHED */
7844 8169
7845#ifdef CONFIG_CGROUP_SCHED 8170#ifdef CONFIG_CGROUP_SCHED
7846 list_add(&init_task_group.list, &task_groups); 8171 list_add(&root_task_group.list, &task_groups);
7847 INIT_LIST_HEAD(&init_task_group.children); 8172 INIT_LIST_HEAD(&root_task_group.children);
7848 8173 autogroup_init(&init_task);
7849#endif /* CONFIG_CGROUP_SCHED */ 8174#endif /* CONFIG_CGROUP_SCHED */
7850 8175
7851#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7852 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7853 __alignof__(unsigned long));
7854#endif
7855 for_each_possible_cpu(i) { 8176 for_each_possible_cpu(i) {
7856 struct rq *rq; 8177 struct rq *rq;
7857 8178
@@ -7863,38 +8184,34 @@ void __init sched_init(void)
7863 init_cfs_rq(&rq->cfs, rq); 8184 init_cfs_rq(&rq->cfs, rq);
7864 init_rt_rq(&rq->rt, rq); 8185 init_rt_rq(&rq->rt, rq);
7865#ifdef CONFIG_FAIR_GROUP_SCHED 8186#ifdef CONFIG_FAIR_GROUP_SCHED
7866 init_task_group.shares = init_task_group_load; 8187 root_task_group.shares = root_task_group_load;
7867 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8188 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7868#ifdef CONFIG_CGROUP_SCHED
7869 /* 8189 /*
7870 * How much cpu bandwidth does init_task_group get? 8190 * How much cpu bandwidth does root_task_group get?
7871 * 8191 *
7872 * In case of task-groups formed through the cgroup filesystem, it 8192 * In case of task-groups formed through the cgroup filesystem, it
7873 * gets 100% of the cpu resources in the system. This overall 8193 * gets 100% of the cpu resources in the system. This overall
7874 * system cpu resource is divided among the tasks of 8194 * system cpu resource is divided among the tasks of
7875 * init_task_group and its child task-groups in a fair manner, 8195 * root_task_group and its child task-groups in a fair manner,
7876 * based on each entity's (task or task-group's) weight 8196 * based on each entity's (task or task-group's) weight
7877 * (se->load.weight). 8197 * (se->load.weight).
7878 * 8198 *
7879 * In other words, if init_task_group has 10 tasks of weight 8199 * In other words, if root_task_group has 10 tasks of weight
7880 * 1024) and two child groups A0 and A1 (of weight 1024 each), 8200 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7881 * then A0's share of the cpu resource is: 8201 * then A0's share of the cpu resource is:
7882 * 8202 *
7883 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 8203 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7884 * 8204 *
7885 * We achieve this by letting init_task_group's tasks sit 8205 * We achieve this by letting root_task_group's tasks sit
7886 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 8206 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
7887 */ 8207 */
7888 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 8208 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7889#endif
7890#endif /* CONFIG_FAIR_GROUP_SCHED */ 8209#endif /* CONFIG_FAIR_GROUP_SCHED */
7891 8210
7892 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 8211 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7893#ifdef CONFIG_RT_GROUP_SCHED 8212#ifdef CONFIG_RT_GROUP_SCHED
7894 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8213 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7895#ifdef CONFIG_CGROUP_SCHED 8214 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7896 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
7897#endif
7898#endif 8215#endif
7899 8216
7900 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8217 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
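
A quick standalone check of the 8.33% figure quoted in the root_task_group bandwidth comment earlier in this hunk; plain userspace C, not kernel code.

#include <stdio.h>

int main(void)
{
        /* 10 tasks of weight 1024 plus two child groups of weight 1024 each. */
        double total = 10 * 1024 + 1024 + 1024;

        printf("A0 share = %.2f%%\n", 100.0 * 1024 / total);   /* prints 8.33% */
        return 0;
}
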
@@ -7905,7 +8222,7 @@ void __init sched_init(void)
7905#ifdef CONFIG_SMP 8222#ifdef CONFIG_SMP
7906 rq->sd = NULL; 8223 rq->sd = NULL;
7907 rq->rd = NULL; 8224 rq->rd = NULL;
7908 rq->cpu_power = SCHED_LOAD_SCALE; 8225 rq->cpu_power = SCHED_POWER_SCALE;
7909 rq->post_schedule = 0; 8226 rq->post_schedule = 0;
7910 rq->active_balance = 0; 8227 rq->active_balance = 0;
7911 rq->next_balance = jiffies; 8228 rq->next_balance = jiffies;
@@ -7962,6 +8279,7 @@ void __init sched_init(void)
7962 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8279 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
7963 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8280 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7964#ifdef CONFIG_SMP 8281#ifdef CONFIG_SMP
8282 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7965#ifdef CONFIG_NO_HZ 8283#ifdef CONFIG_NO_HZ
7966 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8284 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7967 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8285 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -7974,8 +8292,6 @@ void __init sched_init(void)
7974 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8292 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7975#endif /* SMP */ 8293#endif /* SMP */
7976 8294
7977 perf_event_init();
7978
7979 scheduler_running = 1; 8295 scheduler_running = 1;
7980} 8296}
7981 8297
@@ -7984,7 +8300,7 @@ static inline int preempt_count_equals(int preempt_offset)
7984{ 8300{
7985 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8301 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7986 8302
7987 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 8303 return (nested == preempt_offset);
7988} 8304}
7989 8305
7990void __might_sleep(const char *file, int line, int preempt_offset) 8306void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8019,9 +8335,11 @@ EXPORT_SYMBOL(__might_sleep);
8019#ifdef CONFIG_MAGIC_SYSRQ 8335#ifdef CONFIG_MAGIC_SYSRQ
8020static void normalize_task(struct rq *rq, struct task_struct *p) 8336static void normalize_task(struct rq *rq, struct task_struct *p)
8021{ 8337{
8338 const struct sched_class *prev_class = p->sched_class;
8339 int old_prio = p->prio;
8022 int on_rq; 8340 int on_rq;
8023 8341
8024 on_rq = p->se.on_rq; 8342 on_rq = p->on_rq;
8025 if (on_rq) 8343 if (on_rq)
8026 deactivate_task(rq, p, 0); 8344 deactivate_task(rq, p, 0);
8027 __setscheduler(rq, p, SCHED_NORMAL, 0); 8345 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8029,6 +8347,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8029 activate_task(rq, p, 0); 8347 activate_task(rq, p, 0);
8030 resched_task(rq->curr); 8348 resched_task(rq->curr);
8031 } 8349 }
8350
8351 check_class_changed(rq, p, prev_class, old_prio);
8032} 8352}
8033 8353
8034void normalize_rt_tasks(void) 8354void normalize_rt_tasks(void)
@@ -8144,7 +8464,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8144{ 8464{
8145 struct cfs_rq *cfs_rq; 8465 struct cfs_rq *cfs_rq;
8146 struct sched_entity *se; 8466 struct sched_entity *se;
8147 struct rq *rq;
8148 int i; 8467 int i;
8149 8468
8150 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8469 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8157,8 +8476,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8157 tg->shares = NICE_0_LOAD; 8476 tg->shares = NICE_0_LOAD;
8158 8477
8159 for_each_possible_cpu(i) { 8478 for_each_possible_cpu(i) {
8160 rq = cpu_rq(i);
8161
8162 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8479 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8163 GFP_KERNEL, cpu_to_node(i)); 8480 GFP_KERNEL, cpu_to_node(i));
8164 if (!cfs_rq) 8481 if (!cfs_rq)
@@ -8169,26 +8486,32 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8169 if (!se) 8486 if (!se)
8170 goto err_free_rq; 8487 goto err_free_rq;
8171 8488
8172 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8489 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8173 } 8490 }
8174 8491
8175 return 1; 8492 return 1;
8176 8493
8177 err_free_rq: 8494err_free_rq:
8178 kfree(cfs_rq); 8495 kfree(cfs_rq);
8179 err: 8496err:
8180 return 0; 8497 return 0;
8181} 8498}
8182 8499
8183static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8184{
8185 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8186 &cpu_rq(cpu)->leaf_cfs_rq_list);
8187}
8188
8189static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8500static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8190{ 8501{
8191 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8502 struct rq *rq = cpu_rq(cpu);
8503 unsigned long flags;
8504
8505 /*
8506 * Only empty task groups can be destroyed; so we can speculatively
8507 * check on_list without danger of it being re-added.
8508 */
8509 if (!tg->cfs_rq[cpu]->on_list)
8510 return;
8511
8512 raw_spin_lock_irqsave(&rq->lock, flags);
8513 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8192} 8515}
8193#else /* !CONFIG_FAIR_GROUP_SCHED */ 8516#else /* !CONFIG_FAIR_GROUP_SCHED */
8194static inline void free_fair_sched_group(struct task_group *tg) 8517static inline void free_fair_sched_group(struct task_group *tg)
@@ -8201,10 +8524,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8201 return 1; 8524 return 1;
8202} 8525}
8203 8526
8204static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8205{
8206}
8207
8208static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8527static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8209{ 8528{
8210} 8529}
@@ -8233,7 +8552,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8233{ 8552{
8234 struct rt_rq *rt_rq; 8553 struct rt_rq *rt_rq;
8235 struct sched_rt_entity *rt_se; 8554 struct sched_rt_entity *rt_se;
8236 struct rq *rq;
8237 int i; 8555 int i;
8238 8556
8239 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8557 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8247,8 +8565,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8247 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8565 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8248 8566
8249 for_each_possible_cpu(i) { 8567 for_each_possible_cpu(i) {
8250 rq = cpu_rq(i);
8251
8252 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8568 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8253 GFP_KERNEL, cpu_to_node(i)); 8569 GFP_KERNEL, cpu_to_node(i));
8254 if (!rt_rq) 8570 if (!rt_rq)
@@ -8259,27 +8575,16 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8259 if (!rt_se) 8575 if (!rt_se)
8260 goto err_free_rq; 8576 goto err_free_rq;
8261 8577
8262 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8578 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8263 } 8579 }
8264 8580
8265 return 1; 8581 return 1;
8266 8582
8267 err_free_rq: 8583err_free_rq:
8268 kfree(rt_rq); 8584 kfree(rt_rq);
8269 err: 8585err:
8270 return 0; 8586 return 0;
8271} 8587}
8272
8273static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8274{
8275 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8276 &cpu_rq(cpu)->leaf_rt_rq_list);
8277}
8278
8279static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8280{
8281 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8282}
8283#else /* !CONFIG_RT_GROUP_SCHED */ 8588#else /* !CONFIG_RT_GROUP_SCHED */
8284static inline void free_rt_sched_group(struct task_group *tg) 8589static inline void free_rt_sched_group(struct task_group *tg)
8285{ 8590{
@@ -8290,14 +8595,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8290{ 8595{
8291 return 1; 8596 return 1;
8292} 8597}
8293
8294static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8295{
8296}
8297
8298static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8299{
8300}
8301#endif /* CONFIG_RT_GROUP_SCHED */ 8598#endif /* CONFIG_RT_GROUP_SCHED */
8302 8599
8303#ifdef CONFIG_CGROUP_SCHED 8600#ifdef CONFIG_CGROUP_SCHED
@@ -8305,6 +8602,7 @@ static void free_sched_group(struct task_group *tg)
8305{ 8602{
8306 free_fair_sched_group(tg); 8603 free_fair_sched_group(tg);
8307 free_rt_sched_group(tg); 8604 free_rt_sched_group(tg);
8605 autogroup_free(tg);
8308 kfree(tg); 8606 kfree(tg);
8309} 8607}
8310 8608
@@ -8313,7 +8611,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8313{ 8611{
8314 struct task_group *tg; 8612 struct task_group *tg;
8315 unsigned long flags; 8613 unsigned long flags;
8316 int i;
8317 8614
8318 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8615 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8319 if (!tg) 8616 if (!tg)
@@ -8326,10 +8623,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8326 goto err; 8623 goto err;
8327 8624
8328 spin_lock_irqsave(&task_group_lock, flags); 8625 spin_lock_irqsave(&task_group_lock, flags);
8329 for_each_possible_cpu(i) {
8330 register_fair_sched_group(tg, i);
8331 register_rt_sched_group(tg, i);
8332 }
8333 list_add_rcu(&tg->list, &task_groups); 8626 list_add_rcu(&tg->list, &task_groups);
8334 8627
8335 WARN_ON(!parent); /* root should already exist */ 8628 WARN_ON(!parent); /* root should already exist */
@@ -8359,11 +8652,11 @@ void sched_destroy_group(struct task_group *tg)
8359 unsigned long flags; 8652 unsigned long flags;
8360 int i; 8653 int i;
8361 8654
8362 spin_lock_irqsave(&task_group_lock, flags); 8655 /* end participation in shares distribution */
8363 for_each_possible_cpu(i) { 8656 for_each_possible_cpu(i)
8364 unregister_fair_sched_group(tg, i); 8657 unregister_fair_sched_group(tg, i);
8365 unregister_rt_sched_group(tg, i); 8658
8366 } 8659 spin_lock_irqsave(&task_group_lock, flags);
8367 list_del_rcu(&tg->list); 8660 list_del_rcu(&tg->list);
8368 list_del_rcu(&tg->siblings); 8661 list_del_rcu(&tg->siblings);
8369 spin_unlock_irqrestore(&task_group_lock, flags); 8662 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8386,57 +8679,30 @@ void sched_move_task(struct task_struct *tsk)
8386 rq = task_rq_lock(tsk, &flags); 8679 rq = task_rq_lock(tsk, &flags);
8387 8680
8388 running = task_current(rq, tsk); 8681 running = task_current(rq, tsk);
8389 on_rq = tsk->se.on_rq; 8682 on_rq = tsk->on_rq;
8390 8683
8391 if (on_rq) 8684 if (on_rq)
8392 dequeue_task(rq, tsk, 0); 8685 dequeue_task(rq, tsk, 0);
8393 if (unlikely(running)) 8686 if (unlikely(running))
8394 tsk->sched_class->put_prev_task(rq, tsk); 8687 tsk->sched_class->put_prev_task(rq, tsk);
8395 8688
8396 set_task_rq(tsk, task_cpu(tsk));
8397
8398#ifdef CONFIG_FAIR_GROUP_SCHED 8689#ifdef CONFIG_FAIR_GROUP_SCHED
8399 if (tsk->sched_class->moved_group) 8690 if (tsk->sched_class->task_move_group)
8400 tsk->sched_class->moved_group(tsk, on_rq); 8691 tsk->sched_class->task_move_group(tsk, on_rq);
8692 else
8401#endif 8693#endif
8694 set_task_rq(tsk, task_cpu(tsk));
8402 8695
8403 if (unlikely(running)) 8696 if (unlikely(running))
8404 tsk->sched_class->set_curr_task(rq); 8697 tsk->sched_class->set_curr_task(rq);
8405 if (on_rq) 8698 if (on_rq)
8406 enqueue_task(rq, tsk, 0); 8699 enqueue_task(rq, tsk, 0);
8407 8700
8408 task_rq_unlock(rq, &flags); 8701 task_rq_unlock(rq, tsk, &flags);
8409} 8702}
8410#endif /* CONFIG_CGROUP_SCHED */ 8703#endif /* CONFIG_CGROUP_SCHED */
8411 8704
8412#ifdef CONFIG_FAIR_GROUP_SCHED 8705#ifdef CONFIG_FAIR_GROUP_SCHED
8413static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8414{
8415 struct cfs_rq *cfs_rq = se->cfs_rq;
8416 int on_rq;
8417
8418 on_rq = se->on_rq;
8419 if (on_rq)
8420 dequeue_entity(cfs_rq, se, 0);
8421
8422 se->load.weight = shares;
8423 se->load.inv_weight = 0;
8424
8425 if (on_rq)
8426 enqueue_entity(cfs_rq, se, 0);
8427}
8428
8429static void set_se_shares(struct sched_entity *se, unsigned long shares)
8430{
8431 struct cfs_rq *cfs_rq = se->cfs_rq;
8432 struct rq *rq = cfs_rq->rq;
8433 unsigned long flags;
8434
8435 raw_spin_lock_irqsave(&rq->lock, flags);
8436 __set_se_shares(se, shares);
8437 raw_spin_unlock_irqrestore(&rq->lock, flags);
8438}
8439
8440static DEFINE_MUTEX(shares_mutex); 8706static DEFINE_MUTEX(shares_mutex);
8441 8707
8442int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8708int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8450,46 +8716,25 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8450 if (!tg->se[0]) 8716 if (!tg->se[0])
8451 return -EINVAL; 8717 return -EINVAL;
8452 8718
8453 if (shares < MIN_SHARES) 8719 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8454 shares = MIN_SHARES;
8455 else if (shares > MAX_SHARES)
8456 shares = MAX_SHARES;
8457 8720
8458 mutex_lock(&shares_mutex); 8721 mutex_lock(&shares_mutex);
8459 if (tg->shares == shares) 8722 if (tg->shares == shares)
8460 goto done; 8723 goto done;
8461 8724
8462 spin_lock_irqsave(&task_group_lock, flags);
8463 for_each_possible_cpu(i)
8464 unregister_fair_sched_group(tg, i);
8465 list_del_rcu(&tg->siblings);
8466 spin_unlock_irqrestore(&task_group_lock, flags);
8467
8468 /* wait for any ongoing reference to this group to finish */
8469 synchronize_sched();
8470
8471 /*
8472 * Now we are free to modify the group's share on each cpu
8473 * w/o tripping rebalance_share or load_balance_fair.
8474 */
8475 tg->shares = shares; 8725 tg->shares = shares;
8476 for_each_possible_cpu(i) { 8726 for_each_possible_cpu(i) {
8477 /* 8727 struct rq *rq = cpu_rq(i);
8478 * force a rebalance 8728 struct sched_entity *se;
8479 */ 8729
8480 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8730 se = tg->se[i];
8481 set_se_shares(tg->se[i], shares); 8731 /* Propagate contribution to hierarchy */
8732 raw_spin_lock_irqsave(&rq->lock, flags);
8733 for_each_sched_entity(se)
8734 update_cfs_shares(group_cfs_rq(se));
8735 raw_spin_unlock_irqrestore(&rq->lock, flags);
8482 } 8736 }
8483 8737
8484 /*
8485 * Enable load balance activity on this group, by inserting it back on
8486 * each cpu's rq->leaf_cfs_rq_list.
8487 */
8488 spin_lock_irqsave(&task_group_lock, flags);
8489 for_each_possible_cpu(i)
8490 register_fair_sched_group(tg, i);
8491 list_add_rcu(&tg->siblings, &tg->parent->children);
8492 spin_unlock_irqrestore(&task_group_lock, flags);
8493done: 8738done:
8494 mutex_unlock(&shares_mutex); 8739 mutex_unlock(&shares_mutex);
8495 return 0; 8740 return 0;
@@ -8624,7 +8869,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8624 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8869 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8625 } 8870 }
8626 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8871 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8627 unlock: 8872unlock:
8628 read_unlock(&tasklist_lock); 8873 read_unlock(&tasklist_lock);
8629 mutex_unlock(&rt_constraints_mutex); 8874 mutex_unlock(&rt_constraints_mutex);
8630 8875
@@ -8788,7 +9033,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8788 9033
8789 if (!cgrp->parent) { 9034 if (!cgrp->parent) {
8790 /* This is early initialization for the top cgroup */ 9035 /* This is early initialization for the top cgroup */
8791 return &init_task_group.css; 9036 return &root_task_group.css;
8792 } 9037 }
8793 9038
8794 parent = cgroup_tg(cgrp->parent); 9039 parent = cgroup_tg(cgrp->parent);
@@ -8821,56 +9066,39 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8821 return 0; 9066 return 0;
8822} 9067}
8823 9068
8824static int 9069static void
8825cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9070cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8826 struct task_struct *tsk, bool threadgroup)
8827{ 9071{
8828 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 9072 sched_move_task(tsk);
8829 if (retval)
8830 return retval;
8831 if (threadgroup) {
8832 struct task_struct *c;
8833 rcu_read_lock();
8834 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8835 retval = cpu_cgroup_can_attach_task(cgrp, c);
8836 if (retval) {
8837 rcu_read_unlock();
8838 return retval;
8839 }
8840 }
8841 rcu_read_unlock();
8842 }
8843 return 0;
8844} 9073}
8845 9074
8846static void 9075static void
8847cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9076cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8848 struct cgroup *old_cont, struct task_struct *tsk, 9077 struct cgroup *old_cgrp, struct task_struct *task)
8849 bool threadgroup)
8850{ 9078{
8851 sched_move_task(tsk); 9079 /*
8852 if (threadgroup) { 9080 * cgroup_exit() is called in the copy_process() failure path.
8853 struct task_struct *c; 9081 * Ignore this case since the task hasn't run yet; this avoids
8854 rcu_read_lock(); 9082 * trying to poke a half freed task state from generic code.
8855 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 9083 */
8856 sched_move_task(c); 9084 if (!(task->flags & PF_EXITING))
8857 } 9085 return;
8858 rcu_read_unlock(); 9086
8859 } 9087 sched_move_task(task);
8860} 9088}
8861 9089
8862#ifdef CONFIG_FAIR_GROUP_SCHED 9090#ifdef CONFIG_FAIR_GROUP_SCHED
8863static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 9091static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8864 u64 shareval) 9092 u64 shareval)
8865{ 9093{
8866 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9094 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8867} 9095}
8868 9096
8869static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 9097static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8870{ 9098{
8871 struct task_group *tg = cgroup_tg(cgrp); 9099 struct task_group *tg = cgroup_tg(cgrp);
8872 9100
8873 return (u64) tg->shares; 9101 return (u64) scale_load_down(tg->shares);
8874} 9102}
8875#endif /* CONFIG_FAIR_GROUP_SCHED */ 9103#endif /* CONFIG_FAIR_GROUP_SCHED */
8876 9104
@@ -8929,8 +9157,9 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8929 .name = "cpu", 9157 .name = "cpu",
8930 .create = cpu_cgroup_create, 9158 .create = cpu_cgroup_create,
8931 .destroy = cpu_cgroup_destroy, 9159 .destroy = cpu_cgroup_destroy,
8932 .can_attach = cpu_cgroup_can_attach, 9160 .can_attach_task = cpu_cgroup_can_attach_task,
8933 .attach = cpu_cgroup_attach, 9161 .attach_task = cpu_cgroup_attach_task,
9162 .exit = cpu_cgroup_exit,
8934 .populate = cpu_cgroup_populate, 9163 .populate = cpu_cgroup_populate,
8935 .subsys_id = cpu_cgroup_subsys_id, 9164 .subsys_id = cpu_cgroup_subsys_id,
8936 .early_init = 1, 9165 .early_init = 1,
@@ -9215,72 +9444,3 @@ struct cgroup_subsys cpuacct_subsys = {
9215}; 9444};
9216#endif /* CONFIG_CGROUP_CPUACCT */ 9445#endif /* CONFIG_CGROUP_CPUACCT */
9217 9446
9218#ifndef CONFIG_SMP
9219
9220void synchronize_sched_expedited(void)
9221{
9222 barrier();
9223}
9224EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9225
9226#else /* #ifndef CONFIG_SMP */
9227
9228static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9229
9230static int synchronize_sched_expedited_cpu_stop(void *data)
9231{
9232 /*
9233 * There must be a full memory barrier on each affected CPU
9234 * between the time that try_stop_cpus() is called and the
9235 * time that it returns.
9236 *
9237 * In the current initial implementation of cpu_stop, the
9238 * above condition is already met when the control reaches
9239 * this point and the following smp_mb() is not strictly
9240 * necessary. Do smp_mb() anyway for documentation and
9241 * robustness against future implementation changes.
9242 */
9243 smp_mb(); /* See above comment block. */
9244 return 0;
9245}
9246
9247/*
9248 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9249 * approach to force grace period to end quickly. This consumes
9250 * significant time on all CPUs, and is thus not recommended for
9251 * any sort of common-case code.
9252 *
9253 * Note that it is illegal to call this function while holding any
9254 * lock that is acquired by a CPU-hotplug notifier. Failing to
9255 * observe this restriction will result in deadlock.
9256 */
9257void synchronize_sched_expedited(void)
9258{
9259 int snap, trycount = 0;
9260
9261 smp_mb(); /* ensure prior mod happens before capturing snap. */
9262 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9263 get_online_cpus();
9264 while (try_stop_cpus(cpu_online_mask,
9265 synchronize_sched_expedited_cpu_stop,
9266 NULL) == -EAGAIN) {
9267 put_online_cpus();
9268 if (trycount++ < 10)
9269 udelay(trycount * num_online_cpus());
9270 else {
9271 synchronize_sched();
9272 return;
9273 }
9274 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9275 smp_mb(); /* ensure test happens before caller kfree */
9276 return;
9277 }
9278 get_online_cpus();
9279 }
9280 atomic_inc(&synchronize_sched_expedited_count);
9281 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9282 put_online_cpus();
9283}
9284EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9285
9286#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..429242f3c484
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,275 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void __init autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref);
16 init_rwsem(&autogroup_default.lock);
17 init_task->signal->autogroup = &autogroup_default;
18}
19
20static inline void autogroup_free(struct task_group *tg)
21{
22 kfree(tg->autogroup);
23}
24
25static inline void autogroup_destroy(struct kref *kref)
26{
27 struct autogroup *ag = container_of(kref, struct autogroup, kref);
28
29#ifdef CONFIG_RT_GROUP_SCHED
30 /* We've redirected RT tasks to the root task group... */
31 ag->tg->rt_se = NULL;
32 ag->tg->rt_rq = NULL;
33#endif
34 sched_destroy_group(ag->tg);
35}
36
37static inline void autogroup_kref_put(struct autogroup *ag)
38{
39 kref_put(&ag->kref, autogroup_destroy);
40}
41
42static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
43{
44 kref_get(&ag->kref);
45 return ag;
46}
47
48static inline struct autogroup *autogroup_task_get(struct task_struct *p)
49{
50 struct autogroup *ag;
51 unsigned long flags;
52
53 if (!lock_task_sighand(p, &flags))
54 return autogroup_kref_get(&autogroup_default);
55
56 ag = autogroup_kref_get(p->signal->autogroup);
57 unlock_task_sighand(p, &flags);
58
59 return ag;
60}
61
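
sched_autogroup.c above ties autogroup lifetime to a kref: every user takes a reference, and the destructor runs when the last one is put. A simplified, non-atomic standalone sketch of that get/put-with-release pattern; struct ref, struct box and the helpers are illustrative, not the kernel kref API.

#include <stdio.h>
#include <stdlib.h>

struct ref {
        int count;                      /* the real kref uses an atomic counter */
        void (*release)(struct ref *r);
};

static void ref_init(struct ref *r, void (*release)(struct ref *))
{
        r->count = 1;
        r->release = release;
}

static void ref_get(struct ref *r)
{
        r->count++;
}

static void ref_put(struct ref *r)
{
        if (--r->count == 0)
                r->release(r);          /* last reference: destroy the object */
}

struct box {
        struct ref ref;                 /* first member, so the cast below is valid */
        int payload;
};

static void box_release(struct ref *r)
{
        struct box *b = (struct box *)r;

        printf("releasing box with payload %d\n", b->payload);
        free(b);
}

int main(void)
{
        struct box *b = malloc(sizeof(*b));

        ref_init(&b->ref, box_release);
        b->payload = 42;

        ref_get(&b->ref);               /* second user */
        ref_put(&b->ref);               /* first user drops */
        ref_put(&b->ref);               /* last reference: box_release() runs */
        return 0;
}
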
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void)
67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
69 struct task_group *tg;
70
71 if (!ag)
72 goto out_fail;
73
74 tg = sched_create_group(&root_task_group);
75
76 if (IS_ERR(tg))
77 goto out_free;
78
79 kref_init(&ag->kref);
80 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr);
82 ag->tg = tg;
83#ifdef CONFIG_RT_GROUP_SCHED
84 /*
85 * Autogroup RT tasks are redirected to the root task group
86 * so we don't have to move tasks around upon policy change,
87 * or flail around trying to allocate bandwidth on the fly.
88 * A bandwidth exception in __sched_setscheduler() allows
89 * the policy change to proceed. Thereafter, task_group()
90 * returns &root_task_group, so zero bandwidth is required.
91 */
92 free_rt_sched_group(tg);
93 tg->rt_se = root_task_group.rt_se;
94 tg->rt_rq = root_task_group.rt_rq;
95#endif
96 tg->autogroup = ag;
97
98 return ag;
99
100out_free:
101 kfree(ag);
102out_fail:
103 if (printk_ratelimit()) {
104 printk(KERN_WARNING "autogroup_create: %s failure.\n",
105 ag ? "sched_create_group()" : "kmalloc()");
106 }
107
108 return autogroup_kref_get(&autogroup_default);
109}
110
111static inline bool
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{
114 if (tg != &root_task_group)
115 return false;
116
117 if (p->sched_class != &fair_sched_class)
118 return false;
119
120 /*
121 * We can only assume the task group can't go away on us if
122 * autogroup_move_group() can see us on ->thread_group list.
123 */
124 if (p->flags & PF_EXITING)
125 return false;
126
127 return true;
128}
129
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{
149 struct autogroup *prev;
150 struct task_struct *t;
151 unsigned long flags;
152
153 BUG_ON(!lock_task_sighand(p, &flags));
154
155 prev = p->signal->autogroup;
156 if (prev == ag) {
157 unlock_task_sighand(p, &flags);
158 return;
159 }
160
161 p->signal->autogroup = autogroup_kref_get(ag);
162
163 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
164 goto out;
165
166 t = p;
167 do {
168 sched_move_task(t);
169 } while_each_thread(p, t);
170
171out:
172 unlock_task_sighand(p, &flags);
173 autogroup_kref_put(prev);
174}
175
176/* Allocates GFP_KERNEL, cannot be called under any spinlock */
177void sched_autogroup_create_attach(struct task_struct *p)
178{
179 struct autogroup *ag = autogroup_create();
180
181 autogroup_move_group(p, ag);
182 /* drop extra reference added by autogroup_create() */
183 autogroup_kref_put(ag);
184}
185EXPORT_SYMBOL(sched_autogroup_create_attach);
186
187/* Cannot be called under siglock. Currently has no users */
188void sched_autogroup_detach(struct task_struct *p)
189{
190 autogroup_move_group(p, &autogroup_default);
191}
192EXPORT_SYMBOL(sched_autogroup_detach);
193
194void sched_autogroup_fork(struct signal_struct *sig)
195{
196 sig->autogroup = autogroup_task_get(current);
197}
198
199void sched_autogroup_exit(struct signal_struct *sig)
200{
201 autogroup_kref_put(sig->autogroup);
202}
203
204static int __init setup_autogroup(char *str)
205{
206 sysctl_sched_autogroup_enabled = 0;
207
208 return 1;
209}
210
211__setup("noautogroup", setup_autogroup);
212
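setup_autogroup() only covers the boot-time "noautogroup" parameter; the same flag, sysctl_sched_autogroup_enabled, is also exported as a sysctl, so autogrouping can be toggled at run time with roughly the same effect. A small userspace sketch, assuming the usual /proc/sys/kernel/sched_autogroup_enabled path:

#include <stdio.h>

/* Toggle autogroup at run time; writing 0 is roughly equivalent to
 * booting with "noautogroup", writing 1 re-enables it. */
int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/sched_autogroup_enabled", "w");

	if (!f) {
		perror("sched_autogroup_enabled");
		return 1;
	}
	fputs("0\n", f);
	fclose(f);
	return 0;
}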
213#ifdef CONFIG_PROC_FS
214
215int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
216{
217 static unsigned long next = INITIAL_JIFFIES;
218 struct autogroup *ag;
219 int err;
220
221 if (*nice < -20 || *nice > 19)
222 return -EINVAL;
223
224 err = security_task_setnice(current, *nice);
225 if (err)
226 return err;
227
228 if (*nice < 0 && !can_nice(current, *nice))
229 return -EPERM;
230
231 /* this is a heavy operation taking global locks.. */
232 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
233 return -EAGAIN;
234
235 next = HZ / 10 + jiffies;
236 ag = autogroup_task_get(p);
237
238 down_write(&ag->lock);
239 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
240 if (!err)
241 ag->nice = *nice;
242 up_write(&ag->lock);
243
244 autogroup_kref_put(ag);
245
246 return err;
247}
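proc_sched_autogroup_set_nice() reuses the per-task prio_to_weight[] table, so an autogroup's nice value maps to group shares exactly like a task's nice maps to its load weight (nice 0 = 1024, roughly a 1.25x step per nice level). A standalone sketch with two illustrative weights taken from that standard table:

#include <stdio.h>

/* Two sessions competing: one autogroup left at nice 0 (weight 1024),
 * one reniced to +5 (weight 335 in the standard prio_to_weight table).
 * The reniced session ends up with roughly a quarter of the CPU. */
int main(void)
{
	double w_default = 1024.0;	/* autogroup at nice 0 */
	double w_niced = 335.0;		/* autogroup at nice +5 */

	printf("reniced group share: %.1f%%\n",
	       100.0 * w_niced / (w_niced + w_default));
	return 0;
}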
248
249void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
250{
251 struct autogroup *ag = autogroup_task_get(p);
252
253 if (!task_group_is_autogroup(ag->tg))
254 goto out;
255
256 down_read(&ag->lock);
257 seq_printf(m, "/autogroup-%lu nice %d\n", ag->id, ag->nice);
258 up_read(&ag->lock);
259
260out:
261 autogroup_kref_put(ag);
262}
263#endif /* CONFIG_PROC_FS */
264
265#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{
268 if (!task_group_is_autogroup(tg))
269 return 0;
270
271 return snprintf(buf, buflen, "%s-%lu", "/autogroup", tg->autogroup->id);
272}
273#endif /* CONFIG_SCHED_DEBUG */
274
275#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..05577055cfca
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,41 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 /*
5 * The reference count does not track how many threads are
6 * attached to this autogroup right now; it counts how many
7 * tasks could still use this autogroup.
8 */
9 struct kref kref;
10 struct task_group *tg;
11 struct rw_semaphore lock;
12 unsigned long id;
13 int nice;
14};
15
16static inline struct task_group *
17autogroup_task_group(struct task_struct *p, struct task_group *tg);
18
19#else /* !CONFIG_SCHED_AUTOGROUP */
20
21static inline void autogroup_init(struct task_struct *init_task) { }
22static inline void autogroup_free(struct task_group *tg) { }
23static inline bool task_group_is_autogroup(struct task_group *tg)
24{
25 return false;
26}
27
28static inline struct task_group *
29autogroup_task_group(struct task_struct *p, struct task_group *tg)
30{
31 return tg;
32}
33
34#ifdef CONFIG_SCHED_DEBUG
35static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
36{
37 return 0;
38}
39#endif
40
41#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..a6710a112b4f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19static DEFINE_SPINLOCK(sched_debug_lock);
20
19/* 21/*
20 * This allows printing both to /proc/sched_debug and 22 * This allows printing both to /proc/sched_debug and
21 * to the console 23 * to the console
@@ -54,8 +56,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 56#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 57
56#ifdef CONFIG_FAIR_GROUP_SCHED 58#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 59static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 60{
60 struct sched_entity *se = tg->se[cpu]; 61 struct sched_entity *se = tg->se[cpu];
61 if (!se) 62 if (!se)
@@ -87,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
87} 88}
88#endif 89#endif
89 90
91#ifdef CONFIG_CGROUP_SCHED
92static char group_path[PATH_MAX];
93
94static char *task_group_path(struct task_group *tg)
95{
96 if (autogroup_path(tg, group_path, PATH_MAX))
97 return group_path;
98
99 /*
100 * May be NULL if the underlying cgroup isn't fully-created yet
101 */
102 if (!tg->css.cgroup) {
103 group_path[0] = '\0';
104 return group_path;
105 }
106 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
107 return group_path;
108}
109#endif
110
90static void 111static void
91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 112print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
92{ 113{
@@ -109,17 +130,10 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 130 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 131 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 132#endif
112
113#ifdef CONFIG_CGROUP_SCHED 133#ifdef CONFIG_CGROUP_SCHED
114 { 134 SEQ_printf(m, " %s", task_group_path(task_group(p)));
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif 135#endif
136
123 SEQ_printf(m, "\n"); 137 SEQ_printf(m, "\n");
124} 138}
125 139
@@ -138,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
138 read_lock_irqsave(&tasklist_lock, flags); 152 read_lock_irqsave(&tasklist_lock, flags);
139 153
140 do_each_thread(g, p) { 154 do_each_thread(g, p) {
141 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 if (!p->on_rq || task_cpu(p) != rq_cpu)
142 continue; 156 continue;
143 157
144 print_task(m, rq, p); 158 print_task(m, rq, p);
@@ -147,19 +161,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 161 read_unlock_irqrestore(&tasklist_lock, flags);
148} 162}
149 163
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 164void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 165{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 166 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,13 +169,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 169 struct sched_entity *last;
169 unsigned long flags; 170 unsigned long flags;
170 171
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 172#ifdef CONFIG_FAIR_GROUP_SCHED
172 char path[128]; 173 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else 174#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 175 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif 176#endif
@@ -183,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
183 179
184 raw_spin_lock_irqsave(&rq->lock, flags); 180 raw_spin_lock_irqsave(&rq->lock, flags);
185 if (cfs_rq->rb_leftmost) 181 if (cfs_rq->rb_leftmost)
186 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 182 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
187 last = __pick_last_entity(cfs_rq); 183 last = __pick_last_entity(cfs_rq);
188 if (last) 184 if (last)
189 max_vruntime = last->vruntime; 185 max_vruntime = last->vruntime;
@@ -202,33 +198,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 198 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 199 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 200 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 201 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 202 cfs_rq->nr_spread_over);
203 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
204 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 205#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 206#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 207 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
208 SPLIT_NS(cfs_rq->load_avg));
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
210 SPLIT_NS(cfs_rq->load_period));
211 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
212 cfs_rq->load_contribution);
213 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
214 atomic_read(&cfs_rq->tg->load_weight));
213#endif 215#endif
216
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 217 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 218#endif
216} 219}
217 220
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 221void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 222{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 223#ifdef CONFIG_RT_GROUP_SCHED
221 char path[128]; 224 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else 225#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 226 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif 227#endif
230 228
231
232#define P(x) \ 229#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 230 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
234#define PN(x) \ 231#define PN(x) \
@@ -243,9 +240,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 240#undef P
244} 241}
245 242
243extern __read_mostly int sched_clock_running;
244
246static void print_cpu(struct seq_file *m, int cpu) 245static void print_cpu(struct seq_file *m, int cpu)
247{ 246{
248 struct rq *rq = cpu_rq(cpu); 247 struct rq *rq = cpu_rq(cpu);
248 unsigned long flags;
249 249
250#ifdef CONFIG_X86 250#ifdef CONFIG_X86
251 { 251 {
@@ -296,14 +296,17 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 P(bkl_count);
300
301#undef P 299#undef P
300#undef P64
302#endif 301#endif
302 spin_lock_irqsave(&sched_debug_lock, flags);
303 print_cfs_stats(m, cpu); 303 print_cfs_stats(m, cpu);
304 print_rt_stats(m, cpu); 304 print_rt_stats(m, cpu);
305 305
306 rcu_read_lock();
306 print_rq(m, rq, cpu); 307 print_rq(m, rq, cpu);
308 rcu_read_unlock();
309 spin_unlock_irqrestore(&sched_debug_lock, flags);
307} 310}
308 311
309static const char *sched_tunable_scaling_names[] = { 312static const char *sched_tunable_scaling_names[] = {
@@ -314,21 +317,42 @@ static const char *sched_tunable_scaling_names[] = {
314 317
315static int sched_debug_show(struct seq_file *m, void *v) 318static int sched_debug_show(struct seq_file *m, void *v)
316{ 319{
317 u64 now = ktime_to_ns(ktime_get()); 320 u64 ktime, sched_clk, cpu_clk;
321 unsigned long flags;
318 int cpu; 322 int cpu;
319 323
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 324 local_irq_save(flags);
325 ktime = ktime_to_ns(ktime_get());
326 sched_clk = sched_clock();
327 cpu_clk = local_clock();
328 local_irq_restore(flags);
329
330 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 331 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 332 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 333 init_utsname()->version);
324 334
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 335#define P(x) \
336 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
337#define PN(x) \
338 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
339 PN(ktime);
340 PN(sched_clk);
341 PN(cpu_clk);
342 P(jiffies);
343#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
344 P(sched_clock_stable);
345#endif
346#undef PN
347#undef P
348
349 SEQ_printf(m, "\n");
350 SEQ_printf(m, "sysctl_sched\n");
326 351
327#define P(x) \ 352#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 353 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 354#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 355 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 356 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 357 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 358 PN(sysctl_sched_wakeup_granularity);
@@ -414,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
414 P(se.statistics.wait_count); 438 P(se.statistics.wait_count);
415 PN(se.statistics.iowait_sum); 439 PN(se.statistics.iowait_sum);
416 P(se.statistics.iowait_count); 440 P(se.statistics.iowait_count);
417 P(sched_info.bkl_count);
418 P(se.nr_migrations); 441 P(se.nr_migrations);
419 P(se.statistics.nr_migrations_cold); 442 P(se.statistics.nr_migrations_cold);
420 P(se.statistics.nr_failed_migrations_affine); 443 P(se.statistics.nr_failed_migrations_affine);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e0e8d5ca3c98..334eb474af93 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,10 +22,11 @@
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h>
25 26
26/* 27/*
27 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
28 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) 29 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
29 * 30 *
30 * NOTE: this latency value is not the same as the concept of 31 * NOTE: this latency value is not the same as the concept of
31 * 'timeslice length' - timeslices in CFS are of variable length 32 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +53,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 53
53/* 54/*
54 * Minimal preemption granularity for CPU-bound tasks: 55 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 56 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 57 */
57unsigned int sysctl_sched_min_granularity = 750000ULL; 58unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; 59unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 70unsigned int sysctl_sched_child_runs_first __read_mostly;
70 71
71/* 72/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 73 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 74 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 75 *
@@ -89,6 +82,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 82
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 83const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 84
85/*
86 * The exponential sliding window over which load is averaged for shares
87 * distribution.
88 * (default: 10msec)
89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91
92static const struct sched_class fair_sched_class; 92static const struct sched_class fair_sched_class;
93 93
94/************************************************************** 94/**************************************************************
@@ -143,6 +143,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 143 return cfs_rq->tg->cfs_rq[this_cpu];
144} 144}
145 145
146static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
147{
148 if (!cfs_rq->on_list) {
149 /*
150 * Ensure we either appear before our parent (if already
151 * enqueued) or force our parent to appear after us when it is
152 * enqueued. The fact that we always enqueue bottom-up
153 * reduces this to two cases.
154 */
155 if (cfs_rq->tg->parent &&
156 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
157 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
158 &rq_of(cfs_rq)->leaf_cfs_rq_list);
159 } else {
160 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
161 &rq_of(cfs_rq)->leaf_cfs_rq_list);
162 }
163
164 cfs_rq->on_list = 1;
165 }
166}
167
168static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
169{
170 if (cfs_rq->on_list) {
171 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
172 cfs_rq->on_list = 0;
173 }
174}
175
146/* Iterate through all leaf cfs_rq's on a runqueue */ 176/* Iterate through all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 177#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 178 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
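The new on_list handling above guarantees that a child cfs_rq is always linked ahead of its parent on rq->leaf_cfs_rq_list, so a single for_each_leaf_cfs_rq() pass visits groups bottom-up. A sketch of such a walk; this is essentially what update_shares(), added to the load-balancing code further down in this diff, does, minus the RCU read lock:

/* Walk every queued group cfs_rq on this runqueue, children before
 * parents, folding each one's load into its task group. Sketch only;
 * update_shares() below is the real, RCU-protected version. */
static void walk_leaf_cfs_rqs(struct rq *rq)
{
	struct cfs_rq *cfs_rq;

	for_each_leaf_cfs_rq(rq, cfs_rq)
		update_shares_cpu(cfs_rq->tg, cpu_of(rq));
}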
@@ -246,6 +276,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 276 return &cpu_rq(this_cpu)->cfs;
247} 277}
248 278
279static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
280{
281}
282
283static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
284{
285}
286
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 287#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 288 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 289
@@ -320,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
320 } 358 }
321 359
322 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
361#ifndef CONFIG_64BIT
362 smp_wmb();
363 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
364#endif
323} 365}
324 366
325/* 367/*
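The min_vruntime_copy update above is one half of a torn-read guard for 32-bit builds: the writer publishes the 64-bit value, issues smp_wmb(), then mirrors it into the copy; the reader (task_waking_fair(), further down in this diff) retries until value and copy agree. A minimal sketch of the idiom, assuming kernel context for u64 and the barrier primitives; it is the shape of the pattern, not a drop-in:

struct published_u64 {
	u64 val;
#ifndef CONFIG_64BIT
	u64 copy;
#endif
};

/* Writer side: mirrors update_min_vruntime() above. */
static void publish_u64(struct published_u64 *p, u64 v)
{
	p->val = v;
#ifndef CONFIG_64BIT
	smp_wmb();		/* order val before copy */
	p->copy = p->val;
#endif
}

/* Reader side: mirrors the retry loop in task_waking_fair(). */
static u64 read_u64(struct published_u64 *p)
{
#ifndef CONFIG_64BIT
	u64 v, copy;

	do {
		copy = p->copy;
		smp_rmb();	/* pairs with the smp_wmb() above */
		v = p->val;
	} while (v != copy);

	return v;
#else
	return p->val;
#endif
}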
@@ -374,7 +416,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
374 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 416 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
375} 417}
376 418
377static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 419static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
378{ 420{
379 struct rb_node *left = cfs_rq->rb_leftmost; 421 struct rb_node *left = cfs_rq->rb_leftmost;
380 422
@@ -384,6 +426,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
384 return rb_entry(left, struct sched_entity, run_node); 426 return rb_entry(left, struct sched_entity, run_node);
385} 427}
386 428
429static struct sched_entity *__pick_next_entity(struct sched_entity *se)
430{
431 struct rb_node *next = rb_next(&se->run_node);
432
433 if (!next)
434 return NULL;
435
436 return rb_entry(next, struct sched_entity, run_node);
437}
438
439#ifdef CONFIG_SCHED_DEBUG
387static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 440static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
388{ 441{
389 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 442 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -398,7 +451,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
398 * Scheduling class statistics methods: 451 * Scheduling class statistics methods:
399 */ 452 */
400 453
401#ifdef CONFIG_SCHED_DEBUG
402int sched_proc_update_handler(struct ctl_table *table, int write, 454int sched_proc_update_handler(struct ctl_table *table, int write,
403 void __user *buffer, size_t *lenp, 455 void __user *buffer, size_t *lenp,
404 loff_t *ppos) 456 loff_t *ppos)
@@ -417,7 +469,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 469 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 470 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 471 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 472#undef WRT_SYSCTL
422 473
423 return 0; 474 return 0;
@@ -495,6 +546,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 546 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 547}
497 548
549static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
550static void update_cfs_shares(struct cfs_rq *cfs_rq);
551
498/* 552/*
499 * Update the current task's runtime statistics. Skip current tasks that 553 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 554 * are not in our scheduling class.
@@ -514,12 +568,16 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 568
515 curr->vruntime += delta_exec_weighted; 569 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 570 update_min_vruntime(cfs_rq);
571
572#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
573 cfs_rq->load_unacc_exec_time += delta_exec;
574#endif
517} 575}
518 576
519static void update_curr(struct cfs_rq *cfs_rq) 577static void update_curr(struct cfs_rq *cfs_rq)
520{ 578{
521 struct sched_entity *curr = cfs_rq->curr; 579 struct sched_entity *curr = cfs_rq->curr;
522 u64 now = rq_of(cfs_rq)->clock; 580 u64 now = rq_of(cfs_rq)->clock_task;
523 unsigned long delta_exec; 581 unsigned long delta_exec;
524 582
525 if (unlikely(!curr)) 583 if (unlikely(!curr))
@@ -602,7 +660,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602 /* 660 /*
603 * We are starting a new run period: 661 * We are starting a new run period:
604 */ 662 */
605 se->exec_start = rq_of(cfs_rq)->clock; 663 se->exec_start = rq_of(cfs_rq)->clock_task;
606} 664}
607 665
608/************************************************** 666/**************************************************
@@ -633,7 +691,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 691 list_add(&se->group_node, &cfs_rq->tasks);
634 } 692 }
635 cfs_rq->nr_running++; 693 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 694}
638 695
639static void 696static void
@@ -647,9 +704,164 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 704 list_del_init(&se->group_node);
648 } 705 }
649 cfs_rq->nr_running--; 706 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 707}
652 708
709#ifdef CONFIG_FAIR_GROUP_SCHED
710# ifdef CONFIG_SMP
711static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
712 int global_update)
713{
714 struct task_group *tg = cfs_rq->tg;
715 long load_avg;
716
717 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
718 load_avg -= cfs_rq->load_contribution;
719
720 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
721 atomic_add(load_avg, &tg->load_weight);
722 cfs_rq->load_contribution += load_avg;
723 }
724}
725
726static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
727{
728 u64 period = sysctl_sched_shares_window;
729 u64 now, delta;
730 unsigned long load = cfs_rq->load.weight;
731
732 if (cfs_rq->tg == &root_task_group)
733 return;
734
735 now = rq_of(cfs_rq)->clock_task;
736 delta = now - cfs_rq->load_stamp;
737
738 /* truncate load history at 4 idle periods */
739 if (cfs_rq->load_stamp > cfs_rq->load_last &&
740 now - cfs_rq->load_last > 4 * period) {
741 cfs_rq->load_period = 0;
742 cfs_rq->load_avg = 0;
743 delta = period - 1;
744 }
745
746 cfs_rq->load_stamp = now;
747 cfs_rq->load_unacc_exec_time = 0;
748 cfs_rq->load_period += delta;
749 if (load) {
750 cfs_rq->load_last = now;
751 cfs_rq->load_avg += delta * load;
752 }
753
754 /* consider updating load contribution on each fold or truncate */
755 if (global_update || cfs_rq->load_period > period
756 || !cfs_rq->load_period)
757 update_cfs_rq_load_contribution(cfs_rq, global_update);
758
759 while (cfs_rq->load_period > period) {
760 /*
761 * Inline assembly required to prevent the compiler
762 * optimising this loop into a divmod call.
763 * See __iter_div_u64_rem() for another example of this.
764 */
765 asm("" : "+rm" (cfs_rq->load_period));
766 cfs_rq->load_period /= 2;
767 cfs_rq->load_avg /= 2;
768 }
769
770 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
771 list_del_leaf_cfs_rq(cfs_rq);
772}
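The tail of update_cfs_load() folds old history out of the sliding window by repeatedly halving both the accumulated period and the load sum, which keeps their ratio (the average) intact up to rounding while avoiding a 64-bit division or modulo; the empty asm() is only there so the compiler cannot collapse the loop back into a divmod. A stripped-down sketch of the same decay step (hypothetical helper, kernel-style u64):

/* Halve period and sum together until the period fits into one
 * window again; load_avg / load_period stays the same throughout. */
static void decay_load_window(u64 *load_period, u64 *load_avg, u64 period)
{
	while (*load_period > period) {
		/* the real code adds asm("" : "+rm"(*load_period)) here */
		*load_period >>= 1;
		*load_avg >>= 1;
	}
}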
773
774static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
775{
776 long load_weight, load, shares;
777
778 load = cfs_rq->load.weight;
779
780 load_weight = atomic_read(&tg->load_weight);
781 load_weight += load;
782 load_weight -= cfs_rq->load_contribution;
783
784 shares = (tg->shares * load);
785 if (load_weight)
786 shares /= load_weight;
787
788 if (shares < MIN_SHARES)
789 shares = MIN_SHARES;
790 if (shares > tg->shares)
791 shares = tg->shares;
792
793 return shares;
794}
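calc_cfs_shares() sizes this CPU's slice of the group weight as tg->shares scaled by the fraction of the group's total load that sits on this runqueue, then clamps the result to [MIN_SHARES, tg->shares]. A worked instance with made-up numbers:

/* Arithmetic-only sketch: a group with tg->shares = 1024 whose local
 * cfs_rq carries half of the group's total load gets half the shares;
 * the clamp to [MIN_SHARES, tg->shares] does not trigger here. */
static long calc_cfs_shares_example(void)
{
	long tg_shares = 1024;	/* group's configured weight */
	long load = 2048;	/* this cpu's cfs_rq load.weight */
	long others = 2048;	/* load contributed by the other cpus */
	long load_weight = others + load;

	return tg_shares * load / load_weight;	/* = 512 */
}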
795
796static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
797{
798 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
799 update_cfs_load(cfs_rq, 0);
800 update_cfs_shares(cfs_rq);
801 }
802}
803# else /* CONFIG_SMP */
804static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
805{
806}
807
808static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
809{
810 return tg->shares;
811}
812
813static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815}
816# endif /* CONFIG_SMP */
817static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
818 unsigned long weight)
819{
820 if (se->on_rq) {
821 /* commit outstanding execution time */
822 if (cfs_rq->curr == se)
823 update_curr(cfs_rq);
824 account_entity_dequeue(cfs_rq, se);
825 }
826
827 update_load_set(&se->load, weight);
828
829 if (se->on_rq)
830 account_entity_enqueue(cfs_rq, se);
831}
832
833static void update_cfs_shares(struct cfs_rq *cfs_rq)
834{
835 struct task_group *tg;
836 struct sched_entity *se;
837 long shares;
838
839 tg = cfs_rq->tg;
840 se = tg->se[cpu_of(rq_of(cfs_rq))];
841 if (!se)
842 return;
843#ifndef CONFIG_SMP
844 if (likely(se->load.weight == tg->shares))
845 return;
846#endif
847 shares = calc_cfs_shares(cfs_rq, tg);
848
849 reweight_entity(cfs_rq_of(se), se, shares);
850}
851#else /* CONFIG_FAIR_GROUP_SCHED */
852static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
853{
854}
855
856static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
857{
858}
859
860static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
861{
862}
863#endif /* CONFIG_FAIR_GROUP_SCHED */
864
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 865static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 866{
655#ifdef CONFIG_SCHEDSTATS 867#ifdef CONFIG_SCHEDSTATS
@@ -771,7 +983,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 983 * Update run-time statistics of the 'current'.
772 */ 984 */
773 update_curr(cfs_rq); 985 update_curr(cfs_rq);
986 update_cfs_load(cfs_rq, 0);
774 account_entity_enqueue(cfs_rq, se); 987 account_entity_enqueue(cfs_rq, se);
988 update_cfs_shares(cfs_rq);
775 989
776 if (flags & ENQUEUE_WAKEUP) { 990 if (flags & ENQUEUE_WAKEUP) {
777 place_entity(cfs_rq, se, 0); 991 place_entity(cfs_rq, se, 0);
@@ -782,21 +996,55 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 996 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 997 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 998 __enqueue_entity(cfs_rq, se);
999 se->on_rq = 1;
1000
1001 if (cfs_rq->nr_running == 1)
1002 list_add_leaf_cfs_rq(cfs_rq);
1003}
1004
1005static void __clear_buddies_last(struct sched_entity *se)
1006{
1007 for_each_sched_entity(se) {
1008 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1009 if (cfs_rq->last == se)
1010 cfs_rq->last = NULL;
1011 else
1012 break;
1013 }
785} 1014}
786 1015
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1016static void __clear_buddies_next(struct sched_entity *se)
788{ 1017{
789 if (!se || cfs_rq->last == se) 1018 for_each_sched_entity(se) {
790 cfs_rq->last = NULL; 1019 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1020 if (cfs_rq->next == se)
1021 cfs_rq->next = NULL;
1022 else
1023 break;
1024 }
1025}
791 1026
792 if (!se || cfs_rq->next == se) 1027static void __clear_buddies_skip(struct sched_entity *se)
793 cfs_rq->next = NULL; 1028{
1029 for_each_sched_entity(se) {
1030 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1031 if (cfs_rq->skip == se)
1032 cfs_rq->skip = NULL;
1033 else
1034 break;
1035 }
794} 1036}
795 1037
796static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1038static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
797{ 1039{
798 for_each_sched_entity(se) 1040 if (cfs_rq->last == se)
799 __clear_buddies(cfs_rq_of(se), se); 1041 __clear_buddies_last(se);
1042
1043 if (cfs_rq->next == se)
1044 __clear_buddies_next(se);
1045
1046 if (cfs_rq->skip == se)
1047 __clear_buddies_skip(se);
800} 1048}
801 1049
802static void 1050static void
@@ -825,8 +1073,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1073
826 if (se != cfs_rq->curr) 1074 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1075 __dequeue_entity(cfs_rq, se);
1076 se->on_rq = 0;
1077 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1078 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq);
830 1079
831 /* 1080 /*
832 * Normalize the entity after updating the min_vruntime because the 1081 * Normalize the entity after updating the min_vruntime because the
@@ -835,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
835 */ 1084 */
836 if (!(flags & DEQUEUE_SLEEP)) 1085 if (!(flags & DEQUEUE_SLEEP))
837 se->vruntime -= cfs_rq->min_vruntime; 1086 se->vruntime -= cfs_rq->min_vruntime;
1087
1088 update_min_vruntime(cfs_rq);
1089 update_cfs_shares(cfs_rq);
838} 1090}
839 1091
840/* 1092/*
@@ -869,9 +1121,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
869 return; 1121 return;
870 1122
871 if (cfs_rq->nr_running > 1) { 1123 if (cfs_rq->nr_running > 1) {
872 struct sched_entity *se = __pick_next_entity(cfs_rq); 1124 struct sched_entity *se = __pick_first_entity(cfs_rq);
873 s64 delta = curr->vruntime - se->vruntime; 1125 s64 delta = curr->vruntime - se->vruntime;
874 1126
1127 if (delta < 0)
1128 return;
1129
875 if (delta > ideal_runtime) 1130 if (delta > ideal_runtime)
876 resched_task(rq_of(cfs_rq)->curr); 1131 resched_task(rq_of(cfs_rq)->curr);
877 } 1132 }
@@ -910,13 +1165,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
910static int 1165static int
911wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1166wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
912 1167
1168/*
1169 * Pick the next process, keeping these things in mind, in this order:
1170 * 1) keep things fair between processes/task groups
1171 * 2) pick the "next" process, since someone really wants that to run
1172 * 3) pick the "last" process, for cache locality
1173 * 4) do not run the "skip" process, if something else is available
1174 */
913static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1175static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
914{ 1176{
915 struct sched_entity *se = __pick_next_entity(cfs_rq); 1177 struct sched_entity *se = __pick_first_entity(cfs_rq);
916 struct sched_entity *left = se; 1178 struct sched_entity *left = se;
917 1179
918 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1180 /*
919 se = cfs_rq->next; 1181 * Avoid running the skip buddy, if running something else can
1182 * be done without getting too unfair.
1183 */
1184 if (cfs_rq->skip == se) {
1185 struct sched_entity *second = __pick_next_entity(se);
1186 if (second && wakeup_preempt_entity(second, left) < 1)
1187 se = second;
1188 }
920 1189
921 /* 1190 /*
922 * Prefer last buddy, try to return the CPU to a preempted task. 1191 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -924,6 +1193,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
924 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1193 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
925 se = cfs_rq->last; 1194 se = cfs_rq->last;
926 1195
1196 /*
1197 * Someone really wants this to run. If it's not unfair, run it.
1198 */
1199 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1200 se = cfs_rq->next;
1201
927 clear_buddies(cfs_rq, se); 1202 clear_buddies(cfs_rq, se);
928 1203
929 return se; 1204 return se;
@@ -955,6 +1230,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1230 */
956 update_curr(cfs_rq); 1231 update_curr(cfs_rq);
957 1232
1233 /*
1234 * Update share accounting for long-running entities.
1235 */
1236 update_entity_shares_tick(cfs_rq);
1237
958#ifdef CONFIG_SCHED_HRTICK 1238#ifdef CONFIG_SCHED_HRTICK
959 /* 1239 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1240 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,9 +1335,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1335 flags = ENQUEUE_WAKEUP;
1056 } 1336 }
1057 1337
1338 for_each_sched_entity(se) {
1339 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1340
1341 update_cfs_load(cfs_rq, 0);
1342 update_cfs_shares(cfs_rq);
1343 }
1344
1058 hrtick_update(rq); 1345 hrtick_update(rq);
1059} 1346}
1060 1347
1348static void set_next_buddy(struct sched_entity *se);
1349
1061/* 1350/*
1062 * The dequeue_task method is called before nr_running is 1351 * The dequeue_task method is called before nr_running is
1063 * decreased. We remove the task from the rbtree and 1352 * decreased. We remove the task from the rbtree and
@@ -1067,73 +1356,56 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1067{ 1356{
1068 struct cfs_rq *cfs_rq; 1357 struct cfs_rq *cfs_rq;
1069 struct sched_entity *se = &p->se; 1358 struct sched_entity *se = &p->se;
1359 int task_sleep = flags & DEQUEUE_SLEEP;
1070 1360
1071 for_each_sched_entity(se) { 1361 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1362 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1363 dequeue_entity(cfs_rq, se, flags);
1364
1074 /* Don't dequeue parent if it has other entities besides us */ 1365 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1366 if (cfs_rq->load.weight) {
1367 /*
1368 * Bias pick_next to pick a task from this cfs_rq, as
1369 * p is sleeping when it is within its sched_slice.
1370 */
1371 if (task_sleep && parent_entity(se))
1372 set_next_buddy(parent_entity(se));
1076 break; 1373 break;
1374 }
1077 flags |= DEQUEUE_SLEEP; 1375 flags |= DEQUEUE_SLEEP;
1078 } 1376 }
1079 1377
1080 hrtick_update(rq); 1378 for_each_sched_entity(se) {
1081} 1379 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1082
1083/*
1084 * sched_yield() support is very simple - we dequeue and enqueue.
1085 *
1086 * If compat_yield is turned on then we requeue to the end of the tree.
1087 */
1088static void yield_task_fair(struct rq *rq)
1089{
1090 struct task_struct *curr = rq->curr;
1091 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1092 struct sched_entity *rightmost, *se = &curr->se;
1093
1094 /*
1095 * Are we the only task in the tree?
1096 */
1097 if (unlikely(cfs_rq->nr_running == 1))
1098 return;
1099
1100 clear_buddies(cfs_rq, se);
1101
1102 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1103 update_rq_clock(rq);
1104 /*
1105 * Update run-time statistics of the 'current'.
1106 */
1107 update_curr(cfs_rq);
1108 1380
1109 return; 1381 update_cfs_load(cfs_rq, 0);
1382 update_cfs_shares(cfs_rq);
1110 } 1383 }
1111 /*
1112 * Find the rightmost entry in the rbtree:
1113 */
1114 rightmost = __pick_last_entity(cfs_rq);
1115 /*
1116 * Already in the rightmost position?
1117 */
1118 if (unlikely(!rightmost || entity_before(rightmost, se)))
1119 return;
1120 1384
1121 /* 1385 hrtick_update(rq);
1122 * Minimally necessary key value to be last in the tree:
1123 * Upon rescheduling, sched_class::put_prev_task() will place
1124 * 'current' within the tree based on its new key value.
1125 */
1126 se->vruntime = rightmost->vruntime + 1;
1127} 1386}
1128 1387
1129#ifdef CONFIG_SMP 1388#ifdef CONFIG_SMP
1130 1389
1131static void task_waking_fair(struct rq *rq, struct task_struct *p) 1390static void task_waking_fair(struct task_struct *p)
1132{ 1391{
1133 struct sched_entity *se = &p->se; 1392 struct sched_entity *se = &p->se;
1134 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1393 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1394 u64 min_vruntime;
1135 1395
1136 se->vruntime -= cfs_rq->min_vruntime; 1396#ifndef CONFIG_64BIT
1397 u64 min_vruntime_copy;
1398
1399 do {
1400 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1401 smp_rmb();
1402 min_vruntime = cfs_rq->min_vruntime;
1403 } while (min_vruntime != min_vruntime_copy);
1404#else
1405 min_vruntime = cfs_rq->min_vruntime;
1406#endif
1407
1408 se->vruntime -= min_vruntime;
1137} 1409}
1138 1410
1139#ifdef CONFIG_FAIR_GROUP_SCHED 1411#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1143,67 +1415,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1415 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1416 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1417 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1418 */
1161static long effective_load(struct task_group *tg, int cpu, 1419static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1420{
1164 struct sched_entity *se = tg->se[cpu]; 1421 struct sched_entity *se = tg->se[cpu];
1165 1422
1166 if (!tg->parent) 1423 if (!tg->parent)
1167 return wl; 1424 return wl;
1168 1425
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1426 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1427 long lw, w;
1178 long more_w;
1179 1428
1180 /* 1429 tg = se->my_q->tg;
1181 * Instead of using this increment, also add the difference 1430 w = se->my_q->load.weight;
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1431
1188 S = se->my_q->tg->shares; 1432 /* use this cpu's instantaneous contribution */
1189 s = se->my_q->shares; 1433 lw = atomic_read(&tg->load_weight);
1190 rw = se->my_q->rq_weight; 1434 lw -= se->my_q->load_contribution;
1435 lw += w + wg;
1191 1436
1192 a = S*(rw + wl); 1437 wl += w;
1193 b = S*rw + s*wg;
1194 1438
1195 wl = s*(a-b); 1439 if (lw > 0 && wl < lw)
1196 1440 wl = (wl * tg->shares) / lw;
1197 if (likely(b)) 1441 else
1198 wl /= b; 1442 wl = tg->shares;
1199 1443
1200 /* 1444 /* zero point is MIN_SHARES */
1201 * Assume the group is already running and will 1445 if (wl < MIN_SHARES)
1202 * thus already be accounted for in the weight. 1446 wl = MIN_SHARES;
1203 * 1447 wl -= se->load.weight;
1204 * That is, moving shares between CPUs, does not
1205 * alter the group weight.
1206 */
1207 wg = 0; 1448 wg = 0;
1208 } 1449 }
1209 1450
@@ -1222,7 +1463,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1222 1463
1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1464static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1224{ 1465{
1225 unsigned long this_load, load; 1466 s64 this_load, load;
1226 int idx, this_cpu, prev_cpu; 1467 int idx, this_cpu, prev_cpu;
1227 unsigned long tl_per_task; 1468 unsigned long tl_per_task;
1228 struct task_group *tg; 1469 struct task_group *tg;
@@ -1261,8 +1502,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1261 * Otherwise check if either cpus are near enough in load to allow this 1502 * Otherwise check if either cpus are near enough in load to allow this
1262 * task to be woken on this_cpu. 1503 * task to be woken on this_cpu.
1263 */ 1504 */
1264 if (this_load) { 1505 if (this_load > 0) {
1265 unsigned long this_eff_load, prev_eff_load; 1506 s64 this_eff_load, prev_eff_load;
1266 1507
1267 this_eff_load = 100; 1508 this_eff_load = 100;
1268 this_eff_load *= power_of(prev_cpu); 1509 this_eff_load *= power_of(prev_cpu);
@@ -1344,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1344 } 1585 }
1345 1586
1346 /* Adjust by relative CPU power of the group */ 1587 /* Adjust by relative CPU power of the group */
1347 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1588 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
1348 1589
1349 if (local_group) { 1590 if (local_group) {
1350 this_load = avg_load; 1591 this_load = avg_load;
@@ -1409,6 +1650,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1409 /* 1650 /*
1410 * Otherwise, iterate the domains and find an eligible idle cpu. 1651 * Otherwise, iterate the domains and find an eligible idle cpu.
1411 */ 1652 */
1653 rcu_read_lock();
1412 for_each_domain(target, sd) { 1654 for_each_domain(target, sd) {
1413 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1655 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1414 break; 1656 break;
@@ -1428,6 +1670,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1428 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1670 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1429 break; 1671 break;
1430 } 1672 }
1673 rcu_read_unlock();
1431 1674
1432 return target; 1675 return target;
1433} 1676}
@@ -1444,7 +1687,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1444 * preempt must be disabled. 1687 * preempt must be disabled.
1445 */ 1688 */
1446static int 1689static int
1447select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1690select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1448{ 1691{
1449 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1692 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1450 int cpu = smp_processor_id(); 1693 int cpu = smp_processor_id();
@@ -1460,6 +1703,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1460 new_cpu = prev_cpu; 1703 new_cpu = prev_cpu;
1461 } 1704 }
1462 1705
1706 rcu_read_lock();
1463 for_each_domain(cpu, tmp) { 1707 for_each_domain(cpu, tmp) {
1464 if (!(tmp->flags & SD_LOAD_BALANCE)) 1708 if (!(tmp->flags & SD_LOAD_BALANCE))
1465 continue; 1709 continue;
@@ -1479,7 +1723,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1479 nr_running += cpu_rq(i)->cfs.nr_running; 1723 nr_running += cpu_rq(i)->cfs.nr_running;
1480 } 1724 }
1481 1725
1482 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 1726 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
1483 1727
1484 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1728 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1485 nr_running /= 2; 1729 nr_running /= 2;
@@ -1508,28 +1752,12 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1752 sd = tmp;
1509 } 1753 }
1510 1754
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1755 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1756 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1757 prev_cpu = cpu;
1531 else 1758
1532 return select_idle_sibling(p, prev_cpu); 1759 new_cpu = select_idle_sibling(p, prev_cpu);
1760 goto unlock;
1533 } 1761 }
1534 1762
1535 while (sd) { 1763 while (sd) {
@@ -1570,6 +1798,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1570 } 1798 }
1571 /* while loop will break here if sd == NULL */ 1799 /* while loop will break here if sd == NULL */
1572 } 1800 }
1801unlock:
1802 rcu_read_unlock();
1573 1803
1574 return new_cpu; 1804 return new_cpu;
1575} 1805}
@@ -1593,10 +1823,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1593 * This is especially important for buddies when the leftmost 1823 * This is especially important for buddies when the leftmost
1594 * task is higher priority than the buddy. 1824 * task is higher priority than the buddy.
1595 */ 1825 */
1596 if (unlikely(se->load.weight != NICE_0_LOAD)) 1826 return calc_delta_fair(gran, se);
1597 gran = calc_delta_fair(gran, se);
1598
1599 return gran;
1600} 1827}
1601 1828
1602/* 1829/*
@@ -1630,18 +1857,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1630 1857
1631static void set_last_buddy(struct sched_entity *se) 1858static void set_last_buddy(struct sched_entity *se)
1632{ 1859{
1633 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1860 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1634 for_each_sched_entity(se) 1861 return;
1635 cfs_rq_of(se)->last = se; 1862
1636 } 1863 for_each_sched_entity(se)
1864 cfs_rq_of(se)->last = se;
1637} 1865}
1638 1866
1639static void set_next_buddy(struct sched_entity *se) 1867static void set_next_buddy(struct sched_entity *se)
1640{ 1868{
1641 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1869 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1642 for_each_sched_entity(se) 1870 return;
1643 cfs_rq_of(se)->next = se; 1871
1644 } 1872 for_each_sched_entity(se)
1873 cfs_rq_of(se)->next = se;
1874}
1875
1876static void set_skip_buddy(struct sched_entity *se)
1877{
1878 for_each_sched_entity(se)
1879 cfs_rq_of(se)->skip = se;
1645} 1880}
1646 1881
1647/* 1882/*
@@ -1653,18 +1888,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1653 struct sched_entity *se = &curr->se, *pse = &p->se; 1888 struct sched_entity *se = &curr->se, *pse = &p->se;
1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1889 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1655 int scale = cfs_rq->nr_running >= sched_nr_latency; 1890 int scale = cfs_rq->nr_running >= sched_nr_latency;
1891 int next_buddy_marked = 0;
1656 1892
1657 if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) 1893 if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
1658 goto preempt; 1894 goto preempt;
1659 1895
1660 if (unlikely(p->sched_class != &fair_sched_class))
1661 return;
1662
1663 if (unlikely(se == pse)) 1896 if (unlikely(se == pse))
1664 return; 1897 return;
1665 1898
1666 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1899 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1667 set_next_buddy(pse); 1900 set_next_buddy(pse);
1901 next_buddy_marked = 1;
1902 }
1668 1903
1669 /* 1904 /*
1670 * We can come here with TIF_NEED_RESCHED already set from new task 1905 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1673,16 +1908,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1673 if (test_tsk_need_resched(curr)) 1908 if (test_tsk_need_resched(curr))
1674 return; 1909 return;
1675 1910
1911 /* Idle tasks are by definition preempted by non-idle tasks. */
1912 if (unlikely(curr->policy == SCHED_IDLE) &&
1913 likely(p->policy != SCHED_IDLE))
1914 goto preempt;
1915
1676 /* 1916 /*
1677 * Batch and idle tasks do not preempt (their preemption is driven by 1917 * Batch and idle tasks do not preempt non-idle tasks (their preemption
1678 * the tick): 1918 * is driven by the tick):
1679 */ 1919 */
1680 if (unlikely(p->policy != SCHED_NORMAL)) 1920 if (unlikely(p->policy != SCHED_NORMAL))
1681 return; 1921 return;
1682 1922
1683 /* Idle tasks are by definition preempted by everybody. */
1684 if (unlikely(curr->policy == SCHED_IDLE))
1685 goto preempt;
1686 1923
1687 if (!sched_feat(WAKEUP_PREEMPT)) 1924 if (!sched_feat(WAKEUP_PREEMPT))
1688 return; 1925 return;
@@ -1690,8 +1927,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1690 update_curr(cfs_rq); 1927 update_curr(cfs_rq);
1691 find_matching_se(&se, &pse); 1928 find_matching_se(&se, &pse);
1692 BUG_ON(!pse); 1929 BUG_ON(!pse);
1693 if (wakeup_preempt_entity(se, pse) == 1) 1930 if (wakeup_preempt_entity(se, pse) == 1) {
1931 /*
1932 * Bias pick_next to pick the sched entity that is
1933 * triggering this preemption.
1934 */
1935 if (!next_buddy_marked)
1936 set_next_buddy(pse);
1694 goto preempt; 1937 goto preempt;
1938 }
1695 1939
1696 return; 1940 return;
1697 1941
@@ -1748,6 +1992,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1748 } 1992 }
1749} 1993}
1750 1994
1995/*
1996 * sched_yield() is very simple
1997 *
1998 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1999 */
2000static void yield_task_fair(struct rq *rq)
2001{
2002 struct task_struct *curr = rq->curr;
2003 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2004 struct sched_entity *se = &curr->se;
2005
2006 /*
2007 * Are we the only task in the tree?
2008 */
2009 if (unlikely(rq->nr_running == 1))
2010 return;
2011
2012 clear_buddies(cfs_rq, se);
2013
2014 if (curr->policy != SCHED_BATCH) {
2015 update_rq_clock(rq);
2016 /*
2017 * Update run-time statistics of the 'current'.
2018 */
2019 update_curr(cfs_rq);
2020 }
2021
2022 set_skip_buddy(se);
2023}
2024
2025static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
2026{
2027 struct sched_entity *se = &p->se;
2028
2029 if (!se->on_rq)
2030 return false;
2031
2032 /* Tell the scheduler that we'd really like pse to run next. */
2033 set_next_buddy(se);
2034
2035 yield_task_fair(rq);
2036
2037 return true;
2038}
2039
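With compat_yield gone, sched_yield() no longer requeues the caller to the far right of the tree: yield_task_fair() merely marks it as the skip buddy, so pick_next_entity() passes over it once without destroying fairness, and yield_to_task_fair() additionally nominates a specific next buddy. From userspace the call looks the same as before; a trivial sketch:

#include <sched.h>

/* Busy-wait politely: each sched_yield() now just sets the skip buddy
 * for the caller instead of pushing it to the end of the timeline. */
static void polite_spin_wait(volatile int *flag)
{
	while (!*flag)
		sched_yield();
}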
1751#ifdef CONFIG_SMP 2040#ifdef CONFIG_SMP
1752/************************************************** 2041/**************************************************
1753 * Fair scheduling class load-balancing methods: 2042 * Fair scheduling class load-balancing methods:
@@ -1798,7 +2087,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1798 * 2) too many balance attempts have failed. 2087 * 2) too many balance attempts have failed.
1799 */ 2088 */
1800 2089
1801 tsk_cache_hot = task_hot(p, rq->clock, sd); 2090 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1802 if (!tsk_cache_hot || 2091 if (!tsk_cache_hot ||
1803 sd->nr_balance_failed > sd->cache_nice_tries) { 2092 sd->nr_balance_failed > sd->cache_nice_tries) {
1804#ifdef CONFIG_SCHEDSTATS 2093#ifdef CONFIG_SCHEDSTATS
@@ -1857,23 +2146,22 @@ static unsigned long
1857balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2146balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1858 unsigned long max_load_move, struct sched_domain *sd, 2147 unsigned long max_load_move, struct sched_domain *sd,
1859 enum cpu_idle_type idle, int *all_pinned, 2148 enum cpu_idle_type idle, int *all_pinned,
1860 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2149 struct cfs_rq *busiest_cfs_rq)
1861{ 2150{
1862 int loops = 0, pulled = 0, pinned = 0; 2151 int loops = 0, pulled = 0;
1863 long rem_load_move = max_load_move; 2152 long rem_load_move = max_load_move;
1864 struct task_struct *p, *n; 2153 struct task_struct *p, *n;
1865 2154
1866 if (max_load_move == 0) 2155 if (max_load_move == 0)
1867 goto out; 2156 goto out;
1868 2157
1869 pinned = 1;
1870
1871 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 2158 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1872 if (loops++ > sysctl_sched_nr_migrate) 2159 if (loops++ > sysctl_sched_nr_migrate)
1873 break; 2160 break;
1874 2161
1875 if ((p->se.load.weight >> 1) > rem_load_move || 2162 if ((p->se.load.weight >> 1) > rem_load_move ||
1876 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) 2163 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2164 all_pinned))
1877 continue; 2165 continue;
1878 2166
1879 pull_task(busiest, p, this_rq, this_cpu); 2167 pull_task(busiest, p, this_rq, this_cpu);
@@ -1896,9 +2184,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1896 */ 2184 */
1897 if (rem_load_move <= 0) 2185 if (rem_load_move <= 0)
1898 break; 2186 break;
1899
1900 if (p->prio < *this_best_prio)
1901 *this_best_prio = p->prio;
1902 } 2187 }
1903out: 2188out:
1904 /* 2189 /*
@@ -1908,18 +2193,57 @@ out:
1908 */ 2193 */
1909 schedstat_add(sd, lb_gained[idle], pulled); 2194 schedstat_add(sd, lb_gained[idle], pulled);
1910 2195
1911 if (all_pinned)
1912 *all_pinned = pinned;
1913
1914 return max_load_move - rem_load_move; 2196 return max_load_move - rem_load_move;
1915} 2197}
1916 2198
1917#ifdef CONFIG_FAIR_GROUP_SCHED 2199#ifdef CONFIG_FAIR_GROUP_SCHED
2200/*
2201 * update tg->load_weight by folding this cpu's load_avg
2202 */
2203static int update_shares_cpu(struct task_group *tg, int cpu)
2204{
2205 struct cfs_rq *cfs_rq;
2206 unsigned long flags;
2207 struct rq *rq;
2208
2209 if (!tg->se[cpu])
2210 return 0;
2211
2212 rq = cpu_rq(cpu);
2213 cfs_rq = tg->cfs_rq[cpu];
2214
2215 raw_spin_lock_irqsave(&rq->lock, flags);
2216
2217 update_rq_clock(rq);
2218 update_cfs_load(cfs_rq, 1);
2219
2220 /*
2221 * We need to update shares after updating tg->load_weight in
2222 * order to adjust the weight of groups with long running tasks.
2223 */
2224 update_cfs_shares(cfs_rq);
2225
2226 raw_spin_unlock_irqrestore(&rq->lock, flags);
2227
2228 return 0;
2229}
2230
2231static void update_shares(int cpu)
2232{
2233 struct cfs_rq *cfs_rq;
2234 struct rq *rq = cpu_rq(cpu);
2235
2236 rcu_read_lock();
2237 for_each_leaf_cfs_rq(rq, cfs_rq)
2238 update_shares_cpu(cfs_rq->tg, cpu);
2239 rcu_read_unlock();
2240}
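update_shares_cpu() above folds one CPU's load average into tg->load_weight and then recomputes the group entity's weight. A rough stand-alone sketch of the proportional idea, assuming a group's weight on a CPU is its share of the group's total load; the names and the exact formula are illustrative, not the kernel's update_cfs_shares()/calc_cfs_shares().

/* Rough proportional-share sketch; illustrative only. */
#include <stdio.h>

static unsigned long group_se_weight(unsigned long tg_shares,
                                     unsigned long cpu_load,
                                     unsigned long tg_total_load)
{
        if (!tg_total_load)
                return tg_shares;
        return tg_shares * cpu_load / tg_total_load;
}

int main(void)
{
        /* group configured with 1024 shares; this CPU holds 1/4 of its load */
        printf("%lu\n", group_se_weight(1024, 512, 2048));      /* 256 */
        return 0;
}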
2241
1918static unsigned long 2242static unsigned long
1919load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2243load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1920 unsigned long max_load_move, 2244 unsigned long max_load_move,
1921 struct sched_domain *sd, enum cpu_idle_type idle, 2245 struct sched_domain *sd, enum cpu_idle_type idle,
1922 int *all_pinned, int *this_best_prio) 2246 int *all_pinned)
1923{ 2247{
1924 long rem_load_move = max_load_move; 2248 long rem_load_move = max_load_move;
1925 int busiest_cpu = cpu_of(busiest); 2249 int busiest_cpu = cpu_of(busiest);
@@ -1944,7 +2268,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1944 rem_load = div_u64(rem_load, busiest_h_load + 1); 2268 rem_load = div_u64(rem_load, busiest_h_load + 1);
1945 2269
1946 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2270 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1947 rem_load, sd, idle, all_pinned, this_best_prio, 2271 rem_load, sd, idle, all_pinned,
1948 busiest_cfs_rq); 2272 busiest_cfs_rq);
1949 2273
1950 if (!moved_load) 2274 if (!moved_load)
@@ -1962,15 +2286,19 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1962 return max_load_move - rem_load_move; 2286 return max_load_move - rem_load_move;
1963} 2287}
1964#else 2288#else
2289static inline void update_shares(int cpu)
2290{
2291}
2292
1965static unsigned long 2293static unsigned long
1966load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2294load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1967 unsigned long max_load_move, 2295 unsigned long max_load_move,
1968 struct sched_domain *sd, enum cpu_idle_type idle, 2296 struct sched_domain *sd, enum cpu_idle_type idle,
1969 int *all_pinned, int *this_best_prio) 2297 int *all_pinned)
1970{ 2298{
1971 return balance_tasks(this_rq, this_cpu, busiest, 2299 return balance_tasks(this_rq, this_cpu, busiest,
1972 max_load_move, sd, idle, all_pinned, 2300 max_load_move, sd, idle, all_pinned,
1973 this_best_prio, &busiest->cfs); 2301 &busiest->cfs);
1974} 2302}
1975#endif 2303#endif
1976 2304
@@ -1987,12 +2315,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1987 int *all_pinned) 2315 int *all_pinned)
1988{ 2316{
1989 unsigned long total_load_moved = 0, load_moved; 2317 unsigned long total_load_moved = 0, load_moved;
1990 int this_best_prio = this_rq->curr->prio;
1991 2318
1992 do { 2319 do {
1993 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2320 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
1994 max_load_move - total_load_moved, 2321 max_load_move - total_load_moved,
1995 sd, idle, all_pinned, &this_best_prio); 2322 sd, idle, all_pinned);
1996 2323
1997 total_load_moved += load_moved; 2324 total_load_moved += load_moved;
1998 2325
@@ -2030,12 +2357,17 @@ struct sd_lb_stats {
2030 unsigned long this_load; 2357 unsigned long this_load;
2031 unsigned long this_load_per_task; 2358 unsigned long this_load_per_task;
2032 unsigned long this_nr_running; 2359 unsigned long this_nr_running;
2360 unsigned long this_has_capacity;
2361 unsigned int this_idle_cpus;
2033 2362
2034 /* Statistics of the busiest group */ 2363 /* Statistics of the busiest group */
2364 unsigned int busiest_idle_cpus;
2035 unsigned long max_load; 2365 unsigned long max_load;
2036 unsigned long busiest_load_per_task; 2366 unsigned long busiest_load_per_task;
2037 unsigned long busiest_nr_running; 2367 unsigned long busiest_nr_running;
2038 unsigned long busiest_group_capacity; 2368 unsigned long busiest_group_capacity;
2369 unsigned long busiest_has_capacity;
2370 unsigned int busiest_group_weight;
2039 2371
2040 int group_imb; /* Is there imbalance in this sd */ 2372 int group_imb; /* Is there imbalance in this sd */
2041#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2373#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2057,7 +2389,10 @@ struct sg_lb_stats {
2057 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2389 unsigned long sum_nr_running; /* Nr tasks running in the group */
2058 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2390 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2059 unsigned long group_capacity; 2391 unsigned long group_capacity;
2392 unsigned long idle_cpus;
2393 unsigned long group_weight;
2060 int group_imb; /* Is there an imbalance in the group ? */ 2394 int group_imb; /* Is there an imbalance in the group ? */
2395 int group_has_capacity; /* Is there extra capacity in the group? */
2061}; 2396};
2062 2397
2063/** 2398/**
@@ -2239,7 +2574,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2239 2574
2240unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 2575unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2241{ 2576{
2242 return SCHED_LOAD_SCALE; 2577 return SCHED_POWER_SCALE;
2243} 2578}
2244 2579
2245unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 2580unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2268,12 +2603,18 @@ unsigned long scale_rt_power(int cpu)
2268 u64 total, available; 2603 u64 total, available;
2269 2604
2270 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2605 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2271 available = total - rq->rt_avg;
2272 2606
2273 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2607 if (unlikely(total < rq->rt_avg)) {
2274 total = SCHED_LOAD_SCALE; 2608 /* Ensures that power won't end up being negative */
2609 available = 0;
2610 } else {
2611 available = total - rq->rt_avg;
2612 }
2613
2614 if (unlikely((s64)total < SCHED_POWER_SCALE))
2615 total = SCHED_POWER_SCALE;
2275 2616
2276 total >>= SCHED_LOAD_SHIFT; 2617 total >>= SCHED_POWER_SHIFT;
2277 2618
2278 return div_u64(available, total); 2619 return div_u64(available, total);
2279} 2620}
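A stand-alone rendering of the scale_rt_power() arithmetic above, with invented nanosecond values: the result is roughly SCHED_POWER_SCALE * available / total, i.e. the fraction of the averaging window left over after RT activity.

/* Stand-alone arithmetic only; the nanosecond values are invented. */
#include <stdio.h>
#include <stdint.h>

#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

int main(void)
{
        uint64_t total  = 1000000;      /* averaging window, ns         */
        uint64_t rt_avg =  250000;      /* time consumed by RT activity */
        uint64_t available = total > rt_avg ? total - rt_avg : 0;

        if (total < SCHED_POWER_SCALE)
                total = SCHED_POWER_SCALE;

        /* ~1024 * available / total: prints 768, i.e. 75% of full power */
        printf("%llu\n",
               (unsigned long long)(available / (total >> SCHED_POWER_SHIFT)));
        return 0;
}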
@@ -2281,7 +2622,7 @@ unsigned long scale_rt_power(int cpu)
2281static void update_cpu_power(struct sched_domain *sd, int cpu) 2622static void update_cpu_power(struct sched_domain *sd, int cpu)
2282{ 2623{
2283 unsigned long weight = sd->span_weight; 2624 unsigned long weight = sd->span_weight;
2284 unsigned long power = SCHED_LOAD_SCALE; 2625 unsigned long power = SCHED_POWER_SCALE;
2285 struct sched_group *sdg = sd->groups; 2626 struct sched_group *sdg = sd->groups;
2286 2627
2287 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2628 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2290,26 +2631,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2290 else 2631 else
2291 power *= default_scale_smt_power(sd, cpu); 2632 power *= default_scale_smt_power(sd, cpu);
2292 2633
2293 power >>= SCHED_LOAD_SHIFT; 2634 power >>= SCHED_POWER_SHIFT;
2294 } 2635 }
2295 2636
2296 sdg->cpu_power_orig = power; 2637 sdg->sgp->power_orig = power;
2297 2638
2298 if (sched_feat(ARCH_POWER)) 2639 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_freq_power(sd, cpu); 2640 power *= arch_scale_freq_power(sd, cpu);
2300 else 2641 else
2301 power *= default_scale_freq_power(sd, cpu); 2642 power *= default_scale_freq_power(sd, cpu);
2302 2643
2303 power >>= SCHED_LOAD_SHIFT; 2644 power >>= SCHED_POWER_SHIFT;
2304 2645
2305 power *= scale_rt_power(cpu); 2646 power *= scale_rt_power(cpu);
2306 power >>= SCHED_LOAD_SHIFT; 2647 power >>= SCHED_POWER_SHIFT;
2307 2648
2308 if (!power) 2649 if (!power)
2309 power = 1; 2650 power = 1;
2310 2651
2311 cpu_rq(cpu)->cpu_power = power; 2652 cpu_rq(cpu)->cpu_power = power;
2312 sdg->cpu_power = power; 2653 sdg->sgp->power = power;
2313} 2654}
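Illustrative arithmetic for the cascade in update_cpu_power() above: each factor is expressed against SCHED_POWER_SCALE and shifted back down, so the scalings compose multiplicatively. The factor values below are made up (589 is merely a plausible SMT factor, 768 a plausible RT-pressure factor).

/* Illustrative factor values, expressed against SCHED_POWER_SCALE. */
#include <stdio.h>

#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

int main(void)
{
        unsigned long power = SCHED_POWER_SCALE;   /* 1024                   */
        unsigned long smt   = 589;                 /* ~0.58, SMT sibling     */
        unsigned long rt    = 768;                 /* ~0.75, RT/irq pressure */

        power = power * smt >> SCHED_POWER_SHIFT;  /* 589 */
        power = power * rt  >> SCHED_POWER_SHIFT;  /* 441 */
        if (!power)
                power = 1;
        printf("cpu_power = %lu\n", power);
        return 0;
}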
2314 2655
2315static void update_group_power(struct sched_domain *sd, int cpu) 2656static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2327,11 +2668,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2327 2668
2328 group = child->groups; 2669 group = child->groups;
2329 do { 2670 do {
2330 power += group->cpu_power; 2671 power += group->sgp->power;
2331 group = group->next; 2672 group = group->next;
2332 } while (group != child->groups); 2673 } while (group != child->groups);
2333 2674
2334 sdg->cpu_power = power; 2675 sdg->sgp->power = power;
2335} 2676}
2336 2677
2337/* 2678/*
@@ -2345,15 +2686,15 @@ static inline int
2345fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 2686fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2346{ 2687{
2347 /* 2688 /*
2348 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2689 * Only siblings can have significantly less than SCHED_POWER_SCALE
2349 */ 2690 */
2350 if (sd->level != SD_LV_SIBLING) 2691 if (!(sd->flags & SD_SHARE_CPUPOWER))
2351 return 0; 2692 return 0;
2352 2693
2353 /* 2694 /*
2354 * If ~90% of the cpu_power is still there, we're good. 2695 * If ~90% of the cpu_power is still there, we're good.
2355 */ 2696 */
2356 if (group->cpu_power * 32 > group->cpu_power_orig * 29) 2697 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
2357 return 1; 2698 return 1;
2358 2699
2359 return 0; 2700 return 0;
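The "~90%" test above is plain integer arithmetic: power * 32 > power_orig * 29 is equivalent to power / power_orig > 29/32, about 0.906. A tiny sketch with invented numbers:

/* Invented numbers; the comparison is the same shape as above. */
#include <stdio.h>

int main(void)
{
        unsigned long power_orig = 589;    /* e.g. an SMT sibling        */
        unsigned long power      = 540;    /* after frequency/RT scaling */

        /* 540 * 32 = 17280 > 589 * 29 = 17081: still counts as one CPU */
        printf("capacity = %d\n", power * 32 > power_orig * 29 ? 1 : 0);
        return 0;
}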
@@ -2366,7 +2707,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2366 * @this_cpu: Cpu for which load balance is currently performed. 2707 * @this_cpu: Cpu for which load balance is currently performed.
2367 * @idle: Idle status of this_cpu 2708 * @idle: Idle status of this_cpu
2368 * @load_idx: Load index of sched_domain of this_cpu for load calc. 2709 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2369 * @sd_idle: Idle status of the sched_domain containing group.
2370 * @local_group: Does group contain this_cpu. 2710 * @local_group: Does group contain this_cpu.
2371 * @cpus: Set of cpus considered for load balancing. 2711 * @cpus: Set of cpus considered for load balancing.
2372 * @balance: Should we balance. 2712 * @balance: Should we balance.
@@ -2374,11 +2714,11 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2374 */ 2714 */
2375static inline void update_sg_lb_stats(struct sched_domain *sd, 2715static inline void update_sg_lb_stats(struct sched_domain *sd,
2376 struct sched_group *group, int this_cpu, 2716 struct sched_group *group, int this_cpu,
2377 enum cpu_idle_type idle, int load_idx, int *sd_idle, 2717 enum cpu_idle_type idle, int load_idx,
2378 int local_group, const struct cpumask *cpus, 2718 int local_group, const struct cpumask *cpus,
2379 int *balance, struct sg_lb_stats *sgs) 2719 int *balance, struct sg_lb_stats *sgs)
2380{ 2720{
2381 unsigned long load, max_cpu_load, min_cpu_load; 2721 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2382 int i; 2722 int i;
2383 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2723 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2384 unsigned long avg_load_per_task = 0; 2724 unsigned long avg_load_per_task = 0;
@@ -2389,13 +2729,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2389 /* Tally up the load of all CPUs in the group */ 2729 /* Tally up the load of all CPUs in the group */
2390 max_cpu_load = 0; 2730 max_cpu_load = 0;
2391 min_cpu_load = ~0UL; 2731 min_cpu_load = ~0UL;
2732 max_nr_running = 0;
2392 2733
2393 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2734 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2394 struct rq *rq = cpu_rq(i); 2735 struct rq *rq = cpu_rq(i);
2395 2736
2396 if (*sd_idle && rq->nr_running)
2397 *sd_idle = 0;
2398
2399 /* Bias balancing toward cpus of our domain */ 2737 /* Bias balancing toward cpus of our domain */
2400 if (local_group) { 2738 if (local_group) {
2401 if (idle_cpu(i) && !first_idle_cpu) { 2739 if (idle_cpu(i) && !first_idle_cpu) {
@@ -2406,8 +2744,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2406 load = target_load(i, load_idx); 2744 load = target_load(i, load_idx);
2407 } else { 2745 } else {
2408 load = source_load(i, load_idx); 2746 load = source_load(i, load_idx);
2409 if (load > max_cpu_load) 2747 if (load > max_cpu_load) {
2410 max_cpu_load = load; 2748 max_cpu_load = load;
2749 max_nr_running = rq->nr_running;
2750 }
2411 if (min_cpu_load > load) 2751 if (min_cpu_load > load)
2412 min_cpu_load = load; 2752 min_cpu_load = load;
2413 } 2753 }
@@ -2415,7 +2755,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2415 sgs->group_load += load; 2755 sgs->group_load += load;
2416 sgs->sum_nr_running += rq->nr_running; 2756 sgs->sum_nr_running += rq->nr_running;
2417 sgs->sum_weighted_load += weighted_cpuload(i); 2757 sgs->sum_weighted_load += weighted_cpuload(i);
2418 2758 if (idle_cpu(i))
2759 sgs->idle_cpus++;
2419 } 2760 }
2420 2761
2421 /* 2762 /*
@@ -2433,11 +2774,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2433 } 2774 }
2434 2775
2435 /* Adjust by relative CPU power of the group */ 2776 /* Adjust by relative CPU power of the group */
2436 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2777 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
2437 2778
2438 /* 2779 /*
2439 * Consider the group unbalanced when the imbalance is larger 2780 * Consider the group unbalanced when the imbalance is larger
2440 * than the average weight of two tasks. 2781 * than the average weight of a task.
2441 * 2782 *
2442 * APZ: with cgroup the avg task weight can vary wildly and 2783 * APZ: with cgroup the avg task weight can vary wildly and
2443 * might not be a suitable number - should we keep a 2784 * might not be a suitable number - should we keep a
@@ -2447,13 +2788,17 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2447 if (sgs->sum_nr_running) 2788 if (sgs->sum_nr_running)
2448 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2789 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2449 2790
2450 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 2791 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2451 sgs->group_imb = 1; 2792 sgs->group_imb = 1;
2452 2793
2453 sgs->group_capacity = 2794 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
2454 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2795 SCHED_POWER_SCALE);
2455 if (!sgs->group_capacity) 2796 if (!sgs->group_capacity)
2456 sgs->group_capacity = fix_small_capacity(sd, group); 2797 sgs->group_capacity = fix_small_capacity(sd, group);
2798 sgs->group_weight = group->group_weight;
2799
2800 if (sgs->group_capacity > sgs->sum_nr_running)
2801 sgs->group_has_capacity = 1;
2457} 2802}
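Worked numbers for the group statistics computed above, assuming SCHED_POWER_SCALE = 1024: a group whose combined power is 2048 rounds to a capacity of two tasks, and it is flagged as having spare capacity while it runs fewer tasks than that.

/* Invented numbers; DIV_ROUND_CLOSEST is re-declared only so the
 * sketch is self-contained. */
#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        unsigned long group_power = 2048, sum_nr_running = 1;
        unsigned long capacity = DIV_ROUND_CLOSEST(group_power,
                                                   SCHED_POWER_SCALE);

        printf("capacity=%lu has_capacity=%d\n",
               capacity, capacity > sum_nr_running);        /* 2, 1 */
        return 0;
}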
2458 2803
2459/** 2804/**
@@ -2504,15 +2849,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2504 * @sd: sched_domain whose statistics are to be updated. 2849 * @sd: sched_domain whose statistics are to be updated.
2505 * @this_cpu: Cpu for which load balance is currently performed. 2850 * @this_cpu: Cpu for which load balance is currently performed.
2506 * @idle: Idle status of this_cpu 2851 * @idle: Idle status of this_cpu
2507 * @sd_idle: Idle status of the sched_domain containing sg.
2508 * @cpus: Set of cpus considered for load balancing. 2852 * @cpus: Set of cpus considered for load balancing.
2509 * @balance: Should we balance. 2853 * @balance: Should we balance.
2510 * @sds: variable to hold the statistics for this sched_domain. 2854 * @sds: variable to hold the statistics for this sched_domain.
2511 */ 2855 */
2512static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 2856static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2513 enum cpu_idle_type idle, int *sd_idle, 2857 enum cpu_idle_type idle, const struct cpumask *cpus,
2514 const struct cpumask *cpus, int *balance, 2858 int *balance, struct sd_lb_stats *sds)
2515 struct sd_lb_stats *sds)
2516{ 2859{
2517 struct sched_domain *child = sd->child; 2860 struct sched_domain *child = sd->child;
2518 struct sched_group *sg = sd->groups; 2861 struct sched_group *sg = sd->groups;
@@ -2530,21 +2873,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2530 2873
2531 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 2874 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2532 memset(&sgs, 0, sizeof(sgs)); 2875 memset(&sgs, 0, sizeof(sgs));
2533 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, 2876 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2534 local_group, cpus, balance, &sgs); 2877 local_group, cpus, balance, &sgs);
2535 2878
2536 if (local_group && !(*balance)) 2879 if (local_group && !(*balance))
2537 return; 2880 return;
2538 2881
2539 sds->total_load += sgs.group_load; 2882 sds->total_load += sgs.group_load;
2540 sds->total_pwr += sg->cpu_power; 2883 sds->total_pwr += sg->sgp->power;
2541 2884
2542 /* 2885 /*
2543 * In case the child domain prefers tasks go to siblings 2886 * In case the child domain prefers tasks go to siblings
2544 * first, lower the sg capacity to one so that we'll try 2887 * first, lower the sg capacity to one so that we'll try
2545 * and move all the excess tasks away. 2888 * and move all the excess tasks away. We lower the capacity
2889 * of a group only if the local group has the capacity to fit
2890 * these excess tasks, i.e. nr_running < group_capacity. The
2891 * extra check prevents the case where you always pull from the
2892 * heaviest group when it is already under-utilized (possible
 2893 * when a large weight task outweighs the tasks on the system).
2546 */ 2894 */
2547 if (prefer_sibling) 2895 if (prefer_sibling && !local_group && sds->this_has_capacity)
2548 sgs.group_capacity = min(sgs.group_capacity, 1UL); 2896 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2549 2897
2550 if (local_group) { 2898 if (local_group) {
@@ -2552,12 +2900,17 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2552 sds->this = sg; 2900 sds->this = sg;
2553 sds->this_nr_running = sgs.sum_nr_running; 2901 sds->this_nr_running = sgs.sum_nr_running;
2554 sds->this_load_per_task = sgs.sum_weighted_load; 2902 sds->this_load_per_task = sgs.sum_weighted_load;
2903 sds->this_has_capacity = sgs.group_has_capacity;
2904 sds->this_idle_cpus = sgs.idle_cpus;
2555 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2905 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2556 sds->max_load = sgs.avg_load; 2906 sds->max_load = sgs.avg_load;
2557 sds->busiest = sg; 2907 sds->busiest = sg;
2558 sds->busiest_nr_running = sgs.sum_nr_running; 2908 sds->busiest_nr_running = sgs.sum_nr_running;
2909 sds->busiest_idle_cpus = sgs.idle_cpus;
2559 sds->busiest_group_capacity = sgs.group_capacity; 2910 sds->busiest_group_capacity = sgs.group_capacity;
2560 sds->busiest_load_per_task = sgs.sum_weighted_load; 2911 sds->busiest_load_per_task = sgs.sum_weighted_load;
2912 sds->busiest_has_capacity = sgs.group_has_capacity;
2913 sds->busiest_group_weight = sgs.group_weight;
2561 sds->group_imb = sgs.group_imb; 2914 sds->group_imb = sgs.group_imb;
2562 } 2915 }
2563 2916
@@ -2612,8 +2965,8 @@ static int check_asym_packing(struct sched_domain *sd,
2612 if (this_cpu > busiest_cpu) 2965 if (this_cpu > busiest_cpu)
2613 return 0; 2966 return 0;
2614 2967
2615 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, 2968 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
2616 SCHED_LOAD_SCALE); 2969 SCHED_POWER_SCALE);
2617 return 1; 2970 return 1;
2618} 2971}
2619 2972
@@ -2642,8 +2995,8 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2642 cpu_avg_load_per_task(this_cpu); 2995 cpu_avg_load_per_task(this_cpu);
2643 2996
2644 scaled_busy_load_per_task = sds->busiest_load_per_task 2997 scaled_busy_load_per_task = sds->busiest_load_per_task
2645 * SCHED_LOAD_SCALE; 2998 * SCHED_POWER_SCALE;
2646 scaled_busy_load_per_task /= sds->busiest->cpu_power; 2999 scaled_busy_load_per_task /= sds->busiest->sgp->power;
2647 3000
2648 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3001 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2649 (scaled_busy_load_per_task * imbn)) { 3002 (scaled_busy_load_per_task * imbn)) {
@@ -2657,30 +3010,30 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2657 * moving them. 3010 * moving them.
2658 */ 3011 */
2659 3012
2660 pwr_now += sds->busiest->cpu_power * 3013 pwr_now += sds->busiest->sgp->power *
2661 min(sds->busiest_load_per_task, sds->max_load); 3014 min(sds->busiest_load_per_task, sds->max_load);
2662 pwr_now += sds->this->cpu_power * 3015 pwr_now += sds->this->sgp->power *
2663 min(sds->this_load_per_task, sds->this_load); 3016 min(sds->this_load_per_task, sds->this_load);
2664 pwr_now /= SCHED_LOAD_SCALE; 3017 pwr_now /= SCHED_POWER_SCALE;
2665 3018
2666 /* Amount of load we'd subtract */ 3019 /* Amount of load we'd subtract */
2667 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3020 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2668 sds->busiest->cpu_power; 3021 sds->busiest->sgp->power;
2669 if (sds->max_load > tmp) 3022 if (sds->max_load > tmp)
2670 pwr_move += sds->busiest->cpu_power * 3023 pwr_move += sds->busiest->sgp->power *
2671 min(sds->busiest_load_per_task, sds->max_load - tmp); 3024 min(sds->busiest_load_per_task, sds->max_load - tmp);
2672 3025
2673 /* Amount of load we'd add */ 3026 /* Amount of load we'd add */
2674 if (sds->max_load * sds->busiest->cpu_power < 3027 if (sds->max_load * sds->busiest->sgp->power <
2675 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3028 sds->busiest_load_per_task * SCHED_POWER_SCALE)
2676 tmp = (sds->max_load * sds->busiest->cpu_power) / 3029 tmp = (sds->max_load * sds->busiest->sgp->power) /
2677 sds->this->cpu_power; 3030 sds->this->sgp->power;
2678 else 3031 else
2679 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3032 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2680 sds->this->cpu_power; 3033 sds->this->sgp->power;
2681 pwr_move += sds->this->cpu_power * 3034 pwr_move += sds->this->sgp->power *
2682 min(sds->this_load_per_task, sds->this_load + tmp); 3035 min(sds->this_load_per_task, sds->this_load + tmp);
2683 pwr_move /= SCHED_LOAD_SCALE; 3036 pwr_move /= SCHED_POWER_SCALE;
2684 3037
2685 /* Move if we gain throughput */ 3038 /* Move if we gain throughput */
2686 if (pwr_move > pwr_now) 3039 if (pwr_move > pwr_now)
@@ -2722,9 +3075,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2722 load_above_capacity = (sds->busiest_nr_running - 3075 load_above_capacity = (sds->busiest_nr_running -
2723 sds->busiest_group_capacity); 3076 sds->busiest_group_capacity);
2724 3077
2725 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); 3078 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
2726 3079
2727 load_above_capacity /= sds->busiest->cpu_power; 3080 load_above_capacity /= sds->busiest->sgp->power;
2728 } 3081 }
2729 3082
2730 /* 3083 /*
@@ -2740,13 +3093,13 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2740 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3093 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2741 3094
2742 /* How much load to actually move to equalise the imbalance */ 3095 /* How much load to actually move to equalise the imbalance */
2743 *imbalance = min(max_pull * sds->busiest->cpu_power, 3096 *imbalance = min(max_pull * sds->busiest->sgp->power,
2744 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3097 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
2745 / SCHED_LOAD_SCALE; 3098 / SCHED_POWER_SCALE;
2746 3099
2747 /* 3100 /*
2748 * if *imbalance is less than the average load per runnable task 3101 * if *imbalance is less than the average load per runnable task
2749 * there is no gaurantee that any tasks will be moved so we'll have 3102 * there is no guarantee that any tasks will be moved so we'll have
2750 * a think about bumping its value to force at least one task to be 3103 * a think about bumping its value to force at least one task to be
2751 * moved 3104 * moved
2752 */ 3105 */
@@ -2754,6 +3107,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2754 return fix_small_imbalance(sds, this_cpu, imbalance); 3107 return fix_small_imbalance(sds, this_cpu, imbalance);
2755 3108
2756} 3109}
3110
2757/******* find_busiest_group() helpers end here *********************/ 3111/******* find_busiest_group() helpers end here *********************/
2758 3112
2759/** 3113/**
@@ -2771,7 +3125,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2771 * @imbalance: Variable which stores amount of weighted load which should 3125 * @imbalance: Variable which stores amount of weighted load which should
2772 * be moved to restore balance/put a group to idle. 3126 * be moved to restore balance/put a group to idle.
2773 * @idle: The idle status of this_cpu. 3127 * @idle: The idle status of this_cpu.
2774 * @sd_idle: The idleness of sd
2775 * @cpus: The set of CPUs under consideration for load-balancing. 3128 * @cpus: The set of CPUs under consideration for load-balancing.
2776 * @balance: Pointer to a variable indicating if this_cpu 3129 * @balance: Pointer to a variable indicating if this_cpu
2777 * is the appropriate cpu to perform load balancing at this_level. 3130 * is the appropriate cpu to perform load balancing at this_level.
@@ -2784,7 +3137,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2784static struct sched_group * 3137static struct sched_group *
2785find_busiest_group(struct sched_domain *sd, int this_cpu, 3138find_busiest_group(struct sched_domain *sd, int this_cpu,
2786 unsigned long *imbalance, enum cpu_idle_type idle, 3139 unsigned long *imbalance, enum cpu_idle_type idle,
2787 int *sd_idle, const struct cpumask *cpus, int *balance) 3140 const struct cpumask *cpus, int *balance)
2788{ 3141{
2789 struct sd_lb_stats sds; 3142 struct sd_lb_stats sds;
2790 3143
@@ -2794,17 +3147,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 2794 * Compute the various statistics relevant for load balancing at 3147
2795 * this level. 3148 * this level.
2796 */ 3149 */
2797 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3150 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
2798 balance, &sds);
2799 3151
2800 /* Cases where imbalance does not exist from POV of this_cpu */ 3152 /*
2801 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3153 * this_cpu is not the appropriate cpu to perform load balancing at
2802 * at this level. 3154 * this level.
2803 * 2) There is no busy sibling group to pull from.
2804 * 3) This group is the busiest group.
2805 * 4) This group is more busy than the avg busieness at this
2806 * sched_domain.
2807 * 5) The imbalance is within the specified limit.
2808 */ 3155 */
2809 if (!(*balance)) 3156 if (!(*balance))
2810 goto ret; 3157 goto ret;
@@ -2813,20 +3160,59 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2813 check_asym_packing(sd, &sds, this_cpu, imbalance)) 3160 check_asym_packing(sd, &sds, this_cpu, imbalance))
2814 return sds.busiest; 3161 return sds.busiest;
2815 3162
3163 /* There is no busy sibling group to pull tasks from */
2816 if (!sds.busiest || sds.busiest_nr_running == 0) 3164 if (!sds.busiest || sds.busiest_nr_running == 0)
2817 goto out_balanced; 3165 goto out_balanced;
2818 3166
3167 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
3168
3169 /*
3170 * If the busiest group is imbalanced the below checks don't
 3171 * work because they assume all things are equal, which typically
3172 * isn't true due to cpus_allowed constraints and the like.
3173 */
3174 if (sds.group_imb)
3175 goto force_balance;
3176
3177 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
3178 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3179 !sds.busiest_has_capacity)
3180 goto force_balance;
3181
3182 /*
3183 * If the local group is more busy than the selected busiest group
3184 * don't try and pull any tasks.
3185 */
2819 if (sds.this_load >= sds.max_load) 3186 if (sds.this_load >= sds.max_load)
2820 goto out_balanced; 3187 goto out_balanced;
2821 3188
2822 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3189 /*
2823 3190 * Don't pull any tasks if this group is already above the domain
3191 * average load.
3192 */
2824 if (sds.this_load >= sds.avg_load) 3193 if (sds.this_load >= sds.avg_load)
2825 goto out_balanced; 3194 goto out_balanced;
2826 3195
2827 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 3196 if (idle == CPU_IDLE) {
2828 goto out_balanced; 3197 /*
3198 * This cpu is idle. If the busiest group load doesn't
 3199 * have more tasks than the number of available CPUs and
 3200 * there is no imbalance between this and the busiest group
 3201 * wrt idle CPUs, it is balanced.
3202 */
3203 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3204 sds.busiest_nr_running <= sds.busiest_group_weight)
3205 goto out_balanced;
3206 } else {
3207 /*
3208 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
3209 * imbalance_pct to be conservative.
3210 */
3211 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3212 goto out_balanced;
3213 }
2829 3214
3215force_balance:
2830 /* Looks like there is an imbalance. Compute it */ 3216 /* Looks like there is an imbalance. Compute it */
2831 calculate_imbalance(&sds, this_cpu, imbalance); 3217 calculate_imbalance(&sds, this_cpu, imbalance);
2832 return sds.busiest; 3218 return sds.busiest;
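A sketch of the conservative check used in the not-idle branch above, assuming the common default imbalance_pct of 125 (the real value is per sched_domain): the busiest group must exceed the local load by more than 25% before a busy CPU pulls from it.

/* Invented loads; 125 is only the usual default imbalance_pct. */
#include <stdio.h>

int main(void)
{
        unsigned long imbalance_pct = 125;
        unsigned long this_load = 1000, max_load = 1200;

        if (100 * max_load <= imbalance_pct * this_load)
                printf("balanced: busiest within 25%% of local load\n");
        else
                printf("imbalanced: pull from the busiest group\n");
        return 0;       /* 120000 <= 125000 -> balanced */
}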
@@ -2857,7 +3243,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2857 3243
2858 for_each_cpu(i, sched_group_cpus(group)) { 3244 for_each_cpu(i, sched_group_cpus(group)) {
2859 unsigned long power = power_of(i); 3245 unsigned long power = power_of(i);
2860 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 3246 unsigned long capacity = DIV_ROUND_CLOSEST(power,
3247 SCHED_POWER_SCALE);
2861 unsigned long wl; 3248 unsigned long wl;
2862 3249
2863 if (!capacity) 3250 if (!capacity)
@@ -2882,7 +3269,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2882 * the load can be moved away from the cpu that is potentially 3269 * the load can be moved away from the cpu that is potentially
2883 * running at a lower capacity. 3270 * running at a lower capacity.
2884 */ 3271 */
2885 wl = (wl * SCHED_LOAD_SCALE) / power; 3272 wl = (wl * SCHED_POWER_SCALE) / power;
2886 3273
2887 if (wl > max_load) { 3274 if (wl > max_load) {
2888 max_load = wl; 3275 max_load = wl;
@@ -2902,7 +3289,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2902/* Working cpumask for load_balance and load_balance_newidle. */ 3289/* Working cpumask for load_balance and load_balance_newidle. */
2903static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 3290static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2904 3291
2905static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, 3292static int need_active_balance(struct sched_domain *sd, int idle,
2906 int busiest_cpu, int this_cpu) 3293 int busiest_cpu, int this_cpu)
2907{ 3294{
2908 if (idle == CPU_NEWLY_IDLE) { 3295 if (idle == CPU_NEWLY_IDLE) {
@@ -2934,10 +3321,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2934 * move_tasks() will succeed. ld_moved will be true and this 3321 * move_tasks() will succeed. ld_moved will be true and this
2935 * active balance code will not be triggered. 3322 * active balance code will not be triggered.
2936 */ 3323 */
2937 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2938 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2939 return 0;
2940
2941 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 3324 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2942 return 0; 3325 return 0;
2943 } 3326 }
@@ -2955,7 +3338,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2955 struct sched_domain *sd, enum cpu_idle_type idle, 3338 struct sched_domain *sd, enum cpu_idle_type idle,
2956 int *balance) 3339 int *balance)
2957{ 3340{
2958 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3341 int ld_moved, all_pinned = 0, active_balance = 0;
2959 struct sched_group *group; 3342 struct sched_group *group;
2960 unsigned long imbalance; 3343 unsigned long imbalance;
2961 struct rq *busiest; 3344 struct rq *busiest;
@@ -2964,21 +3347,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2964 3347
2965 cpumask_copy(cpus, cpu_active_mask); 3348 cpumask_copy(cpus, cpu_active_mask);
2966 3349
2967 /*
2968 * When power savings policy is enabled for the parent domain, idle
2969 * sibling can pick up load irrespective of busy siblings. In this case,
2970 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2971 * portraying it as CPU_NOT_IDLE.
2972 */
2973 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2974 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2975 sd_idle = 1;
2976
2977 schedstat_inc(sd, lb_count[idle]); 3350 schedstat_inc(sd, lb_count[idle]);
2978 3351
2979redo: 3352redo:
2980 update_shares(sd); 3353 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
2981 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2982 cpus, balance); 3354 cpus, balance);
2983 3355
2984 if (*balance == 0) 3356 if (*balance == 0)
@@ -3007,6 +3379,7 @@ redo:
3007 * still unbalanced. ld_moved simply stays zero, so it is 3379 * still unbalanced. ld_moved simply stays zero, so it is
3008 * correctly treated as an imbalance. 3380 * correctly treated as an imbalance.
3009 */ 3381 */
3382 all_pinned = 1;
3010 local_irq_save(flags); 3383 local_irq_save(flags);
3011 double_rq_lock(this_rq, busiest); 3384 double_rq_lock(this_rq, busiest);
3012 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3385 ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3031,10 +3404,16 @@ redo:
3031 3404
3032 if (!ld_moved) { 3405 if (!ld_moved) {
3033 schedstat_inc(sd, lb_failed[idle]); 3406 schedstat_inc(sd, lb_failed[idle]);
3034 sd->nr_balance_failed++; 3407 /*
3408 * Increment the failure counter only on periodic balance.
 3409 * We do not want newidle balance, which can be very
 3410 * frequent, to pollute the failure counter, causing
3411 * excessive cache_hot migrations and active balances.
3412 */
3413 if (idle != CPU_NEWLY_IDLE)
3414 sd->nr_balance_failed++;
3035 3415
3036 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3416 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3037 this_cpu)) {
3038 raw_spin_lock_irqsave(&busiest->lock, flags); 3417 raw_spin_lock_irqsave(&busiest->lock, flags);
3039 3418
3040 /* don't kick the active_load_balance_cpu_stop, 3419 /* don't kick the active_load_balance_cpu_stop,
@@ -3089,10 +3468,6 @@ redo:
3089 sd->balance_interval *= 2; 3468 sd->balance_interval *= 2;
3090 } 3469 }
3091 3470
3092 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3093 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3094 ld_moved = -1;
3095
3096 goto out; 3471 goto out;
3097 3472
3098out_balanced: 3473out_balanced:
@@ -3106,14 +3481,8 @@ out_one_pinned:
3106 (sd->balance_interval < sd->max_interval)) 3481 (sd->balance_interval < sd->max_interval))
3107 sd->balance_interval *= 2; 3482 sd->balance_interval *= 2;
3108 3483
3109 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3484 ld_moved = 0;
3110 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3111 ld_moved = -1;
3112 else
3113 ld_moved = 0;
3114out: 3485out:
3115 if (ld_moved)
3116 update_shares(sd);
3117 return ld_moved; 3486 return ld_moved;
3118} 3487}
3119 3488
@@ -3137,6 +3506,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3137 */ 3506 */
3138 raw_spin_unlock(&this_rq->lock); 3507 raw_spin_unlock(&this_rq->lock);
3139 3508
3509 update_shares(this_cpu);
3510 rcu_read_lock();
3140 for_each_domain(this_cpu, sd) { 3511 for_each_domain(this_cpu, sd) {
3141 unsigned long interval; 3512 unsigned long interval;
3142 int balance = 1; 3513 int balance = 1;
@@ -3158,6 +3529,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3158 break; 3529 break;
3159 } 3530 }
3160 } 3531 }
3532 rcu_read_unlock();
3161 3533
3162 raw_spin_lock(&this_rq->lock); 3534 raw_spin_lock(&this_rq->lock);
3163 3535
@@ -3206,6 +3578,7 @@ static int active_load_balance_cpu_stop(void *data)
3206 double_lock_balance(busiest_rq, target_rq); 3578 double_lock_balance(busiest_rq, target_rq);
3207 3579
3208 /* Search for an sd spanning us and the target CPU. */ 3580 /* Search for an sd spanning us and the target CPU. */
3581 rcu_read_lock();
3209 for_each_domain(target_cpu, sd) { 3582 for_each_domain(target_cpu, sd) {
3210 if ((sd->flags & SD_LOAD_BALANCE) && 3583 if ((sd->flags & SD_LOAD_BALANCE) &&
3211 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 3584 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3221,6 +3594,7 @@ static int active_load_balance_cpu_stop(void *data)
3221 else 3594 else
3222 schedstat_inc(sd, alb_failed); 3595 schedstat_inc(sd, alb_failed);
3223 } 3596 }
3597 rcu_read_unlock();
3224 double_unlock_balance(busiest_rq, target_rq); 3598 double_unlock_balance(busiest_rq, target_rq);
3225out_unlock: 3599out_unlock:
3226 busiest_rq->active_balance = 0; 3600 busiest_rq->active_balance = 0;
@@ -3347,6 +3721,7 @@ static int find_new_ilb(int cpu)
3347{ 3721{
3348 struct sched_domain *sd; 3722 struct sched_domain *sd;
3349 struct sched_group *ilb_group; 3723 struct sched_group *ilb_group;
3724 int ilb = nr_cpu_ids;
3350 3725
3351 /* 3726 /*
3352 * Have idle load balancer selection from semi-idle packages only 3727 * Have idle load balancer selection from semi-idle packages only
@@ -3362,20 +3737,25 @@ static int find_new_ilb(int cpu)
3362 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3737 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3363 goto out_done; 3738 goto out_done;
3364 3739
3740 rcu_read_lock();
3365 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3741 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3366 ilb_group = sd->groups; 3742 ilb_group = sd->groups;
3367 3743
3368 do { 3744 do {
3369 if (is_semi_idle_group(ilb_group)) 3745 if (is_semi_idle_group(ilb_group)) {
3370 return cpumask_first(nohz.grp_idle_mask); 3746 ilb = cpumask_first(nohz.grp_idle_mask);
3747 goto unlock;
3748 }
3371 3749
3372 ilb_group = ilb_group->next; 3750 ilb_group = ilb_group->next;
3373 3751
3374 } while (ilb_group != sd->groups); 3752 } while (ilb_group != sd->groups);
3375 } 3753 }
3754unlock:
3755 rcu_read_unlock();
3376 3756
3377out_done: 3757out_done:
3378 return nr_cpu_ids; 3758 return ilb;
3379} 3759}
3380#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3760#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3381static inline int find_new_ilb(int call_cpu) 3761static inline int find_new_ilb(int call_cpu)
@@ -3490,6 +3870,17 @@ void select_nohz_load_balancer(int stop_tick)
3490 3870
3491static DEFINE_SPINLOCK(balancing); 3871static DEFINE_SPINLOCK(balancing);
3492 3872
3873static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3874
3875/*
3876 * Scale the max load_balance interval with the number of CPUs in the system.
3877 * This trades load-balance latency on larger machines for less cross talk.
3878 */
3879static void update_max_interval(void)
3880{
3881 max_load_balance_interval = HZ*num_online_cpus()/10;
3882}
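Worked example of the scaling above, with HZ and the CPU count invented; the per-domain interval computed in rebalance_domains() is then clamped to this bound.

/* HZ and the CPU count are invented for the example. */
#include <stdio.h>

#define HZ 1000

int main(void)
{
        unsigned int online_cpus = 8;
        unsigned long max_interval = (unsigned long)HZ * online_cpus / 10;

        printf("max_load_balance_interval = %lu jiffies\n", max_interval);
        return 0;       /* 800 jiffies, i.e. 0.8s at HZ=1000 */
}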
3883
3493/* 3884/*
3494 * It checks each scheduling domain to see if it is due to be balanced, 3885 * It checks each scheduling domain to see if it is due to be balanced,
3495 * and initiates a balancing operation if so. 3886 * and initiates a balancing operation if so.
@@ -3507,6 +3898,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3507 int update_next_balance = 0; 3898 int update_next_balance = 0;
3508 int need_serialize; 3899 int need_serialize;
3509 3900
3901 update_shares(cpu);
3902
3903 rcu_read_lock();
3510 for_each_domain(cpu, sd) { 3904 for_each_domain(cpu, sd) {
3511 if (!(sd->flags & SD_LOAD_BALANCE)) 3905 if (!(sd->flags & SD_LOAD_BALANCE))
3512 continue; 3906 continue;
@@ -3517,10 +3911,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3517 3911
3518 /* scale ms to jiffies */ 3912 /* scale ms to jiffies */
3519 interval = msecs_to_jiffies(interval); 3913 interval = msecs_to_jiffies(interval);
3520 if (unlikely(!interval)) 3914 interval = clamp(interval, 1UL, max_load_balance_interval);
3521 interval = 1;
3522 if (interval > HZ*NR_CPUS/10)
3523 interval = HZ*NR_CPUS/10;
3524 3915
3525 need_serialize = sd->flags & SD_SERIALIZE; 3916 need_serialize = sd->flags & SD_SERIALIZE;
3526 3917
@@ -3533,8 +3924,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3533 if (load_balance(cpu, rq, sd, idle, &balance)) { 3924 if (load_balance(cpu, rq, sd, idle, &balance)) {
3534 /* 3925 /*
3535 * We've pulled tasks over so either we're no 3926 * We've pulled tasks over so either we're no
3536 * longer idle, or one of our SMT siblings is 3927 * longer idle.
3537 * not idle.
3538 */ 3928 */
3539 idle = CPU_NOT_IDLE; 3929 idle = CPU_NOT_IDLE;
3540 } 3930 }
@@ -3556,6 +3946,7 @@ out:
3556 if (!balance) 3946 if (!balance)
3557 break; 3947 break;
3558 } 3948 }
3949 rcu_read_unlock();
3559 3950
3560 /* 3951 /*
3561 * next_balance will be updated only when there is a need. 3952 * next_balance will be updated only when there is a need.
@@ -3751,8 +4142,11 @@ static void task_fork_fair(struct task_struct *p)
3751 4142
3752 update_rq_clock(rq); 4143 update_rq_clock(rq);
3753 4144
3754 if (unlikely(task_cpu(p) != this_cpu)) 4145 if (unlikely(task_cpu(p) != this_cpu)) {
4146 rcu_read_lock();
3755 __set_task_cpu(p, this_cpu); 4147 __set_task_cpu(p, this_cpu);
4148 rcu_read_unlock();
4149 }
3756 4150
3757 update_curr(cfs_rq); 4151 update_curr(cfs_rq);
3758 4152
@@ -3778,33 +4172,62 @@ static void task_fork_fair(struct task_struct *p)
3778 * Priority of the task has changed. Check to see if we preempt 4172 * Priority of the task has changed. Check to see if we preempt
3779 * the current task. 4173 * the current task.
3780 */ 4174 */
3781static void prio_changed_fair(struct rq *rq, struct task_struct *p, 4175static void
3782 int oldprio, int running) 4176prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
3783{ 4177{
4178 if (!p->se.on_rq)
4179 return;
4180
3784 /* 4181 /*
3785 * Reschedule if we are currently running on this runqueue and 4182 * Reschedule if we are currently running on this runqueue and
3786 * our priority decreased, or if we are not currently running on 4183 * our priority decreased, or if we are not currently running on
3787 * this runqueue and our priority is higher than the current's 4184 * this runqueue and our priority is higher than the current's
3788 */ 4185 */
3789 if (running) { 4186 if (rq->curr == p) {
3790 if (p->prio > oldprio) 4187 if (p->prio > oldprio)
3791 resched_task(rq->curr); 4188 resched_task(rq->curr);
3792 } else 4189 } else
3793 check_preempt_curr(rq, p, 0); 4190 check_preempt_curr(rq, p, 0);
3794} 4191}
3795 4192
4193static void switched_from_fair(struct rq *rq, struct task_struct *p)
4194{
4195 struct sched_entity *se = &p->se;
4196 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4197
4198 /*
4199 * Ensure the task's vruntime is normalized, so that when its
4200 * switched back to the fair class the enqueue_entity(.flags=0) will
4201 * do the right thing.
4202 *
4203 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4204 * have normalized the vruntime, if it was !on_rq, then only when
4205 * the task is sleeping will it still have non-normalized vruntime.
4206 */
4207 if (!se->on_rq && p->state != TASK_RUNNING) {
4208 /*
4209 * Fix up our vruntime so that the current sleep doesn't
4210 * cause 'unlimited' sleep bonus.
4211 */
4212 place_entity(cfs_rq, se, 0);
4213 se->vruntime -= cfs_rq->min_vruntime;
4214 }
4215}
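A toy illustration of the vruntime re-basing above, with invented numbers: a task leaving the fair class while asleep keeps only its lag relative to min_vruntime, so it can be re-based onto whatever queue it is eventually enqueued on.

/* Toy numbers; not kernel types. */
#include <stdio.h>

int main(void)
{
        unsigned long long vruntime = 1005000;  /* absolute on the old cfs_rq */
        unsigned long long old_min  = 1000000;
        unsigned long long new_min  = 2000000;

        vruntime -= old_min;    /* as in switched_from_fair(): keep the lag */
        vruntime += new_min;    /* re-base when enqueued on the new queue   */
        printf("%llu\n", vruntime);             /* 2005000: same 5000 lag   */
        return 0;
}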
4216
3796/* 4217/*
3797 * We switched to the sched_fair class. 4218 * We switched to the sched_fair class.
3798 */ 4219 */
3799static void switched_to_fair(struct rq *rq, struct task_struct *p, 4220static void switched_to_fair(struct rq *rq, struct task_struct *p)
3800 int running)
3801{ 4221{
4222 if (!p->se.on_rq)
4223 return;
4224
3802 /* 4225 /*
3803 * We were most likely switched from sched_rt, so 4226 * We were most likely switched from sched_rt, so
3804 * kick off the schedule if running, otherwise just see 4227 * kick off the schedule if running, otherwise just see
3805 * if we can still preempt the current task. 4228 * if we can still preempt the current task.
3806 */ 4229 */
3807 if (running) 4230 if (rq->curr == p)
3808 resched_task(rq->curr); 4231 resched_task(rq->curr);
3809 else 4232 else
3810 check_preempt_curr(rq, p, 0); 4233 check_preempt_curr(rq, p, 0);
@@ -3824,13 +4247,26 @@ static void set_curr_task_fair(struct rq *rq)
3824} 4247}
3825 4248
3826#ifdef CONFIG_FAIR_GROUP_SCHED 4249#ifdef CONFIG_FAIR_GROUP_SCHED
3827static void moved_group_fair(struct task_struct *p, int on_rq) 4250static void task_move_group_fair(struct task_struct *p, int on_rq)
3828{ 4251{
3829 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4252 /*
3830 4253 * If the task was not on the rq at the time of this cgroup movement
3831 update_curr(cfs_rq); 4254 * it must have been asleep, sleeping tasks keep their ->vruntime
4255 * absolute on their old rq until wakeup (needed for the fair sleeper
4256 * bonus in place_entity()).
4257 *
4258 * If it was on the rq, we've just 'preempted' it, which does convert
4259 * ->vruntime to a relative base.
4260 *
4261 * Make sure both cases convert their relative position when migrating
4262 * to another cgroup's rq. This does somewhat interfere with the
4263 * fair sleeper stuff for the first placement, but who cares.
4264 */
4265 if (!on_rq)
4266 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
4267 set_task_rq(p, task_cpu(p));
3832 if (!on_rq) 4268 if (!on_rq)
3833 place_entity(cfs_rq, &p->se, 1); 4269 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
3834} 4270}
3835#endif 4271#endif
3836 4272
@@ -3857,6 +4293,7 @@ static const struct sched_class fair_sched_class = {
3857 .enqueue_task = enqueue_task_fair, 4293 .enqueue_task = enqueue_task_fair,
3858 .dequeue_task = dequeue_task_fair, 4294 .dequeue_task = dequeue_task_fair,
3859 .yield_task = yield_task_fair, 4295 .yield_task = yield_task_fair,
4296 .yield_to_task = yield_to_task_fair,
3860 4297
3861 .check_preempt_curr = check_preempt_wakeup, 4298 .check_preempt_curr = check_preempt_wakeup,
3862 4299
@@ -3877,12 +4314,13 @@ static const struct sched_class fair_sched_class = {
3877 .task_fork = task_fork_fair, 4314 .task_fork = task_fork_fair,
3878 4315
3879 .prio_changed = prio_changed_fair, 4316 .prio_changed = prio_changed_fair,
4317 .switched_from = switched_from_fair,
3880 .switched_to = switched_to_fair, 4318 .switched_to = switched_to_fair,
3881 4319
3882 .get_rr_interval = get_rr_interval_fair, 4320 .get_rr_interval = get_rr_interval_fair,
3883 4321
3884#ifdef CONFIG_FAIR_GROUP_SCHED 4322#ifdef CONFIG_FAIR_GROUP_SCHED
3885 .moved_group = moved_group_fair, 4323 .task_move_group = task_move_group_fair,
3886#endif 4324#endif
3887}; 4325};
3888 4326
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..1e7066d76c26 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
@@ -61,3 +59,16 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
61 * release the lock. Decreases scheduling overhead. 59 * release the lock. Decreases scheduling overhead.
62 */ 60 */
63SCHED_FEAT(OWNER_SPIN, 1) 61SCHED_FEAT(OWNER_SPIN, 1)
62
63/*
64 * Decrement CPU power based on irq activity
65 */
66SCHED_FEAT(NONIRQ_POWER, 1)
67
68/*
69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */
72SCHED_FEAT(TTWU_QUEUE, 1)
73
74SCHED_FEAT(FORCE_SD_OVERLAP, 0)
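The SCHED_FEAT() entries above become runtime-testable bits consulted through sched_feat(). A stand-alone sketch that mirrors the mechanism only; the mask variable and bit names below are simplified stand-ins, not the kernel's implementation.

/* Stand-alone model of the feature-bit mechanism; names simplified. */
#include <stdio.h>

enum { TTWU_QUEUE_BIT, NONIRQ_POWER_BIT };
#define FEAT(x)        (1u << (x##_BIT))
#define sched_feat(x)  (sched_features & FEAT(x))

static unsigned int sched_features = FEAT(TTWU_QUEUE) | FEAT(NONIRQ_POWER);

int main(void)
{
        if (sched_feat(TTWU_QUEUE))
                printf("remote wakeups queued via the scheduler IPI\n");
        return 0;
}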
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
11{ 11{
 12 return task_cpu(p); /* IDLE tasks are never migrated */ 12 return task_cpu(p); /* IDLE tasks are never migrated */
13} 13}
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
52{ 52{
53} 53}
54 54
55static void switched_to_idle(struct rq *rq, struct task_struct *p, 55static void switched_to_idle(struct rq *rq, struct task_struct *p)
56 int running)
57{ 56{
58 /* Can this actually happen?? */ 57 BUG();
59 if (running)
60 resched_task(rq->curr);
61 else
62 check_preempt_curr(rq, p, 0);
63} 58}
64 59
65static void prio_changed_idle(struct rq *rq, struct task_struct *p, 60static void
66 int oldprio, int running) 61prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
67{ 62{
68 /* This can happen for hot plug CPUS */ 63 BUG();
69
70 /*
71 * Reschedule if we are currently running on this runqueue and
72 * our priority decreased, or if we are not currently running on
73 * this runqueue and our priority is higher than the current's
74 */
75 if (running) {
76 if (p->prio > oldprio)
77 resched_task(rq->curr);
78 } else
79 check_preempt_curr(rq, p, 0);
80} 64}
81 65
82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 66static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
@@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = {
110 94
111 .prio_changed = prio_changed_idle, 95 .prio_changed = prio_changed_idle,
112 .switched_to = switched_to_idle, 96 .switched_to = switched_to_idle,
113
114 /* no .task_new for idle tasks */
115}; 97};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e40e7fe43170..58cf5d18dfdc 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,25 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186typedef struct task_group *rt_rq_iter_t;
187
188#define for_each_rt_rq(rt_rq, iter, rq) \
189 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
190 (&iter->list != &task_groups) && \
191 (rt_rq = iter->rt_rq[cpu_of(rq)]); \
192 iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
193
194static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
195{
196 list_add_rcu(&rt_rq->leaf_rt_rq_list,
197 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
198}
199
200static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
201{
202 list_del_rcu(&rt_rq->leaf_rt_rq_list);
203}
204
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 205#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 206 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 207
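The for_each_rt_rq() iterator introduced above walks every task group's rt_rq for a given CPU, not just the ones currently linked on the leaf list, which is why __disable_runtime()/__enable_runtime() switch to it further down. A toy model of the iterator shape; the struct layout and field names are simplified stand-ins.

/* Toy model; types and field names are stand-ins, not the kernel's. */
#include <stdio.h>

struct rt_rq { int runtime; };
struct task_group { struct rt_rq *per_cpu_rt[2]; struct task_group *next; };

#define for_each_rt_rq(rt_rq, iter, cpu, head)                          \
        for (iter = (head); iter && ((rt_rq) = iter->per_cpu_rt[cpu]);  \
             iter = iter->next)

int main(void)
{
        struct rt_rq a = { 950 }, b = { 400 };
        struct task_group g2 = { { &b, &b }, NULL };
        struct task_group g1 = { { &a, &a }, &g2 };
        struct task_group *iter;
        struct rt_rq *rt_rq;

        /* visits every group's rt_rq for CPU 0, listed or not */
        for_each_rt_rq(rt_rq, iter, 0, &g1)
                printf("runtime = %d\n", rt_rq->runtime);
        return 0;
}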
@@ -199,11 +218,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 218
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 219static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 220{
202 int this_cpu = smp_processor_id();
203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 221 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
204 struct sched_rt_entity *rt_se; 222 struct sched_rt_entity *rt_se;
205 223
206 rt_se = rt_rq->tg->rt_se[this_cpu]; 224 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
225
226 rt_se = rt_rq->tg->rt_se[cpu];
207 227
208 if (rt_rq->rt_nr_running) { 228 if (rt_rq->rt_nr_running) {
209 if (rt_se && !on_rt_rq(rt_se)) 229 if (rt_se && !on_rt_rq(rt_se))
@@ -215,10 +235,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
215 235
216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 236static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
217{ 237{
218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se; 238 struct sched_rt_entity *rt_se;
239 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
220 240
221 rt_se = rt_rq->tg->rt_se[this_cpu]; 241 rt_se = rt_rq->tg->rt_se[cpu];
222 242
223 if (rt_se && on_rt_rq(rt_se)) 243 if (rt_se && on_rt_rq(rt_se))
224 dequeue_rt_entity(rt_se); 244 dequeue_rt_entity(rt_se);
@@ -276,6 +296,19 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 296 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 297}
278 298
299typedef struct rt_rq *rt_rq_iter_t;
300
301#define for_each_rt_rq(rt_rq, iter, rq) \
302 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
303
304static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
305{
306}
307
308static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
309{
310}
311
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 312#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 313 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 314
@@ -382,12 +415,13 @@ next:
382static void __disable_runtime(struct rq *rq) 415static void __disable_runtime(struct rq *rq)
383{ 416{
384 struct root_domain *rd = rq->rd; 417 struct root_domain *rd = rq->rd;
418 rt_rq_iter_t iter;
385 struct rt_rq *rt_rq; 419 struct rt_rq *rt_rq;
386 420
387 if (unlikely(!scheduler_running)) 421 if (unlikely(!scheduler_running))
388 return; 422 return;
389 423
390 for_each_leaf_rt_rq(rt_rq, rq) { 424 for_each_rt_rq(rt_rq, iter, rq) {
391 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 425 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
392 s64 want; 426 s64 want;
393 int i; 427 int i;
@@ -467,6 +501,7 @@ static void disable_runtime(struct rq *rq)
467 501
468static void __enable_runtime(struct rq *rq) 502static void __enable_runtime(struct rq *rq)
469{ 503{
504 rt_rq_iter_t iter;
470 struct rt_rq *rt_rq; 505 struct rt_rq *rt_rq;
471 506
472 if (unlikely(!scheduler_running)) 507 if (unlikely(!scheduler_running))
@@ -475,7 +510,7 @@ static void __enable_runtime(struct rq *rq)
475 /* 510 /*
476 * Reset each runqueue's bandwidth settings 511 * Reset each runqueue's bandwidth settings
477 */ 512 */
478 for_each_leaf_rt_rq(rt_rq, rq) { 513 for_each_rt_rq(rt_rq, iter, rq) {
479 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 514 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
480 515
481 raw_spin_lock(&rt_b->rt_runtime_lock); 516 raw_spin_lock(&rt_b->rt_runtime_lock);
@@ -542,12 +577,22 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
542 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 577 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
543 rt_rq->rt_throttled = 0; 578 rt_rq->rt_throttled = 0;
544 enqueue = 1; 579 enqueue = 1;
580
581 /*
582 * Force a clock update if the CPU was idle,
583 * lest wakeup -> unthrottle time accumulate.
584 */
585 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
586 rq->skip_clock_update = -1;
545 } 587 }
546 if (rt_rq->rt_time || rt_rq->rt_nr_running) 588 if (rt_rq->rt_time || rt_rq->rt_nr_running)
547 idle = 0; 589 idle = 0;
548 raw_spin_unlock(&rt_rq->rt_runtime_lock); 590 raw_spin_unlock(&rt_rq->rt_runtime_lock);
549 } else if (rt_rq->rt_nr_running) 591 } else if (rt_rq->rt_nr_running) {
550 idle = 0; 592 idle = 0;
593 if (!rt_rq_throttled(rt_rq))
594 enqueue = 1;
595 }
551 596
552 if (enqueue) 597 if (enqueue)
553 sched_rt_rq_enqueue(rt_rq); 598 sched_rt_rq_enqueue(rt_rq);
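The hunk above sits in the RT bandwidth replenishment path: once per period the timer refills rt_runtime, unthrottles queues whose debt is paid off, and (new here) forces a clock update when unthrottling on an idle CPU. The knobs this path enforces are visible from user space as sysctls; a small reader, assuming the usual /proc/sys/kernel layout (defaults are typically a 1,000,000 us period with 950,000 us of runtime, and -1 disables throttling):

#include <stdio.h>

/* Print the RT bandwidth knobs enforced by the period timer above. */
static long read_long(const char *path)
{
	long val = -1;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("sched_rt_period_us  = %ld\n",
	       read_long("/proc/sys/kernel/sched_rt_period_us"));
	printf("sched_rt_runtime_us = %ld\n",
	       read_long("/proc/sys/kernel/sched_rt_runtime_us"));
	return 0;
}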
@@ -606,10 +651,10 @@ static void update_curr_rt(struct rq *rq)
606 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 651 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
607 u64 delta_exec; 652 u64 delta_exec;
608 653
609 if (!task_has_rt_policy(curr)) 654 if (curr->sched_class != &rt_sched_class)
610 return; 655 return;
611 656
612 delta_exec = rq->clock - curr->se.exec_start; 657 delta_exec = rq->clock_task - curr->se.exec_start;
613 if (unlikely((s64)delta_exec < 0)) 658 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 659 delta_exec = 0;
615 660
@@ -618,7 +663,7 @@ static void update_curr_rt(struct rq *rq)
618 curr->se.sum_exec_runtime += delta_exec; 663 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 664 account_group_exec_runtime(curr, delta_exec);
620 665
621 curr->se.exec_start = rq->clock; 666 curr->se.exec_start = rq->clock_task;
622 cpuacct_charge(curr, delta_exec); 667 cpuacct_charge(curr, delta_exec);
623 668
624 sched_rt_avg_update(rq, delta_exec); 669 sched_rt_avg_update(rq, delta_exec);
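update_curr_rt() now charges RT tasks against rq->clock_task rather than rq->clock, so time spent in interrupts (and, under virtualization, steal time) is no longer billed as task execution. A loose user-space analogue of the same wall-time versus task-time distinction, using the standard per-thread CPU clock:

#include <stdio.h>
#include <time.h>

/* Compare wall-clock time to CPU time actually consumed by this thread,
 * loosely analogous to rq->clock vs. rq->clock_task. */
static double ts_diff(struct timespec a, struct timespec b)
{
	return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;
}

int main(void)
{
	struct timespec w0, w1, c0, c1;
	volatile unsigned long x = 0;

	clock_gettime(CLOCK_MONOTONIC, &w0);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &c0);
	for (unsigned long i = 0; i < 100000000UL; i++)
		x += i;
	clock_gettime(CLOCK_MONOTONIC, &w1);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &c1);

	printf("wall: %.3fs  cpu: %.3fs\n", ts_diff(w0, w1), ts_diff(c0, c1));
	return 0;
}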
@@ -825,6 +870,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 870 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 871 return;
827 872
873 if (!rt_rq->rt_nr_running)
874 list_add_leaf_rt_rq(rt_rq);
875
828 if (head) 876 if (head)
829 list_add(&rt_se->run_list, queue); 877 list_add(&rt_se->run_list, queue);
830 else 878 else
@@ -844,6 +892,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 892 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 893
846 dec_rt_tasks(rt_se, rt_rq); 894 dec_rt_tasks(rt_se, rt_rq);
895 if (!rt_rq->rt_nr_running)
896 list_del_leaf_rt_rq(rt_rq);
847} 897}
848 898
849/* 899/*
@@ -949,40 +999,55 @@ static void yield_task_rt(struct rq *rq)
949static int find_lowest_rq(struct task_struct *task); 999static int find_lowest_rq(struct task_struct *task);
950 1000
951static int 1001static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 1002select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
953{ 1003{
1004 struct task_struct *curr;
1005 struct rq *rq;
1006 int cpu;
1007
954 if (sd_flag != SD_BALANCE_WAKE) 1008 if (sd_flag != SD_BALANCE_WAKE)
955 return smp_processor_id(); 1009 return smp_processor_id();
956 1010
1011 cpu = task_cpu(p);
1012 rq = cpu_rq(cpu);
1013
1014 rcu_read_lock();
1015 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1016
957 /* 1017 /*
958 * If the current task is an RT task, then 1018 * If the current task on @p's runqueue is an RT task, then
959 * try to see if we can wake this RT task up on another 1019 * try to see if we can wake this RT task up on another
960 * runqueue. Otherwise simply start this RT task 1020 * runqueue. Otherwise simply start this RT task
961 * on its current runqueue. 1021 * on its current runqueue.
962 * 1022 *
963 * We want to avoid overloading runqueues. Even if 1023 * We want to avoid overloading runqueues. If the woken
964 * the RT task is of higher priority than the current RT task. 1024 * task is a higher priority, then it will stay on this CPU
965 * RT tasks behave differently than other tasks. If 1025 * and the lower prio task should be moved to another CPU.
966 * one gets preempted, we try to push it off to another queue. 1026 * Even though this will probably make the lower prio task
967 * So trying to keep a preempting RT task on the same 1027 * lose its cache, we do not want to bounce a higher task
968 * cache hot CPU will force the running RT task to 1028 * around just because it gave up its CPU, perhaps for a
969 * a cold CPU. So we waste all the cache for the lower 1029 * lock?
970 * RT task in hopes of saving some of a RT task 1030 *
971 * that is just being woken and probably will have 1031 * For equal prio tasks, we just let the scheduler sort it out.
972 * cold cache anyway. 1032 *
1033 * Otherwise, just let it ride on the affined RQ and the
1034 * post-schedule router will push the preempted task away
1035 *
1036 * This test is optimistic, if we get it wrong the load-balancer
1037 * will have to sort it out.
973 */ 1038 */
974 if (unlikely(rt_task(rq->curr)) && 1039 if (curr && unlikely(rt_task(curr)) &&
1040 (curr->rt.nr_cpus_allowed < 2 ||
1041 curr->prio < p->prio) &&
975 (p->rt.nr_cpus_allowed > 1)) { 1042 (p->rt.nr_cpus_allowed > 1)) {
976 int cpu = find_lowest_rq(p); 1043 int target = find_lowest_rq(p);
977 1044
978 return (cpu == -1) ? task_cpu(p) : cpu; 1045 if (target != -1)
1046 cpu = target;
979 } 1047 }
1048 rcu_read_unlock();
980 1049
981 /* 1050 return cpu;
982 * Otherwise, just let it ride on the affined RQ and the
983 * post-schedule router will push the preempted task away
984 */
985 return task_cpu(p);
986} 1051}
987 1052
988static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1053static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
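The rewritten select_task_rq_rt() above only hunts for another CPU when the wakee can actually migrate (nr_cpus_allowed > 1) and the task currently running on its runqueue is an RT task that either cannot move itself (nr_cpus_allowed < 2) or outranks the wakee (for RT, a lower prio value means higher priority); the result is treated as a hint that the load balancer may later correct. A stand-alone model of just that predicate, with field names chosen to mirror the kernel's (this is not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the wake-up placement test; not the kernel API. */
struct toy_task {
	bool rt;               /* scheduled by the RT class?       */
	int  prio;             /* lower value means higher priority */
	int  nr_cpus_allowed;  /* size of the task's affinity mask  */
};

static bool should_search_other_cpu(const struct toy_task *curr,
				    const struct toy_task *wakee)
{
	return curr->rt &&
	       (curr->nr_cpus_allowed < 2 || curr->prio < wakee->prio) &&
	       wakee->nr_cpus_allowed > 1;
}

int main(void)
{
	struct toy_task curr  = { .rt = true, .prio = 10, .nr_cpus_allowed = 4 };
	struct toy_task wakee = { .rt = true, .prio = 20, .nr_cpus_allowed = 4 };

	printf("look for another CPU: %s\n",
	       should_search_other_cpu(&curr, &wakee) ? "yes" : "no");
	return 0;
}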
@@ -1031,7 +1096,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
1031 * to move current somewhere else, making room for our non-migratable 1096 * to move current somewhere else, making room for our non-migratable
1032 * task. 1097 * task.
1033 */ 1098 */
1034 if (p->prio == rq->curr->prio && !need_resched()) 1099 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1035 check_preempt_equal_prio(rq, p); 1100 check_preempt_equal_prio(rq, p);
1036#endif 1101#endif
1037} 1102}
@@ -1074,7 +1139,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1074 } while (rt_rq); 1139 } while (rt_rq);
1075 1140
1076 p = rt_task_of(rt_se); 1141 p = rt_task_of(rt_se);
1077 p->se.exec_start = rq->clock; 1142 p->se.exec_start = rq->clock_task;
1078 1143
1079 return p; 1144 return p;
1080} 1145}
@@ -1107,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1107 * The previous task needs to be made eligible for pushing 1172 * The previous task needs to be made eligible for pushing
1108 * if it is still active 1173 * if it is still active
1109 */ 1174 */
1110 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1175 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1111 enqueue_pushable_task(rq, p); 1176 enqueue_pushable_task(rq, p);
1112} 1177}
1113 1178
@@ -1139,7 +1204,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1139 for_each_leaf_rt_rq(rt_rq, rq) { 1204 for_each_leaf_rt_rq(rt_rq, rq) {
1140 array = &rt_rq->active; 1205 array = &rt_rq->active;
1141 idx = sched_find_first_bit(array->bitmap); 1206 idx = sched_find_first_bit(array->bitmap);
1142 next_idx: 1207next_idx:
1143 if (idx >= MAX_RT_PRIO) 1208 if (idx >= MAX_RT_PRIO)
1144 continue; 1209 continue;
1145 if (next && next->prio < idx) 1210 if (next && next->prio < idx)
@@ -1174,6 +1239,10 @@ static int find_lowest_rq(struct task_struct *task)
1174 int this_cpu = smp_processor_id(); 1239 int this_cpu = smp_processor_id();
1175 int cpu = task_cpu(task); 1240 int cpu = task_cpu(task);
1176 1241
1242 /* Make sure the mask is initialized first */
1243 if (unlikely(!lowest_mask))
1244 return -1;
1245
1177 if (task->rt.nr_cpus_allowed == 1) 1246 if (task->rt.nr_cpus_allowed == 1)
1178 return -1; /* No other targets possible */ 1247 return -1; /* No other targets possible */
1179 1248
@@ -1198,6 +1267,7 @@ static int find_lowest_rq(struct task_struct *task)
1198 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1267 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1199 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1268 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1200 1269
1270 rcu_read_lock();
1201 for_each_domain(cpu, sd) { 1271 for_each_domain(cpu, sd) {
1202 if (sd->flags & SD_WAKE_AFFINE) { 1272 if (sd->flags & SD_WAKE_AFFINE) {
1203 int best_cpu; 1273 int best_cpu;
@@ -1207,15 +1277,20 @@ static int find_lowest_rq(struct task_struct *task)
1207 * remote processor. 1277 * remote processor.
1208 */ 1278 */
1209 if (this_cpu != -1 && 1279 if (this_cpu != -1 &&
1210 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) 1280 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1281 rcu_read_unlock();
1211 return this_cpu; 1282 return this_cpu;
1283 }
1212 1284
1213 best_cpu = cpumask_first_and(lowest_mask, 1285 best_cpu = cpumask_first_and(lowest_mask,
1214 sched_domain_span(sd)); 1286 sched_domain_span(sd));
1215 if (best_cpu < nr_cpu_ids) 1287 if (best_cpu < nr_cpu_ids) {
1288 rcu_read_unlock();
1216 return best_cpu; 1289 return best_cpu;
1290 }
1217 } 1291 }
1218 } 1292 }
1293 rcu_read_unlock();
1219 1294
1220 /* 1295 /*
1221 * And finally, if there were no matches within the domains 1296 * And finally, if there were no matches within the domains
@@ -1258,7 +1333,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1258 !cpumask_test_cpu(lowest_rq->cpu, 1333 !cpumask_test_cpu(lowest_rq->cpu,
1259 &task->cpus_allowed) || 1334 &task->cpus_allowed) ||
1260 task_running(rq, task) || 1335 task_running(rq, task) ||
1261 !task->se.on_rq)) { 1336 !task->on_rq)) {
1262 1337
1263 raw_spin_unlock(&lowest_rq->lock); 1338 raw_spin_unlock(&lowest_rq->lock);
1264 lowest_rq = NULL; 1339 lowest_rq = NULL;
@@ -1292,7 +1367,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1292 BUG_ON(task_current(rq, p)); 1367 BUG_ON(task_current(rq, p));
1293 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1368 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1294 1369
1295 BUG_ON(!p->se.on_rq); 1370 BUG_ON(!p->on_rq);
1296 BUG_ON(!rt_task(p)); 1371 BUG_ON(!rt_task(p));
1297 1372
1298 return p; 1373 return p;
@@ -1315,7 +1390,7 @@ static int push_rt_task(struct rq *rq)
1315 if (!next_task) 1390 if (!next_task)
1316 return 0; 1391 return 0;
1317 1392
1318 retry: 1393retry:
1319 if (unlikely(next_task == rq->curr)) { 1394 if (unlikely(next_task == rq->curr)) {
1320 WARN_ON(1); 1395 WARN_ON(1);
1321 return 0; 1396 return 0;
@@ -1349,7 +1424,7 @@ static int push_rt_task(struct rq *rq)
1349 task = pick_next_pushable_task(rq); 1424 task = pick_next_pushable_task(rq);
1350 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1425 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1351 /* 1426 /*
1352 * If we get here, the task hasnt moved at all, but 1427 * If we get here, the task hasn't moved at all, but
1353 * it has failed to push. We will not try again, 1428 * it has failed to push. We will not try again,
1354 * since the other cpus will pull from us when they 1429 * since the other cpus will pull from us when they
1355 * are ready. 1430 * are ready.
@@ -1438,7 +1513,7 @@ static int pull_rt_task(struct rq *this_rq)
1438 */ 1513 */
1439 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1514 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1440 WARN_ON(p == src_rq->curr); 1515 WARN_ON(p == src_rq->curr);
1441 WARN_ON(!p->se.on_rq); 1516 WARN_ON(!p->on_rq);
1442 1517
1443 /* 1518 /*
1444 * There's a chance that p is higher in priority 1519 * There's a chance that p is higher in priority
@@ -1459,11 +1534,11 @@ static int pull_rt_task(struct rq *this_rq)
1459 /* 1534 /*
1460 * We continue with the search, just in 1535 * We continue with the search, just in
1461 * case there's an even higher prio task 1536 * case there's an even higher prio task
1462 * in another runqueue. (low likelyhood 1537 * in another runqueue. (low likelihood
1463 * but possible) 1538 * but possible)
1464 */ 1539 */
1465 } 1540 }
1466 skip: 1541skip:
1467 double_unlock_balance(this_rq, src_rq); 1542 double_unlock_balance(this_rq, src_rq);
1468 } 1543 }
1469 1544
@@ -1491,7 +1566,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1491 if (!task_running(rq, p) && 1566 if (!task_running(rq, p) &&
1492 !test_tsk_need_resched(rq->curr) && 1567 !test_tsk_need_resched(rq->curr) &&
1493 has_pushable_tasks(rq) && 1568 has_pushable_tasks(rq) &&
1494 p->rt.nr_cpus_allowed > 1) 1569 p->rt.nr_cpus_allowed > 1 &&
1570 rt_task(rq->curr) &&
1571 (rq->curr->rt.nr_cpus_allowed < 2 ||
1572 rq->curr->prio < p->prio))
1495 push_rt_tasks(rq); 1573 push_rt_tasks(rq);
1496} 1574}
1497 1575
@@ -1506,7 +1584,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1506 * Update the migration status of the RQ if we have an RT task 1584 * Update the migration status of the RQ if we have an RT task
1507 * which is running AND changing its weight value. 1585 * which is running AND changing its weight value.
1508 */ 1586 */
1509 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1587 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1510 struct rq *rq = task_rq(p); 1588 struct rq *rq = task_rq(p);
1511 1589
1512 if (!task_current(rq, p)) { 1590 if (!task_current(rq, p)) {
@@ -1567,8 +1645,7 @@ static void rq_offline_rt(struct rq *rq)
1567 * When switch from the rt queue, we bring ourselves to a position 1645 * When switch from the rt queue, we bring ourselves to a position
1568 * that we might want to pull RT tasks from other runqueues. 1646 * that we might want to pull RT tasks from other runqueues.
1569 */ 1647 */
1570static void switched_from_rt(struct rq *rq, struct task_struct *p, 1648static void switched_from_rt(struct rq *rq, struct task_struct *p)
1571 int running)
1572{ 1649{
1573 /* 1650 /*
1574 * If there are other RT tasks then we will reschedule 1651 * If there are other RT tasks then we will reschedule
@@ -1577,7 +1654,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1577 * we may need to handle the pulling of RT tasks 1654 * we may need to handle the pulling of RT tasks
1578 * now. 1655 * now.
1579 */ 1656 */
1580 if (!rq->rt.rt_nr_running) 1657 if (p->on_rq && !rq->rt.rt_nr_running)
1581 pull_rt_task(rq); 1658 pull_rt_task(rq);
1582} 1659}
1583 1660
@@ -1596,8 +1673,7 @@ static inline void init_sched_rt_class(void)
1596 * with RT tasks. In this case we try to push them off to 1673 * with RT tasks. In this case we try to push them off to
1597 * other runqueues. 1674 * other runqueues.
1598 */ 1675 */
1599static void switched_to_rt(struct rq *rq, struct task_struct *p, 1676static void switched_to_rt(struct rq *rq, struct task_struct *p)
1600 int running)
1601{ 1677{
1602 int check_resched = 1; 1678 int check_resched = 1;
1603 1679
@@ -1608,7 +1684,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1608 * If that current running task is also an RT task 1684 * If that current running task is also an RT task
1609 * then see if we can move to another run queue. 1685 * then see if we can move to another run queue.
1610 */ 1686 */
1611 if (!running) { 1687 if (p->on_rq && rq->curr != p) {
1612#ifdef CONFIG_SMP 1688#ifdef CONFIG_SMP
1613 if (rq->rt.overloaded && push_rt_task(rq) && 1689 if (rq->rt.overloaded && push_rt_task(rq) &&
1614 /* Don't resched if we changed runqueues */ 1690 /* Don't resched if we changed runqueues */
@@ -1624,10 +1700,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1624 * Priority of the task has changed. This may cause 1700 * Priority of the task has changed. This may cause
1625 * us to initiate a push or pull. 1701 * us to initiate a push or pull.
1626 */ 1702 */
1627static void prio_changed_rt(struct rq *rq, struct task_struct *p, 1703static void
1628 int oldprio, int running) 1704prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1629{ 1705{
1630 if (running) { 1706 if (!p->on_rq)
1707 return;
1708
1709 if (rq->curr == p) {
1631#ifdef CONFIG_SMP 1710#ifdef CONFIG_SMP
1632 /* 1711 /*
1633 * If our priority decreases while running, we 1712 * If our priority decreases while running, we
@@ -1709,7 +1788,7 @@ static void set_curr_task_rt(struct rq *rq)
1709{ 1788{
1710 struct task_struct *p = rq->curr; 1789 struct task_struct *p = rq->curr;
1711 1790
1712 p->se.exec_start = rq->clock; 1791 p->se.exec_start = rq->clock_task;
1713 1792
1714 /* The running task is never eligible for pushing */ 1793 /* The running task is never eligible for pushing */
1715 dequeue_pushable_task(rq, p); 1794 dequeue_pushable_task(rq, p);
@@ -1763,10 +1842,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1763 1842
1764static void print_rt_stats(struct seq_file *m, int cpu) 1843static void print_rt_stats(struct seq_file *m, int cpu)
1765{ 1844{
1845 rt_rq_iter_t iter;
1766 struct rt_rq *rt_rq; 1846 struct rt_rq *rt_rq;
1767 1847
1768 rcu_read_lock(); 1848 rcu_read_lock();
1769 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1849 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1770 print_rt_rq(m, cpu, rt_rq); 1850 print_rt_rq(m, cpu, rt_rq);
1771 rcu_read_unlock(); 1851 rcu_read_unlock();
1772} 1852}
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 25c2f962f6fc..331e01bcd026 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
37 37
38#ifdef CONFIG_SMP 38#ifdef CONFIG_SMP
39 /* domain-specific stats */ 39 /* domain-specific stats */
40 preempt_disable(); 40 rcu_read_lock();
41 for_each_domain(cpu, sd) { 41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype; 42 enum cpu_idle_type itype;
43 43
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
64 sd->ttwu_wake_remote, sd->ttwu_move_affine, 64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance); 65 sd->ttwu_move_balance);
66 } 66 }
67 preempt_enable(); 67 rcu_read_unlock();
68#endif 68#endif
69 } 69 }
70 kfree(mask_str); 70 kfree(mask_str);
@@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
157} 157}
158 158
159/* 159/*
160 * Called when a process is dequeued from the active array and given 160 * We are interested in knowing how long it was from the *first* time a
161 * the cpu. We should note that with the exception of interactive
162 * tasks, the expired queue will become the active queue after the active
163 * queue is empty, without explicitly dequeuing and requeuing tasks in the
164 * expired queue. (Interactive tasks may be requeued directly to the
165 * active queue, thus delaying tasks in the expired queue from running;
166 * see scheduler_tick()).
167 *
168 * Though we are interested in knowing how long it was from the *first* time a
169 * task was queued to the time that it finally hit a cpu, we call this routine 161 * task was queued to the time that it finally hit a cpu, we call this routine
170 * from dequeue_task() to account for possible rq->clock skew across cpus. The 162 * from dequeue_task() to account for possible rq->clock skew across cpus. The
171 * delta taken on each cpu would annul the skew. 163 * delta taken on each cpu would annul the skew.
@@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t)
203} 195}
204 196
205/* 197/*
206 * Called when a process is queued into either the active or expired
207 * array. The time is noted and later used to determine how long we
208 * had to wait for us to reach the cpu. Since the expired queue will
209 * become the active queue after active queue is empty, without dequeuing
210 * and requeuing any tasks, we are interested in queuing to either. It
211 * is unusual but not impossible for tasks to be dequeued and immediately
212 * requeued in the same or another array: this can happen in sched_yield(),
213 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
214 * to runqueue.
215 *
216 * This function is only called from enqueue_task(), but also only updates 198 * This function is only called from enqueue_task(), but also only updates
217 * the timestamp if it is already not set. It's assumed that 199 * the timestamp if it is already not set. It's assumed that
218 * sched_info_dequeued() will clear that stamp when appropriate. 200 * sched_info_dequeued() will clear that stamp when appropriate.
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 000000000000..6f437632afab
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,104 @@
1/*
2 * stop-task scheduling class.
3 *
4 * The stop task is the highest priority task in the system, it preempts
5 * everything and will be preempted by nothing.
6 *
7 * See kernel/stop_machine.c
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
13{
 14	return task_cpu(p); /* stop tasks never migrate */
15}
16#endif /* CONFIG_SMP */
17
18static void
19check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
20{
21 /* we're never preempted */
22}
23
24static struct task_struct *pick_next_task_stop(struct rq *rq)
25{
26 struct task_struct *stop = rq->stop;
27
28 if (stop && stop->on_rq)
29 return stop;
30
31 return NULL;
32}
33
34static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{
37}
38
39static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{
42}
43
44static void yield_task_stop(struct rq *rq)
45{
 46	BUG(); /* the stop task should never yield, it's pointless. */
47}
48
49static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
50{
51}
52
53static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
54{
55}
56
57static void set_curr_task_stop(struct rq *rq)
58{
59}
60
61static void switched_to_stop(struct rq *rq, struct task_struct *p)
62{
 63	BUG(); /* it's impossible to change to this class */
64}
65
66static void
67prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
68{
69 BUG(); /* how!?, what priority? */
70}
71
72static unsigned int
73get_rr_interval_stop(struct rq *rq, struct task_struct *task)
74{
75 return 0;
76}
77
78/*
79 * Simple, special scheduling class for the per-CPU stop tasks:
80 */
81static const struct sched_class stop_sched_class = {
82 .next = &rt_sched_class,
83
84 .enqueue_task = enqueue_task_stop,
85 .dequeue_task = dequeue_task_stop,
86 .yield_task = yield_task_stop,
87
88 .check_preempt_curr = check_preempt_curr_stop,
89
90 .pick_next_task = pick_next_task_stop,
91 .put_prev_task = put_prev_task_stop,
92
93#ifdef CONFIG_SMP
94 .select_task_rq = select_task_rq_stop,
95#endif
96
97 .set_curr_task = set_curr_task_stop,
98 .task_tick = task_tick_stop,
99
100 .get_rr_interval = get_rr_interval_stop,
101
102 .prio_changed = prio_changed_stop,
103 .switched_to = switched_to_stop,
104};
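stop_sched_class links to rt_sched_class through .next, which is how the core scheduler keeps it above every other class: pick_next_task() walks the chain from the highest class downwards until one class returns a runnable task, and since at most one stop task exists per CPU most of the hooks above can stay empty. A toy user-space model of that chain-of-classes lookup (names and callbacks are illustrative only, not the kernel's):

#include <stddef.h>
#include <stdio.h>

/* Toy model of chained scheduling classes: pick_next() asks each class
 * in priority order until one returns a task name. */
struct sched_class {
	const struct sched_class *next;
	const char *(*pick_next)(void);
};

static const char *pick_stop(void) { return NULL; }       /* usually nothing to run */
static const char *pick_rt(void)   { return "rtprio50"; } /* an RT task is runnable */

static const struct sched_class rt_class   = { .next = NULL,      .pick_next = pick_rt };
static const struct sched_class stop_class = { .next = &rt_class, .pick_next = pick_stop };

int main(void)
{
	for (const struct sched_class *c = &stop_class; c; c = c->next) {
		const char *task = c->pick_next();
		if (task) {
			printf("picked %s\n", task);
			break;
		}
	}
	return 0;
}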
diff --git a/kernel/signal.c b/kernel/signal.c
index 919562c3d6b7..415d85d6f6c6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
124 124
125static int recalc_sigpending_tsk(struct task_struct *t) 125static int recalc_sigpending_tsk(struct task_struct *t)
126{ 126{
127 if (t->signal->group_stop_count > 0 || 127 if ((t->group_stop & GROUP_STOP_PENDING) ||
128 PENDING(&t->pending, &t->blocked) || 128 PENDING(&t->pending, &t->blocked) ||
129 PENDING(&t->signal->shared_pending, &t->blocked)) { 129 PENDING(&t->signal->shared_pending, &t->blocked)) {
130 set_tsk_thread_flag(t, TIF_SIGPENDING); 130 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -223,10 +223,87 @@ static inline void print_dropped_signal(int sig)
223 current->comm, current->pid, sig); 223 current->comm, current->pid, sig);
224} 224}
225 225
226/**
227 * task_clear_group_stop_trapping - clear group stop trapping bit
228 * @task: target task
229 *
230 * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it
231 * and wake up the ptracer. Note that we don't need any further locking.
232 * @task->siglock guarantees that @task->parent points to the ptracer.
233 *
234 * CONTEXT:
235 * Must be called with @task->sighand->siglock held.
236 */
237static void task_clear_group_stop_trapping(struct task_struct *task)
238{
239 if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) {
240 task->group_stop &= ~GROUP_STOP_TRAPPING;
241 __wake_up_sync_key(&task->parent->signal->wait_chldexit,
242 TASK_UNINTERRUPTIBLE, 1, task);
243 }
244}
245
246/**
247 * task_clear_group_stop_pending - clear pending group stop
248 * @task: target task
249 *
250 * Clear group stop states for @task.
251 *
252 * CONTEXT:
253 * Must be called with @task->sighand->siglock held.
254 */
255void task_clear_group_stop_pending(struct task_struct *task)
256{
257 task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME |
258 GROUP_STOP_DEQUEUED);
259}
260
261/**
262 * task_participate_group_stop - participate in a group stop
263 * @task: task participating in a group stop
264 *
265 * @task has GROUP_STOP_PENDING set and is participating in a group stop.
266 * Group stop states are cleared and the group stop count is consumed if
267 * %GROUP_STOP_CONSUME was set. If the consumption completes the group
268 * stop, the appropriate %SIGNAL_* flags are set.
269 *
270 * CONTEXT:
271 * Must be called with @task->sighand->siglock held.
272 *
273 * RETURNS:
274 * %true if group stop completion should be notified to the parent, %false
275 * otherwise.
276 */
277static bool task_participate_group_stop(struct task_struct *task)
278{
279 struct signal_struct *sig = task->signal;
280 bool consume = task->group_stop & GROUP_STOP_CONSUME;
281
282 WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING));
283
284 task_clear_group_stop_pending(task);
285
286 if (!consume)
287 return false;
288
289 if (!WARN_ON_ONCE(sig->group_stop_count == 0))
290 sig->group_stop_count--;
291
292 /*
293 * Tell the caller to notify completion iff we are entering into a
294 * fresh group stop. Read comment in do_signal_stop() for details.
295 */
296 if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
297 sig->flags = SIGNAL_STOP_STOPPED;
298 return true;
299 }
300 return false;
301}
302
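task_participate_group_stop() is the bookkeeping behind what a parent eventually observes through wait(2): when the last thread consumes the stop count, SIGNAL_STOP_STOPPED is set and the parent is notified that the whole process stopped. The user-visible side of that contract, as a minimal demo built only on standard POSIX calls:

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	int status;

	if (pid == 0) {          /* child: idle until killed */
		for (;;)
			pause();
	}

	kill(pid, SIGSTOP);      /* initiate a group stop */
	waitpid(pid, &status, WUNTRACED);
	if (WIFSTOPPED(status))
		printf("child stopped by signal %d\n", WSTOPSIG(status));

	kill(pid, SIGCONT);      /* wake it up again */
	waitpid(pid, &status, WCONTINUED);
	if (WIFCONTINUED(status))
		printf("child continued\n");

	kill(pid, SIGKILL);
	waitpid(pid, &status, 0);
	return 0;
}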
226/* 303/*
227 * allocate a new signal queue record 304 * allocate a new signal queue record
228 * - this may be called without locks if and only if t == current, otherwise an 305 * - this may be called without locks if and only if t == current, otherwise an
229 * appopriate lock must be held to stop the target task from exiting 306 * appropriate lock must be held to stop the target task from exiting
230 */ 307 */
231static struct sigqueue * 308static struct sigqueue *
232__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) 309__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
@@ -375,15 +452,15 @@ int unhandled_signal(struct task_struct *tsk, int sig)
375 return !tracehook_consider_fatal_signal(tsk, sig); 452 return !tracehook_consider_fatal_signal(tsk, sig);
376} 453}
377 454
378 455/*
379/* Notify the system that a driver wants to block all signals for this 456 * Notify the system that a driver wants to block all signals for this
380 * process, and wants to be notified if any signals at all were to be 457 * process, and wants to be notified if any signals at all were to be
381 * sent/acted upon. If the notifier routine returns non-zero, then the 458 * sent/acted upon. If the notifier routine returns non-zero, then the
382 * signal will be acted upon after all. If the notifier routine returns 0, 459 * signal will be acted upon after all. If the notifier routine returns 0,
 383 * then the signal will be blocked. Only one block per process is 460 * then the signal will be blocked. Only one block per process is
384 * allowed. priv is a pointer to private data that the notifier routine 461 * allowed. priv is a pointer to private data that the notifier routine
385 * can use to determine if the signal should be blocked or not. */ 462 * can use to determine if the signal should be blocked or not.
386 463 */
387void 464void
388block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) 465block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
389{ 466{
@@ -434,9 +511,10 @@ still_pending:
434 copy_siginfo(info, &first->info); 511 copy_siginfo(info, &first->info);
435 __sigqueue_free(first); 512 __sigqueue_free(first);
436 } else { 513 } else {
437 /* Ok, it wasn't in the queue. This must be 514 /*
438 a fast-pathed signal or we must have been 515 * Ok, it wasn't in the queue. This must be
439 out of queue space. So zero out the info. 516 * a fast-pathed signal or we must have been
517 * out of queue space. So zero out the info.
440 */ 518 */
441 info->si_signo = sig; 519 info->si_signo = sig;
442 info->si_errno = 0; 520 info->si_errno = 0;
@@ -468,7 +546,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
468} 546}
469 547
470/* 548/*
471 * Dequeue a signal and return the element to the caller, which is 549 * Dequeue a signal and return the element to the caller, which is
472 * expected to free it. 550 * expected to free it.
473 * 551 *
474 * All callers have to hold the siglock. 552 * All callers have to hold the siglock.
@@ -490,7 +568,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
490 * itimers are process shared and we restart periodic 568 * itimers are process shared and we restart periodic
491 * itimers in the signal delivery path to prevent DoS 569 * itimers in the signal delivery path to prevent DoS
492 * attacks in the high resolution timer case. This is 570 * attacks in the high resolution timer case. This is
493 * compliant with the old way of self restarting 571 * compliant with the old way of self-restarting
494 * itimers, as the SIGALRM is a legacy signal and only 572 * itimers, as the SIGALRM is a legacy signal and only
495 * queued once. Changing the restart behaviour to 573 * queued once. Changing the restart behaviour to
496 * restart the timer in the signal dequeue path is 574 * restart the timer in the signal dequeue path is
@@ -526,7 +604,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
526 * is to alert stop-signal processing code when another 604 * is to alert stop-signal processing code when another
527 * processor has come along and cleared the flag. 605 * processor has come along and cleared the flag.
528 */ 606 */
529 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 607 current->group_stop |= GROUP_STOP_DEQUEUED;
530 } 608 }
531 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 609 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
532 /* 610 /*
@@ -591,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
591 if (sigisemptyset(&m)) 669 if (sigisemptyset(&m))
592 return 0; 670 return 0;
593 671
594 signandsets(&s->signal, &s->signal, mask); 672 sigandnsets(&s->signal, &s->signal, mask);
595 list_for_each_entry_safe(q, n, &s->list, list) { 673 list_for_each_entry_safe(q, n, &s->list, list) {
596 if (sigismember(mask, q->info.si_signo)) { 674 if (sigismember(mask, q->info.si_signo)) {
597 list_del_init(&q->list); 675 list_del_init(&q->list);
@@ -636,13 +714,33 @@ static inline bool si_fromuser(const struct siginfo *info)
636} 714}
637 715
638/* 716/*
717 * called with RCU read lock from check_kill_permission()
718 */
719static int kill_ok_by_cred(struct task_struct *t)
720{
721 const struct cred *cred = current_cred();
722 const struct cred *tcred = __task_cred(t);
723
724 if (cred->user->user_ns == tcred->user->user_ns &&
725 (cred->euid == tcred->suid ||
726 cred->euid == tcred->uid ||
727 cred->uid == tcred->suid ||
728 cred->uid == tcred->uid))
729 return 1;
730
731 if (ns_capable(tcred->user->user_ns, CAP_KILL))
732 return 1;
733
734 return 0;
735}
736
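kill_ok_by_cred() captures the classic kill(2) permission rule, now scoped to the sender's user namespace: the sender's euid/uid must match the target's uid/suid, or the sender needs CAP_KILL there. From user space the failure mode is simply EPERM; a quick probe using signal 0, which performs only the permission and existence check without delivering anything (PID 1 is normally owned by root, so an unprivileged caller should be refused):

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	if (kill(1, 0) == 0)
		printf("allowed to signal pid 1\n");
	else
		printf("kill(1, 0) failed: %s\n", strerror(errno));
	return 0;
}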
737/*
639 * Bad permissions for sending the signal 738 * Bad permissions for sending the signal
640 * - the caller must hold the RCU read lock 739 * - the caller must hold the RCU read lock
641 */ 740 */
642static int check_kill_permission(int sig, struct siginfo *info, 741static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 742 struct task_struct *t)
644{ 743{
645 const struct cred *cred, *tcred;
646 struct pid *sid; 744 struct pid *sid;
647 int error; 745 int error;
648 746
@@ -656,14 +754,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 754 if (error)
657 return error; 755 return error;
658 756
659 cred = current_cred();
660 tcred = __task_cred(t);
661 if (!same_thread_group(current, t) && 757 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) && 758 !kill_ok_by_cred(t)) {
663 (cred->euid ^ tcred->uid) &&
664 (cred->uid ^ tcred->suid) &&
665 (cred->uid ^ tcred->uid) &&
666 !capable(CAP_KILL)) {
667 switch (sig) { 759 switch (sig) {
668 case SIGCONT: 760 case SIGCONT:
669 sid = task_session(t); 761 sid = task_session(t);
@@ -712,34 +804,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
712 } else if (sig == SIGCONT) { 804 } else if (sig == SIGCONT) {
713 unsigned int why; 805 unsigned int why;
714 /* 806 /*
715 * Remove all stop signals from all queues, 807 * Remove all stop signals from all queues, wake all threads.
716 * and wake all threads.
717 */ 808 */
718 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 809 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
719 t = p; 810 t = p;
720 do { 811 do {
721 unsigned int state; 812 task_clear_group_stop_pending(t);
722 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 813 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
723 /* 814 wake_up_state(t, __TASK_STOPPED);
724 * If there is a handler for SIGCONT, we must make
725 * sure that no thread returns to user mode before
726 * we post the signal, in case it was the only
727 * thread eligible to run the signal handler--then
728 * it must not do anything between resuming and
729 * running the handler. With the TIF_SIGPENDING
730 * flag set, the thread will pause and acquire the
731 * siglock that we hold now and until we've queued
732 * the pending signal.
733 *
734 * Wake up the stopped thread _after_ setting
735 * TIF_SIGPENDING
736 */
737 state = __TASK_STOPPED;
738 if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
739 set_tsk_thread_flag(t, TIF_SIGPENDING);
740 state |= TASK_INTERRUPTIBLE;
741 }
742 wake_up_state(t, state);
743 } while_each_thread(p, t); 815 } while_each_thread(p, t);
744 816
745 /* 817 /*
@@ -765,13 +837,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
765 signal->flags = why | SIGNAL_STOP_CONTINUED; 837 signal->flags = why | SIGNAL_STOP_CONTINUED;
766 signal->group_stop_count = 0; 838 signal->group_stop_count = 0;
767 signal->group_exit_code = 0; 839 signal->group_exit_code = 0;
768 } else {
769 /*
770 * We are not stopped, but there could be a stop
771 * signal in the middle of being processed after
772 * being removed from the queue. Clear that too.
773 */
774 signal->flags &= ~SIGNAL_STOP_DEQUEUED;
775 } 840 }
776 } 841 }
777 842
@@ -860,6 +925,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
860 signal->group_stop_count = 0; 925 signal->group_stop_count = 0;
861 t = p; 926 t = p;
862 do { 927 do {
928 task_clear_group_stop_pending(t);
863 sigaddset(&t->pending.signal, SIGKILL); 929 sigaddset(&t->pending.signal, SIGKILL);
864 signal_wake_up(t, 1); 930 signal_wake_up(t, 1);
865 } while_each_thread(p, t); 931 } while_each_thread(p, t);
@@ -909,14 +975,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
909 if (info == SEND_SIG_FORCED) 975 if (info == SEND_SIG_FORCED)
910 goto out_set; 976 goto out_set;
911 977
912 /* Real-time signals must be queued if sent by sigqueue, or 978 /*
913 some other real-time mechanism. It is implementation 979 * Real-time signals must be queued if sent by sigqueue, or
914 defined whether kill() does so. We attempt to do so, on 980 * some other real-time mechanism. It is implementation
915 the principle of least surprise, but since kill is not 981 * defined whether kill() does so. We attempt to do so, on
916 allowed to fail with EAGAIN when low on memory we just 982 * the principle of least surprise, but since kill is not
917 make sure at least one signal gets delivered and don't 983 * allowed to fail with EAGAIN when low on memory we just
918 pass on the info struct. */ 984 * make sure at least one signal gets delivered and don't
919 985 * pass on the info struct.
986 */
920 if (sig < SIGRTMIN) 987 if (sig < SIGRTMIN)
921 override_rlimit = (is_si_special(info) || info->si_code >= 0); 988 override_rlimit = (is_si_special(info) || info->si_code >= 0);
922 else 989 else
@@ -1093,6 +1160,7 @@ int zap_other_threads(struct task_struct *p)
1093 p->signal->group_stop_count = 0; 1160 p->signal->group_stop_count = 0;
1094 1161
1095 while_each_thread(p, t) { 1162 while_each_thread(p, t) {
1163 task_clear_group_stop_pending(t);
1096 count++; 1164 count++;
1097 1165
1098 /* Don't bother with already dead threads */ 1166 /* Don't bother with already dead threads */
@@ -1105,22 +1173,30 @@ int zap_other_threads(struct task_struct *p)
1105 return count; 1173 return count;
1106} 1174}
1107 1175
1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1176struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1177 unsigned long *flags)
1109{ 1178{
1110 struct sighand_struct *sighand; 1179 struct sighand_struct *sighand;
1111 1180
1112 rcu_read_lock();
1113 for (;;) { 1181 for (;;) {
1182 local_irq_save(*flags);
1183 rcu_read_lock();
1114 sighand = rcu_dereference(tsk->sighand); 1184 sighand = rcu_dereference(tsk->sighand);
1115 if (unlikely(sighand == NULL)) 1185 if (unlikely(sighand == NULL)) {
1186 rcu_read_unlock();
1187 local_irq_restore(*flags);
1116 break; 1188 break;
1189 }
1117 1190
1118 spin_lock_irqsave(&sighand->siglock, *flags); 1191 spin_lock(&sighand->siglock);
1119 if (likely(sighand == tsk->sighand)) 1192 if (likely(sighand == tsk->sighand)) {
1193 rcu_read_unlock();
1120 break; 1194 break;
1121 spin_unlock_irqrestore(&sighand->siglock, *flags); 1195 }
1196 spin_unlock(&sighand->siglock);
1197 rcu_read_unlock();
1198 local_irq_restore(*flags);
1122 } 1199 }
1123 rcu_read_unlock();
1124 1200
1125 return sighand; 1201 return sighand;
1126} 1202}
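The reworked __lock_task_sighand() is a lookup/lock/re-check loop: disable interrupts, enter the RCU read side just long enough to dereference ->sighand, take the spinlock, and retry if the pointer changed underneath. A rough pthread analogue of the re-check-under-lock part only (purely illustrative: without RCU or another grace-period scheme the object could be freed right after the load, which is precisely the lifetime problem rcu_read_lock() solves above):

#include <pthread.h>
#include <stddef.h>

struct shared { pthread_mutex_t lock; };

static struct shared *volatile current_obj;   /* may be swapped by other threads */

/* Return the locked object, or NULL if there is none. Retry if the pointer
 * was replaced between reading it and acquiring its lock. */
static struct shared *lock_current(void)
{
	for (;;) {
		struct shared *s = current_obj;

		if (s == NULL)
			return NULL;
		pthread_mutex_lock(&s->lock);
		if (s == current_obj)
			return s;                /* still the live object */
		pthread_mutex_unlock(&s->lock);  /* raced with a swap, retry */
	}
}

int main(void)
{
	static struct shared obj = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct shared *s;

	current_obj = &obj;
	s = lock_current();
	if (s)
		pthread_mutex_unlock(&s->lock);
	return 0;
}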
@@ -1186,8 +1262,7 @@ retry:
1186 return error; 1262 return error;
1187} 1263}
1188 1264
1189int 1265int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1190kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1191{ 1266{
1192 int error; 1267 int error;
1193 rcu_read_lock(); 1268 rcu_read_lock();
@@ -1284,8 +1359,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1284 * These are for backward compatibility with the rest of the kernel source. 1359 * These are for backward compatibility with the rest of the kernel source.
1285 */ 1360 */
1286 1361
1287int 1362int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1288send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1289{ 1363{
1290 /* 1364 /*
1291 * Make sure legacy kernel users don't send in bad values 1365 * Make sure legacy kernel users don't send in bad values
@@ -1353,7 +1427,7 @@ EXPORT_SYMBOL(kill_pid);
1353 * These functions support sending signals using preallocated sigqueue 1427 * These functions support sending signals using preallocated sigqueue
1354 * structures. This is needed "because realtime applications cannot 1428 * structures. This is needed "because realtime applications cannot
1355 * afford to lose notifications of asynchronous events, like timer 1429 * afford to lose notifications of asynchronous events, like timer
1356 * expirations or I/O completions". In the case of Posix Timers 1430 * expirations or I/O completions". In the case of POSIX Timers
1357 * we allocate the sigqueue structure from the timer_create. If this 1431 * we allocate the sigqueue structure from the timer_create. If this
1358 * allocation fails we are able to report the failure to the application 1432 * allocation fails we are able to report the failure to the application
1359 * with an EAGAIN error. 1433 * with an EAGAIN error.
@@ -1521,16 +1595,30 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1521 return ret; 1595 return ret;
1522} 1596}
1523 1597
1524static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1598/**
1599 * do_notify_parent_cldstop - notify parent of stopped/continued state change
1600 * @tsk: task reporting the state change
1601 * @for_ptracer: the notification is for ptracer
1602 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
1603 *
1604 * Notify @tsk's parent that the stopped/continued state has changed. If
 1605 * @for_ptracer is %false, @tsk's group leader notifies its real parent.
1606 * If %true, @tsk reports to @tsk->parent which should be the ptracer.
1607 *
1608 * CONTEXT:
1609 * Must be called with tasklist_lock at least read locked.
1610 */
1611static void do_notify_parent_cldstop(struct task_struct *tsk,
1612 bool for_ptracer, int why)
1525{ 1613{
1526 struct siginfo info; 1614 struct siginfo info;
1527 unsigned long flags; 1615 unsigned long flags;
1528 struct task_struct *parent; 1616 struct task_struct *parent;
1529 struct sighand_struct *sighand; 1617 struct sighand_struct *sighand;
1530 1618
1531 if (task_ptrace(tsk)) 1619 if (for_ptracer) {
1532 parent = tsk->parent; 1620 parent = tsk->parent;
1533 else { 1621 } else {
1534 tsk = tsk->group_leader; 1622 tsk = tsk->group_leader;
1535 parent = tsk->real_parent; 1623 parent = tsk->real_parent;
1536 } 1624 }
@@ -1538,7 +1626,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1538 info.si_signo = SIGCHLD; 1626 info.si_signo = SIGCHLD;
1539 info.si_errno = 0; 1627 info.si_errno = 0;
1540 /* 1628 /*
1541 * see comment in do_notify_parent() abot the following 3 lines 1629 * see comment in do_notify_parent() about the following 4 lines
1542 */ 1630 */
1543 rcu_read_lock(); 1631 rcu_read_lock();
1544 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1632 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
@@ -1596,7 +1684,7 @@ static inline int may_ptrace_stop(void)
1596} 1684}
1597 1685
1598/* 1686/*
1599 * Return nonzero if there is a SIGKILL that should be waking us up. 1687 * Return non-zero if there is a SIGKILL that should be waking us up.
1600 * Called with the siglock held. 1688 * Called with the siglock held.
1601 */ 1689 */
1602static int sigkill_pending(struct task_struct *tsk) 1690static int sigkill_pending(struct task_struct *tsk)
@@ -1606,6 +1694,15 @@ static int sigkill_pending(struct task_struct *tsk)
1606} 1694}
1607 1695
1608/* 1696/*
1697 * Test whether the target task of the usual cldstop notification - the
1698 * real_parent of @child - is in the same group as the ptracer.
1699 */
1700static bool real_parent_is_ptracer(struct task_struct *child)
1701{
1702 return same_thread_group(child->parent, child->real_parent);
1703}
1704
1705/*
1609 * This must be called with current->sighand->siglock held. 1706 * This must be called with current->sighand->siglock held.
1610 * 1707 *
1611 * This should be the path for all ptrace stops. 1708 * This should be the path for all ptrace stops.
@@ -1616,8 +1713,12 @@ static int sigkill_pending(struct task_struct *tsk)
1616 * If we actually decide not to stop at all because the tracer 1713 * If we actually decide not to stop at all because the tracer
1617 * is gone, we keep current->exit_code unless clear_code. 1714 * is gone, we keep current->exit_code unless clear_code.
1618 */ 1715 */
1619static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1716static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1717 __releases(&current->sighand->siglock)
1718 __acquires(&current->sighand->siglock)
1620{ 1719{
1720 bool gstop_done = false;
1721
1621 if (arch_ptrace_stop_needed(exit_code, info)) { 1722 if (arch_ptrace_stop_needed(exit_code, info)) {
1622 /* 1723 /*
1623 * The arch code has something special to do before a 1724 * The arch code has something special to do before a
@@ -1638,21 +1739,49 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1638 } 1739 }
1639 1740
1640 /* 1741 /*
1641 * If there is a group stop in progress, 1742 * If @why is CLD_STOPPED, we're trapping to participate in a group
1642 * we must participate in the bookkeeping. 1743 * stop. Do the bookkeeping. Note that if SIGCONT was delievered
1744 * while siglock was released for the arch hook, PENDING could be
1745 * clear now. We act as if SIGCONT is received after TASK_TRACED
1746 * is entered - ignore it.
1643 */ 1747 */
1644 if (current->signal->group_stop_count > 0) 1748 if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
1645 --current->signal->group_stop_count; 1749 gstop_done = task_participate_group_stop(current);
1646 1750
1647 current->last_siginfo = info; 1751 current->last_siginfo = info;
1648 current->exit_code = exit_code; 1752 current->exit_code = exit_code;
1649 1753
1650 /* Let the debugger run. */ 1754 /*
1651 __set_current_state(TASK_TRACED); 1755 * TRACED should be visible before TRAPPING is cleared; otherwise,
1756 * the tracer might fail do_wait().
1757 */
1758 set_current_state(TASK_TRACED);
1759
1760 /*
1761 * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and
1762 * transition to TASK_TRACED should be atomic with respect to
 1763 * siglock. This should be done after the arch hook as siglock is
1764 * released and regrabbed across it.
1765 */
1766 task_clear_group_stop_trapping(current);
1767
1652 spin_unlock_irq(&current->sighand->siglock); 1768 spin_unlock_irq(&current->sighand->siglock);
1653 read_lock(&tasklist_lock); 1769 read_lock(&tasklist_lock);
1654 if (may_ptrace_stop()) { 1770 if (may_ptrace_stop()) {
1655 do_notify_parent_cldstop(current, CLD_TRAPPED); 1771 /*
1772 * Notify parents of the stop.
1773 *
1774 * While ptraced, there are two parents - the ptracer and
1775 * the real_parent of the group_leader. The ptracer should
1776 * know about every stop while the real parent is only
1777 * interested in the completion of group stop. The states
1778 * for the two don't interact with each other. Notify
1779 * separately unless they're gonna be duplicates.
1780 */
1781 do_notify_parent_cldstop(current, true, why);
1782 if (gstop_done && !real_parent_is_ptracer(current))
1783 do_notify_parent_cldstop(current, false, why);
1784
1656 /* 1785 /*
1657 * Don't want to allow preemption here, because 1786 * Don't want to allow preemption here, because
1658 * sys_ptrace() needs this task to be inactive. 1787 * sys_ptrace() needs this task to be inactive.
@@ -1667,7 +1796,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1667 /* 1796 /*
1668 * By the time we got the lock, our tracer went away. 1797 * By the time we got the lock, our tracer went away.
1669 * Don't drop the lock yet, another tracer may come. 1798 * Don't drop the lock yet, another tracer may come.
1799 *
1800 * If @gstop_done, the ptracer went away between group stop
1801 * completion and here. During detach, it would have set
1802 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
1803 * in do_signal_stop() on return, so notifying the real
1804 * parent of the group stop completion is enough.
1670 */ 1805 */
1806 if (gstop_done)
1807 do_notify_parent_cldstop(current, false, why);
1808
1671 __set_current_state(TASK_RUNNING); 1809 __set_current_state(TASK_RUNNING);
1672 if (clear_code) 1810 if (clear_code)
1673 current->exit_code = 0; 1811 current->exit_code = 0;
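With this rework the two audiences stay separate: the ptracer is told about every trap via do_notify_parent_cldstop(..., true, why), while the real parent only hears about group-stop completion. On the tracer's side the whole exchange is still just ptrace(2) plus waitpid(); a minimal sketch of attach, observe the stop, and detach:

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	int status;

	if (pid == 0) {                 /* tracee: idle until killed */
		for (;;)
			sleep(1);
	}

	ptrace(PTRACE_ATTACH, pid, NULL, NULL);  /* delivers a SIGSTOP to the tracee */
	waitpid(pid, &status, 0);                /* the tracer observes the stop */
	if (WIFSTOPPED(status))
		printf("tracee stopped by signal %d, detaching\n", WSTOPSIG(status));

	ptrace(PTRACE_DETACH, pid, NULL, NULL);  /* let it run again */
	kill(pid, SIGKILL);
	waitpid(pid, &status, 0);
	return 0;
}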
@@ -1711,79 +1849,128 @@ void ptrace_notify(int exit_code)
1711 1849
1712 /* Let the debugger run. */ 1850 /* Let the debugger run. */
1713 spin_lock_irq(&current->sighand->siglock); 1851 spin_lock_irq(&current->sighand->siglock);
1714 ptrace_stop(exit_code, 1, &info); 1852 ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
1715 spin_unlock_irq(&current->sighand->siglock); 1853 spin_unlock_irq(&current->sighand->siglock);
1716} 1854}
1717 1855
1718/* 1856/*
1719 * This performs the stopping for SIGSTOP and other stop signals. 1857 * This performs the stopping for SIGSTOP and other stop signals.
1720 * We have to stop all threads in the thread group. 1858 * We have to stop all threads in the thread group.
1721 * Returns nonzero if we've actually stopped and released the siglock. 1859 * Returns non-zero if we've actually stopped and released the siglock.
1722 * Returns zero if we didn't stop and still hold the siglock. 1860 * Returns zero if we didn't stop and still hold the siglock.
1723 */ 1861 */
1724static int do_signal_stop(int signr) 1862static int do_signal_stop(int signr)
1725{ 1863{
1726 struct signal_struct *sig = current->signal; 1864 struct signal_struct *sig = current->signal;
1727 int notify;
1728 1865
1729 if (!sig->group_stop_count) { 1866 if (!(current->group_stop & GROUP_STOP_PENDING)) {
1867 unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
1730 struct task_struct *t; 1868 struct task_struct *t;
1731 1869
1732 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1870 /* signr will be recorded in task->group_stop for retries */
1871 WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
1872
1873 if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
1733 unlikely(signal_group_exit(sig))) 1874 unlikely(signal_group_exit(sig)))
1734 return 0; 1875 return 0;
1735 /* 1876 /*
1736 * There is no group stop already in progress. 1877 * There is no group stop already in progress. We must
1737 * We must initiate one now. 1878 * initiate one now.
1879 *
1880 * While ptraced, a task may be resumed while group stop is
1881 * still in effect and then receive a stop signal and
1882 * initiate another group stop. This deviates from the
1883 * usual behavior as two consecutive stop signals can't
1884 * cause two group stops when !ptraced. That is why we
1885 * also check !task_is_stopped(t) below.
1886 *
1887 * The condition can be distinguished by testing whether
1888 * SIGNAL_STOP_STOPPED is already set. Don't generate
1889 * group_exit_code in such case.
1890 *
1891 * This is not necessary for SIGNAL_STOP_CONTINUED because
1892 * an intervening stop signal is required to cause two
1893 * continued events regardless of ptrace.
1738 */ 1894 */
1739 sig->group_exit_code = signr; 1895 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1896 sig->group_exit_code = signr;
1897 else
1898 WARN_ON_ONCE(!task_ptrace(current));
1740 1899
1900 current->group_stop &= ~GROUP_STOP_SIGMASK;
1901 current->group_stop |= signr | gstop;
1741 sig->group_stop_count = 1; 1902 sig->group_stop_count = 1;
1742 for (t = next_thread(current); t != current; t = next_thread(t)) 1903 for (t = next_thread(current); t != current;
1904 t = next_thread(t)) {
1905 t->group_stop &= ~GROUP_STOP_SIGMASK;
1743 /* 1906 /*
1744 * Setting state to TASK_STOPPED for a group 1907 * Setting state to TASK_STOPPED for a group
1745 * stop is always done with the siglock held, 1908 * stop is always done with the siglock held,
1746 * so this check has no races. 1909 * so this check has no races.
1747 */ 1910 */
1748 if (!(t->flags & PF_EXITING) && 1911 if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
1749 !task_is_stopped_or_traced(t)) { 1912 t->group_stop |= signr | gstop;
1750 sig->group_stop_count++; 1913 sig->group_stop_count++;
1751 signal_wake_up(t, 0); 1914 signal_wake_up(t, 0);
1752 } 1915 }
1916 }
1753 } 1917 }
1754 /* 1918retry:
1755 * If there are no other threads in the group, or if there is 1919 if (likely(!task_ptrace(current))) {
1756 * a group stop in progress and we are the last to stop, report 1920 int notify = 0;
1757 * to the parent. When ptraced, every thread reports itself. 1921
1758 */ 1922 /*
1759 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; 1923 * If there are no other threads in the group, or if there
1760 notify = tracehook_notify_jctl(notify, CLD_STOPPED); 1924 * is a group stop in progress and we are the last to stop,
1761 /* 1925 * report to the parent.
1762 * tracehook_notify_jctl() can drop and reacquire siglock, so 1926 */
1763 * we keep ->group_stop_count != 0 before the call. If SIGCONT 1927 if (task_participate_group_stop(current))
1764 * or SIGKILL comes in between ->group_stop_count == 0. 1928 notify = CLD_STOPPED;
1765 */ 1929
1766 if (sig->group_stop_count) {
1767 if (!--sig->group_stop_count)
1768 sig->flags = SIGNAL_STOP_STOPPED;
1769 current->exit_code = sig->group_exit_code;
1770 __set_current_state(TASK_STOPPED); 1930 __set_current_state(TASK_STOPPED);
1931 spin_unlock_irq(&current->sighand->siglock);
1932
1933 /*
1934 * Notify the parent of the group stop completion. Because
1935 * we're not holding either the siglock or tasklist_lock
 1936 * here, ptracer may attach in between; however, this is for
1937 * group stop and should always be delivered to the real
1938 * parent of the group leader. The new ptracer will get
1939 * its notification when this task transitions into
1940 * TASK_TRACED.
1941 */
1942 if (notify) {
1943 read_lock(&tasklist_lock);
1944 do_notify_parent_cldstop(current, false, notify);
1945 read_unlock(&tasklist_lock);
1946 }
1947
1948 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1949 schedule();
1950
1951 spin_lock_irq(&current->sighand->siglock);
1952 } else {
1953 ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
1954 CLD_STOPPED, 0, NULL);
1955 current->exit_code = 0;
1771 } 1956 }
1772 spin_unlock_irq(&current->sighand->siglock);
1773 1957
1774 if (notify) { 1958 /*
1775 read_lock(&tasklist_lock); 1959 * GROUP_STOP_PENDING could be set if another group stop has
1776 do_notify_parent_cldstop(current, notify); 1960 * started since being woken up or ptrace wants us to transit
1777 read_unlock(&tasklist_lock); 1961 * between TASK_STOPPED and TRACED. Retry group stop.
1962 */
1963 if (current->group_stop & GROUP_STOP_PENDING) {
1964 WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
1965 goto retry;
1778 } 1966 }
1779 1967
1780 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 1968 /* PTRACE_ATTACH might have raced with task killing, clear trapping */
1781 do { 1969 task_clear_group_stop_trapping(current);
1782 schedule(); 1970
1783 } while (try_to_freeze()); 1971 spin_unlock_irq(&current->sighand->siglock);
1784 1972
1785 tracehook_finish_jctl(); 1973 tracehook_finish_jctl();
1786 current->exit_code = 0;
1787 1974
1788 return 1; 1975 return 1;
1789} 1976}
@@ -1797,7 +1984,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
1797 ptrace_signal_deliver(regs, cookie); 1984 ptrace_signal_deliver(regs, cookie);
1798 1985
1799 /* Let the debugger run. */ 1986 /* Let the debugger run. */
1800 ptrace_stop(signr, 0, info); 1987 ptrace_stop(signr, CLD_TRAPPED, 0, info);
1801 1988
1802 /* We're back. Did the debugger cancel the sig? */ 1989 /* We're back. Did the debugger cancel the sig? */
1803 signr = current->exit_code; 1990 signr = current->exit_code;
@@ -1806,10 +1993,12 @@ static int ptrace_signal(int signr, siginfo_t *info,
1806 1993
1807 current->exit_code = 0; 1994 current->exit_code = 0;
1808 1995
1809 /* Update the siginfo structure if the signal has 1996 /*
1810 changed. If the debugger wanted something 1997 * Update the siginfo structure if the signal has
1811 specific in the siginfo structure then it should 1998 * changed. If the debugger wanted something
1812 have updated *info via PTRACE_SETSIGINFO. */ 1999 * specific in the siginfo structure then it should
2000 * have updated *info via PTRACE_SETSIGINFO.
2001 */
1813 if (signr != info->si_signo) { 2002 if (signr != info->si_signo) {
1814 info->si_signo = signr; 2003 info->si_signo = signr;
1815 info->si_errno = 0; 2004 info->si_errno = 0;
@@ -1850,25 +2039,43 @@ relock:
1850 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2039 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1851 */ 2040 */
1852 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2041 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1853 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 2042 struct task_struct *leader;
1854 ? CLD_CONTINUED : CLD_STOPPED; 2043 int why;
2044
2045 if (signal->flags & SIGNAL_CLD_CONTINUED)
2046 why = CLD_CONTINUED;
2047 else
2048 why = CLD_STOPPED;
2049
1855 signal->flags &= ~SIGNAL_CLD_MASK; 2050 signal->flags &= ~SIGNAL_CLD_MASK;
1856 2051
1857 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1858 spin_unlock_irq(&sighand->siglock); 2052 spin_unlock_irq(&sighand->siglock);
1859 2053
1860 if (why) { 2054 /*
1861 read_lock(&tasklist_lock); 2055 * Notify the parent that we're continuing. This event is
1862 do_notify_parent_cldstop(current->group_leader, why); 2056 * always per-process and doesn't make whole lot of sense
1863 read_unlock(&tasklist_lock); 2057 * for ptracers, who shouldn't consume the state via
1864 } 2058 * wait(2) either, but, for backward compatibility, notify
2059 * the ptracer of the group leader too unless it's gonna be
2060 * a duplicate.
2061 */
2062 read_lock(&tasklist_lock);
2063
2064 do_notify_parent_cldstop(current, false, why);
2065
2066 leader = current->group_leader;
2067 if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
2068 do_notify_parent_cldstop(leader, true, why);
2069
2070 read_unlock(&tasklist_lock);
2071
1865 goto relock; 2072 goto relock;
1866 } 2073 }
1867 2074
1868 for (;;) { 2075 for (;;) {
1869 struct k_sigaction *ka; 2076 struct k_sigaction *ka;
1870 /* 2077 /*
1871 * Tracing can induce an artifical signal and choose sigaction. 2078 * Tracing can induce an artificial signal and choose sigaction.
1872 * The return value in @signr determines the default action, 2079 * The return value in @signr determines the default action,
1873 * but @info->si_signo is the signal number we will report. 2080 * but @info->si_signo is the signal number we will report.
1874 */ 2081 */
@@ -1878,8 +2085,8 @@ relock:
1878 if (unlikely(signr != 0)) 2085 if (unlikely(signr != 0))
1879 ka = return_ka; 2086 ka = return_ka;
1880 else { 2087 else {
1881 if (unlikely(signal->group_stop_count > 0) && 2088 if (unlikely(current->group_stop &
1882 do_signal_stop(0)) 2089 GROUP_STOP_PENDING) && do_signal_stop(0))
1883 goto relock; 2090 goto relock;
1884 2091
1885 signr = dequeue_signal(current, &current->blocked, 2092 signr = dequeue_signal(current, &current->blocked,
@@ -1998,10 +2205,42 @@ relock:
1998 return signr; 2205 return signr;
1999} 2206}
2000 2207
2208/*
2209 * It could be that complete_signal() picked us to notify about the
2210 * group-wide signal. Other threads should be notified now to take
2211 * the shared signals in @which since we will not.
2212 */
2213static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
2214{
2215 sigset_t retarget;
2216 struct task_struct *t;
2217
2218 sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
2219 if (sigisemptyset(&retarget))
2220 return;
2221
2222 t = tsk;
2223 while_each_thread(tsk, t) {
2224 if (t->flags & PF_EXITING)
2225 continue;
2226
2227 if (!has_pending_signals(&retarget, &t->blocked))
2228 continue;
2229 /* Remove the signals this thread can handle. */
2230 sigandsets(&retarget, &retarget, &t->blocked);
2231
2232 if (!signal_pending(t))
2233 signal_wake_up(t, 0);
2234
2235 if (sigisemptyset(&retarget))
2236 break;
2237 }
2238}
2239
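
retarget_shared_pending() preserves the invariant that a process-directed signal sitting in shared_pending is eventually taken by some thread that has it unblocked, even if the thread originally picked by complete_signal() blocks it or exits. A hypothetical userspace demonstration of that invariant (pthreads; illustrative sketch, not part of the patch):

#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void handler(int sig)
{
        const char msg[] = "signal taken by an unblocked thread\n";

        /* write() is async-signal-safe, good enough for a demo */
        write(STDOUT_FILENO, msg, sizeof(msg) - 1);
}

static void *blocker(void *arg)
{
        sigset_t set;

        /* This thread blocks SIGUSR1; delivery must go to another thread. */
        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        pthread_sigmask(SIG_BLOCK, &set, NULL);
        sleep(3);
        return NULL;
}

int main(void)
{
        pthread_t t;

        signal(SIGUSR1, handler);
        pthread_create(&t, NULL, blocker, NULL);
        sleep(1);

        /* Process-directed signal: handled by a thread with it unblocked. */
        kill(getpid(), SIGUSR1);

        pthread_join(t, NULL);
        return 0;
}
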
2001void exit_signals(struct task_struct *tsk) 2240void exit_signals(struct task_struct *tsk)
2002{ 2241{
2003 int group_stop = 0; 2242 int group_stop = 0;
2004 struct task_struct *t; 2243 sigset_t unblocked;
2005 2244
2006 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2245 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2007 tsk->flags |= PF_EXITING; 2246 tsk->flags |= PF_EXITING;
@@ -2017,25 +2256,23 @@ void exit_signals(struct task_struct *tsk)
2017 if (!signal_pending(tsk)) 2256 if (!signal_pending(tsk))
2018 goto out; 2257 goto out;
2019 2258
2020 /* It could be that __group_complete_signal() choose us to 2259 unblocked = tsk->blocked;
2021 * notify about group-wide signal. Another thread should be 2260 signotset(&unblocked);
2022 * woken now to take the signal since we will not. 2261 retarget_shared_pending(tsk, &unblocked);
2023 */
2024 for (t = tsk; (t = next_thread(t)) != tsk; )
2025 if (!signal_pending(t) && !(t->flags & PF_EXITING))
2026 recalc_sigpending_and_wake(t);
2027 2262
2028 if (unlikely(tsk->signal->group_stop_count) && 2263 if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
2029 !--tsk->signal->group_stop_count) { 2264 task_participate_group_stop(tsk))
2030 tsk->signal->flags = SIGNAL_STOP_STOPPED; 2265 group_stop = CLD_STOPPED;
2031 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
2032 }
2033out: 2266out:
2034 spin_unlock_irq(&tsk->sighand->siglock); 2267 spin_unlock_irq(&tsk->sighand->siglock);
2035 2268
2269 /*
2270 * If group stop has completed, deliver the notification. This
2271 * should always go to the real parent of the group leader.
2272 */
2036 if (unlikely(group_stop)) { 2273 if (unlikely(group_stop)) {
2037 read_lock(&tasklist_lock); 2274 read_lock(&tasklist_lock);
2038 do_notify_parent_cldstop(tsk, group_stop); 2275 do_notify_parent_cldstop(tsk, false, group_stop);
2039 read_unlock(&tasklist_lock); 2276 read_unlock(&tasklist_lock);
2040 } 2277 }
2041} 2278}
@@ -2055,6 +2292,9 @@ EXPORT_SYMBOL(unblock_all_signals);
2055 * System call entry points. 2292 * System call entry points.
2056 */ 2293 */
2057 2294
2295/**
2296 * sys_restart_syscall - restart a system call
2297 */
2058SYSCALL_DEFINE0(restart_syscall) 2298SYSCALL_DEFINE0(restart_syscall)
2059{ 2299{
2060 struct restart_block *restart = &current_thread_info()->restart_block; 2300 struct restart_block *restart = &current_thread_info()->restart_block;
@@ -2066,11 +2306,33 @@ long do_no_restart_syscall(struct restart_block *param)
2066 return -EINTR; 2306 return -EINTR;
2067} 2307}
2068 2308
2069/* 2309static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2070 * We don't need to get the kernel lock - this is all local to this 2310{
2071 * particular thread.. (and that's good, because this is _heavily_ 2311 if (signal_pending(tsk) && !thread_group_empty(tsk)) {
2072 * used by various programs) 2312 sigset_t newblocked;
2313 /* A set of now blocked but previously unblocked signals. */
2314 sigandnsets(&newblocked, newset, &current->blocked);
2315 retarget_shared_pending(tsk, &newblocked);
2316 }
2317 tsk->blocked = *newset;
2318 recalc_sigpending();
2319}
2320
2321/**
2322 * set_current_blocked - change current->blocked mask
2323 * @newset: new mask
2324 *
2325 * It is wrong to change ->blocked directly, this helper should be used
2326 * to ensure the process can't miss a shared signal we are going to block.
2073 */ 2327 */
2328void set_current_blocked(const sigset_t *newset)
2329{
2330 struct task_struct *tsk = current;
2331
2332 spin_lock_irq(&tsk->sighand->siglock);
2333 __set_task_blocked(tsk, newset);
2334 spin_unlock_irq(&tsk->sighand->siglock);
2335}
2074 2336
2075/* 2337/*
2076 * This is also useful for kernel threads that want to temporarily 2338 * This is also useful for kernel threads that want to temporarily
@@ -2082,66 +2344,66 @@ long do_no_restart_syscall(struct restart_block *param)
2082 */ 2344 */
2083int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 2345int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2084{ 2346{
2085 int error; 2347 struct task_struct *tsk = current;
2348 sigset_t newset;
2086 2349
2087 spin_lock_irq(&current->sighand->siglock); 2350 /* Lockless, only current can change ->blocked, never from irq */
2088 if (oldset) 2351 if (oldset)
2089 *oldset = current->blocked; 2352 *oldset = tsk->blocked;
2090 2353
2091 error = 0;
2092 switch (how) { 2354 switch (how) {
2093 case SIG_BLOCK: 2355 case SIG_BLOCK:
2094 sigorsets(&current->blocked, &current->blocked, set); 2356 sigorsets(&newset, &tsk->blocked, set);
2095 break; 2357 break;
2096 case SIG_UNBLOCK: 2358 case SIG_UNBLOCK:
2097 signandsets(&current->blocked, &current->blocked, set); 2359 sigandnsets(&newset, &tsk->blocked, set);
2098 break; 2360 break;
2099 case SIG_SETMASK: 2361 case SIG_SETMASK:
2100 current->blocked = *set; 2362 newset = *set;
2101 break; 2363 break;
2102 default: 2364 default:
2103 error = -EINVAL; 2365 return -EINVAL;
2104 } 2366 }
2105 recalc_sigpending();
2106 spin_unlock_irq(&current->sighand->siglock);
2107 2367
2108 return error; 2368 set_current_blocked(&newset);
2369 return 0;
2109} 2370}
2110 2371
2111SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, 2372/**
2373 * sys_rt_sigprocmask - change the list of currently blocked signals
2374 * @how: whether to add, remove, or set signals
2375 * @nset: stores pending signals
2376 * @oset: previous value of signal mask if non-null
2377 * @sigsetsize: size of sigset_t type
2378 */
2379SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2112 sigset_t __user *, oset, size_t, sigsetsize) 2380 sigset_t __user *, oset, size_t, sigsetsize)
2113{ 2381{
2114 int error = -EINVAL;
2115 sigset_t old_set, new_set; 2382 sigset_t old_set, new_set;
2383 int error;
2116 2384
2117 /* XXX: Don't preclude handling different sized sigset_t's. */ 2385 /* XXX: Don't preclude handling different sized sigset_t's. */
2118 if (sigsetsize != sizeof(sigset_t)) 2386 if (sigsetsize != sizeof(sigset_t))
2119 goto out; 2387 return -EINVAL;
2120 2388
2121 if (set) { 2389 old_set = current->blocked;
2122 error = -EFAULT; 2390
2123 if (copy_from_user(&new_set, set, sizeof(*set))) 2391 if (nset) {
2124 goto out; 2392 if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
2393 return -EFAULT;
2125 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2394 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2126 2395
2127 error = sigprocmask(how, &new_set, &old_set); 2396 error = sigprocmask(how, &new_set, NULL);
2128 if (error) 2397 if (error)
2129 goto out; 2398 return error;
2130 if (oset) 2399 }
2131 goto set_old;
2132 } else if (oset) {
2133 spin_lock_irq(&current->sighand->siglock);
2134 old_set = current->blocked;
2135 spin_unlock_irq(&current->sighand->siglock);
2136 2400
2137 set_old: 2401 if (oset) {
2138 error = -EFAULT; 2402 if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
2139 if (copy_to_user(oset, &old_set, sizeof(*oset))) 2403 return -EFAULT;
2140 goto out;
2141 } 2404 }
2142 error = 0; 2405
2143out: 2406 return 0;
2144 return error;
2145} 2407}
2146 2408
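
From userspace the sigprocmask() rework is invisible; the contract is unchanged: a signal raised while blocked is held pending and delivered once the mask is restored. A minimal check of that behaviour (illustrative sketch, not from the patch):

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t got;

static void handler(int sig)
{
        got = sig;
}

int main(void)
{
        sigset_t block, old;

        signal(SIGUSR1, handler);

        sigemptyset(&block);
        sigaddset(&block, SIGUSR1);
        sigprocmask(SIG_BLOCK, &block, &old);

        raise(SIGUSR1);                         /* stays pending, not delivered */
        printf("after raise,   got=%d (expect 0)\n", (int)got);

        sigprocmask(SIG_SETMASK, &old, NULL);   /* delivery happens here */
        printf("after unblock, got=%d (expect %d)\n", (int)got, SIGUSR1);
        return 0;
}
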
2147long do_sigpending(void __user *set, unsigned long sigsetsize) 2409long do_sigpending(void __user *set, unsigned long sigsetsize)
@@ -2166,8 +2428,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize)
2166 2428
2167out: 2429out:
2168 return error; 2430 return error;
2169} 2431}
2170 2432
2433/**
2434 * sys_rt_sigpending - examine a pending signal that has been raised
2435 * while blocked
2436 * @set: stores pending signals
2437 * @sigsetsize: size of sigset_t type or larger
2438 */
2171SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2439SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
2172{ 2440{
2173 return do_sigpending(set, sigsetsize); 2441 return do_sigpending(set, sigsetsize);
@@ -2216,9 +2484,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2216 err |= __put_user(from->si_trapno, &to->si_trapno); 2484 err |= __put_user(from->si_trapno, &to->si_trapno);
2217#endif 2485#endif
2218#ifdef BUS_MCEERR_AO 2486#ifdef BUS_MCEERR_AO
2219 /* 2487 /*
2220 * Other callers might not initialize the si_lsb field, 2488 * Other callers might not initialize the si_lsb field,
2221 * so check explicitely for the right codes here. 2489 * so check explicitly for the right codes here.
2222 */ 2490 */
2223 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2491 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2224 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2492 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
@@ -2247,15 +2515,82 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2247 2515
2248#endif 2516#endif
2249 2517
2518/**
2519 * do_sigtimedwait - wait for queued signals specified in @which
2520 * @which: queued signals to wait for
2521 * @info: if non-null, the signal's siginfo is returned here
2522 * @ts: upper bound on process time suspension
2523 */
2524int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2525 const struct timespec *ts)
2526{
2527 struct task_struct *tsk = current;
2528 long timeout = MAX_SCHEDULE_TIMEOUT;
2529 sigset_t mask = *which;
2530 int sig;
2531
2532 if (ts) {
2533 if (!timespec_valid(ts))
2534 return -EINVAL;
2535 timeout = timespec_to_jiffies(ts);
2536 /*
2537 * We can be close to the next tick, add another one
2538 * to ensure we will wait at least the time asked for.
2539 */
2540 if (ts->tv_sec || ts->tv_nsec)
2541 timeout++;
2542 }
2543
2544 /*
2545 * Invert the set of allowed signals to get those we want to block.
2546 */
2547 sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
2548 signotset(&mask);
2549
2550 spin_lock_irq(&tsk->sighand->siglock);
2551 sig = dequeue_signal(tsk, &mask, info);
2552 if (!sig && timeout) {
2553 /*
2554 * None ready, temporarily unblock those we're interested
2555 * while we are sleeping in so that we'll be awakened when
2556 * they arrive. Unblocking is always fine, we can avoid
2557 * set_current_blocked().
2558 */
2559 tsk->real_blocked = tsk->blocked;
2560 sigandsets(&tsk->blocked, &tsk->blocked, &mask);
2561 recalc_sigpending();
2562 spin_unlock_irq(&tsk->sighand->siglock);
2563
2564 timeout = schedule_timeout_interruptible(timeout);
2565
2566 spin_lock_irq(&tsk->sighand->siglock);
2567 __set_task_blocked(tsk, &tsk->real_blocked);
2568 siginitset(&tsk->real_blocked, 0);
2569 sig = dequeue_signal(tsk, &mask, info);
2570 }
2571 spin_unlock_irq(&tsk->sighand->siglock);
2572
2573 if (sig)
2574 return sig;
2575 return timeout ? -EINTR : -EAGAIN;
2576}
2577
2578/**
2579 * sys_rt_sigtimedwait - synchronously wait for queued signals specified
2580 * in @uthese
2581 * @uthese: queued signals to wait for
2582 * @uinfo: if non-null, the signal's siginfo is returned here
2583 * @uts: upper bound on process time suspension
2584 * @sigsetsize: size of sigset_t type
2585 */
2250SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, 2586SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2251 siginfo_t __user *, uinfo, const struct timespec __user *, uts, 2587 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2252 size_t, sigsetsize) 2588 size_t, sigsetsize)
2253{ 2589{
2254 int ret, sig;
2255 sigset_t these; 2590 sigset_t these;
2256 struct timespec ts; 2591 struct timespec ts;
2257 siginfo_t info; 2592 siginfo_t info;
2258 long timeout = 0; 2593 int ret;
2259 2594
2260 /* XXX: Don't preclude handling different sized sigset_t's. */ 2595 /* XXX: Don't preclude handling different sized sigset_t's. */
2261 if (sigsetsize != sizeof(sigset_t)) 2596 if (sigsetsize != sizeof(sigset_t))
@@ -2263,65 +2598,27 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2263 2598
2264 if (copy_from_user(&these, uthese, sizeof(these))) 2599 if (copy_from_user(&these, uthese, sizeof(these)))
2265 return -EFAULT; 2600 return -EFAULT;
2266
2267 /*
2268 * Invert the set of allowed signals to get those we
2269 * want to block.
2270 */
2271 sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
2272 signotset(&these);
2273 2601
2274 if (uts) { 2602 if (uts) {
2275 if (copy_from_user(&ts, uts, sizeof(ts))) 2603 if (copy_from_user(&ts, uts, sizeof(ts)))
2276 return -EFAULT; 2604 return -EFAULT;
2277 if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
2278 || ts.tv_sec < 0)
2279 return -EINVAL;
2280 } 2605 }
2281 2606
2282 spin_lock_irq(&current->sighand->siglock); 2607 ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
2283 sig = dequeue_signal(current, &these, &info);
2284 if (!sig) {
2285 timeout = MAX_SCHEDULE_TIMEOUT;
2286 if (uts)
2287 timeout = (timespec_to_jiffies(&ts)
2288 + (ts.tv_sec || ts.tv_nsec));
2289
2290 if (timeout) {
2291 /* None ready -- temporarily unblock those we're
2292 * interested while we are sleeping in so that we'll
2293 * be awakened when they arrive. */
2294 current->real_blocked = current->blocked;
2295 sigandsets(&current->blocked, &current->blocked, &these);
2296 recalc_sigpending();
2297 spin_unlock_irq(&current->sighand->siglock);
2298
2299 timeout = schedule_timeout_interruptible(timeout);
2300
2301 spin_lock_irq(&current->sighand->siglock);
2302 sig = dequeue_signal(current, &these, &info);
2303 current->blocked = current->real_blocked;
2304 siginitset(&current->real_blocked, 0);
2305 recalc_sigpending();
2306 }
2307 }
2308 spin_unlock_irq(&current->sighand->siglock);
2309 2608
2310 if (sig) { 2609 if (ret > 0 && uinfo) {
2311 ret = sig; 2610 if (copy_siginfo_to_user(uinfo, &info))
2312 if (uinfo) { 2611 ret = -EFAULT;
2313 if (copy_siginfo_to_user(uinfo, &info))
2314 ret = -EFAULT;
2315 }
2316 } else {
2317 ret = -EAGAIN;
2318 if (timeout)
2319 ret = -EINTR;
2320 } 2612 }
2321 2613
2322 return ret; 2614 return ret;
2323} 2615}
2324 2616
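
The syscall now funnels through do_sigtimedwait(), but the userspace semantics of sigtimedwait(2) are unchanged: block the signal, then dequeue it synchronously with an upper bound on the wait. A small sketch (illustrative only, error handling trimmed):

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        sigset_t set;
        siginfo_t info;
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 500 * 1000 * 1000 };
        int sig;

        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        sigprocmask(SIG_BLOCK, &set, NULL);     /* must be blocked before waiting */

        raise(SIGUSR1);                         /* becomes pending immediately */

        sig = sigtimedwait(&set, &info, &ts);
        if (sig < 0)
                perror("sigtimedwait");         /* EAGAIN on timeout, EINTR otherwise */
        else
                printf("dequeued signal %d from pid %d\n", sig, (int)info.si_pid);
        return 0;
}
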
2617/**
2618 * sys_kill - send a signal to a process
2619 * @pid: the PID of the process
2620 * @sig: signal to be sent
2621 */
2325SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) 2622SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2326{ 2623{
2327 struct siginfo info; 2624 struct siginfo info;
@@ -2397,7 +2694,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
2397 return do_tkill(tgid, pid, sig); 2694 return do_tkill(tgid, pid, sig);
2398} 2695}
2399 2696
2400/* 2697/**
2698 * sys_tkill - send signal to one specific task
2699 * @pid: the PID of the task
2700 * @sig: signal to be sent
2701 *
2401 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2702 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2402 */ 2703 */
2403SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) 2704SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
@@ -2409,6 +2710,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2409 return do_tkill(0, pid, sig); 2710 return do_tkill(0, pid, sig);
2410} 2711}
2411 2712
2713/**
2714 * sys_rt_sigqueueinfo - send signal information to a signal
2715 * @pid: the PID of the thread
2716 * @sig: signal to be sent
2717 * @uinfo: signal info to be sent
2718 */
2412SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, 2719SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2413 siginfo_t __user *, uinfo) 2720 siginfo_t __user *, uinfo)
2414{ 2721{
@@ -2418,9 +2725,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2418 return -EFAULT; 2725 return -EFAULT;
2419 2726
2420 /* Not even root can pretend to send signals from the kernel. 2727 /* Not even root can pretend to send signals from the kernel.
2421 Nor can they impersonate a kill(), which adds source info. */ 2728 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2422 if (info.si_code >= 0) 2729 */
2730 if (info.si_code >= 0 || info.si_code == SI_TKILL) {
2731 /* We used to allow any < 0 si_code */
2732 WARN_ON_ONCE(info.si_code < 0);
2423 return -EPERM; 2733 return -EPERM;
2734 }
2424 info.si_signo = sig; 2735 info.si_signo = sig;
2425 2736
2426 /* POSIX.1b doesn't mention process groups. */ 2737 /* POSIX.1b doesn't mention process groups. */
@@ -2434,9 +2745,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2434 return -EINVAL; 2745 return -EINVAL;
2435 2746
2436 /* Not even root can pretend to send signals from the kernel. 2747 /* Not even root can pretend to send signals from the kernel.
2437 Nor can they impersonate a kill(), which adds source info. */ 2748 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2438 if (info->si_code >= 0) 2749 */
2750 if (info->si_code >= 0 || info->si_code == SI_TKILL) {
2751 /* We used to allow any < 0 si_code */
2752 WARN_ON_ONCE(info->si_code < 0);
2439 return -EPERM; 2753 return -EPERM;
2754 }
2440 info->si_signo = sig; 2755 info->si_signo = sig;
2441 2756
2442 return do_send_specific(tgid, pid, sig, info); 2757 return do_send_specific(tgid, pid, sig, info);
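
The tightened si_code check only rejects callers that forge kernel-generated codes (si_code >= 0, or SI_TKILL) through rt_sigqueueinfo()/rt_tgsigqueueinfo(); the ordinary sigqueue(3) path, which passes SI_QUEUE (a negative code), is unaffected. A hypothetical userspace demo of the normal path, not taken from the patch:

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void handler(int sig, siginfo_t *info, void *ctx)
{
        /* printf in a handler is fine for a demo, not for production code */
        printf("sig=%d si_code=%d value=%d\n",
               sig, info->si_code, info->si_value.sival_int);
}

int main(void)
{
        struct sigaction sa = { .sa_sigaction = handler, .sa_flags = SA_SIGINFO };
        union sigval val = { .sival_int = 42 };

        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);

        /* si_code will be SI_QUEUE (< 0), so this still passes the check;
         * the signal is delivered to this process before sigqueue() returns. */
        sigqueue(getpid(), SIGUSR1, val);
        return 0;
}
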
@@ -2528,12 +2843,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2528 2843
2529 error = -EINVAL; 2844 error = -EINVAL;
2530 /* 2845 /*
2531 * 2846 * Note - this code used to test ss_flags incorrectly:
2532 * Note - this code used to test ss_flags incorrectly
2533 * old code may have been written using ss_flags==0 2847 * old code may have been written using ss_flags==0
2534 * to mean ss_flags==SS_ONSTACK (as this was the only 2848 * to mean ss_flags==SS_ONSTACK (as this was the only
2535 * way that worked) - this fix preserves that older 2849 * way that worked) - this fix preserves that older
2536 * mechanism 2850 * mechanism.
2537 */ 2851 */
2538 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) 2852 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
2539 goto out; 2853 goto out;
@@ -2567,6 +2881,10 @@ out:
2567 2881
2568#ifdef __ARCH_WANT_SYS_SIGPENDING 2882#ifdef __ARCH_WANT_SYS_SIGPENDING
2569 2883
2884/**
2885 * sys_sigpending - examine pending signals
2886 * @set: where mask of pending signal is returned
2887 */
2570SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 2888SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2571{ 2889{
2572 return do_sigpending(set, sizeof(*set)); 2890 return do_sigpending(set, sizeof(*set));
@@ -2575,60 +2893,65 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2575#endif 2893#endif
2576 2894
2577#ifdef __ARCH_WANT_SYS_SIGPROCMASK 2895#ifdef __ARCH_WANT_SYS_SIGPROCMASK
2578/* Some platforms have their own version with special arguments others 2896/**
2579 support only sys_rt_sigprocmask. */ 2897 * sys_sigprocmask - examine and change blocked signals
2898 * @how: whether to add, remove, or set signals
2899 * @nset: signals to add or remove (if non-null)
2900 * @oset: previous value of signal mask if non-null
2901 *
2902 * Some platforms have their own version with special arguments;
2903 * others support only sys_rt_sigprocmask.
2904 */
2580 2905
2581SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, 2906SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
2582 old_sigset_t __user *, oset) 2907 old_sigset_t __user *, oset)
2583{ 2908{
2584 int error;
2585 old_sigset_t old_set, new_set; 2909 old_sigset_t old_set, new_set;
2910 sigset_t new_blocked;
2586 2911
2587 if (set) { 2912 old_set = current->blocked.sig[0];
2588 error = -EFAULT; 2913
2589 if (copy_from_user(&new_set, set, sizeof(*set))) 2914 if (nset) {
2590 goto out; 2915 if (copy_from_user(&new_set, nset, sizeof(*nset)))
2916 return -EFAULT;
2591 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); 2917 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
2592 2918
2593 spin_lock_irq(&current->sighand->siglock); 2919 new_blocked = current->blocked;
2594 old_set = current->blocked.sig[0];
2595 2920
2596 error = 0;
2597 switch (how) { 2921 switch (how) {
2598 default:
2599 error = -EINVAL;
2600 break;
2601 case SIG_BLOCK: 2922 case SIG_BLOCK:
2602 sigaddsetmask(&current->blocked, new_set); 2923 sigaddsetmask(&new_blocked, new_set);
2603 break; 2924 break;
2604 case SIG_UNBLOCK: 2925 case SIG_UNBLOCK:
2605 sigdelsetmask(&current->blocked, new_set); 2926 sigdelsetmask(&new_blocked, new_set);
2606 break; 2927 break;
2607 case SIG_SETMASK: 2928 case SIG_SETMASK:
2608 current->blocked.sig[0] = new_set; 2929 new_blocked.sig[0] = new_set;
2609 break; 2930 break;
2931 default:
2932 return -EINVAL;
2610 } 2933 }
2611 2934
2612 recalc_sigpending(); 2935 set_current_blocked(&new_blocked);
2613 spin_unlock_irq(&current->sighand->siglock); 2936 }
2614 if (error) 2937
2615 goto out; 2938 if (oset) {
2616 if (oset)
2617 goto set_old;
2618 } else if (oset) {
2619 old_set = current->blocked.sig[0];
2620 set_old:
2621 error = -EFAULT;
2622 if (copy_to_user(oset, &old_set, sizeof(*oset))) 2939 if (copy_to_user(oset, &old_set, sizeof(*oset)))
2623 goto out; 2940 return -EFAULT;
2624 } 2941 }
2625 error = 0; 2942
2626out: 2943 return 0;
2627 return error;
2628} 2944}
2629#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 2945#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2630 2946
2631#ifdef __ARCH_WANT_SYS_RT_SIGACTION 2947#ifdef __ARCH_WANT_SYS_RT_SIGACTION
2948/**
2949 * sys_rt_sigaction - alter an action taken by a process
2950 * @sig: signal to be sent
2951 * @act: new sigaction
2952 * @oact: used to save the previous sigaction
2953 * @sigsetsize: size of sigset_t type
2954 */
2632SYSCALL_DEFINE4(rt_sigaction, int, sig, 2955SYSCALL_DEFINE4(rt_sigaction, int, sig,
2633 const struct sigaction __user *, act, 2956 const struct sigaction __user *, act,
2634 struct sigaction __user *, oact, 2957 struct sigaction __user *, oact,
@@ -2707,14 +3030,22 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
2707 3030
2708SYSCALL_DEFINE0(pause) 3031SYSCALL_DEFINE0(pause)
2709{ 3032{
2710 current->state = TASK_INTERRUPTIBLE; 3033 while (!signal_pending(current)) {
2711 schedule(); 3034 current->state = TASK_INTERRUPTIBLE;
3035 schedule();
3036 }
2712 return -ERESTARTNOHAND; 3037 return -ERESTARTNOHAND;
2713} 3038}
2714 3039
2715#endif 3040#endif
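
The new loop in sys_pause() applies the standard discipline of re-testing the wake condition around schedule(): a wakeup is a hint, not a guarantee. The same rule governs userspace condition waits; a short sketch of the pattern (names are illustrative, not part of the patch):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool event;

static void wait_for_event(void)
{
        pthread_mutex_lock(&lock);
        while (!event)                          /* loop: wakeups can be spurious */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
}

static void signal_event(void)
{
        pthread_mutex_lock(&lock);
        event = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        signal_event();
        wait_for_event();                       /* predicate already true: returns */
        return 0;
}
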
2716 3041
2717#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 3042#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3043/**
3044 * sys_rt_sigsuspend - replace the signal mask for a value with the
3045 * @unewset value until a signal is received
3046 * @unewset: new signal mask value
3047 * @sigsetsize: size of sigset_t type
3048 */
2718SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) 3049SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
2719{ 3050{
2720 sigset_t newset; 3051 sigset_t newset;
diff --git a/kernel/smp.c b/kernel/smp.c
index ed6aacfcb7ef..fb67dfa8394e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
16static struct { 17static struct {
17 struct list_head queue; 18 struct list_head queue;
18 raw_spinlock_t lock; 19 raw_spinlock_t lock;
@@ -73,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
73 .notifier_call = hotplug_cfd, 74 .notifier_call = hotplug_cfd,
74}; 75};
75 76
76static int __cpuinit init_call_single_data(void) 77void __init call_function_init(void)
77{ 78{
78 void *cpu = (void *)(long)smp_processor_id(); 79 void *cpu = (void *)(long)smp_processor_id();
79 int i; 80 int i;
@@ -87,10 +88,7 @@ static int __cpuinit init_call_single_data(void)
87 88
88 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); 89 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
89 register_cpu_notifier(&hotplug_cfd_notifier); 90 register_cpu_notifier(&hotplug_cfd_notifier);
90
91 return 0;
92} 91}
93early_initcall(init_call_single_data);
94 92
95/* 93/*
96 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources 94 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
@@ -193,23 +191,52 @@ void generic_smp_call_function_interrupt(void)
193 */ 191 */
194 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 192 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
195 int refs; 193 int refs;
194 smp_call_func_t func;
195
196 /*
197 * Since we walk the list without any locks, we might
198 * see an entry that was completed, removed from the
199 * list and is in the process of being reused.
200 *
201 * We must check that the cpu is in the cpumask before
202 * checking the refs, and both must be set before
203 * executing the callback on this cpu.
204 */
196 205
197 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) 206 if (!cpumask_test_cpu(cpu, data->cpumask))
198 continue; 207 continue;
199 208
200 data->csd.func(data->csd.info); 209 smp_rmb();
210
211 if (atomic_read(&data->refs) == 0)
212 continue;
213
214 func = data->csd.func; /* save for later warn */
215 func(data->csd.info);
216
217 /*
218 * If the cpu mask is not still set then func enabled
219 * interrupts (BUG), and this cpu took another smp call
220 * function interrupt and executed func(info) twice
221 * on this cpu. That nested execution decremented refs.
222 */
223 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
224 WARN(1, "%pf enabled interrupts and double executed\n", func);
225 continue;
226 }
201 227
202 refs = atomic_dec_return(&data->refs); 228 refs = atomic_dec_return(&data->refs);
203 WARN_ON(refs < 0); 229 WARN_ON(refs < 0);
204 if (!refs) {
205 raw_spin_lock(&call_function.lock);
206 list_del_rcu(&data->csd.list);
207 raw_spin_unlock(&call_function.lock);
208 }
209 230
210 if (refs) 231 if (refs)
211 continue; 232 continue;
212 233
234 WARN_ON(!cpumask_empty(data->cpumask));
235
236 raw_spin_lock(&call_function.lock);
237 list_del_rcu(&data->csd.list);
238 raw_spin_unlock(&call_function.lock);
239
213 csd_unlock(&data->csd); 240 csd_unlock(&data->csd);
214 } 241 }
215 242
@@ -267,7 +294,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
267 * 294 *
268 * Returns 0 on success, else a negative status code. 295 * Returns 0 on success, else a negative status code.
269 */ 296 */
270int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 297int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
271 int wait) 298 int wait)
272{ 299{
273 struct call_single_data d = { 300 struct call_single_data d = {
@@ -336,7 +363,7 @@ EXPORT_SYMBOL(smp_call_function_single);
336 * 3) any other online cpu in @mask 363 * 3) any other online cpu in @mask
337 */ 364 */
338int smp_call_function_any(const struct cpumask *mask, 365int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait) 366 smp_call_func_t func, void *info, int wait)
340{ 367{
341 unsigned int cpu; 368 unsigned int cpu;
342 const struct cpumask *nodemask; 369 const struct cpumask *nodemask;
@@ -416,11 +443,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
416 * must be disabled when calling this function. 443 * must be disabled when calling this function.
417 */ 444 */
418void smp_call_function_many(const struct cpumask *mask, 445void smp_call_function_many(const struct cpumask *mask,
419 void (*func)(void *), void *info, bool wait) 446 smp_call_func_t func, void *info, bool wait)
420{ 447{
421 struct call_function_data *data; 448 struct call_function_data *data;
422 unsigned long flags; 449 unsigned long flags;
423 int cpu, next_cpu, this_cpu = smp_processor_id(); 450 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
424 451
425 /* 452 /*
426 * Can deadlock when called with interrupts disabled. 453 * Can deadlock when called with interrupts disabled.
@@ -429,9 +456,9 @@ void smp_call_function_many(const struct cpumask *mask,
429 * can't happen. 456 * can't happen.
430 */ 457 */
431 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 458 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
432 && !oops_in_progress); 459 && !oops_in_progress && !early_boot_irqs_disabled);
433 460
434 /* So, what's a CPU they want? Ignoring this one. */ 461 /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
435 cpu = cpumask_first_and(mask, cpu_online_mask); 462 cpu = cpumask_first_and(mask, cpu_online_mask);
436 if (cpu == this_cpu) 463 if (cpu == this_cpu)
437 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 464 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -454,11 +481,48 @@ void smp_call_function_many(const struct cpumask *mask,
454 data = &__get_cpu_var(cfd_data); 481 data = &__get_cpu_var(cfd_data);
455 csd_lock(&data->csd); 482 csd_lock(&data->csd);
456 483
484 /* This BUG_ON verifies our reuse assertions and can be removed */
485 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
486
487 /*
488 * The global call function queue list add and delete are protected
489 * by a lock, but the list is traversed without any lock, relying
490 * on the rcu list add and delete to allow safe concurrent traversal.
491 * We reuse the call function data without waiting for any grace
492 * period after some other cpu removes it from the global queue.
493 * This means a cpu might find our data block as it is being
494 * filled out.
495 *
496 * We hold off the interrupt handler on the other cpu by
497 * ordering our writes to the cpu mask vs our setting of the
498 * refs counter. We assert only the cpu owning the data block
499 * will set a bit in cpumask, and each bit will only be cleared
500 * by the subject cpu. Each cpu must first find its bit is
501 * set and then check that refs is set indicating the element is
502 * ready to be processed, otherwise it must skip the entry.
503 *
504 * On the previous iteration refs was set to 0 by another cpu.
505 * To avoid the use of transitivity, set the counter to 0 here
506 * so the wmb will pair with the rmb in the interrupt handler.
507 */
508 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
509
457 data->csd.func = func; 510 data->csd.func = func;
458 data->csd.info = info; 511 data->csd.info = info;
512
513 /* Ensure 0 refs is visible before mask. Also orders func and info */
514 smp_wmb();
515
516 /* We rely on the "and" being processed before the store */
459 cpumask_and(data->cpumask, mask, cpu_online_mask); 517 cpumask_and(data->cpumask, mask, cpu_online_mask);
460 cpumask_clear_cpu(this_cpu, data->cpumask); 518 cpumask_clear_cpu(this_cpu, data->cpumask);
461 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 519 refs = cpumask_weight(data->cpumask);
520
521 /* Some callers race with other cpus changing the passed mask */
522 if (unlikely(!refs)) {
523 csd_unlock(&data->csd);
524 return;
525 }
462 526
463 raw_spin_lock_irqsave(&call_function.lock, flags); 527 raw_spin_lock_irqsave(&call_function.lock, flags);
464 /* 528 /*
@@ -467,6 +531,12 @@ void smp_call_function_many(const struct cpumask *mask,
467 * will not miss any other list entries: 531 * will not miss any other list entries:
468 */ 532 */
469 list_add_rcu(&data->csd.list, &call_function.queue); 533 list_add_rcu(&data->csd.list, &call_function.queue);
534 /*
535 * We rely on the wmb() in list_add_rcu to complete our writes
536 * to the cpumask before this write to refs, which indicates
537 * data is on the list and is ready to be processed.
538 */
539 atomic_set(&data->refs, refs);
470 raw_spin_unlock_irqrestore(&call_function.lock, flags); 540 raw_spin_unlock_irqrestore(&call_function.lock, flags);
471 541
472 /* 542 /*
@@ -500,7 +570,7 @@ EXPORT_SYMBOL(smp_call_function_many);
500 * You must not call this function with disabled interrupts or from a 570 * You must not call this function with disabled interrupts or from a
501 * hardware interrupt handler or from a bottom half handler. 571 * hardware interrupt handler or from a bottom half handler.
502 */ 572 */
503int smp_call_function(void (*func)(void *), void *info, int wait) 573int smp_call_function(smp_call_func_t func, void *info, int wait)
504{ 574{
505 preempt_disable(); 575 preempt_disable();
506 smp_call_function_many(cpu_online_mask, func, info, wait); 576 smp_call_function_many(cpu_online_mask, func, info, wait);
@@ -529,3 +599,105 @@ void ipi_call_unlock_irq(void)
529{ 599{
530 raw_spin_unlock_irq(&call_function.lock); 600 raw_spin_unlock_irq(&call_function.lock);
531} 601}
602#endif /* USE_GENERIC_SMP_HELPERS */
603
604/* Setup configured maximum number of CPUs to activate */
605unsigned int setup_max_cpus = NR_CPUS;
606EXPORT_SYMBOL(setup_max_cpus);
607
608
609/*
610 * Setup routine for controlling SMP activation
611 *
612 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
613 * activation entirely (the MPS table probe still happens, though).
614 *
615 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
616 * greater than 0, limits the maximum number of CPUs activated in
617 * SMP mode to <NUM>.
618 */
619
620void __weak arch_disable_smp_support(void) { }
621
622static int __init nosmp(char *str)
623{
624 setup_max_cpus = 0;
625 arch_disable_smp_support();
626
627 return 0;
628}
629
630early_param("nosmp", nosmp);
631
632/* this is hard limit */
633static int __init nrcpus(char *str)
634{
635 int nr_cpus;
636
637 get_option(&str, &nr_cpus);
638 if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
639 nr_cpu_ids = nr_cpus;
640
641 return 0;
642}
643
644early_param("nr_cpus", nrcpus);
645
646static int __init maxcpus(char *str)
647{
648 get_option(&str, &setup_max_cpus);
649 if (setup_max_cpus == 0)
650 arch_disable_smp_support();
651
652 return 0;
653}
654
655early_param("maxcpus", maxcpus);
656
657/* Setup number of possible processor ids */
658int nr_cpu_ids __read_mostly = NR_CPUS;
659EXPORT_SYMBOL(nr_cpu_ids);
660
661/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
662void __init setup_nr_cpu_ids(void)
663{
664 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
665}
666
667/* Called by boot processor to activate the rest. */
668void __init smp_init(void)
669{
670 unsigned int cpu;
671
672 /* FIXME: This should be done in userspace --RR */
673 for_each_present_cpu(cpu) {
674 if (num_online_cpus() >= setup_max_cpus)
675 break;
676 if (!cpu_online(cpu))
677 cpu_up(cpu);
678 }
679
680 /* Any cleanup work */
681 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
682 smp_cpus_done(setup_max_cpus);
683}
684
685/*
686 * Call a function on all processors. May be used during early boot while
687 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
688 * of local_irq_disable/enable().
689 */
690int on_each_cpu(void (*func) (void *info), void *info, int wait)
691{
692 unsigned long flags;
693 int ret = 0;
694
695 preempt_disable();
696 ret = smp_call_function(func, info, wait);
697 local_irq_save(flags);
698 func(info);
699 local_irq_restore(flags);
700 preempt_enable();
701 return ret;
702}
703EXPORT_SYMBOL(on_each_cpu);
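
The refs/cpumask handshake above is a publish/consume pattern: fill the data block, issue a write barrier, then publish refs; the consumer checks its cpumask bit, issues a read barrier, then reads refs before touching func/info. A rough userspace analogue using C11 fences (illustrative only; this is not kernel code and the names are made up):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;                             /* stands in for func/info/cpumask */
static atomic_int refs;                         /* stands in for data->refs */

static void *producer(void *arg)
{
        payload = 42;                                   /* fill the data block */
        atomic_thread_fence(memory_order_release);      /* roughly smp_wmb() */
        atomic_store_explicit(&refs, 1, memory_order_relaxed);
        return NULL;
}

static void *consumer(void *arg)
{
        while (atomic_load_explicit(&refs, memory_order_relaxed) == 0)
                ;                                       /* wait for publication */
        atomic_thread_fence(memory_order_acquire);      /* roughly smp_rmb() */
        printf("payload=%d (must be 42)\n", payload);
        return NULL;
}

int main(void)
{
        pthread_t p, c;

        pthread_create(&c, NULL, consumer, NULL);
        pthread_create(&p, NULL, producer, NULL);
        pthread_join(p, NULL);
        pthread_join(c, NULL);
        return 0;
}
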
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..fca82c32042b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,11 +54,11 @@ EXPORT_SYMBOL(irq_stat);
54 54
55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
64/* 64/*
@@ -67,21 +67,31 @@ char *softirq_to_name[NR_SOFTIRQS] = {
67 * to the pending events, so lets the scheduler to balance 67 * to the pending events, so lets the scheduler to balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
77} 77}
78 78
79/* 79/*
80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled.
87 */
88
89/*
80 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
81 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
82 */ 92 */
83#ifdef CONFIG_TRACE_IRQFLAGS 93#ifdef CONFIG_TRACE_IRQFLAGS
84static void __local_bh_disable(unsigned long ip) 94static void __local_bh_disable(unsigned long ip, unsigned int cnt)
85{ 95{
86 unsigned long flags; 96 unsigned long flags;
87 97
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
95 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
96 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
97 */ 107 */
98 preempt_count() += SOFTIRQ_OFFSET; 108 preempt_count() += cnt;
99 /* 109 /*
100 * Were softirqs turned off above: 110 * Were softirqs turned off above:
101 */ 111 */
102 if (softirq_count() == SOFTIRQ_OFFSET) 112 if (softirq_count() == cnt)
103 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
104 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
105 115
106 if (preempt_count() == SOFTIRQ_OFFSET) 116 if (preempt_count() == cnt)
107 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
108} 118}
109#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
110static inline void __local_bh_disable(unsigned long ip) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
111{ 121{
112 add_preempt_count(SOFTIRQ_OFFSET); 122 add_preempt_count(cnt);
113 barrier(); 123 barrier();
114} 124}
115#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
116 126
117void local_bh_disable(void) 127void local_bh_disable(void)
118{ 128{
119 __local_bh_disable((unsigned long)__builtin_return_address(0)); 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET);
120} 131}
121 132
122EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
123 134
135static void __local_bh_enable(unsigned int cnt)
136{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled());
139
140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt);
143}
144
124/* 145/*
125 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
126 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
128 */ 149 */
129void _local_bh_enable(void) 150void _local_bh_enable(void)
130{ 151{
131 WARN_ON_ONCE(in_irq()); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
132 WARN_ON_ONCE(!irqs_disabled());
133
134 if (softirq_count() == SOFTIRQ_OFFSET)
135 trace_softirqs_on((unsigned long)__builtin_return_address(0));
136 sub_preempt_count(SOFTIRQ_OFFSET);
137} 153}
138 154
139EXPORT_SYMBOL(_local_bh_enable); 155EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
147 /* 163 /*
148 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
149 */ 165 */
150 if (softirq_count() == SOFTIRQ_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
151 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
152 /* 168 /*
153 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
154 * softirq processing: 170 * softirq processing:
155 */ 171 */
156 sub_preempt_count(SOFTIRQ_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
157 173
158 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
159 do_softirq(); 175 do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
198 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
199 account_system_vtime(current); 215 account_system_vtime(current);
200 216
201 __local_bh_disable((unsigned long)__builtin_return_address(0)); 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET);
202 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
203 220
204 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
@@ -212,18 +229,20 @@ restart:
212 229
213 do { 230 do {
214 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec;
215 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
217 234
218 trace_softirq_entry(h, softirq_vec); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236
237 trace_softirq_entry(vec_nr);
219 h->action(h); 238 h->action(h);
220 trace_softirq_exit(h, softirq_vec); 239 trace_softirq_exit(vec_nr);
221 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
222 printk(KERN_ERR "huh, entered softirq %td %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
223 "with preempt_count %08x," 242 "with preempt_count %08x,"
224 " exited with %08x?\n", h - softirq_vec, 243 " exited with %08x?\n", vec_nr,
225 softirq_to_name[h - softirq_vec], 244 softirq_to_name[vec_nr], h->action,
226 h->action, prev_count, preempt_count()); 245 prev_count, preempt_count());
227 preempt_count() = prev_count; 246 preempt_count() = prev_count;
228 } 247 }
229 248
@@ -245,7 +264,7 @@ restart:
245 lockdep_softirq_exit(); 264 lockdep_softirq_exit();
246 265
247 account_system_vtime(current); 266 account_system_vtime(current);
248 _local_bh_enable(); 267 __local_bh_enable(SOFTIRQ_OFFSET);
249} 268}
250 269
251#ifndef __ARCH_HAS_DO_SOFTIRQ 270#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,16 +298,42 @@ void irq_enter(void)
279 298
280 rcu_irq_enter(); 299 rcu_irq_enter();
281 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (idle_cpu(cpu) && !in_interrupt()) {
282 __irq_enter(); 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt.
304 */
305 local_bh_disable();
283 tick_check_idle(cpu); 306 tick_check_idle(cpu);
284 } else 307 _local_bh_enable();
285 __irq_enter(); 308 }
309
310 __irq_enter();
286} 311}
287 312
288#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
289# define invoke_softirq() __do_softirq() 314static inline void invoke_softirq(void)
315{
316 if (!force_irqthreads)
317 __do_softirq();
318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
324}
290#else 325#else
291# define invoke_softirq() do_softirq() 326static inline void invoke_softirq(void)
327{
328 if (!force_irqthreads)
329 do_softirq();
330 else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET);
333 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET);
335 }
336}
292#endif 337#endif
293 338
294/* 339/*
@@ -363,8 +408,8 @@ void __tasklet_schedule(struct tasklet_struct *t)
363 408
364 local_irq_save(flags); 409 local_irq_save(flags);
365 t->next = NULL; 410 t->next = NULL;
366 *__get_cpu_var(tasklet_vec).tail = t; 411 *__this_cpu_read(tasklet_vec.tail) = t;
367 __get_cpu_var(tasklet_vec).tail = &(t->next); 412 __this_cpu_write(tasklet_vec.tail, &(t->next));
368 raise_softirq_irqoff(TASKLET_SOFTIRQ); 413 raise_softirq_irqoff(TASKLET_SOFTIRQ);
369 local_irq_restore(flags); 414 local_irq_restore(flags);
370} 415}
@@ -377,8 +422,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
377 422
378 local_irq_save(flags); 423 local_irq_save(flags);
379 t->next = NULL; 424 t->next = NULL;
380 *__get_cpu_var(tasklet_hi_vec).tail = t; 425 *__this_cpu_read(tasklet_hi_vec.tail) = t;
381 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 426 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
382 raise_softirq_irqoff(HI_SOFTIRQ); 427 raise_softirq_irqoff(HI_SOFTIRQ);
383 local_irq_restore(flags); 428 local_irq_restore(flags);
384} 429}
@@ -389,8 +434,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
389{ 434{
390 BUG_ON(!irqs_disabled()); 435 BUG_ON(!irqs_disabled());
391 436
392 t->next = __get_cpu_var(tasklet_hi_vec).head; 437 t->next = __this_cpu_read(tasklet_hi_vec.head);
393 __get_cpu_var(tasklet_hi_vec).head = t; 438 __this_cpu_write(tasklet_hi_vec.head, t);
394 __raise_softirq_irqoff(HI_SOFTIRQ); 439 __raise_softirq_irqoff(HI_SOFTIRQ);
395} 440}
396 441
@@ -401,9 +446,9 @@ static void tasklet_action(struct softirq_action *a)
401 struct tasklet_struct *list; 446 struct tasklet_struct *list;
402 447
403 local_irq_disable(); 448 local_irq_disable();
404 list = __get_cpu_var(tasklet_vec).head; 449 list = __this_cpu_read(tasklet_vec.head);
405 __get_cpu_var(tasklet_vec).head = NULL; 450 __this_cpu_write(tasklet_vec.head, NULL);
406 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; 451 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
407 local_irq_enable(); 452 local_irq_enable();
408 453
409 while (list) { 454 while (list) {
@@ -424,8 +469,8 @@ static void tasklet_action(struct softirq_action *a)
424 469
425 local_irq_disable(); 470 local_irq_disable();
426 t->next = NULL; 471 t->next = NULL;
427 *__get_cpu_var(tasklet_vec).tail = t; 472 *__this_cpu_read(tasklet_vec.tail) = t;
428 __get_cpu_var(tasklet_vec).tail = &(t->next); 473 __this_cpu_write(tasklet_vec.tail, &(t->next));
429 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 474 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
430 local_irq_enable(); 475 local_irq_enable();
431 } 476 }
@@ -436,9 +481,9 @@ static void tasklet_hi_action(struct softirq_action *a)
436 struct tasklet_struct *list; 481 struct tasklet_struct *list;
437 482
438 local_irq_disable(); 483 local_irq_disable();
439 list = __get_cpu_var(tasklet_hi_vec).head; 484 list = __this_cpu_read(tasklet_hi_vec.head);
440 __get_cpu_var(tasklet_hi_vec).head = NULL; 485 __this_cpu_write(tasklet_hi_vec.head, NULL);
441 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; 486 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
442 local_irq_enable(); 487 local_irq_enable();
443 488
444 while (list) { 489 while (list) {
@@ -459,8 +504,8 @@ static void tasklet_hi_action(struct softirq_action *a)
459 504
460 local_irq_disable(); 505 local_irq_disable();
461 t->next = NULL; 506 t->next = NULL;
462 *__get_cpu_var(tasklet_hi_vec).tail = t; 507 *__this_cpu_read(tasklet_hi_vec.tail) = t;
463 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 508 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
464 __raise_softirq_irqoff(HI_SOFTIRQ); 509 __raise_softirq_irqoff(HI_SOFTIRQ);
465 local_irq_enable(); 510 local_irq_enable();
466 } 511 }
@@ -530,7 +575,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data)
530/** 575/**
531 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 576 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
532 * @ttimer: tasklet_hrtimer which is initialized 577 * @ttimer: tasklet_hrtimer which is initialized
533 * @function: hrtimer callback funtion which gets called from softirq context 578 * @function: hrtimer callback function which gets called from softirq context
534 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 579 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
535 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 580 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
536 */ 581 */
@@ -712,7 +757,10 @@ static int run_ksoftirqd(void * __bind_cpu)
712 don't process */ 757 don't process */
713 if (cpu_is_offline((long)__bind_cpu)) 758 if (cpu_is_offline((long)__bind_cpu))
714 goto wait_to_die; 759 goto wait_to_die;
715 do_softirq(); 760 local_irq_disable();
761 if (local_softirq_pending())
762 __do_softirq();
763 local_irq_enable();
716 preempt_enable_no_resched(); 764 preempt_enable_no_resched();
717 cond_resched(); 765 cond_resched();
718 preempt_disable(); 766 preempt_disable();
@@ -776,16 +824,16 @@ static void takeover_tasklets(unsigned int cpu)
776 824
777 /* Find end, append list for that CPU. */ 825 /* Find end, append list for that CPU. */
778 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 826 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
779 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; 827 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
780 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 828 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
781 per_cpu(tasklet_vec, cpu).head = NULL; 829 per_cpu(tasklet_vec, cpu).head = NULL;
782 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 830 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
783 } 831 }
784 raise_softirq_irqoff(TASKLET_SOFTIRQ); 832 raise_softirq_irqoff(TASKLET_SOFTIRQ);
785 833
786 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 834 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
787 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 835 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
788 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 836 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
789 per_cpu(tasklet_hi_vec, cpu).head = NULL; 837 per_cpu(tasklet_hi_vec, cpu).head = NULL;
790 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 838 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
791 } 839 }
@@ -805,7 +853,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
805 switch (action) { 853 switch (action) {
806 case CPU_UP_PREPARE: 854 case CPU_UP_PREPARE:
807 case CPU_UP_PREPARE_FROZEN: 855 case CPU_UP_PREPARE_FROZEN:
808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 856 p = kthread_create_on_node(run_ksoftirqd,
857 hcpu,
858 cpu_to_node(hotcpu),
859 "ksoftirqd/%d", hotcpu);
809 if (IS_ERR(p)) { 860 if (IS_ERR(p)) {
810 printk("ksoftirqd for %i failed\n", hotcpu); 861 printk("ksoftirqd for %i failed\n", hotcpu);
811 return notifier_from_errno(PTR_ERR(p)); 862 return notifier_from_errno(PTR_ERR(p));
@@ -827,7 +878,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
827 cpumask_any(cpu_online_mask)); 878 cpumask_any(cpu_online_mask));
828 case CPU_DEAD: 879 case CPU_DEAD:
829 case CPU_DEAD_FROZEN: { 880 case CPU_DEAD_FROZEN: {
830 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 881 static const struct sched_param param = {
882 .sched_priority = MAX_RT_PRIO-1
883 };
831 884
832 p = per_cpu(ksoftirqd, hotcpu); 885 p = per_cpu(ksoftirqd, hotcpu);
833 per_cpu(ksoftirqd, hotcpu) = NULL; 886 per_cpu(ksoftirqd, hotcpu) = NULL;
@@ -857,25 +910,6 @@ static __init int spawn_ksoftirqd(void)
857} 910}
858early_initcall(spawn_ksoftirqd); 911early_initcall(spawn_ksoftirqd);
859 912
860#ifdef CONFIG_SMP
861/*
862 * Call a function on all processors
863 */
864int on_each_cpu(void (*func) (void *info), void *info, int wait)
865{
866 int ret = 0;
867
868 preempt_disable();
869 ret = smp_call_function(func, info, wait);
870 local_irq_disable();
871 func(info);
872 local_irq_enable();
873 preempt_enable();
874 return ret;
875}
876EXPORT_SYMBOL(on_each_cpu);
877#endif
878
879/* 913/*
880 * [ These __weak aliases are kept in a separate compilation unit, so that 914 * [ These __weak aliases are kept in a separate compilation unit, so that
881 * GCC does not inline them incorrectly. ] 915 * GCC does not inline them incorrectly. ]
@@ -886,17 +920,14 @@ int __init __weak early_irq_init(void)
886 return 0; 920 return 0;
887} 921}
888 922
923#ifdef CONFIG_GENERIC_HARDIRQS
889int __init __weak arch_probe_nr_irqs(void) 924int __init __weak arch_probe_nr_irqs(void)
890{ 925{
891 return 0; 926 return NR_IRQS_LEGACY;
892} 927}
893 928
894int __init __weak arch_early_irq_init(void) 929int __init __weak arch_early_irq_init(void)
895{ 930{
896 return 0; 931 return 0;
897} 932}
898 933#endif
899int __weak arch_init_chip_data(struct irq_desc *desc, int node)
900{
901 return 0;
902}
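
The SOFTIRQ_DISABLE_OFFSET split earlier in this file lets softirq_count() distinguish "bh disabled" from "actually serving a softirq": local_bh_disable() moves the counter in steps of two, softirq entry in steps of one, so the low bit of the softirq field means "in softirq". A toy userspace model of that bookkeeping (constants chosen for the demo and not guaranteed to match the kernel's exact layout):

#include <stdio.h>

#define SOFTIRQ_SHIFT          8
#define SOFTIRQ_OFFSET         (1 << SOFTIRQ_SHIFT)
#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
#define SOFTIRQ_MASK           (0xff << SOFTIRQ_SHIFT)

static unsigned int preempt_count;

static unsigned int softirq_count(void)
{
        return preempt_count & SOFTIRQ_MASK;
}

int main(void)
{
        preempt_count += SOFTIRQ_DISABLE_OFFSET;        /* local_bh_disable() */
        printf("bh disabled: serving softirq? %s\n",
               (softirq_count() & SOFTIRQ_OFFSET) ? "yes" : "no");

        preempt_count += SOFTIRQ_OFFSET;                /* __do_softirq() entry */
        printf("in softirq:  serving softirq? %s\n",
               (softirq_count() & SOFTIRQ_OFFSET) ? "yes" : "no");
        return 0;
}
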
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd509..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/smp.h> 33#include <linux/smp.h>
34#include <linux/delay.h>
34#include <linux/srcu.h> 35#include <linux/srcu.h>
35 36
36static int init_srcu_struct_fields(struct srcu_struct *sp) 37static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -46,11 +47,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
46int __init_srcu_struct(struct srcu_struct *sp, const char *name, 47int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key) 48 struct lock_class_key *key)
48{ 49{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */ 50 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp)); 51 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0); 52 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp); 53 return init_srcu_struct_fields(sp);
55} 54}
56EXPORT_SYMBOL_GPL(__init_srcu_struct); 55EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -157,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
157EXPORT_SYMBOL_GPL(__srcu_read_unlock); 156EXPORT_SYMBOL_GPL(__srcu_read_unlock);
158 157
159/* 158/*
159 * We use an adaptive strategy for synchronize_srcu() and especially for
160 * synchronize_srcu_expedited(). We spin for a fixed time period
161 * (defined below) to allow SRCU readers to exit their read-side critical
162 * sections. If there are still some readers after 10 microseconds,
163 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter.
165 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10
167
168/*
160 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
161 */ 170 */
162static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -205,9 +214,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
205 * all srcu_read_lock() calls using the old counters have completed. 214 * all srcu_read_lock() calls using the old counters have completed.
206 * Their corresponding critical sections might well be still 215 * Their corresponding critical sections might well be still
207 * executing, but the srcu_read_lock() primitives themselves 216 * executing, but the srcu_read_lock() primitives themselves
208 * will have finished executing. 217 * will have finished executing. We initially give readers
218 * an arbitrarily chosen 10 microseconds to get out of their
219 * SRCU read-side critical sections, then loop waiting 1/HZ
220 * seconds per iteration. The 10-microsecond value has done
221 * very well in testing.
209 */ 222 */
210 223
224 if (srcu_readers_active_idx(sp, idx))
225 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
211 while (srcu_readers_active_idx(sp, idx)) 226 while (srcu_readers_active_idx(sp, idx))
212 schedule_timeout_interruptible(1); 227 schedule_timeout_interruptible(1);
213 228
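
The adaptive wait added above (udelay for SYNCHRONIZE_SRCU_READER_DELAY microseconds, then 1-jiffy sleeps) is tuned for the expectation that SRCU read-side critical sections are short. A hedged sketch of the reader/updater pairing that the grace period waits on; the srcu_struct and the protected pointer are made up for illustration:

    #include <linux/srcu.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    static struct srcu_struct my_srcu;      /* init_srcu_struct(&my_srcu) at setup time */
    static void __rcu *my_ptr;

    static void reader(void)
    {
            int idx;
            void *p;

            idx = srcu_read_lock(&my_srcu);           /* enter read-side section */
            p = srcu_dereference(my_ptr, &my_srcu);   /* p stays valid until unlock */
            /* ... short critical section using p ... */
            srcu_read_unlock(&my_srcu, idx);
    }

    static void retire(void *old)
    {
            /*
             * Returns once every reader that could still see 'old' has unlocked:
             * first the 10-microsecond spin, then 1-jiffy sleeps if readers remain.
             */
            synchronize_srcu(&my_srcu);
            kfree(old);
    }
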
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4372ccb25127..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -262,7 +262,7 @@ repeat:
262 cpu_stop_fn_t fn = work->fn; 262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg; 263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done; 264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN]; 265 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
266 266
267 __set_current_state(TASK_RUNNING); 267 __set_current_state(TASK_RUNNING);
268 268
@@ -287,11 +287,12 @@ repeat:
287 goto repeat; 287 goto repeat;
288} 288}
289 289
290extern void sched_set_stop_task(int cpu, struct task_struct *stop);
291
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 292/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 293static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu) 294 unsigned long action, void *hcpu)
293{ 295{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu; 296 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 297 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p; 298 struct task_struct *p;
@@ -300,17 +301,19 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
300 case CPU_UP_PREPARE: 301 case CPU_UP_PREPARE:
301 BUG_ON(stopper->thread || stopper->enabled || 302 BUG_ON(stopper->thread || stopper->enabled ||
302 !list_empty(&stopper->works)); 303 !list_empty(&stopper->works));
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create_on_node(cpu_stopper_thread,
304 cpu); 305 stopper,
306 cpu_to_node(cpu),
307 "migration/%d", cpu);
305 if (IS_ERR(p)) 308 if (IS_ERR(p))
306 return NOTIFY_BAD; 309 return notifier_from_errno(PTR_ERR(p));
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p); 310 get_task_struct(p);
311 kthread_bind(p, cpu);
312 sched_set_stop_task(cpu, p);
309 stopper->thread = p; 313 stopper->thread = p;
310 break; 314 break;
311 315
312 case CPU_ONLINE: 316 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */ 317 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread); 318 wake_up_process(stopper->thread);
316 /* mark enabled */ 319 /* mark enabled */
@@ -325,6 +328,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
325 { 328 {
326 struct cpu_stop_work *work; 329 struct cpu_stop_work *work;
327 330
331 sched_set_stop_task(cpu, NULL);
328 /* kill the stopper */ 332 /* kill the stopper */
329 kthread_stop(stopper->thread); 333 kthread_stop(stopper->thread);
330 /* drain remaining works */ 334 /* drain remaining works */
@@ -370,7 +374,7 @@ static int __init cpu_stop_init(void)
370 /* start one for the boot cpu */ 374 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, 375 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu); 376 bcpu);
373 BUG_ON(err == NOTIFY_BAD); 377 BUG_ON(err != NOTIFY_OK);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 378 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier); 379 register_cpu_notifier(&cpu_stop_cpu_notifier);
376 380
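
The migration/N thread created above is the worker behind the cpu_stop API: it is now registered with the scheduler via sched_set_stop_task(), which gives it the highest effective priority, instead of the SCHED_FIFO setting the notifier used to apply. A rough sketch of a caller of that machinery (the callback and its use are hypothetical):

    #include <linux/stop_machine.h>

    static int quiesce_fn(void *arg)
    {
            /*
             * Runs on the target CPU from its migration/N stopper thread,
             * with nothing else able to preempt it on that CPU.
             */
            return 0;
    }

    static int quiesce_cpu(unsigned int cpu)
    {
            /* Queues the work and sleeps until quiesce_fn() has run on @cpu. */
            return stop_one_cpu(cpu, quiesce_fn, NULL);
    }
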
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..e4128b278f23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -37,12 +37,15 @@
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h> 39#include <linux/gfp.h>
40#include <linux/syscore_ops.h>
40 41
41#include <linux/compat.h> 42#include <linux/compat.h>
42#include <linux/syscalls.h> 43#include <linux/syscalls.h>
43#include <linux/kprobes.h> 44#include <linux/kprobes.h>
44#include <linux/user_namespace.h> 45#include <linux/user_namespace.h>
45 46
47#include <linux/kmsg_dump.h>
48
46#include <asm/uaccess.h> 49#include <asm/uaccess.h>
47#include <asm/io.h> 50#include <asm/io.h>
48#include <asm/unistd.h> 51#include <asm/unistd.h>
@@ -117,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
117void (*pm_power_off_prepare)(void); 120void (*pm_power_off_prepare)(void);
118 121
119/* 122/*
123 * Returns true if current's euid is same as p's uid or euid,
124 * or has CAP_SYS_NICE to p's user_ns.
125 *
126 * Called with rcu_read_lock, creds are safe
127 */
128static bool set_one_prio_perm(struct task_struct *p)
129{
130 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
131
132 if (pcred->user->user_ns == cred->user->user_ns &&
133 (pcred->uid == cred->euid ||
134 pcred->euid == cred->euid))
135 return true;
136 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
137 return true;
138 return false;
139}
140
141/*
120 * set the priority of a task 142 * set the priority of a task
121 * - the caller must hold the RCU read lock 143 * - the caller must hold the RCU read lock
122 */ 144 */
123static int set_one_prio(struct task_struct *p, int niceval, int error) 145static int set_one_prio(struct task_struct *p, int niceval, int error)
124{ 146{
125 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
126 int no_nice; 147 int no_nice;
127 148
128 if (pcred->uid != cred->euid && 149 if (!set_one_prio_perm(p)) {
129 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
130 error = -EPERM; 150 error = -EPERM;
131 goto out; 151 goto out;
132 } 152 }
@@ -285,6 +305,7 @@ out_unlock:
285 */ 305 */
286void emergency_restart(void) 306void emergency_restart(void)
287{ 307{
308 kmsg_dump(KMSG_DUMP_EMERG);
288 machine_emergency_restart(); 309 machine_emergency_restart();
289} 310}
290EXPORT_SYMBOL_GPL(emergency_restart); 311EXPORT_SYMBOL_GPL(emergency_restart);
@@ -293,8 +314,9 @@ void kernel_restart_prepare(char *cmd)
293{ 314{
294 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
295 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
317 usermodehelper_disable();
296 device_shutdown(); 318 device_shutdown();
297 sysdev_shutdown(); 319 syscore_shutdown();
298} 320}
299 321
300/** 322/**
@@ -312,6 +334,7 @@ void kernel_restart(char *cmd)
312 printk(KERN_EMERG "Restarting system.\n"); 334 printk(KERN_EMERG "Restarting system.\n");
313 else 335 else
314 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 336 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
337 kmsg_dump(KMSG_DUMP_RESTART);
315 machine_restart(cmd); 338 machine_restart(cmd);
316} 339}
317EXPORT_SYMBOL_GPL(kernel_restart); 340EXPORT_SYMBOL_GPL(kernel_restart);
@@ -321,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state)
321 blocking_notifier_call_chain(&reboot_notifier_list, 344 blocking_notifier_call_chain(&reboot_notifier_list,
322 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
323 system_state = state; 346 system_state = state;
347 usermodehelper_disable();
324 device_shutdown(); 348 device_shutdown();
325} 349}
326/** 350/**
@@ -331,8 +355,9 @@ static void kernel_shutdown_prepare(enum system_states state)
331void kernel_halt(void) 355void kernel_halt(void)
332{ 356{
333 kernel_shutdown_prepare(SYSTEM_HALT); 357 kernel_shutdown_prepare(SYSTEM_HALT);
334 sysdev_shutdown(); 358 syscore_shutdown();
335 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
360 kmsg_dump(KMSG_DUMP_HALT);
336 machine_halt(); 361 machine_halt();
337} 362}
338 363
@@ -349,8 +374,9 @@ void kernel_power_off(void)
349 if (pm_power_off_prepare) 374 if (pm_power_off_prepare)
350 pm_power_off_prepare(); 375 pm_power_off_prepare();
351 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
352 sysdev_shutdown(); 377 syscore_shutdown();
353 printk(KERN_EMERG "Power down.\n"); 378 printk(KERN_EMERG "Power down.\n");
379 kmsg_dump(KMSG_DUMP_POWEROFF);
354 machine_power_off(); 380 machine_power_off();
355} 381}
356EXPORT_SYMBOL_GPL(kernel_power_off); 382EXPORT_SYMBOL_GPL(kernel_power_off);
@@ -496,7 +522,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
496 if (rgid != (gid_t) -1) { 522 if (rgid != (gid_t) -1) {
497 if (old->gid == rgid || 523 if (old->gid == rgid ||
498 old->egid == rgid || 524 old->egid == rgid ||
499 capable(CAP_SETGID)) 525 nsown_capable(CAP_SETGID))
500 new->gid = rgid; 526 new->gid = rgid;
501 else 527 else
502 goto error; 528 goto error;
@@ -505,7 +531,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
505 if (old->gid == egid || 531 if (old->gid == egid ||
506 old->egid == egid || 532 old->egid == egid ||
507 old->sgid == egid || 533 old->sgid == egid ||
508 capable(CAP_SETGID)) 534 nsown_capable(CAP_SETGID))
509 new->egid = egid; 535 new->egid = egid;
510 else 536 else
511 goto error; 537 goto error;
@@ -540,7 +566,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
540 old = current_cred(); 566 old = current_cred();
541 567
542 retval = -EPERM; 568 retval = -EPERM;
543 if (capable(CAP_SETGID)) 569 if (nsown_capable(CAP_SETGID))
544 new->gid = new->egid = new->sgid = new->fsgid = gid; 570 new->gid = new->egid = new->sgid = new->fsgid = gid;
545 else if (gid == old->gid || gid == old->sgid) 571 else if (gid == old->gid || gid == old->sgid)
546 new->egid = new->fsgid = gid; 572 new->egid = new->fsgid = gid;
@@ -607,7 +633,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
607 new->uid = ruid; 633 new->uid = ruid;
608 if (old->uid != ruid && 634 if (old->uid != ruid &&
609 old->euid != ruid && 635 old->euid != ruid &&
610 !capable(CAP_SETUID)) 636 !nsown_capable(CAP_SETUID))
611 goto error; 637 goto error;
612 } 638 }
613 639
@@ -616,7 +642,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
616 if (old->uid != euid && 642 if (old->uid != euid &&
617 old->euid != euid && 643 old->euid != euid &&
618 old->suid != euid && 644 old->suid != euid &&
619 !capable(CAP_SETUID)) 645 !nsown_capable(CAP_SETUID))
620 goto error; 646 goto error;
621 } 647 }
622 648
@@ -664,7 +690,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
664 old = current_cred(); 690 old = current_cred();
665 691
666 retval = -EPERM; 692 retval = -EPERM;
667 if (capable(CAP_SETUID)) { 693 if (nsown_capable(CAP_SETUID)) {
668 new->suid = new->uid = uid; 694 new->suid = new->uid = uid;
669 if (uid != old->uid) { 695 if (uid != old->uid) {
670 retval = set_user(new); 696 retval = set_user(new);
@@ -706,7 +732,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
706 old = current_cred(); 732 old = current_cred();
707 733
708 retval = -EPERM; 734 retval = -EPERM;
709 if (!capable(CAP_SETUID)) { 735 if (!nsown_capable(CAP_SETUID)) {
710 if (ruid != (uid_t) -1 && ruid != old->uid && 736 if (ruid != (uid_t) -1 && ruid != old->uid &&
711 ruid != old->euid && ruid != old->suid) 737 ruid != old->euid && ruid != old->suid)
712 goto error; 738 goto error;
@@ -770,7 +796,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
770 old = current_cred(); 796 old = current_cred();
771 797
772 retval = -EPERM; 798 retval = -EPERM;
773 if (!capable(CAP_SETGID)) { 799 if (!nsown_capable(CAP_SETGID)) {
774 if (rgid != (gid_t) -1 && rgid != old->gid && 800 if (rgid != (gid_t) -1 && rgid != old->gid &&
775 rgid != old->egid && rgid != old->sgid) 801 rgid != old->egid && rgid != old->sgid)
776 goto error; 802 goto error;
@@ -830,7 +856,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
830 856
831 if (uid == old->uid || uid == old->euid || 857 if (uid == old->uid || uid == old->euid ||
832 uid == old->suid || uid == old->fsuid || 858 uid == old->suid || uid == old->fsuid ||
833 capable(CAP_SETUID)) { 859 nsown_capable(CAP_SETUID)) {
834 if (uid != old_fsuid) { 860 if (uid != old_fsuid) {
835 new->fsuid = uid; 861 new->fsuid = uid;
836 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 862 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -863,7 +889,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
863 889
864 if (gid == old->gid || gid == old->egid || 890 if (gid == old->gid || gid == old->egid ||
865 gid == old->sgid || gid == old->fsgid || 891 gid == old->sgid || gid == old->fsgid ||
866 capable(CAP_SETGID)) { 892 nsown_capable(CAP_SETGID)) {
867 if (gid != old_fsgid) { 893 if (gid != old_fsgid) {
868 new->fsgid = gid; 894 new->fsgid = gid;
869 goto change_okay; 895 goto change_okay;
@@ -1080,8 +1106,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1106 err = session;
1081out: 1107out:
1082 write_unlock_irq(&tasklist_lock); 1108 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1109 if (err > 0) {
1084 proc_sid_connector(group_leader); 1110 proc_sid_connector(group_leader);
1111 sched_autogroup_create_attach(group_leader);
1112 }
1085 return err; 1113 return err;
1086} 1114}
1087 1115
@@ -1169,8 +1197,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1169 int errno; 1197 int errno;
1170 char tmp[__NEW_UTS_LEN]; 1198 char tmp[__NEW_UTS_LEN];
1171 1199
1172 if (!capable(CAP_SYS_ADMIN)) 1200 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1173 return -EPERM; 1201 return -EPERM;
1202
1174 if (len < 0 || len > __NEW_UTS_LEN) 1203 if (len < 0 || len > __NEW_UTS_LEN)
1175 return -EINVAL; 1204 return -EINVAL;
1176 down_write(&uts_sem); 1205 down_write(&uts_sem);
@@ -1218,7 +1247,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1218 int errno; 1247 int errno;
1219 char tmp[__NEW_UTS_LEN]; 1248 char tmp[__NEW_UTS_LEN];
1220 1249
1221 if (!capable(CAP_SYS_ADMIN)) 1250 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1222 return -EPERM; 1251 return -EPERM;
1223 if (len < 0 || len > __NEW_UTS_LEN) 1252 if (len < 0 || len > __NEW_UTS_LEN)
1224 return -EINVAL; 1253 return -EINVAL;
@@ -1333,6 +1362,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1333 rlim = tsk->signal->rlim + resource; 1362 rlim = tsk->signal->rlim + resource;
1334 task_lock(tsk->group_leader); 1363 task_lock(tsk->group_leader);
1335 if (new_rlim) { 1364 if (new_rlim) {
1365 /* Keep the capable check against init_user_ns until
1366 cgroups can contain all limits */
1336 if (new_rlim->rlim_max > rlim->rlim_max && 1367 if (new_rlim->rlim_max > rlim->rlim_max &&
1337 !capable(CAP_SYS_RESOURCE)) 1368 !capable(CAP_SYS_RESOURCE))
1338 retval = -EPERM; 1369 retval = -EPERM;
@@ -1376,18 +1407,22 @@ static int check_prlimit_permission(struct task_struct *task)
1376{ 1407{
1377 const struct cred *cred = current_cred(), *tcred; 1408 const struct cred *cred = current_cred(), *tcred;
1378 1409
1379 tcred = __task_cred(task); 1410 if (current == task)
1380 if ((cred->uid != tcred->euid || 1411 return 0;
1381 cred->uid != tcred->suid ||
1382 cred->uid != tcred->uid ||
1383 cred->gid != tcred->egid ||
1384 cred->gid != tcred->sgid ||
1385 cred->gid != tcred->gid) &&
1386 !capable(CAP_SYS_RESOURCE)) {
1387 return -EPERM;
1388 }
1389 1412
1390 return 0; 1413 tcred = __task_cred(task);
1414 if (cred->user->user_ns == tcred->user->user_ns &&
1415 (cred->uid == tcred->euid &&
1416 cred->uid == tcred->suid &&
1417 cred->uid == tcred->uid &&
1418 cred->gid == tcred->egid &&
1419 cred->gid == tcred->sgid &&
1420 cred->gid == tcred->gid))
1421 return 0;
1422 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1423 return 0;
1424
1425 return -EPERM;
1391} 1426}
1392 1427
1393SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1428SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
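
check_prlimit_permission() above now short-circuits for a task acting on itself, otherwise requires a full uid/gid match within the same user namespace, and finally falls back to CAP_SYS_RESOURCE checked against the target's namespace. From user space the gated operation is the prlimit(2) call; a trimmed, illustrative example (limit values arbitrary, requires a glibc that provides the prlimit() wrapper):

    #define _GNU_SOURCE
    #include <sys/resource.h>
    #include <sys/types.h>
    #include <stdio.h>

    /* Raise another process's RLIMIT_NOFILE; this lands in prlimit64 above. */
    static int raise_nofile(pid_t pid)
    {
            struct rlimit newlim = { .rlim_cur = 4096, .rlim_max = 4096 };
            struct rlimit oldlim;

            if (prlimit(pid, RLIMIT_NOFILE, &newlim, &oldlim) != 0) {
                    perror("prlimit");  /* EPERM when check_prlimit_permission() refuses */
                    return -1;
            }
            printf("previous soft limit: %llu\n",
                   (unsigned long long)oldlim.rlim_cur);
            return 0;
    }
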
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bad369ec5403..62cbc8877fef 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,10 +46,13 @@ cond_syscall(sys_getsockopt);
46cond_syscall(compat_sys_getsockopt); 46cond_syscall(compat_sys_getsockopt);
47cond_syscall(sys_shutdown); 47cond_syscall(sys_shutdown);
48cond_syscall(sys_sendmsg); 48cond_syscall(sys_sendmsg);
49cond_syscall(sys_sendmmsg);
49cond_syscall(compat_sys_sendmsg); 50cond_syscall(compat_sys_sendmsg);
51cond_syscall(compat_sys_sendmmsg);
50cond_syscall(sys_recvmsg); 52cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg); 53cond_syscall(sys_recvmmsg);
52cond_syscall(compat_sys_recvmsg); 54cond_syscall(compat_sys_recvmsg);
55cond_syscall(compat_sys_recv);
53cond_syscall(compat_sys_recvfrom); 56cond_syscall(compat_sys_recvfrom);
54cond_syscall(compat_sys_recvmmsg); 57cond_syscall(compat_sys_recvmmsg);
55cond_syscall(sys_socketcall); 58cond_syscall(sys_socketcall);
@@ -68,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait);
68cond_syscall(sys_semget); 71cond_syscall(sys_semget);
69cond_syscall(sys_semop); 72cond_syscall(sys_semop);
70cond_syscall(sys_semtimedop); 73cond_syscall(sys_semtimedop);
74cond_syscall(compat_sys_semtimedop);
71cond_syscall(sys_semctl); 75cond_syscall(sys_semctl);
76cond_syscall(compat_sys_semctl);
72cond_syscall(sys_msgget); 77cond_syscall(sys_msgget);
73cond_syscall(sys_msgsnd); 78cond_syscall(sys_msgsnd);
79cond_syscall(compat_sys_msgsnd);
74cond_syscall(sys_msgrcv); 80cond_syscall(sys_msgrcv);
81cond_syscall(compat_sys_msgrcv);
75cond_syscall(sys_msgctl); 82cond_syscall(sys_msgctl);
83cond_syscall(compat_sys_msgctl);
76cond_syscall(sys_shmget); 84cond_syscall(sys_shmget);
77cond_syscall(sys_shmat); 85cond_syscall(sys_shmat);
86cond_syscall(compat_sys_shmat);
78cond_syscall(sys_shmdt); 87cond_syscall(sys_shmdt);
79cond_syscall(sys_shmctl); 88cond_syscall(sys_shmctl);
89cond_syscall(compat_sys_shmctl);
80cond_syscall(sys_mq_open); 90cond_syscall(sys_mq_open);
81cond_syscall(sys_mq_unlink); 91cond_syscall(sys_mq_unlink);
82cond_syscall(sys_mq_timedsend); 92cond_syscall(sys_mq_timedsend);
@@ -185,3 +195,8 @@ cond_syscall(sys_perf_event_open);
185/* fanotify! */ 195/* fanotify! */
186cond_syscall(sys_fanotify_init); 196cond_syscall(sys_fanotify_init);
187cond_syscall(sys_fanotify_mark); 197cond_syscall(sys_fanotify_mark);
198
199/* open by handle */
200cond_syscall(sys_name_to_handle_at);
201cond_syscall(sys_open_by_handle_at);
202cond_syscall(compat_sys_open_by_handle_at);
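
The cond_syscall() entries added here mark syscalls as optional: when the code implementing one is configured out, the symbol resolves to sys_ni_syscall(), which simply returns -ENOSYS. The weak-alias idea behind it, sketched in plain GCC C rather than the kernel's actual macro:

    #include <errno.h>

    long sys_ni_syscall(void)
    {
            return -ENOSYS;         /* generic "not implemented" return */
    }

    /*
     * Weak alias: if no strong definition of sys_name_to_handle_at is linked in,
     * calls to it quietly resolve to sys_ni_syscall instead.
     */
    long sys_name_to_handle_at(void)
            __attribute__((weak, alias("sys_ni_syscall")));
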
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3a45c224770f..f175d98bd355 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/printk.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/security.h> 29#include <linux/security.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
@@ -55,6 +56,7 @@
55#include <linux/kprobes.h> 56#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
57#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h>
58 60
59#include <asm/uaccess.h> 61#include <asm/uaccess.h>
60#include <asm/processor.h> 62#include <asm/processor.h>
@@ -116,6 +118,7 @@ static int neg_one = -1;
116static int zero; 118static int zero;
117static int __maybe_unused one = 1; 119static int __maybe_unused one = 1;
118static int __maybe_unused two = 2; 120static int __maybe_unused two = 2;
121static int __maybe_unused three = 3;
119static unsigned long one_ul = 1; 122static unsigned long one_ul = 1;
120static int one_hundred = 100; 123static int one_hundred = 100;
121#ifdef CONFIG_PRINTK 124#ifdef CONFIG_PRINTK
@@ -161,8 +164,6 @@ extern int no_unaligned_warning;
161extern int unaligned_dump_stack; 164extern int unaligned_dump_stack;
162#endif 165#endif
163 166
164extern struct ratelimit_state printk_ratelimit_state;
165
166#ifdef CONFIG_PROC_SYSCTL 167#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 168static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 169 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -170,8 +171,14 @@ static int proc_taint(struct ctl_table *table, int write,
170 void __user *buffer, size_t *lenp, loff_t *ppos); 171 void __user *buffer, size_t *lenp, loff_t *ppos);
171#endif 172#endif
172 173
174#ifdef CONFIG_PRINTK
175static int proc_dmesg_restrict(struct ctl_table *table, int write,
176 void __user *buffer, size_t *lenp, loff_t *ppos);
177#endif
178
173#ifdef CONFIG_MAGIC_SYSRQ 179#ifdef CONFIG_MAGIC_SYSRQ
174static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ 180/* Note: sysrq code uses its own private copy */
181static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
175 182
176static int sysrq_sysctl_handler(ctl_table *table, int write, 183static int sysrq_sysctl_handler(ctl_table *table, int write,
177 void __user *buffer, size_t *lenp, 184 void __user *buffer, size_t *lenp,
@@ -194,9 +201,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
194static struct ctl_table root_table[]; 201static struct ctl_table root_table[];
195static struct ctl_table_root sysctl_table_root; 202static struct ctl_table_root sysctl_table_root;
196static struct ctl_table_header root_table_header = { 203static struct ctl_table_header root_table_header = {
197 .count = 1, 204 {{.count = 1,
198 .ctl_table = root_table, 205 .ctl_table = root_table,
199 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), 206 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
200 .root = &sysctl_table_root, 207 .root = &sysctl_table_root,
201 .set = &sysctl_table_root.default_set, 208 .set = &sysctl_table_root.default_set,
202}; 209};
@@ -247,10 +254,6 @@ static struct ctl_table root_table[] = {
247 .mode = 0555, 254 .mode = 0555,
248 .child = dev_table, 255 .child = dev_table,
249 }, 256 },
250/*
251 * NOTE: do not add new entries to this table unless you have read
252 * Documentation/sysctl/ctl_unnumbered.txt
253 */
254 { } 257 { }
255}; 258};
256 259
@@ -261,8 +264,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
261static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 264static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
262static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 265static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
263static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 266static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
264static int min_sched_shares_ratelimit = 100000; /* 100 usec */
265static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
266#endif 267#endif
267 268
268#ifdef CONFIG_COMPACTION 269#ifdef CONFIG_COMPACTION
@@ -307,15 +308,6 @@ static struct ctl_table kern_table[] = {
307 .extra2 = &max_wakeup_granularity_ns, 308 .extra2 = &max_wakeup_granularity_ns,
308 }, 309 },
309 { 310 {
310 .procname = "sched_shares_ratelimit",
311 .data = &sysctl_sched_shares_ratelimit,
312 .maxlen = sizeof(unsigned int),
313 .mode = 0644,
314 .proc_handler = sched_proc_update_handler,
315 .extra1 = &min_sched_shares_ratelimit,
316 .extra2 = &max_sched_shares_ratelimit,
317 },
318 {
319 .procname = "sched_tunable_scaling", 311 .procname = "sched_tunable_scaling",
320 .data = &sysctl_sched_tunable_scaling, 312 .data = &sysctl_sched_tunable_scaling,
321 .maxlen = sizeof(enum sched_tunable_scaling), 313 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -325,14 +317,6 @@ static struct ctl_table kern_table[] = {
325 .extra2 = &max_sched_tunable_scaling, 317 .extra2 = &max_sched_tunable_scaling,
326 }, 318 },
327 { 319 {
328 .procname = "sched_shares_thresh",
329 .data = &sysctl_sched_shares_thresh,
330 .maxlen = sizeof(unsigned int),
331 .mode = 0644,
332 .proc_handler = proc_dointvec_minmax,
333 .extra1 = &zero,
334 },
335 {
336 .procname = "sched_migration_cost", 320 .procname = "sched_migration_cost",
337 .data = &sysctl_sched_migration_cost, 321 .data = &sysctl_sched_migration_cost,
338 .maxlen = sizeof(unsigned int), 322 .maxlen = sizeof(unsigned int),
@@ -354,6 +338,13 @@ static struct ctl_table kern_table[] = {
354 .proc_handler = proc_dointvec, 338 .proc_handler = proc_dointvec,
355 }, 339 },
356 { 340 {
341 .procname = "sched_shares_window",
342 .data = &sysctl_sched_shares_window,
343 .maxlen = sizeof(unsigned int),
344 .mode = 0644,
345 .proc_handler = proc_dointvec,
346 },
347 {
357 .procname = "timer_migration", 348 .procname = "timer_migration",
358 .data = &sysctl_timer_migration, 349 .data = &sysctl_timer_migration,
359 .maxlen = sizeof(unsigned int), 350 .maxlen = sizeof(unsigned int),
@@ -377,13 +368,17 @@ static struct ctl_table kern_table[] = {
377 .mode = 0644, 368 .mode = 0644,
378 .proc_handler = sched_rt_handler, 369 .proc_handler = sched_rt_handler,
379 }, 370 },
371#ifdef CONFIG_SCHED_AUTOGROUP
380 { 372 {
381 .procname = "sched_compat_yield", 373 .procname = "sched_autogroup_enabled",
382 .data = &sysctl_sched_compat_yield, 374 .data = &sysctl_sched_autogroup_enabled,
383 .maxlen = sizeof(unsigned int), 375 .maxlen = sizeof(unsigned int),
384 .mode = 0644, 376 .mode = 0644,
385 .proc_handler = proc_dointvec, 377 .proc_handler = proc_dointvec_minmax,
378 .extra1 = &zero,
379 .extra2 = &one,
386 }, 380 },
381#endif
387#ifdef CONFIG_PROVE_LOCKING 382#ifdef CONFIG_PROVE_LOCKING
388 { 383 {
389 .procname = "prove_locking", 384 .procname = "prove_locking",
@@ -622,6 +617,11 @@ static struct ctl_table kern_table[] = {
622 .child = random_table, 617 .child = random_table,
623 }, 618 },
624 { 619 {
620 .procname = "usermodehelper",
621 .mode = 0555,
622 .child = usermodehelper_table,
623 },
624 {
625 .procname = "overflowuid", 625 .procname = "overflowuid",
626 .data = &overflowuid, 626 .data = &overflowuid,
627 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
@@ -704,6 +704,24 @@ static struct ctl_table kern_table[] = {
704 .extra1 = &zero, 704 .extra1 = &zero,
705 .extra2 = &ten_thousand, 705 .extra2 = &ten_thousand,
706 }, 706 },
707 {
708 .procname = "dmesg_restrict",
709 .data = &dmesg_restrict,
710 .maxlen = sizeof(int),
711 .mode = 0644,
712 .proc_handler = proc_dointvec_minmax,
713 .extra1 = &zero,
714 .extra2 = &one,
715 },
716 {
717 .procname = "kptr_restrict",
718 .data = &kptr_restrict,
719 .maxlen = sizeof(int),
720 .mode = 0644,
721 .proc_handler = proc_dmesg_restrict,
722 .extra1 = &zero,
723 .extra2 = &two,
724 },
707#endif 725#endif
708 { 726 {
709 .procname = "ngroups_max", 727 .procname = "ngroups_max",
@@ -718,14 +736,16 @@ static struct ctl_table kern_table[] = {
718 .data = &watchdog_enabled, 736 .data = &watchdog_enabled,
719 .maxlen = sizeof (int), 737 .maxlen = sizeof (int),
720 .mode = 0644, 738 .mode = 0644,
721 .proc_handler = proc_dowatchdog_enabled, 739 .proc_handler = proc_dowatchdog,
740 .extra1 = &zero,
741 .extra2 = &one,
722 }, 742 },
723 { 743 {
724 .procname = "watchdog_thresh", 744 .procname = "watchdog_thresh",
725 .data = &softlockup_thresh, 745 .data = &watchdog_thresh,
726 .maxlen = sizeof(int), 746 .maxlen = sizeof(int),
727 .mode = 0644, 747 .mode = 0644,
728 .proc_handler = proc_dowatchdog_thresh, 748 .proc_handler = proc_dowatchdog,
729 .extra1 = &neg_one, 749 .extra1 = &neg_one,
730 .extra2 = &sixty, 750 .extra2 = &sixty,
731 }, 751 },
@@ -738,21 +758,23 @@ static struct ctl_table kern_table[] = {
738 .extra1 = &zero, 758 .extra1 = &zero,
739 .extra2 = &one, 759 .extra2 = &one,
740 }, 760 },
741#endif
742#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
743 { 761 {
744 .procname = "unknown_nmi_panic", 762 .procname = "nmi_watchdog",
745 .data = &unknown_nmi_panic, 763 .data = &watchdog_enabled,
746 .maxlen = sizeof (int), 764 .maxlen = sizeof (int),
747 .mode = 0644, 765 .mode = 0644,
748 .proc_handler = proc_dointvec, 766 .proc_handler = proc_dowatchdog,
767 .extra1 = &zero,
768 .extra2 = &one,
749 }, 769 },
770#endif
771#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
750 { 772 {
751 .procname = "nmi_watchdog", 773 .procname = "unknown_nmi_panic",
752 .data = &nmi_watchdog_enabled, 774 .data = &unknown_nmi_panic,
753 .maxlen = sizeof (int), 775 .maxlen = sizeof (int),
754 .mode = 0644, 776 .mode = 0644,
755 .proc_handler = proc_nmi_enabled, 777 .proc_handler = proc_dointvec,
756 }, 778 },
757#endif 779#endif
758#if defined(CONFIG_X86) 780#if defined(CONFIG_X86)
@@ -916,6 +938,12 @@ static struct ctl_table kern_table[] = {
916 }, 938 },
917#endif 939#endif
918#ifdef CONFIG_PERF_EVENTS 940#ifdef CONFIG_PERF_EVENTS
941 /*
942 * User-space scripts rely on the existence of this file
943 * as a feature check for perf_events being enabled.
944 *
945 * So it's an ABI, do not remove!
946 */
919 { 947 {
920 .procname = "perf_event_paranoid", 948 .procname = "perf_event_paranoid",
921 .data = &sysctl_perf_event_paranoid, 949 .data = &sysctl_perf_event_paranoid,
@@ -935,7 +963,7 @@ static struct ctl_table kern_table[] = {
935 .data = &sysctl_perf_event_sample_rate, 963 .data = &sysctl_perf_event_sample_rate,
936 .maxlen = sizeof(sysctl_perf_event_sample_rate), 964 .maxlen = sizeof(sysctl_perf_event_sample_rate),
937 .mode = 0644, 965 .mode = 0644,
938 .proc_handler = proc_dointvec, 966 .proc_handler = perf_proc_update_handler,
939 }, 967 },
940#endif 968#endif
941#ifdef CONFIG_KMEMCHECK 969#ifdef CONFIG_KMEMCHECK
@@ -956,10 +984,6 @@ static struct ctl_table kern_table[] = {
956 .proc_handler = proc_dointvec, 984 .proc_handler = proc_dointvec,
957 }, 985 },
958#endif 986#endif
959/*
960 * NOTE: do not add new entries to this table unless you have read
961 * Documentation/sysctl/ctl_unnumbered.txt
962 */
963 { } 987 { }
964}; 988};
965 989
@@ -969,14 +993,18 @@ static struct ctl_table vm_table[] = {
969 .data = &sysctl_overcommit_memory, 993 .data = &sysctl_overcommit_memory,
970 .maxlen = sizeof(sysctl_overcommit_memory), 994 .maxlen = sizeof(sysctl_overcommit_memory),
971 .mode = 0644, 995 .mode = 0644,
972 .proc_handler = proc_dointvec, 996 .proc_handler = proc_dointvec_minmax,
997 .extra1 = &zero,
998 .extra2 = &two,
973 }, 999 },
974 { 1000 {
975 .procname = "panic_on_oom", 1001 .procname = "panic_on_oom",
976 .data = &sysctl_panic_on_oom, 1002 .data = &sysctl_panic_on_oom,
977 .maxlen = sizeof(sysctl_panic_on_oom), 1003 .maxlen = sizeof(sysctl_panic_on_oom),
978 .mode = 0644, 1004 .mode = 0644,
979 .proc_handler = proc_dointvec, 1005 .proc_handler = proc_dointvec_minmax,
1006 .extra1 = &zero,
1007 .extra2 = &two,
980 }, 1008 },
981 { 1009 {
982 .procname = "oom_kill_allocating_task", 1010 .procname = "oom_kill_allocating_task",
@@ -1004,7 +1032,8 @@ static struct ctl_table vm_table[] = {
1004 .data = &page_cluster, 1032 .data = &page_cluster,
1005 .maxlen = sizeof(int), 1033 .maxlen = sizeof(int),
1006 .mode = 0644, 1034 .mode = 0644,
1007 .proc_handler = proc_dointvec, 1035 .proc_handler = proc_dointvec_minmax,
1036 .extra1 = &zero,
1008 }, 1037 },
1009 { 1038 {
1010 .procname = "dirty_background_ratio", 1039 .procname = "dirty_background_ratio",
@@ -1052,7 +1081,8 @@ static struct ctl_table vm_table[] = {
1052 .data = &dirty_expire_interval, 1081 .data = &dirty_expire_interval,
1053 .maxlen = sizeof(dirty_expire_interval), 1082 .maxlen = sizeof(dirty_expire_interval),
1054 .mode = 0644, 1083 .mode = 0644,
1055 .proc_handler = proc_dointvec, 1084 .proc_handler = proc_dointvec_minmax,
1085 .extra1 = &zero,
1056 }, 1086 },
1057 { 1087 {
1058 .procname = "nr_pdflush_threads", 1088 .procname = "nr_pdflush_threads",
@@ -1128,6 +1158,8 @@ static struct ctl_table vm_table[] = {
1128 .maxlen = sizeof(int), 1158 .maxlen = sizeof(int),
1129 .mode = 0644, 1159 .mode = 0644,
1130 .proc_handler = drop_caches_sysctl_handler, 1160 .proc_handler = drop_caches_sysctl_handler,
1161 .extra1 = &one,
1162 .extra2 = &three,
1131 }, 1163 },
1132#ifdef CONFIG_COMPACTION 1164#ifdef CONFIG_COMPACTION
1133 { 1165 {
@@ -1320,11 +1352,6 @@ static struct ctl_table vm_table[] = {
1320 .extra2 = &one, 1352 .extra2 = &one,
1321 }, 1353 },
1322#endif 1354#endif
1323
1324/*
1325 * NOTE: do not add new entries to this table unless you have read
1326 * Documentation/sysctl/ctl_unnumbered.txt
1327 */
1328 { } 1355 { }
1329}; 1356};
1330 1357
@@ -1340,28 +1367,28 @@ static struct ctl_table fs_table[] = {
1340 .data = &inodes_stat, 1367 .data = &inodes_stat,
1341 .maxlen = 2*sizeof(int), 1368 .maxlen = 2*sizeof(int),
1342 .mode = 0444, 1369 .mode = 0444,
1343 .proc_handler = proc_dointvec, 1370 .proc_handler = proc_nr_inodes,
1344 }, 1371 },
1345 { 1372 {
1346 .procname = "inode-state", 1373 .procname = "inode-state",
1347 .data = &inodes_stat, 1374 .data = &inodes_stat,
1348 .maxlen = 7*sizeof(int), 1375 .maxlen = 7*sizeof(int),
1349 .mode = 0444, 1376 .mode = 0444,
1350 .proc_handler = proc_dointvec, 1377 .proc_handler = proc_nr_inodes,
1351 }, 1378 },
1352 { 1379 {
1353 .procname = "file-nr", 1380 .procname = "file-nr",
1354 .data = &files_stat, 1381 .data = &files_stat,
1355 .maxlen = 3*sizeof(int), 1382 .maxlen = sizeof(files_stat),
1356 .mode = 0444, 1383 .mode = 0444,
1357 .proc_handler = proc_nr_files, 1384 .proc_handler = proc_nr_files,
1358 }, 1385 },
1359 { 1386 {
1360 .procname = "file-max", 1387 .procname = "file-max",
1361 .data = &files_stat.max_files, 1388 .data = &files_stat.max_files,
1362 .maxlen = sizeof(int), 1389 .maxlen = sizeof(files_stat.max_files),
1363 .mode = 0644, 1390 .mode = 0644,
1364 .proc_handler = proc_dointvec, 1391 .proc_handler = proc_doulongvec_minmax,
1365 }, 1392 },
1366 { 1393 {
1367 .procname = "nr_open", 1394 .procname = "nr_open",
@@ -1377,7 +1404,7 @@ static struct ctl_table fs_table[] = {
1377 .data = &dentry_stat, 1404 .data = &dentry_stat,
1378 .maxlen = 6*sizeof(int), 1405 .maxlen = 6*sizeof(int),
1379 .mode = 0444, 1406 .mode = 0444,
1380 .proc_handler = proc_dointvec, 1407 .proc_handler = proc_nr_dentry,
1381 }, 1408 },
1382 { 1409 {
1383 .procname = "overflowuid", 1410 .procname = "overflowuid",
@@ -1480,16 +1507,12 @@ static struct ctl_table fs_table[] = {
1480 .proc_handler = &pipe_proc_fn, 1507 .proc_handler = &pipe_proc_fn,
1481 .extra1 = &pipe_min_size, 1508 .extra1 = &pipe_min_size,
1482 }, 1509 },
1483/*
1484 * NOTE: do not add new entries to this table unless you have read
1485 * Documentation/sysctl/ctl_unnumbered.txt
1486 */
1487 { } 1510 { }
1488}; 1511};
1489 1512
1490static struct ctl_table debug_table[] = { 1513static struct ctl_table debug_table[] = {
1491#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ 1514#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1492 defined(CONFIG_S390) 1515 defined(CONFIG_S390) || defined(CONFIG_TILE)
1493 { 1516 {
1494 .procname = "exception-trace", 1517 .procname = "exception-trace",
1495 .data = &show_unhandled_signals, 1518 .data = &show_unhandled_signals,
@@ -1567,11 +1590,16 @@ void sysctl_head_get(struct ctl_table_header *head)
1567 spin_unlock(&sysctl_lock); 1590 spin_unlock(&sysctl_lock);
1568} 1591}
1569 1592
1593static void free_head(struct rcu_head *rcu)
1594{
1595 kfree(container_of(rcu, struct ctl_table_header, rcu));
1596}
1597
1570void sysctl_head_put(struct ctl_table_header *head) 1598void sysctl_head_put(struct ctl_table_header *head)
1571{ 1599{
1572 spin_lock(&sysctl_lock); 1600 spin_lock(&sysctl_lock);
1573 if (!--head->count) 1601 if (!--head->count)
1574 kfree(head); 1602 call_rcu(&head->rcu, free_head);
1575 spin_unlock(&sysctl_lock); 1603 spin_unlock(&sysctl_lock);
1576} 1604}
1577 1605
@@ -1685,13 +1713,8 @@ static int test_perm(int mode, int op)
1685 1713
1686int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) 1714int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1687{ 1715{
1688 int error;
1689 int mode; 1716 int mode;
1690 1717
1691 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1692 if (error)
1693 return error;
1694
1695 if (root->permissions) 1718 if (root->permissions)
1696 mode = root->permissions(root, current->nsproxy, table); 1719 mode = root->permissions(root, current->nsproxy, table);
1697 else 1720 else
@@ -1948,10 +1971,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1948 start_unregistering(header); 1971 start_unregistering(header);
1949 if (!--header->parent->count) { 1972 if (!--header->parent->count) {
1950 WARN_ON(1); 1973 WARN_ON(1);
1951 kfree(header->parent); 1974 call_rcu(&header->parent->rcu, free_head);
1952 } 1975 }
1953 if (!--header->count) 1976 if (!--header->count)
1954 kfree(header); 1977 call_rcu(&header->rcu, free_head);
1955 spin_unlock(&sysctl_lock); 1978 spin_unlock(&sysctl_lock);
1956} 1979}
1957 1980
@@ -2392,6 +2415,17 @@ static int proc_taint(struct ctl_table *table, int write,
2392 return err; 2415 return err;
2393} 2416}
2394 2417
2418#ifdef CONFIG_PRINTK
2419static int proc_dmesg_restrict(struct ctl_table *table, int write,
2420 void __user *buffer, size_t *lenp, loff_t *ppos)
2421{
2422 if (write && !capable(CAP_SYS_ADMIN))
2423 return -EPERM;
2424
2425 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2426}
2427#endif
2428
2395struct do_proc_dointvec_minmax_conv_param { 2429struct do_proc_dointvec_minmax_conv_param {
2396 int *min; 2430 int *min;
2397 int *max; 2431 int *max;
@@ -2893,7 +2927,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2893 } 2927 }
2894} 2928}
2895 2929
2896#else /* CONFIG_PROC_FS */ 2930#else /* CONFIG_PROC_SYSCTL */
2897 2931
2898int proc_dostring(struct ctl_table *table, int write, 2932int proc_dostring(struct ctl_table *table, int write,
2899 void __user *buffer, size_t *lenp, loff_t *ppos) 2933 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2945,7 +2979,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2945} 2979}
2946 2980
2947 2981
2948#endif /* CONFIG_PROC_FS */ 2982#endif /* CONFIG_PROC_SYSCTL */
2949 2983
2950/* 2984/*
2951 * No sense putting this after each symbol definition, twice, 2985 * No sense putting this after each symbol definition, twice,
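
sysctl_head_put() and unregister_sysctl_table() above stop calling kfree() directly and instead defer it through call_rcu(), so lockless readers still walking a ctl_table_header under rcu_read_lock() can never touch freed memory. The general pattern, sketched with a made-up structure:

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct item {
            int value;
            struct rcu_head rcu;            /* embedded deferred-free handle */
    };

    static void item_free_rcu(struct rcu_head *rcu)
    {
            kfree(container_of(rcu, struct item, rcu));
    }

    static void item_release(struct item *it)
    {
            /* kfree() runs only after all pre-existing RCU readers have finished. */
            call_rcu(&it->rcu, item_free_rcu);
    }
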
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c5786064..3b8e028b9601 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = {
136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, 136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
140 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
141 {} 140 {}
142}; 141};
@@ -1193,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1193 1192
1194 buf[result] = '\0'; 1193 buf[result] = '\0';
1195 1194
1196 /* Convert the decnet addresss to binary */ 1195 /* Convert the decnet address to binary */
1197 result = -EIO; 1196 result = -EIO;
1198 nodep = strchr(buf, '.') + 1; 1197 nodep = strchr(buf, '.') + 1;
1199 if (!nodep) 1198 if (!nodep)
@@ -1322,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1322 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1323{ 1322{
1324 const struct bin_table *table = NULL; 1323 const struct bin_table *table = NULL;
1325 struct nameidata nd;
1326 struct vfsmount *mnt; 1324 struct vfsmount *mnt;
1327 struct file *file; 1325 struct file *file;
1328 ssize_t result; 1326 ssize_t result;
1329 char *pathname; 1327 char *pathname;
1330 int flags; 1328 int flags;
1331 int acc_mode;
1332 1329
1333 pathname = sysctl_getname(name, nlen, &table); 1330 pathname = sysctl_getname(name, nlen, &table);
1334 result = PTR_ERR(pathname); 1331 result = PTR_ERR(pathname);
@@ -1338,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1338 /* How should the sysctl be accessed? */ 1335 /* How should the sysctl be accessed? */
1339 if (oldval && oldlen && newval && newlen) { 1336 if (oldval && oldlen && newval && newlen) {
1340 flags = O_RDWR; 1337 flags = O_RDWR;
1341 acc_mode = MAY_READ | MAY_WRITE;
1342 } else if (newval && newlen) { 1338 } else if (newval && newlen) {
1343 flags = O_WRONLY; 1339 flags = O_WRONLY;
1344 acc_mode = MAY_WRITE;
1345 } else if (oldval && oldlen) { 1340 } else if (oldval && oldlen) {
1346 flags = O_RDONLY; 1341 flags = O_RDONLY;
1347 acc_mode = MAY_READ;
1348 } else { 1342 } else {
1349 result = 0; 1343 result = 0;
1350 goto out_putname; 1344 goto out_putname;
1351 } 1345 }
1352 1346
1353 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1354 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1355 if (result)
1356 goto out_putname;
1357
1358 result = may_open(&nd.path, acc_mode, flags);
1359 if (result)
1360 goto out_putpath;
1361
1362 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1363 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1364 if (IS_ERR(file)) 1350 if (IS_ERR(file))
1365 goto out_putname; 1351 goto out_putname;
@@ -1371,10 +1357,6 @@ out_putname:
1371 putname(pathname); 1357 putname(pathname);
1372out: 1358out:
1373 return result; 1359 return result;
1374
1375out_putpath:
1376 path_put(&nd.path);
1377 goto out_putname;
1378} 1360}
1379 1361
1380 1362
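
The binary_sysctl() rewrite above folds the manual vfs_path_lookup() + may_open() + dentry_open() sequence into a single file_open_root() call, which resolves the path relative to the given root dentry/vfsmount and applies the access checks itself. A condensed sketch of the resulting control flow, based only on the call as it appears in this hunk:

    #include <linux/fs.h>
    #include <linux/file.h>
    #include <linux/err.h>

    static ssize_t open_under_mount(struct vfsmount *mnt, const char *pathname,
                                    int flags)
    {
            struct file *file;

            /* One call replaces the lookup, the permission check and the open. */
            file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
            if (IS_ERR(file))
                    return PTR_ERR(file);

            /* ... read from or write to @file here ... */

            fput(file);
            return 0;
    }
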
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
111 const char *fail = NULL; 111 const char *fail = NULL;
112 112
113 if (table->parent) { 113 if (table->parent) {
114 if (table->procname && !table->parent->procname) 114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
116 } 116 }
117 if (!table->procname)
118 set_fail(&fail, table, "No procname");
119 if (table->child) { 117 if (table->child) {
120 if (table->data) 118 if (table->data)
121 set_fail(&fail, table, "Directory with data?"); 119 set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
144 set_fail(&fail, table, "No maxlen"); 142 set_fail(&fail, table, "No maxlen");
145 } 143 }
146#ifdef CONFIG_PROC_SYSCTL 144#ifdef CONFIG_PROC_SYSCTL
147 if (table->procname && !table->proc_handler) 145 if (!table->proc_handler)
148 set_fail(&fail, table, "No proc_handler"); 146 set_fail(&fail, table, "No proc_handler");
149#endif 147#endif
150#if 0
151 if (!table->procname && table->proc_handler)
152 set_fail(&fail, table, "proc_handler without procname");
153#endif
154 sysctl_check_leaf(namespaces, table, &fail); 148 sysctl_check_leaf(namespaces, table, &fail);
155 } 149 }
156 if (table->mode > 0777) 150 if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..fc0f22005417 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
89 return -ENOMEM; 89 return -ENOMEM;
90 90
91 if (!info) { 91 if (!info) {
92 int seq = get_cpu_var(taskstats_seqnum)++; 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
93 put_cpu_var(taskstats_seqnum);
94 93
95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
96 } else 95 } else
@@ -175,22 +174,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 174 up_write(&listeners->sem);
176} 175}
177 176
178static int fill_pid(pid_t pid, struct task_struct *tsk, 177static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct taskstats *stats)
180{ 178{
181 int rc = 0;
182
183 if (!tsk) {
184 rcu_read_lock();
185 tsk = find_task_by_vpid(pid);
186 if (tsk)
187 get_task_struct(tsk);
188 rcu_read_unlock();
189 if (!tsk)
190 return -ESRCH;
191 } else
192 get_task_struct(tsk);
193
194 memset(stats, 0, sizeof(*stats)); 179 memset(stats, 0, sizeof(*stats));
195 /* 180 /*
196 * Each accounting subsystem adds calls to its functions to 181 * Each accounting subsystem adds calls to its functions to
@@ -209,17 +194,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
209 194
210 /* fill in extended acct fields */ 195 /* fill in extended acct fields */
211 xacct_add_tsk(stats, tsk); 196 xacct_add_tsk(stats, tsk);
197}
212 198
213 /* Define err: label here if needed */ 199static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
214 put_task_struct(tsk); 200{
215 return rc; 201 struct task_struct *tsk;
216 202
203 rcu_read_lock();
204 tsk = find_task_by_vpid(pid);
205 if (tsk)
206 get_task_struct(tsk);
207 rcu_read_unlock();
208 if (!tsk)
209 return -ESRCH;
210 fill_stats(tsk, stats);
211 put_task_struct(tsk);
212 return 0;
217} 213}
218 214
219static int fill_tgid(pid_t tgid, struct task_struct *first, 215static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
220 struct taskstats *stats)
221{ 216{
222 struct task_struct *tsk; 217 struct task_struct *tsk, *first;
223 unsigned long flags; 218 unsigned long flags;
224 int rc = -ESRCH; 219 int rc = -ESRCH;
225 220
@@ -228,8 +223,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 * leaders who are already counted with the dead tasks 223 * leaders who are already counted with the dead tasks
229 */ 224 */
230 rcu_read_lock(); 225 rcu_read_lock();
231 if (!first) 226 first = find_task_by_vpid(tgid);
232 first = find_task_by_vpid(tgid);
233 227
234 if (!first || !lock_task_sighand(first, &flags)) 228 if (!first || !lock_task_sighand(first, &flags))
235 goto out; 229 goto out;
@@ -268,7 +262,6 @@ out:
268 return rc; 262 return rc;
269} 263}
270 264
271
272static void fill_tgid_exit(struct task_struct *tsk) 265static void fill_tgid_exit(struct task_struct *tsk)
273{ 266{
274 unsigned long flags; 267 unsigned long flags;
@@ -292,16 +285,18 @@ ret:
292static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
293{ 286{
294 struct listener_list *listeners; 287 struct listener_list *listeners;
295 struct listener *s, *tmp; 288 struct listener *s, *tmp, *s2;
296 unsigned int cpu; 289 unsigned int cpu;
297 290
298 if (!cpumask_subset(mask, cpu_possible_mask)) 291 if (!cpumask_subset(mask, cpu_possible_mask))
299 return -EINVAL; 292 return -EINVAL;
300 293
294 s = NULL;
301 if (isadd == REGISTER) { 295 if (isadd == REGISTER) {
302 for_each_cpu(cpu, mask) { 296 for_each_cpu(cpu, mask) {
303 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 297 if (!s)
304 cpu_to_node(cpu)); 298 s = kmalloc_node(sizeof(struct listener),
299 GFP_KERNEL, cpu_to_node(cpu));
305 if (!s) 300 if (!s)
306 goto cleanup; 301 goto cleanup;
307 s->pid = pid; 302 s->pid = pid;
@@ -310,9 +305,16 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
310 305
311 listeners = &per_cpu(listener_array, cpu); 306 listeners = &per_cpu(listener_array, cpu);
312 down_write(&listeners->sem); 307 down_write(&listeners->sem);
308 list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
309 if (s2->pid == pid)
310 goto next_cpu;
311 }
313 list_add(&s->list, &listeners->list); 312 list_add(&s->list, &listeners->list);
313 s = NULL;
314next_cpu:
314 up_write(&listeners->sem); 315 up_write(&listeners->sem);
315 } 316 }
317 kfree(s);
316 return 0; 318 return 0;
317 } 319 }
318 320
@@ -355,6 +357,10 @@ static int parse(struct nlattr *na, struct cpumask *mask)
355 return ret; 357 return ret;
356} 358}
357 359
360#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
361#define TASKSTATS_NEEDS_PADDING 1
362#endif
363
358static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 364static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
359{ 365{
360 struct nlattr *na, *ret; 366 struct nlattr *na, *ret;
@@ -364,9 +370,33 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
364 ? TASKSTATS_TYPE_AGGR_PID 370 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 371 : TASKSTATS_TYPE_AGGR_TGID;
366 372
373 /*
374 * The taskstats structure is internally aligned on 8 byte
 375 * boundaries but the layout of the aggregate reply, with
376 * two NLA headers and the pid (each 4 bytes), actually
377 * force the entire structure to be unaligned. This causes
378 * the kernel to issue unaligned access warnings on some
379 * architectures like ia64. Unfortunately, some software out there
380 * doesn't properly unroll the NLA packet and assumes that the start
381 * of the taskstats structure will always be 20 bytes from the start
382 * of the netlink payload. Aligning the start of the taskstats
383 * structure breaks this software, which we don't want. So, for now
384 * the alignment only happens on architectures that require it
385 * and those users will have to update to fixed versions of those
386 * packages. Space is reserved in the packet only when needed.
387 * This ifdef should be removed in several years e.g. 2012 once
388 * we can be confident that fixed versions are installed on most
389 * systems. We add the padding before the aggregate since the
390 * aggregate is already a defined type.
391 */
392#ifdef TASKSTATS_NEEDS_PADDING
393 if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
394 goto err;
395#endif
367 na = nla_nest_start(skb, aggr); 396 na = nla_nest_start(skb, aggr);
368 if (!na) 397 if (!na)
369 goto err; 398 goto err;
399
370 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 400 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
371 goto err; 401 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 402 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
@@ -424,74 +454,122 @@ err:
424 return rc; 454 return rc;
425} 455}
426 456
427static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 457static int cmd_attr_register_cpumask(struct genl_info *info)
428{ 458{
429 int rc;
430 struct sk_buff *rep_skb;
431 struct taskstats *stats;
432 size_t size;
433 cpumask_var_t mask; 459 cpumask_var_t mask;
460 int rc;
434 461
435 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 462 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
436 return -ENOMEM; 463 return -ENOMEM;
437
438 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 464 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
439 if (rc < 0) 465 if (rc < 0)
440 goto free_return_rc; 466 goto out;
441 if (rc == 0) { 467 rc = add_del_listener(info->snd_pid, mask, REGISTER);
442 rc = add_del_listener(info->snd_pid, mask, REGISTER); 468out:
443 goto free_return_rc; 469 free_cpumask_var(mask);
444 } 470 return rc;
471}
445 472
473static int cmd_attr_deregister_cpumask(struct genl_info *info)
474{
475 cpumask_var_t mask;
476 int rc;
477
478 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
479 return -ENOMEM;
446 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 480 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
447 if (rc < 0) 481 if (rc < 0)
448 goto free_return_rc; 482 goto out;
449 if (rc == 0) { 483 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
450 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 484out:
451free_return_rc:
452 free_cpumask_var(mask);
453 return rc;
454 }
455 free_cpumask_var(mask); 485 free_cpumask_var(mask);
486 return rc;
487}
488
489static size_t taskstats_packet_size(void)
490{
491 size_t size;
456 492
457 /*
458 * Size includes space for nested attributes
459 */
460 size = nla_total_size(sizeof(u32)) + 493 size = nla_total_size(sizeof(u32)) +
461 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 494 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
495#ifdef TASKSTATS_NEEDS_PADDING
496 size += nla_total_size(0); /* Padding for alignment */
497#endif
498 return size;
499}
500
501static int cmd_attr_pid(struct genl_info *info)
502{
503 struct taskstats *stats;
504 struct sk_buff *rep_skb;
505 size_t size;
506 u32 pid;
507 int rc;
508
509 size = taskstats_packet_size();
462 510
463 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 511 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
464 if (rc < 0) 512 if (rc < 0)
465 return rc; 513 return rc;
466 514
467 rc = -EINVAL; 515 rc = -EINVAL;
468 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 516 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
469 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 517 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
470 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 518 if (!stats)
471 if (!stats) 519 goto err;
472 goto err; 520
473 521 rc = fill_stats_for_pid(pid, stats);
474 rc = fill_pid(pid, NULL, stats); 522 if (rc < 0)
475 if (rc < 0) 523 goto err;
476 goto err; 524 return send_reply(rep_skb, info);
477 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 525err:
478 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 526 nlmsg_free(rep_skb);
479 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 527 return rc;
480 if (!stats) 528}
481 goto err; 529
482 530static int cmd_attr_tgid(struct genl_info *info)
483 rc = fill_tgid(tgid, NULL, stats); 531{
484 if (rc < 0) 532 struct taskstats *stats;
485 goto err; 533 struct sk_buff *rep_skb;
486 } else 534 size_t size;
535 u32 tgid;
536 int rc;
537
538 size = taskstats_packet_size();
539
540 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
541 if (rc < 0)
542 return rc;
543
544 rc = -EINVAL;
545 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
546 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
547 if (!stats)
487 goto err; 548 goto err;
488 549
550 rc = fill_stats_for_tgid(tgid, stats);
551 if (rc < 0)
552 goto err;
489 return send_reply(rep_skb, info); 553 return send_reply(rep_skb, info);
490err: 554err:
491 nlmsg_free(rep_skb); 555 nlmsg_free(rep_skb);
492 return rc; 556 return rc;
493} 557}
494 558
559static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
560{
561 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
562 return cmd_attr_register_cpumask(info);
563 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
564 return cmd_attr_deregister_cpumask(info);
565 else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
566 return cmd_attr_pid(info);
567 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
568 return cmd_attr_tgid(info);
569 else
570 return -EINVAL;
571}
572
495static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 573static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
496{ 574{
497 struct signal_struct *sig = tsk->signal; 575 struct signal_struct *sig = tsk->signal;
@@ -532,8 +610,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
532 /* 610 /*
533 * Size includes space for nested attributes 611 * Size includes space for nested attributes
534 */ 612 */
535 size = nla_total_size(sizeof(u32)) + 613 size = taskstats_packet_size();
536 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
537 614
538 is_thread_group = !!taskstats_tgid_alloc(tsk); 615 is_thread_group = !!taskstats_tgid_alloc(tsk);
539 if (is_thread_group) { 616 if (is_thread_group) {
@@ -543,7 +620,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
543 fill_tgid_exit(tsk); 620 fill_tgid_exit(tsk);
544 } 621 }
545 622
546 listeners = &__raw_get_cpu_var(listener_array); 623 listeners = __this_cpu_ptr(&listener_array);
547 if (list_empty(&listeners->list)) 624 if (list_empty(&listeners->list))
548 return; 625 return;
549 626
@@ -555,9 +632,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
555 if (!stats) 632 if (!stats)
556 goto err; 633 goto err;
557 634
558 rc = fill_pid(-1, tsk, stats); 635 fill_stats(tsk, stats);
559 if (rc < 0)
560 goto err;
561 636
562 /* 637 /*
563 * Doesn't matter if tsk is the leader or the last group member leaving 638 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -619,7 +694,7 @@ static int __init taskstats_init(void)
619 goto err_cgroup_ops; 694 goto err_cgroup_ops;
620 695
621 family_registered = 1; 696 family_registered = 1;
622 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 697 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
623 return 0; 698 return 0;
624err_cgroup_ops: 699err_cgroup_ops:
625 genl_unregister_ops(&family, &taskstats_ops); 700 genl_unregister_ops(&family, &taskstats_ops);
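For reference, the new taskstats_packet_size() helper above simply sums the aligned netlink attribute sizes for the pid/tgid attribute, the taskstats payload and the nested container. A minimal userspace sketch of that arithmetic, assuming the sanitized kernel headers <linux/netlink.h> and <linux/taskstats.h> and omitting the TASKSTATS_NEEDS_PADDING case:

/*
 * Mirror of the kernel's nla_total_size(): attribute header plus payload,
 * rounded up to the 4-byte netlink attribute alignment.
 */
#include <stdio.h>
#include <stdint.h>
#include <linux/netlink.h>	/* NLA_HDRLEN, NLA_ALIGN */
#include <linux/taskstats.h>	/* struct taskstats */

static size_t nla_total_size(size_t payload)
{
	return NLA_ALIGN(NLA_HDRLEN + payload);
}

int main(void)
{
	/* u32 pid/tgid attribute + taskstats payload + nested container */
	size_t size = nla_total_size(sizeof(uint32_t)) +
		      nla_total_size(sizeof(struct taskstats)) +
		      nla_total_size(0);

	printf("taskstats reply needs %zu bytes of attribute space\n", size);
	return 0;
}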
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 4f104515a19b..f8b11a283171 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
115 int ret; 115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2}; 116 struct kprobe *kps[2] = {&kp, &kp2};
117 117
118 kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 118 /* addr and flags should be cleared for reusing kprobe. */
119 kp.addr = NULL;
120 kp.flags = 0;
119 ret = register_kprobes(kps, 2); 121 ret = register_kprobes(kps, 2);
120 if (ret < 0) { 122 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: " 123 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
210 int ret; 212 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2}; 213 struct jprobe *jps[2] = {&jp, &jp2};
212 214
213 jp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 215 /* addr and flags should be cleared for reusing kprobe. */
216 jp.kp.addr = NULL;
217 jp.kp.flags = 0;
214 ret = register_jprobes(jps, 2); 218 ret = register_jprobes(jps, 2);
215 if (ret < 0) { 219 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: " 220 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
323 int ret; 327 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2}; 328 struct kretprobe *rps[2] = {&rp, &rp2};
325 329
326 rp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 330 /* addr and flags should be cleared for reusing kprobe. */
331 rp.kp.addr = NULL;
332 rp.kp.flags = 0;
327 ret = register_kretprobes(rps, 2); 333 ret = register_kretprobes(rps, 2);
328 if (ret < 0) { 334 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: " 335 printk(KERN_ERR "Kprobe smoke test failed: "
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..8e8dc6d705c9 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
150 * various programs will get confused when the clock gets warped. 150 * various programs will get confused when the clock gets warped.
151 */ 151 */
152 152
153int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) 153int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
154{ 154{
155 static int firsttime = 1; 155 static int firsttime = 1;
156 int error = 0; 156 int error = 0;
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
238 * Avoid unnecessary multiplications/divisions in the 238 * Avoid unnecessary multiplications/divisions in the
239 * two most common HZ cases: 239 * two most common HZ cases:
240 */ 240 */
241unsigned int inline jiffies_to_msecs(const unsigned long j) 241inline unsigned int jiffies_to_msecs(const unsigned long j)
242{ 242{
243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
244 return (MSEC_PER_SEC / HZ) * j; 244 return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
254} 254}
255EXPORT_SYMBOL(jiffies_to_msecs); 255EXPORT_SYMBOL(jiffies_to_msecs);
256 256
257unsigned int inline jiffies_to_usecs(const unsigned long j) 257inline unsigned int jiffies_to_usecs(const unsigned long j)
258{ 258{
259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
260 return (USEC_PER_SEC / HZ) * j; 260 return (USEC_PER_SEC / HZ) * j;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
645} 645}
646 646
647/** 647/**
648 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 648 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
649 * 649 *
650 * @n: nsecs in u64 650 * @n: nsecs in u64
651 * 651 *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) 657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years 658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
659 */ 659 */
660unsigned long nsecs_to_jiffies(u64 n) 660u64 nsecs_to_jiffies64(u64 n)
661{ 661{
662#if (NSEC_PER_SEC % HZ) == 0 662#if (NSEC_PER_SEC % HZ) == 0
663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ 663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
674#endif 674#endif
675} 675}
676 676
677#if (BITS_PER_LONG < 64) 677/**
678u64 get_jiffies_64(void) 678 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
679 *
680 * @n: nsecs in u64
681 *
682 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
683 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
684 * for scheduler, not for use in device drivers to calculate timeout value.
685 *
686 * note:
687 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
688 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
689 */
690unsigned long nsecs_to_jiffies(u64 n)
679{ 691{
680 unsigned long seq; 692 return (unsigned long)nsecs_to_jiffies64(n);
681 u64 ret;
682
683 do {
684 seq = read_seqbegin(&xtime_lock);
685 ret = jiffies_64;
686 } while (read_seqretry(&xtime_lock, seq));
687 return ret;
688} 693}
689EXPORT_SYMBOL(get_jiffies_64);
690#endif
691
692EXPORT_SYMBOL(jiffies);
693 694
694/* 695/*
695 * Add two timespec values and do a safety check for overflow. 696 * Add two timespec values and do a safety check for overflow.
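The kernel/time.c hunk above introduces nsecs_to_jiffies64() and turns nsecs_to_jiffies() into a truncating wrapper around it. A small standalone sketch of the common case where NSEC_PER_SEC is a multiple of HZ (HZ = 250 is an assumed example value):

/*
 * Sketch of the nsecs_to_jiffies()/nsecs_to_jiffies64() split for the
 * common case where (NSEC_PER_SEC % HZ) == 0.
 */
#include <stdio.h>
#include <stdint.h>

#define HZ		250ULL
#define NSEC_PER_SEC	1000000000ULL

static uint64_t nsecs_to_jiffies64(uint64_t n)
{
	return n / (NSEC_PER_SEC / HZ);	/* exact division of the tick length */
}

static unsigned long nsecs_to_jiffies(uint64_t n)
{
	/* the 32bit variant is now just a truncating wrapper */
	return (unsigned long)nsecs_to_jiffies64(n);
}

int main(void)
{
	uint64_t n = 3 * NSEC_PER_SEC + 4000000;	/* 3.004 s */

	printf("%llu ns -> %llu jiffies64, %lu jiffies\n",
	       (unsigned long long)n,
	       (unsigned long long)nsecs_to_jiffies64(n),
	       nsecs_to_jiffies(n));
	return 0;
}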
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06c..e2fd74b8e8c2 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o
2 3
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
new file mode 100644
index 000000000000..59f369f98a04
--- /dev/null
+++ b/kernel/time/alarmtimer.c
@@ -0,0 +1,720 @@
1/*
2 * Alarmtimer interface
3 *
4 * This interface provides a timer which is similar to hrtimers,
5 * but triggers an RTC alarm if the box is suspended.
6 *
7 * This interface is influenced by the Android RTC Alarm timer
8 * interface.
9 *
10 * Copyright (C) 2010 IBM Corporation
11 *
12 * Author: John Stultz <john.stultz@linaro.org>
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License version 2 as
16 * published by the Free Software Foundation.
17 */
18#include <linux/time.h>
19#include <linux/hrtimer.h>
20#include <linux/timerqueue.h>
21#include <linux/rtc.h>
22#include <linux/alarmtimer.h>
23#include <linux/mutex.h>
24#include <linux/platform_device.h>
25#include <linux/posix-timers.h>
26#include <linux/workqueue.h>
27#include <linux/freezer.h>
28
29/**
30 * struct alarm_base - Alarm timer bases
31 * @lock: Lock for synchronized access to the base
32 * @timerqueue: Timerqueue head managing the list of events
33 * @timer: hrtimer used to schedule events while running
34 * @gettime: Function to read the time correlating to the base
35 * @base_clockid: clockid for the base
36 */
37static struct alarm_base {
38 spinlock_t lock;
39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void);
42 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE];
44
45/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
46static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock);
48
49#ifdef CONFIG_RTC_CLASS
50/* rtc timer and device for setting alarm wakeups at suspend */
51static struct rtc_timer rtctimer;
52static struct rtc_device *rtcdev;
53static DEFINE_SPINLOCK(rtcdev_lock);
54
55/**
56 * has_wakealarm - check whether the rtc device has wakealarm ability
57 * @dev: current device
58 * @name_ptr: name to be returned
59 *
60 * This helper function checks to see if the rtc device can wake
61 * from suspend.
62 */
63static int has_wakealarm(struct device *dev, void *name_ptr)
64{
65 struct rtc_device *candidate = to_rtc_device(dev);
66
67 if (!candidate->ops->set_alarm)
68 return 0;
69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71
72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
74}
75
76/**
77 * alarmtimer_get_rtcdev - Return selected rtcdevice
78 *
79 * This function returns the rtc device to use for wakealarms.
80 * If one has not already been chosen, it checks to see if a
81 * functional rtc device is available.
82 */
83static struct rtc_device *alarmtimer_get_rtcdev(void)
84{
85 struct device *dev;
86 char *str;
87 unsigned long flags;
88 struct rtc_device *ret;
89
90 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) {
92 /* Find an rtc device and init the rtc_timer */
93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
104 }
105 ret = rtcdev;
106 spin_unlock_irqrestore(&rtcdev_lock, flags);
107
108 return ret;
109}
110#else
111#define alarmtimer_get_rtcdev() (0)
112#define rtcdev (0)
113#endif
114
115
116/**
117 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
118 * @base: pointer to the base where the timer is being run
119 * @alarm: pointer to alarm being enqueued.
120 *
121 * Adds the alarm to an alarm_base timerqueue and if necessary sets
122 * an hrtimer to run.
123 *
124 * Must hold base->lock when calling.
125 */
126static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
127{
128 timerqueue_add(&base->timerqueue, &alarm->node);
129 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
130 hrtimer_try_to_cancel(&base->timer);
131 hrtimer_start(&base->timer, alarm->node.expires,
132 HRTIMER_MODE_ABS);
133 }
134}
135
136/**
137 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
138 * @base: pointer to the base where the timer is running
139 * @alarm: pointer to alarm being removed
140 *
141 * Removes the alarm from an alarm_base timerqueue and if necessary sets
142 * a new timer to run.
143 *
144 * Must hold base->lock when calling.
145 */
146static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
147{
148 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
149
150 timerqueue_del(&base->timerqueue, &alarm->node);
151 if (next == &alarm->node) {
152 hrtimer_try_to_cancel(&base->timer);
153 next = timerqueue_getnext(&base->timerqueue);
154 if (!next)
155 return;
156 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
157 }
158}
159
160
161/**
162 * alarmtimer_fired - Handles alarm hrtimer being fired.
163 * @timer: pointer to hrtimer being run
164 *
165 * When an alarm timer fires, this runs through the timerqueue to
166 * see which alarms expired, and runs those. If there are more alarm
167 * timers queued for the future, we set the hrtimer to fire
168 * when the next future alarm timer expires.
169 */
170static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
171{
172 struct alarm_base *base = container_of(timer, struct alarm_base, timer);
173 struct timerqueue_node *next;
174 unsigned long flags;
175 ktime_t now;
176 int ret = HRTIMER_NORESTART;
177
178 spin_lock_irqsave(&base->lock, flags);
179 now = base->gettime();
180 while ((next = timerqueue_getnext(&base->timerqueue))) {
181 struct alarm *alarm;
182 ktime_t expired = next->expires;
183
184 if (expired.tv64 >= now.tv64)
185 break;
186
187 alarm = container_of(next, struct alarm, node);
188
189 timerqueue_del(&base->timerqueue, &alarm->node);
190 alarm->enabled = 0;
191 /* Re-add periodic timers */
192 if (alarm->period.tv64) {
193 alarm->node.expires = ktime_add(expired, alarm->period);
194 timerqueue_add(&base->timerqueue, &alarm->node);
195 alarm->enabled = 1;
196 }
197 spin_unlock_irqrestore(&base->lock, flags);
198 if (alarm->function)
199 alarm->function(alarm);
200 spin_lock_irqsave(&base->lock, flags);
201 }
202
203 if (next) {
204 hrtimer_set_expires(&base->timer, next->expires);
205 ret = HRTIMER_RESTART;
206 }
207 spin_unlock_irqrestore(&base->lock, flags);
208
209 return ret;
210
211}
212
213#ifdef CONFIG_RTC_CLASS
214/**
215 * alarmtimer_suspend - Suspend time callback
216 * @dev: unused
217 * @state: unused
218 *
219 * When we are going into suspend, we look through the bases
220 * to see which is the soonest timer to expire. We then
221 * set an rtc timer to fire that far into the future, which
222 * will wake us from suspend.
223 */
224static int alarmtimer_suspend(struct device *dev)
225{
226 struct rtc_time tm;
227 ktime_t min, now;
228 unsigned long flags;
229 struct rtc_device *rtc;
230 int i;
231
232 spin_lock_irqsave(&freezer_delta_lock, flags);
233 min = freezer_delta;
234 freezer_delta = ktime_set(0, 0);
235 spin_unlock_irqrestore(&freezer_delta_lock, flags);
236
237 rtc = rtcdev;
238 /* If we have no rtcdev, just return */
239 if (!rtc)
240 return 0;
241
242 /* Find the soonest timer to expire*/
243 for (i = 0; i < ALARM_NUMTYPE; i++) {
244 struct alarm_base *base = &alarm_bases[i];
245 struct timerqueue_node *next;
246 ktime_t delta;
247
248 spin_lock_irqsave(&base->lock, flags);
249 next = timerqueue_getnext(&base->timerqueue);
250 spin_unlock_irqrestore(&base->lock, flags);
251 if (!next)
252 continue;
253 delta = ktime_sub(next->expires, base->gettime());
254 if (!min.tv64 || (delta.tv64 < min.tv64))
255 min = delta;
256 }
257 if (min.tv64 == 0)
258 return 0;
259
260 /* XXX - Should we enforce a minimum sleep time? */
261 WARN_ON(min.tv64 < NSEC_PER_SEC);
262
263 /* Setup an rtc timer to fire that far in the future */
264 rtc_timer_cancel(rtc, &rtctimer);
265 rtc_read_time(rtc, &tm);
266 now = rtc_tm_to_ktime(tm);
267 now = ktime_add(now, min);
268
269 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
270
271 return 0;
272}
273#else
274static int alarmtimer_suspend(struct device *dev)
275{
276 return 0;
277}
278#endif
279
280static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
281{
282 ktime_t delta;
283 unsigned long flags;
284 struct alarm_base *base = &alarm_bases[type];
285
286 delta = ktime_sub(absexp, base->gettime());
287
288 spin_lock_irqsave(&freezer_delta_lock, flags);
289 if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
290 freezer_delta = delta;
291 spin_unlock_irqrestore(&freezer_delta_lock, flags);
292}
293
294
295/**
296 * alarm_init - Initialize an alarm structure
297 * @alarm: ptr to alarm to be initialized
298 * @type: the type of the alarm
299 * @function: callback that is run when the alarm fires
300 */
301void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
302 void (*function)(struct alarm *))
303{
304 timerqueue_init(&alarm->node);
305 alarm->period = ktime_set(0, 0);
306 alarm->function = function;
307 alarm->type = type;
308 alarm->enabled = 0;
309}
310
311/**
312 * alarm_start - Sets an alarm to fire
313 * @alarm: ptr to alarm to set
314 * @start: time to run the alarm
315 * @period: period at which the alarm will recur
316 */
317void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
318{
319 struct alarm_base *base = &alarm_bases[alarm->type];
320 unsigned long flags;
321
322 spin_lock_irqsave(&base->lock, flags);
323 if (alarm->enabled)
324 alarmtimer_remove(base, alarm);
325 alarm->node.expires = start;
326 alarm->period = period;
327 alarmtimer_enqueue(base, alarm);
328 alarm->enabled = 1;
329 spin_unlock_irqrestore(&base->lock, flags);
330}
331
332/**
333 * alarm_cancel - Tries to cancel an alarm timer
334 * @alarm: ptr to alarm to be canceled
335 */
336void alarm_cancel(struct alarm *alarm)
337{
338 struct alarm_base *base = &alarm_bases[alarm->type];
339 unsigned long flags;
340
341 spin_lock_irqsave(&base->lock, flags);
342 if (alarm->enabled)
343 alarmtimer_remove(base, alarm);
344 alarm->enabled = 0;
345 spin_unlock_irqrestore(&base->lock, flags);
346}
347
348
349/**
350 * clock2alarm - helper that converts from clockid to alarmtypes
351 * @clockid: clockid.
352 */
353static enum alarmtimer_type clock2alarm(clockid_t clockid)
354{
355 if (clockid == CLOCK_REALTIME_ALARM)
356 return ALARM_REALTIME;
357 if (clockid == CLOCK_BOOTTIME_ALARM)
358 return ALARM_BOOTTIME;
359 return -1;
360}
361
362/**
363 * alarm_handle_timer - Callback for posix timers
364 * @alarm: alarm that fired
365 *
366 * Posix timer callback for expired alarm timers.
367 */
368static void alarm_handle_timer(struct alarm *alarm)
369{
370 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
371 it.alarmtimer);
372 if (posix_timer_event(ptr, 0) != 0)
373 ptr->it_overrun++;
374}
375
376/**
377 * alarm_clock_getres - posix getres interface
378 * @which_clock: clockid
379 * @tp: timespec to fill
380 *
381 * Returns the granularity of underlying alarm base clock
382 */
383static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
384{
385 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
386
387 if (!alarmtimer_get_rtcdev())
388 return -ENOTSUPP;
389
390 return hrtimer_get_res(baseid, tp);
391}
392
393/**
394 * alarm_clock_get - posix clock_get interface
395 * @which_clock: clockid
396 * @tp: timespec to fill.
397 *
398 * Provides the underlying alarm base time.
399 */
400static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
401{
402 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
403
404 if (!alarmtimer_get_rtcdev())
405 return -ENOTSUPP;
406
407 *tp = ktime_to_timespec(base->gettime());
408 return 0;
409}
410
411/**
412 * alarm_timer_create - posix timer_create interface
413 * @new_timer: k_itimer pointer to manage
414 *
415 * Initializes the k_itimer structure.
416 */
417static int alarm_timer_create(struct k_itimer *new_timer)
418{
419 enum alarmtimer_type type;
420 struct alarm_base *base;
421
422 if (!alarmtimer_get_rtcdev())
423 return -ENOTSUPP;
424
425 if (!capable(CAP_WAKE_ALARM))
426 return -EPERM;
427
428 type = clock2alarm(new_timer->it_clock);
429 base = &alarm_bases[type];
430 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
431 return 0;
432}
433
434/**
435 * alarm_timer_get - posix timer_get interface
436 * @new_timer: k_itimer pointer
437 * @cur_setting: itimerspec data to fill
438 *
439 * Copies the itimerspec data out from the k_itimer
440 */
441static void alarm_timer_get(struct k_itimer *timr,
442 struct itimerspec *cur_setting)
443{
444 cur_setting->it_interval =
445 ktime_to_timespec(timr->it.alarmtimer.period);
446 cur_setting->it_value =
447 ktime_to_timespec(timr->it.alarmtimer.node.expires);
448 return;
449}
450
451/**
452 * alarm_timer_del - posix timer_del interface
453 * @timr: k_itimer pointer to be deleted
454 *
455 * Cancels any programmed alarms for the given timer.
456 */
457static int alarm_timer_del(struct k_itimer *timr)
458{
459 if (!rtcdev)
460 return -ENOTSUPP;
461
462 alarm_cancel(&timr->it.alarmtimer);
463 return 0;
464}
465
466/**
467 * alarm_timer_set - posix timer_set interface
468 * @timr: k_itimer pointer to be set
469 * @flags: timer flags
470 * @new_setting: itimerspec to be used
471 * @old_setting: itimerspec being replaced
472 *
473 * Sets the timer to new_setting, and starts the timer.
474 */
475static int alarm_timer_set(struct k_itimer *timr, int flags,
476 struct itimerspec *new_setting,
477 struct itimerspec *old_setting)
478{
479 if (!rtcdev)
480 return -ENOTSUPP;
481
482 /* Save old values */
483 old_setting->it_interval =
484 ktime_to_timespec(timr->it.alarmtimer.period);
485 old_setting->it_value =
486 ktime_to_timespec(timr->it.alarmtimer.node.expires);
487
488 /* If the timer was already set, cancel it */
489 alarm_cancel(&timr->it.alarmtimer);
490
491 /* start the timer */
492 alarm_start(&timr->it.alarmtimer,
493 timespec_to_ktime(new_setting->it_value),
494 timespec_to_ktime(new_setting->it_interval));
495 return 0;
496}
497
498/**
499 * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
500 * @alarm: ptr to alarm that fired
501 *
502 * Wakes up the task that set the alarmtimer
503 */
504static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
505{
506 struct task_struct *task = (struct task_struct *)alarm->data;
507
508 alarm->data = NULL;
509 if (task)
510 wake_up_process(task);
511}
512
513/**
514 * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
515 * @alarm: ptr to alarmtimer
516 * @absexp: absolute expiration time
517 *
518 * Sets the alarm timer and sleeps until it is fired or interrupted.
519 */
520static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
521{
522 alarm->data = (void *)current;
523 do {
524 set_current_state(TASK_INTERRUPTIBLE);
525 alarm_start(alarm, absexp, ktime_set(0, 0));
526 if (likely(alarm->data))
527 schedule();
528
529 alarm_cancel(alarm);
530 } while (alarm->data && !signal_pending(current));
531
532 __set_current_state(TASK_RUNNING);
533
534 return (alarm->data == NULL);
535}
536
537
538/**
539 * update_rmtp - Update remaining timespec value
540 * @exp: expiration time
541 * @type: timer type
542 * @rmtp: user pointer to remaining timespec value
543 *
544 * Helper function that fills in rmtp value with time between
545 * now and the exp value
546 */
547static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
548 struct timespec __user *rmtp)
549{
550 struct timespec rmt;
551 ktime_t rem;
552
553 rem = ktime_sub(exp, alarm_bases[type].gettime());
554
555 if (rem.tv64 <= 0)
556 return 0;
557 rmt = ktime_to_timespec(rem);
558
559 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
560 return -EFAULT;
561
562 return 1;
563
564}
565
566/**
567 * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
568 * @restart: ptr to restart block
569 *
570 * Handles restarted clock_nanosleep calls
571 */
572static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
573{
574 enum alarmtimer_type type = restart->nanosleep.clockid;
575 ktime_t exp;
576 struct timespec __user *rmtp;
577 struct alarm alarm;
578 int ret = 0;
579
580 exp.tv64 = restart->nanosleep.expires;
581 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
582
583 if (alarmtimer_do_nsleep(&alarm, exp))
584 goto out;
585
586 if (freezing(current))
587 alarmtimer_freezerset(exp, type);
588
589 rmtp = restart->nanosleep.rmtp;
590 if (rmtp) {
591 ret = update_rmtp(exp, type, rmtp);
592 if (ret <= 0)
593 goto out;
594 }
595
596
597 /* The other values in restart are already filled in */
598 ret = -ERESTART_RESTARTBLOCK;
599out:
600 return ret;
601}
602
603/**
604 * alarm_timer_nsleep - alarmtimer nanosleep
605 * @which_clock: clockid
606 * @flags: determines abstime or relative
607 * @tsreq: requested sleep time (abs or rel)
608 * @rmtp: remaining sleep time saved
609 *
610 * Handles clock_nanosleep calls against _ALARM clockids
611 */
612static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
613 struct timespec *tsreq, struct timespec __user *rmtp)
614{
615 enum alarmtimer_type type = clock2alarm(which_clock);
616 struct alarm alarm;
617 ktime_t exp;
618 int ret = 0;
619 struct restart_block *restart;
620
621 if (!alarmtimer_get_rtcdev())
622 return -ENOTSUPP;
623
624 if (!capable(CAP_WAKE_ALARM))
625 return -EPERM;
626
627 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
628
629 exp = timespec_to_ktime(*tsreq);
630 /* Convert (if necessary) to absolute time */
631 if (flags != TIMER_ABSTIME) {
632 ktime_t now = alarm_bases[type].gettime();
633 exp = ktime_add(now, exp);
634 }
635
636 if (alarmtimer_do_nsleep(&alarm, exp))
637 goto out;
638
639 if (freezing(current))
640 alarmtimer_freezerset(exp, type);
641
642 /* abs timers don't set remaining time or restart */
643 if (flags == TIMER_ABSTIME) {
644 ret = -ERESTARTNOHAND;
645 goto out;
646 }
647
648 if (rmtp) {
649 ret = update_rmtp(exp, type, rmtp);
650 if (ret <= 0)
651 goto out;
652 }
653
654 restart = &current_thread_info()->restart_block;
655 restart->fn = alarm_timer_nsleep_restart;
656 restart->nanosleep.clockid = type;
657 restart->nanosleep.expires = exp.tv64;
658 restart->nanosleep.rmtp = rmtp;
659 ret = -ERESTART_RESTARTBLOCK;
660
661out:
662 return ret;
663}
664
665
666/* Suspend hook structures */
667static const struct dev_pm_ops alarmtimer_pm_ops = {
668 .suspend = alarmtimer_suspend,
669};
670
671static struct platform_driver alarmtimer_driver = {
672 .driver = {
673 .name = "alarmtimer",
674 .pm = &alarmtimer_pm_ops,
675 }
676};
677
678/**
679 * alarmtimer_init - Initialize alarm timer code
680 *
681 * This function initializes the alarm bases and registers
682 * the posix clock ids.
683 */
684static int __init alarmtimer_init(void)
685{
686 int error = 0;
687 int i;
688 struct k_clock alarm_clock = {
689 .clock_getres = alarm_clock_getres,
690 .clock_get = alarm_clock_get,
691 .timer_create = alarm_timer_create,
692 .timer_set = alarm_timer_set,
693 .timer_del = alarm_timer_del,
694 .timer_get = alarm_timer_get,
695 .nsleep = alarm_timer_nsleep,
696 };
697
698 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
699 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
700
701 /* Initialize alarm bases */
702 alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
703 alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
704 alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
705 alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
706 for (i = 0; i < ALARM_NUMTYPE; i++) {
707 timerqueue_init_head(&alarm_bases[i].timerqueue);
708 spin_lock_init(&alarm_bases[i].lock);
709 hrtimer_init(&alarm_bases[i].timer,
710 alarm_bases[i].base_clockid,
711 HRTIMER_MODE_ABS);
712 alarm_bases[i].timer.function = alarmtimer_fired;
713 }
714 error = platform_driver_register(&alarmtimer_driver);
715 platform_device_register_simple("alarmtimer", -1, NULL, 0);
716
717 return error;
718}
719device_initcall(alarmtimer_init);
720
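The new alarmtimer code registers CLOCK_REALTIME_ALARM and CLOCK_BOOTTIME_ALARM as posix clocks, so userspace can arm wakeup-capable timers through the ordinary clock and timer syscalls. A hedged usage sketch, assuming a kernel with this code, a wakeup-capable RTC, the CAP_WAKE_ALARM capability, and the clockid value 8 from the kernel headers when libc does not yet define it (link with -lrt on older glibc):

/*
 * Relative sleep on CLOCK_REALTIME_ALARM; the alarmtimer code should wake
 * the machine from suspend when the timer expires.
 */
#include <stdio.h>
#include <string.h>
#include <time.h>

#ifndef CLOCK_REALTIME_ALARM
#define CLOCK_REALTIME_ALARM 8	/* assumed to match the kernel definition */
#endif

int main(void)
{
	struct timespec req = { .tv_sec = 30, .tv_nsec = 0 };
	int err;

	/* flags == 0: relative 30 s sleep against the alarm clock */
	err = clock_nanosleep(CLOCK_REALTIME_ALARM, 0, &req, NULL);
	if (err)
		fprintf(stderr, "clock_nanosleep: %s\n", strerror(err));
	else
		printf("woke up after alarm sleep\n");
	return 0;
}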
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f3..e4c699dfa4e8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -183,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev)
183 unsigned long flags; 182 unsigned long flags;
184 183
185 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
186 BUG_ON(!dev->cpumask); 185 if (!dev->cpumask) {
186 WARN_ON(num_possible_cpus() > 1);
187 dev->cpumask = cpumask_of(smp_processor_id());
188 }
187 189
188 raw_spin_lock_irqsave(&clockevents_lock, flags); 190 raw_spin_lock_irqsave(&clockevents_lock, flags);
189 191
@@ -195,6 +197,70 @@ void clockevents_register_device(struct clock_event_device *dev)
195} 197}
196EXPORT_SYMBOL_GPL(clockevents_register_device); 198EXPORT_SYMBOL_GPL(clockevents_register_device);
197 199
200static void clockevents_config(struct clock_event_device *dev,
201 u32 freq)
202{
203 u64 sec;
204
205 if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
206 return;
207
208 /*
209 * Calculate the maximum number of seconds we can sleep. Limit
210 * to 10 minutes for hardware which can program more than
211 * 32bit ticks so we still get reasonable conversion values.
212 */
213 sec = dev->max_delta_ticks;
214 do_div(sec, freq);
215 if (!sec)
216 sec = 1;
217 else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
218 sec = 600;
219
220 clockevents_calc_mult_shift(dev, freq, sec);
221 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
222 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
223}
224
225/**
226 * clockevents_config_and_register - Configure and register a clock event device
227 * @dev: device to register
228 * @freq: The clock frequency
229 * @min_delta: The minimum clock ticks to program in oneshot mode
230 * @max_delta: The maximum clock ticks to program in oneshot mode
231 *
232 * min/max_delta can be 0 for devices which do not support oneshot mode.
233 */
234void clockevents_config_and_register(struct clock_event_device *dev,
235 u32 freq, unsigned long min_delta,
236 unsigned long max_delta)
237{
238 dev->min_delta_ticks = min_delta;
239 dev->max_delta_ticks = max_delta;
240 clockevents_config(dev, freq);
241 clockevents_register_device(dev);
242}
243
244/**
245 * clockevents_update_freq - Update frequency and reprogram a clock event device.
246 * @dev: device to modify
247 * @freq: new device frequency
248 *
249 * Reconfigure and reprogram a clock event device in oneshot
250 * mode. Must be called on the cpu for which the device delivers per
251 * cpu timer events with interrupts disabled! Returns 0 on success,
252 * -ETIME when the event is in the past.
253 */
254int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
255{
256 clockevents_config(dev, freq);
257
258 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
259 return 0;
260
261 return clockevents_program_event(dev, dev->next_event, ktime_get());
262}
263
198/* 264/*
199 * Noop handler when we shut down an event device 265 * Noop handler when we shut down an event device
200 */ 266 */
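clockevents_config() above derives the longest programmable sleep from max_delta_ticks and the device frequency, capping it at 10 minutes for hardware that can program more than 32 bits worth of ticks. A small sketch of just that clamp, with made-up device parameters:

/*
 * Maximum sleep length used to size the mult/shift conversion of a
 * clock event device, as done by clockevents_config().
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t max_sleep_seconds(uint64_t max_delta_ticks, uint32_t freq)
{
	uint64_t sec = max_delta_ticks / freq;

	if (!sec)				/* at least one second */
		sec = 1;
	else if (sec > 600 && max_delta_ticks > UINT32_MAX)
		sec = 600;			/* >32bit counter: cap for precision */
	return sec;
}

int main(void)
{
	/* e.g. a 64bit comparator at 19.2 MHz: capped at 600 s */
	printf("cap: %llu s\n",
	       (unsigned long long)max_sleep_seconds(UINT64_MAX, 19200000));
	/* e.g. a 32bit decrementer at 1 GHz: ~4 s, no cap needed */
	printf("cap: %llu s\n",
	       (unsigned long long)max_sleep_seconds(UINT32_MAX, 1000000000));
	return 0;
}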
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4b..e0980f0d9a0a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
113 * @shift: pointer to shift variable 113 * @shift: pointer to shift variable
114 * @from: frequency to convert from 114 * @from: frequency to convert from
115 * @to: frequency to convert to 115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds 116 * @maxsec: guaranteed runtime conversion range in seconds
117 * 117 *
118 * The function evaluates the shift/mult pair for the scaled math 118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents. 119 * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock 122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC. 123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 * 124 *
125 * The @minsec conversion range argument controls the time frame in 125 * The @maxsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the 126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit 127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is 128 * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
131 * factors. 131 * factors.
132 */ 132 */
133void 133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) 134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
135{ 135{
136 u64 tmp; 136 u64 tmp;
137 u32 sft, sftacc= 32; 137 u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
140 * Calculate the shift factor which is limiting the conversion 140 * Calculate the shift factor which is limiting the conversion
141 * range: 141 * range:
142 */ 142 */
143 tmp = ((u64)minsec * from) >> 32; 143 tmp = ((u64)maxsec * from) >> 32;
144 while (tmp) { 144 while (tmp) {
145 tmp >>=1; 145 tmp >>=1;
146 sftacc--; 146 sftacc--;
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
152 */ 152 */
153 for (sft = 32; sft > 0; sft--) { 153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft; 154 tmp = (u64) to << sft;
155 tmp += from / 2;
155 do_div(tmp, from); 156 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0) 157 if ((tmp >> sftacc) == 0)
157 break; 158 break;
@@ -184,7 +185,6 @@ static struct clocksource *watchdog;
184static struct timer_list watchdog_timer; 185static struct timer_list watchdog_timer;
185static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
186static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
187static cycle_t watchdog_last;
188static int watchdog_running; 188static int watchdog_running;
189 189
190static int clocksource_watchdog_kthread(void *data); 190static int clocksource_watchdog_kthread(void *data);
@@ -253,11 +253,6 @@ static void clocksource_watchdog(unsigned long data)
253 if (!watchdog_running) 253 if (!watchdog_running)
254 goto out; 254 goto out;
255 255
256 wdnow = watchdog->read(watchdog);
257 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
258 watchdog->mult, watchdog->shift);
259 watchdog_last = wdnow;
260
261 list_for_each_entry(cs, &watchdog_list, wd_list) { 256 list_for_each_entry(cs, &watchdog_list, wd_list) {
262 257
263 /* Clocksource already marked unstable? */ 258 /* Clocksource already marked unstable? */
@@ -267,19 +262,28 @@ static void clocksource_watchdog(unsigned long data)
267 continue; 262 continue;
268 } 263 }
269 264
265 local_irq_disable();
270 csnow = cs->read(cs); 266 csnow = cs->read(cs);
267 wdnow = watchdog->read(watchdog);
268 local_irq_enable();
271 269
272 /* Clocksource initialized ? */ 270 /* Clocksource initialized ? */
273 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
274 cs->flags |= CLOCK_SOURCE_WATCHDOG; 272 cs->flags |= CLOCK_SOURCE_WATCHDOG;
275 cs->wd_last = csnow; 273 cs->wd_last = wdnow;
274 cs->cs_last = csnow;
276 continue; 275 continue;
277 } 276 }
278 277
279 /* Check the deviation from the watchdog clocksource. */ 278 wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
280 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & 279 watchdog->mult, watchdog->shift);
280
281 cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
281 cs->mask, cs->mult, cs->shift); 282 cs->mask, cs->mult, cs->shift);
282 cs->wd_last = csnow; 283 cs->cs_last = csnow;
284 cs->wd_last = wdnow;
285
286 /* Check the deviation from the watchdog clocksource. */
283 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { 287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
284 clocksource_unstable(cs, cs_nsec - wd_nsec); 288 clocksource_unstable(cs, cs_nsec - wd_nsec);
285 continue; 289 continue;
@@ -317,7 +321,6 @@ static inline void clocksource_start_watchdog(void)
317 return; 321 return;
318 init_timer(&watchdog_timer); 322 init_timer(&watchdog_timer);
319 watchdog_timer.function = clocksource_watchdog; 323 watchdog_timer.function = clocksource_watchdog;
320 watchdog_last = watchdog->read(watchdog);
321 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 324 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
322 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); 325 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
323 watchdog_running = 1; 326 watchdog_running = 1;
@@ -625,19 +628,6 @@ static void clocksource_enqueue(struct clocksource *cs)
625 list_add(&cs->list, entry); 628 list_add(&cs->list, entry);
626} 629}
627 630
628
629/*
630 * Maximum time we expect to go between ticks. This includes idle
631 * tickless time. It provides the trade off between selecting a
632 * mult/shift pair that is very precise but can only handle a short
633 * period of time, vs. a mult/shift pair that can handle long periods
634 * of time but isn't as precise.
635 *
636 * This is a subsystem constant, and actual hardware limitations
637 * may override it (ie: clocksources that wrap every 3 seconds).
638 */
639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640
641/** 631/**
642 * __clocksource_updatefreq_scale - Used update clocksource with new freq 632 * __clocksource_updatefreq_scale - Used update clocksource with new freq
643 * @t: clocksource to be registered 633 * @t: clocksource to be registered
@@ -651,15 +641,28 @@ static void clocksource_enqueue(struct clocksource *cs)
651 */ 641 */
652void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 642void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653{ 643{
644 u64 sec;
645
654 /* 646 /*
655 * Ideally we want to use some of the limits used in 647 * Calc the maximum number of seconds which we can run before
656 * clocksource_max_deferment, to provide a more informed 648 * wrapping around. For clocksources which have a mask > 32bit
657 * MAX_UPDATE_LENGTH. But for now this just gets the 649 * we need to limit the max sleep time to have a good
658 * register interface working properly. 650 * conversion precision. 10 minutes is still a reasonable
651 * amount. That results in a shift value of 24 for a
652 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
653 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
654 * margin as we do in clocksource_max_deferment()
659 */ 655 */
656 sec = (cs->mask - (cs->mask >> 5));
657 do_div(sec, freq);
658 do_div(sec, scale);
659 if (!sec)
660 sec = 1;
661 else if (sec > 600 && cs->mask > UINT_MAX)
662 sec = 600;
663
660 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 664 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
661 NSEC_PER_SEC/scale, 665 NSEC_PER_SEC / scale, sec * scale);
662 MAX_UPDATE_LENGTH*scale);
663 cs->max_idle_ns = clocksource_max_deferment(cs); 666 cs->max_idle_ns = clocksource_max_deferment(cs);
664} 667}
665EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 668EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
@@ -678,14 +681,14 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 681int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{ 682{
680 683
681 /* Intialize mult/shift and max_idle_ns */ 684 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq); 685 __clocksource_updatefreq_scale(cs, scale, freq);
683 686
684 /* Add clocksource to the clocksource list */ 687 /* Add clocksource to the clocksource list */
685 mutex_lock(&clocksource_mutex); 688 mutex_lock(&clocksource_mutex);
686 clocksource_enqueue(cs); 689 clocksource_enqueue(cs);
687 clocksource_select();
688 clocksource_enqueue_watchdog(cs); 690 clocksource_enqueue_watchdog(cs);
691 clocksource_select();
689 mutex_unlock(&clocksource_mutex); 692 mutex_unlock(&clocksource_mutex);
690 return 0; 693 return 0;
691} 694}
@@ -705,8 +708,8 @@ int clocksource_register(struct clocksource *cs)
705 708
706 mutex_lock(&clocksource_mutex); 709 mutex_lock(&clocksource_mutex);
707 clocksource_enqueue(cs); 710 clocksource_enqueue(cs);
708 clocksource_select();
709 clocksource_enqueue_watchdog(cs); 711 clocksource_enqueue_watchdog(cs);
712 clocksource_select();
710 mutex_unlock(&clocksource_mutex); 713 mutex_unlock(&clocksource_mutex);
711 return 0; 714 return 0;
712} 715}
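The clocks_calc_mult_shift() change above adds round-to-nearest behaviour (tmp += from / 2) when deriving the mult factor. A userspace re-implementation of the routine, converting an assumed 24 MHz counter to nanoseconds:

/*
 * clocks_calc_mult_shift() rewritten for userspace: pick the largest shift
 * whose mult keeps maxsec * from * mult inside 64 bits, rounding the mult
 * to nearest as in the patched kernel version.
 */
#include <stdio.h>
#include <stdint.h>

static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
			    uint32_t from, uint32_t to, uint32_t maxsec)
{
	uint64_t tmp;
	uint32_t sft, sftacc = 32;

	/* limit the shift accuracy so the runtime conversion cannot overflow */
	tmp = ((uint64_t)maxsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}

	/* largest shift whose mult still fits the accuracy limit */
	for (sft = 32; sft > 0; sft--) {
		tmp = (uint64_t)to << sft;
		tmp += from / 2;		/* round to nearest */
		tmp /= from;
		if ((tmp >> sftacc) == 0)
			break;
	}
	*mult = (uint32_t)tmp;
	*shift = sft;
}

int main(void)
{
	uint32_t mult, shift;

	/* cycles at 24 MHz -> nanoseconds, valid for sleeps up to 600 s */
	calc_mult_shift(&mult, &shift, 24000000, 1000000000, 600);
	printf("mult=%u shift=%u: 24000000 cyc -> %llu ns\n", mult, shift,
	       (unsigned long long)(((uint64_t)24000000 * mult) >> shift));
	return 0;
}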
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a8456909..a470154e0408 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
22************************************************************************/ 22************************************************************************/
23#include <linux/clocksource.h> 23#include <linux/clocksource.h>
24#include <linux/jiffies.h> 24#include <linux/jiffies.h>
25#include <linux/module.h>
25#include <linux/init.h> 26#include <linux/init.h>
26 27
28#include "tick-internal.h"
29
27/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on 31 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as 32 * all systems. It has the same coarse resolution as
@@ -31,7 +34,7 @@
31 * inaccuracies caused by missed or lost timer 34 * inaccuracies caused by missed or lost timer
32 * interrupts and the inability for the timer 35 * interrupts and the inability for the timer
33 * interrupt hardware to accurately tick at the 36 * interrupt hardware to accurately tick at the
34 * requested HZ value. It is also not reccomended 37 * requested HZ value. It is also not recommended
35 * for "tick-less" systems. 38 * for "tick-less" systems.
36 */ 39 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
64 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
65}; 68};
66 69
70#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void)
72{
73 unsigned long seq;
74 u64 ret;
75
76 do {
77 seq = read_seqbegin(&xtime_lock);
78 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq));
80 return ret;
81}
82EXPORT_SYMBOL(get_jiffies_64);
83#endif
84
85EXPORT_SYMBOL(jiffies);
86
67static int __init init_jiffies_clocksource(void) 87static int __init init_jiffies_clocksource(void)
68{ 88{
69 return clocksource_register(&clocksource_jiffies); 89 return clocksource_register(&clocksource_jiffies);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c63116863a80..f6117a4c7cb8 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,9 @@
14#include <linux/timex.h> 14#include <linux/timex.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h>
18
19#include "tick-internal.h"
17 20
18/* 21/*
19 * NTP timekeeping variables: 22 * NTP timekeeping variables:
@@ -74,6 +77,162 @@ static long time_adjust;
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 77/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 78static s64 ntp_tick_adj;
76 79
80#ifdef CONFIG_NTP_PPS
81
82/*
83 * The following variables are used when a pulse-per-second (PPS) signal
84 * is available. They establish the engineering parameters of the clock
85 * discipline loop when controlled by the PPS signal.
86 */
87#define PPS_VALID 10 /* PPS signal watchdog max (s) */
88#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
89#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
90#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
91#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
92 increase pps_shift or consecutive bad
93 intervals to decrease it */
94#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
95
96static int pps_valid; /* signal watchdog counter */
97static long pps_tf[3]; /* phase median filter */
98static long pps_jitter; /* current jitter (ns) */
99static struct timespec pps_fbase; /* beginning of the last freq interval */
100static int pps_shift; /* current interval duration (s) (shift) */
101static int pps_intcnt; /* interval counter */
102static s64 pps_freq; /* frequency offset (scaled ns/s) */
103static long pps_stabil; /* current stability (scaled ns/s) */
104
105/*
106 * PPS signal quality monitors
107 */
108static long pps_calcnt; /* calibration intervals */
109static long pps_jitcnt; /* jitter limit exceeded */
110static long pps_stbcnt; /* stability limit exceeded */
111static long pps_errcnt; /* calibration errors */
112
113
114/* PPS kernel consumer compensates the whole phase error immediately.
115 * Otherwise, reduce the offset by a fixed factor times the time constant.
116 */
117static inline s64 ntp_offset_chunk(s64 offset)
118{
119 if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
120 return offset;
121 else
122 return shift_right(offset, SHIFT_PLL + time_constant);
123}
124
125static inline void pps_reset_freq_interval(void)
126{
127 /* the PPS calibration interval may end
128 surprisingly early */
129 pps_shift = PPS_INTMIN;
130 pps_intcnt = 0;
131}
132
133/**
134 * pps_clear - Clears the PPS state variables
135 *
136 * Must be called while holding a write on the xtime_lock
137 */
138static inline void pps_clear(void)
139{
140 pps_reset_freq_interval();
141 pps_tf[0] = 0;
142 pps_tf[1] = 0;
143 pps_tf[2] = 0;
144 pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
145 pps_freq = 0;
146}
147
148/* Decrease pps_valid to indicate that another second has passed since
149 * the last PPS signal. When it reaches 0, indicate that PPS signal is
150 * missing.
151 *
152 * Must be called while holding a write on the xtime_lock
153 */
154static inline void pps_dec_valid(void)
155{
156 if (pps_valid > 0)
157 pps_valid--;
158 else {
159 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
160 STA_PPSWANDER | STA_PPSERROR);
161 pps_clear();
162 }
163}
164
165static inline void pps_set_freq(s64 freq)
166{
167 pps_freq = freq;
168}
169
170static inline int is_error_status(int status)
171{
172 return (time_status & (STA_UNSYNC|STA_CLOCKERR))
173 /* PPS signal lost when either PPS time or
174 * PPS frequency synchronization requested
175 */
176 || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
177 && !(time_status & STA_PPSSIGNAL))
178 /* PPS jitter exceeded when
179 * PPS time synchronization requested */
180 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
181 == (STA_PPSTIME|STA_PPSJITTER))
182 /* PPS wander exceeded or calibration error when
183 * PPS frequency synchronization requested
184 */
185 || ((time_status & STA_PPSFREQ)
186 && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
187}
188
189static inline void pps_fill_timex(struct timex *txc)
190{
191 txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
192 PPM_SCALE_INV, NTP_SCALE_SHIFT);
193 txc->jitter = pps_jitter;
194 if (!(time_status & STA_NANO))
195 txc->jitter /= NSEC_PER_USEC;
196 txc->shift = pps_shift;
197 txc->stabil = pps_stabil;
198 txc->jitcnt = pps_jitcnt;
199 txc->calcnt = pps_calcnt;
200 txc->errcnt = pps_errcnt;
201 txc->stbcnt = pps_stbcnt;
202}
203
204#else /* !CONFIG_NTP_PPS */
205
206static inline s64 ntp_offset_chunk(s64 offset)
207{
208 return shift_right(offset, SHIFT_PLL + time_constant);
209}
210
211static inline void pps_reset_freq_interval(void) {}
212static inline void pps_clear(void) {}
213static inline void pps_dec_valid(void) {}
214static inline void pps_set_freq(s64 freq) {}
215
216static inline int is_error_status(int status)
217{
218 return status & (STA_UNSYNC|STA_CLOCKERR);
219}
220
221static inline void pps_fill_timex(struct timex *txc)
222{
223 /* PPS is not implemented, so these are zero */
224 txc->ppsfreq = 0;
225 txc->jitter = 0;
226 txc->shift = 0;
227 txc->stabil = 0;
228 txc->jitcnt = 0;
229 txc->calcnt = 0;
230 txc->errcnt = 0;
231 txc->stbcnt = 0;
232}
233
234#endif /* CONFIG_NTP_PPS */
235
77/* 236/*
78 * NTP methods: 237 * NTP methods:
79 */ 238 */
@@ -149,10 +308,18 @@ static void ntp_update_offset(long offset)
149 time_reftime = get_seconds(); 308 time_reftime = get_seconds();
150 309
151 offset64 = offset; 310 offset64 = offset;
152 freq_adj = (offset64 * secs) << 311 freq_adj = ntp_update_offset_fll(offset64, secs);
153 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
154 312
155 freq_adj += ntp_update_offset_fll(offset64, secs); 313 /*
314 * Clamp update interval to reduce PLL gain with low
315 * sampling rate (e.g. intermittent network connection)
316 * to avoid instability.
317 */
318 if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
319 secs = 1 << (SHIFT_PLL + 1 + time_constant);
320
321 freq_adj += (offset64 * secs) <<
322 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
156 323
157 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); 324 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
158 325
@@ -177,6 +344,9 @@ void ntp_clear(void)
177 344
178 tick_length = tick_length_base; 345 tick_length = tick_length_base;
179 time_offset = 0; 346 time_offset = 0;
347
348 /* Clear PPS state variables */
349 pps_clear();
180} 350}
181 351
182/* 352/*
@@ -242,16 +412,16 @@ void second_overflow(void)
242 time_status |= STA_UNSYNC; 412 time_status |= STA_UNSYNC;
243 } 413 }
244 414
245 /* 415 /* Compute the phase adjustment for the next second */
246 * Compute the phase adjustment for the next second. The offset is
247 * reduced by a fixed factor times the time constant.
248 */
249 tick_length = tick_length_base; 416 tick_length = tick_length_base;
250 417
251 delta = shift_right(time_offset, SHIFT_PLL + time_constant); 418 delta = ntp_offset_chunk(time_offset);
252 time_offset -= delta; 419 time_offset -= delta;
253 tick_length += delta; 420 tick_length += delta;
254 421
422 /* Check PPS signal */
423 pps_dec_valid();
424
255 if (!time_adjust) 425 if (!time_adjust)
256 return; 426 return;
257 427
@@ -361,6 +531,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
361 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 531 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
362 time_state = TIME_OK; 532 time_state = TIME_OK;
363 time_status = STA_UNSYNC; 533 time_status = STA_UNSYNC;
534 /* restart PPS frequency calibration */
535 pps_reset_freq_interval();
364 } 536 }
365 537
366 /* 538 /*
@@ -410,6 +582,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
410 time_freq = txc->freq * PPM_SCALE; 582 time_freq = txc->freq * PPM_SCALE;
411 time_freq = min(time_freq, MAXFREQ_SCALED); 583 time_freq = min(time_freq, MAXFREQ_SCALED);
412 time_freq = max(time_freq, -MAXFREQ_SCALED); 584 time_freq = max(time_freq, -MAXFREQ_SCALED);
585 /* update pps_freq */
586 pps_set_freq(time_freq);
413 } 587 }
414 588
415 if (txc->modes & ADJ_MAXERROR) 589 if (txc->modes & ADJ_MAXERROR)
@@ -474,6 +648,19 @@ int do_adjtimex(struct timex *txc)
474 hrtimer_cancel(&leap_timer); 648 hrtimer_cancel(&leap_timer);
475 } 649 }
476 650
651 if (txc->modes & ADJ_SETOFFSET) {
652 struct timespec delta;
653 delta.tv_sec = txc->time.tv_sec;
654 delta.tv_nsec = txc->time.tv_usec;
655 if (!capable(CAP_SYS_TIME))
656 return -EPERM;
657 if (!(txc->modes & ADJ_NANO))
658 delta.tv_nsec *= 1000;
659 result = timekeeping_inject_offset(&delta);
660 if (result)
661 return result;
662 }
663
477 getnstimeofday(&ts); 664 getnstimeofday(&ts);
478 665
479 write_seqlock_irq(&xtime_lock); 666 write_seqlock_irq(&xtime_lock);
@@ -500,7 +687,8 @@ int do_adjtimex(struct timex *txc)
500 } 687 }
501 688
502 result = time_state; /* mostly `TIME_OK' */ 689 result = time_state; /* mostly `TIME_OK' */
503 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 690 /* check for errors */
691 if (is_error_status(time_status))
504 result = TIME_ERROR; 692 result = TIME_ERROR;
505 693
506 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 694 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -514,15 +702,8 @@ int do_adjtimex(struct timex *txc)
514 txc->tick = tick_usec; 702 txc->tick = tick_usec;
515 txc->tai = time_tai; 703 txc->tai = time_tai;
516 704
517 /* PPS is not implemented, so these are zero */ 705 /* fill PPS status fields */
518 txc->ppsfreq = 0; 706 pps_fill_timex(txc);
519 txc->jitter = 0;
520 txc->shift = 0;
521 txc->stabil = 0;
522 txc->jitcnt = 0;
523 txc->calcnt = 0;
524 txc->errcnt = 0;
525 txc->stbcnt = 0;
526 707
527 write_sequnlock_irq(&xtime_lock); 708 write_sequnlock_irq(&xtime_lock);
528 709
@@ -536,6 +717,243 @@ int do_adjtimex(struct timex *txc)
536 return result; 717 return result;
537} 718}
538 719
720#ifdef CONFIG_NTP_PPS
721
722/* actually struct pps_normtime is good old struct timespec, but it is
723 * semantically different (and it is the reason why it was invented):
724 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
725 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
726struct pps_normtime {
727 __kernel_time_t sec; /* seconds */
728 long nsec; /* nanoseconds */
729};
730
731/* normalize the timestamp so that nsec is in the
732 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
733static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
734{
735 struct pps_normtime norm = {
736 .sec = ts.tv_sec,
737 .nsec = ts.tv_nsec
738 };
739
740 if (norm.nsec > (NSEC_PER_SEC >> 1)) {
741 norm.nsec -= NSEC_PER_SEC;
742 norm.sec++;
743 }
744
745 return norm;
746}
747
748/* get current phase correction and jitter */
749static inline long pps_phase_filter_get(long *jitter)
750{
751 *jitter = pps_tf[0] - pps_tf[1];
752 if (*jitter < 0)
753 *jitter = -*jitter;
754
755 /* TODO: test various filters */
756 return pps_tf[0];
757}
758
759/* add the sample to the phase filter */
760static inline void pps_phase_filter_add(long err)
761{
762 pps_tf[2] = pps_tf[1];
763 pps_tf[1] = pps_tf[0];
764 pps_tf[0] = err;
765}
766
767/* decrease frequency calibration interval length.
768 * It is halved after four consecutive unstable intervals.
769 */
770static inline void pps_dec_freq_interval(void)
771{
772 if (--pps_intcnt <= -PPS_INTCOUNT) {
773 pps_intcnt = -PPS_INTCOUNT;
774 if (pps_shift > PPS_INTMIN) {
775 pps_shift--;
776 pps_intcnt = 0;
777 }
778 }
779}
780
781/* increase frequency calibration interval length.
782 * It is doubled after four consecutive stable intervals.
783 */
784static inline void pps_inc_freq_interval(void)
785{
786 if (++pps_intcnt >= PPS_INTCOUNT) {
787 pps_intcnt = PPS_INTCOUNT;
788 if (pps_shift < PPS_INTMAX) {
789 pps_shift++;
790 pps_intcnt = 0;
791 }
792 }
793}
794
795/* update clock frequency based on MONOTONIC_RAW clock PPS signal
796 * timestamps
797 *
798 * At the end of the calibration interval the difference between the
799 * first and last MONOTONIC_RAW clock timestamps divided by the length
800 * of the interval becomes the frequency update. If the interval was
801 * too long, the data are discarded.
802 * Returns the difference between old and new frequency values.
803 */
804static long hardpps_update_freq(struct pps_normtime freq_norm)
805{
806 long delta, delta_mod;
807 s64 ftemp;
808
809 /* check if the frequency interval was too long */
810 if (freq_norm.sec > (2 << pps_shift)) {
811 time_status |= STA_PPSERROR;
812 pps_errcnt++;
813 pps_dec_freq_interval();
814 pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
815 freq_norm.sec);
816 return 0;
817 }
818
 819 /* here the raw frequency offset and wander (stability) are
820 * calculated. If the wander is less than the wander threshold
821 * the interval is increased; otherwise it is decreased.
822 */
823 ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
824 freq_norm.sec);
825 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
826 pps_freq = ftemp;
827 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
828 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
829 time_status |= STA_PPSWANDER;
830 pps_stbcnt++;
831 pps_dec_freq_interval();
832 } else { /* good sample */
833 pps_inc_freq_interval();
834 }
835
836 /* the stability metric is calculated as the average of recent
837 * frequency changes, but is used only for performance
838 * monitoring
839 */
840 delta_mod = delta;
841 if (delta_mod < 0)
842 delta_mod = -delta_mod;
843 pps_stabil += (div_s64(((s64)delta_mod) <<
844 (NTP_SCALE_SHIFT - SHIFT_USEC),
845 NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
846
847 /* if enabled, the system clock frequency is updated */
848 if ((time_status & STA_PPSFREQ) != 0 &&
849 (time_status & STA_FREQHOLD) == 0) {
850 time_freq = pps_freq;
851 ntp_update_frequency();
852 }
853
854 return delta;
855}
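
In plain numbers the update above is just the residual phase drift divided by the interval length, scaled into the NTP fixed-point format; a small sketch of that arithmetic (NTP_SCALE_SHIFT assumed to be 32, as in <linux/timex.h>):

#include <stdio.h>
#include <stdint.h>

#define NTP_SCALE_SHIFT 32   /* assumed, from <linux/timex.h> */

int main(void)
{
	/* the raw clock accumulated an extra 500 ns over a 16 s interval */
	int64_t nsec = 500, sec = 16;

	/* cf. hardpps_update_freq(): ftemp = (-nsec << NTP_SCALE_SHIFT) / sec */
	int64_t ftemp = -(nsec << NTP_SCALE_SHIFT) / sec;

	/* back to ns per second (= parts per billion) for readability */
	printf("frequency correction = %lld (scaled), ~%.2f ns/s\n",
	       (long long)ftemp, (double)ftemp / (1LL << NTP_SCALE_SHIFT));
	return 0;
}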
856
857/* correct REALTIME clock phase error against PPS signal */
858static void hardpps_update_phase(long error)
859{
860 long correction = -error;
861 long jitter;
862
863 /* add the sample to the median filter */
864 pps_phase_filter_add(correction);
865 correction = pps_phase_filter_get(&jitter);
866
867 /* Nominal jitter is due to PPS signal noise. If it exceeds the
868 * threshold, the sample is discarded; otherwise, if so enabled,
869 * the time offset is updated.
870 */
871 if (jitter > (pps_jitter << PPS_POPCORN)) {
872 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
873 jitter, (pps_jitter << PPS_POPCORN));
874 time_status |= STA_PPSJITTER;
875 pps_jitcnt++;
876 } else if (time_status & STA_PPSTIME) {
877 /* correct the time using the phase offset */
878 time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
879 NTP_INTERVAL_FREQ);
880 /* cancel running adjtime() */
881 time_adjust = 0;
882 }
883 /* update jitter */
884 pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
885}
886
887/*
888 * hardpps() - discipline CPU clock oscillator to external PPS signal
889 *
890 * This routine is called at each PPS signal arrival in order to
891 * discipline the CPU clock oscillator to the PPS signal. It takes two
892 * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
893 * is used to correct clock phase error and the latter is used to
894 * correct the frequency.
895 *
896 * This code is based on David Mills's reference nanokernel
897 * implementation. It was mostly rewritten but keeps the same idea.
898 */
899void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
900{
901 struct pps_normtime pts_norm, freq_norm;
902 unsigned long flags;
903
904 pts_norm = pps_normalize_ts(*phase_ts);
905
906 write_seqlock_irqsave(&xtime_lock, flags);
907
908 /* clear the error bits, they will be set again if needed */
909 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
910
911 /* indicate signal presence */
912 time_status |= STA_PPSSIGNAL;
913 pps_valid = PPS_VALID;
914
915 /* when called for the first time,
916 * just start the frequency interval */
917 if (unlikely(pps_fbase.tv_sec == 0)) {
918 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags);
920 return;
921 }
922
923 /* ok, now we have a base for frequency calculation */
924 freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
925
926 /* check that the signal is in the range
927 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
928 if ((freq_norm.sec == 0) ||
929 (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
930 (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
931 time_status |= STA_PPSJITTER;
932 /* restart the frequency calibration interval */
933 pps_fbase = *raw_ts;
934 write_sequnlock_irqrestore(&xtime_lock, flags);
935 pr_err("hardpps: PPSJITTER: bad pulse\n");
936 return;
937 }
938
939 /* signal is ok */
940
941 /* check if the current frequency interval is finished */
942 if (freq_norm.sec >= (1 << pps_shift)) {
943 pps_calcnt++;
944 /* restart the frequency calibration interval */
945 pps_fbase = *raw_ts;
946 hardpps_update_freq(freq_norm);
947 }
948
949 hardpps_update_phase(pts_norm.nsec);
950
951 write_sequnlock_irqrestore(&xtime_lock, flags);
952}
953EXPORT_SYMBOL(hardpps);
954
955#endif /* CONFIG_NTP_PPS */
956
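None of the machinery above steers the clock until user space asks for it: hardpps() only applies frequency and phase corrections while STA_PPSFREQ/STA_PPSTIME are set via adjtimex(2). A hedged sketch of switching hard-PPS discipline on (requires CAP_SYS_TIME, CONFIG_NTP_PPS, and a PPS source already bound to the kernel consumer):

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));
	tx.modes = ADJ_STATUS;
	/* let the kernel discipline both frequency and phase from the pulse */
	tx.status = STA_PLL | STA_PPSFREQ | STA_PPSTIME;

	if (adjtimex(&tx) == -1) {
		perror("adjtimex");
		return 1;
	}

	/* pps_fill_timex() reports these fields; print a couple of them */
	printf("status=0x%x jitter=%ld calcnt=%ld errcnt=%ld\n",
	       tx.status, tx.jitter, tx.calcnt, tx.errcnt);
	return 0;
}
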
539static int __init ntp_tick_adj_setup(char *str) 957static int __init ntp_tick_adj_setup(char *str)
540{ 958{
541 ntp_tick_adj = simple_strtol(str, NULL, 0); 959 ntp_tick_adj = simple_strtol(str, NULL, 0);
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 000000000000..c340ca658f37
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,445 @@
1/*
2 * posix-clock.c - support for dynamic clock devices
3 *
4 * Copyright (C) 2010 OMICRON electronics GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#include <linux/device.h>
21#include <linux/file.h>
22#include <linux/posix-clock.h>
23#include <linux/slab.h>
24#include <linux/syscalls.h>
25#include <linux/uaccess.h>
26
27static void delete_clock(struct kref *kref);
28
29/*
30 * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
31 */
32static struct posix_clock *get_posix_clock(struct file *fp)
33{
34 struct posix_clock *clk = fp->private_data;
35
36 down_read(&clk->rwsem);
37
38 if (!clk->zombie)
39 return clk;
40
41 up_read(&clk->rwsem);
42
43 return NULL;
44}
45
46static void put_posix_clock(struct posix_clock *clk)
47{
48 up_read(&clk->rwsem);
49}
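
These two helpers are the entire read-side lifetime rule: take the rwsem shared and bail out if the clock was already zombied by posix_clock_unregister() further down. The same pattern, reduced to a hypothetical user-space object guarded by a pthread rwlock (illustration only, not kernel API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct clock_obj {
	pthread_rwlock_t rwsem;
	bool zombie;              /* set once the backing device is gone */
};

/* reader side: cf. get_posix_clock()/put_posix_clock() */
static struct clock_obj *obj_get(struct clock_obj *c)
{
	pthread_rwlock_rdlock(&c->rwsem);
	if (!c->zombie)
		return c;         /* caller must call obj_put() */
	pthread_rwlock_unlock(&c->rwsem);
	return NULL;
}

static void obj_put(struct clock_obj *c)
{
	pthread_rwlock_unlock(&c->rwsem);
}

/* teardown side: cf. posix_clock_unregister() */
static void obj_kill(struct clock_obj *c)
{
	pthread_rwlock_wrlock(&c->rwsem);
	c->zombie = true;
	pthread_rwlock_unlock(&c->rwsem);
}

int main(void)
{
	struct clock_obj c = { .rwsem = PTHREAD_RWLOCK_INITIALIZER };

	if (obj_get(&c)) {
		printf("before kill: usable\n");
		obj_put(&c);
	}
	obj_kill(&c);
	printf("after kill:  %s\n", obj_get(&c) ? "usable" : "gone");
	return 0;
}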
50
51static ssize_t posix_clock_read(struct file *fp, char __user *buf,
52 size_t count, loff_t *ppos)
53{
54 struct posix_clock *clk = get_posix_clock(fp);
55 int err = -EINVAL;
56
57 if (!clk)
58 return -ENODEV;
59
60 if (clk->ops.read)
61 err = clk->ops.read(clk, fp->f_flags, buf, count);
62
63 put_posix_clock(clk);
64
65 return err;
66}
67
68static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
69{
70 struct posix_clock *clk = get_posix_clock(fp);
71 int result = 0;
72
73 if (!clk)
74 return -ENODEV;
75
76 if (clk->ops.poll)
77 result = clk->ops.poll(clk, fp, wait);
78
79 put_posix_clock(clk);
80
81 return result;
82}
83
84static int posix_clock_fasync(int fd, struct file *fp, int on)
85{
86 struct posix_clock *clk = get_posix_clock(fp);
87 int err = 0;
88
89 if (!clk)
90 return -ENODEV;
91
92 if (clk->ops.fasync)
93 err = clk->ops.fasync(clk, fd, fp, on);
94
95 put_posix_clock(clk);
96
97 return err;
98}
99
100static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
101{
102 struct posix_clock *clk = get_posix_clock(fp);
103 int err = -ENODEV;
104
105 if (!clk)
106 return -ENODEV;
107
108 if (clk->ops.mmap)
109 err = clk->ops.mmap(clk, vma);
110
111 put_posix_clock(clk);
112
113 return err;
114}
115
116static long posix_clock_ioctl(struct file *fp,
117 unsigned int cmd, unsigned long arg)
118{
119 struct posix_clock *clk = get_posix_clock(fp);
120 int err = -ENOTTY;
121
122 if (!clk)
123 return -ENODEV;
124
125 if (clk->ops.ioctl)
126 err = clk->ops.ioctl(clk, cmd, arg);
127
128 put_posix_clock(clk);
129
130 return err;
131}
132
133#ifdef CONFIG_COMPAT
134static long posix_clock_compat_ioctl(struct file *fp,
135 unsigned int cmd, unsigned long arg)
136{
137 struct posix_clock *clk = get_posix_clock(fp);
138 int err = -ENOTTY;
139
140 if (!clk)
141 return -ENODEV;
142
143 if (clk->ops.ioctl)
144 err = clk->ops.ioctl(clk, cmd, arg);
145
146 put_posix_clock(clk);
147
148 return err;
149}
150#endif
151
152static int posix_clock_open(struct inode *inode, struct file *fp)
153{
154 int err;
155 struct posix_clock *clk =
156 container_of(inode->i_cdev, struct posix_clock, cdev);
157
158 down_read(&clk->rwsem);
159
160 if (clk->zombie) {
161 err = -ENODEV;
162 goto out;
163 }
164 if (clk->ops.open)
165 err = clk->ops.open(clk, fp->f_mode);
166 else
167 err = 0;
168
169 if (!err) {
170 kref_get(&clk->kref);
171 fp->private_data = clk;
172 }
173out:
174 up_read(&clk->rwsem);
175 return err;
176}
177
178static int posix_clock_release(struct inode *inode, struct file *fp)
179{
180 struct posix_clock *clk = fp->private_data;
181 int err = 0;
182
183 if (clk->ops.release)
184 err = clk->ops.release(clk);
185
186 kref_put(&clk->kref, delete_clock);
187
188 fp->private_data = NULL;
189
190 return err;
191}
192
193static const struct file_operations posix_clock_file_operations = {
194 .owner = THIS_MODULE,
195 .llseek = no_llseek,
196 .read = posix_clock_read,
197 .poll = posix_clock_poll,
198 .unlocked_ioctl = posix_clock_ioctl,
199 .open = posix_clock_open,
200 .release = posix_clock_release,
201 .fasync = posix_clock_fasync,
202 .mmap = posix_clock_mmap,
203#ifdef CONFIG_COMPAT
204 .compat_ioctl = posix_clock_compat_ioctl,
205#endif
206};
207
208int posix_clock_register(struct posix_clock *clk, dev_t devid)
209{
210 int err;
211
212 kref_init(&clk->kref);
213 init_rwsem(&clk->rwsem);
214
215 cdev_init(&clk->cdev, &posix_clock_file_operations);
216 clk->cdev.owner = clk->ops.owner;
217 err = cdev_add(&clk->cdev, devid, 1);
218
219 return err;
220}
221EXPORT_SYMBOL_GPL(posix_clock_register);
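A driver exposes such a clock by embedding a struct posix_clock, filling in its ops and handing a char-device number to posix_clock_register(); the PTP clock infrastructure is the in-tree user of this interface. A hedged driver-side sketch, with hypothetical names and error handling trimmed:

/* hypothetical driver-side sketch, not part of this patch */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/posix-clock.h>
#include <linux/time.h>

static struct posix_clock my_clock;
static dev_t my_devid;

static int my_gettime(struct posix_clock *pc, struct timespec *ts)
{
	*ts = current_kernel_time();   /* stand-in for reading real hardware */
	return 0;
}

static struct posix_clock_operations my_ops = {
	.owner         = THIS_MODULE,
	.clock_gettime = my_gettime,
};

static int __init my_init(void)
{
	int err = alloc_chrdev_region(&my_devid, 0, 1, "myclock");
	if (err)
		return err;

	my_clock.ops = my_ops;
	return posix_clock_register(&my_clock, my_devid);  /* cdev appears at my_devid */
}

static void __exit my_exit(void)
{
	posix_clock_unregister(&my_clock);
	unregister_chrdev_region(my_devid, 1);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
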
222
223static void delete_clock(struct kref *kref)
224{
225 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
226
227 if (clk->release)
228 clk->release(clk);
229}
230
231void posix_clock_unregister(struct posix_clock *clk)
232{
233 cdev_del(&clk->cdev);
234
235 down_write(&clk->rwsem);
236 clk->zombie = true;
237 up_write(&clk->rwsem);
238
239 kref_put(&clk->kref, delete_clock);
240}
241EXPORT_SYMBOL_GPL(posix_clock_unregister);
242
243struct posix_clock_desc {
244 struct file *fp;
245 struct posix_clock *clk;
246};
247
248static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
249{
250 struct file *fp = fget(CLOCKID_TO_FD(id));
251 int err = -EINVAL;
252
253 if (!fp)
254 return err;
255
256 if (fp->f_op->open != posix_clock_open || !fp->private_data)
257 goto out;
258
259 cd->fp = fp;
260 cd->clk = get_posix_clock(fp);
261
262 err = cd->clk ? 0 : -ENODEV;
263out:
264 if (err)
265 fput(fp);
266 return err;
267}
268
269static void put_clock_desc(struct posix_clock_desc *cd)
270{
271 put_posix_clock(cd->clk);
272 fput(cd->fp);
273}
274
275static int pc_clock_adjtime(clockid_t id, struct timex *tx)
276{
277 struct posix_clock_desc cd;
278 int err;
279
280 err = get_clock_desc(id, &cd);
281 if (err)
282 return err;
283
284 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
285 err = -EACCES;
286 goto out;
287 }
288
289 if (cd.clk->ops.clock_adjtime)
290 err = cd.clk->ops.clock_adjtime(cd.clk, tx);
291 else
292 err = -EOPNOTSUPP;
293out:
294 put_clock_desc(&cd);
295
296 return err;
297}
298
299static int pc_clock_gettime(clockid_t id, struct timespec *ts)
300{
301 struct posix_clock_desc cd;
302 int err;
303
304 err = get_clock_desc(id, &cd);
305 if (err)
306 return err;
307
308 if (cd.clk->ops.clock_gettime)
309 err = cd.clk->ops.clock_gettime(cd.clk, ts);
310 else
311 err = -EOPNOTSUPP;
312
313 put_clock_desc(&cd);
314
315 return err;
316}
317
318static int pc_clock_getres(clockid_t id, struct timespec *ts)
319{
320 struct posix_clock_desc cd;
321 int err;
322
323 err = get_clock_desc(id, &cd);
324 if (err)
325 return err;
326
327 if (cd.clk->ops.clock_getres)
328 err = cd.clk->ops.clock_getres(cd.clk, ts);
329 else
330 err = -EOPNOTSUPP;
331
332 put_clock_desc(&cd);
333
334 return err;
335}
336
337static int pc_clock_settime(clockid_t id, const struct timespec *ts)
338{
339 struct posix_clock_desc cd;
340 int err;
341
342 err = get_clock_desc(id, &cd);
343 if (err)
344 return err;
345
346 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
347 err = -EACCES;
348 goto out;
349 }
350
351 if (cd.clk->ops.clock_settime)
352 err = cd.clk->ops.clock_settime(cd.clk, ts);
353 else
354 err = -EOPNOTSUPP;
355out:
356 put_clock_desc(&cd);
357
358 return err;
359}
360
361static int pc_timer_create(struct k_itimer *kit)
362{
363 clockid_t id = kit->it_clock;
364 struct posix_clock_desc cd;
365 int err;
366
367 err = get_clock_desc(id, &cd);
368 if (err)
369 return err;
370
371 if (cd.clk->ops.timer_create)
372 err = cd.clk->ops.timer_create(cd.clk, kit);
373 else
374 err = -EOPNOTSUPP;
375
376 put_clock_desc(&cd);
377
378 return err;
379}
380
381static int pc_timer_delete(struct k_itimer *kit)
382{
383 clockid_t id = kit->it_clock;
384 struct posix_clock_desc cd;
385 int err;
386
387 err = get_clock_desc(id, &cd);
388 if (err)
389 return err;
390
391 if (cd.clk->ops.timer_delete)
392 err = cd.clk->ops.timer_delete(cd.clk, kit);
393 else
394 err = -EOPNOTSUPP;
395
396 put_clock_desc(&cd);
397
398 return err;
399}
400
401static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
402{
403 clockid_t id = kit->it_clock;
404 struct posix_clock_desc cd;
405
406 if (get_clock_desc(id, &cd))
407 return;
408
409 if (cd.clk->ops.timer_gettime)
410 cd.clk->ops.timer_gettime(cd.clk, kit, ts);
411
412 put_clock_desc(&cd);
413}
414
415static int pc_timer_settime(struct k_itimer *kit, int flags,
416 struct itimerspec *ts, struct itimerspec *old)
417{
418 clockid_t id = kit->it_clock;
419 struct posix_clock_desc cd;
420 int err;
421
422 err = get_clock_desc(id, &cd);
423 if (err)
424 return err;
425
426 if (cd.clk->ops.timer_settime)
427 err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
428 else
429 err = -EOPNOTSUPP;
430
431 put_clock_desc(&cd);
432
433 return err;
434}
435
436struct k_clock clock_posix_dynamic = {
437 .clock_getres = pc_clock_getres,
438 .clock_set = pc_clock_settime,
439 .clock_get = pc_clock_gettime,
440 .clock_adj = pc_clock_adjtime,
441 .timer_create = pc_timer_create,
442 .timer_set = pc_timer_settime,
443 .timer_del = pc_timer_delete,
444 .timer_get = pc_timer_gettime,
445};
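The clockid reaching these pc_* handlers is not a static CLOCK_* constant; it encodes an open file descriptor of the clock's character device (CLOCKID_TO_FD above). User space builds such an id with the inverse encoding, conventionally called FD_TO_CLOCKID; a sketch, with /dev/ptp0 as a stand-in device path:

#include <fcntl.h>
#include <stdio.h>
#include <time.h>

/* dynamic clock ids: fd packed into the clockid, low bits = CLOCKFD (3) */
#define CLOCKFD 3
#define FD_TO_CLOCKID(fd) ((~(clockid_t)(fd) << 3) | CLOCKFD)

int main(void)
{
	int fd = open("/dev/ptp0", O_RDWR);    /* any posix_clock chardev */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	struct timespec ts;
	clockid_t clkid = FD_TO_CLOCKID(fd);

	/* ends up in pc_clock_gettime() via clock_posix_dynamic */
	if (clock_gettime(clkid, &ts) == 0)
		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	else
		perror("clock_gettime");
	return 0;
}

On older glibc, link with -lrt for clock_gettime().
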
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761b5668..c7218d132738 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -457,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason)
457 unsigned long flags; 456 unsigned long flags;
458 int cpu; 457 int cpu;
459 458
460 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
461
462 /* 459 /*
463 * Periodic mode does not care about the enter/exit of power 460 * Periodic mode does not care about the enter/exit of power
464 * states 461 * states
465 */ 462 */
466 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 463 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
467 goto out; 464 return;
468 465
469 bc = tick_broadcast_device.evtdev; 466 /*
467 * We are called with preemtion disabled from the depth of the
468 * idle code, so we can't be moved away.
469 */
470 cpu = smp_processor_id(); 470 cpu = smp_processor_id();
471 td = &per_cpu(tick_cpu_device, cpu); 471 td = &per_cpu(tick_cpu_device, cpu);
472 dev = td->evtdev; 472 dev = td->evtdev;
473 473
474 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) 474 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
475 goto out; 475 return;
476
477 bc = tick_broadcast_device.evtdev;
476 478
479 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
477 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 480 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
478 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 481 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
479 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); 482 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
@@ -490,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
490 tick_program_event(dev->next_event, 1); 493 tick_program_event(dev->next_event, 1);
491 } 494 }
492 } 495 }
493
494out:
495 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 496 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
496} 497}
497 498
@@ -523,10 +524,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
523 */ 524 */
524void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 525void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
525{ 526{
527 int cpu = smp_processor_id();
528
526 /* Set it up only once ! */ 529 /* Set it up only once ! */
527 if (bc->event_handler != tick_handle_oneshot_broadcast) { 530 if (bc->event_handler != tick_handle_oneshot_broadcast) {
528 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 531 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
529 int cpu = smp_processor_id();
530 532
531 bc->event_handler = tick_handle_oneshot_broadcast; 533 bc->event_handler = tick_handle_oneshot_broadcast;
532 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 534 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -552,6 +554,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
552 tick_broadcast_set_event(tick_next_period, 1); 554 tick_broadcast_set_event(tick_next_period, 1);
553 } else 555 } else
554 bc->next_event.tv64 = KTIME_MAX; 556 bc->next_event.tv64 = KTIME_MAX;
557 } else {
558 /*
559 * The first cpu which switches to oneshot mode sets
560 * the bit for all other cpus which are in the general
561 * (periodic) broadcast mask. So the bit is set and
 562 * would prevent the first broadcast-enter event after this
 563 * from programming the bc device.
564 */
565 tick_broadcast_clear_oneshot(cpu);
555 } 566 }
556} 567}
557 568
@@ -600,4 +611,14 @@ int tick_broadcast_oneshot_active(void)
600 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; 611 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
601} 612}
602 613
614/*
615 * Check whether the broadcast device supports oneshot.
616 */
617bool tick_broadcast_oneshot_available(void)
618{
619 struct clock_event_device *bc = tick_broadcast_device.evtdev;
620
621 return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
622}
623
603#endif 624#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eeef..119528de8235 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include <asm/irq_regs.h> 22#include <asm/irq_regs.h>
24 23
@@ -49,9 +48,13 @@ struct tick_device *tick_get_device(int cpu)
49 */ 48 */
50int tick_is_oneshot_available(void) 49int tick_is_oneshot_available(void)
51{ 50{
52 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 51 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 52
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 53 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
54 return 0;
55 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
56 return 1;
57 return tick_broadcast_oneshot_available();
55} 58}
56 59
57/* 60/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f60..1009b06d6f89 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4#include <linux/hrtimer.h>
5#include <linux/tick.h>
6
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
4 8
5#define TICK_DO_TIMER_NONE -1 9#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2 10#define TICK_DO_TIMER_BOOT -2
@@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
36extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 40extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
37extern int tick_broadcast_oneshot_active(void); 41extern int tick_broadcast_oneshot_active(void);
38extern void tick_check_oneshot_broadcast(int cpu); 42extern void tick_check_oneshot_broadcast(int cpu);
43bool tick_broadcast_oneshot_available(void);
39# else /* BROADCAST */ 44# else /* BROADCAST */
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 45static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
41{ 46{
@@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 51static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; } 52static inline int tick_broadcast_oneshot_active(void) { return 0; }
48static inline void tick_check_oneshot_broadcast(int cpu) { } 53static inline void tick_check_oneshot_broadcast(int cpu) { }
54static inline bool tick_broadcast_oneshot_available(void) { return true; }
49# endif /* !BROADCAST */ 55# endif /* !BROADCAST */
50 56
51#else /* !ONESHOT */ 57#else /* !ONESHOT */
@@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
76 return 0; 82 return 0;
77} 83}
78static inline int tick_broadcast_oneshot_active(void) { return 0; } 84static inline int tick_broadcast_oneshot_active(void) { return 0; }
85static inline bool tick_broadcast_oneshot_available(void) { return false; }
79#endif /* !TICK_ONESHOT */ 86#endif /* !TICK_ONESHOT */
80 87
81/* 88/*
@@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
132{ 139{
133 return !(dev->features & CLOCK_EVT_FEAT_DUMMY); 140 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
134} 141}
142
143#endif
144
145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index aada0e52680a..2d04411a5f05 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -95,7 +94,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
95 */ 94 */
96int tick_program_event(ktime_t expires, int force) 95int tick_program_event(ktime_t expires, int force)
97{ 96{
98 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 97 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
99 98
100 return tick_dev_program_event(dev, expires, force); 99 return tick_dev_program_event(dev, expires, force);
101} 100}
@@ -167,7 +166,7 @@ int tick_oneshot_mode_active(void)
167 int ret; 166 int ret;
168 167
169 local_irq_save(flags); 168 local_irq_save(flags);
170 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; 169 ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
171 local_irq_restore(flags); 170 local_irq_restore(flags);
172 171
173 return ret; 172 return ret;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index bb2d8b7850a3..0c0e02f1b819 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h>
23#include <linux/module.h> 22#include <linux/module.h>
24 23
25#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
@@ -642,8 +641,7 @@ static void tick_nohz_switch_to_nohz(void)
642 } 641 }
643 local_irq_enable(); 642 local_irq_enable();
644 643
645 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", 644 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
646 smp_processor_id());
647} 645}
648 646
649/* 647/*
@@ -842,8 +840,10 @@ void tick_setup_sched_timer(void)
842 } 840 }
843 841
844#ifdef CONFIG_NO_HZ 842#ifdef CONFIG_NO_HZ
845 if (tick_nohz_enabled) 843 if (tick_nohz_enabled) {
846 ts->nohz_mode = NOHZ_MODE_HIGHRES; 844 ts->nohz_mode = NOHZ_MODE_HIGHRES;
845 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
846 }
847#endif 847#endif
848} 848}
849#endif /* HIGH_RES_TIMERS */ 849#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index ac38fbb176cc..a9ae369925ce 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/math64.h> 23#include <linux/math64.h>
24#include <linux/kernel.h>
24 25
25/* 26/*
26 * fixed point arithmetic scale factor for skew 27 * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
57 int index; 58 int index;
58 int num_samples = sync->num_samples; 59 int num_samples = sync->num_samples;
59 60
60 if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { 61 if (num_samples > ARRAY_SIZE(buffer)) {
61 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); 62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
62 if (!samples) { 63 if (!samples) {
63 samples = buffer; 64 samples = buffer;
64 num_samples = sizeof(buffer)/sizeof(buffer[0]); 65 num_samples = ARRAY_SIZE(buffer);
65 } 66 }
66 } else { 67 } else {
67 samples = buffer; 68 samples = buffer;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49010d822f72..342408cf68dd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -14,7 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/sysdev.h> 17#include <linux/syscore_ops.h>
18#include <linux/clocksource.h> 18#include <linux/clocksource.h>
19#include <linux/jiffies.h> 19#include <linux/jiffies.h>
20#include <linux/time.h> 20#include <linux/time.h>
@@ -32,6 +32,8 @@ struct timekeeper {
32 cycle_t cycle_interval; 32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */ 33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval; 34 u64 xtime_interval;
35 /* shifted nano seconds left over when rounding cycle_interval */
36 s64 xtime_remainder;
35 /* Raw nano seconds accumulated per NTP interval. */ 37 /* Raw nano seconds accumulated per NTP interval. */
36 u32 raw_interval; 38 u32 raw_interval;
37 39
@@ -47,7 +49,7 @@ struct timekeeper {
47 u32 mult; 49 u32 mult;
48}; 50};
49 51
50struct timekeeper timekeeper; 52static struct timekeeper timekeeper;
51 53
52/** 54/**
53 * timekeeper_setup_internals - Set up internals to use clocksource clock. 55 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
62static void timekeeper_setup_internals(struct clocksource *clock) 64static void timekeeper_setup_internals(struct clocksource *clock)
63{ 65{
64 cycle_t interval; 66 cycle_t interval;
65 u64 tmp; 67 u64 tmp, ntpinterval;
66 68
67 timekeeper.clock = clock; 69 timekeeper.clock = clock;
68 clock->cycle_last = clock->read(clock); 70 clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
70 /* Do the ns -> cycle conversion first, using original mult */ 72 /* Do the ns -> cycle conversion first, using original mult */
71 tmp = NTP_INTERVAL_LENGTH; 73 tmp = NTP_INTERVAL_LENGTH;
72 tmp <<= clock->shift; 74 tmp <<= clock->shift;
75 ntpinterval = tmp;
73 tmp += clock->mult/2; 76 tmp += clock->mult/2;
74 do_div(tmp, clock->mult); 77 do_div(tmp, clock->mult);
75 if (tmp == 0) 78 if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
80 83
81 /* Go back from cycles -> shifted ns */ 84 /* Go back from cycles -> shifted ns */
82 timekeeper.xtime_interval = (u64) interval * clock->mult; 85 timekeeper.xtime_interval = (u64) interval * clock->mult;
86 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
83 timekeeper.raw_interval = 87 timekeeper.raw_interval =
84 ((u64) interval * clock->mult) >> clock->shift; 88 ((u64) interval * clock->mult) >> clock->shift;
85 89
@@ -160,7 +164,7 @@ static struct timespec total_sleep_time;
160/* 164/*
161 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
162 */ 166 */
163struct timespec raw_time; 167static struct timespec raw_time;
164 168
165/* flag for if timekeeping is suspended */ 169/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 170int __read_mostly timekeeping_suspended;
@@ -284,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
284} 288}
285EXPORT_SYMBOL_GPL(ktime_get_ts); 289EXPORT_SYMBOL_GPL(ktime_get_ts);
286 290
291#ifdef CONFIG_NTP_PPS
292
293/**
294 * getnstime_raw_and_real - get day and raw monotonic time in timespec format
295 * @ts_raw: pointer to the timespec to be set to raw monotonic time
296 * @ts_real: pointer to the timespec to be set to the time of day
297 *
298 * This function reads both the time of day and raw monotonic time at the
299 * same time atomically and stores the resulting timestamps in timespec
300 * format.
301 */
302void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
303{
304 unsigned long seq;
305 s64 nsecs_raw, nsecs_real;
306
307 WARN_ON_ONCE(timekeeping_suspended);
308
309 do {
310 u32 arch_offset;
311
312 seq = read_seqbegin(&xtime_lock);
313
314 *ts_raw = raw_time;
315 *ts_real = xtime;
316
317 nsecs_raw = timekeeping_get_ns_raw();
318 nsecs_real = timekeeping_get_ns();
319
320 /* If arch requires, add in gettimeoffset() */
321 arch_offset = arch_gettimeoffset();
322 nsecs_raw += arch_offset;
323 nsecs_real += arch_offset;
324
325 } while (read_seqretry(&xtime_lock, seq));
326
327 timespec_add_ns(ts_raw, nsecs_raw);
328 timespec_add_ns(ts_real, nsecs_real);
329}
330EXPORT_SYMBOL(getnstime_raw_and_real);
331
332#endif /* CONFIG_NTP_PPS */
333
287/** 334/**
288 * do_gettimeofday - Returns the time of day in a timeval 335 * do_gettimeofday - Returns the time of day in a timeval
289 * @tv: pointer to the timeval to be set 336 * @tv: pointer to the timeval to be set
@@ -306,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
306 * 353 *
307 * Sets the time of day to the new time and update NTP and notify hrtimers 354 * Sets the time of day to the new time and update NTP and notify hrtimers
308 */ 355 */
309int do_settimeofday(struct timespec *tv) 356int do_settimeofday(const struct timespec *tv)
310{ 357{
311 struct timespec ts_delta; 358 struct timespec ts_delta;
312 unsigned long flags; 359 unsigned long flags;
@@ -340,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
340 387
341EXPORT_SYMBOL(do_settimeofday); 388EXPORT_SYMBOL(do_settimeofday);
342 389
390
391/**
392 * timekeeping_inject_offset - Adds or subtracts from the current time.
393 * @tv: pointer to the timespec variable containing the offset
394 *
395 * Adds or subtracts an offset value from the current time.
396 */
397int timekeeping_inject_offset(struct timespec *ts)
398{
399 unsigned long flags;
400
401 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
402 return -EINVAL;
403
404 write_seqlock_irqsave(&xtime_lock, flags);
405
406 timekeeping_forward_now();
407
408 xtime = timespec_add(xtime, *ts);
409 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
410
411 timekeeper.ntp_error = 0;
412 ntp_clear();
413
414 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
415 timekeeper.mult);
416
417 write_sequnlock_irqrestore(&xtime_lock, flags);
418
419 /* signal hrtimers about time change */
420 clock_was_set();
421
422 return 0;
423}
424EXPORT_SYMBOL(timekeeping_inject_offset);
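timekeeping_inject_offset() is the backend for the ADJ_SETOFFSET mode of adjtimex(2)/clock_adjtime(2), which steps the clock by a signed delta instead of slewing it. A hedged user-space sketch (requires CAP_SYS_TIME; ADJ_SETOFFSET/ADJ_NANO assumed present in the installed headers):

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));
	tx.modes = ADJ_SETOFFSET | ADJ_NANO;  /* tx.time.tv_usec carries nanoseconds */
	tx.time.tv_sec  = 0;
	tx.time.tv_usec = 500000000;          /* step the clock forward by 0.5 s */

	if (adjtimex(&tx) == -1) {
		perror("adjtimex(ADJ_SETOFFSET)");
		return 1;
	}
	printf("clock stepped\n");
	return 0;
}
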
425
343/** 426/**
344 * change_clocksource - Swaps clocksources if a new one is available 427 * change_clocksource - Swaps clocksources if a new one is available
345 * 428 *
@@ -513,14 +596,65 @@ void __init timekeeping_init(void)
513static struct timespec timekeeping_suspend_time; 596static struct timespec timekeeping_suspend_time;
514 597
515/** 598/**
599 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
600 * @delta: pointer to a timespec delta value
601 *
602 * Takes a timespec offset measuring a suspend interval and properly
603 * adds the sleep offset to the timekeeping variables.
604 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{
607 xtime = timespec_add(xtime, *delta);
608 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
609 total_sleep_time = timespec_add(total_sleep_time, *delta);
610}
611
612
613/**
 614 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
615 * @delta: pointer to a timespec delta value
616 *
617 * This hook is for architectures that cannot support read_persistent_clock
618 * because their RTC/persistent clock is only accessible when irqs are enabled.
619 *
620 * This function should only be called by rtc_resume(), and allows
621 * a suspend offset to be injected into the timekeeping values.
622 */
623void timekeeping_inject_sleeptime(struct timespec *delta)
624{
625 unsigned long flags;
626 struct timespec ts;
627
628 /* Make sure we don't set the clock twice */
629 read_persistent_clock(&ts);
630 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
631 return;
632
633 write_seqlock_irqsave(&xtime_lock, flags);
634 timekeeping_forward_now();
635
636 __timekeeping_inject_sleeptime(delta);
637
638 timekeeper.ntp_error = 0;
639 ntp_clear();
640 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
641 timekeeper.mult);
642
643 write_sequnlock_irqrestore(&xtime_lock, flags);
644
645 /* signal hrtimers about time change */
646 clock_was_set();
647}
648
649
650/**
516 * timekeeping_resume - Resumes the generic timekeeping subsystem. 651 * timekeeping_resume - Resumes the generic timekeeping subsystem.
517 * @dev: unused
518 * 652 *
519 * This is for the generic clocksource timekeeping. 653 * This is for the generic clocksource timekeeping.
520 * xtime/wall_to_monotonic/jiffies/etc are 654 * xtime/wall_to_monotonic/jiffies/etc are
521 * still managed by arch specific suspend/resume code. 655 * still managed by arch specific suspend/resume code.
522 */ 656 */
523static int timekeeping_resume(struct sys_device *dev) 657static void timekeeping_resume(void)
524{ 658{
525 unsigned long flags; 659 unsigned long flags;
526 struct timespec ts; 660 struct timespec ts;
@@ -533,9 +667,7 @@ static int timekeeping_resume(struct sys_device *dev)
533 667
534 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 668 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
535 ts = timespec_sub(ts, timekeeping_suspend_time); 669 ts = timespec_sub(ts, timekeeping_suspend_time);
536 xtime = timespec_add(xtime, ts); 670 __timekeeping_inject_sleeptime(&ts);
537 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
538 total_sleep_time = timespec_add(total_sleep_time, ts);
539 } 671 }
540 /* re-base the last cycle value */ 672 /* re-base the last cycle value */
541 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 673 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -548,12 +680,10 @@ static int timekeeping_resume(struct sys_device *dev)
548 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); 680 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
549 681
550 /* Resume hrtimers */ 682 /* Resume hrtimers */
551 hres_timers_resume(); 683 hrtimers_resume();
552
553 return 0;
554} 684}
555 685
556static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) 686static int timekeeping_suspend(void)
557{ 687{
558 unsigned long flags; 688 unsigned long flags;
559 689
@@ -571,26 +701,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
571} 701}
572 702
573/* sysfs resume/suspend bits for timekeeping */ 703/* sysfs resume/suspend bits for timekeeping */
574static struct sysdev_class timekeeping_sysclass = { 704static struct syscore_ops timekeeping_syscore_ops = {
575 .name = "timekeeping",
576 .resume = timekeeping_resume, 705 .resume = timekeeping_resume,
577 .suspend = timekeeping_suspend, 706 .suspend = timekeeping_suspend,
578}; 707};
579 708
580static struct sys_device device_timer = { 709static int __init timekeeping_init_ops(void)
581 .id = 0,
582 .cls = &timekeeping_sysclass,
583};
584
585static int __init timekeeping_init_device(void)
586{ 710{
587 int error = sysdev_class_register(&timekeeping_sysclass); 711 register_syscore_ops(&timekeeping_syscore_ops);
588 if (!error) 712 return 0;
589 error = sysdev_register(&device_timer);
590 return error;
591} 713}
592 714
593device_initcall(timekeeping_init_device); 715device_initcall(timekeeping_init_ops);
594 716
595/* 717/*
596 * If the error is already larger, we look ahead even further 718 * If the error is already larger, we look ahead even further
@@ -719,7 +841,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
719 841
720 /* Accumulate error between NTP and clock interval */ 842 /* Accumulate error between NTP and clock interval */
721 timekeeper.ntp_error += tick_length << shift; 843 timekeeper.ntp_error += tick_length << shift;
722 timekeeper.ntp_error -= timekeeper.xtime_interval << 844 timekeeper.ntp_error -=
845 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
723 (timekeeper.ntp_error_shift + shift); 846 (timekeeper.ntp_error_shift + shift);
724 847
725 return offset; 848 return offset;
@@ -731,7 +854,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
731 * 854 *
732 * Called from the timer interrupt, must hold a write on xtime_lock. 855 * Called from the timer interrupt, must hold a write on xtime_lock.
733 */ 856 */
734void update_wall_time(void) 857static void update_wall_time(void)
735{ 858{
736 struct clocksource *clock; 859 struct clocksource *clock;
737 cycle_t offset; 860 cycle_t offset;
@@ -823,7 +946,7 @@ void update_wall_time(void)
823 * getboottime - Return the real time of system boot. 946 * getboottime - Return the real time of system boot.
824 * @ts: pointer to the timespec to be set 947 * @ts: pointer to the timespec to be set
825 * 948 *
826 * Returns the time of day in a timespec. 949 * Returns the wall-time of boot in a timespec.
827 * 950 *
828 * This is based on the wall_to_monotonic offset and the total suspend 951 * This is based on the wall_to_monotonic offset and the total suspend
829 * time. Calls to settimeofday will affect the value returned (which 952 * time. Calls to settimeofday will affect the value returned (which
@@ -841,6 +964,55 @@ void getboottime(struct timespec *ts)
841} 964}
842EXPORT_SYMBOL_GPL(getboottime); 965EXPORT_SYMBOL_GPL(getboottime);
843 966
967
968/**
969 * get_monotonic_boottime - Returns monotonic time since boot
970 * @ts: pointer to the timespec to be set
971 *
972 * Returns the monotonic time since boot in a timespec.
973 *
 974 * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
975 * includes the time spent in suspend.
976 */
977void get_monotonic_boottime(struct timespec *ts)
978{
979 struct timespec tomono, sleep;
980 unsigned int seq;
981 s64 nsecs;
982
983 WARN_ON(timekeeping_suspended);
984
985 do {
986 seq = read_seqbegin(&xtime_lock);
987 *ts = xtime;
988 tomono = wall_to_monotonic;
989 sleep = total_sleep_time;
990 nsecs = timekeeping_get_ns();
991
992 } while (read_seqretry(&xtime_lock, seq));
993
994 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
995 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
996}
997EXPORT_SYMBOL_GPL(get_monotonic_boottime);
998
999/**
1000 * ktime_get_boottime - Returns monotonic time since boot in a ktime
1001 *
1002 * Returns the monotonic time since boot in a ktime
1003 *
 1004 * This is similar to CLOCK_MONOTONIC/ktime_get, but also
1005 * includes the time spent in suspend.
1006 */
1007ktime_t ktime_get_boottime(void)
1008{
1009 struct timespec ts;
1010
1011 get_monotonic_boottime(&ts);
1012 return timespec_to_ktime(ts);
1013}
1014EXPORT_SYMBOL_GPL(ktime_get_boottime);
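This pair is what backs the CLOCK_BOOTTIME posix clock: the same monotonic timeline, but including time spent in suspend. A small user-space comparison (CLOCK_BOOTTIME is assumed to be exposed by the installed headers; it appeared alongside this code):

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7   /* assumed value if the libc headers predate it */
#endif

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);

	/* the difference is (roughly) the total time spent in suspend */
	printf("monotonic: %lld.%09ld\n", (long long)mono.tv_sec, mono.tv_nsec);
	printf("boottime:  %lld.%09ld\n", (long long)boot.tv_sec, boot.tv_nsec);
	return 0;
}
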
1015
844/** 1016/**
845 * monotonic_to_bootbased - Convert the monotonic time to boot based. 1017 * monotonic_to_bootbased - Convert the monotonic time to boot based.
846 * @ts: pointer to the timespec to be converted 1018 * @ts: pointer to the timespec to be converted
@@ -862,11 +1034,6 @@ struct timespec __current_kernel_time(void)
862 return xtime; 1034 return xtime;
863} 1035}
864 1036
865struct timespec __get_wall_to_monotonic(void)
866{
867 return wall_to_monotonic;
868}
869
870struct timespec current_kernel_time(void) 1037struct timespec current_kernel_time(void)
871{ 1038{
872 struct timespec now; 1039 struct timespec now;
@@ -898,3 +1065,63 @@ struct timespec get_monotonic_coarse(void)
898 now.tv_nsec + mono.tv_nsec); 1065 now.tv_nsec + mono.tv_nsec);
899 return now; 1066 return now;
900} 1067}
1068
1069/*
1070 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1071 * without sampling the sequence number in xtime_lock.
1072 * jiffies is defined in the linker script...
1073 */
1074void do_timer(unsigned long ticks)
1075{
1076 jiffies_64 += ticks;
1077 update_wall_time();
1078 calc_global_load(ticks);
1079}
1080
1081/**
1082 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
1083 * and sleep offsets.
1084 * @xtim: pointer to timespec to be set with xtime
1085 * @wtom: pointer to timespec to be set with wall_to_monotonic
1086 * @sleep: pointer to timespec to be set with time in suspend
1087 */
1088void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1089 struct timespec *wtom, struct timespec *sleep)
1090{
1091 unsigned long seq;
1092
1093 do {
1094 seq = read_seqbegin(&xtime_lock);
1095 *xtim = xtime;
1096 *wtom = wall_to_monotonic;
1097 *sleep = total_sleep_time;
1098 } while (read_seqretry(&xtime_lock, seq));
1099}
1100
1101/**
1102 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1103 */
1104ktime_t ktime_get_monotonic_offset(void)
1105{
1106 unsigned long seq;
1107 struct timespec wtom;
1108
1109 do {
1110 seq = read_seqbegin(&xtime_lock);
1111 wtom = wall_to_monotonic;
1112 } while (read_seqretry(&xtime_lock, seq));
1113 return timespec_to_ktime(wtom);
1114}
1115
1116/**
1117 * xtime_update() - advances the timekeeping infrastructure
1118 * @ticks: number of ticks, that have elapsed since the last call.
1119 *
1120 * Must be called with interrupts disabled.
1121 */
1122void xtime_update(unsigned long ticks)
1123{
1124 write_seqlock(&xtime_lock);
1125 do_timer(ticks);
1126 write_sequnlock(&xtime_lock);
1127}
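
The reading rule spelled out above do_timer() is what get_jiffies_64() implements for jiffies_64 consumers; roughly (sketched from memory rather than quoted from kernel/time/jiffies.c):

u64 get_jiffies_64(void)
{
	unsigned long seq;
	u64 ret;

	do {
		/* retry if a writer (xtime_update() above) raced with us */
		seq = read_seqbegin(&xtime_lock);
		ret = jiffies_64;
	} while (read_seqretry(&xtime_lock, seq));
	return ret;
}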
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ab8f5e33fa92..3258455549f4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym)
41 char symname[KSYM_NAME_LEN]; 41 char symname[KSYM_NAME_LEN];
42 42
43 if (lookup_symbol_name((unsigned long)sym, symname) < 0) 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%pK>", sym);
45 else 45 else
46 SEQ_printf(m, "%s", symname); 46 SEQ_printf(m, "%s", symname);
47} 47}
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
79{ 79{
80 struct hrtimer *timer, tmp; 80 struct hrtimer *timer, tmp;
81 unsigned long next = 0, i; 81 unsigned long next = 0, i;
82 struct rb_node *curr; 82 struct timerqueue_node *curr;
83 unsigned long flags; 83 unsigned long flags;
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = timerqueue_getnext(&base->active);
90 /* 90 /*
91 * Crude but we have to do this O(N*N) thing, because 91 * Crude but we have to do this O(N*N) thing, because
92 * we have to unlock the base when printing: 92 * we have to unlock the base when printing:
93 */ 93 */
94 while (curr && i < next) { 94 while (curr && i < next) {
95 curr = rb_next(curr); 95 curr = timerqueue_iterate_next(curr);
96 i++; 96 i++;
97 } 97 }
98 98
99 if (curr) { 99 if (curr) {
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = container_of(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
@@ -112,7 +112,7 @@ next_one:
112static void 112static void
113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
114{ 114{
115 SEQ_printf(m, " .base: %p\n", base); 115 SEQ_printf(m, " .base: %pK\n", base);
116 SEQ_printf(m, " .index: %d\n", 116 SEQ_printf(m, " .index: %d\n",
117 base->index); 117 base->index);
118 SEQ_printf(m, " .resolution: %Lu nsecs\n", 118 SEQ_printf(m, " .resolution: %Lu nsecs\n",
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 2f3b585b8d7d..a5d0a3a85dd8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
236 unsigned int timer_flag) 236 unsigned int timer_flag)
237{ 237{
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesn't matter which lock we take:
240 */ 240 */
241 raw_spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade7..8cff36119e4d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43 43
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
88EXPORT_SYMBOL(boot_tvec_bases); 88EXPORT_SYMBOL(boot_tvec_bases);
89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; 89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
100 */
101#define TBASE_DEFERRABLE_FLAG (0x1)
102
103/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
104static inline unsigned int tbase_get_deferrable(struct tvec_base *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
105{ 93{
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
113 101
114static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
115{ 103{
116 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | 104 timer->base = TBASE_MAKE_DEFERRED(timer->base);
117 TBASE_DEFERRABLE_FLAG));
118} 105}
119 106
120static inline void 107static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
343} 330}
344EXPORT_SYMBOL_GPL(set_timer_slack); 331EXPORT_SYMBOL_GPL(set_timer_slack);
345 332
346
347static inline void set_running_timer(struct tvec_base *base,
348 struct timer_list *timer)
349{
350#ifdef CONFIG_SMP
351 base->running_timer = timer;
352#endif
353}
354
355static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
356{ 334{
357 unsigned long expires = timer->expires; 335 unsigned long expires = timer->expires;
@@ -426,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
426 404
427static struct debug_obj_descr timer_debug_descr; 405static struct debug_obj_descr timer_debug_descr;
428 406
407static void *timer_debug_hint(void *addr)
408{
409 return ((struct timer_list *) addr)->function;
410}
411
429/* 412/*
430 * fixup_init is called when: 413 * fixup_init is called when:
431 * - an active object is initialized 414 * - an active object is initialized
@@ -499,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
499 482
500static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
501 .name = "timer_list", 484 .name = "timer_list",
485 .debug_hint = timer_debug_hint,
502 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
503 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
504 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
@@ -765,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
765 unsigned long expires_limit, mask; 749 unsigned long expires_limit, mask;
766 int bit; 750 int bit;
767 751
768 expires_limit = expires;
769
770 if (timer->slack >= 0) { 752 if (timer->slack >= 0) {
771 expires_limit = expires + timer->slack; 753 expires_limit = expires + timer->slack;
772 } else { 754 } else {
773 unsigned long now = jiffies; 755 long delta = expires - jiffies;
756
757 if (delta < 256)
758 return expires;
774 759
775 /* No slack, if already expired else auto slack 0.4% */ 760 expires_limit = expires + delta / 256;
776 if (time_after(expires, now))
777 expires_limit = expires + (expires - now)/256;
778 } 761 }
779 mask = expires ^ expires_limit; 762 mask = expires ^ expires_limit;
780 if (mask == 0) 763 if (mask == 0)
@@ -811,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
811 */ 794 */
812int mod_timer(struct timer_list *timer, unsigned long expires) 795int mod_timer(struct timer_list *timer, unsigned long expires)
813{ 796{
797 expires = apply_slack(timer, expires);
798
814 /* 799 /*
815 * This is a common optimization triggered by the 800 * This is a common optimization triggered by the
816 * networking code - if the timer is re-modified 801 * networking code - if the timer is re-modified
@@ -819,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
819 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
820 return 1; 805 return 1;
821 806
822 expires = apply_slack(timer, expires);
823
824 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 807 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
825} 808}
826EXPORT_SYMBOL(mod_timer); 809EXPORT_SYMBOL(mod_timer);
@@ -936,15 +919,12 @@ int del_timer(struct timer_list *timer)
936} 919}
937EXPORT_SYMBOL(del_timer); 920EXPORT_SYMBOL(del_timer);
938 921
939#ifdef CONFIG_SMP
940/** 922/**
941 * try_to_del_timer_sync - Try to deactivate a timer 923 * try_to_del_timer_sync - Try to deactivate a timer
942 * @timer: timer do del 924 * @timer: timer do del
943 * 925 *
944 * This function tries to deactivate a timer. Upon successful (ret >= 0) 926 * This function tries to deactivate a timer. Upon successful (ret >= 0)
945 * exit the timer is not queued and the handler is not running on any CPU. 927 * exit the timer is not queued and the handler is not running on any CPU.
946 *
947 * It must not be called from interrupt contexts.
948 */ 928 */
949int try_to_del_timer_sync(struct timer_list *timer) 929int try_to_del_timer_sync(struct timer_list *timer)
950{ 930{
@@ -973,6 +953,7 @@ out:
973} 953}
974EXPORT_SYMBOL(try_to_del_timer_sync); 954EXPORT_SYMBOL(try_to_del_timer_sync);
975 955
956#ifdef CONFIG_SMP
976/** 957/**
977 * del_timer_sync - deactivate a timer and wait for the handler to finish. 958 * del_timer_sync - deactivate a timer and wait for the handler to finish.
978 * @timer: the timer to be deactivated 959 * @timer: the timer to be deactivated
@@ -988,6 +969,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
988 * add_timer_on(). Upon exit the timer is not queued and the handler is 969 * add_timer_on(). Upon exit the timer is not queued and the handler is
989 * not running on any CPU. 970 * not running on any CPU.
990 * 971 *
972 * Note: You must not hold locks that are held in interrupt context
973 * while calling this function. Even if the lock has nothing to do
974 * with the timer in question. Here's why:
975 *
976 * CPU0 CPU1
977 * ---- ----
978 * <SOFTIRQ>
979 * call_timer_fn();
980 * base->running_timer = mytimer;
981 * spin_lock_irq(somelock);
982 * <IRQ>
983 * spin_lock(somelock);
984 * del_timer_sync(mytimer);
985 * while (base->running_timer == mytimer);
986 *
987 * Now del_timer_sync() will never return and never release somelock.
988 * The interrupt on the other CPU is waiting to grab somelock but
989 * it has interrupted the softirq that CPU0 is waiting to finish.
990 *
991 * The function returns whether it has deactivated a pending timer or not. 991 * The function returns whether it has deactivated a pending timer or not.
992 */ 992 */
993int del_timer_sync(struct timer_list *timer) 993int del_timer_sync(struct timer_list *timer)
@@ -995,12 +995,20 @@ int del_timer_sync(struct timer_list *timer)
995#ifdef CONFIG_LOCKDEP 995#ifdef CONFIG_LOCKDEP
996 unsigned long flags; 996 unsigned long flags;
997 997
998 /*
999 * If lockdep gives a backtrace here, please reference
1000 * the synchronization rules above.
1001 */
998 local_irq_save(flags); 1002 local_irq_save(flags);
999 lock_map_acquire(&timer->lockdep_map); 1003 lock_map_acquire(&timer->lockdep_map);
1000 lock_map_release(&timer->lockdep_map); 1004 lock_map_release(&timer->lockdep_map);
1001 local_irq_restore(flags); 1005 local_irq_restore(flags);
1002#endif 1006#endif
1003 1007 /*
1008 * don't use it in hardirq context, because it
1009 * could lead to deadlock.
1010 */
1011 WARN_ON(in_irq());
1004 for (;;) { 1012 for (;;) {
1005 int ret = try_to_del_timer_sync(timer); 1013 int ret = try_to_del_timer_sync(timer);
1006 if (ret >= 0) 1014 if (ret >= 0)
@@ -1111,7 +1119,7 @@ static inline void __run_timers(struct tvec_base *base)
1111 1119
1112 timer_stats_account_timer(timer); 1120 timer_stats_account_timer(timer);
1113 1121
1114 set_running_timer(base, timer); 1122 base->running_timer = timer;
1115 detach_timer(timer, 1); 1123 detach_timer(timer, 1);
1116 1124
1117 spin_unlock_irq(&base->lock); 1125 spin_unlock_irq(&base->lock);
@@ -1119,7 +1127,7 @@ static inline void __run_timers(struct tvec_base *base)
1119 spin_lock_irq(&base->lock); 1127 spin_lock_irq(&base->lock);
1120 } 1128 }
1121 } 1129 }
1122 set_running_timer(base, NULL); 1130 base->running_timer = NULL;
1123 spin_unlock_irq(&base->lock); 1131 spin_unlock_irq(&base->lock);
1124} 1132}
1125 1133
@@ -1249,9 +1257,15 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1249 */ 1257 */
1250unsigned long get_next_timer_interrupt(unsigned long now) 1258unsigned long get_next_timer_interrupt(unsigned long now)
1251{ 1259{
1252 struct tvec_base *base = __get_cpu_var(tvec_bases); 1260 struct tvec_base *base = __this_cpu_read(tvec_bases);
1253 unsigned long expires; 1261 unsigned long expires;
1254 1262
1263 /*
1264 * Pretend that there is no timer pending if the cpu is offline.
1265 * Possible pending timers will be migrated later to an active cpu.
1266 */
1267 if (cpu_is_offline(smp_processor_id()))
1268 return now + NEXT_TIMER_MAX_DELTA;
1255 spin_lock(&base->lock); 1269 spin_lock(&base->lock);
1256 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1270 if (time_before_eq(base->next_timer, base->timer_jiffies))
1257 base->next_timer = __next_timer_interrupt(base); 1271 base->next_timer = __next_timer_interrupt(base);
@@ -1279,7 +1293,10 @@ void update_process_times(int user_tick)
1279 run_local_timers(); 1293 run_local_timers();
1280 rcu_check_callbacks(cpu, user_tick); 1294 rcu_check_callbacks(cpu, user_tick);
1281 printk_tick(); 1295 printk_tick();
1282 perf_event_do_pending(); 1296#ifdef CONFIG_IRQ_WORK
1297 if (in_irq())
1298 irq_work_run();
1299#endif
1283 scheduler_tick(); 1300 scheduler_tick();
1284 run_posix_cpu_timers(p); 1301 run_posix_cpu_timers(p);
1285} 1302}
@@ -1289,7 +1306,7 @@ void update_process_times(int user_tick)
1289 */ 1306 */
1290static void run_timer_softirq(struct softirq_action *h) 1307static void run_timer_softirq(struct softirq_action *h)
1291{ 1308{
1292 struct tvec_base *base = __get_cpu_var(tvec_bases); 1309 struct tvec_base *base = __this_cpu_read(tvec_bases);
1293 1310
1294 hrtimer_run_pending(); 1311 hrtimer_run_pending();
1295 1312
@@ -1306,19 +1323,6 @@ void run_local_timers(void)
1306 raise_softirq(TIMER_SOFTIRQ); 1323 raise_softirq(TIMER_SOFTIRQ);
1307} 1324}
1308 1325
1309/*
1310 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1311 * without sampling the sequence number in xtime_lock.
1312 * jiffies is defined in the linker script...
1313 */
1314
1315void do_timer(unsigned long ticks)
1316{
1317 jiffies_64 += ticks;
1318 update_wall_time();
1319 calc_global_load();
1320}
1321
1322#ifdef __ARCH_WANT_SYS_ALARM 1326#ifdef __ARCH_WANT_SYS_ALARM
1323 1327
1324/* 1328/*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 538501c6ea50..2ad39e556cb4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_C_RECORDMCOUNT
53 bool
54 help
55 C version of recordmcount available?
56
52config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
53 bool 58 bool
54 59
@@ -64,6 +69,21 @@ config EVENT_TRACING
64 select CONTEXT_SWITCH_TRACER 69 select CONTEXT_SWITCH_TRACER
65 bool 70 bool
66 71
72config EVENT_POWER_TRACING_DEPRECATED
73 depends on EVENT_TRACING
74 bool "Deprecated power event trace API, to be removed"
75 default y
76 help
77 Provides old power event types:
78 C-state/idle accounting events:
79 power:power_start
80 power:power_end
81 and old cpufreq accounting event:
82 power:power_frequency
83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations,
85 namely 2.6.41.
86
67config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
68 bool 88 bool
69 89
@@ -121,7 +141,7 @@ if FTRACE
121config FUNCTION_TRACER 141config FUNCTION_TRACER
122 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
123 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
124 select FRAME_POINTER 144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
125 select KALLSYMS 145 select KALLSYMS
126 select GENERIC_TRACER 146 select GENERIC_TRACER
127 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
@@ -255,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES
255 This tracer profiles all the likely and unlikely macros 275
256 in the kernel. It will display the results in: 276 in the kernel. It will display the results in:
257 277
258 /sys/kernel/debug/tracing/profile_annotated_branch 278 /sys/kernel/debug/tracing/trace_stat/branch_annotated
259 279
260 Note: this will add a significant overhead; only turn this 280 Note: this will add a significant overhead; only turn this
261 on if you need to profile the system's use of these macros. 281 on if you need to profile the system's use of these macros.
@@ -268,7 +288,7 @@ config PROFILE_ALL_BRANCHES
268 taken in the kernel is recorded whether it hit or miss. 288 taken in the kernel is recorded whether it hit or miss.
269 The results will be displayed in: 289 The results will be displayed in:
270 290
271 /sys/kernel/debug/tracing/profile_branch 291 /sys/kernel/debug/tracing/trace_stat/branch_all
272 292
273 This option also enables the likely/unlikely profiler. 293 This option also enables the likely/unlikely profiler.
274 294
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
52endif 52endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
56ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 58endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 959f8d6c8cc1..6957aa298dfa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/smp_lock.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/uaccess.h> 27#include <linux/uaccess.h>
29 28
@@ -139,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
139 !blk_tracer_enabled)) 138 !blk_tracer_enabled))
140 return; 139 return;
141 140
141 /*
142 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
143 * message to the trace.
144 */
145 if (!(bt->act_mask & BLK_TC_NOTIFY))
146 return;
147
142 local_irq_save(flags); 148 local_irq_save(flags);
143 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 149 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
144 va_start(args, fmt); 150 va_start(args, fmt);
@@ -169,7 +175,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 175static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
170 BLK_TC_ACT(BLK_TC_WRITE) }; 176 BLK_TC_ACT(BLK_TC_WRITE) };
171 177
172#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
173#define BLK_TC_RAHEAD BLK_TC_AHEAD 178#define BLK_TC_RAHEAD BLK_TC_AHEAD
174 179
175/* The ilog2() calls fall out because they're constant */ 180/* The ilog2() calls fall out because they're constant */
@@ -197,7 +202,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
197 return; 202 return;
198 203
199 what |= ddir_act[rw & WRITE]; 204 what |= ddir_act[rw & WRITE];
200 what |= MASK_TC_BIT(rw, HARDBARRIER);
201 what |= MASK_TC_BIT(rw, SYNC); 205 what |= MASK_TC_BIT(rw, SYNC);
202 what |= MASK_TC_BIT(rw, RAHEAD); 206 what |= MASK_TC_BIT(rw, RAHEAD);
203 what |= MASK_TC_BIT(rw, META); 207 what |= MASK_TC_BIT(rw, META);
@@ -326,6 +330,7 @@ static const struct file_operations blk_dropped_fops = {
326 .owner = THIS_MODULE, 330 .owner = THIS_MODULE,
327 .open = blk_dropped_open, 331 .open = blk_dropped_open,
328 .read = blk_dropped_read, 332 .read = blk_dropped_read,
333 .llseek = default_llseek,
329}; 334};
330 335
331static int blk_msg_open(struct inode *inode, struct file *filp) 336static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -365,6 +370,7 @@ static const struct file_operations blk_msg_fops = {
365 .owner = THIS_MODULE, 370 .owner = THIS_MODULE,
366 .open = blk_msg_open, 371 .open = blk_msg_open,
367 .write = blk_msg_write, 372 .write = blk_msg_write,
373 .llseek = noop_llseek,
368}; 374};
369 375
370/* 376/*
@@ -639,7 +645,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
639 if (!q) 645 if (!q)
640 return -ENXIO; 646 return -ENXIO;
641 647
642 lock_kernel();
643 mutex_lock(&bdev->bd_mutex); 648 mutex_lock(&bdev->bd_mutex);
644 649
645 switch (cmd) { 650 switch (cmd) {
@@ -667,7 +672,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
667 } 672 }
668 673
669 mutex_unlock(&bdev->bd_mutex); 674 mutex_unlock(&bdev->bd_mutex);
670 unlock_kernel();
671 return ret; 675 return ret;
672} 676}
673 677
@@ -699,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
699 * 703 *
700 **/ 704 **/
701static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 705static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
702 u32 what) 706 u32 what)
703{ 707{
704 struct blk_trace *bt = q->blk_trace; 708 struct blk_trace *bt = q->blk_trace;
705 int rw = rq->cmd_flags & 0x03;
706 709
707 if (likely(!bt)) 710 if (likely(!bt))
708 return; 711 return;
709 712
710 if (rq->cmd_flags & REQ_DISCARD)
711 rw |= REQ_DISCARD;
712
713 if (rq->cmd_flags & REQ_SECURE)
714 rw |= REQ_SECURE;
715
716 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
717 what |= BLK_TC_ACT(BLK_TC_PC); 714 what |= BLK_TC_ACT(BLK_TC_PC);
718 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
719 what, rq->errors, rq->cmd_len, rq->cmd); 716 what, rq->errors, rq->cmd_len, rq->cmd);
720 } else { 717 } else {
721 what |= BLK_TC_ACT(BLK_TC_FS); 718 what |= BLK_TC_ACT(BLK_TC_FS);
722 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, 719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
723 what, rq->errors, 0, NULL); 720 rq->cmd_flags, what, rq->errors, 0, NULL);
724 } 721 }
725} 722}
726 723
@@ -761,53 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore,
761 * @q: queue the io is for 758 * @q: queue the io is for
762 * @bio: the source bio 759 * @bio: the source bio
763 * @what: the action 760 * @what: the action
761 * @error: error, if any
764 * 762 *
765 * Description: 763 * Description:
766 * Records an action against a bio. Will log the bio offset + size. 764 * Records an action against a bio. Will log the bio offset + size.
767 * 765 *
768 **/ 766 **/
769static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, 767static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
770 u32 what) 768 u32 what, int error)
771{ 769{
772 struct blk_trace *bt = q->blk_trace; 770 struct blk_trace *bt = q->blk_trace;
773 771
774 if (likely(!bt)) 772 if (likely(!bt))
775 return; 773 return;
776 774
775 if (!error && !bio_flagged(bio, BIO_UPTODATE))
776 error = EIO;
777
777 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, 778 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
778 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 779 error, 0, NULL);
779} 780}
780 781
781static void blk_add_trace_bio_bounce(void *ignore, 782static void blk_add_trace_bio_bounce(void *ignore,
782 struct request_queue *q, struct bio *bio) 783 struct request_queue *q, struct bio *bio)
783{ 784{
784 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 785 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
785} 786}
786 787
787static void blk_add_trace_bio_complete(void *ignore, 788static void blk_add_trace_bio_complete(void *ignore,
788 struct request_queue *q, struct bio *bio) 789 struct request_queue *q, struct bio *bio,
790 int error)
789{ 791{
790 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 792 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
791} 793}
792 794
793static void blk_add_trace_bio_backmerge(void *ignore, 795static void blk_add_trace_bio_backmerge(void *ignore,
794 struct request_queue *q, 796 struct request_queue *q,
795 struct bio *bio) 797 struct bio *bio)
796{ 798{
797 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 799 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
798} 800}
799 801
800static void blk_add_trace_bio_frontmerge(void *ignore, 802static void blk_add_trace_bio_frontmerge(void *ignore,
801 struct request_queue *q, 803 struct request_queue *q,
802 struct bio *bio) 804 struct bio *bio)
803{ 805{
804 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 806 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
805} 807}
806 808
807static void blk_add_trace_bio_queue(void *ignore, 809static void blk_add_trace_bio_queue(void *ignore,
808 struct request_queue *q, struct bio *bio) 810 struct request_queue *q, struct bio *bio)
809{ 811{
810 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 812 blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
811} 813}
812 814
813static void blk_add_trace_getrq(void *ignore, 815static void blk_add_trace_getrq(void *ignore,
@@ -815,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore,
815 struct bio *bio, int rw) 817 struct bio *bio, int rw)
816{ 818{
817 if (bio) 819 if (bio)
818 blk_add_trace_bio(q, bio, BLK_TA_GETRQ); 820 blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
819 else { 821 else {
820 struct blk_trace *bt = q->blk_trace; 822 struct blk_trace *bt = q->blk_trace;
821 823
@@ -830,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore,
830 struct bio *bio, int rw) 832 struct bio *bio, int rw)
831{ 833{
832 if (bio) 834 if (bio)
833 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); 835 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
834 else { 836 else {
835 struct blk_trace *bt = q->blk_trace; 837 struct blk_trace *bt = q->blk_trace;
836 838
@@ -848,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
848 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
849} 851}
850 852
851static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) 853static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
854 unsigned int depth, bool explicit)
852{ 855{
853 struct blk_trace *bt = q->blk_trace; 856 struct blk_trace *bt = q->blk_trace;
854 857
855 if (bt) { 858 if (bt) {
856 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; 859 __be64 rpdu = cpu_to_be64(depth);
857 __be64 rpdu = cpu_to_be64(pdu); 860 u32 what;
858 861
859 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, 862 if (explicit)
860 sizeof(rpdu), &rpdu); 863 what = BLK_TA_UNPLUG_IO;
861 } 864 else
862} 865 what = BLK_TA_UNPLUG_TIMER;
863
864static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
865{
866 struct blk_trace *bt = q->blk_trace;
867
868 if (bt) {
869 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
870 __be64 rpdu = cpu_to_be64(pdu);
871 866
872 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, 867 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
873 sizeof(rpdu), &rpdu);
874 } 868 }
875} 869}
876 870
@@ -890,7 +884,7 @@ static void blk_add_trace_split(void *ignore,
890} 884}
891 885
892/** 886/**
893 * blk_add_trace_remap - Add a trace for a remap operation 887 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
894 * @ignore: trace callback data parameter (not used) 888 * @ignore: trace callback data parameter (not used)
895 * @q: queue the io is for 889 * @q: queue the io is for
896 * @bio: the source bio 890 * @bio: the source bio
@@ -902,9 +896,9 @@ static void blk_add_trace_split(void *ignore,
902 * it spans a stripe (or similar). Add a trace for that action. 896 * it spans a stripe (or similar). Add a trace for that action.
903 * 897 *
904 **/ 898 **/
905static void blk_add_trace_remap(void *ignore, 899static void blk_add_trace_bio_remap(void *ignore,
906 struct request_queue *q, struct bio *bio, 900 struct request_queue *q, struct bio *bio,
907 dev_t dev, sector_t from) 901 dev_t dev, sector_t from)
908{ 902{
909 struct blk_trace *bt = q->blk_trace; 903 struct blk_trace *bt = q->blk_trace;
910 struct blk_io_trace_remap r; 904 struct blk_io_trace_remap r;
@@ -1013,13 +1007,11 @@ static void blk_register_tracepoints(void)
1013 WARN_ON(ret); 1007 WARN_ON(ret);
1014 ret = register_trace_block_plug(blk_add_trace_plug, NULL); 1008 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
1015 WARN_ON(ret); 1009 WARN_ON(ret);
1016 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1010 ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
1017 WARN_ON(ret);
1018 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1019 WARN_ON(ret); 1011 WARN_ON(ret);
1020 ret = register_trace_block_split(blk_add_trace_split, NULL); 1012 ret = register_trace_block_split(blk_add_trace_split, NULL);
1021 WARN_ON(ret); 1013 WARN_ON(ret);
1022 ret = register_trace_block_remap(blk_add_trace_remap, NULL); 1014 ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1023 WARN_ON(ret); 1015 WARN_ON(ret);
1024 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1016 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1025 WARN_ON(ret); 1017 WARN_ON(ret);
@@ -1028,10 +1020,9 @@ static void blk_register_tracepoints(void)
1028static void blk_unregister_tracepoints(void) 1020static void blk_unregister_tracepoints(void)
1029{ 1021{
1030 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1022 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1031 unregister_trace_block_remap(blk_add_trace_remap, NULL); 1023 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1032 unregister_trace_block_split(blk_add_trace_split, NULL); 1024 unregister_trace_block_split(blk_add_trace_split, NULL);
1033 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1025 unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
1034 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
1035 unregister_trace_block_plug(blk_add_trace_plug, NULL); 1026 unregister_trace_block_plug(blk_add_trace_plug, NULL);
1036 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); 1027 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
1037 unregister_trace_block_getrq(blk_add_trace_getrq, NULL); 1028 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
@@ -1652,10 +1643,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1652 struct block_device *bdev; 1643 struct block_device *bdev;
1653 ssize_t ret = -ENXIO; 1644 ssize_t ret = -ENXIO;
1654 1645
1655 lock_kernel();
1656 bdev = bdget(part_devt(p)); 1646 bdev = bdget(part_devt(p));
1657 if (bdev == NULL) 1647 if (bdev == NULL)
1658 goto out_unlock_kernel; 1648 goto out;
1659 1649
1660 q = blk_trace_get_queue(bdev); 1650 q = blk_trace_get_queue(bdev);
1661 if (q == NULL) 1651 if (q == NULL)
@@ -1683,8 +1673,7 @@ out_unlock_bdev:
1683 mutex_unlock(&bdev->bd_mutex); 1673 mutex_unlock(&bdev->bd_mutex);
1684out_bdput: 1674out_bdput:
1685 bdput(bdev); 1675 bdput(bdev);
1686out_unlock_kernel: 1676out:
1687 unlock_kernel();
1688 return ret; 1677 return ret;
1689} 1678}
1690 1679
@@ -1714,11 +1703,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1714 1703
1715 ret = -ENXIO; 1704 ret = -ENXIO;
1716 1705
1717 lock_kernel();
1718 p = dev_to_part(dev); 1706 p = dev_to_part(dev);
1719 bdev = bdget(part_devt(p)); 1707 bdev = bdget(part_devt(p));
1720 if (bdev == NULL) 1708 if (bdev == NULL)
1721 goto out_unlock_kernel; 1709 goto out;
1722 1710
1723 q = blk_trace_get_queue(bdev); 1711 q = blk_trace_get_queue(bdev);
1724 if (q == NULL) 1712 if (q == NULL)
@@ -1753,8 +1741,6 @@ out_unlock_bdev:
1753 mutex_unlock(&bdev->bd_mutex); 1741 mutex_unlock(&bdev->bd_mutex);
1754out_bdput: 1742out_bdput:
1755 bdput(bdev); 1743 bdput(bdev);
1756out_unlock_kernel:
1757 unlock_kernel();
1758out: 1744out:
1759 return ret ? ret : count; 1745 return ret ? ret : count;
1760} 1746}
@@ -1813,8 +1799,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1813 1799
1814 if (rw & REQ_RAHEAD) 1800 if (rw & REQ_RAHEAD)
1815 rwbs[i++] = 'A'; 1801 rwbs[i++] = 'A';
1816 if (rw & REQ_HARDBARRIER)
1817 rwbs[i++] = 'B';
1818 if (rw & REQ_SYNC) 1802 if (rw & REQ_SYNC)
1819 rwbs[i++] = 'S'; 1803 rwbs[i++] = 'S';
1820 if (rw & REQ_META) 1804 if (rw & REQ_META)
@@ -1825,21 +1809,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1825 rwbs[i] = '\0'; 1809 rwbs[i] = '\0';
1826} 1810}
1827 1811
1828void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1829{
1830 int rw = rq->cmd_flags & 0x03;
1831 int bytes;
1832
1833 if (rq->cmd_flags & REQ_DISCARD)
1834 rw |= REQ_DISCARD;
1835
1836 if (rq->cmd_flags & REQ_SECURE)
1837 rw |= REQ_SECURE;
1838
1839 bytes = blk_rq_bytes(rq);
1840
1841 blk_fill_rwbs(rwbs, rw, bytes);
1842}
1843
1844#endif /* CONFIG_EVENT_TRACING */ 1812#endif /* CONFIG_EVENT_TRACING */
1845 1813
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa7ece649fe1..908038f57440 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -39,20 +39,26 @@
39#include "trace_stat.h" 39#include "trace_stat.h"
40 40
41#define FTRACE_WARN_ON(cond) \ 41#define FTRACE_WARN_ON(cond) \
42 do { \ 42 ({ \
43 if (WARN_ON(cond)) \ 43 int ___r = cond; \
44 if (WARN_ON(___r)) \
44 ftrace_kill(); \ 45 ftrace_kill(); \
45 } while (0) 46 ___r; \
47 })
46 48
47#define FTRACE_WARN_ON_ONCE(cond) \ 49#define FTRACE_WARN_ON_ONCE(cond) \
48 do { \ 50 ({ \
49 if (WARN_ON_ONCE(cond)) \ 51 int ___r = cond; \
52 if (WARN_ON_ONCE(___r)) \
50 ftrace_kill(); \ 53 ftrace_kill(); \
51 } while (0) 54 ___r; \
55 })
52 56
53/* hash bits for specific function selection */ 57/* hash bits for specific function selection */
54#define FTRACE_HASH_BITS 7 58#define FTRACE_HASH_BITS 7
55#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) 59#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
60#define FTRACE_HASH_DEFAULT_BITS 10
61#define FTRACE_HASH_MAX_BITS 12
56 62
57/* ftrace_enabled is a method to turn ftrace on or off */ 63/* ftrace_enabled is a method to turn ftrace on or off */
58int ftrace_enabled __read_mostly; 64int ftrace_enabled __read_mostly;
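The FTRACE_WARN_ON()/FTRACE_WARN_ON_ONCE() change above converts the macros from do/while statements into GCC statement expressions, so they evaluate to the tested condition and can be used directly inside an if (). A stand-alone sketch of the same pattern in plain C (WARN_AND_FLAG and its caller are illustrative names only):

#include <stdio.h>

/* Statement expression (a GCC extension): the whole ({ ... }) evaluates
 * to ___r, so the macro works as a condition as well as a statement. */
#define WARN_AND_FLAG(cond)                                     \
        ({                                                      \
                int ___r = !!(cond);                            \
                if (___r)                                       \
                        fprintf(stderr, "warning: %s\n", #cond);\
                ___r;                                           \
        })

int main(void)
{
        int broken = 1;

        if (WARN_AND_FLAG(broken))      /* warns and takes the error path */
                return 1;
        return 0;
}
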
@@ -81,28 +87,40 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
81 .func = ftrace_stub, 87 .func = ftrace_stub,
82}; 88};
83 89
84static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
85ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops;
96
97static void
98ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
88 99
89/* 100/*
90 * Traverse the ftrace_list, invoking all entries. The reason that we 101 * Traverse the ftrace_global_list, invoking all entries. The reason that we
91 * can use rcu_dereference_raw() is that elements removed from this list 102 * can use rcu_dereference_raw() is that elements removed from this list
92 * are simply leaked, so there is no need to interact with a grace-period 103 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle 104 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list. 105 * concurrent insertions into the ftrace_global_list.
95 * 106 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations! 107 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */ 108 */
98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 109static void ftrace_global_list_func(unsigned long ip,
110 unsigned long parent_ip)
99{ 111{
100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ 112 struct ftrace_ops *op;
101 113
114 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
115 return;
116
117 trace_recursion_set(TRACE_GLOBAL_BIT);
118 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
102 while (op != &ftrace_list_end) { 119 while (op != &ftrace_list_end) {
103 op->func(ip, parent_ip); 120 op->func(ip, parent_ip);
104 op = rcu_dereference_raw(op->next); /*see above*/ 121 op = rcu_dereference_raw(op->next); /*see above*/
105 }; 122 };
123 trace_recursion_clear(TRACE_GLOBAL_BIT);
106} 124}
107 125
108static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) 126static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
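The trace_recursion_set()/trace_recursion_clear() pair added to ftrace_global_list_func() above is a re-entrancy guard: if one of the callbacks ends up triggering the list walker again, the nested call bails out instead of recursing. A stand-alone sketch of the idea, assuming a simple per-thread flag (the kernel itself uses per-task recursion bits, not this variable):

static __thread int in_tracer;          /* illustrative guard, one per thread */

static void do_tracing_work(unsigned long ip, unsigned long parent_ip)
{
        /* ... work that might itself hit a traced function ... */
        (void)ip;
        (void)parent_ip;
}

static void guarded_list_func(unsigned long ip, unsigned long parent_ip)
{
        if (in_tracer)                  /* nested invocation: give up early */
                return;

        in_tracer = 1;
        do_tracing_work(ip, parent_ip);
        in_tracer = 0;
}

int main(void)
{
        guarded_list_func(0x1000, 0x2000);
        return 0;
}
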
@@ -147,46 +165,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
147} 165}
148#endif 166#endif
149 167
150static int __register_ftrace_function(struct ftrace_ops *ops) 168static void update_global_ops(void)
151{ 169{
152 ops->next = ftrace_list; 170 ftrace_func_t func;
171
153 /* 172 /*
154 * We are entering ops into the ftrace_list but another 173 * If there's only one function registered, then call that
155 * CPU might be walking that list. We need to make sure 174 * function directly. Otherwise, we need to iterate over the
156 * the ops->next pointer is valid before another CPU sees 175 * registered callers.
157 * the ops pointer included into the ftrace_list.
158 */ 176 */
159 rcu_assign_pointer(ftrace_list, ops); 177 if (ftrace_global_list == &ftrace_list_end ||
178 ftrace_global_list->next == &ftrace_list_end)
179 func = ftrace_global_list->func;
180 else
181 func = ftrace_global_list_func;
160 182
161 if (ftrace_enabled) { 183 /* If we filter on pids, update to use the pid function */
162 ftrace_func_t func; 184 if (!list_empty(&ftrace_pids)) {
185 set_ftrace_pid_function(func);
186 func = ftrace_pid_func;
187 }
163 188
164 if (ops->next == &ftrace_list_end) 189 global_ops.func = func;
165 func = ops->func; 190}
166 else
167 func = ftrace_list_func;
168 191
169 if (!list_empty(&ftrace_pids)) { 192static void update_ftrace_function(void)
170 set_ftrace_pid_function(func); 193{
171 func = ftrace_pid_func; 194 ftrace_func_t func;
172 } 195
196 update_global_ops();
197
198 /*
199 * If we are at the end of the list and this ops is
200 * not dynamic, then have the mcount trampoline call
201 * the function directly
202 */
203 if (ftrace_ops_list == &ftrace_list_end ||
204 (ftrace_ops_list->next == &ftrace_list_end &&
205 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
206 func = ftrace_ops_list->func;
207 else
208 func = ftrace_ops_list_func;
173 209
174 /*
175 * For one func, simply call it directly.
176 * For more than one func, call the chain.
177 */
178#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
179 ftrace_trace_function = func; 211 ftrace_trace_function = func;
180#else 212#else
181 __ftrace_trace_function = func; 213 __ftrace_trace_function = func;
182 ftrace_trace_function = ftrace_test_stop_func; 214 ftrace_trace_function = ftrace_test_stop_func;
183#endif 215#endif
184 } 216}
185 217
186 return 0; 218static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
219{
220 ops->next = *list;
221 /*
222 * We are entering ops into the list but another
223 * CPU might be walking that list. We need to make sure
224 * the ops->next pointer is valid before another CPU sees
225 * the ops pointer included into the list.
226 */
227 rcu_assign_pointer(*list, ops);
187} 228}
188 229
189static int __unregister_ftrace_function(struct ftrace_ops *ops) 230static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
190{ 231{
191 struct ftrace_ops **p; 232 struct ftrace_ops **p;
192 233
@@ -194,13 +235,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
194 * If we are removing the last function, then simply point 235 * If we are removing the last function, then simply point
195 * to the ftrace_stub. 236 * to the ftrace_stub.
196 */ 237 */
197 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 238 if (*list == ops && ops->next == &ftrace_list_end) {
198 ftrace_trace_function = ftrace_stub; 239 *list = &ftrace_list_end;
199 ftrace_list = &ftrace_list_end;
200 return 0; 240 return 0;
201 } 241 }
202 242
203 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 243 for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
204 if (*p == ops) 244 if (*p == ops)
205 break; 245 break;
206 246
@@ -208,53 +248,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
208 return -1; 248 return -1;
209 249
210 *p = (*p)->next; 250 *p = (*p)->next;
251 return 0;
252}
211 253
212 if (ftrace_enabled) { 254static int __register_ftrace_function(struct ftrace_ops *ops)
213 /* If we only have one func left, then call that directly */ 255{
214 if (ftrace_list->next == &ftrace_list_end) { 256 if (ftrace_disabled)
215 ftrace_func_t func = ftrace_list->func; 257 return -ENODEV;
216 258
217 if (!list_empty(&ftrace_pids)) { 259 if (FTRACE_WARN_ON(ops == &global_ops))
218 set_ftrace_pid_function(func); 260 return -EINVAL;
219 func = ftrace_pid_func; 261
220 } 262 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
221#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 263 return -EBUSY;
222 ftrace_trace_function = func; 264
223#else 265 if (!core_kernel_data((unsigned long)ops))
224 __ftrace_trace_function = func; 266 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
225#endif 267
226 } 268 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
227 } 269 int first = ftrace_global_list == &ftrace_list_end;
270 add_ftrace_ops(&ftrace_global_list, ops);
271 ops->flags |= FTRACE_OPS_FL_ENABLED;
272 if (first)
273 add_ftrace_ops(&ftrace_ops_list, &global_ops);
274 } else
275 add_ftrace_ops(&ftrace_ops_list, ops);
276
277 if (ftrace_enabled)
278 update_ftrace_function();
228 279
229 return 0; 280 return 0;
230} 281}
231 282
232static void ftrace_update_pid_func(void) 283static int __unregister_ftrace_function(struct ftrace_ops *ops)
233{ 284{
234 ftrace_func_t func; 285 int ret;
235 286
236 if (ftrace_trace_function == ftrace_stub) 287 if (ftrace_disabled)
237 return; 288 return -ENODEV;
238 289
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 290 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
240 func = ftrace_trace_function; 291 return -EBUSY;
241#else
242 func = __ftrace_trace_function;
243#endif
244 292
245 if (!list_empty(&ftrace_pids)) { 293 if (FTRACE_WARN_ON(ops == &global_ops))
246 set_ftrace_pid_function(func); 294 return -EINVAL;
247 func = ftrace_pid_func;
248 } else {
249 if (func == ftrace_pid_func)
250 func = ftrace_pid_function;
251 }
252 295
253#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 296 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
254 ftrace_trace_function = func; 297 ret = remove_ftrace_ops(&ftrace_global_list, ops);
255#else 298 if (!ret && ftrace_global_list == &ftrace_list_end)
256 __ftrace_trace_function = func; 299 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
257#endif 300 if (!ret)
301 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
302 } else
303 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
304
305 if (ret < 0)
306 return ret;
307
308 if (ftrace_enabled)
309 update_ftrace_function();
310
311 /*
312 * Dynamic ops may be freed, we must make sure that all
313 * callers are done before leaving this function.
314 */
315 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
316 synchronize_sched();
317
318 return 0;
319}
320
321static void ftrace_update_pid_func(void)
322{
323 /* Only do something if we are tracing something */
324 if (ftrace_trace_function == ftrace_stub)
325 return;
326
327 update_ftrace_function();
258} 328}
259 329
260#ifdef CONFIG_FUNCTION_PROFILER 330#ifdef CONFIG_FUNCTION_PROFILER
@@ -800,6 +870,7 @@ static const struct file_operations ftrace_profile_fops = {
800 .open = tracing_open_generic, 870 .open = tracing_open_generic,
801 .read = ftrace_profile_read, 871 .read = ftrace_profile_read,
802 .write = ftrace_profile_write, 872 .write = ftrace_profile_write,
873 .llseek = default_llseek,
803}; 874};
804 875
805/* used to initialize the real stat files */ 876/* used to initialize the real stat files */
@@ -884,13 +955,38 @@ enum {
884 FTRACE_ENABLE_CALLS = (1 << 0), 955 FTRACE_ENABLE_CALLS = (1 << 0),
885 FTRACE_DISABLE_CALLS = (1 << 1), 956 FTRACE_DISABLE_CALLS = (1 << 1),
886 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 957 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
887 FTRACE_ENABLE_MCOUNT = (1 << 3), 958 FTRACE_START_FUNC_RET = (1 << 3),
888 FTRACE_DISABLE_MCOUNT = (1 << 4), 959 FTRACE_STOP_FUNC_RET = (1 << 4),
889 FTRACE_START_FUNC_RET = (1 << 5), 960};
890 FTRACE_STOP_FUNC_RET = (1 << 6), 961struct ftrace_func_entry {
962 struct hlist_node hlist;
963 unsigned long ip;
964};
965
966struct ftrace_hash {
967 unsigned long size_bits;
968 struct hlist_head *buckets;
969 unsigned long count;
970 struct rcu_head rcu;
971};
972
973/*
974 * We make these constant because no one should touch them,
975 * but they are used as the default "empty hash", to avoid allocating
976 * it all the time. These are in a read only section such that if
977 * anyone does try to modify it, it will cause an exception.
978 */
979static const struct hlist_head empty_buckets[1];
980static const struct ftrace_hash empty_hash = {
981 .buckets = (struct hlist_head *)empty_buckets,
891}; 982};
983#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
892 984
893static int ftrace_filtered; 985static struct ftrace_ops global_ops = {
986 .func = ftrace_stub,
987 .notrace_hash = EMPTY_HASH,
988 .filter_hash = EMPTY_HASH,
989};
894 990
895static struct dyn_ftrace *ftrace_new_addrs; 991static struct dyn_ftrace *ftrace_new_addrs;
896 992
@@ -913,6 +1009,269 @@ static struct ftrace_page *ftrace_pages;
913 1009
914static struct dyn_ftrace *ftrace_free_records; 1010static struct dyn_ftrace *ftrace_free_records;
915 1011
1012static struct ftrace_func_entry *
1013ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1014{
1015 unsigned long key;
1016 struct ftrace_func_entry *entry;
1017 struct hlist_head *hhd;
1018 struct hlist_node *n;
1019
1020 if (!hash->count)
1021 return NULL;
1022
1023 if (hash->size_bits > 0)
1024 key = hash_long(ip, hash->size_bits);
1025 else
1026 key = 0;
1027
1028 hhd = &hash->buckets[key];
1029
1030 hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
1031 if (entry->ip == ip)
1032 return entry;
1033 }
1034 return NULL;
1035}
1036
1037static void __add_hash_entry(struct ftrace_hash *hash,
1038 struct ftrace_func_entry *entry)
1039{
1040 struct hlist_head *hhd;
1041 unsigned long key;
1042
1043 if (hash->size_bits)
1044 key = hash_long(entry->ip, hash->size_bits);
1045 else
1046 key = 0;
1047
1048 hhd = &hash->buckets[key];
1049 hlist_add_head(&entry->hlist, hhd);
1050 hash->count++;
1051}
1052
1053static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
1054{
1055 struct ftrace_func_entry *entry;
1056
1057 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1058 if (!entry)
1059 return -ENOMEM;
1060
1061 entry->ip = ip;
1062 __add_hash_entry(hash, entry);
1063
1064 return 0;
1065}
1066
1067static void
1068free_hash_entry(struct ftrace_hash *hash,
1069 struct ftrace_func_entry *entry)
1070{
1071 hlist_del(&entry->hlist);
1072 kfree(entry);
1073 hash->count--;
1074}
1075
1076static void
1077remove_hash_entry(struct ftrace_hash *hash,
1078 struct ftrace_func_entry *entry)
1079{
1080 hlist_del(&entry->hlist);
1081 hash->count--;
1082}
1083
1084static void ftrace_hash_clear(struct ftrace_hash *hash)
1085{
1086 struct hlist_head *hhd;
1087 struct hlist_node *tp, *tn;
1088 struct ftrace_func_entry *entry;
1089 int size = 1 << hash->size_bits;
1090 int i;
1091
1092 if (!hash->count)
1093 return;
1094
1095 for (i = 0; i < size; i++) {
1096 hhd = &hash->buckets[i];
1097 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
1098 free_hash_entry(hash, entry);
1099 }
1100 FTRACE_WARN_ON(hash->count);
1101}
1102
1103static void free_ftrace_hash(struct ftrace_hash *hash)
1104{
1105 if (!hash || hash == EMPTY_HASH)
1106 return;
1107 ftrace_hash_clear(hash);
1108 kfree(hash->buckets);
1109 kfree(hash);
1110}
1111
1112static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
1113{
1114 struct ftrace_hash *hash;
1115
1116 hash = container_of(rcu, struct ftrace_hash, rcu);
1117 free_ftrace_hash(hash);
1118}
1119
1120static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1121{
1122 if (!hash || hash == EMPTY_HASH)
1123 return;
1124 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1125}
1126
1127static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1128{
1129 struct ftrace_hash *hash;
1130 int size;
1131
1132 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
1133 if (!hash)
1134 return NULL;
1135
1136 size = 1 << size_bits;
1137 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
1138
1139 if (!hash->buckets) {
1140 kfree(hash);
1141 return NULL;
1142 }
1143
1144 hash->size_bits = size_bits;
1145
1146 return hash;
1147}
1148
1149static struct ftrace_hash *
1150alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1151{
1152 struct ftrace_func_entry *entry;
1153 struct ftrace_hash *new_hash;
1154 struct hlist_node *tp;
1155 int size;
1156 int ret;
1157 int i;
1158
1159 new_hash = alloc_ftrace_hash(size_bits);
1160 if (!new_hash)
1161 return NULL;
1162
1163 /* Empty hash? */
1164 if (!hash || !hash->count)
1165 return new_hash;
1166
1167 size = 1 << hash->size_bits;
1168 for (i = 0; i < size; i++) {
1169 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
1170 ret = add_hash_entry(new_hash, entry->ip);
1171 if (ret < 0)
1172 goto free_hash;
1173 }
1174 }
1175
1176 FTRACE_WARN_ON(new_hash->count != hash->count);
1177
1178 return new_hash;
1179
1180 free_hash:
1181 free_ftrace_hash(new_hash);
1182 return NULL;
1183}
1184
1185static int
1186ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1187{
1188 struct ftrace_func_entry *entry;
1189 struct hlist_node *tp, *tn;
1190 struct hlist_head *hhd;
1191 struct ftrace_hash *old_hash;
1192 struct ftrace_hash *new_hash;
1193 unsigned long key;
1194 int size = src->count;
1195 int bits = 0;
1196 int i;
1197
1198 /*
1199 * If the new source is empty, just free dst and assign it
1200 * the empty_hash.
1201 */
1202 if (!src->count) {
1203 free_ftrace_hash_rcu(*dst);
1204 rcu_assign_pointer(*dst, EMPTY_HASH);
1205 return 0;
1206 }
1207
1208 /*
1209 * Make the hash size about 1/2 the # found
1210 */
1211 for (size /= 2; size; size >>= 1)
1212 bits++;
1213
1214 /* Don't allocate too much */
1215 if (bits > FTRACE_HASH_MAX_BITS)
1216 bits = FTRACE_HASH_MAX_BITS;
1217
1218 new_hash = alloc_ftrace_hash(bits);
1219 if (!new_hash)
1220 return -ENOMEM;
1221
1222 size = 1 << src->size_bits;
1223 for (i = 0; i < size; i++) {
1224 hhd = &src->buckets[i];
1225 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
1226 if (bits > 0)
1227 key = hash_long(entry->ip, bits);
1228 else
1229 key = 0;
1230 remove_hash_entry(src, entry);
1231 __add_hash_entry(new_hash, entry);
1232 }
1233 }
1234
1235 old_hash = *dst;
1236 rcu_assign_pointer(*dst, new_hash);
1237 free_ftrace_hash_rcu(old_hash);
1238
1239 return 0;
1240}
1241
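The sizing loop in ftrace_hash_move() above aims for roughly half as many buckets as entries, capped at FTRACE_HASH_MAX_BITS. A small stand-alone check of that arithmetic, with an arbitrary example count of 100:

#include <stdio.h>

int main(void)
{
        int count = 100;                /* entries being moved into the new hash */
        int size = count;
        int bits = 0;

        for (size /= 2; size; size >>= 1)
                bits++;                 /* 50, 25, 12, 6, 3, 1  ->  bits = 6 */

        if (bits > 12)                  /* same cap as FTRACE_HASH_MAX_BITS */
                bits = 12;

        printf("%d entries -> %d bits -> %d buckets\n", count, bits, 1 << bits);
        return 0;                       /* prints: 100 entries -> 6 bits -> 64 buckets */
}
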
1242/*
1243 * Test the hashes for this ops to see if we want to call
1244 * the ops->func or not.
1245 *
1246 * It's a match if the ip is in the ops->filter_hash or
1247 * the filter_hash does not exist or is empty,
1248 * AND
1249 * the ip is not in the ops->notrace_hash.
1250 *
1251 * This needs to be called with preemption disabled as
1252 * the hashes are freed with call_rcu_sched().
1253 */
1254static int
1255ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1256{
1257 struct ftrace_hash *filter_hash;
1258 struct ftrace_hash *notrace_hash;
1259 int ret;
1260
1261 filter_hash = rcu_dereference_raw(ops->filter_hash);
1262 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1263
1264 if ((!filter_hash || !filter_hash->count ||
1265 ftrace_lookup_ip(filter_hash, ip)) &&
1266 (!notrace_hash || !notrace_hash->count ||
1267 !ftrace_lookup_ip(notrace_hash, ip)))
1268 ret = 1;
1269 else
1270 ret = 0;
1271
1272 return ret;
1273}
1274
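The comment above ftrace_ops_test() describes two independent gates. A stand-alone restatement with plain booleans, useful as a truth-table check (should_trace() is a hypothetical helper, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the rule: trace the ip when the filter hash is missing/empty or
 * contains it, AND the notrace hash is missing/empty or does not contain it. */
static bool should_trace(bool filter_empty, bool in_filter,
                         bool notrace_empty, bool in_notrace)
{
        return (filter_empty || in_filter) &&
               (notrace_empty || !in_notrace);
}

int main(void)
{
        /* empty filter, ip listed in notrace: not traced */
        printf("%d\n", should_trace(true, false, false, true));        /* 0 */
        /* ip in filter, notrace empty: traced */
        printf("%d\n", should_trace(false, true, true, false));        /* 1 */
        return 0;
}
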
916/* 1275/*
917 * This is a double for. Do not use 'break' to break out of the loop, 1276 * This is a double for. Do not use 'break' to break out of the loop,
918 * you must use a goto. 1277 * you must use a goto.
@@ -927,6 +1286,105 @@ static struct dyn_ftrace *ftrace_free_records;
927 } \ 1286 } \
928 } 1287 }
929 1288
1289static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1290 int filter_hash,
1291 bool inc)
1292{
1293 struct ftrace_hash *hash;
1294 struct ftrace_hash *other_hash;
1295 struct ftrace_page *pg;
1296 struct dyn_ftrace *rec;
1297 int count = 0;
1298 int all = 0;
1299
1300 /* Only update if the ops has been registered */
1301 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1302 return;
1303
1304 /*
1305 * In the filter_hash case:
1306 * If the count is zero, we update all records.
1307 * Otherwise we just update the items in the hash.
1308 *
1309 * In the notrace_hash case:
1310 * We enable the update in the hash.
1311 * As disabling notrace means enabling the tracing,
1312 * and enabling notrace means disabling, the inc variable
1313 * gets inverted.
1314 */
1315 if (filter_hash) {
1316 hash = ops->filter_hash;
1317 other_hash = ops->notrace_hash;
1318 if (!hash || !hash->count)
1319 all = 1;
1320 } else {
1321 inc = !inc;
1322 hash = ops->notrace_hash;
1323 other_hash = ops->filter_hash;
1324 /*
1325 * If the notrace hash has no items,
1326 * then there's nothing to do.
1327 */
1328 if (hash && !hash->count)
1329 return;
1330 }
1331
1332 do_for_each_ftrace_rec(pg, rec) {
1333 int in_other_hash = 0;
1334 int in_hash = 0;
1335 int match = 0;
1336
1337 if (all) {
1338 /*
1339 * Only the filter_hash affects all records.
1340 * Update if the record is not in the notrace hash.
1341 */
1342 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1343 match = 1;
1344 } else {
1345 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1346 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
1347
1348 /*
1349 * Apply the filter_hash/notrace_hash rules described above.
1350 */
1351 if (filter_hash && in_hash && !in_other_hash)
1352 match = 1;
1353 else if (!filter_hash && in_hash &&
1354 (in_other_hash || !other_hash->count))
1355 match = 1;
1356 }
1357 if (!match)
1358 continue;
1359
1360 if (inc) {
1361 rec->flags++;
1362 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1363 return;
1364 } else {
1365 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1366 return;
1367 rec->flags--;
1368 }
1369 count++;
1370 /* Shortcut, if we handled all records, we are done. */
1371 if (!all && count == hash->count)
1372 return;
1373 } while_for_each_ftrace_rec();
1374}
1375
1376static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
1377 int filter_hash)
1378{
1379 __ftrace_hash_rec_update(ops, filter_hash, 0);
1380}
1381
1382static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1383 int filter_hash)
1384{
1385 __ftrace_hash_rec_update(ops, filter_hash, 1);
1386}
1387
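A concrete reading of the inc inversion above, using a hypothetical setup: an enabled ops has an empty filter_hash and a notrace_hash containing a single function F. The filter-side update (filter_hash = 1, inc = true) bumps the ref count of every record except F's, because F is found in the other (notrace) hash. On the notrace side the sign is flipped: listing a function in notrace while tracing means one fewer function to trace, so the corresponding update decrements rather than increments. Only records someone actually wants traced end up with a non-zero ref count, which is what the reworked __ftrace_replace_code() checks.
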
930static void ftrace_free_rec(struct dyn_ftrace *rec) 1388static void ftrace_free_rec(struct dyn_ftrace *rec)
931{ 1389{
932 rec->freelist = ftrace_free_records; 1390 rec->freelist = ftrace_free_records;
@@ -1048,18 +1506,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1048 ftrace_addr = (unsigned long)FTRACE_ADDR; 1506 ftrace_addr = (unsigned long)FTRACE_ADDR;
1049 1507
1050 /* 1508 /*
1051 * If this record is not to be traced or we want to disable it, 1509 * If we are enabling tracing:
1052 * then disable it.
1053 * 1510 *
1054 * If we want to enable it and filtering is off, then enable it. 1511 * If the record has a ref count, then we need to enable it
1512 * because someone is using it.
1055 * 1513 *
1056 * If we want to enable it and filtering is on, enable it only if 1514 * Otherwise we make sure its disabled.
1057 * it's filtered 1515 *
1516 * If we are disabling tracing, then disable all records that
1517 * are enabled.
1058 */ 1518 */
1059 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { 1519 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1060 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) 1520 flag = FTRACE_FL_ENABLED;
1061 flag = FTRACE_FL_ENABLED;
1062 }
1063 1521
1064 /* If the state of this record hasn't changed, then do nothing */ 1522 /* If the state of this record hasn't changed, then do nothing */
1065 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1523 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1080,19 +1538,16 @@ static void ftrace_replace_code(int enable)
1080 struct ftrace_page *pg; 1538 struct ftrace_page *pg;
1081 int failed; 1539 int failed;
1082 1540
1541 if (unlikely(ftrace_disabled))
1542 return;
1543
1083 do_for_each_ftrace_rec(pg, rec) { 1544 do_for_each_ftrace_rec(pg, rec) {
1084 /* 1545 /* Skip over free records */
1085 * Skip over free records, records that have 1546 if (rec->flags & FTRACE_FL_FREE)
1086 * failed and not converted.
1087 */
1088 if (rec->flags & FTRACE_FL_FREE ||
1089 rec->flags & FTRACE_FL_FAILED ||
1090 !(rec->flags & FTRACE_FL_CONVERTED))
1091 continue; 1547 continue;
1092 1548
1093 failed = __ftrace_replace_code(rec, enable); 1549 failed = __ftrace_replace_code(rec, enable);
1094 if (failed) { 1550 if (failed) {
1095 rec->flags |= FTRACE_FL_FAILED;
1096 ftrace_bug(failed, rec->ip); 1551 ftrace_bug(failed, rec->ip);
1097 /* Stop processing */ 1552 /* Stop processing */
1098 return; 1553 return;
@@ -1108,10 +1563,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1108 1563
1109 ip = rec->ip; 1564 ip = rec->ip;
1110 1565
1566 if (unlikely(ftrace_disabled))
1567 return 0;
1568
1111 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 1569 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
1112 if (ret) { 1570 if (ret) {
1113 ftrace_bug(ret, ip); 1571 ftrace_bug(ret, ip);
1114 rec->flags |= FTRACE_FL_FAILED;
1115 return 0; 1572 return 0;
1116 } 1573 }
1117 return 1; 1574 return 1;
@@ -1172,6 +1629,7 @@ static void ftrace_run_update_code(int command)
1172 1629
1173static ftrace_func_t saved_ftrace_func; 1630static ftrace_func_t saved_ftrace_func;
1174static int ftrace_start_up; 1631static int ftrace_start_up;
1632static int global_start_up;
1175 1633
1176static void ftrace_startup_enable(int command) 1634static void ftrace_startup_enable(int command)
1177{ 1635{
@@ -1186,19 +1644,38 @@ static void ftrace_startup_enable(int command)
1186 ftrace_run_update_code(command); 1644 ftrace_run_update_code(command);
1187} 1645}
1188 1646
1189static void ftrace_startup(int command) 1647static int ftrace_startup(struct ftrace_ops *ops, int command)
1190{ 1648{
1649 bool hash_enable = true;
1650
1191 if (unlikely(ftrace_disabled)) 1651 if (unlikely(ftrace_disabled))
1192 return; 1652 return -ENODEV;
1193 1653
1194 ftrace_start_up++; 1654 ftrace_start_up++;
1195 command |= FTRACE_ENABLE_CALLS; 1655 command |= FTRACE_ENABLE_CALLS;
1196 1656
1657 /* ops marked global share the filter hashes */
1658 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1659 ops = &global_ops;
1660 /* Don't update hash if global is already set */
1661 if (global_start_up)
1662 hash_enable = false;
1663 global_start_up++;
1664 }
1665
1666 ops->flags |= FTRACE_OPS_FL_ENABLED;
1667 if (hash_enable)
1668 ftrace_hash_rec_enable(ops, 1);
1669
1197 ftrace_startup_enable(command); 1670 ftrace_startup_enable(command);
1671
1672 return 0;
1198} 1673}
1199 1674
1200static void ftrace_shutdown(int command) 1675static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1201{ 1676{
1677 bool hash_disable = true;
1678
1202 if (unlikely(ftrace_disabled)) 1679 if (unlikely(ftrace_disabled))
1203 return; 1680 return;
1204 1681
@@ -1210,6 +1687,23 @@ static void ftrace_shutdown(int command)
1210 */ 1687 */
1211 WARN_ON_ONCE(ftrace_start_up < 0); 1688 WARN_ON_ONCE(ftrace_start_up < 0);
1212 1689
1690 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1691 ops = &global_ops;
1692 global_start_up--;
1693 WARN_ON_ONCE(global_start_up < 0);
1694 /* Don't update hash if global still has users */
1695 if (global_start_up) {
1696 WARN_ON_ONCE(!ftrace_start_up);
1697 hash_disable = false;
1698 }
1699 }
1700
1701 if (hash_disable)
1702 ftrace_hash_rec_disable(ops, 1);
1703
1704 if (ops != &global_ops || !global_start_up)
1705 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1706
1213 if (!ftrace_start_up) 1707 if (!ftrace_start_up)
1214 command |= FTRACE_DISABLE_CALLS; 1708 command |= FTRACE_DISABLE_CALLS;
1215 1709
@@ -1226,8 +1720,6 @@ static void ftrace_shutdown(int command)
1226 1720
1227static void ftrace_startup_sysctl(void) 1721static void ftrace_startup_sysctl(void)
1228{ 1722{
1229 int command = FTRACE_ENABLE_MCOUNT;
1230
1231 if (unlikely(ftrace_disabled)) 1723 if (unlikely(ftrace_disabled))
1232 return; 1724 return;
1233 1725
@@ -1235,23 +1727,17 @@ static void ftrace_startup_sysctl(void)
1235 saved_ftrace_func = NULL; 1727 saved_ftrace_func = NULL;
1236 /* ftrace_start_up is true if we want ftrace running */ 1728 /* ftrace_start_up is true if we want ftrace running */
1237 if (ftrace_start_up) 1729 if (ftrace_start_up)
1238 command |= FTRACE_ENABLE_CALLS; 1730 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1239
1240 ftrace_run_update_code(command);
1241} 1731}
1242 1732
1243static void ftrace_shutdown_sysctl(void) 1733static void ftrace_shutdown_sysctl(void)
1244{ 1734{
1245 int command = FTRACE_DISABLE_MCOUNT;
1246
1247 if (unlikely(ftrace_disabled)) 1735 if (unlikely(ftrace_disabled))
1248 return; 1736 return;
1249 1737
1250 /* ftrace_start_up is true if ftrace is running */ 1738 /* ftrace_start_up is true if ftrace is running */
1251 if (ftrace_start_up) 1739 if (ftrace_start_up)
1252 command |= FTRACE_DISABLE_CALLS; 1740 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
1253
1254 ftrace_run_update_code(command);
1255} 1741}
1256 1742
1257static cycle_t ftrace_update_time; 1743static cycle_t ftrace_update_time;
@@ -1277,15 +1763,15 @@ static int ftrace_update_code(struct module *mod)
1277 p->flags = 0L; 1763 p->flags = 0L;
1278 1764
1279 /* 1765 /*
1280 * Do the initial record convertion from mcount jump 1766 * Do the initial record conversion from mcount jump
1281 * to the NOP instructions. 1767 * to the NOP instructions.
1282 */ 1768 */
1283 if (!ftrace_code_disable(mod, p)) { 1769 if (!ftrace_code_disable(mod, p)) {
1284 ftrace_free_rec(p); 1770 ftrace_free_rec(p);
1285 continue; 1771 /* Game over */
1772 break;
1286 } 1773 }
1287 1774
1288 p->flags |= FTRACE_FL_CONVERTED;
1289 ftrace_update_cnt++; 1775 ftrace_update_cnt++;
1290 1776
1291 /* 1777 /*
@@ -1360,32 +1846,39 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1360enum { 1846enum {
1361 FTRACE_ITER_FILTER = (1 << 0), 1847 FTRACE_ITER_FILTER = (1 << 0),
1362 FTRACE_ITER_NOTRACE = (1 << 1), 1848 FTRACE_ITER_NOTRACE = (1 << 1),
1363 FTRACE_ITER_FAILURES = (1 << 2), 1849 FTRACE_ITER_PRINTALL = (1 << 2),
1364 FTRACE_ITER_PRINTALL = (1 << 3), 1850 FTRACE_ITER_HASH = (1 << 3),
1365 FTRACE_ITER_HASH = (1 << 4), 1851 FTRACE_ITER_ENABLED = (1 << 4),
1366}; 1852};
1367 1853
1368#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1854#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1369 1855
1370struct ftrace_iterator { 1856struct ftrace_iterator {
1371 struct ftrace_page *pg; 1857 loff_t pos;
1372 int hidx; 1858 loff_t func_pos;
1373 int idx; 1859 struct ftrace_page *pg;
1374 unsigned flags; 1860 struct dyn_ftrace *func;
1375 struct trace_parser parser; 1861 struct ftrace_func_probe *probe;
1862 struct trace_parser parser;
1863 struct ftrace_hash *hash;
1864 struct ftrace_ops *ops;
1865 int hidx;
1866 int idx;
1867 unsigned flags;
1376}; 1868};
1377 1869
1378static void * 1870static void *
1379t_hash_next(struct seq_file *m, void *v, loff_t *pos) 1871t_hash_next(struct seq_file *m, loff_t *pos)
1380{ 1872{
1381 struct ftrace_iterator *iter = m->private; 1873 struct ftrace_iterator *iter = m->private;
1382 struct hlist_node *hnd = v; 1874 struct hlist_node *hnd = NULL;
1383 struct hlist_head *hhd; 1875 struct hlist_head *hhd;
1384 1876
1385 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
1386
1387 (*pos)++; 1877 (*pos)++;
1878 iter->pos = *pos;
1388 1879
1880 if (iter->probe)
1881 hnd = &iter->probe->node;
1389 retry: 1882 retry:
1390 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 1883 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
1391 return NULL; 1884 return NULL;
@@ -1408,7 +1901,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
1408 } 1901 }
1409 } 1902 }
1410 1903
1411 return hnd; 1904 if (WARN_ON_ONCE(!hnd))
1905 return NULL;
1906
1907 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
1908
1909 return iter;
1412} 1910}
1413 1911
1414static void *t_hash_start(struct seq_file *m, loff_t *pos) 1912static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1417,26 +1915,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1417 void *p = NULL; 1915 void *p = NULL;
1418 loff_t l; 1916 loff_t l;
1419 1917
1420 if (!(iter->flags & FTRACE_ITER_HASH)) 1918 if (iter->func_pos > *pos)
1421 *pos = 0; 1919 return NULL;
1422
1423 iter->flags |= FTRACE_ITER_HASH;
1424 1920
1425 iter->hidx = 0; 1921 iter->hidx = 0;
1426 for (l = 0; l <= *pos; ) { 1922 for (l = 0; l <= (*pos - iter->func_pos); ) {
1427 p = t_hash_next(m, p, &l); 1923 p = t_hash_next(m, &l);
1428 if (!p) 1924 if (!p)
1429 break; 1925 break;
1430 } 1926 }
1431 return p; 1927 if (!p)
1928 return NULL;
1929
1930 /* Only set this if we have an item */
1931 iter->flags |= FTRACE_ITER_HASH;
1932
1933 return iter;
1432} 1934}
1433 1935
1434static int t_hash_show(struct seq_file *m, void *v) 1936static int
1937t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
1435{ 1938{
1436 struct ftrace_func_probe *rec; 1939 struct ftrace_func_probe *rec;
1437 struct hlist_node *hnd = v;
1438 1940
1439 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1941 rec = iter->probe;
1942 if (WARN_ON_ONCE(!rec))
1943 return -EIO;
1440 1944
1441 if (rec->ops->print) 1945 if (rec->ops->print)
1442 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1946 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1454,15 +1958,20 @@ static void *
1454t_next(struct seq_file *m, void *v, loff_t *pos) 1958t_next(struct seq_file *m, void *v, loff_t *pos)
1455{ 1959{
1456 struct ftrace_iterator *iter = m->private; 1960 struct ftrace_iterator *iter = m->private;
1961 struct ftrace_ops *ops = &global_ops;
1457 struct dyn_ftrace *rec = NULL; 1962 struct dyn_ftrace *rec = NULL;
1458 1963
1964 if (unlikely(ftrace_disabled))
1965 return NULL;
1966
1459 if (iter->flags & FTRACE_ITER_HASH) 1967 if (iter->flags & FTRACE_ITER_HASH)
1460 return t_hash_next(m, v, pos); 1968 return t_hash_next(m, pos);
1461 1969
1462 (*pos)++; 1970 (*pos)++;
1971 iter->pos = iter->func_pos = *pos;
1463 1972
1464 if (iter->flags & FTRACE_ITER_PRINTALL) 1973 if (iter->flags & FTRACE_ITER_PRINTALL)
1465 return NULL; 1974 return t_hash_start(m, pos);
1466 1975
1467 retry: 1976 retry:
1468 if (iter->idx >= iter->pg->index) { 1977 if (iter->idx >= iter->pg->index) {
@@ -1475,38 +1984,59 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1475 rec = &iter->pg->records[iter->idx++]; 1984 rec = &iter->pg->records[iter->idx++];
1476 if ((rec->flags & FTRACE_FL_FREE) || 1985 if ((rec->flags & FTRACE_FL_FREE) ||
1477 1986
1478 (!(iter->flags & FTRACE_ITER_FAILURES) &&
1479 (rec->flags & FTRACE_FL_FAILED)) ||
1480
1481 ((iter->flags & FTRACE_ITER_FAILURES) &&
1482 !(rec->flags & FTRACE_FL_FAILED)) ||
1483
1484 ((iter->flags & FTRACE_ITER_FILTER) && 1987 ((iter->flags & FTRACE_ITER_FILTER) &&
1485 !(rec->flags & FTRACE_FL_FILTER)) || 1988 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
1486 1989
1487 ((iter->flags & FTRACE_ITER_NOTRACE) && 1990 ((iter->flags & FTRACE_ITER_NOTRACE) &&
1488 !(rec->flags & FTRACE_FL_NOTRACE))) { 1991 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
1992
1993 ((iter->flags & FTRACE_ITER_ENABLED) &&
1994 !(rec->flags & ~FTRACE_FL_MASK))) {
1995
1489 rec = NULL; 1996 rec = NULL;
1490 goto retry; 1997 goto retry;
1491 } 1998 }
1492 } 1999 }
1493 2000
1494 return rec; 2001 if (!rec)
2002 return t_hash_start(m, pos);
2003
2004 iter->func = rec;
2005
2006 return iter;
2007}
2008
2009static void reset_iter_read(struct ftrace_iterator *iter)
2010{
2011 iter->pos = 0;
2012 iter->func_pos = 0;
2013 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
1495} 2014}
1496 2015
1497static void *t_start(struct seq_file *m, loff_t *pos) 2016static void *t_start(struct seq_file *m, loff_t *pos)
1498{ 2017{
1499 struct ftrace_iterator *iter = m->private; 2018 struct ftrace_iterator *iter = m->private;
2019 struct ftrace_ops *ops = &global_ops;
1500 void *p = NULL; 2020 void *p = NULL;
1501 loff_t l; 2021 loff_t l;
1502 2022
1503 mutex_lock(&ftrace_lock); 2023 mutex_lock(&ftrace_lock);
2024
2025 if (unlikely(ftrace_disabled))
2026 return NULL;
2027
2028 /*
2029 * If an lseek was done, then reset and start from beginning.
2030 */
2031 if (*pos < iter->pos)
2032 reset_iter_read(iter);
2033
1504 /* 2034 /*
1505 * For set_ftrace_filter reading, if we have the filter 2035 * For set_ftrace_filter reading, if we have the filter
1506 * off, we can short cut and just print out that all 2036 * off, we can short cut and just print out that all
1507 * functions are enabled. 2037 * functions are enabled.
1508 */ 2038 */
1509 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { 2039 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
1510 if (*pos > 0) 2040 if (*pos > 0)
1511 return t_hash_start(m, pos); 2041 return t_hash_start(m, pos);
1512 iter->flags |= FTRACE_ITER_PRINTALL; 2042 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -1518,6 +2048,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1518 if (iter->flags & FTRACE_ITER_HASH) 2048 if (iter->flags & FTRACE_ITER_HASH)
1519 return t_hash_start(m, pos); 2049 return t_hash_start(m, pos);
1520 2050
2051 /*
2052 * Unfortunately, we need to restart at ftrace_pages_start
2053 * every time we let go of the ftrace_mutex. This is because
2054 * those pointers can change without the lock.
2055 */
1521 iter->pg = ftrace_pages_start; 2056 iter->pg = ftrace_pages_start;
1522 iter->idx = 0; 2057 iter->idx = 0;
1523 for (l = 0; l <= *pos; ) { 2058 for (l = 0; l <= *pos; ) {
@@ -1526,10 +2061,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1526 break; 2061 break;
1527 } 2062 }
1528 2063
1529 if (!p && iter->flags & FTRACE_ITER_FILTER) 2064 if (!p) {
1530 return t_hash_start(m, pos); 2065 if (iter->flags & FTRACE_ITER_FILTER)
2066 return t_hash_start(m, pos);
1531 2067
1532 return p; 2068 return NULL;
2069 }
2070
2071 return iter;
1533} 2072}
1534 2073
1535static void t_stop(struct seq_file *m, void *p) 2074static void t_stop(struct seq_file *m, void *p)
@@ -1540,20 +2079,26 @@ static void t_stop(struct seq_file *m, void *p)
1540static int t_show(struct seq_file *m, void *v) 2079static int t_show(struct seq_file *m, void *v)
1541{ 2080{
1542 struct ftrace_iterator *iter = m->private; 2081 struct ftrace_iterator *iter = m->private;
1543 struct dyn_ftrace *rec = v; 2082 struct dyn_ftrace *rec;
1544 2083
1545 if (iter->flags & FTRACE_ITER_HASH) 2084 if (iter->flags & FTRACE_ITER_HASH)
1546 return t_hash_show(m, v); 2085 return t_hash_show(m, iter);
1547 2086
1548 if (iter->flags & FTRACE_ITER_PRINTALL) { 2087 if (iter->flags & FTRACE_ITER_PRINTALL) {
1549 seq_printf(m, "#### all functions enabled ####\n"); 2088 seq_printf(m, "#### all functions enabled ####\n");
1550 return 0; 2089 return 0;
1551 } 2090 }
1552 2091
2092 rec = iter->func;
2093
1553 if (!rec) 2094 if (!rec)
1554 return 0; 2095 return 0;
1555 2096
1556 seq_printf(m, "%ps\n", (void *)rec->ip); 2097 seq_printf(m, "%ps", (void *)rec->ip);
2098 if (iter->flags & FTRACE_ITER_ENABLED)
2099 seq_printf(m, " (%ld)",
2100 rec->flags & ~FTRACE_FL_MASK);
2101 seq_printf(m, "\n");
1557 2102
1558 return 0; 2103 return 0;
1559} 2104}
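
The t_start()/t_next()/t_stop()/t_show() callbacks reworked above form a standard seq_file iterator behind these debugfs files. As a minimal sketch of how such an iterator is wired up in general (hypothetical demo_* names, not part of this patch):

#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

/* Iterate over three dummy items; *pos is the only iterator state here. */
static void *demo_start(struct seq_file *m, loff_t *pos)
{
	return (*pos < 3) ? pos : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return (*pos < 3) ? pos : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
	/* drop any lock taken in demo_start() */
}

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "item %lld\n", (long long)*(loff_t *)v);
	return 0;
}

static const struct seq_operations demo_seq_ops = {
	.start	= demo_start,
	.next	= demo_next,
	.stop	= demo_stop,
	.show	= demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &demo_seq_ops);
}

static const struct file_operations demo_fops = {
	.owner		= THIS_MODULE,
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
/* e.g. debugfs_create_file("demo", 0444, NULL, NULL, &demo_fops); */
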
@@ -1593,44 +2138,46 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1593} 2138}
1594 2139
1595static int 2140static int
1596ftrace_failures_open(struct inode *inode, struct file *file) 2141ftrace_enabled_open(struct inode *inode, struct file *file)
1597{ 2142{
1598 int ret;
1599 struct seq_file *m;
1600 struct ftrace_iterator *iter; 2143 struct ftrace_iterator *iter;
2144 int ret;
2145
2146 if (unlikely(ftrace_disabled))
2147 return -ENODEV;
2148
2149 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2150 if (!iter)
2151 return -ENOMEM;
2152
2153 iter->pg = ftrace_pages_start;
2154 iter->flags = FTRACE_ITER_ENABLED;
1601 2155
1602 ret = ftrace_avail_open(inode, file); 2156 ret = seq_open(file, &show_ftrace_seq_ops);
1603 if (!ret) { 2157 if (!ret) {
1604 m = (struct seq_file *)file->private_data; 2158 struct seq_file *m = file->private_data;
1605 iter = (struct ftrace_iterator *)m->private; 2159
1606 iter->flags = FTRACE_ITER_FAILURES; 2160 m->private = iter;
2161 } else {
2162 kfree(iter);
1607 } 2163 }
1608 2164
1609 return ret; 2165 return ret;
1610} 2166}
1611 2167
1612 2168static void ftrace_filter_reset(struct ftrace_hash *hash)
1613static void ftrace_filter_reset(int enable)
1614{ 2169{
1615 struct ftrace_page *pg;
1616 struct dyn_ftrace *rec;
1617 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1618
1619 mutex_lock(&ftrace_lock); 2170 mutex_lock(&ftrace_lock);
1620 if (enable) 2171 ftrace_hash_clear(hash);
1621 ftrace_filtered = 0;
1622 do_for_each_ftrace_rec(pg, rec) {
1623 if (rec->flags & FTRACE_FL_FAILED)
1624 continue;
1625 rec->flags &= ~type;
1626 } while_for_each_ftrace_rec();
1627 mutex_unlock(&ftrace_lock); 2172 mutex_unlock(&ftrace_lock);
1628} 2173}
1629 2174
1630static int 2175static int
1631ftrace_regex_open(struct inode *inode, struct file *file, int enable) 2176ftrace_regex_open(struct ftrace_ops *ops, int flag,
2177 struct inode *inode, struct file *file)
1632{ 2178{
1633 struct ftrace_iterator *iter; 2179 struct ftrace_iterator *iter;
2180 struct ftrace_hash *hash;
1634 int ret = 0; 2181 int ret = 0;
1635 2182
1636 if (unlikely(ftrace_disabled)) 2183 if (unlikely(ftrace_disabled))
@@ -1645,21 +2192,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1645 return -ENOMEM; 2192 return -ENOMEM;
1646 } 2193 }
1647 2194
2195 if (flag & FTRACE_ITER_NOTRACE)
2196 hash = ops->notrace_hash;
2197 else
2198 hash = ops->filter_hash;
2199
2200 iter->ops = ops;
2201 iter->flags = flag;
2202
2203 if (file->f_mode & FMODE_WRITE) {
2204 mutex_lock(&ftrace_lock);
2205 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2206 mutex_unlock(&ftrace_lock);
2207
2208 if (!iter->hash) {
2209 trace_parser_put(&iter->parser);
2210 kfree(iter);
2211 return -ENOMEM;
2212 }
2213 }
2214
1648 mutex_lock(&ftrace_regex_lock); 2215 mutex_lock(&ftrace_regex_lock);
2216
1649 if ((file->f_mode & FMODE_WRITE) && 2217 if ((file->f_mode & FMODE_WRITE) &&
1650 (file->f_flags & O_TRUNC)) 2218 (file->f_flags & O_TRUNC))
1651 ftrace_filter_reset(enable); 2219 ftrace_filter_reset(iter->hash);
1652 2220
1653 if (file->f_mode & FMODE_READ) { 2221 if (file->f_mode & FMODE_READ) {
1654 iter->pg = ftrace_pages_start; 2222 iter->pg = ftrace_pages_start;
1655 iter->flags = enable ? FTRACE_ITER_FILTER :
1656 FTRACE_ITER_NOTRACE;
1657 2223
1658 ret = seq_open(file, &show_ftrace_seq_ops); 2224 ret = seq_open(file, &show_ftrace_seq_ops);
1659 if (!ret) { 2225 if (!ret) {
1660 struct seq_file *m = file->private_data; 2226 struct seq_file *m = file->private_data;
1661 m->private = iter; 2227 m->private = iter;
1662 } else { 2228 } else {
2229 /* Failed */
2230 free_ftrace_hash(iter->hash);
1663 trace_parser_put(&iter->parser); 2231 trace_parser_put(&iter->parser);
1664 kfree(iter); 2232 kfree(iter);
1665 } 2233 }
@@ -1673,13 +2241,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1673static int 2241static int
1674ftrace_filter_open(struct inode *inode, struct file *file) 2242ftrace_filter_open(struct inode *inode, struct file *file)
1675{ 2243{
1676 return ftrace_regex_open(inode, file, 1); 2244 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
2245 inode, file);
1677} 2246}
1678 2247
1679static int 2248static int
1680ftrace_notrace_open(struct inode *inode, struct file *file) 2249ftrace_notrace_open(struct inode *inode, struct file *file)
1681{ 2250{
1682 return ftrace_regex_open(inode, file, 0); 2251 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
2252 inode, file);
1683} 2253}
1684 2254
1685static loff_t 2255static loff_t
@@ -1724,86 +2294,99 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1724} 2294}
1725 2295
1726static int 2296static int
1727ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) 2297enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
2298{
2299 struct ftrace_func_entry *entry;
2300 int ret = 0;
2301
2302 entry = ftrace_lookup_ip(hash, rec->ip);
2303 if (not) {
2304 /* Do nothing if it doesn't exist */
2305 if (!entry)
2306 return 0;
2307
2308 free_hash_entry(hash, entry);
2309 } else {
2310 /* Do nothing if it exists */
2311 if (entry)
2312 return 0;
2313
2314 ret = add_hash_entry(hash, rec->ip);
2315 }
2316 return ret;
2317}
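
enter_record() keeps a per-ops hash of instruction pointers in step with what the user wrote: a plain pattern adds matching records, a '!'-prefixed pattern removes them. The intended effect of the hashes, roughly sketched with the helpers named in this patch (illustrative only, not the patch's exact fast-path code):

/*
 * Sketch only: an empty filter hash means "trace everything",
 * a populated one restricts tracing to its entries, and the
 * notrace hash always wins.  Hash pointers are protected by
 * ftrace_lock in the real code.
 */
static int demo_ops_traces_ip(struct ftrace_ops *ops, unsigned long ip)
{
	if (ops->filter_hash->count &&
	    !ftrace_lookup_ip(ops->filter_hash, ip))
		return 0;

	if (ftrace_lookup_ip(ops->notrace_hash, ip))
		return 0;

	return 1;
}
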
2318
2319static int
2320ftrace_match_record(struct dyn_ftrace *rec, char *mod,
2321 char *regex, int len, int type)
1728{ 2322{
1729 char str[KSYM_SYMBOL_LEN]; 2323 char str[KSYM_SYMBOL_LEN];
2324 char *modname;
2325
2326 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
2327
2328 if (mod) {
2329 /* module lookup requires matching the module */
2330 if (!modname || strcmp(modname, mod))
2331 return 0;
2332
2333 /* blank search means to match all funcs in the mod */
2334 if (!len)
2335 return 1;
2336 }
1730 2337
1731 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1732 return ftrace_match(str, regex, len, type); 2338 return ftrace_match(str, regex, len, type);
1733} 2339}
1734 2340
1735static int ftrace_match_records(char *buff, int len, int enable) 2341static int
2342match_records(struct ftrace_hash *hash, char *buff,
2343 int len, char *mod, int not)
1736{ 2344{
1737 unsigned int search_len; 2345 unsigned search_len = 0;
1738 struct ftrace_page *pg; 2346 struct ftrace_page *pg;
1739 struct dyn_ftrace *rec; 2347 struct dyn_ftrace *rec;
1740 unsigned long flag; 2348 int type = MATCH_FULL;
1741 char *search; 2349 char *search = buff;
1742 int type;
1743 int not;
1744 int found = 0; 2350 int found = 0;
2351 int ret;
1745 2352
1746 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 2353 if (len) {
1747 type = filter_parse_regex(buff, len, &search, &not); 2354 type = filter_parse_regex(buff, len, &search, &not);
1748 2355 search_len = strlen(search);
1749 search_len = strlen(search); 2356 }
1750 2357
1751 mutex_lock(&ftrace_lock); 2358 mutex_lock(&ftrace_lock);
1752 do_for_each_ftrace_rec(pg, rec) {
1753 2359
1754 if (rec->flags & FTRACE_FL_FAILED) 2360 if (unlikely(ftrace_disabled))
1755 continue; 2361 goto out_unlock;
1756 2362
1757 if (ftrace_match_record(rec, search, search_len, type)) { 2363 do_for_each_ftrace_rec(pg, rec) {
1758 if (not) 2364
1759 rec->flags &= ~flag; 2365 if (ftrace_match_record(rec, mod, search, search_len, type)) {
1760 else 2366 ret = enter_record(hash, rec, not);
1761 rec->flags |= flag; 2367 if (ret < 0) {
2368 found = ret;
2369 goto out_unlock;
2370 }
1762 found = 1; 2371 found = 1;
1763 } 2372 }
1764 /*
1765 * Only enable filtering if we have a function that
1766 * is filtered on.
1767 */
1768 if (enable && (rec->flags & FTRACE_FL_FILTER))
1769 ftrace_filtered = 1;
1770 } while_for_each_ftrace_rec(); 2373 } while_for_each_ftrace_rec();
2374 out_unlock:
1771 mutex_unlock(&ftrace_lock); 2375 mutex_unlock(&ftrace_lock);
1772 2376
1773 return found; 2377 return found;
1774} 2378}
1775 2379
1776static int 2380static int
1777ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, 2381ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
1778 char *regex, int len, int type)
1779{ 2382{
1780 char str[KSYM_SYMBOL_LEN]; 2383 return match_records(hash, buff, len, NULL, 0);
1781 char *modname;
1782
1783 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1784
1785 if (!modname || strcmp(modname, mod))
1786 return 0;
1787
1788 /* blank search means to match all funcs in the mod */
1789 if (len)
1790 return ftrace_match(str, regex, len, type);
1791 else
1792 return 1;
1793} 2384}
1794 2385
1795static int ftrace_match_module_records(char *buff, char *mod, int enable) 2386static int
2387ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
1796{ 2388{
1797 unsigned search_len = 0;
1798 struct ftrace_page *pg;
1799 struct dyn_ftrace *rec;
1800 int type = MATCH_FULL;
1801 char *search = buff;
1802 unsigned long flag;
1803 int not = 0; 2389 int not = 0;
1804 int found = 0;
1805
1806 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1807 2390
1808 /* blank or '*' mean the same */ 2391 /* blank or '*' mean the same */
1809 if (strcmp(buff, "*") == 0) 2392 if (strcmp(buff, "*") == 0)
@@ -1815,32 +2398,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1815 not = 1; 2398 not = 1;
1816 } 2399 }
1817 2400
1818 if (strlen(buff)) { 2401 return match_records(hash, buff, strlen(buff), mod, not);
1819 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1820 search_len = strlen(search);
1821 }
1822
1823 mutex_lock(&ftrace_lock);
1824 do_for_each_ftrace_rec(pg, rec) {
1825
1826 if (rec->flags & FTRACE_FL_FAILED)
1827 continue;
1828
1829 if (ftrace_match_module_record(rec, mod,
1830 search, search_len, type)) {
1831 if (not)
1832 rec->flags &= ~flag;
1833 else
1834 rec->flags |= flag;
1835 found = 1;
1836 }
1837 if (enable && (rec->flags & FTRACE_FL_FILTER))
1838 ftrace_filtered = 1;
1839
1840 } while_for_each_ftrace_rec();
1841 mutex_unlock(&ftrace_lock);
1842
1843 return found;
1844} 2402}
1845 2403
1846/* 2404/*
@@ -1851,7 +2409,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1851static int 2409static int
1852ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2410ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1853{ 2411{
2412 struct ftrace_ops *ops = &global_ops;
2413 struct ftrace_hash *hash;
1854 char *mod; 2414 char *mod;
2415 int ret = -EINVAL;
1855 2416
1856 /* 2417 /*
1857 * cmd == 'mod' because we only registered this func 2418 * cmd == 'mod' because we only registered this func
@@ -1863,15 +2424,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1863 2424
1864 /* we must have a module name */ 2425 /* we must have a module name */
1865 if (!param) 2426 if (!param)
1866 return -EINVAL; 2427 return ret;
1867 2428
1868 mod = strsep(&param, ":"); 2429 mod = strsep(&param, ":");
1869 if (!strlen(mod)) 2430 if (!strlen(mod))
1870 return -EINVAL; 2431 return ret;
1871 2432
1872 if (ftrace_match_module_records(func, mod, enable)) 2433 if (enable)
1873 return 0; 2434 hash = ops->filter_hash;
1874 return -EINVAL; 2435 else
2436 hash = ops->notrace_hash;
2437
2438 ret = ftrace_match_module_records(hash, func, mod);
2439 if (!ret)
2440 ret = -EINVAL;
2441 if (ret < 0)
2442 return ret;
2443
2444 return 0;
1875} 2445}
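
ftrace_mod_cmd below hooks the "mod" keyword into the set_ftrace_filter parser; other keywords can be added the same way. A minimal sketch, assuming the register_ftrace_command() interface that pairs with the unregister_ftrace_command() shown later in this diff (hypothetical "demo" command):

#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical example, not part of this commit. */
static int demo_cmd_callback(char *func, char *cmd, char *param, int enable)
{
	pr_info("demo: func=%s cmd=%s param=%s enable=%d\n",
		func, cmd, param ? param : "(none)", enable);
	return 0;
}

static struct ftrace_func_command demo_cmd = {
	.name	= "demo",
	.func	= demo_cmd_callback,
};

static int __init demo_cmd_init(void)
{
	/* afterwards: echo 'do_fork:demo:hello' > set_ftrace_filter */
	return register_ftrace_command(&demo_cmd);
}
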
1876 2446
1877static struct ftrace_func_command ftrace_mod_cmd = { 2447static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1922,6 +2492,7 @@ static int ftrace_probe_registered;
1922 2492
1923static void __enable_ftrace_function_probe(void) 2493static void __enable_ftrace_function_probe(void)
1924{ 2494{
2495 int ret;
1925 int i; 2496 int i;
1926 2497
1927 if (ftrace_probe_registered) 2498 if (ftrace_probe_registered)
@@ -1936,13 +2507,16 @@ static void __enable_ftrace_function_probe(void)
1936 if (i == FTRACE_FUNC_HASHSIZE) 2507 if (i == FTRACE_FUNC_HASHSIZE)
1937 return; 2508 return;
1938 2509
1939 __register_ftrace_function(&trace_probe_ops); 2510 ret = __register_ftrace_function(&trace_probe_ops);
1940 ftrace_startup(0); 2511 if (!ret)
2512 ret = ftrace_startup(&trace_probe_ops, 0);
2513
1941 ftrace_probe_registered = 1; 2514 ftrace_probe_registered = 1;
1942} 2515}
1943 2516
1944static void __disable_ftrace_function_probe(void) 2517static void __disable_ftrace_function_probe(void)
1945{ 2518{
2519 int ret;
1946 int i; 2520 int i;
1947 2521
1948 if (!ftrace_probe_registered) 2522 if (!ftrace_probe_registered)
@@ -1955,8 +2529,10 @@ static void __disable_ftrace_function_probe(void)
1955 } 2529 }
1956 2530
1957 /* no more funcs left */ 2531 /* no more funcs left */
1958 __unregister_ftrace_function(&trace_probe_ops); 2532 ret = __unregister_ftrace_function(&trace_probe_ops);
1959 ftrace_shutdown(0); 2533 if (!ret)
2534 ftrace_shutdown(&trace_probe_ops, 0);
2535
1960 ftrace_probe_registered = 0; 2536 ftrace_probe_registered = 0;
1961} 2537}
1962 2538
@@ -1992,12 +2568,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1992 return -EINVAL; 2568 return -EINVAL;
1993 2569
1994 mutex_lock(&ftrace_lock); 2570 mutex_lock(&ftrace_lock);
1995 do_for_each_ftrace_rec(pg, rec) {
1996 2571
1997 if (rec->flags & FTRACE_FL_FAILED) 2572 if (unlikely(ftrace_disabled))
1998 continue; 2573 goto out_unlock;
2574
2575 do_for_each_ftrace_rec(pg, rec) {
1999 2576
2000 if (!ftrace_match_record(rec, search, len, type)) 2577 if (!ftrace_match_record(rec, NULL, search, len, type))
2001 continue; 2578 continue;
2002 2579
2003 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2580 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -2158,7 +2735,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
2158 return ret; 2735 return ret;
2159} 2736}
2160 2737
2161static int ftrace_process_regex(char *buff, int len, int enable) 2738static int ftrace_process_regex(struct ftrace_hash *hash,
2739 char *buff, int len, int enable)
2162{ 2740{
2163 char *func, *command, *next = buff; 2741 char *func, *command, *next = buff;
2164 struct ftrace_func_command *p; 2742 struct ftrace_func_command *p;
@@ -2167,9 +2745,12 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2167 func = strsep(&next, ":"); 2745 func = strsep(&next, ":");
2168 2746
2169 if (!next) { 2747 if (!next) {
2170 if (ftrace_match_records(func, len, enable)) 2748 ret = ftrace_match_records(hash, func, len);
2171 return 0; 2749 if (!ret)
2172 return ret; 2750 ret = -EINVAL;
2751 if (ret < 0)
2752 return ret;
2753 return 0;
2173 } 2754 }
2174 2755
2175 /* command found */ 2756 /* command found */
@@ -2202,6 +2783,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2202 2783
2203 mutex_lock(&ftrace_regex_lock); 2784 mutex_lock(&ftrace_regex_lock);
2204 2785
2786 ret = -ENODEV;
2787 if (unlikely(ftrace_disabled))
2788 goto out_unlock;
2789
2205 if (file->f_mode & FMODE_READ) { 2790 if (file->f_mode & FMODE_READ) {
2206 struct seq_file *m = file->private_data; 2791 struct seq_file *m = file->private_data;
2207 iter = m->private; 2792 iter = m->private;
@@ -2213,7 +2798,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2213 2798
2214 if (read >= 0 && trace_parser_loaded(parser) && 2799 if (read >= 0 && trace_parser_loaded(parser) &&
2215 !trace_parser_cont(parser)) { 2800 !trace_parser_cont(parser)) {
2216 ret = ftrace_process_regex(parser->buffer, 2801 ret = ftrace_process_regex(iter->hash, parser->buffer,
2217 parser->idx, enable); 2802 parser->idx, enable);
2218 trace_parser_clear(parser); 2803 trace_parser_clear(parser);
2219 if (ret) 2804 if (ret)
@@ -2241,22 +2826,49 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
2241 return ftrace_regex_write(file, ubuf, cnt, ppos, 0); 2826 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
2242} 2827}
2243 2828
2244static void 2829static int
2245ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) 2830ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2831 int reset, int enable)
2246{ 2832{
2833 struct ftrace_hash **orig_hash;
2834 struct ftrace_hash *hash;
2835 int ret;
2836
2837	/* All global ops use the global ops filters */
2838 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
2839 ops = &global_ops;
2840
2247 if (unlikely(ftrace_disabled)) 2841 if (unlikely(ftrace_disabled))
2248 return; 2842 return -ENODEV;
2843
2844 if (enable)
2845 orig_hash = &ops->filter_hash;
2846 else
2847 orig_hash = &ops->notrace_hash;
2848
2849 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2850 if (!hash)
2851 return -ENOMEM;
2249 2852
2250 mutex_lock(&ftrace_regex_lock); 2853 mutex_lock(&ftrace_regex_lock);
2251 if (reset) 2854 if (reset)
2252 ftrace_filter_reset(enable); 2855 ftrace_filter_reset(hash);
2253 if (buf) 2856 if (buf)
2254 ftrace_match_records(buf, len, enable); 2857 ftrace_match_records(hash, buf, len);
2858
2859 mutex_lock(&ftrace_lock);
2860 ret = ftrace_hash_move(orig_hash, hash);
2861 mutex_unlock(&ftrace_lock);
2862
2255 mutex_unlock(&ftrace_regex_lock); 2863 mutex_unlock(&ftrace_regex_lock);
2864
2865 free_ftrace_hash(hash);
2866 return ret;
2256} 2867}
2257 2868
2258/** 2869/**
2259 * ftrace_set_filter - set a function to filter on in ftrace 2870 * ftrace_set_filter - set a function to filter on in ftrace
2871 * @ops - the ops to set the filter with
2260 * @buf - the string that holds the function filter text. 2872 * @buf - the string that holds the function filter text.
2261 * @len - the length of the string. 2873 * @len - the length of the string.
2262 * @reset - non zero to reset all filters before applying this filter. 2874 * @reset - non zero to reset all filters before applying this filter.
@@ -2264,13 +2876,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
2264 * Filters denote which functions should be enabled when tracing is enabled. 2876 * Filters denote which functions should be enabled when tracing is enabled.
2265 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 2877 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2266 */ 2878 */
2267void ftrace_set_filter(unsigned char *buf, int len, int reset) 2879void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
2880 int len, int reset)
2268{ 2881{
2269 ftrace_set_regex(buf, len, reset, 1); 2882 ftrace_set_regex(ops, buf, len, reset, 1);
2270} 2883}
2884EXPORT_SYMBOL_GPL(ftrace_set_filter);
2271 2885
2272/** 2886/**
2273 * ftrace_set_notrace - set a function to not trace in ftrace 2887 * ftrace_set_notrace - set a function to not trace in ftrace
2888 * @ops - the ops to set the notrace filter with
2274 * @buf - the string that holds the function notrace text. 2889 * @buf - the string that holds the function notrace text.
2275 * @len - the length of the string. 2890 * @len - the length of the string.
2276 * @reset - non zero to reset all filters before applying this filter. 2891 * @reset - non zero to reset all filters before applying this filter.
@@ -2279,10 +2894,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset)
2279 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 2894 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2280 * for tracing. 2895 * for tracing.
2281 */ 2896 */
2282void ftrace_set_notrace(unsigned char *buf, int len, int reset) 2897void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
2898 int len, int reset)
2283{ 2899{
2284 ftrace_set_regex(buf, len, reset, 0); 2900 ftrace_set_regex(ops, buf, len, reset, 0);
2285} 2901}
2902EXPORT_SYMBOL_GPL(ftrace_set_notrace);
2903/**
2904 * ftrace_set_global_filter - set a function to filter on in ftrace
2905 * @ops - the ops to set the filter with
2906 * @buf - the string that holds the function filter text.
2907 * @len - the length of the string.
2908 * @reset - non zero to reset all filters before applying this filter.
2909 *
2910 * Filters denote which functions should be enabled when tracing is enabled.
2911 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2912 */
2913void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
2914{
2915 ftrace_set_regex(&global_ops, buf, len, reset, 1);
2916}
2917EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
2918
2919/**
2920 * ftrace_set_global_notrace - set a function to not trace in ftrace
2921 * @ops - the ops to set the notrace filter with
2922 * @buf - the string that holds the function notrace text.
2923 * @len - the length of the string.
2924 * @reset - non zero to reset all filters before applying this filter.
2925 *
2926 * Notrace Filters denote which functions should not be enabled when tracing
2927 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2928 * for tracing.
2929 */
2930void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
2931{
2932 ftrace_set_regex(&global_ops, buf, len, reset, 0);
2933}
2934EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
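
With filters now tied to a particular ftrace_ops, an in-kernel user sets its filter before registering the callback. A minimal usage sketch (my_* names are hypothetical; the two-argument callback prototype is the one used throughout this tree):

#include <linux/ftrace.h>
#include <linux/init.h>

static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* called for every function the filter selects */
}

static struct ftrace_ops my_ops = {
	.func = my_trace_func,
};

static int __init my_tracer_init(void)
{
	unsigned char buf[] = "schedule";

	/* trace only the scheduler entry point; reset any old filter */
	ftrace_set_filter(&my_ops, buf, sizeof(buf) - 1, 1);
	return register_ftrace_function(&my_ops);
}
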
2286 2935
2287/* 2936/*
2288 * command line interface to allow users to set filters on boot up. 2937 * command line interface to allow users to set filters on boot up.
@@ -2333,22 +2982,23 @@ static void __init set_ftrace_early_graph(char *buf)
2333} 2982}
2334#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2983#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2335 2984
2336static void __init set_ftrace_early_filter(char *buf, int enable) 2985static void __init
2986set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
2337{ 2987{
2338 char *func; 2988 char *func;
2339 2989
2340 while (buf) { 2990 while (buf) {
2341 func = strsep(&buf, ","); 2991 func = strsep(&buf, ",");
2342 ftrace_set_regex(func, strlen(func), 0, enable); 2992 ftrace_set_regex(ops, func, strlen(func), 0, enable);
2343 } 2993 }
2344} 2994}
2345 2995
2346static void __init set_ftrace_early_filters(void) 2996static void __init set_ftrace_early_filters(void)
2347{ 2997{
2348 if (ftrace_filter_buf[0]) 2998 if (ftrace_filter_buf[0])
2349 set_ftrace_early_filter(ftrace_filter_buf, 1); 2999 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
2350 if (ftrace_notrace_buf[0]) 3000 if (ftrace_notrace_buf[0])
2351 set_ftrace_early_filter(ftrace_notrace_buf, 0); 3001 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
2352#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3002#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2353 if (ftrace_graph_buf[0]) 3003 if (ftrace_graph_buf[0])
2354 set_ftrace_early_graph(ftrace_graph_buf); 3004 set_ftrace_early_graph(ftrace_graph_buf);
@@ -2356,11 +3006,14 @@ static void __init set_ftrace_early_filters(void)
2356} 3006}
2357 3007
2358static int 3008static int
2359ftrace_regex_release(struct inode *inode, struct file *file, int enable) 3009ftrace_regex_release(struct inode *inode, struct file *file)
2360{ 3010{
2361 struct seq_file *m = (struct seq_file *)file->private_data; 3011 struct seq_file *m = (struct seq_file *)file->private_data;
2362 struct ftrace_iterator *iter; 3012 struct ftrace_iterator *iter;
3013 struct ftrace_hash **orig_hash;
2363 struct trace_parser *parser; 3014 struct trace_parser *parser;
3015 int filter_hash;
3016 int ret;
2364 3017
2365 mutex_lock(&ftrace_regex_lock); 3018 mutex_lock(&ftrace_regex_lock);
2366 if (file->f_mode & FMODE_READ) { 3019 if (file->f_mode & FMODE_READ) {
@@ -2373,33 +3026,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2373 parser = &iter->parser; 3026 parser = &iter->parser;
2374 if (trace_parser_loaded(parser)) { 3027 if (trace_parser_loaded(parser)) {
2375 parser->buffer[parser->idx] = 0; 3028 parser->buffer[parser->idx] = 0;
2376 ftrace_match_records(parser->buffer, parser->idx, enable); 3029 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
2377 } 3030 }
2378 3031
2379 mutex_lock(&ftrace_lock);
2380 if (ftrace_start_up && ftrace_enabled)
2381 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2382 mutex_unlock(&ftrace_lock);
2383
2384 trace_parser_put(parser); 3032 trace_parser_put(parser);
3033
3034 if (file->f_mode & FMODE_WRITE) {
3035 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3036
3037 if (filter_hash)
3038 orig_hash = &iter->ops->filter_hash;
3039 else
3040 orig_hash = &iter->ops->notrace_hash;
3041
3042 mutex_lock(&ftrace_lock);
3043 /*
3044 * Remove the current set, update the hash and add
3045 * them back.
3046 */
3047 ftrace_hash_rec_disable(iter->ops, filter_hash);
3048 ret = ftrace_hash_move(orig_hash, iter->hash);
3049 if (!ret) {
3050 ftrace_hash_rec_enable(iter->ops, filter_hash);
3051 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3052 && ftrace_enabled)
3053 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3054 }
3055 mutex_unlock(&ftrace_lock);
3056 }
3057 free_ftrace_hash(iter->hash);
2385 kfree(iter); 3058 kfree(iter);
2386 3059
2387 mutex_unlock(&ftrace_regex_lock); 3060 mutex_unlock(&ftrace_regex_lock);
2388 return 0; 3061 return 0;
2389} 3062}
2390 3063
2391static int
2392ftrace_filter_release(struct inode *inode, struct file *file)
2393{
2394 return ftrace_regex_release(inode, file, 1);
2395}
2396
2397static int
2398ftrace_notrace_release(struct inode *inode, struct file *file)
2399{
2400 return ftrace_regex_release(inode, file, 0);
2401}
2402
2403static const struct file_operations ftrace_avail_fops = { 3064static const struct file_operations ftrace_avail_fops = {
2404 .open = ftrace_avail_open, 3065 .open = ftrace_avail_open,
2405 .read = seq_read, 3066 .read = seq_read,
@@ -2407,8 +3068,8 @@ static const struct file_operations ftrace_avail_fops = {
2407 .release = seq_release_private, 3068 .release = seq_release_private,
2408}; 3069};
2409 3070
2410static const struct file_operations ftrace_failures_fops = { 3071static const struct file_operations ftrace_enabled_fops = {
2411 .open = ftrace_failures_open, 3072 .open = ftrace_enabled_open,
2412 .read = seq_read, 3073 .read = seq_read,
2413 .llseek = seq_lseek, 3074 .llseek = seq_lseek,
2414 .release = seq_release_private, 3075 .release = seq_release_private,
@@ -2418,8 +3079,8 @@ static const struct file_operations ftrace_filter_fops = {
2418 .open = ftrace_filter_open, 3079 .open = ftrace_filter_open,
2419 .read = seq_read, 3080 .read = seq_read,
2420 .write = ftrace_filter_write, 3081 .write = ftrace_filter_write,
2421 .llseek = no_llseek, 3082 .llseek = ftrace_regex_lseek,
2422 .release = ftrace_filter_release, 3083 .release = ftrace_regex_release,
2423}; 3084};
2424 3085
2425static const struct file_operations ftrace_notrace_fops = { 3086static const struct file_operations ftrace_notrace_fops = {
@@ -2427,7 +3088,7 @@ static const struct file_operations ftrace_notrace_fops = {
2427 .read = seq_read, 3088 .read = seq_read,
2428 .write = ftrace_notrace_write, 3089 .write = ftrace_notrace_write,
2429 .llseek = ftrace_regex_lseek, 3090 .llseek = ftrace_regex_lseek,
2430 .release = ftrace_notrace_release, 3091 .release = ftrace_regex_release,
2431}; 3092};
2432 3093
2433#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3094#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2536,9 +3197,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2536 bool exists; 3197 bool exists;
2537 int i; 3198 int i;
2538 3199
2539 if (ftrace_disabled)
2540 return -ENODEV;
2541
2542 /* decode regex */ 3200 /* decode regex */
2543 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3201 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2544 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3202 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
@@ -2547,12 +3205,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2547 search_len = strlen(search); 3205 search_len = strlen(search);
2548 3206
2549 mutex_lock(&ftrace_lock); 3207 mutex_lock(&ftrace_lock);
3208
3209 if (unlikely(ftrace_disabled)) {
3210 mutex_unlock(&ftrace_lock);
3211 return -ENODEV;
3212 }
3213
2550 do_for_each_ftrace_rec(pg, rec) { 3214 do_for_each_ftrace_rec(pg, rec) {
2551 3215
2552 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 3216 if (rec->flags & FTRACE_FL_FREE)
2553 continue; 3217 continue;
2554 3218
2555 if (ftrace_match_record(rec, search, search_len, type)) { 3219 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
2556 /* if it is in the array */ 3220 /* if it is in the array */
2557 exists = false; 3221 exists = false;
2558 for (i = 0; i < *idx; i++) { 3222 for (i = 0; i < *idx; i++) {
@@ -2632,6 +3296,7 @@ static const struct file_operations ftrace_graph_fops = {
2632 .read = seq_read, 3296 .read = seq_read,
2633 .write = ftrace_graph_write, 3297 .write = ftrace_graph_write,
2634 .release = ftrace_graph_release, 3298 .release = ftrace_graph_release,
3299 .llseek = seq_lseek,
2635}; 3300};
2636#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3301#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2637 3302
@@ -2641,8 +3306,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2641 trace_create_file("available_filter_functions", 0444, 3306 trace_create_file("available_filter_functions", 0444,
2642 d_tracer, NULL, &ftrace_avail_fops); 3307 d_tracer, NULL, &ftrace_avail_fops);
2643 3308
2644 trace_create_file("failures", 0444, 3309 trace_create_file("enabled_functions", 0444,
2645 d_tracer, NULL, &ftrace_failures_fops); 3310 d_tracer, NULL, &ftrace_enabled_fops);
2646 3311
2647 trace_create_file("set_ftrace_filter", 0644, d_tracer, 3312 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2648 NULL, &ftrace_filter_fops); 3313 NULL, &ftrace_filter_fops);
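
From userspace the same hashes are driven by writing to the tracing debugfs files. A small illustrative program, assuming debugfs is mounted at /sys/kernel/debug and dynamic ftrace is enabled:

/* Illustrative only; error handling kept minimal. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return;
	}
	if (write(fd, s, strlen(s)) < 0)
		perror(path);
	close(fd);
}

int main(void)
{
	const char *dir = "/sys/kernel/debug/tracing";
	char path[256];

	/* trace only vfs_read(), plus everything in the ext4 module */
	snprintf(path, sizeof(path), "%s/set_ftrace_filter", dir);
	write_str(path, "vfs_read\n");
	write_str(path, "*:mod:ext4\n");

	/* and never trace anything matching "*spin_lock*" */
	snprintf(path, sizeof(path), "%s/set_ftrace_notrace", dir);
	write_str(path, "*spin_lock*\n");
	return 0;
}
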
@@ -2682,7 +3347,10 @@ static int ftrace_process_locs(struct module *mod,
2682 ftrace_record_ip(addr); 3347 ftrace_record_ip(addr);
2683 } 3348 }
2684 3349
2685 /* disable interrupts to prevent kstop machine */ 3350 /*
3351 * Disable interrupts to prevent interrupts from executing
3352 * code that is being modified.
3353 */
2686 local_irq_save(flags); 3354 local_irq_save(flags);
2687 ftrace_update_code(mod); 3355 ftrace_update_code(mod);
2688 local_irq_restore(flags); 3356 local_irq_restore(flags);
@@ -2697,10 +3365,11 @@ void ftrace_release_mod(struct module *mod)
2697 struct dyn_ftrace *rec; 3365 struct dyn_ftrace *rec;
2698 struct ftrace_page *pg; 3366 struct ftrace_page *pg;
2699 3367
3368 mutex_lock(&ftrace_lock);
3369
2700 if (ftrace_disabled) 3370 if (ftrace_disabled)
2701 return; 3371 goto out_unlock;
2702 3372
2703 mutex_lock(&ftrace_lock);
2704 do_for_each_ftrace_rec(pg, rec) { 3373 do_for_each_ftrace_rec(pg, rec) {
2705 if (within_module_core(rec->ip, mod)) { 3374 if (within_module_core(rec->ip, mod)) {
2706 /* 3375 /*
@@ -2711,6 +3380,7 @@ void ftrace_release_mod(struct module *mod)
2711 ftrace_free_rec(rec); 3380 ftrace_free_rec(rec);
2712 } 3381 }
2713 } while_for_each_ftrace_rec(); 3382 } while_for_each_ftrace_rec();
3383 out_unlock:
2714 mutex_unlock(&ftrace_lock); 3384 mutex_unlock(&ftrace_lock);
2715} 3385}
2716 3386
@@ -2797,6 +3467,10 @@ void __init ftrace_init(void)
2797 3467
2798#else 3468#else
2799 3469
3470static struct ftrace_ops global_ops = {
3471 .func = ftrace_stub,
3472};
3473
2800static int __init ftrace_nodyn_init(void) 3474static int __init ftrace_nodyn_init(void)
2801{ 3475{
2802 ftrace_enabled = 1; 3476 ftrace_enabled = 1;
@@ -2807,12 +3481,47 @@ device_initcall(ftrace_nodyn_init);
2807static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3481static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
2808static inline void ftrace_startup_enable(int command) { } 3482static inline void ftrace_startup_enable(int command) { }
2809/* Keep as macros so we do not need to define the commands */ 3483/* Keep as macros so we do not need to define the commands */
2810# define ftrace_startup(command) do { } while (0) 3484# define ftrace_startup(ops, command) \
2811# define ftrace_shutdown(command) do { } while (0) 3485 ({ \
3486 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
3487 0; \
3488 })
3489# define ftrace_shutdown(ops, command) do { } while (0)
2812# define ftrace_startup_sysctl() do { } while (0) 3490# define ftrace_startup_sysctl() do { } while (0)
2813# define ftrace_shutdown_sysctl() do { } while (0) 3491# define ftrace_shutdown_sysctl() do { } while (0)
3492
3493static inline int
3494ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3495{
3496 return 1;
3497}
3498
2814#endif /* CONFIG_DYNAMIC_FTRACE */ 3499#endif /* CONFIG_DYNAMIC_FTRACE */
2815 3500
3501static void
3502ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3503{
3504 struct ftrace_ops *op;
3505
3506 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
3507 return;
3508
3509 trace_recursion_set(TRACE_INTERNAL_BIT);
3510 /*
3511 * Some of the ops may be dynamically allocated,
3512 * they must be freed after a synchronize_sched().
3513 */
3514 preempt_disable_notrace();
3515 op = rcu_dereference_raw(ftrace_ops_list);
3516 while (op != &ftrace_list_end) {
3517 if (ftrace_ops_test(op, ip))
3518 op->func(ip, parent_ip);
3519 op = rcu_dereference_raw(op->next);
3520 };
3521 preempt_enable_notrace();
3522 trace_recursion_clear(TRACE_INTERNAL_BIT);
3523}
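
ftrace_ops_list_func() may run every registered callback from nearly any context, so callbacks should avoid locks and sleeping. A sketch of a callback that keeps to per-cpu data (hypothetical, not part of this patch):

#include <linux/ftrace.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_hits);

/* per-cpu increment only: safe in interrupt and preempt-disabled context */
static void demo_count_func(unsigned long ip, unsigned long parent_ip)
{
	this_cpu_inc(demo_hits);
}

static struct ftrace_ops demo_counter_ops = {
	.func = demo_count_func,
};
/* paired with register_ftrace_function()/unregister_ftrace_function() */
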
3524
2816static void clear_ftrace_swapper(void) 3525static void clear_ftrace_swapper(void)
2817{ 3526{
2818 struct task_struct *p; 3527 struct task_struct *p;
@@ -3105,19 +3814,23 @@ void ftrace_kill(void)
3105 */ 3814 */
3106int register_ftrace_function(struct ftrace_ops *ops) 3815int register_ftrace_function(struct ftrace_ops *ops)
3107{ 3816{
3108 int ret; 3817 int ret = -1;
3109
3110 if (unlikely(ftrace_disabled))
3111 return -1;
3112 3818
3113 mutex_lock(&ftrace_lock); 3819 mutex_lock(&ftrace_lock);
3114 3820
3821 if (unlikely(ftrace_disabled))
3822 goto out_unlock;
3823
3115 ret = __register_ftrace_function(ops); 3824 ret = __register_ftrace_function(ops);
3116 ftrace_startup(0); 3825 if (!ret)
3826 ret = ftrace_startup(ops, 0);
3117 3827
3828
3829 out_unlock:
3118 mutex_unlock(&ftrace_lock); 3830 mutex_unlock(&ftrace_lock);
3119 return ret; 3831 return ret;
3120} 3832}
3833EXPORT_SYMBOL_GPL(register_ftrace_function);
3121 3834
3122/** 3835/**
3123 * unregister_ftrace_function - unregister a function for profiling. 3836 * unregister_ftrace_function - unregister a function for profiling.
@@ -3131,25 +3844,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3131 3844
3132 mutex_lock(&ftrace_lock); 3845 mutex_lock(&ftrace_lock);
3133 ret = __unregister_ftrace_function(ops); 3846 ret = __unregister_ftrace_function(ops);
3134 ftrace_shutdown(0); 3847 if (!ret)
3848 ftrace_shutdown(ops, 0);
3135 mutex_unlock(&ftrace_lock); 3849 mutex_unlock(&ftrace_lock);
3136 3850
3137 return ret; 3851 return ret;
3138} 3852}
3853EXPORT_SYMBOL_GPL(unregister_ftrace_function);
3139 3854
3140int 3855int
3141ftrace_enable_sysctl(struct ctl_table *table, int write, 3856ftrace_enable_sysctl(struct ctl_table *table, int write,
3142 void __user *buffer, size_t *lenp, 3857 void __user *buffer, size_t *lenp,
3143 loff_t *ppos) 3858 loff_t *ppos)
3144{ 3859{
3145 int ret; 3860 int ret = -ENODEV;
3146
3147 if (unlikely(ftrace_disabled))
3148 return -ENODEV;
3149 3861
3150 mutex_lock(&ftrace_lock); 3862 mutex_lock(&ftrace_lock);
3151 3863
3152 ret = proc_dointvec(table, write, buffer, lenp, ppos); 3864 if (unlikely(ftrace_disabled))
3865 goto out;
3866
3867 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3153 3868
3154 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3869 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3155 goto out; 3870 goto out;
@@ -3161,11 +3876,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3161 ftrace_startup_sysctl(); 3876 ftrace_startup_sysctl();
3162 3877
3163 /* we are starting ftrace again */ 3878 /* we are starting ftrace again */
3164 if (ftrace_list != &ftrace_list_end) { 3879 if (ftrace_ops_list != &ftrace_list_end) {
3165 if (ftrace_list->next == &ftrace_list_end) 3880 if (ftrace_ops_list->next == &ftrace_list_end)
3166 ftrace_trace_function = ftrace_list->func; 3881 ftrace_trace_function = ftrace_ops_list->func;
3167 else 3882 else
3168 ftrace_trace_function = ftrace_list_func; 3883 ftrace_trace_function = ftrace_ops_list_func;
3169 } 3884 }
3170 3885
3171 } else { 3886 } else {
@@ -3289,7 +4004,7 @@ static int start_graph_tracing(void)
3289 /* The cpu_boot init_task->ret_stack will never be freed */ 4004 /* The cpu_boot init_task->ret_stack will never be freed */
3290 for_each_online_cpu(cpu) { 4005 for_each_online_cpu(cpu) {
3291 if (!idle_task(cpu)->ret_stack) 4006 if (!idle_task(cpu)->ret_stack)
3292 ftrace_graph_init_task(idle_task(cpu)); 4007 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
3293 } 4008 }
3294 4009
3295 do { 4010 do {
@@ -3354,7 +4069,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
3354 ftrace_graph_return = retfunc; 4069 ftrace_graph_return = retfunc;
3355 ftrace_graph_entry = entryfunc; 4070 ftrace_graph_entry = entryfunc;
3356 4071
3357 ftrace_startup(FTRACE_START_FUNC_RET); 4072 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
3358 4073
3359out: 4074out:
3360 mutex_unlock(&ftrace_lock); 4075 mutex_unlock(&ftrace_lock);
@@ -3371,7 +4086,7 @@ void unregister_ftrace_graph(void)
3371 ftrace_graph_active--; 4086 ftrace_graph_active--;
3372 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 4087 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3373 ftrace_graph_entry = ftrace_graph_entry_stub; 4088 ftrace_graph_entry = ftrace_graph_entry_stub;
3374 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 4089 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
3375 unregister_pm_notifier(&ftrace_suspend_notifier); 4090 unregister_pm_notifier(&ftrace_suspend_notifier);
3376 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4091 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3377 4092
@@ -3379,6 +4094,49 @@ void unregister_ftrace_graph(void)
3379 mutex_unlock(&ftrace_lock); 4094 mutex_unlock(&ftrace_lock);
3380} 4095}
3381 4096
4097static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
4098
4099static void
4100graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
4101{
4102 atomic_set(&t->tracing_graph_pause, 0);
4103 atomic_set(&t->trace_overrun, 0);
4104 t->ftrace_timestamp = 0;
4105 /* make curr_ret_stack visible before we add the ret_stack */
4106 smp_wmb();
4107 t->ret_stack = ret_stack;
4108}
4109
4110/*
4111 * Allocate a return stack for the idle task. May be the first
4112 * time through, or it may be done by CPU hotplug online.
4113 */
4114void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
4115{
4116 t->curr_ret_stack = -1;
4117 /*
4118 * The idle task has no parent, it either has its own
4119 * stack or no stack at all.
4120 */
4121 if (t->ret_stack)
4122 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
4123
4124 if (ftrace_graph_active) {
4125 struct ftrace_ret_stack *ret_stack;
4126
4127 ret_stack = per_cpu(idle_ret_stack, cpu);
4128 if (!ret_stack) {
4129 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
4130 * sizeof(struct ftrace_ret_stack),
4131 GFP_KERNEL);
4132 if (!ret_stack)
4133 return;
4134 per_cpu(idle_ret_stack, cpu) = ret_stack;
4135 }
4136 graph_init_task(t, ret_stack);
4137 }
4138}
4139
3382/* Allocate a return stack for newly created task */ 4140/* Allocate a return stack for newly created task */
3383void ftrace_graph_init_task(struct task_struct *t) 4141void ftrace_graph_init_task(struct task_struct *t)
3384{ 4142{
@@ -3394,12 +4152,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3394 GFP_KERNEL); 4152 GFP_KERNEL);
3395 if (!ret_stack) 4153 if (!ret_stack)
3396 return; 4154 return;
3397 atomic_set(&t->tracing_graph_pause, 0); 4155 graph_init_task(t, ret_stack);
3398 atomic_set(&t->trace_overrun, 0);
3399 t->ftrace_timestamp = 0;
3400 /* make curr_ret_stack visable before we add the ret_stack */
3401 smp_wmb();
3402 t->ret_stack = ret_stack;
3403 } 4156 }
3404} 4157}
3405 4158
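
The function-graph entry points reworked above are consumed in pairs by register_ftrace_graph(). A minimal client sketch (my_* names hypothetical; prototypes as used by the calls in this file):

#include <linux/ftrace.h>
#include <linux/init.h>

/* return non-zero to trace this call and its matching return */
static int my_graph_entry(struct ftrace_graph_ent *trace)
{
	return trace->depth < 3;	/* only the first few nesting levels */
}

static void my_graph_return(struct ftrace_graph_ret *trace)
{
	/* trace->rettime - trace->calltime is the raw duration */
}

static int __init my_graph_init(void)
{
	return register_ftrace_graph(my_graph_return, my_graph_entry);
}

static void __exit my_graph_exit(void)
{
	unregister_ftrace_graph();
}
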
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a06161..f55fcf61b223 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 20
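
The cpu_idle tracepoint exported here is emitted around idle entry and exit. A rough sketch of the usual call pattern, assuming the state argument and PWR_EVENT_EXIT sentinel from trace/events/power.h:

#include <linux/smp.h>
#include <trace/events/power.h>

static void demo_idle_loop_once(unsigned int target_state)
{
	trace_cpu_idle(target_state, smp_processor_id());	/* entering idle */

	/* ... architecture-specific low-power wait ... */

	trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());	/* leaving idle */
}
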
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bca96377fd4e..b0c7aa407943 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -224,6 +223,9 @@ enum {
224 RB_LEN_TIME_STAMP = 16, 223 RB_LEN_TIME_STAMP = 16,
225}; 224};
226 225
226#define skip_time_extend(event) \
227 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
228
227static inline int rb_null_event(struct ring_buffer_event *event) 229static inline int rb_null_event(struct ring_buffer_event *event)
228{ 230{
229 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 231 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +250,12 @@ rb_event_data_length(struct ring_buffer_event *event)
248 return length + RB_EVNT_HDR_SIZE; 250 return length + RB_EVNT_HDR_SIZE;
249} 251}
250 252
251/* inline for ring buffer fast paths */ 253/*
252static unsigned 254 * Return the length of the given event. Will return
255 * the length of the time extend if the event is a
256 * time extend.
257 */
258static inline unsigned
253rb_event_length(struct ring_buffer_event *event) 259rb_event_length(struct ring_buffer_event *event)
254{ 260{
255 switch (event->type_len) { 261 switch (event->type_len) {
@@ -274,13 +280,41 @@ rb_event_length(struct ring_buffer_event *event)
274 return 0; 280 return 0;
275} 281}
276 282
283/*
284 * Return total length of time extend and data,
285 * or just the event length for all other events.
286 */
287static inline unsigned
288rb_event_ts_length(struct ring_buffer_event *event)
289{
290 unsigned len = 0;
291
292 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
293 /* time extends include the data event after it */
294 len = RB_LEN_TIME_EXTEND;
295 event = skip_time_extend(event);
296 }
297 return len + rb_event_length(event);
298}
299
277/** 300/**
278 * ring_buffer_event_length - return the length of the event 301 * ring_buffer_event_length - return the length of the event
279 * @event: the event to get the length of 302 * @event: the event to get the length of
303 *
304 * Returns the size of the data load of a data event.
305 * If the event is something other than a data event, it
306 * returns the size of the event itself. With the exception
307 * of a TIME EXTEND, where it still returns the size of the
308 * data load of the data event after it.
280 */ 309 */
281unsigned ring_buffer_event_length(struct ring_buffer_event *event) 310unsigned ring_buffer_event_length(struct ring_buffer_event *event)
282{ 311{
283 unsigned length = rb_event_length(event); 312 unsigned length;
313
314 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
315 event = skip_time_extend(event);
316
317 length = rb_event_length(event);
284 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 318 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
285 return length; 319 return length;
286 length -= RB_EVNT_HDR_SIZE; 320 length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +328,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
294static void * 328static void *
295rb_event_data(struct ring_buffer_event *event) 329rb_event_data(struct ring_buffer_event *event)
296{ 330{
331 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
332 event = skip_time_extend(event);
297 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 333 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
298 /* If length is in len field, then array[0] has the data */ 334 /* If length is in len field, then array[0] has the data */
299 if (event->type_len) 335 if (event->type_len)
@@ -404,9 +440,6 @@ static inline int test_time_stamp(u64 delta)
404/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 440/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 441#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 442
407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
409
410int ring_buffer_print_page_header(struct trace_seq *s) 443int ring_buffer_print_page_header(struct trace_seq *s)
411{ 444{
412 struct buffer_data_page field; 445 struct buffer_data_page field;
@@ -635,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list)
635 * the reader page). But if the next page is a header page, 668 * the reader page). But if the next page is a header page,
636 * its flags will be non zero. 669 * its flags will be non zero.
637 */ 670 */
638static int inline 671static inline int
639rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, 672rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
640 struct buffer_page *page, struct list_head *list) 673 struct buffer_page *page, struct list_head *list)
641{ 674{
@@ -1395,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1395} 1428}
1396EXPORT_SYMBOL_GPL(ring_buffer_resize); 1429EXPORT_SYMBOL_GPL(ring_buffer_resize);
1397 1430
1431void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1432{
1433 mutex_lock(&buffer->mutex);
1434 if (val)
1435 buffer->flags |= RB_FL_OVERWRITE;
1436 else
1437 buffer->flags &= ~RB_FL_OVERWRITE;
1438 mutex_unlock(&buffer->mutex);
1439}
1440EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
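
ring_buffer_change_overwrite() toggles RB_FL_OVERWRITE on an existing buffer (in mainline it is driven from the trace "overwrite" option; treat that as an assumption). A minimal usage sketch:

#include <linux/errno.h>
#include <linux/ring_buffer.h>

static struct ring_buffer *demo_buffer;

static int demo_setup(void)
{
	/* 1 MB per cpu, start in overwrite mode */
	demo_buffer = ring_buffer_alloc(1 << 20, RB_FL_OVERWRITE);
	if (!demo_buffer)
		return -ENOMEM;

	/* later: make writers drop new events instead of overwriting old ones */
	ring_buffer_change_overwrite(demo_buffer, 0);
	return 0;
}
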
1441
1398static inline void * 1442static inline void *
1399__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 1443__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1400{ 1444{
@@ -1434,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1434 return local_read(&bpage->entries) & RB_WRITE_MASK; 1478 return local_read(&bpage->entries) & RB_WRITE_MASK;
1435} 1479}
1436 1480
1437/* Size is determined by what has been commited */ 1481/* Size is determined by what has been committed */
1438static inline unsigned rb_page_size(struct buffer_page *bpage) 1482static inline unsigned rb_page_size(struct buffer_page *bpage)
1439{ 1483{
1440 return rb_page_commit(bpage); 1484 return rb_page_commit(bpage);
@@ -1546,6 +1590,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1546 iter->head = 0; 1590 iter->head = 0;
1547} 1591}
1548 1592
1593/* Slow path, do not inline */
1594static noinline struct ring_buffer_event *
1595rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1596{
1597 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1598
1599 /* Not the first event on the page? */
1600 if (rb_event_index(event)) {
1601 event->time_delta = delta & TS_MASK;
1602 event->array[0] = delta >> TS_SHIFT;
1603 } else {
1604 /* nope, just zero it */
1605 event->time_delta = 0;
1606 event->array[0] = 0;
1607 }
1608
1609 return skip_time_extend(event);
1610}
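
rb_add_time_stamp() packs an oversized delta across the event header: the low bits go into the 27-bit time_delta field and the rest into array[0]. A worked sketch of that split, assuming TS_SHIFT is 27 as elsewhere in this file (plain C, illustrative only):

#include <stdint.h>
#include <stdio.h>

#define DEMO_TS_SHIFT	27
#define DEMO_TS_MASK	((1ULL << DEMO_TS_SHIFT) - 1)

int main(void)
{
	uint64_t delta = 5000000000ULL;		/* ~5s in ns: too big for 27 bits */
	uint32_t time_delta = delta & DEMO_TS_MASK;	/* low 27 bits */
	uint32_t array0 = delta >> DEMO_TS_SHIFT;	/* remaining high bits */
	uint64_t rebuilt = ((uint64_t)array0 << DEMO_TS_SHIFT) | time_delta;

	printf("delta=%llu split into %u/%u, rebuilt=%llu\n",
	       (unsigned long long)delta, time_delta, array0,
	       (unsigned long long)rebuilt);
	return 0;
}
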
1611
1549/** 1612/**
1550 * ring_buffer_update_event - update event type and data 1613 * ring_buffer_update_event - update event type and data
1551 * @event: the even to update 1614 * @event: the even to update
@@ -1558,28 +1621,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1558 * data field. 1621 * data field.
1559 */ 1622 */
1560static void 1623static void
1561rb_update_event(struct ring_buffer_event *event, 1624rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1562 unsigned type, unsigned length) 1625 struct ring_buffer_event *event, unsigned length,
1626 int add_timestamp, u64 delta)
1563{ 1627{
1564 event->type_len = type; 1628 /* Only a commit updates the timestamp */
1565 1629 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1566 switch (type) { 1630 delta = 0;
1567
1568 case RINGBUF_TYPE_PADDING:
1569 case RINGBUF_TYPE_TIME_EXTEND:
1570 case RINGBUF_TYPE_TIME_STAMP:
1571 break;
1572 1631
1573 case 0: 1632 /*
1574 length -= RB_EVNT_HDR_SIZE; 1633 * If we need to add a timestamp, then we
1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) 1634 * add it to the start of the reserved space.
1576 event->array[0] = length; 1635 */
1577 else 1636 if (unlikely(add_timestamp)) {
1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1637 event = rb_add_time_stamp(event, delta);
1579 break; 1638 length -= RB_LEN_TIME_EXTEND;
1580 default: 1639 delta = 0;
1581 BUG();
1582 } 1640 }
1641
1642 event->time_delta = delta;
1643 length -= RB_EVNT_HDR_SIZE;
1644 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1645 event->type_len = 0;
1646 event->array[0] = length;
1647 } else
1648 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1583} 1649}
1584 1650
1585/* 1651/*
@@ -1823,10 +1889,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1823 local_sub(length, &tail_page->write); 1889 local_sub(length, &tail_page->write);
1824} 1890}
1825 1891
1826static struct ring_buffer_event * 1892/*
1893 * This is the slow path, force gcc not to inline it.
1894 */
1895static noinline struct ring_buffer_event *
1827rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1896rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1828 unsigned long length, unsigned long tail, 1897 unsigned long length, unsigned long tail,
1829 struct buffer_page *tail_page, u64 *ts) 1898 struct buffer_page *tail_page, u64 ts)
1830{ 1899{
1831 struct buffer_page *commit_page = cpu_buffer->commit_page; 1900 struct buffer_page *commit_page = cpu_buffer->commit_page;
1832 struct ring_buffer *buffer = cpu_buffer->buffer; 1901 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1978,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1909 * Nested commits always have zero deltas, so 1978 * Nested commits always have zero deltas, so
1910 * just reread the time stamp 1979 * just reread the time stamp
1911 */ 1980 */
1912 *ts = rb_time_stamp(buffer); 1981 ts = rb_time_stamp(buffer);
1913 next_page->page->time_stamp = *ts; 1982 next_page->page->time_stamp = ts;
1914 } 1983 }
1915 1984
1916 out_again: 1985 out_again:
@@ -1929,12 +1998,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1929 1998
1930static struct ring_buffer_event * 1999static struct ring_buffer_event *
1931__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 2000__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1932 unsigned type, unsigned long length, u64 *ts) 2001 unsigned long length, u64 ts,
2002 u64 delta, int add_timestamp)
1933{ 2003{
1934 struct buffer_page *tail_page; 2004 struct buffer_page *tail_page;
1935 struct ring_buffer_event *event; 2005 struct ring_buffer_event *event;
1936 unsigned long tail, write; 2006 unsigned long tail, write;
1937 2007
2008 /*
2009 * If the time delta since the last event is too big to
2010 * hold in the time field of the event, then we append a
2011 * TIME EXTEND event ahead of the data event.
2012 */
2013 if (unlikely(add_timestamp))
2014 length += RB_LEN_TIME_EXTEND;
2015
1938 tail_page = cpu_buffer->tail_page; 2016 tail_page = cpu_buffer->tail_page;
1939 write = local_add_return(length, &tail_page->write); 2017 write = local_add_return(length, &tail_page->write);
1940 2018
@@ -1943,7 +2021,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1943 tail = write - length; 2021 tail = write - length;
1944 2022
1945 /* See if we shot past the end of this buffer page */ 2023 /* See if we shot past the end of this buffer page */
1946 if (write > BUF_PAGE_SIZE) 2024 if (unlikely(write > BUF_PAGE_SIZE))
1947 return rb_move_tail(cpu_buffer, length, tail, 2025 return rb_move_tail(cpu_buffer, length, tail,
1948 tail_page, ts); 2026 tail_page, ts);
1949 2027
@@ -1951,18 +2029,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1951 2029
1952 event = __rb_page_index(tail_page, tail); 2030 event = __rb_page_index(tail_page, tail);
1953 kmemcheck_annotate_bitfield(event, bitfield); 2031 kmemcheck_annotate_bitfield(event, bitfield);
1954 rb_update_event(event, type, length); 2032 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1955 2033
1956 /* The passed in type is zero for DATA */ 2034 local_inc(&tail_page->entries);
1957 if (likely(!type))
1958 local_inc(&tail_page->entries);
1959 2035
1960 /* 2036 /*
1961 * If this is the first commit on the page, then update 2037 * If this is the first commit on the page, then update
1962 * its timestamp. 2038 * its timestamp.
1963 */ 2039 */
1964 if (!tail) 2040 if (!tail)
1965 tail_page->page->time_stamp = *ts; 2041 tail_page->page->time_stamp = ts;
1966 2042
1967 return event; 2043 return event;
1968} 2044}
@@ -1977,7 +2053,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1977 unsigned long addr; 2053 unsigned long addr;
1978 2054
1979 new_index = rb_event_index(event); 2055 new_index = rb_event_index(event);
1980 old_index = new_index + rb_event_length(event); 2056 old_index = new_index + rb_event_ts_length(event);
1981 addr = (unsigned long)event; 2057 addr = (unsigned long)event;
1982 addr &= PAGE_MASK; 2058 addr &= PAGE_MASK;
1983 2059
@@ -2003,76 +2079,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2003 return 0; 2079 return 0;
2004} 2080}
2005 2081
2006static int
2007rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2008 u64 *ts, u64 *delta)
2009{
2010 struct ring_buffer_event *event;
2011 int ret;
2012
2013 WARN_ONCE(*delta > (1ULL << 59),
2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2015 (unsigned long long)*delta,
2016 (unsigned long long)*ts,
2017 (unsigned long long)cpu_buffer->write_stamp);
2018
2019 /*
2020 * The delta is too big, we to add a
2021 * new timestamp.
2022 */
2023 event = __rb_reserve_next(cpu_buffer,
2024 RINGBUF_TYPE_TIME_EXTEND,
2025 RB_LEN_TIME_EXTEND,
2026 ts);
2027 if (!event)
2028 return -EBUSY;
2029
2030 if (PTR_ERR(event) == -EAGAIN)
2031 return -EAGAIN;
2032
2033 /* Only a commited time event can update the write stamp */
2034 if (rb_event_is_commit(cpu_buffer, event)) {
2035 /*
2036 * If this is the first on the page, then it was
2037 * updated with the page itself. Try to discard it
2038 * and if we can't just make it zero.
2039 */
2040 if (rb_event_index(event)) {
2041 event->time_delta = *delta & TS_MASK;
2042 event->array[0] = *delta >> TS_SHIFT;
2043 } else {
2044 /* try to discard, since we do not need this */
2045 if (!rb_try_to_discard(cpu_buffer, event)) {
2046 /* nope, just zero it */
2047 event->time_delta = 0;
2048 event->array[0] = 0;
2049 }
2050 }
2051 cpu_buffer->write_stamp = *ts;
2052 /* let the caller know this was the commit */
2053 ret = 1;
2054 } else {
2055 /* Try to discard the event */
2056 if (!rb_try_to_discard(cpu_buffer, event)) {
2057 /* Darn, this is just wasted space */
2058 event->time_delta = 0;
2059 event->array[0] = 0;
2060 }
2061 ret = 0;
2062 }
2063
2064 *delta = 0;
2065
2066 return ret;
2067}
2068
2069static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2082static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2070{ 2083{
2071 local_inc(&cpu_buffer->committing); 2084 local_inc(&cpu_buffer->committing);
2072 local_inc(&cpu_buffer->commits); 2085 local_inc(&cpu_buffer->commits);
2073} 2086}
2074 2087
2075static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2088static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2076{ 2089{
2077 unsigned long commits; 2090 unsigned long commits;
2078 2091
@@ -2110,9 +2123,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2110 unsigned long length) 2123 unsigned long length)
2111{ 2124{
2112 struct ring_buffer_event *event; 2125 struct ring_buffer_event *event;
2113 u64 ts, delta = 0; 2126 u64 ts, delta;
2114 int commit = 0;
2115 int nr_loops = 0; 2127 int nr_loops = 0;
2128 int add_timestamp;
2129 u64 diff;
2116 2130
2117 rb_start_commit(cpu_buffer); 2131 rb_start_commit(cpu_buffer);
2118 2132
@@ -2133,6 +2147,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2133 2147
2134 length = rb_calculate_event_length(length); 2148 length = rb_calculate_event_length(length);
2135 again: 2149 again:
2150 add_timestamp = 0;
2151 delta = 0;
2152
2136 /* 2153 /*
2137 * We allow for interrupts to reenter here and do a trace. 2154 * We allow for interrupts to reenter here and do a trace.
2138 * If one does, it will cause this original code to loop 2155 * If one does, it will cause this original code to loop
@@ -2146,56 +2163,40 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2146 goto out_fail; 2163 goto out_fail;
2147 2164
2148 ts = rb_time_stamp(cpu_buffer->buffer); 2165 ts = rb_time_stamp(cpu_buffer->buffer);
2166 diff = ts - cpu_buffer->write_stamp;
2149 2167
2150 /* 2168 /* make sure this diff is calculated here */
2151 * Only the first commit can update the timestamp. 2169 barrier();
2152 * Yes there is a race here. If an interrupt comes in
2153 * just after the conditional and it traces too, then it
2154 * will also check the deltas. More than one timestamp may
2155 * also be made. But only the entry that did the actual
2156 * commit will be something other than zero.
2157 */
2158 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2159 rb_page_write(cpu_buffer->tail_page) ==
2160 rb_commit_index(cpu_buffer))) {
2161 u64 diff;
2162
2163 diff = ts - cpu_buffer->write_stamp;
2164
2165 /* make sure this diff is calculated here */
2166 barrier();
2167
2168 /* Did the write stamp get updated already? */
2169 if (unlikely(ts < cpu_buffer->write_stamp))
2170 goto get_event;
2171 2170
2171 /* Did the write stamp get updated already? */
2172 if (likely(ts >= cpu_buffer->write_stamp)) {
2172 delta = diff; 2173 delta = diff;
2173 if (unlikely(test_time_stamp(delta))) { 2174 if (unlikely(test_time_stamp(delta))) {
2174 2175 int local_clock_stable = 1;
2175 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2176#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2176 if (commit == -EBUSY) 2177 local_clock_stable = sched_clock_stable;
2177 goto out_fail; 2178#endif
2178 2179 WARN_ONCE(delta > (1ULL << 59),
2179 if (commit == -EAGAIN) 2180 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2180 goto again; 2181 (unsigned long long)delta,
2181 2182 (unsigned long long)ts,
2182 RB_WARN_ON(cpu_buffer, commit < 0); 2183 (unsigned long long)cpu_buffer->write_stamp,
2184 local_clock_stable ? "" :
2185 "If you just came from a suspend/resume,\n"
2186 "please switch to the trace global clock:\n"
2187 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2188 add_timestamp = 1;
2183 } 2189 }
2184 } 2190 }
2185 2191
2186 get_event: 2192 event = __rb_reserve_next(cpu_buffer, length, ts,
2187 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2193 delta, add_timestamp);
2188 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2194 if (unlikely(PTR_ERR(event) == -EAGAIN))
2189 goto again; 2195 goto again;
2190 2196
2191 if (!event) 2197 if (!event)
2192 goto out_fail; 2198 goto out_fail;
2193 2199
2194 if (!rb_event_is_commit(cpu_buffer, event))
2195 delta = 0;
2196
2197 event->time_delta = delta;
2198
2199 return event; 2200 return event;
2200 2201
2201 out_fail: 2202 out_fail:
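In the rewritten rb_reserve_next_event above, the delta against write_stamp is computed once up front and a TIME_EXTEND is only requested when that delta no longer fits the event's own field. A small sketch of that decision; test_time_stamp is modelled here as a simple width check and the 27-bit width is an assumption, the real macro lives elsewhere in ring_buffer.c:

#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27                              /* assumed width of the in-event delta field */

/* Non-zero when the delta no longer fits the event's own field. */
static int test_time_stamp(uint64_t delta)
{
        return delta >> TS_SHIFT ? 1 : 0;
}

/* Returns 1 when a TIME_EXTEND must accompany the next event. */
static int need_time_extend(uint64_t ts, uint64_t write_stamp, uint64_t *delta)
{
        *delta = 0;
        if (ts < write_stamp)                    /* write_stamp already moved past us */
                return 0;
        *delta = ts - write_stamp;
        return test_time_stamp(*delta);
}

int main(void)
{
        uint64_t delta;

        printf("small gap: %d\n", need_time_extend(1000, 900, &delta));
        printf("huge gap:  %d\n", need_time_extend(1ULL << 40, 0, &delta));
        return 0;
}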
@@ -2207,32 +2208,39 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2207 2208
2208#define TRACE_RECURSIVE_DEPTH 16 2209#define TRACE_RECURSIVE_DEPTH 16
2209 2210
2210static int trace_recursive_lock(void) 2211/* Keep this code out of the fast path cache */
2212static noinline void trace_recursive_fail(void)
2211{ 2213{
2212 current->trace_recursion++;
2213
2214 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2215 return 0;
2216
2217 /* Disable all tracing before we do anything else */ 2214 /* Disable all tracing before we do anything else */
2218 tracing_off_permanent(); 2215 tracing_off_permanent();
2219 2216
2220 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" 2217 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2221 "HC[%lu]:SC[%lu]:NMI[%lu]\n", 2218 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2222 current->trace_recursion, 2219 trace_recursion_buffer(),
2223 hardirq_count() >> HARDIRQ_SHIFT, 2220 hardirq_count() >> HARDIRQ_SHIFT,
2224 softirq_count() >> SOFTIRQ_SHIFT, 2221 softirq_count() >> SOFTIRQ_SHIFT,
2225 in_nmi()); 2222 in_nmi());
2226 2223
2227 WARN_ON_ONCE(1); 2224 WARN_ON_ONCE(1);
2225}
2226
2227static inline int trace_recursive_lock(void)
2228{
2229 trace_recursion_inc();
2230
2231 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
2232 return 0;
2233
2234 trace_recursive_fail();
2235
2228 return -1; 2236 return -1;
2229} 2237}
2230 2238
2231static void trace_recursive_unlock(void) 2239static inline void trace_recursive_unlock(void)
2232{ 2240{
2233 WARN_ON_ONCE(!current->trace_recursion); 2241 WARN_ON_ONCE(!trace_recursion_buffer());
2234 2242
2235 current->trace_recursion--; 2243 trace_recursion_dec();
2236} 2244}
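trace_recursive_lock now keeps only the counter increment and depth test inline and moves the warning path into a separate noinline function, so the common case stays small and cache friendly. The same pattern in stand-alone form; the thread-local counter is a userspace stand-in for current->trace_recursion and the 16-deep limit mirrors TRACE_RECURSIVE_DEPTH above:

#include <stdio.h>

#define RECURSIVE_DEPTH 16

static __thread unsigned long recursion;         /* stand-in for the per-task counter */

/* Slow path: kept out of line so the fast path stays compact. */
static void __attribute__((noinline)) recursive_fail(void)
{
        fprintf(stderr, "recursion too deep: %lu\n", recursion);
}

static inline int recursive_lock(void)
{
        if (++recursion < RECURSIVE_DEPTH)
                return 0;
        recursive_fail();
        return -1;
}

static inline void recursive_unlock(void)
{
        recursion--;
}

int main(void)
{
        if (!recursive_lock()) {
                /* ... traced work would go here ... */
                recursive_unlock();
        }
        return 0;
}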
2237 2245
2238#else 2246#else
@@ -2308,12 +2316,28 @@ static void
2308rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2316rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2309 struct ring_buffer_event *event) 2317 struct ring_buffer_event *event)
2310{ 2318{
2319 u64 delta;
2320
2311 /* 2321 /*
2312 * The event first in the commit queue updates the 2322 * The event first in the commit queue updates the
2313 * time stamp. 2323 * time stamp.
2314 */ 2324 */
2315 if (rb_event_is_commit(cpu_buffer, event)) 2325 if (rb_event_is_commit(cpu_buffer, event)) {
2316 cpu_buffer->write_stamp += event->time_delta; 2326 /*
2327 * A commit event that is first on a page
2328 * updates the write timestamp with the page stamp
2329 */
2330 if (!rb_event_index(event))
2331 cpu_buffer->write_stamp =
2332 cpu_buffer->commit_page->page->time_stamp;
2333 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2334 delta = event->array[0];
2335 delta <<= TS_SHIFT;
2336 delta += event->time_delta;
2337 cpu_buffer->write_stamp += delta;
2338 } else
2339 cpu_buffer->write_stamp += event->time_delta;
2340 }
2317} 2341}
2318 2342
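When the committed event is a TIME_EXTEND, the new rb_update_write_stamp code rebuilds the full delta from array[0] and time_delta before advancing write_stamp. A matching sketch of the reconstruction, again assuming a 27-bit TS_SHIFT; rebuild_delta is an illustrative name:

#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27                              /* assumed width of the in-event delta field */

/* Rebuild the delta that a TIME_EXTEND header carried in two pieces. */
static uint64_t rebuild_delta(uint32_t array0, uint32_t time_delta)
{
        uint64_t delta = array0;

        delta <<= TS_SHIFT;
        delta  += time_delta;
        return delta;
}

int main(void)
{
        /* 8 << 27 == 2^30: the example value from the split sketch earlier */
        printf("delta=%llu\n", (unsigned long long)rebuild_delta(8, 0));
        return 0;
}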
2319static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2343static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2353,6 +2377,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2353 2377
2354static inline void rb_event_discard(struct ring_buffer_event *event) 2378static inline void rb_event_discard(struct ring_buffer_event *event)
2355{ 2379{
2380 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2381 event = skip_time_extend(event);
2382
2356 /* array[0] holds the actual length for the discarded event */ 2383 /* array[0] holds the actual length for the discarded event */
2357 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2384 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2358 event->type_len = RINGBUF_TYPE_PADDING; 2385 event->type_len = RINGBUF_TYPE_PADDING;
@@ -2606,6 +2633,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2606} 2633}
2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); 2634EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2608 2635
2636/*
 2637 * The total number of entries in the ring buffer is the running
 2638 * counter of entries entered into the ring buffer, minus the sum of
 2639 * the entries read from the ring buffer and the number of
 2640 * entries that were overwritten.

2641 */
2642static inline unsigned long
2643rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2644{
2645 return local_read(&cpu_buffer->entries) -
2646 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
2647}
2648
2609/** 2649/**
2610 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2650 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2611 * @buffer: The ring buffer 2651 * @buffer: The ring buffer
@@ -2614,16 +2654,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2614unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 2654unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2615{ 2655{
2616 struct ring_buffer_per_cpu *cpu_buffer; 2656 struct ring_buffer_per_cpu *cpu_buffer;
2617 unsigned long ret;
2618 2657
2619 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2658 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2620 return 0; 2659 return 0;
2621 2660
2622 cpu_buffer = buffer->buffers[cpu]; 2661 cpu_buffer = buffer->buffers[cpu];
2623 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2624 - cpu_buffer->read;
2625 2662
2626 return ret; 2663 return rb_num_of_entries(cpu_buffer);
2627} 2664}
2628EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2665EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2629 2666
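rb_num_of_entries centralizes the arithmetic that ring_buffer_entries_cpu and ring_buffer_entries used to duplicate: what remains in the buffer is everything ever written minus what was overwritten and what was read, e.g. 1000 writes with 200 overruns and 300 reads leave 1000 - (200 + 300) = 500 entries. A trivially checkable sketch of the same formula:

#include <assert.h>

/* entries remaining = written - (overwritten + read) */
static unsigned long num_of_entries(unsigned long written,
                                    unsigned long overrun,
                                    unsigned long read)
{
        return written - (overrun + read);
}

int main(void)
{
        assert(num_of_entries(1000, 200, 300) == 500);
        return 0;
}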
@@ -2684,8 +2721,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2684 /* if you care about this being correct, lock the buffer */ 2721 /* if you care about this being correct, lock the buffer */
2685 for_each_buffer_cpu(buffer, cpu) { 2722 for_each_buffer_cpu(buffer, cpu) {
2686 cpu_buffer = buffer->buffers[cpu]; 2723 cpu_buffer = buffer->buffers[cpu];
2687 entries += (local_read(&cpu_buffer->entries) - 2724 entries += rb_num_of_entries(cpu_buffer);
2688 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2689 } 2725 }
2690 2726
2691 return entries; 2727 return entries;
@@ -2896,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2896 /* 2932 /*
2897 * cpu_buffer->pages just needs to point to the buffer, it 2933 * cpu_buffer->pages just needs to point to the buffer, it
2898 * has no specific buffer page to point to. Lets move it out 2934 * has no specific buffer page to point to. Lets move it out
2899 * of our way so we don't accidently swap it. 2935 * of our way so we don't accidentally swap it.
2900 */ 2936 */
2901 cpu_buffer->pages = reader->list.prev; 2937 cpu_buffer->pages = reader->list.prev;
2902 2938
@@ -3040,12 +3076,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3040 3076
3041 again: 3077 again:
3042 /* 3078 /*
3043 * We repeat when a timestamp is encountered. It is possible 3079 * We repeat when a time extend is encountered.
3044 * to get multiple timestamps from an interrupt entering just 3080 * Since the time extend is always attached to a data event,
3045 * as one timestamp is about to be written, or from discarded 3081 * we should never loop more than once.
3046 * commits. The most that we can have is the number on a single page. 3082 * (We never hit the following condition more than twice).
3047 */ 3083 */
3048 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3084 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3049 return NULL; 3085 return NULL;
3050 3086
3051 reader = rb_get_reader_page(cpu_buffer); 3087 reader = rb_get_reader_page(cpu_buffer);
@@ -3121,14 +3157,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3121 return NULL; 3157 return NULL;
3122 3158
3123 /* 3159 /*
3124 * We repeat when a timestamp is encountered. 3160 * We repeat when a time extend is encountered.
3125 * We can get multiple timestamps by nested interrupts or also 3161 * Since the time extend is always attached to a data event,
3126 * if filtering is on (discarding commits). Since discarding 3162 * we should never loop more than once.
3127 * commits can be frequent we can get a lot of timestamps. 3163 * (We never hit the following condition more than twice).
3128 * But we limit them by not adding timestamps if they begin
3129 * at the start of a page.
3130 */ 3164 */
3131 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3165 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3132 return NULL; 3166 return NULL;
3133 3167
3134 if (rb_per_cpu_empty(cpu_buffer)) 3168 if (rb_per_cpu_empty(cpu_buffer))
@@ -3826,7 +3860,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3826 if (len > (commit - read)) 3860 if (len > (commit - read))
3827 len = (commit - read); 3861 len = (commit - read);
3828 3862
3829 size = rb_event_length(event); 3863 /* Always keep the time extend and data together */
3864 size = rb_event_ts_length(event);
3830 3865
3831 if (len < size) 3866 if (len < size)
3832 goto out_unlock; 3867 goto out_unlock;
@@ -3836,6 +3871,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3836 3871
3837 /* Need to copy one event at a time */ 3872 /* Need to copy one event at a time */
3838 do { 3873 do {
3874 /* We need the size of one event, because
3875 * rb_advance_reader only advances by one event,
3876 * whereas rb_event_ts_length may include the size of
3877 * one or two events.
3878 * We have already ensured there's enough space if this
3879 * is a time extend. */
3880 size = rb_event_length(event);
3839 memcpy(bpage->data + pos, rpage->data + rpos, size); 3881 memcpy(bpage->data + pos, rpage->data + rpos, size);
3840 3882
3841 len -= size; 3883 len -= size;
@@ -3848,8 +3890,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3848 break; 3890 break;
3849 3891
3850 event = rb_reader_event(cpu_buffer); 3892 event = rb_reader_event(cpu_buffer);
3851 size = rb_event_length(event); 3893 /* Always keep the time extend and data together */
3852 } while (len > size); 3894 size = rb_event_ts_length(event);
3895 } while (len >= size);
3853 3896
3854 /* update bpage */ 3897 /* update bpage */
3855 local_set(&bpage->commit, pos); 3898 local_set(&bpage->commit, pos);
@@ -3965,6 +4008,7 @@ static const struct file_operations rb_simple_fops = {
3965 .open = tracing_open_generic, 4008 .open = tracing_open_generic,
3966 .read = rb_simple_read, 4009 .read = rb_simple_read,
3967 .write = rb_simple_write, 4010 .write = rb_simple_write,
4011 .llseek = default_llseek,
3968}; 4012};
3969 4013
3970 4014
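The ring_buffer_read_page hunks above size the space check with rb_event_ts_length, so a TIME_EXTEND header is never split from the data event attached to it, while the inner copy still advances one plain rb_event_length at a time. A minimal sketch of that sizing rule; the 8-byte RB_LEN_TIME_EXTEND value and the helper name event_ts_length are assumptions for illustration:

#include <assert.h>

#define RB_LEN_TIME_EXTEND 8                     /* assumed header size for the sketch */

/*
 * Size used for the space check: a time extend is always accounted
 * together with the data event attached to it.
 */
static unsigned int event_ts_length(int is_time_extend, unsigned int data_len)
{
        return (is_time_extend ? RB_LEN_TIME_EXTEND : 0) + data_len;
}

int main(void)
{
        assert(event_ts_length(0, 24) == 24);    /* plain data event */
        assert(event_ts_length(1, 24) == 32);    /* extend + data stay together */
        return 0;
}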
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ec59f541156..ee9c921d7f21 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,7 +17,6 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
21#include <linux/notifier.h> 20#include <linux/notifier.h>
22#include <linux/irqflags.h> 21#include <linux/irqflags.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
@@ -42,8 +41,6 @@
42#include "trace.h" 41#include "trace.h"
43#include "trace_output.h" 42#include "trace_output.h"
44 43
45#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
46
47/* 44/*
48 * On boot up, the ring buffer is set to the minimum size, so that 45 * On boot up, the ring buffer is set to the minimum size, so that
49 * we do not waste memory on systems that are not using tracing. 46 * we do not waste memory on systems that are not using tracing.
@@ -341,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
341/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
342unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
343 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
344 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
345 342
346static int trace_stop_count; 343static int trace_stop_count;
347static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
@@ -426,6 +423,7 @@ static const char *trace_options[] = {
426 "sleep-time", 423 "sleep-time",
427 "graph-time", 424 "graph-time",
428 "record-cmd", 425 "record-cmd",
426 "overwrite",
429 NULL 427 NULL
430}; 428};
431 429
@@ -781,6 +779,11 @@ __acquires(kernel_lock)
781 tracing_reset_online_cpus(tr); 779 tracing_reset_online_cpus(tr);
782 780
783 current_trace = type; 781 current_trace = type;
782
783 /* If we expanded the buffers, make sure the max is expanded too */
784 if (ring_buffer_expanded && type->use_max_tr)
785 ring_buffer_resize(max_tr.buffer, trace_buf_size);
786
784 /* the test is responsible for initializing and enabling */ 787 /* the test is responsible for initializing and enabling */
785 pr_info("Testing tracer %s: ", type->name); 788 pr_info("Testing tracer %s: ", type->name);
786 ret = type->selftest(type, tr); 789 ret = type->selftest(type, tr);
@@ -793,6 +796,10 @@ __acquires(kernel_lock)
793 /* Only reset on passing, to avoid touching corrupted buffers */ 796 /* Only reset on passing, to avoid touching corrupted buffers */
794 tracing_reset_online_cpus(tr); 797 tracing_reset_online_cpus(tr);
795 798
799 /* Shrink the max buffer again */
800 if (ring_buffer_expanded && type->use_max_tr)
801 ring_buffer_resize(max_tr.buffer, 1);
802
796 printk(KERN_CONT "PASSED\n"); 803 printk(KERN_CONT "PASSED\n");
797 } 804 }
798#endif 805#endif
@@ -1103,7 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1103 1110
1104 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1105 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1106 entry->lock_depth = (tsk) ? tsk->lock_depth : 0; 1113 entry->padding = 0;
1107 entry->flags = 1114 entry->flags =
1108#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1115#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1109 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1116 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1284,6 +1291,8 @@ void trace_dump_stack(void)
1284 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1291 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1285} 1292}
1286 1293
1294static DEFINE_PER_CPU(int, user_stack_count);
1295
1287void 1296void
1288ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1297ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1289{ 1298{
@@ -1302,10 +1311,20 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1302 if (unlikely(in_nmi())) 1311 if (unlikely(in_nmi()))
1303 return; 1312 return;
1304 1313
1314 /*
1315 * prevent recursion, since the user stack tracing may
1316 * trigger other kernel events.
1317 */
1318 preempt_disable();
1319 if (__this_cpu_read(user_stack_count))
1320 goto out;
1321
1322 __this_cpu_inc(user_stack_count);
1323
1305 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1324 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1306 sizeof(*entry), flags, pc); 1325 sizeof(*entry), flags, pc);
1307 if (!event) 1326 if (!event)
1308 return; 1327 goto out_drop_count;
1309 entry = ring_buffer_event_data(event); 1328 entry = ring_buffer_event_data(event);
1310 1329
1311 entry->tgid = current->tgid; 1330 entry->tgid = current->tgid;
@@ -1319,6 +1338,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1319 save_stack_trace_user(&trace); 1338 save_stack_trace_user(&trace);
1320 if (!filter_check_discard(call, entry, buffer, event)) 1339 if (!filter_check_discard(call, entry, buffer, event))
1321 ring_buffer_unlock_commit(buffer, event); 1340 ring_buffer_unlock_commit(buffer, event);
1341
1342 out_drop_count:
1343 __this_cpu_dec(user_stack_count);
1344 out:
1345 preempt_enable();
1322} 1346}
1323 1347
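ftrace_trace_userstack now protects itself with a per-CPU counter under preempt_disable so that a user stack walk that itself triggers tracing cannot recurse. A userspace analogue of the pattern, using a thread-local flag in place of the per-CPU counter and preemption control; record_user_stack is an illustrative name:

#include <stdio.h>

static __thread int user_stack_count;            /* stand-in for the per-CPU counter */

static void record_user_stack(void)
{
        if (user_stack_count)                    /* already inside: refuse to recurse */
                return;
        user_stack_count++;

        /* ... capture and emit the stack trace here ... */
        puts("captured user stack");

        user_stack_count--;
}

int main(void)
{
        record_user_stack();
        return 0;
}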
1324#ifdef UNUSED 1348#ifdef UNUSED
@@ -1733,10 +1757,9 @@ static void print_lat_help_header(struct seq_file *m)
1733 seq_puts(m, "# | / _----=> need-resched \n"); 1757 seq_puts(m, "# | / _----=> need-resched \n");
1734 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1758 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1735 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1759 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1736 seq_puts(m, "# |||| /_--=> lock-depth \n"); 1760 seq_puts(m, "# |||| / delay \n");
1737 seq_puts(m, "# |||||/ delay \n"); 1761 seq_puts(m, "# cmd pid ||||| time | caller \n");
1738 seq_puts(m, "# cmd pid |||||| time | caller \n"); 1762 seq_puts(m, "# \\ / ||||| \\ | / \n");
1739 seq_puts(m, "# \\ / |||||| \\ | / \n");
1740} 1763}
1741 1764
1742static void print_func_help_header(struct seq_file *m) 1765static void print_func_help_header(struct seq_file *m)
@@ -1991,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
1991{ 2014{
1992 enum print_line_t ret; 2015 enum print_line_t ret;
1993 2016
1994 if (iter->lost_events) 2017 if (iter->lost_events &&
1995 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2018 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
1996 iter->cpu, iter->lost_events); 2019 iter->cpu, iter->lost_events))
2020 return TRACE_TYPE_PARTIAL_LINE;
1997 2021
1998 if (iter->trace && iter->trace->print_line) { 2022 if (iter->trace && iter->trace->print_line) {
1999 ret = iter->trace->print_line(iter); 2023 ret = iter->trace->print_line(iter);
@@ -2196,7 +2220,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2196 2220
2197static int tracing_release(struct inode *inode, struct file *file) 2221static int tracing_release(struct inode *inode, struct file *file)
2198{ 2222{
2199 struct seq_file *m = (struct seq_file *)file->private_data; 2223 struct seq_file *m = file->private_data;
2200 struct trace_iterator *iter; 2224 struct trace_iterator *iter;
2201 int cpu; 2225 int cpu;
2202 2226
@@ -2320,11 +2344,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
2320 return count; 2344 return count;
2321} 2345}
2322 2346
2347static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
2348{
2349 if (file->f_mode & FMODE_READ)
2350 return seq_lseek(file, offset, origin);
2351 else
2352 return 0;
2353}
2354
2323static const struct file_operations tracing_fops = { 2355static const struct file_operations tracing_fops = {
2324 .open = tracing_open, 2356 .open = tracing_open,
2325 .read = seq_read, 2357 .read = seq_read,
2326 .write = tracing_write_stub, 2358 .write = tracing_write_stub,
2327 .llseek = seq_lseek, 2359 .llseek = tracing_seek,
2328 .release = tracing_release, 2360 .release = tracing_release,
2329}; 2361};
2330 2362
@@ -2505,6 +2537,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2505 2537
2506 if (mask == TRACE_ITER_RECORD_CMD) 2538 if (mask == TRACE_ITER_RECORD_CMD)
2507 trace_event_enable_cmd_record(enabled); 2539 trace_event_enable_cmd_record(enabled);
2540
2541 if (mask == TRACE_ITER_OVERWRITE)
2542 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2508} 2543}
2509 2544
2510static ssize_t 2545static ssize_t
@@ -2686,6 +2721,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2686 2721
2687 mutex_lock(&trace_types_lock); 2722 mutex_lock(&trace_types_lock);
2688 if (tracer_enabled ^ val) { 2723 if (tracer_enabled ^ val) {
2724
2725 /* Only need to warn if this is used to change the state */
2726 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2727
2689 if (val) { 2728 if (val) {
2690 tracer_enabled = 1; 2729 tracer_enabled = 1;
2691 if (current_trace->start) 2730 if (current_trace->start)
@@ -3192,6 +3231,14 @@ waitagain:
3192 3231
3193 if (iter->seq.len >= cnt) 3232 if (iter->seq.len >= cnt)
3194 break; 3233 break;
3234
3235 /*
3236 * Setting the full flag means we reached the trace_seq buffer
3237 * size and we should leave by partial output condition above.
3238 * One of the trace_seq_* functions is not used properly.
3239 */
3240 WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
3241 iter->ent->type);
3195 } 3242 }
3196 trace_access_unlock(iter->cpu_file); 3243 trace_access_unlock(iter->cpu_file);
3197 trace_event_read_unlock(); 3244 trace_event_read_unlock();
@@ -3202,7 +3249,7 @@ waitagain:
3202 trace_seq_init(&iter->seq); 3249 trace_seq_init(&iter->seq);
3203 3250
3204 /* 3251 /*
3205 * If there was nothing to send to user, inspite of consuming trace 3252 * If there was nothing to send to user, in spite of consuming trace
3206 * entries, go back to wait for more entries. 3253 * entries, go back to wait for more entries.
3207 */ 3254 */
3208 if (sret == -EBUSY) 3255 if (sret == -EBUSY)
@@ -3996,13 +4043,9 @@ static void tracing_init_debugfs_percpu(long cpu)
3996{ 4043{
3997 struct dentry *d_percpu = tracing_dentry_percpu(); 4044 struct dentry *d_percpu = tracing_dentry_percpu();
3998 struct dentry *d_cpu; 4045 struct dentry *d_cpu;
3999 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 4046 char cpu_dir[30]; /* 30 characters should be more than enough */
4000 char cpu_dir[7];
4001 4047
4002 if (cpu > 999 || cpu < 0) 4048 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4003 return;
4004
4005 sprintf(cpu_dir, "cpu%ld", cpu);
4006 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4049 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4007 if (!d_cpu) { 4050 if (!d_cpu) {
4008 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 4051 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
@@ -4531,9 +4574,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4531__init static int tracer_alloc_buffers(void) 4574__init static int tracer_alloc_buffers(void)
4532{ 4575{
4533 int ring_buf_size; 4576 int ring_buf_size;
4577 enum ring_buffer_flags rb_flags;
4534 int i; 4578 int i;
4535 int ret = -ENOMEM; 4579 int ret = -ENOMEM;
4536 4580
4581
4537 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 4582 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
4538 goto out; 4583 goto out;
4539 4584
@@ -4546,12 +4591,13 @@ __init static int tracer_alloc_buffers(void)
4546 else 4591 else
4547 ring_buf_size = 1; 4592 ring_buf_size = 1;
4548 4593
4594 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
4595
4549 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4596 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4550 cpumask_copy(tracing_cpumask, cpu_all_mask); 4597 cpumask_copy(tracing_cpumask, cpu_all_mask);
4551 4598
4552 /* TODO: make the number of buffers hot pluggable with CPUS */ 4599 /* TODO: make the number of buffers hot pluggable with CPUS */
4553 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4600 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
4554 TRACE_BUFFER_FLAGS);
4555 if (!global_trace.buffer) { 4601 if (!global_trace.buffer) {
4556 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4602 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
4557 WARN_ON(1); 4603 WARN_ON(1);
@@ -4561,7 +4607,7 @@ __init static int tracer_alloc_buffers(void)
4561 4607
4562 4608
4563#ifdef CONFIG_TRACER_MAX_TRACE 4609#ifdef CONFIG_TRACER_MAX_TRACE
4564 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); 4610 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
4565 if (!max_tr.buffer) { 4611 if (!max_tr.buffer) {
4566 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4612 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4567 WARN_ON(1); 4613 WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d39b3c5454a5..229f8591f61d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
272 /* If you handled the flag setting, return 0 */ 272 /* If you handled the flag setting, return 0 */
273 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
274 struct tracer *next; 274 struct tracer *next;
275 int print_max;
276 struct tracer_flags *flags; 275 struct tracer_flags *flags;
276 int print_max;
277 int use_max_tr; 277 int use_max_tr;
278}; 278};
279 279
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
343 unsigned long ip, 343 unsigned long ip,
344 unsigned long parent_ip, 344 unsigned long parent_ip,
345 unsigned long flags, int pc); 345 unsigned long flags, int pc);
346void trace_graph_function(struct trace_array *tr,
347 unsigned long ip,
348 unsigned long parent_ip,
349 unsigned long flags, int pc);
346void trace_default_header(struct seq_file *m); 350void trace_default_header(struct seq_file *m);
347void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 351void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
348int trace_empty(struct trace_iterator *iter); 352int trace_empty(struct trace_iterator *iter);
@@ -415,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
415extern unsigned long ftrace_update_tot_cnt; 419extern unsigned long ftrace_update_tot_cnt;
416#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
417extern int DYN_FTRACE_TEST_NAME(void); 421extern int DYN_FTRACE_TEST_NAME(void);
422#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
423extern int DYN_FTRACE_TEST_NAME2(void);
418#endif 424#endif
419 425
420extern int ring_buffer_expanded; 426extern int ring_buffer_expanded;
@@ -602,6 +608,7 @@ enum trace_iterator_flags {
602 TRACE_ITER_SLEEP_TIME = 0x40000, 608 TRACE_ITER_SLEEP_TIME = 0x40000,
603 TRACE_ITER_GRAPH_TIME = 0x80000, 609 TRACE_ITER_GRAPH_TIME = 0x80000,
604 TRACE_ITER_RECORD_CMD = 0x100000, 610 TRACE_ITER_RECORD_CMD = 0x100000,
611 TRACE_ITER_OVERWRITE = 0x200000,
605}; 612};
606 613
607/* 614/*
@@ -657,8 +664,10 @@ struct ftrace_event_field {
657}; 664};
658 665
659struct event_filter { 666struct event_filter {
660 int n_preds; 667 int n_preds; /* Number assigned */
661 struct filter_pred **preds; 668 int a_preds; /* allocated */
669 struct filter_pred *preds;
670 struct filter_pred *root;
662 char *filter_string; 671 char *filter_string;
663}; 672};
664 673
@@ -670,11 +679,23 @@ struct event_subsystem {
670 int nr_events; 679 int nr_events;
671}; 680};
672 681
682#define FILTER_PRED_INVALID ((unsigned short)-1)
683#define FILTER_PRED_IS_RIGHT (1 << 15)
684#define FILTER_PRED_FOLD (1 << 15)
685
686/*
687 * The max preds is the size of unsigned short with
688 * two flags at the MSBs. One bit is used for both the IS_RIGHT
689 * and FOLD flags. The other is reserved.
690 *
691 * 2^14 preds is way more than enough.
692 */
693#define MAX_FILTER_PRED 16384
694
673struct filter_pred; 695struct filter_pred;
674struct regex; 696struct regex;
675 697
676typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 698typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
677 int val1, int val2);
678 699
679typedef int (*regex_match_func)(char *str, struct regex *r, int len); 700typedef int (*regex_match_func)(char *str, struct regex *r, int len);
680 701
@@ -696,11 +717,23 @@ struct filter_pred {
696 filter_pred_fn_t fn; 717 filter_pred_fn_t fn;
697 u64 val; 718 u64 val;
698 struct regex regex; 719 struct regex regex;
699 char *field_name; 720 /*
721 * Leaf nodes use field_name, ops is used by AND and OR
722 * nodes. The field_name is always freed when freeing a pred.
723 * We can overload field_name for ops and have it freed
724 * as well.
725 */
726 union {
727 char *field_name;
728 unsigned short *ops;
729 };
700 int offset; 730 int offset;
701 int not; 731 int not;
702 int op; 732 int op;
703 int pop_n; 733 unsigned short index;
734 unsigned short parent;
735 unsigned short left;
736 unsigned short right;
704}; 737};
705 738
706extern struct list_head ftrace_common_fields; 739extern struct list_head ftrace_common_fields;
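With the flattened layout above, a predicate's parent, left and right fields are indexes into a single preds[] array, and the top bit of the parent field records whether the node is its parent's right child (FILTER_PRED_IS_RIGHT). A small sketch of packing and unpacking that index/flag pair; the helper names are illustrative:

#include <assert.h>

#define PRED_IS_RIGHT (1 << 15)                  /* mirrors FILTER_PRED_IS_RIGHT */

/* Pack a parent index together with the "I am the right child" flag. */
static unsigned short pack_parent(unsigned short index, int is_right)
{
        return index | (is_right ? PRED_IS_RIGHT : 0);
}

static unsigned short parent_index(unsigned short parent)
{
        return parent & ~PRED_IS_RIGHT;
}

static int came_from_right(unsigned short parent)
{
        return !!(parent & PRED_IS_RIGHT);
}

int main(void)
{
        unsigned short p = pack_parent(42, 1);

        assert(parent_index(p) == 42);
        assert(came_from_right(p));
        return 0;
}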
@@ -751,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[];
751 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
752#include "trace_entries.h" 785#include "trace_entries.h"
753 786
787/* Only current can touch trace_recursion */
788#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
789#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
790
791/* Ring buffer has the 10 LSB bits to count */
792#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
793
794/* for function tracing recursion */
795#define TRACE_INTERNAL_BIT (1<<11)
796#define TRACE_GLOBAL_BIT (1<<12)
797
798#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
799#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
800#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
801
754#endif /* _LINUX_KERNEL_TRACE_H */ 802#endif /* _LINUX_KERNEL_TRACE_H */
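The new trace_recursion helpers carve one per-task word into a counter in the low 10 bits plus individual flag bits above it (TRACE_INTERNAL_BIT, TRACE_GLOBAL_BIT). A stand-alone sketch of that layout, with an ordinary variable standing in for current->trace_recursion:

#include <assert.h>

#define RECURSION_COUNT_MASK 0x3ff               /* low 10 bits: the recursion counter */
#define INTERNAL_BIT (1 << 11)                   /* per-caller flag bits above it */
#define GLOBAL_BIT   (1 << 12)

static unsigned long trace_recursion;            /* stand-in for current->trace_recursion */

static unsigned long recursion_buffer(void)
{
        return trace_recursion & RECURSION_COUNT_MASK;
}

int main(void)
{
        trace_recursion |= GLOBAL_BIT;           /* set a flag */
        trace_recursion++;                       /* bump the counter */

        assert(recursion_buffer() == 1);         /* flag bits do not leak into the count */
        assert(trace_recursion & GLOBAL_BIT);
        return 0;
}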
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 685a67d55db0..6302747a1398 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void)
46} 46}
47 47
48/* 48/*
49 * trace_clock(): 'inbetween' trace clock. Not completely serialized, 49 * trace_clock(): 'between' trace clock. Not completely serialized,
50 * but not completely incorrect when crossing CPUs either. 50 * but not completely incorrect when crossing CPUs either.
51 * 51 *
52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of 52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..e32744c84d94 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -27,7 +27,7 @@
27 * in the structure. 27 * in the structure.
28 * 28 *
29 * * for structures within structures, the format of the internal 29 * * for structures within structures, the format of the internal
30 * structure is layed out. This allows the internal structure 30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros 31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they 32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the 33 * will create a compile error if it happens. Since the
@@ -53,7 +53,7 @@
53 */ 53 */
54 54
55/* 55/*
56 * Function trace entry - function address and parent function addres: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY(function, ftrace_entry,
59 59
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
109 */ 109 */
110#define FTRACE_CTX_FIELDS \ 110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \ 111 __field( unsigned int, prev_pid ) \
112 __field( unsigned int, next_pid ) \
113 __field( unsigned int, next_cpu ) \
112 __field( unsigned char, prev_prio ) \ 114 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \ 115 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \ 116 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \ 117 __field( unsigned char, next_state )
117 __field( unsigned int, next_cpu )
118 118
119FTRACE_ENTRY(context_switch, ctx_switch_entry, 119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120 120
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 31cc4cb0dbf2..19a359d5e6d5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12static char *perf_trace_buf[4]; 12static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
13 13
14/* 14/*
15 * Force it to be aligned to unsigned long to avoid misaligned accesses 15 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21/* Count the events in use (per event id, not per instance) */ 21/* Count the events in use (per event id, not per instance) */
22static int total_ref_count; 22static int total_ref_count;
23 23
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event)
26{
27 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0;
30
31 /* Some events are ok to be traced by non-root users... */
32 if (p_event->attach_state == PERF_ATTACH_TASK) {
33 if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
34 return 0;
35 }
36
37 /*
38 * ...otherwise raw tracepoint data can be a severe data leak,
39 * only allow root to have these.
40 */
41 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 return 0;
45}
46
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 48 struct perf_event *p_event)
26{ 49{
27 struct hlist_head *list; 50 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 51 int ret;
29 int cpu; 52 int cpu;
30 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
31 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
32 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
33 return 0; 60 return 0;
34 61
62 ret = -ENOMEM;
63
35 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
36 if (!list) 65 if (!list)
37 goto fail; 66 goto fail;
@@ -42,11 +71,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
42 tp_event->perf_events = list; 71 tp_event->perf_events = list;
43 72
44 if (!total_ref_count) { 73 if (!total_ref_count) {
45 char *buf; 74 char __percpu *buf;
46 int i; 75 int i;
47 76
48 for (i = 0; i < 4; i++) { 77 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
49 buf = (char *)alloc_percpu(perf_trace_t); 78 buf = (char __percpu *)alloc_percpu(perf_trace_t);
50 if (!buf) 79 if (!buf)
51 goto fail; 80 goto fail;
52 81
@@ -65,7 +94,7 @@ fail:
65 if (!total_ref_count) { 94 if (!total_ref_count) {
66 int i; 95 int i;
67 96
68 for (i = 0; i < 4; i++) { 97 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
69 free_percpu(perf_trace_buf[i]); 98 free_percpu(perf_trace_buf[i]);
70 perf_trace_buf[i] = NULL; 99 perf_trace_buf[i] = NULL;
71 } 100 }
@@ -101,22 +130,26 @@ int perf_trace_init(struct perf_event *p_event)
101 return ret; 130 return ret;
102} 131}
103 132
104int perf_trace_enable(struct perf_event *p_event) 133int perf_trace_add(struct perf_event *p_event, int flags)
105{ 134{
106 struct ftrace_event_call *tp_event = p_event->tp_event; 135 struct ftrace_event_call *tp_event = p_event->tp_event;
136 struct hlist_head __percpu *pcpu_list;
107 struct hlist_head *list; 137 struct hlist_head *list;
108 138
109 list = tp_event->perf_events; 139 pcpu_list = tp_event->perf_events;
110 if (WARN_ON_ONCE(!list)) 140 if (WARN_ON_ONCE(!pcpu_list))
111 return -EINVAL; 141 return -EINVAL;
112 142
113 list = this_cpu_ptr(list); 143 if (!(flags & PERF_EF_START))
144 p_event->hw.state = PERF_HES_STOPPED;
145
146 list = this_cpu_ptr(pcpu_list);
114 hlist_add_head_rcu(&p_event->hlist_entry, list); 147 hlist_add_head_rcu(&p_event->hlist_entry, list);
115 148
116 return 0; 149 return 0;
117} 150}
118 151
119void perf_trace_disable(struct perf_event *p_event) 152void perf_trace_del(struct perf_event *p_event, int flags)
120{ 153{
121 hlist_del_rcu(&p_event->hlist_entry); 154 hlist_del_rcu(&p_event->hlist_entry);
122} 155}
@@ -142,7 +175,7 @@ void perf_trace_destroy(struct perf_event *p_event)
142 tp_event->perf_events = NULL; 175 tp_event->perf_events = NULL;
143 176
144 if (!--total_ref_count) { 177 if (!--total_ref_count) {
145 for (i = 0; i < 4; i++) { 178 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
146 free_percpu(perf_trace_buf[i]); 179 free_percpu(perf_trace_buf[i]);
147 perf_trace_buf[i] = NULL; 180 perf_trace_buf[i] = NULL;
148 } 181 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f146328..686ec399f2a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,6 +27,12 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
30LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields); 37LIST_HEAD(ftrace_common_fields);
32 38
@@ -110,7 +116,7 @@ static int trace_define_common_fields(void)
110 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
111 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
112 __common_field(int, pid); 118 __common_field(int, pid);
113 __common_field(int, lock_depth); 119 __common_field(int, padding);
114 120
115 return ret; 121 return ret;
116} 122}
@@ -320,6 +326,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
320{ 326{
321 return __ftrace_set_clr_event(NULL, system, event, set); 327 return __ftrace_set_clr_event(NULL, system, event, set);
322} 328}
329EXPORT_SYMBOL_GPL(trace_set_clr_event);
323 330
324/* 128 should be much more than enough */ 331/* 128 should be much more than enough */
325#define EVENT_BUF_SIZE 127 332#define EVENT_BUF_SIZE 127
@@ -600,21 +607,29 @@ out:
600 607
601enum { 608enum {
602 FORMAT_HEADER = 1, 609 FORMAT_HEADER = 1,
603 FORMAT_PRINTFMT = 2, 610 FORMAT_FIELD_SEPERATOR = 2,
611 FORMAT_PRINTFMT = 3,
604}; 612};
605 613
606static void *f_next(struct seq_file *m, void *v, loff_t *pos) 614static void *f_next(struct seq_file *m, void *v, loff_t *pos)
607{ 615{
608 struct ftrace_event_call *call = m->private; 616 struct ftrace_event_call *call = m->private;
609 struct ftrace_event_field *field; 617 struct ftrace_event_field *field;
610 struct list_head *head; 618 struct list_head *common_head = &ftrace_common_fields;
619 struct list_head *head = trace_get_fields(call);
611 620
612 (*pos)++; 621 (*pos)++;
613 622
614 switch ((unsigned long)v) { 623 switch ((unsigned long)v) {
615 case FORMAT_HEADER: 624 case FORMAT_HEADER:
616 head = &ftrace_common_fields; 625 if (unlikely(list_empty(common_head)))
626 return NULL;
617 627
628 field = list_entry(common_head->prev,
629 struct ftrace_event_field, link);
630 return field;
631
632 case FORMAT_FIELD_SEPERATOR:
618 if (unlikely(list_empty(head))) 633 if (unlikely(list_empty(head)))
619 return NULL; 634 return NULL;
620 635
@@ -626,31 +641,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
626 return NULL; 641 return NULL;
627 } 642 }
628 643
629 head = trace_get_fields(call);
630
631 /*
632 * To separate common fields from event fields, the
633 * LSB is set on the first event field. Clear it in case.
634 */
635 v = (void *)((unsigned long)v & ~1L);
636
637 field = v; 644 field = v;
638 /* 645 if (field->link.prev == common_head)
639 * If this is a common field, and at the end of the list, then 646 return (void *)FORMAT_FIELD_SEPERATOR;
640 * continue with main list. 647 else if (field->link.prev == head)
641 */
642 if (field->link.prev == &ftrace_common_fields) {
643 if (unlikely(list_empty(head)))
644 return NULL;
645 field = list_entry(head->prev, struct ftrace_event_field, link);
646 /* Set the LSB to notify f_show to print an extra newline */
647 field = (struct ftrace_event_field *)
648 ((unsigned long)field | 1);
649 return field;
650 }
651
652 /* If we are done tell f_show to print the format */
653 if (field->link.prev == head)
654 return (void *)FORMAT_PRINTFMT; 648 return (void *)FORMAT_PRINTFMT;
655 649
656 field = list_entry(field->link.prev, struct ftrace_event_field, link); 650 field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +682,16 @@ static int f_show(struct seq_file *m, void *v)
688 seq_printf(m, "format:\n"); 682 seq_printf(m, "format:\n");
689 return 0; 683 return 0;
690 684
685 case FORMAT_FIELD_SEPERATOR:
686 seq_putc(m, '\n');
687 return 0;
688
691 case FORMAT_PRINTFMT: 689 case FORMAT_PRINTFMT:
692 seq_printf(m, "\nprint fmt: %s\n", 690 seq_printf(m, "\nprint fmt: %s\n",
693 call->print_fmt); 691 call->print_fmt);
694 return 0; 692 return 0;
695 } 693 }
696 694
697 /*
698 * To separate common fields from event fields, the
699 * LSB is set on the first event field. Clear it and
700 * print a newline if it is set.
701 */
702 if ((unsigned long)v & 1) {
703 seq_putc(m, '\n');
704 v = (void *)((unsigned long)v & ~1L);
705 }
706
707 field = v; 695 field = v;
708 696
709 /* 697 /*
@@ -951,6 +939,7 @@ static const struct file_operations ftrace_enable_fops = {
951 .open = tracing_open_generic, 939 .open = tracing_open_generic,
952 .read = event_enable_read, 940 .read = event_enable_read,
953 .write = event_enable_write, 941 .write = event_enable_write,
942 .llseek = default_llseek,
954}; 943};
955 944
956static const struct file_operations ftrace_event_format_fops = { 945static const struct file_operations ftrace_event_format_fops = {
@@ -963,29 +952,34 @@ static const struct file_operations ftrace_event_format_fops = {
963static const struct file_operations ftrace_event_id_fops = { 952static const struct file_operations ftrace_event_id_fops = {
964 .open = tracing_open_generic, 953 .open = tracing_open_generic,
965 .read = event_id_read, 954 .read = event_id_read,
955 .llseek = default_llseek,
966}; 956};
967 957
968static const struct file_operations ftrace_event_filter_fops = { 958static const struct file_operations ftrace_event_filter_fops = {
969 .open = tracing_open_generic, 959 .open = tracing_open_generic,
970 .read = event_filter_read, 960 .read = event_filter_read,
971 .write = event_filter_write, 961 .write = event_filter_write,
962 .llseek = default_llseek,
972}; 963};
973 964
974static const struct file_operations ftrace_subsystem_filter_fops = { 965static const struct file_operations ftrace_subsystem_filter_fops = {
975 .open = tracing_open_generic, 966 .open = tracing_open_generic,
976 .read = subsystem_filter_read, 967 .read = subsystem_filter_read,
977 .write = subsystem_filter_write, 968 .write = subsystem_filter_write,
969 .llseek = default_llseek,
978}; 970};
979 971
980static const struct file_operations ftrace_system_enable_fops = { 972static const struct file_operations ftrace_system_enable_fops = {
981 .open = tracing_open_generic, 973 .open = tracing_open_generic,
982 .read = system_enable_read, 974 .read = system_enable_read,
983 .write = system_enable_write, 975 .write = system_enable_write,
976 .llseek = default_llseek,
984}; 977};
985 978
986static const struct file_operations ftrace_show_header_fops = { 979static const struct file_operations ftrace_show_header_fops = {
987 .open = tracing_open_generic, 980 .open = tracing_open_generic,
988 .read = show_header, 981 .read = show_header,
982 .llseek = default_llseek,
989}; 983};
990 984
991static struct dentry *event_trace_events_dir(void) 985static struct dentry *event_trace_events_dir(void)
@@ -1291,7 +1285,7 @@ trace_create_file_ops(struct module *mod)
1291static void trace_module_add_events(struct module *mod) 1285static void trace_module_add_events(struct module *mod)
1292{ 1286{
1293 struct ftrace_module_file_ops *file_ops = NULL; 1287 struct ftrace_module_file_ops *file_ops = NULL;
1294 struct ftrace_event_call *call, *start, *end; 1288 struct ftrace_event_call **call, **start, **end;
1295 1289
1296 start = mod->trace_events; 1290 start = mod->trace_events;
1297 end = mod->trace_events + mod->num_trace_events; 1291 end = mod->trace_events + mod->num_trace_events;
@@ -1304,7 +1298,7 @@ static void trace_module_add_events(struct module *mod)
1304 return; 1298 return;
1305 1299
1306 for_each_event(call, start, end) { 1300 for_each_event(call, start, end) {
1307 __trace_add_event_call(call, mod, 1301 __trace_add_event_call(*call, mod,
1308 &file_ops->id, &file_ops->enable, 1302 &file_ops->id, &file_ops->enable,
1309 &file_ops->filter, &file_ops->format); 1303 &file_ops->filter, &file_ops->format);
1310 } 1304 }
@@ -1374,8 +1368,8 @@ static struct notifier_block trace_module_nb = {
1374 .priority = 0, 1368 .priority = 0,
1375}; 1369};
1376 1370
1377extern struct ftrace_event_call __start_ftrace_events[]; 1371extern struct ftrace_event_call *__start_ftrace_events[];
1378extern struct ftrace_event_call __stop_ftrace_events[]; 1372extern struct ftrace_event_call *__stop_ftrace_events[];
1379 1373
1380static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; 1374static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1381 1375
@@ -1391,7 +1385,7 @@ __setup("trace_event=", setup_trace_event);
1391 1385
1392static __init int event_trace_init(void) 1386static __init int event_trace_init(void)
1393{ 1387{
1394 struct ftrace_event_call *call; 1388 struct ftrace_event_call **call;
1395 struct dentry *d_tracer; 1389 struct dentry *d_tracer;
1396 struct dentry *entry; 1390 struct dentry *entry;
1397 struct dentry *d_events; 1391 struct dentry *d_events;
@@ -1437,7 +1431,7 @@ static __init int event_trace_init(void)
1437 pr_warning("tracing: Failed to allocate common fields"); 1431 pr_warning("tracing: Failed to allocate common fields");
1438 1432
1439 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1433 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1440 __trace_add_event_call(call, NULL, &ftrace_event_id_fops, 1434 __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
1441 &ftrace_enable_fops, 1435 &ftrace_enable_fops,
1442 &ftrace_event_filter_fops, 1436 &ftrace_event_filter_fops,
1443 &ftrace_event_format_fops); 1437 &ftrace_event_format_fops);
@@ -1663,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata =
1663 1657
1664static __init void event_trace_self_test_with_function(void) 1658static __init void event_trace_self_test_with_function(void)
1665{ 1659{
1666 register_ftrace_function(&trace_ops); 1660 int ret;
1661 ret = register_ftrace_function(&trace_ops);
1662 if (WARN_ON(ret < 0)) {
1663 pr_info("Failed to enable function tracer for event tests\n");
1664 return;
1665 }
1667 pr_info("Running tests again, along with the function tracer\n"); 1666 pr_info("Running tests again, along with the function tracer\n");
1668 event_trace_self_tests(); 1667 event_trace_self_tests();
1669 unregister_ftrace_function(&trace_ops); 1668 unregister_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17f..8008ddcfbf20 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
123 } operand; 123 } operand;
124}; 124};
125 125
126struct pred_stack {
127 struct filter_pred **preds;
128 int index;
129};
130
126#define DEFINE_COMPARISON_PRED(type) \ 131#define DEFINE_COMPARISON_PRED(type) \
127static int filter_pred_##type(struct filter_pred *pred, void *event, \ 132static int filter_pred_##type(struct filter_pred *pred, void *event) \
128 int val1, int val2) \
129{ \ 133{ \
130 type *addr = (type *)(event + pred->offset); \ 134 type *addr = (type *)(event + pred->offset); \
131 type val = (type)pred->val; \ 135 type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
152} 156}
153 157
154#define DEFINE_EQUALITY_PRED(size) \ 158#define DEFINE_EQUALITY_PRED(size) \
155static int filter_pred_##size(struct filter_pred *pred, void *event, \ 159static int filter_pred_##size(struct filter_pred *pred, void *event) \
156 int val1, int val2) \
157{ \ 160{ \
158 u##size *addr = (u##size *)(event + pred->offset); \ 161 u##size *addr = (u##size *)(event + pred->offset); \
159 u##size val = (u##size)pred->val; \ 162 u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
178DEFINE_EQUALITY_PRED(16); 181DEFINE_EQUALITY_PRED(16);
179DEFINE_EQUALITY_PRED(8); 182DEFINE_EQUALITY_PRED(8);
180 183
181static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
182 void *event __attribute((unused)),
183 int val1, int val2)
184{
185 return val1 && val2;
186}
187
188static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
189 void *event __attribute((unused)),
190 int val1, int val2)
191{
192 return val1 || val2;
193}
194
195/* Filter predicate for fixed sized arrays of characters */ 184/* Filter predicate for fixed sized arrays of characters */
196static int filter_pred_string(struct filter_pred *pred, void *event, 185static int filter_pred_string(struct filter_pred *pred, void *event)
197 int val1, int val2)
198{ 186{
199 char *addr = (char *)(event + pred->offset); 187 char *addr = (char *)(event + pred->offset);
200 int cmp, match; 188 int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
207} 195}
208 196
209/* Filter predicate for char * pointers */ 197/* Filter predicate for char * pointers */
210static int filter_pred_pchar(struct filter_pred *pred, void *event, 198static int filter_pred_pchar(struct filter_pred *pred, void *event)
211 int val1, int val2)
212{ 199{
213 char **addr = (char **)(event + pred->offset); 200 char **addr = (char **)(event + pred->offset);
214 int cmp, match; 201 int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
231 * and add it to the address of the entry, and at last we have 218 * and add it to the address of the entry, and at last we have
232 * the address of the string. 219 * the address of the string.
233 */ 220 */
234static int filter_pred_strloc(struct filter_pred *pred, void *event, 221static int filter_pred_strloc(struct filter_pred *pred, void *event)
235 int val1, int val2)
236{ 222{
237 u32 str_item = *(u32 *)(event + pred->offset); 223 u32 str_item = *(u32 *)(event + pred->offset);
238 int str_loc = str_item & 0xffff; 224 int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
247 return match; 233 return match;
248} 234}
249 235
250static int filter_pred_none(struct filter_pred *pred, void *event, 236static int filter_pred_none(struct filter_pred *pred, void *event)
251 int val1, int val2)
252{ 237{
253 return 0; 238 return 0;
254} 239}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
377 pred->not ^= not; 362 pred->not ^= not;
378} 363}
379 364
365enum move_type {
366 MOVE_DOWN,
367 MOVE_UP_FROM_LEFT,
368 MOVE_UP_FROM_RIGHT
369};
370
371static struct filter_pred *
372get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
373 int index, enum move_type *move)
374{
375 if (pred->parent & FILTER_PRED_IS_RIGHT)
376 *move = MOVE_UP_FROM_RIGHT;
377 else
378 *move = MOVE_UP_FROM_LEFT;
379 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
380
381 return pred;
382}
383
384/*
385 * A series of ANDs or ORs was found together. Instead of
386 * climbing up and down the tree branches, an array of the
387 * ops was made in the order of the checks. We can just move across
388 * the array and short circuit if needed.
389 */
390static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec)
392{
393 struct filter_pred *pred;
394 int match = 0;
395 int type;
396 int i;
397
398 /*
399 * Micro-optimization: We set type to true if op
400 * is an OR and false otherwise (AND). Then we
401 * just need to test if the match is equal to
402 * the type, and if it is, we can short circuit the
403 * rest of the checks:
404 *
405 * if ((match && op->op == OP_OR) ||
406 * (!match && op->op == OP_AND))
407 * return match;
408 */
409 type = op->op == OP_OR;
410
411 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec);
414 if (!!match == type)
415 return match;
416 }
417 return match;
418}
419
380/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
381int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
382{ 422{
383 int match, top = 0, val1 = 0, val2 = 0; 423 int match = -1;
384 int stack[MAX_FILTER_PRED]; 424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds;
385 struct filter_pred *pred; 426 struct filter_pred *pred;
386 int i; 427 struct filter_pred *root;
428 int n_preds;
429 int done = 0;
430
431 /* no filter is considered a match */
432 if (!filter)
433 return 1;
434
435 n_preds = filter->n_preds;
436
437 if (!n_preds)
438 return 1;
439
440 /*
441	 * n_preds, root and filter->preds are protected by keeping preemption disabled.
442 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root);
445 if (!root)
446 return 1;
447
448 pred = root;
387 449
388 for (i = 0; i < filter->n_preds; i++) { 450 /* match is currently meaningless */
389 pred = filter->preds[i]; 451 match = -1;
390 if (!pred->pop_n) { 452
391 match = pred->fn(pred, rec, val1, val2); 453 do {
392 stack[top++] = match; 454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460				/* keep going down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
393 continue; 500 continue;
394 } 501 }
395 if (pred->pop_n > top) { 502 done = 1;
396 WARN_ON_ONCE(1); 503 } while (!done);
397 return 0;
398 }
399 val1 = stack[--top];
400 val2 = stack[--top];
401 match = pred->fn(pred, rec, val1, val2);
402 stack[top++] = match;
403 }
404 504
405 return stack[--top]; 505 return match;
406} 506}
407EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
408 508
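
filter_match_preds() now evaluates the expression as an explicit binary tree: the old postfix evaluation stack (stack[], val1/val2, pop_n) is gone, replaced by walking down left children, evaluating leaves, and climbing back up through the parent index stored in each node, short-circuiting an OR on the first true result and an AND on the first false one, exactly as the comments above describe. Here is a condensed user-space model of that walk with a simplified node layout; this is not the kernel's struct filter_pred, and the FILTER_PRED_IS_RIGHT bit is replaced by an explicit is_right field.

    #include <stdio.h>

    enum { OP_LEAF, OP_AND, OP_OR };
    enum move { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };

    #define INVALID -1

    struct node {
            int op;         /* OP_LEAF, OP_AND or OP_OR */
            int value;      /* leaf result (stand-in for pred->fn(pred, rec)) */
            int left, right, parent;
            int is_right;   /* am I my parent's right child? */
    };

    /* Iterative evaluation: no recursion, no value stack, just parent links. */
    static int eval(struct node *n, int root)
    {
            enum move move = MOVE_DOWN;
            int cur = root;
            int match = 0;

            for (;;) {
                    switch (move) {
                    case MOVE_DOWN:
                            if (n[cur].left != INVALID) {
                                    cur = n[cur].left;
                                    continue;
                            }
                            match = n[cur].value;   /* leaf */
                            break;
                    case MOVE_UP_FROM_LEFT:
                            /* Short circuit: a true OR or a false AND is decided. */
                            if (!!match == (n[cur].op == OP_OR))
                                    break;
                            cur = n[cur].right;
                            move = MOVE_DOWN;
                            continue;
                    case MOVE_UP_FROM_RIGHT:
                            break;  /* this sub-expression is finished */
                    }
                    if (cur == root)
                            return match;
                    move = n[cur].is_right ? MOVE_UP_FROM_RIGHT : MOVE_UP_FROM_LEFT;
                    cur = n[cur].parent;
            }
    }

    int main(void)
    {
            /* (0 || 1) && 1  ->  node 0: AND, node 1: OR, nodes 2-4: leaves */
            struct node n[] = {
                    { OP_AND,  0, 1, 4, INVALID, 0 },
                    { OP_OR,   0, 2, 3, 0, 0 },
                    { OP_LEAF, 0, INVALID, INVALID, 1, 0 },
                    { OP_LEAF, 1, INVALID, INVALID, 1, 1 },
                    { OP_LEAF, 1, INVALID, INVALID, 0, 1 },
            };

            printf("match = %d\n", eval(n, 0));     /* prints 1 */
            return 0;
    }

The example evaluates (0 || 1) && 1; the right leaf of the OR is only visited because the left one was false.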
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
414 514
415static void remove_filter_string(struct event_filter *filter) 515static void remove_filter_string(struct event_filter *filter)
416{ 516{
517 if (!filter)
518 return;
519
417 kfree(filter->filter_string); 520 kfree(filter->filter_string);
418 filter->filter_string = NULL; 521 filter->filter_string = NULL;
419} 522}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
473 576
474void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 577void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
475{ 578{
476 struct event_filter *filter = call->filter; 579 struct event_filter *filter;
477 580
478 mutex_lock(&event_mutex); 581 mutex_lock(&event_mutex);
582 filter = call->filter;
479 if (filter && filter->filter_string) 583 if (filter && filter->filter_string)
480 trace_seq_printf(s, "%s\n", filter->filter_string); 584 trace_seq_printf(s, "%s\n", filter->filter_string);
481 else 585 else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
486void print_subsystem_event_filter(struct event_subsystem *system, 590void print_subsystem_event_filter(struct event_subsystem *system,
487 struct trace_seq *s) 591 struct trace_seq *s)
488{ 592{
489 struct event_filter *filter = system->filter; 593 struct event_filter *filter;
490 594
491 mutex_lock(&event_mutex); 595 mutex_lock(&event_mutex);
596 filter = system->filter;
492 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
493 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
494 else 599 else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
539 pred->regex.len = 0; 644 pred->regex.len = 0;
540} 645}
541 646
542static int filter_set_pred(struct filter_pred *dest, 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
650 if (!stack->preds)
651 return -ENOMEM;
652 stack->index = n_preds;
653 return 0;
654}
655
656static void __free_pred_stack(struct pred_stack *stack)
657{
658 kfree(stack->preds);
659 stack->index = 0;
660}
661
662static int __push_pred_stack(struct pred_stack *stack,
663 struct filter_pred *pred)
664{
665 int index = stack->index;
666
667 if (WARN_ON(index == 0))
668 return -ENOSPC;
669
670 stack->preds[--index] = pred;
671 stack->index = index;
672 return 0;
673}
674
675static struct filter_pred *
676__pop_pred_stack(struct pred_stack *stack)
677{
678 struct filter_pred *pred;
679 int index = stack->index;
680
681 pred = stack->preds[index++];
682 if (!pred)
683 return NULL;
684
685 stack->index = index;
686 return pred;
687}
688
689static int filter_set_pred(struct event_filter *filter,
690 int idx,
691 struct pred_stack *stack,
543 struct filter_pred *src, 692 struct filter_pred *src,
544 filter_pred_fn_t fn) 693 filter_pred_fn_t fn)
545{ 694{
695 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left;
697 struct filter_pred *right;
698
546 *dest = *src; 699 *dest = *src;
547 if (src->field_name) { 700 if (src->field_name) {
548 dest->field_name = kstrdup(src->field_name, GFP_KERNEL); 701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
550 return -ENOMEM; 703 return -ENOMEM;
551 } 704 }
552 dest->fn = fn; 705 dest->fn = fn;
706 dest->index = idx;
553 707
554 return 0; 708 if (dest->op == OP_OR || dest->op == OP_AND) {
709 right = __pop_pred_stack(stack);
710 left = __pop_pred_stack(stack);
711 if (!left || !right)
712 return -EINVAL;
713 /*
714 * If both children can be folded
715 * and they are the same op as this op or a leaf,
716 * then this op can be folded.
717 */
718 if (left->index & FILTER_PRED_FOLD &&
719 (left->op == dest->op ||
720 left->left == FILTER_PRED_INVALID) &&
721 right->index & FILTER_PRED_FOLD &&
722 (right->op == dest->op ||
723 right->left == FILTER_PRED_INVALID))
724 dest->index |= FILTER_PRED_FOLD;
725
726 dest->left = left->index & ~FILTER_PRED_FOLD;
727 dest->right = right->index & ~FILTER_PRED_FOLD;
728 left->parent = dest->index & ~FILTER_PRED_FOLD;
729 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
730 } else {
731 /*
732 * Make dest->left invalid to be used as a quick
733 * way to know this is a leaf node.
734 */
735 dest->left = FILTER_PRED_INVALID;
736
737 /* All leafs allow folding the parent ops. */
738 dest->index |= FILTER_PRED_FOLD;
739 }
740
741 return __push_pred_stack(stack, dest);
555} 742}
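
filter_set_pred() above is the step that turns the parser's postfix list into that tree: leaves are pushed on the pred stack, and every AND/OR pops its two children, wires up left/right/parent (with FILTER_PRED_IS_RIGHT marking the right child), and is pushed back, so the single item left at the end is the root. Below is a minimal user-space sketch of the same shunting step, assuming a toy postfix string where 'L' stands for any leaf predicate; the types and names are illustrative, not the kernel's.

    #include <stdio.h>

    #define INVALID   -1
    #define MAX_NODES 16

    struct node {
            char op;        /* '&', '|' or 'L' for a leaf */
            int left, right, parent, is_right;
    };

    /* Build a tree from a postfix stream, the way filter_set_pred() does
     * with __push_pred_stack()/__pop_pred_stack(). Returns the root index. */
    static int build(const char *postfix, struct node *n)
    {
            int stack[MAX_NODES], top = 0, idx = 0;

            for (; *postfix; postfix++, idx++) {
                    n[idx].op = *postfix;
                    n[idx].left = n[idx].right = INVALID;
                    n[idx].parent = INVALID;
                    n[idx].is_right = 0;

                    if (*postfix == '&' || *postfix == '|') {
                            int right = stack[--top];
                            int left  = stack[--top];

                            n[idx].left = left;
                            n[idx].right = right;
                            n[left].parent = idx;
                            n[right].parent = idx;
                            n[right].is_right = 1;
                    }
                    stack[top++] = idx;     /* push leaf or freshly linked op */
            }
            return stack[--top];            /* one item left: the root */
    }

    int main(void)
    {
            struct node n[MAX_NODES];
            /* "a b | c &"  ==  (a || b) && c */
            int root = build("LL|L&", n);

            printf("root=%d op=%c left=%d right=%d\n",
                   root, n[root].op, n[root].left, n[root].right);
            return 0;
    }

For "LL|L&", that is (a || b) && c, it reports the '&' node as root with the '|' node on its left and the last leaf on its right.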
556 743
557static void filter_disable_preds(struct ftrace_event_call *call) 744static void __free_preds(struct event_filter *filter)
558{ 745{
559 struct event_filter *filter = call->filter;
560 int i; 746 int i;
561 747
562 call->flags &= ~TRACE_EVENT_FL_FILTERED; 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds);
752 filter->preds = NULL;
753 }
754 filter->a_preds = 0;
563 filter->n_preds = 0; 755 filter->n_preds = 0;
564
565 for (i = 0; i < MAX_FILTER_PRED; i++)
566 filter->preds[i]->fn = filter_pred_none;
567} 756}
568 757
569static void __free_preds(struct event_filter *filter) 758static void filter_disable(struct ftrace_event_call *call)
570{ 759{
571 int i; 760 call->flags &= ~TRACE_EVENT_FL_FILTERED;
761}
572 762
763static void __free_filter(struct event_filter *filter)
764{
573 if (!filter) 765 if (!filter)
574 return; 766 return;
575 767
576 for (i = 0; i < MAX_FILTER_PRED; i++) { 768 __free_preds(filter);
577 if (filter->preds[i])
578 filter_free_pred(filter->preds[i]);
579 }
580 kfree(filter->preds);
581 kfree(filter->filter_string); 769 kfree(filter->filter_string);
582 kfree(filter); 770 kfree(filter);
583} 771}
584 772
773/*
774 * Called when destroying the ftrace_event_call.
775 * The call is being freed, so we do not need to worry about
776 * the call being currently used. This is for module code removing
777 * the tracepoints from within it.
778 */
585void destroy_preds(struct ftrace_event_call *call) 779void destroy_preds(struct ftrace_event_call *call)
586{ 780{
587 __free_preds(call->filter); 781 __free_filter(call->filter);
588 call->filter = NULL; 782 call->filter = NULL;
589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
590} 783}
591 784
592static struct event_filter *__alloc_preds(void) 785static struct event_filter *__alloc_filter(void)
593{ 786{
594 struct event_filter *filter; 787 struct event_filter *filter;
788
789 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
790 return filter;
791}
792
793static int __alloc_preds(struct event_filter *filter, int n_preds)
794{
595 struct filter_pred *pred; 795 struct filter_pred *pred;
596 int i; 796 int i;
597 797
598 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 798 if (filter->preds)
599 if (!filter) 799 __free_preds(filter);
600 return ERR_PTR(-ENOMEM);
601 800
602 filter->n_preds = 0; 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
603 803
604 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
605 if (!filter->preds) 804 if (!filter->preds)
606 goto oom; 805 return -ENOMEM;
607 806
608 for (i = 0; i < MAX_FILTER_PRED; i++) { 807 filter->a_preds = n_preds;
609 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 808 filter->n_preds = 0;
610 if (!pred) 809
611 goto oom; 810 for (i = 0; i < n_preds; i++) {
811 pred = &filter->preds[i];
612 pred->fn = filter_pred_none; 812 pred->fn = filter_pred_none;
613 filter->preds[i] = pred;
614 } 813 }
615 814
616 return filter;
617
618oom:
619 __free_preds(filter);
620 return ERR_PTR(-ENOMEM);
621}
622
623static int init_preds(struct ftrace_event_call *call)
624{
625 if (call->filter)
626 return 0;
627
628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
629 call->filter = __alloc_preds();
630 if (IS_ERR(call->filter))
631 return PTR_ERR(call->filter);
632
633 return 0; 815 return 0;
634} 816}
635 817
636static int init_subsystem_preds(struct event_subsystem *system) 818static void filter_free_subsystem_preds(struct event_subsystem *system)
637{ 819{
638 struct ftrace_event_call *call; 820 struct ftrace_event_call *call;
639 int err;
640 821
641 list_for_each_entry(call, &ftrace_events, list) { 822 list_for_each_entry(call, &ftrace_events, list) {
642 if (strcmp(call->class->system, system->name) != 0) 823 if (strcmp(call->class->system, system->name) != 0)
643 continue; 824 continue;
644 825
645 err = init_preds(call); 826 filter_disable(call);
646 if (err) 827 remove_filter_string(call->filter);
647 return err;
648 } 828 }
649
650 return 0;
651} 829}
652 830
653static void filter_free_subsystem_preds(struct event_subsystem *system) 831static void filter_free_subsystem_filters(struct event_subsystem *system)
654{ 832{
655 struct ftrace_event_call *call; 833 struct ftrace_event_call *call;
656 834
657 list_for_each_entry(call, &ftrace_events, list) { 835 list_for_each_entry(call, &ftrace_events, list) {
658 if (strcmp(call->class->system, system->name) != 0) 836 if (strcmp(call->class->system, system->name) != 0)
659 continue; 837 continue;
660 838 __free_filter(call->filter);
661 filter_disable_preds(call); 839 call->filter = NULL;
662 remove_filter_string(call->filter);
663 } 840 }
664} 841}
665 842
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
667 struct ftrace_event_call *call, 844 struct ftrace_event_call *call,
668 struct event_filter *filter, 845 struct event_filter *filter,
669 struct filter_pred *pred, 846 struct filter_pred *pred,
847 struct pred_stack *stack,
670 filter_pred_fn_t fn) 848 filter_pred_fn_t fn)
671{ 849{
672 int idx, err; 850 int idx, err;
673 851
674 if (filter->n_preds == MAX_FILTER_PRED) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
675 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
676 return -ENOSPC; 854 return -ENOSPC;
677 } 855 }
678 856
679 idx = filter->n_preds; 857 idx = filter->n_preds;
680 filter_clear_pred(filter->preds[idx]); 858 filter_clear_pred(&filter->preds[idx]);
681 err = filter_set_pred(filter->preds[idx], pred, fn); 859 err = filter_set_pred(filter, idx, stack, pred, fn);
682 if (err) 860 if (err)
683 return err; 861 return err;
684 862
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
763 struct ftrace_event_call *call, 941 struct ftrace_event_call *call,
764 struct event_filter *filter, 942 struct event_filter *filter,
765 struct filter_pred *pred, 943 struct filter_pred *pred,
944 struct pred_stack *stack,
766 bool dry_run) 945 bool dry_run)
767{ 946{
768 struct ftrace_event_field *field; 947 struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
770 unsigned long long val; 949 unsigned long long val;
771 int ret; 950 int ret;
772 951
773 pred->fn = filter_pred_none; 952 fn = pred->fn = filter_pred_none;
774 953
775 if (pred->op == OP_AND) { 954 if (pred->op == OP_AND)
776 pred->pop_n = 2;
777 fn = filter_pred_and;
778 goto add_pred_fn; 955 goto add_pred_fn;
779 } else if (pred->op == OP_OR) { 956 else if (pred->op == OP_OR)
780 pred->pop_n = 2;
781 fn = filter_pred_or;
782 goto add_pred_fn; 957 goto add_pred_fn;
783 }
784 958
785 field = find_event_field(call, pred->field_name); 959 field = find_event_field(call, pred->field_name);
786 if (!field) { 960 if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
829 1003
830add_pred_fn: 1004add_pred_fn:
831 if (!dry_run) 1005 if (!dry_run)
832 return filter_add_pred_fn(ps, call, filter, pred, fn); 1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
833 return 0; 1007 return 0;
834} 1008}
835 1009
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
1187 return 0; 1361 return 0;
1188} 1362}
1189 1363
1364static int count_preds(struct filter_parse_state *ps)
1365{
1366 struct postfix_elt *elt;
1367 int n_preds = 0;
1368
1369 list_for_each_entry(elt, &ps->postfix, list) {
1370 if (elt->op == OP_NONE)
1371 continue;
1372 n_preds++;
1373 }
1374
1375 return n_preds;
1376}
1377
1378/*
1379 * The tree is walked when filtering an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does
1381 * indeed terminate.
1382 */
1383static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root)
1385{
1386 struct filter_pred *preds;
1387 struct filter_pred *pred;
1388 enum move_type move = MOVE_DOWN;
1389 int count = 0;
1390 int done = 0;
1391 int max;
1392
1393 /*
1394 * The max that we can hit a node is three times.
1395 * Once going down, once coming up from left, and
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400
1401 preds = filter->preds;
1402 if (!preds)
1403 return -EINVAL;
1404 pred = root;
1405
1406 do {
1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1409
1410 switch (move) {
1411 case MOVE_DOWN:
1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435
1436 /* We are fine. */
1437 return 0;
1438}
1439
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{
1442 struct filter_pred *pred;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446
1447 pred = root;
1448
1449 do {
1450 switch (move) {
1451 case MOVE_DOWN:
1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476
1477 return count;
1478}
1479
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{
1482 struct filter_pred *pred;
1483 enum move_type move = MOVE_DOWN;
1484 int count = 0;
1485 int children;
1486 int done = 0;
1487
1488 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD;
1490
1491 /* If the root is a leaf then do nothing */
1492 if (root->left == FILTER_PRED_INVALID)
1493 return 0;
1494
1495 /* count the children */
1496 children = count_leafs(preds, &preds[root->left]);
1497 children += count_leafs(preds, &preds[root->right]);
1498
1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1500 if (!root->ops)
1501 return -ENOMEM;
1502
1503 root->val = children;
1504
1505 pred = root;
1506 do {
1507 switch (move) {
1508 case MOVE_DOWN:
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533
1534 return 0;
1535}
1536
1537/*
1538 * To optimize the processing of the ops, if we have several "ors" or
1539 * "ands" together, we can put them in an array and process them all
1540 * together, speeding up the filter logic.
1541 */
1542static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root)
1544{
1545 struct filter_pred *preds;
1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590}
1591
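
The folding pass above is easier to see on a concrete filter; the filter strings below are illustrative, not taken from this commit. For pid == 1 || pid == 2 || pid == 3 the postfix build produces a left-leaning chain of ORs. Every node ends up with FILTER_PRED_FOLD set, because each child is either a leaf or an OR like its parent, so fold_pred() collects all three leaves into the top OR's ops[] array (val = 3) and filter_match_preds() handles the whole expression with one process_ops() scan that stops at the first true result:

                OR   (root, ops[] = { pid==1, pid==2, pid==3 })
               /  \
             OR    pid == 3
            /  \
       pid == 1  pid == 2

For (pid == 1 || pid == 2) && prio > 5, the OR still folds its two leaves, but the AND root does not, since its left child is a different op and not a leaf; the match walk then treats the folded OR as a single leaf and falls back to the ordinary tree walk for the AND.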
1190static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
1191 struct event_filter *filter, 1593 struct event_filter *filter,
1192 struct filter_parse_state *ps, 1594 struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
1195{ 1597{
1196 char *operand1 = NULL, *operand2 = NULL; 1598 char *operand1 = NULL, *operand2 = NULL;
1197 struct filter_pred *pred; 1599 struct filter_pred *pred;
1600 struct filter_pred *root;
1198 struct postfix_elt *elt; 1601 struct postfix_elt *elt;
1602 struct pred_stack stack = { }; /* init to NULL */
1199 int err; 1603 int err;
1200 int n_preds = 0; 1604 int n_preds = 0;
1201 1605
1606 n_preds = count_preds(ps);
1607 if (n_preds >= MAX_FILTER_PRED) {
1608 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1609 return -ENOSPC;
1610 }
1611
1202 err = check_preds(ps); 1612 err = check_preds(ps);
1203 if (err) 1613 if (err)
1204 return err; 1614 return err;
1205 1615
1616 if (!dry_run) {
1617 err = __alloc_pred_stack(&stack, n_preds);
1618 if (err)
1619 return err;
1620 err = __alloc_preds(filter, n_preds);
1621 if (err)
1622 goto fail;
1623 }
1624
1625 n_preds = 0;
1206 list_for_each_entry(elt, &ps->postfix, list) { 1626 list_for_each_entry(elt, &ps->postfix, list) {
1207 if (elt->op == OP_NONE) { 1627 if (elt->op == OP_NONE) {
1208 if (!operand1) 1628 if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
1211 operand2 = elt->operand; 1631 operand2 = elt->operand;
1212 else { 1632 else {
1213 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); 1633 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1214 return -EINVAL; 1634 err = -EINVAL;
1635 goto fail;
1215 } 1636 }
1216 continue; 1637 continue;
1217 } 1638 }
1218 1639
1219 if (n_preds++ == MAX_FILTER_PRED) { 1640 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1220 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1641 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1221 return -ENOSPC; 1642 err = -ENOSPC;
1643 goto fail;
1222 } 1644 }
1223 1645
1224 if (elt->op == OP_AND || elt->op == OP_OR) { 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
1228 1650
1229 if (!operand1 || !operand2) { 1651 if (!operand1 || !operand2) {
1230 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1231 return -EINVAL; 1653 err = -EINVAL;
1654 goto fail;
1232 } 1655 }
1233 1656
1234 pred = create_pred(elt->op, operand1, operand2); 1657 pred = create_pred(elt->op, operand1, operand2);
1235add_pred: 1658add_pred:
1236 if (!pred) 1659 if (!pred) {
1237 return -ENOMEM; 1660 err = -ENOMEM;
1238 err = filter_add_pred(ps, call, filter, pred, dry_run); 1661 goto fail;
1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1239 filter_free_pred(pred); 1664 filter_free_pred(pred);
1240 if (err) 1665 if (err)
1241 return err; 1666 goto fail;
1242 1667
1243 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1244 } 1669 }
1245 1670
1246 return 0; 1671 if (!dry_run) {
1672 /* We should have one item left on the stack */
1673 pred = __pop_pred_stack(&stack);
1674 if (!pred)
1675 return -EINVAL;
1676 /* This item is where we start from in matching */
1677 root = pred;
1678 /* Make sure the stack is empty */
1679 pred = __pop_pred_stack(&stack);
1680 if (WARN_ON(pred)) {
1681 err = -EINVAL;
1682 filter->root = NULL;
1683 goto fail;
1684 }
1685 err = check_pred_tree(filter, root);
1686 if (err)
1687 goto fail;
1688
1689 /* Optimize the tree */
1690 err = fold_pred_tree(filter, root);
1691 if (err)
1692 goto fail;
1693
1694 /* We don't set root until we know it works */
1695 barrier();
1696 filter->root = root;
1697 }
1698
1699 err = 0;
1700fail:
1701 __free_pred_stack(&stack);
1702 return err;
1247} 1703}
1248 1704
1705struct filter_list {
1706 struct list_head list;
1707 struct event_filter *filter;
1708};
1709
1249static int replace_system_preds(struct event_subsystem *system, 1710static int replace_system_preds(struct event_subsystem *system,
1250 struct filter_parse_state *ps, 1711 struct filter_parse_state *ps,
1251 char *filter_string) 1712 char *filter_string)
1252{ 1713{
1253 struct ftrace_event_call *call; 1714 struct ftrace_event_call *call;
1715 struct filter_list *filter_item;
1716 struct filter_list *tmp;
1717 LIST_HEAD(filter_list);
1254 bool fail = true; 1718 bool fail = true;
1255 int err; 1719 int err;
1256 1720
1257 list_for_each_entry(call, &ftrace_events, list) { 1721 list_for_each_entry(call, &ftrace_events, list) {
1258 struct event_filter *filter = call->filter;
1259 1722
1260 if (strcmp(call->class->system, system->name) != 0) 1723 if (strcmp(call->class->system, system->name) != 0)
1261 continue; 1724 continue;
1262 1725
1263 /* try to see if the filter can be applied */ 1726 /*
1264 err = replace_preds(call, filter, ps, filter_string, true); 1727 * Try to see if the filter can be applied
1728 * (filter arg is ignored on dry_run)
1729 */
1730 err = replace_preds(call, NULL, ps, filter_string, true);
1265 if (err) 1731 if (err)
1732 goto fail;
1733 }
1734
1735 list_for_each_entry(call, &ftrace_events, list) {
1736 struct event_filter *filter;
1737
1738 if (strcmp(call->class->system, system->name) != 0)
1266 continue; 1739 continue;
1267 1740
1268 /* really apply the filter */ 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1269 filter_disable_preds(call); 1742 if (!filter_item)
1270 err = replace_preds(call, filter, ps, filter_string, false); 1743 goto fail_mem;
1744
1745 list_add_tail(&filter_item->list, &filter_list);
1746
1747 filter_item->filter = __alloc_filter();
1748 if (!filter_item->filter)
1749 goto fail_mem;
1750 filter = filter_item->filter;
1751
1752 /* Can only fail on no memory */
1753 err = replace_filter_string(filter, filter_string);
1271 if (err) 1754 if (err)
1272 filter_disable_preds(call); 1755 goto fail_mem;
1273 else { 1756
1757 err = replace_preds(call, filter, ps, filter_string, false);
1758 if (err) {
1759 filter_disable(call);
1760 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1761 append_filter_err(ps, filter);
1762 } else
1274 call->flags |= TRACE_EVENT_FL_FILTERED; 1763 call->flags |= TRACE_EVENT_FL_FILTERED;
1275 replace_filter_string(filter, filter_string); 1764 /*
1276 } 1765 * Regardless of if this returned an error, we still
1766 * replace the filter for the call.
1767 */
1768 filter = call->filter;
1769 call->filter = filter_item->filter;
1770 filter_item->filter = filter;
1771
1277 fail = false; 1772 fail = false;
1278 } 1773 }
1279 1774
1280 if (fail) { 1775 if (fail)
1281 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 goto fail;
1282 return -EINVAL; 1777
1778 /*
1779 * The calls can still be using the old filters.
1780 * Do a synchronize_sched() to ensure all calls are
1781 * done with them before we free them.
1782 */
1783 synchronize_sched();
1784 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1785 __free_filter(filter_item->filter);
1786 list_del(&filter_item->list);
1787 kfree(filter_item);
1283 } 1788 }
1284 return 0; 1789 return 0;
1790 fail:
1791 /* No call succeeded */
1792 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1793 list_del(&filter_item->list);
1794 kfree(filter_item);
1795 }
1796 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1797 return -EINVAL;
1798 fail_mem:
1799 /* If any call succeeded, we still need to sync */
1800 if (!fail)
1801 synchronize_sched();
1802 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1803 __free_filter(filter_item->filter);
1804 list_del(&filter_item->list);
1805 kfree(filter_item);
1806 }
1807 return -ENOMEM;
1285} 1808}
1286 1809
1287int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1288{ 1811{
1289 int err;
1290 struct filter_parse_state *ps; 1812 struct filter_parse_state *ps;
1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1815 int err = 0;
1291 1816
1292 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
1293 1818
1294 err = init_preds(call);
1295 if (err)
1296 goto out_unlock;
1297
1298 if (!strcmp(strstrip(filter_string), "0")) { 1819 if (!strcmp(strstrip(filter_string), "0")) {
1299 filter_disable_preds(call); 1820 filter_disable(call);
1300 remove_filter_string(call->filter); 1821 filter = call->filter;
1822 if (!filter)
1823 goto out_unlock;
1824 call->filter = NULL;
1825 /* Make sure the filter is not being used */
1826 synchronize_sched();
1827 __free_filter(filter);
1301 goto out_unlock; 1828 goto out_unlock;
1302 } 1829 }
1303 1830
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 if (!ps) 1833 if (!ps)
1307 goto out_unlock; 1834 goto out_unlock;
1308 1835
1309 filter_disable_preds(call); 1836 filter = __alloc_filter();
1310 replace_filter_string(call->filter, filter_string); 1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1311 1843
1312 parse_init(ps, filter_ops, filter_string); 1844 parse_init(ps, filter_ops, filter_string);
1313 err = filter_parse(ps); 1845 err = filter_parse(ps);
1314 if (err) { 1846 if (err) {
1315 append_filter_err(ps, call->filter); 1847 append_filter_err(ps, filter);
1316 goto out; 1848 goto out;
1317 } 1849 }
1318 1850
1319 err = replace_preds(call, call->filter, ps, filter_string, false); 1851 err = replace_preds(call, filter, ps, filter_string, false);
1320 if (err) 1852 if (err) {
1321 append_filter_err(ps, call->filter); 1853 filter_disable(call);
1322 else 1854 append_filter_err(ps, filter);
1855 } else
1323 call->flags |= TRACE_EVENT_FL_FILTERED; 1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1324out: 1857out:
1858 /*
1859 * Always swap the call filter with the new filter
1860 * even if there was an error. If there was an error
1861 * in the filter, we disable the filter and show the error
1862 * string
1863 */
1864 tmp = call->filter;
1865 call->filter = filter;
1866 if (tmp) {
1867 /* Make sure the call is done with the filter */
1868 synchronize_sched();
1869 __free_filter(tmp);
1870 }
1325 filter_opstack_clear(ps); 1871 filter_opstack_clear(ps);
1326 postfix_clear(ps); 1872 postfix_clear(ps);
1327 kfree(ps); 1873 kfree(ps);
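
apply_event_filter() above always builds a complete new event_filter, swaps it into call->filter, and only frees the previous one after synchronize_sched(), because filter_match_preds() reads the filter under nothing stronger than disabled preemption (see the rcu_dereference_sched() comment earlier in this file). A user-space sketch of that replace-then-reclaim pattern follows; the types are stand-ins and wait_for_readers() stands in for synchronize_sched().

    #include <stdlib.h>
    #include <stdio.h>

    /* Simplified stand-ins: in the kernel the reader is filter_match_preds()
     * running with preemption disabled, and wait_for_readers() is
     * synchronize_sched(). */
    struct event_filter { char *filter_string; };
    struct ftrace_event_call { struct event_filter *filter; };

    static void wait_for_readers(void)
    {
            /* stand-in for synchronize_sched(): all preempt-disabled
             * sections that could still see the old pointer have ended */
    }

    static void filter_free(struct event_filter *filter)
    {
            if (!filter)
                    return;
            free(filter->filter_string);
            free(filter);
    }

    /* Publish @new on @call and reclaim whatever was there before. */
    static void install_filter(struct ftrace_event_call *call,
                               struct event_filter *new)
    {
            struct event_filter *old = call->filter;

            call->filter = new;             /* readers now pick up the new filter */
            if (old) {
                    wait_for_readers();     /* let in-flight matches finish */
                    filter_free(old);       /* only then free the old one */
            }
    }

    int main(void)
    {
            struct ftrace_event_call call = { .filter = NULL };
            struct event_filter *f = calloc(1, sizeof(*f));

            install_filter(&call, f);       /* first install: nothing to free */
            install_filter(&call, NULL);    /* "0" case: drop the filter */
            printf("filter=%p\n", (void *)call.filter);
            return 0;
    }

The same ordering shows up in replace_system_preds() above, where the old per-call filters are collected on a list and freed in one batch after a single grace period.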
@@ -1334,18 +1880,21 @@ out_unlock:
1334int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1335 char *filter_string) 1881 char *filter_string)
1336{ 1882{
1337 int err;
1338 struct filter_parse_state *ps; 1883 struct filter_parse_state *ps;
1884 struct event_filter *filter;
1885 int err = 0;
1339 1886
1340 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1341 1888
1342 err = init_subsystem_preds(system);
1343 if (err)
1344 goto out_unlock;
1345
1346 if (!strcmp(strstrip(filter_string), "0")) { 1889 if (!strcmp(strstrip(filter_string), "0")) {
1347 filter_free_subsystem_preds(system); 1890 filter_free_subsystem_preds(system);
1348 remove_filter_string(system->filter); 1891 remove_filter_string(system->filter);
1892 filter = system->filter;
1893 system->filter = NULL;
1894 /* Ensure all filters are no longer used */
1895 synchronize_sched();
1896 filter_free_subsystem_filters(system);
1897 __free_filter(filter);
1349 goto out_unlock; 1898 goto out_unlock;
1350 } 1899 }
1351 1900
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 if (!ps) 1903 if (!ps)
1355 goto out_unlock; 1904 goto out_unlock;
1356 1905
1357 replace_filter_string(system->filter, filter_string); 1906 filter = __alloc_filter();
1907 if (!filter)
1908 goto out;
1909
1910 replace_filter_string(filter, filter_string);
1911 /*
1912	 * No event actually uses the system filter,
1913	 * so we can free it without synchronize_sched().
1914 */
1915 __free_filter(system->filter);
1916 system->filter = filter;
1358 1917
1359 parse_init(ps, filter_ops, filter_string); 1918 parse_init(ps, filter_ops, filter_string);
1360 err = filter_parse(ps); 1919 err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
1384 struct event_filter *filter = event->filter; 1943 struct event_filter *filter = event->filter;
1385 1944
1386 event->filter = NULL; 1945 event->filter = NULL;
1387 __free_preds(filter); 1946 __free_filter(filter);
1388} 1947}
1389 1948
1390int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1949int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1410 if (event->filter) 1969 if (event->filter)
1411 goto out_unlock; 1970 goto out_unlock;
1412 1971
1413 filter = __alloc_preds(); 1972 filter = __alloc_filter();
1414 if (IS_ERR(filter)) { 1973 if (!filter) {
1415 err = PTR_ERR(filter); 1974 err = PTR_ERR(filter);
1416 goto out_unlock; 1975 goto out_unlock;
1417 } 1976 }
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1419 err = -ENOMEM; 1978 err = -ENOMEM;
1420 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1979 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1421 if (!ps) 1980 if (!ps)
1422 goto free_preds; 1981 goto free_filter;
1423 1982
1424 parse_init(ps, filter_ops, filter_str); 1983 parse_init(ps, filter_ops, filter_str);
1425 err = filter_parse(ps); 1984 err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
1435 postfix_clear(ps); 1994 postfix_clear(ps);
1436 kfree(ps); 1995 kfree(ps);
1437 1996
1438free_preds: 1997free_filter:
1439 if (err) 1998 if (err)
1440 __free_preds(filter); 1999 __free_filter(filter);
1441 2000
1442out_unlock: 2001out_unlock:
1443 mutex_unlock(&event_mutex); 2002 mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4ba44deaac25..bbeec31e0ae3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \
83 83
84#undef __array 84#undef __array
85#define __array(type, item, len) \ 85#define __array(type, item, len) \
86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 do { \
87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
88 mutex_lock(&event_storage_mutex); \
89 snprintf(event_storage, sizeof(event_storage), \
90 "%s[%d]", #type, len); \
91 ret = trace_define_field(event_call, event_storage, #item, \
88 offsetof(typeof(field), item), \ 92 offsetof(typeof(field), item), \
89 sizeof(field.item), \ 93 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \ 94 is_signed_type(type), FILTER_OTHER); \
91 if (ret) \ 95 mutex_unlock(&event_storage_mutex); \
92 return ret; 96 if (ret) \
97 return ret; \
98 } while (0);
93 99
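
The reworked __array() macro formats the "type[len]" string into the shared static event_storage buffer, so it takes event_storage_mutex around both the snprintf() and the trace_define_field() call that consumes the buffer. Here is a tiny pthread-based sketch of that shared-scratch-buffer idea; the names are illustrative, not the kernel's.

    #include <pthread.h>
    #include <stdio.h>

    /* One shared scratch buffer for building "type[len]" strings, as the
     * __array() macro now does with event_storage/event_storage_mutex. */
    static char type_storage[128];
    static pthread_mutex_t type_storage_mutex = PTHREAD_MUTEX_INITIALIZER;

    static int define_array_field(const char *type, const char *name, int len)
    {
            int ret;

            pthread_mutex_lock(&type_storage_mutex);
            snprintf(type_storage, sizeof(type_storage), "%s[%d]", type, len);
            /* stand-in for trace_define_field(call, type_storage, name, ...) */
            ret = printf("field %s: %s\n", name, type_storage);
            pthread_mutex_unlock(&type_storage_mutex);

            return ret < 0 ? ret : 0;
    }

    int main(void)
    {
            return define_array_field("char", "comm", 16);
    }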
94#undef __array_desc 100#undef __array_desc
95#define __array_desc(type, container, item, len) \ 101#define __array_desc(type, container, item, len) \
@@ -155,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \
155 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
156}; \ 162}; \
157 \ 163 \
158struct ftrace_event_call __used \ 164struct ftrace_event_call __used event_##call = { \
159__attribute__((__aligned__(4))) \
160__attribute__((section("_ftrace_events"))) event_##call = { \
161 .name = #call, \ 165 .name = #call, \
162 .event.type = etype, \ 166 .event.type = etype, \
163 .class = &event_class_ftrace_##call, \ 167 .class = &event_class_ftrace_##call, \
164 .print_fmt = print, \ 168 .print_fmt = print, \
165}; \ 169}; \
170struct ftrace_event_call __used \
171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
166 172
167#include "trace_entries.h" 173#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 16aee4d44e8f..8d0e1cc4e974 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
149static struct ftrace_ops trace_ops __read_mostly = 149static struct ftrace_ops trace_ops __read_mostly =
150{ 150{
151 .func = function_trace_call, 151 .func = function_trace_call,
152 .flags = FTRACE_OPS_FL_GLOBAL,
152}; 153};
153 154
154static struct ftrace_ops trace_stack_ops __read_mostly = 155static struct ftrace_ops trace_stack_ops __read_mostly =
155{ 156{
156 .func = function_stack_trace_call, 157 .func = function_stack_trace_call,
158 .flags = FTRACE_OPS_FL_GLOBAL,
157}; 159};
158 160
159/* Our two options */ 161/* Our two options */
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6f233698518e..962cdb24ed81 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs;
20
18struct fgraph_cpu_data { 21struct fgraph_cpu_data {
19 pid_t last_pid; 22 pid_t last_pid;
20 int depth; 23 int depth;
24 int depth_irq;
21 int ignore; 25 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; 26 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
23}; 27};
24 28
25struct fgraph_data { 29struct fgraph_data {
26 struct fgraph_cpu_data *cpu_data; 30 struct fgraph_cpu_data __percpu *cpu_data;
27 31
28 /* Place to preserve last processed entry. */ 32 /* Place to preserve last processed entry. */
29 struct ftrace_graph_ent_entry ent; 33 struct ftrace_graph_ent_entry ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
41#define TRACE_GRAPH_PRINT_PROC 0x8 45#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 46#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
44 49
45static struct tracer_opt trace_opts[] = { 50static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 51 /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
55 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, 60 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
56 /* Display absolute time of an entry */ 61 /* Display absolute time of an entry */
57 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 62 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
63 /* Display interrupts */
64 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 { } /* Empty entry */ 65 { } /* Empty entry */
59}; 66};
60 67
61static struct tracer_flags tracer_flags = { 68static struct tracer_flags tracer_flags = {
62 /* Don't display overruns and proc by default */ 69 /* Don't display overruns and proc by default */
63 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 70 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
64 TRACE_GRAPH_PRINT_DURATION, 71 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
65 .opts = trace_opts 72 .opts = trace_opts
66}; 73};
67 74
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
204 return 1; 211 return 1;
205} 212}
206 213
214static inline int ftrace_graph_ignore_irqs(void)
215{
216 if (!ftrace_graph_skip_irqs)
217 return 0;
218
219 return in_irq();
220}
221
207int trace_graph_entry(struct ftrace_graph_ent *trace) 222int trace_graph_entry(struct ftrace_graph_ent *trace)
208{ 223{
209 struct trace_array *tr = graph_array; 224 struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
218 return 0; 233 return 0;
219 234
220 /* trace it when it is-nested-in or is a function enabled. */ 235 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func))) 236 if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
237 ftrace_graph_ignore_irqs())
222 return 0; 238 return 0;
223 239
224 local_irq_save(flags); 240 local_irq_save(flags);
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 262 return trace_graph_entry(trace);
247} 263}
248 264
265static void
266__trace_graph_function(struct trace_array *tr,
267 unsigned long ip, unsigned long flags, int pc)
268{
269 u64 time = trace_clock_local();
270 struct ftrace_graph_ent ent = {
271 .func = ip,
272 .depth = 0,
273 };
274 struct ftrace_graph_ret ret = {
275 .func = ip,
276 .depth = 0,
277 .calltime = time,
278 .rettime = time,
279 };
280
281 __trace_graph_entry(tr, &ent, flags, pc);
282 __trace_graph_return(tr, &ret, flags, pc);
283}
284
285void
286trace_graph_function(struct trace_array *tr,
287 unsigned long ip, unsigned long parent_ip,
288 unsigned long flags, int pc)
289{
290 __trace_graph_function(tr, ip, flags, pc);
291}
292
249void __trace_graph_return(struct trace_array *tr, 293void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 294 struct ftrace_graph_ret *trace,
251 unsigned long flags, 295 unsigned long flags,
@@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
649 693
650 /* Print nsecs (we don't want to exceed 7 numbers) */ 694 /* Print nsecs (we don't want to exceed 7 numbers) */
651 if (len < 7) { 695 if (len < 7) {
652 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", 696 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
653 nsecs_rem); 697
698 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
654 ret = trace_seq_printf(s, ".%s", nsecs_str); 699 ret = trace_seq_printf(s, ".%s", nsecs_str);
655 if (!ret) 700 if (!ret)
656 return TRACE_TYPE_PARTIAL_LINE; 701 return TRACE_TYPE_PARTIAL_LINE;
@@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
855 return 0; 900 return 0;
856} 901}
857 902
903/*
904 * Entry check for irq code
905 *
906 * returns 1 if
907 * - we are inside irq code
908 * - we just entered irq code
909 *
910 * returns 0 if
911 * - funcgraph-interrupts option is set
912 * - we are not inside irq code
913 */
914static int
915check_irq_entry(struct trace_iterator *iter, u32 flags,
916 unsigned long addr, int depth)
917{
918 int cpu = iter->cpu;
919 int *depth_irq;
920 struct fgraph_data *data = iter->private;
921
922 /*
923 * If we are either displaying irqs, or we got called as
924 * a graph event and private data does not exist,
925 * then we bypass the irq check.
926 */
927 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
928 (!data))
929 return 0;
930
931 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
932
933 /*
934 * We are inside the irq code
935 */
936 if (*depth_irq >= 0)
937 return 1;
938
939 if ((addr < (unsigned long)__irqentry_text_start) ||
940 (addr >= (unsigned long)__irqentry_text_end))
941 return 0;
942
943 /*
944 * We are entering irq code.
945 */
946 *depth_irq = depth;
947 return 1;
948}
949
950/*
951 * Return check for irq code
952 *
953 * returns 1 if
954 * - we are inside irq code
955 * - we just left irq code
956 *
957 * returns 0 if
958 * - funcgraph-interrupts option is set
959 * - we are not inside irq code
960 */
961static int
962check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
963{
964 int cpu = iter->cpu;
965 int *depth_irq;
966 struct fgraph_data *data = iter->private;
967
968 /*
969 * If we are either displaying irqs, or we got called as
970 * a graph event and private data does not exist,
971 * then we bypass the irq check.
972 */
973 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
974 (!data))
975 return 0;
976
977 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
978
979 /*
980 * We are not inside the irq code.
981 */
982 if (*depth_irq == -1)
983 return 0;
984
985 /*
986 * We are inside the irq code, and this is returning entry.
987 * Let's not trace it and clear the entry depth, since
988 * we are out of irq code.
989 *
990 * This condition ensures that we 'leave the irq code' once
991 * we are out of the entry depth. Thus protecting us from
992 * the RETURN entry loss.
993 */
994 if (*depth_irq >= depth) {
995 *depth_irq = -1;
996 return 1;
997 }
998
999 /*
1000 * We are inside the irq code, and this is not the entry.
1001 */
1002 return 1;
1003}
1004
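
The funcgraph-irqs option added in this file works at two levels: at record time trace_graph_entry() drops events when ftrace_graph_skip_irqs is set and in_irq() is true, and at output time check_irq_entry()/check_irq_return() hide everything between an entry into the irq text section and the matching return, using the per-cpu depth_irq field initialised to -1 in graph_trace_open(). Below is a user-space model of that depth bookkeeping, with a single global instead of per-cpu data and illustrative names.

    #include <stdio.h>

    /* User-space model of the per-cpu depth_irq bookkeeping used by
     * check_irq_entry()/check_irq_return(): remember the depth at which we
     * entered irq code, hide everything until we return back past it. */
    static int depth_irq = -1;      /* -1: not inside irq code */

    static int hide_entry(int is_irq_func, int depth)
    {
            if (depth_irq >= 0)     /* already inside irq code */
                    return 1;
            if (!is_irq_func)
                    return 0;
            depth_irq = depth;      /* just entered irq code */
            return 1;
    }

    static int hide_return(int depth)
    {
            if (depth_irq == -1)    /* not inside irq code */
                    return 0;
            if (depth_irq >= depth) /* returning past the entry depth */
                    depth_irq = -1;
            return 1;               /* still (or just left) irq code: hide */
    }

    int main(void)
    {
            /* an irq entry at depth 2 calls a handler at depth 3, then returns */
            printf("%d", hide_entry(1, 2));   /* entering irq code: hidden      */
            printf(" %d", hide_entry(0, 3));  /* nested handler: hidden         */
            printf(" %d", hide_return(3));    /* handler return: hidden         */
            printf(" %d\n", hide_return(2));  /* final return: hidden, reset    */
            return 0;
    }

hide_return() resets the state only when the return depth reaches the recorded entry depth, which is what the "RETURN entry loss" comment above is guarding against.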
858static enum print_line_t 1005static enum print_line_t
859print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 1006print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
860 struct trace_iterator *iter, u32 flags) 1007 struct trace_iterator *iter, u32 flags)
@@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
865 static enum print_line_t ret; 1012 static enum print_line_t ret;
866 int cpu = iter->cpu; 1013 int cpu = iter->cpu;
867 1014
1015 if (check_irq_entry(iter, flags, call->func, call->depth))
1016 return TRACE_TYPE_HANDLED;
1017
868 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1018 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
869 return TRACE_TYPE_PARTIAL_LINE; 1019 return TRACE_TYPE_PARTIAL_LINE;
870 1020
@@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
902 int ret; 1052 int ret;
903 int i; 1053 int i;
904 1054
1055 if (check_irq_return(iter, flags, trace->depth))
1056 return TRACE_TYPE_HANDLED;
1057
905 if (data) { 1058 if (data) {
906 struct fgraph_cpu_data *cpu_data; 1059 struct fgraph_cpu_data *cpu_data;
907 int cpu = iter->cpu; 1060 int cpu = iter->cpu;
@@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1054 1207
1055 1208
1056enum print_line_t 1209enum print_line_t
1057print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1058{ 1211{
1059 struct ftrace_graph_ent_entry *field; 1212 struct ftrace_graph_ent_entry *field;
1060 struct fgraph_data *data = iter->private; 1213 struct fgraph_data *data = iter->private;
@@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1117static enum print_line_t 1270static enum print_line_t
1118print_graph_function(struct trace_iterator *iter) 1271print_graph_function(struct trace_iterator *iter)
1119{ 1272{
1120 return print_graph_function_flags(iter, tracer_flags.val); 1273 return __print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1121} 1285}
1122 1286
1123static enum print_line_t 1287static enum print_line_t
@@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1149 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1150} 1314}
1151 1315
1152void print_graph_headers_flags(struct seq_file *s, u32 flags) 1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1153{ 1317{
1154 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1318 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1155 1319
@@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
1190 print_graph_headers_flags(s, tracer_flags.val); 1354 print_graph_headers_flags(s, tracer_flags.val);
1191} 1355}
1192 1356
1357void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{
1359 struct trace_iterator *iter = s->private;
1360
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter))
1364 return;
1365
1366 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION;
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370
1371 __print_graph_headers_flags(s, flags);
1372}
1373
1193void graph_trace_open(struct trace_iterator *iter) 1374void graph_trace_open(struct trace_iterator *iter)
1194{ 1375{
1195 /* pid and depth on the last trace processed */ 1376 /* pid and depth on the last trace processed */
@@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter)
1210 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 1391 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1211 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 1392 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1212 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); 1393 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1394 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
1395
1213 *pid = -1; 1396 *pid = -1;
1214 *depth = 0; 1397 *depth = 0;
1215 *ignore = 0; 1398 *ignore = 0;
1399 *depth_irq = -1;
1216 } 1400 }
1217 1401
1218 iter->private = data; 1402 iter->private = data;
@@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter)
1235 } 1419 }
1236} 1420}
1237 1421
1422static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
1423{
1424 if (bit == TRACE_GRAPH_PRINT_IRQS)
1425 ftrace_graph_skip_irqs = !set;
1426
1427 return 0;
1428}
1429
1238static struct trace_event_functions graph_functions = { 1430static struct trace_event_functions graph_functions = {
1239 .trace = print_graph_function_event, 1431 .trace = print_graph_function_event,
1240}; 1432};
@@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = {
1261 .print_line = print_graph_function, 1453 .print_line = print_graph_function,
1262 .print_header = print_graph_headers, 1454 .print_header = print_graph_headers,
1263 .flags = &tracer_flags, 1455 .flags = &tracer_flags,
1456 .set_flag = func_graph_set_flag,
1264#ifdef CONFIG_FTRACE_SELFTEST 1457#ifdef CONFIG_FTRACE_SELFTEST
1265 .selftest = trace_selftest_startup_function_graph, 1458 .selftest = trace_selftest_startup_function_graph,
1266#endif 1459#endif
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 73a6b0601f2e..c77424be284d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -80,21 +80,29 @@ static struct tracer_flags tracer_flags = {
80 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
81 * did a maximum and could disturb our measurement with serial console 81 * did a maximum and could disturb our measurement with serial console
82 * printouts, etc. Truly coinciding maximum latencies should be rare 82 * printouts, etc. Truly coinciding maximum latencies should be rare
83 * and what happens together happens separately as well, so this doesnt 83 * and what happens together happens separately as well, so this doesn't
84 * decrease the validity of the maximum found: 84 * decrease the validity of the maximum found:
85 */ 85 */
86static __cacheline_aligned_in_smp unsigned long max_sequence; 86static __cacheline_aligned_in_smp unsigned long max_sequence;
87 87
88#ifdef CONFIG_FUNCTION_TRACER 88#ifdef CONFIG_FUNCTION_TRACER
89/* 89/*
90 * irqsoff uses its own tracer function to keep the overhead down: 90 * Prologue for the preempt and irqs off function tracers.
91 *
92 * Returns 1 if it is OK to continue, and data->disabled is
93 * incremented.
94 * 0 if the trace is to be ignored, and data->disabled
95 * is kept the same.
96 *
97 * Note, this function is also used outside this ifdef but
98 * inside the #ifdef of the function graph tracer below.
99 * This is OK, since the function graph tracer is
100 * dependent on the function tracer.
91 */ 101 */
92static void 102static int func_prolog_dec(struct trace_array *tr,
93irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) 103 struct trace_array_cpu **data,
104 unsigned long *flags)
94{ 105{
95 struct trace_array *tr = irqsoff_trace;
96 struct trace_array_cpu *data;
97 unsigned long flags;
98 long disabled; 106 long disabled;
99 int cpu; 107 int cpu;
100 108
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
106 */ 114 */
107 cpu = raw_smp_processor_id(); 115 cpu = raw_smp_processor_id();
108 if (likely(!per_cpu(tracing_cpu, cpu))) 116 if (likely(!per_cpu(tracing_cpu, cpu)))
109 return; 117 return 0;
110 118
111 local_save_flags(flags); 119 local_save_flags(*flags);
112 /* slight chance to get a false positive on tracing_cpu */ 120 /* slight chance to get a false positive on tracing_cpu */
113 if (!irqs_disabled_flags(flags)) 121 if (!irqs_disabled_flags(*flags))
114 return; 122 return 0;
115 123
116 data = tr->data[cpu]; 124 *data = tr->data[cpu];
117 disabled = atomic_inc_return(&data->disabled); 125 disabled = atomic_inc_return(&(*data)->disabled);
118 126
119 if (likely(disabled == 1)) 127 if (likely(disabled == 1))
120 trace_function(tr, ip, parent_ip, flags, preempt_count()); 128 return 1;
129
130 atomic_dec(&(*data)->disabled);
131
132 return 0;
133}
134
135/*
136 * irqsoff uses its own tracer function to keep the overhead down:
137 */
138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
140{
141 struct trace_array *tr = irqsoff_trace;
142 struct trace_array_cpu *data;
143 unsigned long flags;
144
145 if (!func_prolog_dec(tr, &data, &flags))
146 return;
147
148 trace_function(tr, ip, parent_ip, flags, preempt_count());
121 149
122 atomic_dec(&data->disabled); 150 atomic_dec(&data->disabled);
123} 151}
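
func_prolog_dec() factors out the boilerplate shared by the irqsoff function and function-graph callbacks: check the per-cpu tracing_cpu flag, confirm that interrupts really are disabled, and take a reference on data->disabled, backing off if the callback is already nested. On success the caller records its entry and drops the reference itself. A condensed user-space sketch of that calling convention follows, using C11 atomics in place of atomic_inc_return() and stand-in types; the irqs-disabled check is folded into the tracing_cpu flag here.

    #include <stdatomic.h>
    #include <stdio.h>

    /* Stand-ins for struct trace_array_cpu and the per-cpu tracing_cpu flag. */
    struct cpu_data { atomic_int disabled; };

    static int tracing_cpu;                 /* is the latency window open? */
    static struct cpu_data this_cpu_data;

    /* Mirrors func_prolog_dec(): returns 1 and leaves @data->disabled
     * incremented when the caller may record a trace entry, 0 otherwise. */
    static int func_prolog_dec(struct cpu_data **data)
    {
            int disabled;

            if (!tracing_cpu)               /* not in an irqs-off section */
                    return 0;

            *data = &this_cpu_data;
            disabled = atomic_fetch_add(&(*data)->disabled, 1) + 1;
            if (disabled == 1)
                    return 1;               /* first user: safe to trace */

            atomic_fetch_sub(&(*data)->disabled, 1);
            return 0;                       /* recursion: back off */
    }

    static void tracer_call(const char *func)
    {
            struct cpu_data *data;

            if (!func_prolog_dec(&data))
                    return;
            printf("trace %s\n", func);             /* record the entry */
            atomic_fetch_sub(&data->disabled, 1);   /* caller drops the ref */
    }

    int main(void)
    {
            tracing_cpu = 1;
            tracer_call("schedule");
            return 0;
    }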
@@ -125,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
125static struct ftrace_ops trace_ops __read_mostly = 153static struct ftrace_ops trace_ops __read_mostly =
126{ 154{
127 .func = irqsoff_tracer_call, 155 .func = irqsoff_tracer_call,
156 .flags = FTRACE_OPS_FL_GLOBAL,
128}; 157};
129#endif /* CONFIG_FUNCTION_TRACER */ 158#endif /* CONFIG_FUNCTION_TRACER */
130 159
@@ -155,30 +184,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
155 struct trace_array *tr = irqsoff_trace; 184 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data; 185 struct trace_array_cpu *data;
157 unsigned long flags; 186 unsigned long flags;
158 long disabled;
159 int ret; 187 int ret;
160 int cpu;
161 int pc; 188 int pc;
162 189
163 cpu = raw_smp_processor_id(); 190 if (!func_prolog_dec(tr, &data, &flags))
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0; 191 return 0;
171 192
172 data = tr->data[cpu]; 193 pc = preempt_count();
173 disabled = atomic_inc_return(&data->disabled); 194 ret = __trace_graph_entry(tr, trace, flags, pc);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled); 195 atomic_dec(&data->disabled);
196
182 return ret; 197 return ret;
183} 198}
184 199
@@ -187,27 +202,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
187 struct trace_array *tr = irqsoff_trace; 202 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data; 203 struct trace_array_cpu *data;
189 unsigned long flags; 204 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc; 205 int pc;
193 206
194 cpu = raw_smp_processor_id(); 207 if (!func_prolog_dec(tr, &data, &flags))
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return; 208 return;
202 209
203 data = tr->data[cpu]; 210 pc = preempt_count();
204 disabled = atomic_inc_return(&data->disabled); 211 __trace_graph_return(tr, trace, flags, pc);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled); 212 atomic_dec(&data->disabled);
212} 213}
213 214
@@ -229,75 +230,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
229 230
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 231static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{ 232{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /* 233 /*
240 * In graph mode call the graph tracer output function, 234 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler 235 * otherwise go with the TRACE_FN event handler
242 */ 236 */
243 if (is_graph()) 237 if (is_graph())
244 return print_graph_function_flags(iter, flags); 238 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
245 239
246 return TRACE_TYPE_UNHANDLED; 240 return TRACE_TYPE_UNHANDLED;
247} 241}
248 242
249static void irqsoff_print_header(struct seq_file *s) 243static void irqsoff_print_header(struct seq_file *s)
250{ 244{
251 if (is_graph()) { 245 if (is_graph())
252 struct trace_iterator *iter = s->private; 246 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
253 u32 flags = GRAPH_TRACER_FLAGS; 247 else
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s); 248 trace_default_header(s);
268} 249}
269 250
270static void 251static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr, 252__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip, 253 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc) 254 unsigned long flags, int pc)
294{ 255{
295 if (!is_graph()) 256 if (is_graph())
257 trace_graph_function(tr, ip, parent_ip, flags, pc);
258 else
296 trace_function(tr, ip, parent_ip, flags, pc); 259 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301} 260}
302 261
303#else 262#else
@@ -495,14 +454,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
495 * Stubs: 454 * Stubs:
496 */ 455 */
497 456
498void early_boot_irqs_off(void)
499{
500}
501
502void early_boot_irqs_on(void)
503{
504}
505
506void trace_softirqs_on(unsigned long ip) 457void trace_softirqs_on(unsigned long ip)
507{ 458{
508} 459}
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 7b8ecd751d93..3c5c5dfea0b3 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -13,7 +13,6 @@
13#include <linux/kdb.h> 13#include <linux/kdb.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15 15
16#include "../debug/kdb/kdb_private.h"
17#include "trace.h" 16#include "trace.h"
18#include "trace_output.h" 17#include "trace_output.h"
19 18
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 544301d29dee..27d13b36b8be 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,6 @@
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h> 33#include <linux/limits.h>
34#include <linux/uaccess.h>
35#include <asm/bitsperlong.h> 34#include <asm/bitsperlong.h>
36 35
37#include "trace.h" 36#include "trace.h"
@@ -54,7 +53,6 @@ const char *reserved_field_names[] = {
54 "common_preempt_count", 53 "common_preempt_count",
55 "common_pid", 54 "common_pid",
56 "common_tgid", 55 "common_tgid",
57 "common_lock_depth",
58 FIELD_STRING_IP, 56 FIELD_STRING_IP,
59 FIELD_STRING_RETIP, 57 FIELD_STRING_RETIP,
60 FIELD_STRING_FUNC, 58 FIELD_STRING_FUNC,
@@ -354,6 +352,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
354 kfree(data); 352 kfree(data);
355} 353}
356 354
355/* Bitfield fetch function */
356struct bitfield_fetch_param {
357 struct fetch_param orig;
358 unsigned char hi_shift;
359 unsigned char low_shift;
360};
361
362#define DEFINE_FETCH_bitfield(type) \
363static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
364 void *data, void *dest) \
365{ \
366 struct bitfield_fetch_param *bprm = data; \
367 type buf = 0; \
368 call_fetch(&bprm->orig, regs, &buf); \
369 if (buf) { \
370 buf <<= bprm->hi_shift; \
371 buf >>= bprm->low_shift; \
372 } \
373 *(type *)dest = buf; \
374}
375DEFINE_BASIC_FETCH_FUNCS(bitfield)
376#define fetch_bitfield_string NULL
377#define fetch_bitfield_string_size NULL
378
379static __kprobes void
380free_bitfield_fetch_param(struct bitfield_fetch_param *data)
381{
382 /*
383 * Don't check the bitfield itself, because this must be the
384 * last fetch function.
385 */
386 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
387 free_deref_fetch_param(data->orig.data);
388 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
389 free_symbol_cache(data->orig.data);
390 kfree(data);
391}
357/* Default (unsigned long) fetch type */ 392/* Default (unsigned long) fetch type */
358#define __DEFAULT_FETCH_TYPE(t) u##t 393#define __DEFAULT_FETCH_TYPE(t) u##t
359#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 394#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -368,6 +403,7 @@ enum {
368 FETCH_MTD_memory, 403 FETCH_MTD_memory,
369 FETCH_MTD_symbol, 404 FETCH_MTD_symbol,
370 FETCH_MTD_deref, 405 FETCH_MTD_deref,
406 FETCH_MTD_bitfield,
371 FETCH_MTD_END, 407 FETCH_MTD_END,
372}; 408};
373 409
@@ -388,6 +424,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
388ASSIGN_FETCH_FUNC(memory, ftype), \ 424ASSIGN_FETCH_FUNC(memory, ftype), \
389ASSIGN_FETCH_FUNC(symbol, ftype), \ 425ASSIGN_FETCH_FUNC(symbol, ftype), \
390ASSIGN_FETCH_FUNC(deref, ftype), \ 426ASSIGN_FETCH_FUNC(deref, ftype), \
427ASSIGN_FETCH_FUNC(bitfield, ftype), \
391 } \ 428 } \
392 } 429 }
393 430
@@ -431,9 +468,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
431 if (!type) 468 if (!type)
432 type = DEFAULT_FETCH_TYPE_STR; 469 type = DEFAULT_FETCH_TYPE_STR;
433 470
471 /* Special case: bitfield */
472 if (*type == 'b') {
473 unsigned long bs;
474 type = strchr(type, '/');
475 if (!type)
476 goto fail;
477 type++;
478 if (strict_strtoul(type, 0, &bs))
479 goto fail;
480 switch (bs) {
481 case 8:
482 return find_fetch_type("u8");
483 case 16:
484 return find_fetch_type("u16");
485 case 32:
486 return find_fetch_type("u32");
487 case 64:
488 return find_fetch_type("u64");
489 default:
490 goto fail;
491 }
492 }
493
434 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 494 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
435 if (strcmp(type, fetch_type_table[i].name) == 0) 495 if (strcmp(type, fetch_type_table[i].name) == 0)
436 return &fetch_type_table[i]; 496 return &fetch_type_table[i];
497fail:
437 return NULL; 498 return NULL;
438} 499}
439 500
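
The special case above only looks at the container size after the '/' in a bitfield type string of the form b<width>@<offset>/<container-bits>, then reuses the matching integer fetch type. A small user-space sketch of that mapping (hypothetical helper, strtoul standing in for strict_strtoul):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Map a bitfield type string such as "b4@8/32" to its container type name. */
static const char *bitfield_container(const char *type)
{
        const char *slash;
        unsigned long bs;

        if (*type != 'b')
                return NULL;
        slash = strchr(type, '/');
        if (!slash)
                return NULL;
        bs = strtoul(slash + 1, NULL, 0);
        switch (bs) {
        case 8:  return "u8";
        case 16: return "u16";
        case 32: return "u32";
        case 64: return "u64";
        default: return NULL;
        }
}

int main(void)
{
        printf("%s\n", bitfield_container("b4@8/32"));  /* u32 */
        printf("%s\n", bitfield_container("b1@0/8"));   /* u8  */
        return 0;
}
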
@@ -587,7 +648,9 @@ error:
587 648
588static void free_probe_arg(struct probe_arg *arg) 649static void free_probe_arg(struct probe_arg *arg)
589{ 650{
590 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) 651 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
652 free_bitfield_fetch_param(arg->fetch.data);
653 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
591 free_deref_fetch_param(arg->fetch.data); 654 free_deref_fetch_param(arg->fetch.data);
592 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) 655 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
593 free_symbol_cache(arg->fetch.data); 656 free_symbol_cache(arg->fetch.data);
@@ -648,7 +711,7 @@ static int register_trace_probe(struct trace_probe *tp)
648 } 711 }
649 ret = register_probe_event(tp); 712 ret = register_probe_event(tp);
650 if (ret) { 713 if (ret) {
651 pr_warning("Faild to register probe event(%d)\n", ret); 714 pr_warning("Failed to register probe event(%d)\n", ret);
652 goto end; 715 goto end;
653 } 716 }
654 717
@@ -768,16 +831,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
768 } 831 }
769 break; 832 break;
770 case '+': /* deref memory */ 833 case '+': /* deref memory */
834 arg++; /* Skip '+', because strict_strtol() rejects it. */
771 case '-': 835 case '-':
772 tmp = strchr(arg, '('); 836 tmp = strchr(arg, '(');
773 if (!tmp) 837 if (!tmp)
774 break; 838 break;
775 *tmp = '\0'; 839 *tmp = '\0';
776 ret = strict_strtol(arg + 1, 0, &offset); 840 ret = strict_strtol(arg, 0, &offset);
777 if (ret) 841 if (ret)
778 break; 842 break;
779 if (arg[0] == '-')
780 offset = -offset;
781 arg = tmp + 1; 843 arg = tmp + 1;
782 tmp = strrchr(arg, ')'); 844 tmp = strrchr(arg, ')');
783 if (tmp) { 845 if (tmp) {
@@ -808,6 +870,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
808 return ret; 870 return ret;
809} 871}
810 872
873#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
874
875/* Bitfield type needs to be parsed into a fetch function */
876static int __parse_bitfield_probe_arg(const char *bf,
877 const struct fetch_type *t,
878 struct fetch_param *f)
879{
880 struct bitfield_fetch_param *bprm;
881 unsigned long bw, bo;
882 char *tail;
883
884 if (*bf != 'b')
885 return 0;
886
887 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
888 if (!bprm)
889 return -ENOMEM;
890 bprm->orig = *f;
891 f->fn = t->fetch[FETCH_MTD_bitfield];
892 f->data = (void *)bprm;
893
894 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
895 if (bw == 0 || *tail != '@')
896 return -EINVAL;
897
898 bf = tail + 1;
899 bo = simple_strtoul(bf, &tail, 0);
900 if (tail == bf || *tail != '/')
901 return -EINVAL;
902
903 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
904 bprm->low_shift = bprm->hi_shift + bo;
905 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
906}
907
811/* String length checking wrapper */ 908/* String length checking wrapper */
812static int parse_probe_arg(char *arg, struct trace_probe *tp, 909static int parse_probe_arg(char *arg, struct trace_probe *tp,
813 struct probe_arg *parg, int is_return) 910 struct probe_arg *parg, int is_return)
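
__parse_bitfield_probe_arg() turns <width>@<offset> into two shift counts: hi_shift pushes the field to the top of the container, low_shift then brings it back down right-aligned. A stand-alone sketch of the same arithmetic for a 32-bit container; for "b3@5/32" it gives hi_shift = 24 and low_shift = 29:

#include <stdint.h>
#include <stdio.h>

/* Extract a <bw>-bit field starting at bit <bo> from a 32-bit container,
 * using the same two-shift scheme as the bitfield fetch function. */
static uint32_t extract_bitfield32(uint32_t val, unsigned bw, unsigned bo)
{
        unsigned hi_shift  = 32 - (bw + bo);    /* BYTES_TO_BITS(4) - (bw + bo) */
        unsigned low_shift = hi_shift + bo;

        return (val << hi_shift) >> low_shift;
}

int main(void)
{
        /* "b3@5/32": bits 5..7 of the container. 0xE0 has exactly those set. */
        printf("0x%x\n", extract_bitfield32(0xE0, 3, 5));   /* prints 0x7 */
        printf("0x%x\n", extract_bitfield32(0x20, 3, 5));   /* prints 0x1 */
        return 0;
}
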
@@ -837,6 +934,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
837 parg->offset = tp->size; 934 parg->offset = tp->size;
838 tp->size += parg->type->size; 935 tp->size += parg->type->size;
839 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 936 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
937 if (ret >= 0 && t != NULL)
938 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
840 if (ret >= 0) { 939 if (ret >= 0) {
841 parg->fetch_size.fn = get_fetch_size_function(parg->type, 940 parg->fetch_size.fn = get_fetch_size_function(parg->type,
842 parg->fetch.fn); 941 parg->fetch.fn);
@@ -1131,7 +1230,7 @@ static int command_trace_probe(const char *buf)
1131 return ret; 1230 return ret;
1132} 1231}
1133 1232
1134#define WRITE_BUFSIZE 128 1233#define WRITE_BUFSIZE 4096
1135 1234
1136static ssize_t probes_write(struct file *file, const char __user *buffer, 1235static ssize_t probes_write(struct file *file, const char __user *buffer,
1137 size_t count, loff_t *ppos) 1236 size_t count, loff_t *ppos)
@@ -1739,7 +1838,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1739 kfree(tp->call.print_fmt); 1838 kfree(tp->call.print_fmt);
1740} 1839}
1741 1840
1742/* Make a debugfs interface for controling probe points */ 1841/* Make a debugfs interface for controlling probe points */
1743static __init int init_kprobe_trace(void) 1842static __init int init_kprobe_trace(void)
1744{ 1843{
1745 struct dentry *d_tracer; 1844 struct dentry *d_tracer;
@@ -1771,8 +1870,12 @@ fs_initcall(init_kprobe_trace);
1771 1870
1772#ifdef CONFIG_FTRACE_STARTUP_TEST 1871#ifdef CONFIG_FTRACE_STARTUP_TEST
1773 1872
1774static int kprobe_trace_selftest_target(int a1, int a2, int a3, 1873/*
1775 int a4, int a5, int a6) 1874 * The "__used" keeps gcc from removing the function symbol
1875 * from the kallsyms table.
1876 */
1877static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
1878 int a4, int a5, int a6)
1776{ 1879{
1777 return a1 + a2 + a3 + a4 + a5 + a6; 1880 return a1 + a2 + a3 + a4 + a5 + a6;
1778} 1881}
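
The selftest target is only ever reached through its kallsyms entry and a kprobe, so without __used the compiler could discard it. A hedged user-space illustration of the underlying attribute (gcc/clang __attribute__((used)), hypothetical function name):

#include <stdio.h>

/* Without "used", a static function with no callers may be dropped entirely
 * by the compiler; with it, the symbol survives (check with nm or objdump). */
static __attribute__((used)) int selftest_target(int a, int b)
{
        return a + b;
}

int main(void)
{
        puts("selftest_target is only reached via its symbol, never called here");
        return 0;
}
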
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa2206..e37de492a9e1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
353} 353}
354EXPORT_SYMBOL(ftrace_print_symbols_seq); 354EXPORT_SYMBOL(ftrace_print_symbols_seq);
355 355
356#if BITS_PER_LONG == 32
357const char *
358ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
359 const struct trace_print_flags_u64 *symbol_array)
360{
361 int i;
362 const char *ret = p->buffer + p->len;
363
364 for (i = 0; symbol_array[i].name; i++) {
365
366 if (val != symbol_array[i].mask)
367 continue;
368
369 trace_seq_puts(p, symbol_array[i].name);
370 break;
371 }
372
373 if (!p->len)
374 trace_seq_printf(p, "0x%llx", val);
375
376 trace_seq_putc(p, 0);
377
378 return ret;
379}
380EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
381#endif
382
356const char * 383const char *
357ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 384ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
358{ 385{
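
ftrace_print_symbols_seq_u64() is a straight {mask, name} table lookup that falls back to printing the raw value in hex; it is built only for 32-bit kernels, presumably because the plain ftrace_print_symbols_seq() already covers 64-bit longs elsewhere. A user-space model of the lookup (printf standing in for trace_seq output, made-up symbol table):

#include <stdio.h>
#include <stdint.h>

struct sym64 { uint64_t mask; const char *name; };

/* Print the symbolic name for a 64-bit value, or fall back to hex. */
static void print_sym_u64(uint64_t val, const struct sym64 *tbl)
{
        for (; tbl->name; tbl++) {
                if (val == tbl->mask) {
                        printf("%s\n", tbl->name);
                        return;
                }
        }
        printf("0x%llx\n", (unsigned long long)val);
}

int main(void)
{
        static const struct sym64 states[] = {
                { 1ULL << 32, "STATE_HI" },
                { 2,          "STATE_LO" },
                { 0, NULL }
        };

        print_sym_u64(1ULL << 32, states);   /* STATE_HI */
        print_sym_u64(42, states);           /* 0x2a     */
        return 0;
}
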
@@ -529,24 +556,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
529 * @entry: The trace entry field from the ring buffer 556 * @entry: The trace entry field from the ring buffer
530 * 557 *
531 * Prints the generic fields of irqs off, in hard or softirq, preempt 558 * Prints the generic fields of irqs off, in hard or softirq, preempt
532 * count and lock depth. 559 * count.
533 */ 560 */
534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) 561int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
535{ 562{
536 int hardirq, softirq; 563 char hardsoft_irq;
564 char need_resched;
565 char irqs_off;
566 int hardirq;
567 int softirq;
537 int ret; 568 int ret;
538 569
539 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 570 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
540 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 571 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
541 572
573 irqs_off =
574 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
575 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
576 '.';
577 need_resched =
578 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
579 hardsoft_irq =
580 (hardirq && softirq) ? 'H' :
581 hardirq ? 'h' :
582 softirq ? 's' :
583 '.';
584
542 if (!trace_seq_printf(s, "%c%c%c", 585 if (!trace_seq_printf(s, "%c%c%c",
543 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 586 irqs_off, need_resched, hardsoft_irq))
544 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
545 'X' : '.',
546 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
547 'N' : '.',
548 (hardirq && softirq) ? 'H' :
549 hardirq ? 'h' : softirq ? 's' : '.'))
550 return 0; 587 return 0;
551 588
552 if (entry->preempt_count) 589 if (entry->preempt_count)
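
The rewrite above computes the three latency-format characters up front instead of inside one long trace_seq_printf() call; the precedence is unchanged. A user-space model of the character selection (the flag values here are made up for the example):

#include <stdio.h>

#define FLAG_IRQS_OFF       0x01
#define FLAG_IRQS_NOSUPPORT 0x02
#define FLAG_NEED_RESCHED   0x04
#define FLAG_HARDIRQ        0x08
#define FLAG_SOFTIRQ        0x10

/* Build the three latency-format characters ("dNh" etc.) from entry flags,
 * following the same precedence as trace_print_lat_fmt(). */
static void lat_chars(unsigned flags, char out[4])
{
        int hardirq = flags & FLAG_HARDIRQ;
        int softirq = flags & FLAG_SOFTIRQ;

        out[0] = (flags & FLAG_IRQS_OFF)       ? 'd' :
                 (flags & FLAG_IRQS_NOSUPPORT) ? 'X' : '.';
        out[1] = (flags & FLAG_NEED_RESCHED)   ? 'N' : '.';
        out[2] = (hardirq && softirq) ? 'H' :
                 hardirq ? 'h' : softirq ? 's' : '.';
        out[3] = '\0';
}

int main(void)
{
        char buf[4];

        lat_chars(FLAG_IRQS_OFF | FLAG_NEED_RESCHED | FLAG_HARDIRQ, buf);
        printf("%s\n", buf);    /* dNh */
        lat_chars(0, buf);
        printf("%s\n", buf);    /* ... */
        return 0;
}
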
@@ -554,13 +591,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
554 else 591 else
555 ret = trace_seq_putc(s, '.'); 592 ret = trace_seq_putc(s, '.');
556 593
557 if (!ret) 594 return ret;
558 return 0;
559
560 if (entry->lock_depth < 0)
561 return trace_seq_putc(s, '.');
562
563 return trace_seq_printf(s, "%d", entry->lock_depth);
564} 595}
565 596
566static int 597static int
@@ -826,6 +857,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
826enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 857enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
827 struct trace_event *event) 858 struct trace_event *event)
828{ 859{
860 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
861 return TRACE_TYPE_PARTIAL_LINE;
862
829 return TRACE_TYPE_HANDLED; 863 return TRACE_TYPE_HANDLED;
830} 864}
831 865
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf0..1f06468a10d7 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex);
32 32
33struct trace_bprintk_fmt { 33struct trace_bprintk_fmt {
34 struct list_head list; 34 struct list_head list;
35 char fmt[0]; 35 const char *fmt;
36}; 36};
37 37
38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) 38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
@@ -49,6 +49,7 @@ static
49void hold_module_trace_bprintk_format(const char **start, const char **end) 49void hold_module_trace_bprintk_format(const char **start, const char **end)
50{ 50{
51 const char **iter; 51 const char **iter;
52 char *fmt;
52 53
53 mutex_lock(&btrace_mutex); 54 mutex_lock(&btrace_mutex);
54 for (iter = start; iter < end; iter++) { 55 for (iter = start; iter < end; iter++) {
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
58 continue; 59 continue;
59 } 60 }
60 61
61 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) 62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
62 + strlen(*iter) + 1, GFP_KERNEL); 63 if (tb_fmt)
63 if (tb_fmt) { 64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) {
64 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
65 strcpy(tb_fmt->fmt, *iter); 67 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt;
66 *iter = tb_fmt->fmt; 69 *iter = tb_fmt->fmt;
67 } else 70 } else {
71 kfree(tb_fmt);
68 *iter = NULL; 72 *iter = NULL;
73 }
69 } 74 }
70 mutex_unlock(&btrace_mutex); 75 mutex_unlock(&btrace_mutex);
71} 76}
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
84 return 0; 89 return 0;
85} 90}
86 91
92/*
93 * The debugfs/tracing/printk_formats file maps the addresses with
94 * the ASCII formats that are used in the bprintk events in the
95 * buffer. For userspace tools to be able to decode the events from
96 * the buffer, they need to be able to map the address with the format.
97 *
98 * The addresses of the bprintk formats are in their own section
 99 * __trace_printk_fmt. But for modules we copy them into a linked list.
100 * The code to print the formats and their addresses passes around the
101 * address of the fmt string. If the fmt address passed into the seq
102 * functions is within the kernel core __trace_printk_fmt section, then
103 * it simply uses the next pointer in the list.
104 *
105 * When the fmt pointer is outside the kernel core __trace_printk_fmt
 106 * section, then we need to read the linked list pointers. The trick is
107 * we pass the address of the string to the seq function just like
108 * we do for the kernel core formats. To get back the structure that
 109 * holds the format, we simply use container_of() and then go to the
110 * next format in the list.
111 */
112static const char **
113find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
114{
115 struct trace_bprintk_fmt *mod_fmt;
116
117 if (list_empty(&trace_bprintk_fmt_list))
118 return NULL;
119
120 /*
121 * v will point to the address of the fmt record from t_next
122 * v will be NULL from t_start.
123 * If this is the first pointer or called from start
124 * then we need to walk the list.
125 */
126 if (!v || start_index == *pos) {
127 struct trace_bprintk_fmt *p;
128
129 /* search the module list */
130 list_for_each_entry(p, &trace_bprintk_fmt_list, list) {
131 if (start_index == *pos)
132 return &p->fmt;
133 start_index++;
134 }
135 /* pos > index */
136 return NULL;
137 }
138
139 /*
140 * v points to the address of the fmt field in the mod list
141 * structure that holds the module print format.
142 */
143 mod_fmt = container_of(v, typeof(*mod_fmt), fmt);
144 if (mod_fmt->list.next == &trace_bprintk_fmt_list)
145 return NULL;
146
147 mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list);
148
149 return &mod_fmt->fmt;
150}
151
152static void format_mod_start(void)
153{
154 mutex_lock(&btrace_mutex);
155}
156
157static void format_mod_stop(void)
158{
159 mutex_unlock(&btrace_mutex);
160}
161
87#else /* !CONFIG_MODULES */ 162#else /* !CONFIG_MODULES */
88__init static int 163__init static int
89module_trace_bprintk_format_notify(struct notifier_block *self, 164module_trace_bprintk_format_notify(struct notifier_block *self,
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self,
91{ 166{
92 return 0; 167 return 0;
93} 168}
169static inline const char **
170find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
171{
172 return NULL;
173}
174static inline void format_mod_start(void) { }
175static inline void format_mod_stop(void) { }
94#endif /* CONFIG_MODULES */ 176#endif /* CONFIG_MODULES */
95 177
96 178
@@ -153,20 +235,30 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
153} 235}
154EXPORT_SYMBOL_GPL(__ftrace_vprintk); 236EXPORT_SYMBOL_GPL(__ftrace_vprintk);
155 237
238static const char **find_next(void *v, loff_t *pos)
239{
240 const char **fmt = v;
241 int start_index;
242
243 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
244
245 if (*pos < start_index)
246 return __start___trace_bprintk_fmt + *pos;
247
248 return find_next_mod_format(start_index, v, fmt, pos);
249}
250
156static void * 251static void *
157t_start(struct seq_file *m, loff_t *pos) 252t_start(struct seq_file *m, loff_t *pos)
158{ 253{
159 const char **fmt = __start___trace_bprintk_fmt + *pos; 254 format_mod_start();
160 255 return find_next(NULL, pos);
161 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
162 return NULL;
163 return fmt;
164} 256}
165 257
166static void *t_next(struct seq_file *m, void * v, loff_t *pos) 258static void *t_next(struct seq_file *m, void * v, loff_t *pos)
167{ 259{
168 (*pos)++; 260 (*pos)++;
169 return t_start(m, pos); 261 return find_next(v, pos);
170} 262}
171 263
172static int t_show(struct seq_file *m, void *v) 264static int t_show(struct seq_file *m, void *v)
@@ -205,6 +297,7 @@ static int t_show(struct seq_file *m, void *v)
205 297
206static void t_stop(struct seq_file *m, void *p) 298static void t_stop(struct seq_file *m, void *p)
207{ 299{
300 format_mod_stop();
208} 301}
209 302
210static const struct seq_operations show_format_seq_ops = { 303static const struct seq_operations show_format_seq_ops = {
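
t_start()/t_next() now funnel through find_next(): positions inside the core __trace_printk_fmt section are served straight from the section, larger positions fall through to the module list walk, and btrace_mutex is held from t_start() to t_stop(). A minimal user-space model of that two-phase lookup (a plain array and a hand-rolled list in place of the section and trace_bprintk_fmt_list):

#include <stdio.h>
#include <stddef.h>

struct mod_fmt { const char *fmt; struct mod_fmt *next; };

static const char *core_fmts[] = { "core fmt A %d\n", "core fmt B %s\n" };
static struct mod_fmt mod_b = { "module fmt Y %u\n", NULL };
static struct mod_fmt mod_a = { "module fmt X %lu\n", &mod_b };
static struct mod_fmt *mod_list = &mod_a;

/* Return the format at position pos: first the built-in array, then the
 * module list -- the same two-phase walk the printk_formats file performs. */
static const char *find_fmt(size_t pos)
{
        size_t ncore = sizeof(core_fmts) / sizeof(core_fmts[0]);
        struct mod_fmt *m;

        if (pos < ncore)
                return core_fmts[pos];

        pos -= ncore;
        for (m = mod_list; m; m = m->next)
                if (pos-- == 0)
                        return m->fmt;
        return NULL;
}

int main(void)
{
        const char *fmt;
        size_t pos;

        for (pos = 0; (fmt = find_fmt(pos)); pos++)
                printf("%p : %s", (const void *)fmt, fmt);
        return 0;
}
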
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
247 ctx_trace = tr; 247 ctx_trace = tr;
248} 248}
249 249
250static void stop_sched_trace(struct trace_array *tr)
251{
252 tracing_stop_sched_switch_record();
253}
254
255static int sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258 tracing_reset_online_cpus(tr);
259 tracing_start_sched_switch_record();
260 return 0;
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (sched_ref)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_start(struct trace_array *tr)
270{
271 sched_stopped = 0;
272}
273
274static void sched_switch_trace_stop(struct trace_array *tr)
275{
276 sched_stopped = 1;
277}
278
279static struct tracer sched_switch_trace __read_mostly =
280{
281 .name = "sched_switch",
282 .init = sched_switch_trace_init,
283 .reset = sched_switch_trace_reset,
284 .start = sched_switch_trace_start,
285 .stop = sched_switch_trace_stop,
286 .wait_pipe = poll_wait_pipe,
287#ifdef CONFIG_FTRACE_SELFTEST
288 .selftest = trace_selftest_startup_sched_switch,
289#endif
290};
291
292__init static int init_sched_switch_trace(void)
293{
294 return register_tracer(&sched_switch_trace);
295}
296device_initcall(init_sched_switch_trace);
297
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4086eae6e81b..f029dd4fd2ca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,57 +31,258 @@ static int wakeup_rt;
31static arch_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void wakeup_reset(struct trace_array *tr);
34static void __wakeup_reset(struct trace_array *tr); 35static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
35 38
36static int save_lat_flag; 39static int save_lat_flag;
37 40
41#define TRACE_DISPLAY_GRAPH 1
42
43static struct tracer_opt trace_opts[] = {
44#ifdef CONFIG_FUNCTION_GRAPH_TRACER
45 /* display latency trace as call graph */
46 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
47#endif
48 { } /* Empty entry */
49};
50
51static struct tracer_flags tracer_flags = {
52 .val = 0,
53 .opts = trace_opts,
54};
55
56#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
57
38#ifdef CONFIG_FUNCTION_TRACER 58#ifdef CONFIG_FUNCTION_TRACER
59
39/* 60/*
40 * irqsoff uses its own tracer function to keep the overhead down: 61 * Prologue for the wakeup function tracers.
62 *
63 * Returns 1 if it is OK to continue, and preemption
64 * is disabled and data->disabled is incremented.
65 * 0 if the trace is to be ignored, and preemption
66 * is not disabled and data->disabled is
67 * kept the same.
68 *
69 * Note, this function is also used outside this ifdef but
70 * inside the #ifdef of the function graph tracer below.
71 * This is OK, since the function graph tracer is
72 * dependent on the function tracer.
41 */ 73 */
42static void 74static int
43wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) 75func_prolog_preempt_disable(struct trace_array *tr,
76 struct trace_array_cpu **data,
77 int *pc)
44{ 78{
45 struct trace_array *tr = wakeup_trace;
46 struct trace_array_cpu *data;
47 unsigned long flags;
48 long disabled; 79 long disabled;
49 int cpu; 80 int cpu;
50 int pc;
51 81
52 if (likely(!wakeup_task)) 82 if (likely(!wakeup_task))
53 return; 83 return 0;
54 84
55 pc = preempt_count(); 85 *pc = preempt_count();
56 preempt_disable_notrace(); 86 preempt_disable_notrace();
57 87
58 cpu = raw_smp_processor_id(); 88 cpu = raw_smp_processor_id();
59 if (cpu != wakeup_current_cpu) 89 if (cpu != wakeup_current_cpu)
60 goto out_enable; 90 goto out_enable;
61 91
62 data = tr->data[cpu]; 92 *data = tr->data[cpu];
63 disabled = atomic_inc_return(&data->disabled); 93 disabled = atomic_inc_return(&(*data)->disabled);
64 if (unlikely(disabled != 1)) 94 if (unlikely(disabled != 1))
65 goto out; 95 goto out;
66 96
67 local_irq_save(flags); 97 return 1;
68 98
69 trace_function(tr, ip, parent_ip, flags, pc); 99out:
100 atomic_dec(&(*data)->disabled);
101
102out_enable:
103 preempt_enable_notrace();
104 return 0;
105}
70 106
107/*
108 * wakeup uses its own tracer function to keep the overhead down:
109 */
110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
112{
113 struct trace_array *tr = wakeup_trace;
114 struct trace_array_cpu *data;
115 unsigned long flags;
116 int pc;
117
118 if (!func_prolog_preempt_disable(tr, &data, &pc))
119 return;
120
121 local_irq_save(flags);
122 trace_function(tr, ip, parent_ip, flags, pc);
71 local_irq_restore(flags); 123 local_irq_restore(flags);
72 124
73 out:
74 atomic_dec(&data->disabled); 125 atomic_dec(&data->disabled);
75 out_enable:
76 preempt_enable_notrace(); 126 preempt_enable_notrace();
77} 127}
78 128
79static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
80{ 130{
81 .func = wakeup_tracer_call, 131 .func = wakeup_tracer_call,
132 .flags = FTRACE_OPS_FL_GLOBAL,
82}; 133};
83#endif /* CONFIG_FUNCTION_TRACER */ 134#endif /* CONFIG_FUNCTION_TRACER */
84 135
136static int start_func_tracer(int graph)
137{
138 int ret;
139
140 if (!graph)
141 ret = register_ftrace_function(&trace_ops);
142 else
143 ret = register_ftrace_graph(&wakeup_graph_return,
144 &wakeup_graph_entry);
145
146 if (!ret && tracing_is_enabled())
147 tracer_enabled = 1;
148 else
149 tracer_enabled = 0;
150
151 return ret;
152}
153
154static void stop_func_tracer(int graph)
155{
156 tracer_enabled = 0;
157
158 if (!graph)
159 unregister_ftrace_function(&trace_ops);
160 else
161 unregister_ftrace_graph();
162}
163
164#ifdef CONFIG_FUNCTION_GRAPH_TRACER
165static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
166{
167
168 if (!(bit & TRACE_DISPLAY_GRAPH))
169 return -EINVAL;
170
171 if (!(is_graph() ^ set))
172 return 0;
173
174 stop_func_tracer(!set);
175
176 wakeup_reset(wakeup_trace);
177 tracing_max_latency = 0;
178
179 return start_func_tracer(set);
180}
181
182static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
183{
184 struct trace_array *tr = wakeup_trace;
185 struct trace_array_cpu *data;
186 unsigned long flags;
187 int pc, ret = 0;
188
189 if (!func_prolog_preempt_disable(tr, &data, &pc))
190 return 0;
191
192 local_save_flags(flags);
193 ret = __trace_graph_entry(tr, trace, flags, pc);
194 atomic_dec(&data->disabled);
195 preempt_enable_notrace();
196
197 return ret;
198}
199
200static void wakeup_graph_return(struct ftrace_graph_ret *trace)
201{
202 struct trace_array *tr = wakeup_trace;
203 struct trace_array_cpu *data;
204 unsigned long flags;
205 int pc;
206
207 if (!func_prolog_preempt_disable(tr, &data, &pc))
208 return;
209
210 local_save_flags(flags);
211 __trace_graph_return(tr, trace, flags, pc);
212 atomic_dec(&data->disabled);
213
214 preempt_enable_notrace();
215 return;
216}
217
218static void wakeup_trace_open(struct trace_iterator *iter)
219{
220 if (is_graph())
221 graph_trace_open(iter);
222}
223
224static void wakeup_trace_close(struct trace_iterator *iter)
225{
226 if (iter->private)
227 graph_trace_close(iter);
228}
229
230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
231
232static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
233{
234 /*
235 * In graph mode call the graph tracer output function,
236 * otherwise go with the TRACE_FN event handler
237 */
238 if (is_graph())
239 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
240
241 return TRACE_TYPE_UNHANDLED;
242}
243
244static void wakeup_print_header(struct seq_file *s)
245{
246 if (is_graph())
247 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
248 else
249 trace_default_header(s);
250}
251
252static void
253__trace_function(struct trace_array *tr,
254 unsigned long ip, unsigned long parent_ip,
255 unsigned long flags, int pc)
256{
257 if (is_graph())
258 trace_graph_function(tr, ip, parent_ip, flags, pc);
259 else
260 trace_function(tr, ip, parent_ip, flags, pc);
261}
262#else
263#define __trace_function trace_function
264
265static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
266{
267 return -EINVAL;
268}
269
270static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
271{
272 return -1;
273}
274
275static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
276{
277 return TRACE_TYPE_UNHANDLED;
278}
279
280static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
281static void wakeup_print_header(struct seq_file *s) { }
282static void wakeup_trace_open(struct trace_iterator *iter) { }
283static void wakeup_trace_close(struct trace_iterator *iter) { }
284#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
285
85/* 286/*
86 * Should this new latency be reported/recorded? 287 * Should this new latency be reported/recorded?
87 */ 288 */
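
wakeup_set_flag() uses is_graph() ^ set to notice that the requested display mode already matches the current one and bail out early; only a real change stops one tracer flavour and starts the other. A small sketch of that toggle logic (printf standing in for the register/unregister calls):

#include <stdio.h>

static int graph_enabled;       /* stands in for is_graph() */

/* Flip between "function" and "graph" output only when the requested state
 * actually differs from the current one -- the is_graph() ^ set test. */
static int set_display_graph(int set)
{
        if (!(graph_enabled ^ set))
                return 0;                       /* already in the requested mode */

        printf("stop %s tracer\n",  graph_enabled ? "graph" : "function");
        graph_enabled = set;
        printf("start %s tracer\n", graph_enabled ? "graph" : "function");
        return 1;
}

int main(void)
{
        set_display_graph(1);   /* switches to graph */
        set_display_graph(1);   /* no-op             */
        set_display_graph(0);   /* back to function  */
        return 0;
}
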
@@ -152,7 +353,7 @@ probe_wakeup_sched_switch(void *ignore,
152 /* The task we are waiting for is waking up */ 353 /* The task we are waiting for is waking up */
153 data = wakeup_trace->data[wakeup_cpu]; 354 data = wakeup_trace->data[wakeup_cpu];
154 355
155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 356 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 357 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
157 358
158 T0 = data->preempt_timestamp; 359 T0 = data->preempt_timestamp;
@@ -252,7 +453,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
252 * is not called by an assembly function (where as schedule is) 453 * is not called by an assembly function (where as schedule is)
253 * it should be safe to use it here. 454 * it should be safe to use it here.
254 */ 455 */
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 456 __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 457
257out_locked: 458out_locked:
258 arch_spin_unlock(&wakeup_lock); 459 arch_spin_unlock(&wakeup_lock);
@@ -303,12 +504,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
303 */ 504 */
304 smp_wmb(); 505 smp_wmb();
305 506
306 register_ftrace_function(&trace_ops); 507 if (start_func_tracer(is_graph()))
307 508 printk(KERN_ERR "failed to start wakeup tracer\n");
308 if (tracing_is_enabled())
309 tracer_enabled = 1;
310 else
311 tracer_enabled = 0;
312 509
313 return; 510 return;
314fail_deprobe_wake_new: 511fail_deprobe_wake_new:
@@ -320,7 +517,7 @@ fail_deprobe:
320static void stop_wakeup_tracer(struct trace_array *tr) 517static void stop_wakeup_tracer(struct trace_array *tr)
321{ 518{
322 tracer_enabled = 0; 519 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 520 stop_func_tracer(is_graph());
324 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 521 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 522 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup, NULL); 523 unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +576,15 @@ static struct tracer wakeup_tracer __read_mostly =
379 .start = wakeup_tracer_start, 576 .start = wakeup_tracer_start,
380 .stop = wakeup_tracer_stop, 577 .stop = wakeup_tracer_stop,
381 .print_max = 1, 578 .print_max = 1,
579 .print_header = wakeup_print_header,
580 .print_line = wakeup_print_line,
581 .flags = &tracer_flags,
582 .set_flag = wakeup_set_flag,
382#ifdef CONFIG_FTRACE_SELFTEST 583#ifdef CONFIG_FTRACE_SELFTEST
383 .selftest = trace_selftest_startup_wakeup, 584 .selftest = trace_selftest_startup_wakeup,
384#endif 585#endif
586 .open = wakeup_trace_open,
587 .close = wakeup_trace_close,
385 .use_max_tr = 1, 588 .use_max_tr = 1,
386}; 589};
387 590
@@ -394,9 +597,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
394 .stop = wakeup_tracer_stop, 597 .stop = wakeup_tracer_stop,
395 .wait_pipe = poll_wait_pipe, 598 .wait_pipe = poll_wait_pipe,
396 .print_max = 1, 599 .print_max = 1,
600 .print_header = wakeup_print_header,
601 .print_line = wakeup_print_line,
602 .flags = &tracer_flags,
603 .set_flag = wakeup_set_flag,
397#ifdef CONFIG_FTRACE_SELFTEST 604#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 605 .selftest = trace_selftest_startup_wakeup,
399#endif 606#endif
607 .open = wakeup_trace_open,
608 .close = wakeup_trace_close,
400 .use_max_tr = 1, 609 .use_max_tr = 1,
401}; 610};
402 611
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..288541f977fb 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
101 101
102#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
103 103
104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip)
107{
108 trace_selftest_test_probe1_cnt++;
109}
110
111static int trace_selftest_test_probe2_cnt;
112static void trace_selftest_test_probe2_func(unsigned long ip,
113 unsigned long pip)
114{
115 trace_selftest_test_probe2_cnt++;
116}
117
118static int trace_selftest_test_probe3_cnt;
119static void trace_selftest_test_probe3_func(unsigned long ip,
120 unsigned long pip)
121{
122 trace_selftest_test_probe3_cnt++;
123}
124
125static int trace_selftest_test_global_cnt;
126static void trace_selftest_test_global_func(unsigned long ip,
127 unsigned long pip)
128{
129 trace_selftest_test_global_cnt++;
130}
131
132static int trace_selftest_test_dyn_cnt;
133static void trace_selftest_test_dyn_func(unsigned long ip,
134 unsigned long pip)
135{
136 trace_selftest_test_dyn_cnt++;
137}
138
139static struct ftrace_ops test_probe1 = {
140 .func = trace_selftest_test_probe1_func,
141};
142
143static struct ftrace_ops test_probe2 = {
144 .func = trace_selftest_test_probe2_func,
145};
146
147static struct ftrace_ops test_probe3 = {
148 .func = trace_selftest_test_probe3_func,
149};
150
151static struct ftrace_ops test_global = {
152 .func = trace_selftest_test_global_func,
153 .flags = FTRACE_OPS_FL_GLOBAL,
154};
155
156static void print_counts(void)
157{
158 printk("(%d %d %d %d %d) ",
159 trace_selftest_test_probe1_cnt,
160 trace_selftest_test_probe2_cnt,
161 trace_selftest_test_probe3_cnt,
162 trace_selftest_test_global_cnt,
163 trace_selftest_test_dyn_cnt);
164}
165
166static void reset_counts(void)
167{
168 trace_selftest_test_probe1_cnt = 0;
169 trace_selftest_test_probe2_cnt = 0;
170 trace_selftest_test_probe3_cnt = 0;
171 trace_selftest_test_global_cnt = 0;
172 trace_selftest_test_dyn_cnt = 0;
173}
174
175static int trace_selftest_ops(int cnt)
176{
177 int save_ftrace_enabled = ftrace_enabled;
178 struct ftrace_ops *dyn_ops;
179 char *func1_name;
180 char *func2_name;
181 int len1;
182 int len2;
183 int ret = -1;
184
185 printk(KERN_CONT "PASSED\n");
186 pr_info("Testing dynamic ftrace ops #%d: ", cnt);
187
188 ftrace_enabled = 1;
189 reset_counts();
190
191 /* Handle PPC64 '.' name */
192 func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
193 func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2);
194 len1 = strlen(func1_name);
195 len2 = strlen(func2_name);
196
197 /*
198 * Probe 1 will trace function 1.
199 * Probe 2 will trace function 2.
200 * Probe 3 will trace functions 1 and 2.
201 */
202 ftrace_set_filter(&test_probe1, func1_name, len1, 1);
203 ftrace_set_filter(&test_probe2, func2_name, len2, 1);
204 ftrace_set_filter(&test_probe3, func1_name, len1, 1);
205 ftrace_set_filter(&test_probe3, func2_name, len2, 0);
206
207 register_ftrace_function(&test_probe1);
208 register_ftrace_function(&test_probe2);
209 register_ftrace_function(&test_probe3);
210 register_ftrace_function(&test_global);
211
212 DYN_FTRACE_TEST_NAME();
213
214 print_counts();
215
216 if (trace_selftest_test_probe1_cnt != 1)
217 goto out;
218 if (trace_selftest_test_probe2_cnt != 0)
219 goto out;
220 if (trace_selftest_test_probe3_cnt != 1)
221 goto out;
222 if (trace_selftest_test_global_cnt == 0)
223 goto out;
224
225 DYN_FTRACE_TEST_NAME2();
226
227 print_counts();
228
229 if (trace_selftest_test_probe1_cnt != 1)
230 goto out;
231 if (trace_selftest_test_probe2_cnt != 1)
232 goto out;
233 if (trace_selftest_test_probe3_cnt != 2)
234 goto out;
235
236 /* Add a dynamic probe */
237 dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
238 if (!dyn_ops) {
239 printk("MEMORY ERROR ");
240 goto out;
241 }
242
243 dyn_ops->func = trace_selftest_test_dyn_func;
244
245 register_ftrace_function(dyn_ops);
246
247 trace_selftest_test_global_cnt = 0;
248
249 DYN_FTRACE_TEST_NAME();
250
251 print_counts();
252
253 if (trace_selftest_test_probe1_cnt != 2)
254 goto out_free;
255 if (trace_selftest_test_probe2_cnt != 1)
256 goto out_free;
257 if (trace_selftest_test_probe3_cnt != 3)
258 goto out_free;
259 if (trace_selftest_test_global_cnt == 0)
260 goto out;
261 if (trace_selftest_test_dyn_cnt == 0)
262 goto out_free;
263
264 DYN_FTRACE_TEST_NAME2();
265
266 print_counts();
267
268 if (trace_selftest_test_probe1_cnt != 2)
269 goto out_free;
270 if (trace_selftest_test_probe2_cnt != 2)
271 goto out_free;
272 if (trace_selftest_test_probe3_cnt != 4)
273 goto out_free;
274
275 ret = 0;
276 out_free:
277 unregister_ftrace_function(dyn_ops);
278 kfree(dyn_ops);
279
280 out:
281 /* Purposely unregister in the same order */
282 unregister_ftrace_function(&test_probe1);
283 unregister_ftrace_function(&test_probe2);
284 unregister_ftrace_function(&test_probe3);
285 unregister_ftrace_function(&test_global);
286
287 /* Make sure everything is off */
288 reset_counts();
289 DYN_FTRACE_TEST_NAME();
290 DYN_FTRACE_TEST_NAME();
291
292 if (trace_selftest_test_probe1_cnt ||
293 trace_selftest_test_probe2_cnt ||
294 trace_selftest_test_probe3_cnt ||
295 trace_selftest_test_global_cnt ||
296 trace_selftest_test_dyn_cnt)
297 ret = -1;
298
299 ftrace_enabled = save_ftrace_enabled;
300
301 return ret;
302}
303
104/* Test dynamic code modification and ftrace filters */ 304/* Test dynamic code modification and ftrace filters */
105int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 305int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
106 struct trace_array *tr, 306 struct trace_array *tr,
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
131 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 331 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
132 332
133 /* filter only on our function */ 333 /* filter only on our function */
134 ftrace_set_filter(func_name, strlen(func_name), 1); 334 ftrace_set_global_filter(func_name, strlen(func_name), 1);
135 335
136 /* enable tracing */ 336 /* enable tracing */
137 ret = tracer_init(trace, tr); 337 ret = tracer_init(trace, tr);
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
166 366
167 /* check the trace buffer */ 367 /* check the trace buffer */
168 ret = trace_test_buffer(tr, &count); 368 ret = trace_test_buffer(tr, &count);
169 trace->reset(tr);
170 tracing_start(); 369 tracing_start();
171 370
172 /* we should only have one item */ 371 /* we should only have one item */
173 if (!ret && count != 1) { 372 if (!ret && count != 1) {
373 trace->reset(tr);
174 printk(KERN_CONT ".. filter failed count=%ld ..", count); 374 printk(KERN_CONT ".. filter failed count=%ld ..", count);
175 ret = -1; 375 ret = -1;
176 goto out; 376 goto out;
177 } 377 }
178 378
379 /* Test the ops with global tracing running */
380 ret = trace_selftest_ops(1);
381 trace->reset(tr);
382
179 out: 383 out:
180 ftrace_enabled = save_ftrace_enabled; 384 ftrace_enabled = save_ftrace_enabled;
181 tracer_enabled = save_tracer_enabled; 385 tracer_enabled = save_tracer_enabled;
182 386
183 /* Enable tracing on all functions again */ 387 /* Enable tracing on all functions again */
184 ftrace_set_filter(NULL, 0, 1); 388 ftrace_set_global_filter(NULL, 0, 1);
389
390 /* Test the ops with global tracing off */
391 if (!ret)
392 ret = trace_selftest_ops(2);
185 393
186 return ret; 394 return ret;
187} 395}
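
trace_selftest_ops() relies on each ftrace_ops now carrying its own filter: probe1 matches only the first test function, probe2 only the second, probe3 both, and the counters must land exactly on those expectations. A user-space model of per-callback filtering (arrays of names in place of ftrace filter hashes):

#include <stdio.h>
#include <string.h>

struct probe {
        const char *name;
        const char *filter[2];  /* functions this probe accepts */
        int         hits;
};

static struct probe probes[] = {
        { "probe1", { "func1", NULL    }, 0 },
        { "probe2", { "func2", NULL    }, 0 },
        { "probe3", { "func1", "func2" }, 0 },
};

/* Deliver one "function entry" event to every probe whose filter matches,
 * the way ftrace dispatches to ops with per-ops filters. */
static void call_traced(const char *func)
{
        size_t i, j;

        for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
                for (j = 0; j < 2 && probes[i].filter[j]; j++)
                        if (!strcmp(probes[i].filter[j], func))
                                probes[i].hits++;
}

int main(void)
{
        size_t i;

        call_traced("func1");
        call_traced("func2");
        for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
                printf("%s: %d\n", probes[i].name, probes[i].hits);
        /* probe1: 1, probe2: 1, probe3: 2 -- the counts the selftest expects */
        return 0;
}
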
@@ -558,7 +766,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 766static int trace_wakeup_test_thread(void *data)
559{ 767{
560 /* Make this a RT thread, doesn't need to be too high */ 768 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 769 static const struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 770 struct completion *x = data;
563 771
564 sched_setscheduler(current, SCHED_FIFO, &param); 772 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 54dd77cce5bf..b4c475a0a48b 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void)
5 /* used to call mcount */ 5 /* used to call mcount */
6 return 0; 6 return 0;
7} 7}
8
9int DYN_FTRACE_TEST_NAME2(void)
10{
11 /* used to call mcount */
12 return 0;
13}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a6b7e0e0f3eb..b0b53b8e4c25 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
134{ 134{
135 .func = stack_trace_call, 135 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_GLOBAL,
136}; 137};
137 138
138static ssize_t 139static ssize_t
@@ -195,6 +196,7 @@ static const struct file_operations stack_max_size_fops = {
195 .open = tracing_open_generic, 196 .open = tracing_open_generic,
196 .read = stack_max_size_read, 197 .read = stack_max_size_read,
197 .write = stack_max_size_write, 198 .write = stack_max_size_write,
199 .llseek = default_llseek,
198}; 200};
199 201
200static void * 202static void *
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
29static struct list_head * 26static struct list_head *
30syscall_get_enter_fields(struct ftrace_event_call *call) 27syscall_get_enter_fields(struct ftrace_event_call *call)
31{ 28{
@@ -34,61 +31,66 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34 return &entry->enter_fields; 31 return &entry->enter_fields;
35} 32}
36 33
37static struct list_head *
38syscall_get_exit_fields(struct ftrace_event_call *call)
39{
40 return &syscall_exit_fields;
41}
42
43struct trace_event_functions enter_syscall_print_funcs = { 34struct trace_event_functions enter_syscall_print_funcs = {
44 .trace = print_syscall_enter, 35 .trace = print_syscall_enter,
45}; 36};
46 37
47struct trace_event_functions exit_syscall_print_funcs = { 38struct trace_event_functions exit_syscall_print_funcs = {
48 .trace = print_syscall_exit, 39 .trace = print_syscall_exit,
49}; 40};
50 41
51struct ftrace_event_class event_class_syscall_enter = { 42struct ftrace_event_class event_class_syscall_enter = {
52 .system = "syscalls", 43 .system = "syscalls",
53 .reg = syscall_enter_register, 44 .reg = syscall_enter_register,
54 .define_fields = syscall_enter_define_fields, 45 .define_fields = syscall_enter_define_fields,
55 .get_fields = syscall_get_enter_fields, 46 .get_fields = syscall_get_enter_fields,
56 .raw_init = init_syscall_trace, 47 .raw_init = init_syscall_trace,
57}; 48};
58 49
59struct ftrace_event_class event_class_syscall_exit = { 50struct ftrace_event_class event_class_syscall_exit = {
60 .system = "syscalls", 51 .system = "syscalls",
61 .reg = syscall_exit_register, 52 .reg = syscall_exit_register,
62 .define_fields = syscall_exit_define_fields, 53 .define_fields = syscall_exit_define_fields,
63 .get_fields = syscall_get_exit_fields, 54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
64 .raw_init = init_syscall_trace, 55 .raw_init = init_syscall_trace,
65}; 56};
66 57
67extern unsigned long __start_syscalls_metadata[]; 58extern struct syscall_metadata *__start_syscalls_metadata[];
68extern unsigned long __stop_syscalls_metadata[]; 59extern struct syscall_metadata *__stop_syscalls_metadata[];
69 60
70static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
71 62
72static struct syscall_metadata *find_syscall_meta(unsigned long syscall) 63#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
64static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
65{
66 /*
67 * Only compare after the "sys" prefix. Archs that use
 68 * syscall wrappers may have syscall symbol aliases prefixed
69 * with "SyS" instead of "sys", leading to an unwanted
70 * mismatch.
71 */
72 return !strcmp(sym + 3, name + 3);
73}
74#endif
75
76static __init struct syscall_metadata *
77find_syscall_meta(unsigned long syscall)
73{ 78{
74 struct syscall_metadata *start; 79 struct syscall_metadata **start;
75 struct syscall_metadata *stop; 80 struct syscall_metadata **stop;
76 char str[KSYM_SYMBOL_LEN]; 81 char str[KSYM_SYMBOL_LEN];
77 82
78 83
79 start = (struct syscall_metadata *)__start_syscalls_metadata; 84 start = __start_syscalls_metadata;
80 stop = (struct syscall_metadata *)__stop_syscalls_metadata; 85 stop = __stop_syscalls_metadata;
81 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
82 87
88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89 return NULL;
90
83 for ( ; start < stop; start++) { 91 for ( ; start < stop; start++) {
84 /* 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
85 * Only compare after the "sys" prefix. Archs that use 93 return *start;
86 * syscall wrappers may have syscalls symbols aliases prefixed
87 * with "SyS" instead of "sys", leading to an unwanted
88 * mismatch.
89 */
90 if (start->name && !strcmp(start->name + 3, str + 3))
91 return start;
92 } 94 }
93 return NULL; 95 return NULL;
94} 96}
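
arch_syscall_match_sym_name() skips the first three characters on both sides so that wrapper aliases spelled "SyS_*" still match the "sys_*" metadata name. A one-function sketch of that comparison:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Compare only after the 3-byte prefix so that a "SyS_read" wrapper alias
 * still matches the metadata name "sys_read". */
static bool syscall_match_sym_name(const char *sym, const char *name)
{
        return strcmp(sym + 3, name + 3) == 0;
}

int main(void)
{
        printf("%d\n", syscall_match_sym_name("SyS_read", "sys_read"));   /* 1 */
        printf("%d\n", syscall_match_sym_name("sys_read", "sys_write"));  /* 0 */
        return 0;
}
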
@@ -367,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
367 int num; 369 int num;
368 370
369 num = ((struct syscall_metadata *)call->data)->syscall_nr; 371 num = ((struct syscall_metadata *)call->data)->syscall_nr;
370 if (num < 0 || num >= NR_syscalls) 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
371 return -ENOSYS; 373 return -ENOSYS;
372 mutex_lock(&syscall_trace_lock); 374 mutex_lock(&syscall_trace_lock);
373 if (!sys_refcount_enter) 375 if (!sys_refcount_enter)
@@ -385,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
385 int num; 387 int num;
386 388
387 num = ((struct syscall_metadata *)call->data)->syscall_nr; 389 num = ((struct syscall_metadata *)call->data)->syscall_nr;
388 if (num < 0 || num >= NR_syscalls) 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
389 return; 391 return;
390 mutex_lock(&syscall_trace_lock); 392 mutex_lock(&syscall_trace_lock);
391 sys_refcount_enter--; 393 sys_refcount_enter--;
@@ -401,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
401 int num; 403 int num;
402 404
403 num = ((struct syscall_metadata *)call->data)->syscall_nr; 405 num = ((struct syscall_metadata *)call->data)->syscall_nr;
404 if (num < 0 || num >= NR_syscalls) 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
405 return -ENOSYS; 407 return -ENOSYS;
406 mutex_lock(&syscall_trace_lock); 408 mutex_lock(&syscall_trace_lock);
407 if (!sys_refcount_exit) 409 if (!sys_refcount_exit)
@@ -419,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
419 int num; 421 int num;
420 422
421 num = ((struct syscall_metadata *)call->data)->syscall_nr; 423 num = ((struct syscall_metadata *)call->data)->syscall_nr;
422 if (num < 0 || num >= NR_syscalls) 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
423 return; 425 return;
424 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
425 sys_refcount_exit--; 427 sys_refcount_exit--;
@@ -432,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
432int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
433{ 435{
434 int id; 436 int id;
437 int num;
438
439 num = ((struct syscall_metadata *)call->data)->syscall_nr;
440 if (num < 0 || num >= NR_syscalls) {
441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442 ((struct syscall_metadata *)call->data)->name);
443 return -ENOSYS;
444 }
435 445
436 if (set_syscall_print_fmt(call) < 0) 446 if (set_syscall_print_fmt(call) < 0)
437 return -ENOMEM; 447 return -ENOMEM;
@@ -446,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
446 return id; 456 return id;
447} 457}
448 458
449unsigned long __init arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
450{ 460{
451 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
452} 462}
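
Marking arch_syscall_addr() __weak lets an architecture supply its own strong definition while everyone else keeps the generic sys_call_table lookup. A hedged user-space illustration of the weak-symbol mechanism (the values are placeholders):

#include <stdio.h>

/* Default, overridable definition -- another object file can provide a
 * strong version of the same symbol and it will win at link time. */
__attribute__((weak)) long syscall_addr(int nr)
{
        return 0x1000 + nr;     /* placeholder table lookup */
}

int main(void)
{
        printf("%#lx\n", syscall_addr(3));
        return 0;
}
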
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
263{ 263{
264 int ret, cpu; 264 int ret, cpu;
265 265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); 271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
267 if (ret) 272 if (ret)
268 goto out; 273 goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
279 if (ret) 284 if (ret)
280 goto no_creation; 285 goto no_creation;
281 286
282 for_each_possible_cpu(cpu) {
283 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
284 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
285 }
286
287 return 0; 287 return 0;
288 288
289no_creation: 289no_creation:
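
The hunks above move the per-CPU stat initialization ahead of the tracepoint registrations, since a probe may fire as soon as it is registered and would otherwise touch uninitialized locks and lists. A small model of the ordering rule (a plain function pointer standing in for the tracepoint probe):

#include <stdio.h>

struct cpu_stat { int ready; int count; };
static struct cpu_stat stat_of[2];

static void (*probe)(int cpu);  /* installed hook, may fire as soon as it is set */

static void insertion_probe(int cpu)
{
        if (!stat_of[cpu].ready) {
                printf("probe fired before init -- would crash in the kernel\n");
                return;
        }
        stat_of[cpu].count++;
}

int main(void)
{
        int cpu;

        /* Initialize the per-CPU data first ... */
        for (cpu = 0; cpu < 2; cpu++)
                stat_of[cpu].ready = 1;

        /* ... and only then register the probe that uses it. */
        probe = insertion_probe;
        probe(0);
        printf("cpu0 count = %d\n", stat_of[0].count);
        return 0;
}
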
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea25..b219f1449c54 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,9 +25,10 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h>
28 29
29extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
30extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
31 32
32/* Set to 1 to enable tracepoint debug output */ 33/* Set to 1 to enable tracepoint debug output */
33static const int tracepoint_debug; 34static const int tracepoint_debug;
@@ -250,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
250{ 251{
251 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 252 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
252 253
253 if (elem->regfunc && !elem->state && active) 254 if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
254 elem->regfunc(); 255 elem->regfunc();
255 else if (elem->unregfunc && elem->state && !active) 256 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
256 elem->unregfunc(); 257 elem->unregfunc();
257 258
258 /* 259 /*
@@ -263,7 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
263 * is used. 264 * is used.
264 */ 265 */
265 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
266 elem->state = active; 267 if (active && !jump_label_enabled(&elem->key))
268 jump_label_inc(&elem->key);
269 else if (!active && jump_label_enabled(&elem->key))
270 jump_label_dec(&elem->key);
267} 271}
268 272
269/* 273/*
@@ -274,10 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
274 */ 278 */
275static void disable_tracepoint(struct tracepoint *elem) 279static void disable_tracepoint(struct tracepoint *elem)
276{ 280{
277 if (elem->unregfunc && elem->state) 281 if (elem->unregfunc && jump_label_enabled(&elem->key))
278 elem->unregfunc(); 282 elem->unregfunc();
279 283
280 elem->state = 0; 284 if (jump_label_enabled(&elem->key))
285 jump_label_dec(&elem->key);
281 rcu_assign_pointer(elem->funcs, NULL); 286 rcu_assign_pointer(elem->funcs, NULL);
282} 287}
283 288
@@ -288,10 +293,10 @@ static void disable_tracepoint(struct tracepoint *elem)
288 * 293 *
289 * Updates the probe callback corresponding to a range of tracepoints. 294 * Updates the probe callback corresponding to a range of tracepoints.
290 */ 295 */
291void 296void tracepoint_update_probe_range(struct tracepoint * const *begin,
292tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) 297 struct tracepoint * const *end)
293{ 298{
294 struct tracepoint *iter; 299 struct tracepoint * const *iter;
295 struct tracepoint_entry *mark_entry; 300 struct tracepoint_entry *mark_entry;
296 301
297 if (!begin) 302 if (!begin)
@@ -299,12 +304,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
299 304
300 mutex_lock(&tracepoints_mutex); 305 mutex_lock(&tracepoints_mutex);
301 for (iter = begin; iter < end; iter++) { 306 for (iter = begin; iter < end; iter++) {
302 mark_entry = get_tracepoint(iter->name); 307 mark_entry = get_tracepoint((*iter)->name);
303 if (mark_entry) { 308 if (mark_entry) {
304 set_tracepoint(&mark_entry, iter, 309 set_tracepoint(&mark_entry, *iter,
305 !!mark_entry->refcount); 310 !!mark_entry->refcount);
306 } else { 311 } else {
307 disable_tracepoint(iter); 312 disable_tracepoint(*iter);
308 } 313 }
309 } 314 }
310 mutex_unlock(&tracepoints_mutex); 315 mutex_unlock(&tracepoints_mutex);
@@ -316,8 +321,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
316static void tracepoint_update_probes(void) 321static void tracepoint_update_probes(void)
317{ 322{
318 /* Core kernel tracepoints */ 323 /* Core kernel tracepoints */
319 tracepoint_update_probe_range(__start___tracepoints, 324 tracepoint_update_probe_range(__start___tracepoints_ptrs,
320 __stop___tracepoints); 325 __stop___tracepoints_ptrs);
321 /* tracepoints in modules. */ 326 /* tracepoints in modules. */
322 module_update_tracepoints(); 327 module_update_tracepoints();
323} 328}
@@ -504,8 +509,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
504 * Will return the first tracepoint in the range if the input tracepoint is 509 * Will return the first tracepoint in the range if the input tracepoint is
505 * NULL. 510 * NULL.
506 */ 511 */
507int tracepoint_get_iter_range(struct tracepoint **tracepoint, 512int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
508 struct tracepoint *begin, struct tracepoint *end) 513 struct tracepoint * const *begin, struct tracepoint * const *end)
509{ 514{
510 if (!*tracepoint && begin != end) { 515 if (!*tracepoint && begin != end) {
511 *tracepoint = begin; 516 *tracepoint = begin;
@@ -524,7 +529,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
524 /* Core kernel tracepoints */ 529 /* Core kernel tracepoints */
525 if (!iter->module) { 530 if (!iter->module) {
526 found = tracepoint_get_iter_range(&iter->tracepoint, 531 found = tracepoint_get_iter_range(&iter->tracepoint,
527 __start___tracepoints, __stop___tracepoints); 532 __start___tracepoints_ptrs,
533 __stop___tracepoints_ptrs);
528 if (found) 534 if (found)
529 goto end; 535 goto end;
530 } 536 }
@@ -575,8 +581,8 @@ int tracepoint_module_notify(struct notifier_block *self,
575 switch (val) { 581 switch (val) {
576 case MODULE_STATE_COMING: 582 case MODULE_STATE_COMING:
577 case MODULE_STATE_GOING: 583 case MODULE_STATE_GOING:
578 tracepoint_update_probe_range(mod->tracepoints, 584 tracepoint_update_probe_range(mod->tracepoints_ptrs,
579 mod->tracepoints + mod->num_tracepoints); 585 mod->tracepoints_ptrs + mod->num_tracepoints);
580 break; 586 break;
581 } 587 }
582 return 0; 588 return 0;
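
In the tracepoint.c changes above, the boolean elem->state is replaced by a reference-counted jump-label key, and set_tracepoint()/disable_tracepoint() only bump or drop the key when the enabled state actually flips. A small userspace model of that invariant, using a plain counter instead of the kernel's jump_label API:

    #include <stdio.h>
    #include <stdbool.h>

    /* Stand-in for the jump-label key: enabled iff the count is non-zero. */
    struct key { int count; };

    static bool key_enabled(struct key *k) { return k->count > 0; }
    static void key_inc(struct key *k)     { k->count++; }
    static void key_dec(struct key *k)     { k->count--; }

    /* Mirrors the new logic: change the key only on an actual transition,
     * so repeated calls with the same 'active' value are idempotent. */
    static void set_active(struct key *k, bool active)
    {
            if (active && !key_enabled(k))
                    key_inc(k);
            else if (!active && key_enabled(k))
                    key_dec(k);
    }

    int main(void)
    {
            struct key k = { 0 };

            set_active(&k, true);
            set_active(&k, true);    /* no double increment */
            printf("enabled=%d count=%d\n", key_enabled(&k), k.count);  /* 1 1 */

            set_active(&k, false);
            set_active(&k, false);   /* no underflow */
            printf("enabled=%d count=%d\n", key_enabled(&k), k.count);  /* 0 0 */
            return 0;
    }
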
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
63 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
65 rcu_read_unlock(); 65 rcu_read_unlock();
66 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
68 stats->ac_utimescaled = 68 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
69 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; 69 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
70 stats->ac_stimescaled =
71 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
72 stats->ac_minflt = tsk->min_flt; 70 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 71 stats->ac_majflt = tsk->maj_flt;
74 72
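
The tsacct.c hunk fixes a precision bug: converting to milliseconds first and then scaling back to microseconds throws away everything below a millisecond, while converting straight to microseconds keeps it. A tiny arithmetic check, assuming a cputime value corresponding to 1500 microseconds:

    #include <stdio.h>

    #define USEC_PER_MSEC 1000ULL

    int main(void)
    {
            /* Pretend the task accumulated 1500 microseconds of CPU time. */
            unsigned long long usecs = 1500;

            unsigned long long old_way = (usecs / 1000) * USEC_PER_MSEC; /* to_msecs() then scale */
            unsigned long long new_way = usecs;                          /* to_usecs() directly   */

            printf("old: %llu us, new: %llu us\n", old_way, new_way);    /* old: 1000, new: 1500 */
            return 0;
    }
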
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
189 struct group_info *group_info; 189 struct group_info *group_info;
190 int retval; 190 int retval;
191 191
192 if (!capable(CAP_SETGID)) 192 if (!nsown_capable(CAP_SETGID))
193 return -EPERM; 193 return -EPERM;
194 if ((unsigned)gidsetsize > NGROUPS_MAX) 194 if ((unsigned)gidsetsize > NGROUPS_MAX)
195 return -EINVAL; 195 return -EINVAL;
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index eb27fd3430a2..92cb706c7fc8 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
20 20
21/* 21/*
22 * Removes a registered user return notifier. Must be called from atomic 22 * Removes a registered user return notifier. Must be called from atomic
23 * context, and from the same cpu registration occured in. 23 * context, and from the same cpu registration occurred in.
24 */ 24 */
25void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{ 26{
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/*
21 * userns count is 1 for root user, 1 for init_uts_ns,
22 * and 1 for... ?
23 */
20struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
21 .kref = { 25 .kref = {
22 .refcount = ATOMIC_INIT(2), 26 .refcount = ATOMIC_INIT(3),
23 }, 27 },
24 .creator = &root_user, 28 .creator = &root_user,
25}; 29};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
47 */ 51 */
48static DEFINE_SPINLOCK(uidhash_lock); 52static DEFINE_SPINLOCK(uidhash_lock);
49 53
50/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ 54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
51struct user_struct root_user = { 55struct user_struct root_user = {
52 .__count = ATOMIC_INIT(2), 56 .__count = ATOMIC_INIT(2),
53 .processes = ATOMIC_INIT(1), 57 .processes = ATOMIC_INIT(1),
@@ -91,6 +95,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
91 * upon function exit. 95 * upon function exit.
92 */ 96 */
93static void free_user(struct user_struct *up, unsigned long flags) 97static void free_user(struct user_struct *up, unsigned long flags)
98 __releases(&uidhash_lock)
94{ 99{
95 uid_hash_remove(up); 100 uid_hash_remove(up);
96 spin_unlock_irqrestore(&uidhash_lock, flags); 101 spin_unlock_irqrestore(&uidhash_lock, flags);
@@ -157,6 +162,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
157 spin_lock_irq(&uidhash_lock); 162 spin_lock_irq(&uidhash_lock);
158 up = uid_hash_find(uid, hashent); 163 up = uid_hash_find(uid, hashent);
159 if (up) { 164 if (up) {
165 put_user_ns(ns);
160 key_put(new->uid_keyring); 166 key_put(new->uid_keyring);
161 key_put(new->session_keyring); 167 key_put(new->session_keyring);
162 kmem_cache_free(uid_cachep, new); 168 kmem_cache_free(uid_cachep, new);
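
The user.c hunk adds the missing put_user_ns() on the lookup-won-the-race path: alloc_uid() takes a namespace reference and allocates a new entry speculatively, re-checks the hash under the lock, and when another CPU inserted first it has to give back everything it took, including that namespace reference. A userspace model of the lose-the-race path (toy types and names, not the kernel structures):

    #include <stdio.h>
    #include <stdlib.h>

    struct ns  { int refcount; };
    struct uid { int uid; struct ns *ns; };

    static struct ns  the_ns = { 1 };
    static struct uid cached = { 1000, &the_ns };   /* entry a racing CPU inserted first */

    static struct ns *get_ns(struct ns *ns) { ns->refcount++; return ns; }
    static void       put_ns(struct ns *ns) { ns->refcount--; }

    static struct uid *alloc_uid(struct ns *ns, int uid)
    {
            struct uid *new = malloc(sizeof(*new));

            new->uid = uid;
            new->ns  = get_ns(ns);                  /* reference taken up front */

            if (cached.uid == uid) {                /* re-check "under the lock" */
                    put_ns(ns);                     /* lost the race: drop the ref ... */
                    free(new);                      /* ... and the speculative allocation */
                    return &cached;
            }
            return new;
    }

    int main(void)
    {
            struct uid *up = alloc_uid(&the_ns, 1000);

            printf("reused existing: %d, ns refcount: %d\n",
                   up == &cached, the_ns.refcount);        /* 1, 1 */
            return 0;
    }
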
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14 14
15static struct kmem_cache *user_ns_cachep __read_mostly;
16
15/* 17/*
16 * Create a new user namespace, deriving the creator from the user in the 18 * Create a new user namespace, deriving the creator from the user in the
17 * passed credentials, and replacing that user with the new root user for the 19 * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
26 struct user_struct *root_user; 28 struct user_struct *root_user;
27 int n; 29 int n;
28 30
29 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
30 if (!ns) 32 if (!ns)
31 return -ENOMEM; 33 return -ENOMEM;
32 34
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
38 /* Alloc new root user. */ 40 /* Alloc new root user. */
39 root_user = alloc_uid(ns, 0); 41 root_user = alloc_uid(ns, 0);
40 if (!root_user) { 42 if (!root_user) {
41 kfree(ns); 43 kmem_cache_free(user_ns_cachep, ns);
42 return -ENOMEM; 44 return -ENOMEM;
43 } 45 }
44 46
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
71 struct user_namespace *ns = 73 struct user_namespace *ns =
72 container_of(work, struct user_namespace, destroyer); 74 container_of(work, struct user_namespace, destroyer);
73 free_uid(ns->creator); 75 free_uid(ns->creator);
74 kfree(ns); 76 kmem_cache_free(user_ns_cachep, ns);
75} 77}
76 78
77void free_user_ns(struct kref *kref) 79void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
126 /* No useful relationship so no mapping */ 128 /* No useful relationship so no mapping */
127 return overflowgid; 129 return overflowgid;
128} 130}
131
132static __init int user_namespaces_init(void)
133{
134 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
135 return 0;
136}
137module_init(user_namespaces_init);
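
The user_namespace.c change swaps kmalloc()/kfree() for a dedicated slab cache, which gives the allocator fixed-size, type-tracked objects. The shape of that pattern, as a kernel-style fragment (not buildable on its own; struct foo and the init hook are placeholders):

    #include <linux/slab.h>
    #include <linux/module.h>
    #include <linux/init.h>

    struct foo { int payload; };

    static struct kmem_cache *foo_cachep __read_mostly;

    static struct foo *foo_alloc(void)
    {
            return kmem_cache_alloc(foo_cachep, GFP_KERNEL);   /* was: kmalloc(sizeof(struct foo), ...) */
    }

    static void foo_free(struct foo *f)
    {
            kmem_cache_free(foo_cachep, f);                    /* was: kfree(f) */
    }

    static int __init foo_cache_init(void)
    {
            /* KMEM_CACHE() derives name, size and alignment from the struct;
             * SLAB_PANIC makes boot fail loudly if the cache cannot be created. */
            foo_cachep = KMEM_CACHE(foo, SLAB_PANIC);
            return 0;
    }
    module_init(foo_cache_init);
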
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,8 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h>
17 19
18static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
19{ 21{
@@ -30,7 +32,8 @@ static struct uts_namespace *create_uts_ns(void)
30 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
31 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return NULL on error (failure to kmalloc), new ns otherwise
32 */ 34 */
33static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) 35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
36 struct uts_namespace *old_ns)
34{ 37{
35 struct uts_namespace *ns; 38 struct uts_namespace *ns;
36 39
@@ -40,6 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
40 43
41 down_read(&uts_sem); 44 down_read(&uts_sem);
42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
43 up_read(&uts_sem); 47 up_read(&uts_sem);
44 return ns; 48 return ns;
45} 49}
@@ -50,8 +54,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
50 * utsname of this process won't be seen by parent, and vice 54 * utsname of this process won't be seen by parent, and vice
51 * versa. 55 * versa.
52 */ 56 */
53struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) 57struct uts_namespace *copy_utsname(unsigned long flags,
58 struct task_struct *tsk)
54{ 59{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
55 struct uts_namespace *new_ns; 61 struct uts_namespace *new_ns;
56 62
57 BUG_ON(!old_ns); 63 BUG_ON(!old_ns);
@@ -60,7 +66,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
60 if (!(flags & CLONE_NEWUTS)) 66 if (!(flags & CLONE_NEWUTS))
61 return old_ns; 67 return old_ns;
62 68
63 new_ns = clone_uts_ns(old_ns); 69 new_ns = clone_uts_ns(tsk, old_ns);
64 70
65 put_uts_ns(old_ns); 71 put_uts_ns(old_ns);
66 return new_ns; 72 return new_ns;
@@ -71,5 +77,44 @@ void free_uts_ns(struct kref *kref)
71 struct uts_namespace *ns; 77 struct uts_namespace *ns;
72 78
73 ns = container_of(kref, struct uts_namespace, kref); 79 ns = container_of(kref, struct uts_namespace, kref);
80 put_user_ns(ns->user_ns);
74 kfree(ns); 81 kfree(ns);
75} 82}
83
84static void *utsns_get(struct task_struct *task)
85{
86 struct uts_namespace *ns = NULL;
87 struct nsproxy *nsproxy;
88
89 rcu_read_lock();
90 nsproxy = task_nsproxy(task);
91 if (nsproxy) {
92 ns = nsproxy->uts_ns;
93 get_uts_ns(ns);
94 }
95 rcu_read_unlock();
96
97 return ns;
98}
99
100static void utsns_put(void *ns)
101{
102 put_uts_ns(ns);
103}
104
105static int utsns_install(struct nsproxy *nsproxy, void *ns)
106{
107 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns;
110 return 0;
111}
112
113const struct proc_ns_operations utsns_operations = {
114 .name = "uts",
115 .type = CLONE_NEWUTS,
116 .get = utsns_get,
117 .put = utsns_put,
118 .install = utsns_install,
119};
120
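
The new utsns_operations table added above is essentially a refcount discipline behind three callbacks: get takes a reference on the task's current namespace, put drops one, and install swaps the nsproxy pointer while keeping the counts balanced. A compact userspace model of that contract (toy types, not the kernel structures):

    #include <stdio.h>

    struct ns      { int refcount; const char *name; };
    struct nsproxy { struct ns *uts_ns; };

    struct ns_ops {
            struct ns *(*get)(struct nsproxy *p);
            void       (*put)(struct ns *ns);
            int        (*install)(struct nsproxy *p, struct ns *ns);
    };

    static struct ns *ns_get(struct nsproxy *p) { p->uts_ns->refcount++; return p->uts_ns; }
    static void       ns_put(struct ns *ns)     { ns->refcount--; }

    static int ns_install(struct nsproxy *p, struct ns *ns)
    {
            ns->refcount++;            /* new namespace gains the proxy's reference */
            p->uts_ns->refcount--;     /* old namespace loses it */
            p->uts_ns = ns;
            return 0;
    }

    static const struct ns_ops uts_ops = { ns_get, ns_put, ns_install };

    int main(void)
    {
            struct ns a = { 1, "init_uts" }, b = { 1, "container_uts" };
            struct nsproxy proxy = { &a };

            struct ns *held = uts_ops.get(&proxy);     /* e.g. opening /proc/<pid>/ns/uts */
            uts_ops.install(&proxy, &b);               /* e.g. switching into the other ns */
            uts_ops.put(held);                         /* done with the handle             */

            printf("proxy uses %s, refcounts: a=%d b=%d\n",
                   proxy.uts_ns->name, a.refcount, b.refcount);   /* container_uts, a=0 b=2 */
            return 0;
    }
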
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d825f35..f45ea8d2a1ce 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95/* 95/**
96 * finish_wait - clean up after waiting in a queue 96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 97 * @q: waitqueue waited on
98 * @wait: wait descriptor 98 * @wait: wait descriptor
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
127} 127}
128EXPORT_SYMBOL(finish_wait); 128EXPORT_SYMBOL(finish_wait);
129 129
130/* 130/**
131 * abort_exclusive_wait - abort exclusive waiting in a queue 131 * abort_exclusive_wait - abort exclusive waiting in a queue
132 * @q: waitqueue waited on 132 * @q: waitqueue waited on
133 * @wait: wait descriptor 133 * @wait: wait descriptor
134 * @state: runstate of the waiter to be woken 134 * @mode: runstate of the waiter to be woken
135 * @key: key to identify a wait bit queue or %NULL 135 * @key: key to identify a wait bit queue or %NULL
136 * 136 *
137 * Sets current thread back to running state and removes 137 * Sets current thread back to running state and removes
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait);
142 * woken up through the queue. 142 * woken up through the queue.
143 * 143 *
144 * This prevents waiter starvation where an exclusive waiter 144 * This prevents waiter starvation where an exclusive waiter
145 * aborts and is woken up concurrently and noone wakes up 145 * aborts and is woken up concurrently and no one wakes up
146 * the next waiter. 146 * the next waiter.
147 */ 147 */
148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, 148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9c3c52ecc1..3d0c56ad4792 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,8 +27,8 @@
27#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
28#include <linux/perf_event.h> 28#include <linux/perf_event.h>
29 29
30int watchdog_enabled; 30int watchdog_enabled = 1;
31int __read_mostly softlockup_thresh = 60; 31int __read_mostly watchdog_thresh = 10;
32 32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -43,21 +43,22 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog;
48
49
50/* boot commands */ 46/* boot commands */
51/* 47/*
52 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
53 */ 49 */
54#ifdef CONFIG_HARDLOCKUP_DETECTOR 50#ifdef CONFIG_HARDLOCKUP_DETECTOR
55static int hardlockup_panic; 51static int hardlockup_panic =
52 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
56 53
57static int __init hardlockup_panic_setup(char *str) 54static int __init hardlockup_panic_setup(char *str)
58{ 55{
59 if (!strncmp(str, "panic", 5)) 56 if (!strncmp(str, "panic", 5))
60 hardlockup_panic = 1; 57 hardlockup_panic = 1;
58 else if (!strncmp(str, "nopanic", 7))
59 hardlockup_panic = 0;
60 else if (!strncmp(str, "0", 1))
61 watchdog_enabled = 0;
61 return 1; 62 return 1;
62} 63}
63__setup("nmi_watchdog=", hardlockup_panic_setup); 64__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -76,7 +77,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
76 77
77static int __init nowatchdog_setup(char *str) 78static int __init nowatchdog_setup(char *str)
78{ 79{
79 no_watchdog = 1; 80 watchdog_enabled = 0;
80 return 1; 81 return 1;
81} 82}
82__setup("nowatchdog", nowatchdog_setup); 83__setup("nowatchdog", nowatchdog_setup);
@@ -84,12 +85,23 @@ __setup("nowatchdog", nowatchdog_setup);
84/* deprecated */ 85/* deprecated */
85static int __init nosoftlockup_setup(char *str) 86static int __init nosoftlockup_setup(char *str)
86{ 87{
87 no_watchdog = 1; 88 watchdog_enabled = 0;
88 return 1; 89 return 1;
89} 90}
90__setup("nosoftlockup", nosoftlockup_setup); 91__setup("nosoftlockup", nosoftlockup_setup);
91/* */ 92/* */
92 93
94/*
95 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
96 * lockups can have false positives under extreme conditions. So we generally
97 * want a higher threshold for soft lockups than for hard lockups. So we couple
98 * the thresholds with a factor: we make the soft threshold twice the amount of
99 * time the hard threshold is.
100 */
101static int get_softlockup_thresh(void)
102{
103 return watchdog_thresh * 2;
104}
93 105
94/* 106/*
95 * Returns seconds, approximately. We don't need nanosecond 107 * Returns seconds, approximately. We don't need nanosecond
@@ -104,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu)
104static unsigned long get_sample_period(void) 116static unsigned long get_sample_period(void)
105{ 117{
106 /* 118 /*
107 * convert softlockup_thresh from seconds to ns 119 * convert watchdog_thresh from seconds to ns
108 * the divide by 5 is to give hrtimer 5 chances to 120 * the divide by 5 is to give hrtimer 5 chances to
109 * increment before the hardlockup detector generates 121 * increment before the hardlockup detector generates
110 * a warning 122 * a warning
111 */ 123 */
112 return softlockup_thresh / 5 * NSEC_PER_SEC; 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
113} 125}
114 126
115/* Commands for resetting the watchdog */ 127/* Commands for resetting the watchdog */
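
With the single watchdog_thresh knob, the soft-lockup threshold is derived as twice the hard-lockup threshold, and the hrtimer sample period is a fifth of that so the hard-lockup detector gets five chances to see the hrtimer count move. Note the rewritten expression multiplies by NSEC_PER_SEC/5 instead of dividing the threshold by 5 first, which would truncate to zero for small thresholds. Worked out for the default of 10 seconds:

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
            unsigned long long watchdog_thresh = 10;                 /* new default, seconds */

            unsigned long long soft_thresh   = watchdog_thresh * 2;  /* get_softlockup_thresh() */
            unsigned long long sample_period = soft_thresh * (NSEC_PER_SEC / 5);

            printf("soft-lockup threshold: %llus\n", soft_thresh);   /* 20s */
            printf("sample period: %llu ns (%llus)\n",
                   sample_period, sample_period / NSEC_PER_SEC);     /* 4000000000 ns, 4s */

            /* Old-style ordering for comparison: thresh / 5 * NSEC_PER_SEC
             * truncates to zero for any threshold below 5 seconds. */
            unsigned long long old_style = 3 / 5 * NSEC_PER_SEC;
            printf("old ordering with a 3s threshold: %llu ns\n", old_style);  /* 0 */
            return 0;
    }
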
@@ -117,12 +129,12 @@ static void __touch_watchdog(void)
117{ 129{
118 int this_cpu = smp_processor_id(); 130 int this_cpu = smp_processor_id();
119 131
120 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); 132 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
121} 133}
122 134
123void touch_softlockup_watchdog(void) 135void touch_softlockup_watchdog(void)
124{ 136{
125 __raw_get_cpu_var(watchdog_touch_ts) = 0; 137 __this_cpu_write(watchdog_touch_ts, 0);
126} 138}
127EXPORT_SYMBOL(touch_softlockup_watchdog); 139EXPORT_SYMBOL(touch_softlockup_watchdog);
128 140
@@ -166,12 +178,12 @@ void touch_softlockup_watchdog_sync(void)
166/* watchdog detector functions */ 178/* watchdog detector functions */
167static int is_hardlockup(void) 179static int is_hardlockup(void)
168{ 180{
169 unsigned long hrint = __get_cpu_var(hrtimer_interrupts); 181 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
170 182
171 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) 183 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
172 return 1; 184 return 1;
173 185
174 __get_cpu_var(hrtimer_interrupts_saved) = hrint; 186 __this_cpu_write(hrtimer_interrupts_saved, hrint);
175 return 0; 187 return 0;
176} 188}
177#endif 189#endif
@@ -181,24 +193,12 @@ static int is_softlockup(unsigned long touch_ts)
181 unsigned long now = get_timestamp(smp_processor_id()); 193 unsigned long now = get_timestamp(smp_processor_id());
182 194
183 /* Warn about unreasonable delays: */ 195 /* Warn about unreasonable delays: */
184 if (time_after(now, touch_ts + softlockup_thresh)) 196 if (time_after(now, touch_ts + get_softlockup_thresh()))
185 return now - touch_ts; 197 return now - touch_ts;
186 198
187 return 0; 199 return 0;
188} 200}
189 201
190static int
191watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
192{
193 did_panic = 1;
194
195 return NOTIFY_DONE;
196}
197
198static struct notifier_block panic_block = {
199 .notifier_call = watchdog_panic,
200};
201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR 202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203static struct perf_event_attr wd_hw_attr = { 203static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE, 204 .type = PERF_TYPE_HARDWARE,
@@ -209,15 +209,15 @@ static struct perf_event_attr wd_hw_attr = {
209}; 209};
210 210
211/* Callback function for perf event subsystem */ 211/* Callback function for perf event subsystem */
212void watchdog_overflow_callback(struct perf_event *event, int nmi, 212static void watchdog_overflow_callback(struct perf_event *event, int nmi,
213 struct perf_sample_data *data, 213 struct perf_sample_data *data,
214 struct pt_regs *regs) 214 struct pt_regs *regs)
215{ 215{
216 /* Ensure the watchdog never gets throttled */ 216 /* Ensure the watchdog never gets throttled */
217 event->hw.interrupts = 0; 217 event->hw.interrupts = 0;
218 218
219 if (__get_cpu_var(watchdog_nmi_touch) == true) { 219 if (__this_cpu_read(watchdog_nmi_touch) == true) {
220 __get_cpu_var(watchdog_nmi_touch) = false; 220 __this_cpu_write(watchdog_nmi_touch, false);
221 return; 221 return;
222 } 222 }
223 223
@@ -231,7 +231,7 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi,
231 int this_cpu = smp_processor_id(); 231 int this_cpu = smp_processor_id();
232 232
233 /* only print hardlockups once */ 233 /* only print hardlockups once */
234 if (__get_cpu_var(hard_watchdog_warn) == true) 234 if (__this_cpu_read(hard_watchdog_warn) == true)
235 return; 235 return;
236 236
237 if (hardlockup_panic) 237 if (hardlockup_panic)
@@ -239,16 +239,16 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi,
239 else 239 else
240 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 240 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
241 241
242 __get_cpu_var(hard_watchdog_warn) = true; 242 __this_cpu_write(hard_watchdog_warn, true);
243 return; 243 return;
244 } 244 }
245 245
246 __get_cpu_var(hard_watchdog_warn) = false; 246 __this_cpu_write(hard_watchdog_warn, false);
247 return; 247 return;
248} 248}
249static void watchdog_interrupt_count(void) 249static void watchdog_interrupt_count(void)
250{ 250{
251 __get_cpu_var(hrtimer_interrupts)++; 251 __this_cpu_inc(hrtimer_interrupts);
252} 252}
253#else 253#else
254static inline void watchdog_interrupt_count(void) { return; } 254static inline void watchdog_interrupt_count(void) { return; }
@@ -257,7 +257,7 @@ static inline void watchdog_interrupt_count(void) { return; }
257/* watchdog kicker functions */ 257/* watchdog kicker functions */
258static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 258static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
259{ 259{
260 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); 260 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
261 struct pt_regs *regs = get_irq_regs(); 261 struct pt_regs *regs = get_irq_regs();
262 int duration; 262 int duration;
263 263
@@ -265,18 +265,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
265 watchdog_interrupt_count(); 265 watchdog_interrupt_count();
266 266
267 /* kick the softlockup detector */ 267 /* kick the softlockup detector */
268 wake_up_process(__get_cpu_var(softlockup_watchdog)); 268 wake_up_process(__this_cpu_read(softlockup_watchdog));
269 269
270 /* .. and repeat */ 270 /* .. and repeat */
271 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 271 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
272 272
273 if (touch_ts == 0) { 273 if (touch_ts == 0) {
274 if (unlikely(__get_cpu_var(softlockup_touch_sync))) { 274 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
275 /* 275 /*
276 * If the time stamp was touched atomically 276 * If the time stamp was touched atomically
277 * make sure the scheduler tick is up to date. 277 * make sure the scheduler tick is up to date.
278 */ 278 */
279 __get_cpu_var(softlockup_touch_sync) = false; 279 __this_cpu_write(softlockup_touch_sync, false);
280 sched_clock_tick(); 280 sched_clock_tick();
281 } 281 }
282 __touch_watchdog(); 282 __touch_watchdog();
@@ -292,7 +292,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
292 duration = is_softlockup(touch_ts); 292 duration = is_softlockup(touch_ts);
293 if (unlikely(duration)) { 293 if (unlikely(duration)) {
294 /* only warn once */ 294 /* only warn once */
295 if (__get_cpu_var(soft_watchdog_warn) == true) 295 if (__this_cpu_read(soft_watchdog_warn) == true)
296 return HRTIMER_RESTART; 296 return HRTIMER_RESTART;
297 297
298 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 298 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
@@ -307,9 +307,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
307 307
308 if (softlockup_panic) 308 if (softlockup_panic)
309 panic("softlockup: hung tasks"); 309 panic("softlockup: hung tasks");
310 __get_cpu_var(soft_watchdog_warn) = true; 310 __this_cpu_write(soft_watchdog_warn, true);
311 } else 311 } else
312 __get_cpu_var(soft_watchdog_warn) = false; 312 __this_cpu_write(soft_watchdog_warn, false);
313 313
314 return HRTIMER_RESTART; 314 return HRTIMER_RESTART;
315} 315}
@@ -320,7 +320,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
320 */ 320 */
321static int watchdog(void *unused) 321static int watchdog(void *unused)
322{ 322{
323 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 323 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
324 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 324 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
325 325
326 sched_setscheduler(current, SCHED_FIFO, &param); 326 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -370,15 +370,22 @@ static int watchdog_nmi_enable(int cpu)
370 370
371 /* Try to register using hardware perf events */ 371 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period(); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
374 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); 374 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
375 if (!IS_ERR(event)) { 375 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save; 377 goto out_save;
378 } 378 }
379 379
380 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 380
381 return -1; 381 /* vary the KERN level based on the returned errno */
382 if (PTR_ERR(event) == -EOPNOTSUPP)
383 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
384 else if (PTR_ERR(event) == -ENOENT)
385 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
386 else
387 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
388 return PTR_ERR(event);
382 389
383 /* success path */ 390 /* success path */
384out_save: 391out_save:
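
Instead of one KERN_ERR line with a raw pointer, the failure path above now inspects the error embedded in the ERR_PTR and picks a log level to match: missing hardware support is merely informational, a missing event is a warning, and anything else stays an error. A userspace model of that mapping (errno values only, no perf API):

    #include <stdio.h>
    #include <errno.h>

    static const char *nmi_watchdog_severity(long err)
    {
            switch (err) {
            case -EOPNOTSUPP:
                    return "INFO: not supported (no LAPIC?)";
            case -ENOENT:
                    return "WARNING: hardware events not enabled";
            default:
                    return "ERR: unable to create perf event";
            }
    }

    int main(void)
    {
            long errs[] = { -EOPNOTSUPP, -ENOENT, -EINVAL };

            for (int i = 0; i < 3; i++)
                    printf("err %ld -> %s\n", errs[i], nmi_watchdog_severity(errs[i]));
            return 0;
    }
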
@@ -408,31 +415,37 @@ static void watchdog_nmi_disable(int cpu) { return; }
408#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 415#endif /* CONFIG_HARDLOCKUP_DETECTOR */
409 416
410/* prepare/enable/disable routines */ 417/* prepare/enable/disable routines */
411static int watchdog_prepare_cpu(int cpu) 418static void watchdog_prepare_cpu(int cpu)
412{ 419{
413 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); 420 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
414 421
415 WARN_ON(per_cpu(softlockup_watchdog, cpu)); 422 WARN_ON(per_cpu(softlockup_watchdog, cpu));
416 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 423 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
417 hrtimer->function = watchdog_timer_fn; 424 hrtimer->function = watchdog_timer_fn;
418
419 return 0;
420} 425}
421 426
422static int watchdog_enable(int cpu) 427static int watchdog_enable(int cpu)
423{ 428{
424 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 429 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
430 int err = 0;
425 431
426 /* enable the perf event */ 432 /* enable the perf event */
427 if (watchdog_nmi_enable(cpu) != 0) 433 err = watchdog_nmi_enable(cpu);
428 return -1; 434
435 /* Regardless of err above, fall through and start softlockup */
429 436
430 /* create the watchdog thread */ 437 /* create the watchdog thread */
431 if (!p) { 438 if (!p) {
432 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 439 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
433 if (IS_ERR(p)) { 440 if (IS_ERR(p)) {
434 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 441 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
435 return -1; 442 if (!err) {
443 /* if hardlockup hasn't already set this */
444 err = PTR_ERR(p);
445 /* and disable the perf event */
446 watchdog_nmi_disable(cpu);
447 }
448 goto out;
436 } 449 }
437 kthread_bind(p, cpu); 450 kthread_bind(p, cpu);
438 per_cpu(watchdog_touch_ts, cpu) = 0; 451 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -440,10 +453,8 @@ static int watchdog_enable(int cpu)
440 wake_up_process(p); 453 wake_up_process(p);
441 } 454 }
442 455
443 /* if any cpu succeeds, watchdog is considered enabled for the system */ 456out:
444 watchdog_enabled = 1; 457 return err;
445
446 return 0;
447} 458}
448 459
449static void watchdog_disable(int cpu) 460static void watchdog_disable(int cpu)
@@ -470,12 +481,16 @@ static void watchdog_disable(int cpu)
470static void watchdog_enable_all_cpus(void) 481static void watchdog_enable_all_cpus(void)
471{ 482{
472 int cpu; 483 int cpu;
473 int result = 0; 484
485 watchdog_enabled = 0;
474 486
475 for_each_online_cpu(cpu) 487 for_each_online_cpu(cpu)
476 result += watchdog_enable(cpu); 488 if (!watchdog_enable(cpu))
489 /* if any cpu succeeds, watchdog is considered
490 enabled for the system */
491 watchdog_enabled = 1;
477 492
478 if (result) 493 if (!watchdog_enabled)
479 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 494 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
480 495
481} 496}
@@ -495,26 +510,25 @@ static void watchdog_disable_all_cpus(void)
495/* sysctl functions */ 510/* sysctl functions */
496#ifdef CONFIG_SYSCTL 511#ifdef CONFIG_SYSCTL
497/* 512/*
498 * proc handler for /proc/sys/kernel/nmi_watchdog 513 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
499 */ 514 */
500 515
501int proc_dowatchdog_enabled(struct ctl_table *table, int write, 516int proc_dowatchdog(struct ctl_table *table, int write,
502 void __user *buffer, size_t *length, loff_t *ppos) 517 void __user *buffer, size_t *lenp, loff_t *ppos)
503{ 518{
504 proc_dointvec(table, write, buffer, length, ppos); 519 int ret;
520
521 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
522 if (ret || !write)
523 goto out;
505 524
506 if (watchdog_enabled) 525 if (watchdog_enabled && watchdog_thresh)
507 watchdog_enable_all_cpus(); 526 watchdog_enable_all_cpus();
508 else 527 else
509 watchdog_disable_all_cpus(); 528 watchdog_disable_all_cpus();
510 return 0;
511}
512 529
513int proc_dowatchdog_thresh(struct ctl_table *table, int write, 530out:
514 void __user *buffer, 531 return ret;
515 size_t *lenp, loff_t *ppos)
516{
517 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
518} 532}
519#endif /* CONFIG_SYSCTL */ 533#endif /* CONFIG_SYSCTL */
520 534
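
The two separate proc handlers collapse into one above: parse the write with the range-checked helper first, and only if that succeeded and it really was a write, re-evaluate whether the watchdogs should run (both the enable flag and a non-zero threshold are required). A userspace sketch of that control flow, with plain ints in place of the ctl_table plumbing:

    #include <stdio.h>

    static int watchdog_enabled = 1;
    static int watchdog_thresh  = 10;
    static int watchdog_running;

    static void watchdog_enable_all(void)  { watchdog_running = 1; }
    static void watchdog_disable_all(void) { watchdog_running = 0; }

    /* Mirrors the merged handler: validate and store first, then act
     * only on a successful write; reads change nothing. */
    static int do_watchdog_knob(int *knob, int value, int write, int min, int max)
    {
            if (!write)
                    return 0;

            if (value < min || value > max)
                    return -1;          /* proc_dointvec_minmax() would reject it */
            *knob = value;

            if (watchdog_enabled && watchdog_thresh)
                    watchdog_enable_all();
            else
                    watchdog_disable_all();
            return 0;
    }

    int main(void)
    {
            do_watchdog_knob(&watchdog_thresh, 0, 1, 0, 60);    /* thresh=0 turns it off */
            printf("running=%d\n", watchdog_running);           /* 0 */

            do_watchdog_knob(&watchdog_thresh, 10, 1, 0, 60);   /* back on */
            printf("running=%d\n", watchdog_running);           /* 1 */
            return 0;
    }
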
@@ -530,13 +544,12 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
530 switch (action) { 544 switch (action) {
531 case CPU_UP_PREPARE: 545 case CPU_UP_PREPARE:
532 case CPU_UP_PREPARE_FROZEN: 546 case CPU_UP_PREPARE_FROZEN:
533 if (watchdog_prepare_cpu(hotcpu)) 547 watchdog_prepare_cpu(hotcpu);
534 return NOTIFY_BAD;
535 break; 548 break;
536 case CPU_ONLINE: 549 case CPU_ONLINE:
537 case CPU_ONLINE_FROZEN: 550 case CPU_ONLINE_FROZEN:
538 if (watchdog_enable(hotcpu)) 551 if (watchdog_enabled)
539 return NOTIFY_BAD; 552 watchdog_enable(hotcpu);
540 break; 553 break;
541#ifdef CONFIG_HOTPLUG_CPU 554#ifdef CONFIG_HOTPLUG_CPU
542 case CPU_UP_CANCELED: 555 case CPU_UP_CANCELED:
@@ -549,6 +562,12 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
549 break; 562 break;
550#endif /* CONFIG_HOTPLUG_CPU */ 563#endif /* CONFIG_HOTPLUG_CPU */
551 } 564 }
565
566 /*
567 * hardlockup and softlockup are not important enough
568 * to block cpu bring up. Just always succeed and
569 * rely on printk output to flag problems.
570 */
552 return NOTIFY_OK; 571 return NOTIFY_OK;
553} 572}
554 573
@@ -556,22 +575,16 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
556 .notifier_call = cpu_callback 575 .notifier_call = cpu_callback
557}; 576};
558 577
559static int __init spawn_watchdog_task(void) 578void __init lockup_detector_init(void)
560{ 579{
561 void *cpu = (void *)(long)smp_processor_id(); 580 void *cpu = (void *)(long)smp_processor_id();
562 int err; 581 int err;
563 582
564 if (no_watchdog)
565 return 0;
566
567 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 583 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
568 WARN_ON(err == NOTIFY_BAD); 584 WARN_ON(notifier_to_errno(err));
569 585
570 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 586 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
571 register_cpu_notifier(&cpu_nfb); 587 register_cpu_notifier(&cpu_nfb);
572 588
573 atomic_notifier_chain_register(&panic_notifier_list, &panic_block); 589 return;
574
575 return 0;
576} 590}
577early_initcall(spawn_watchdog_task);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f77afd939229..0400553f0d04 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -42,9 +42,6 @@
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44 44
45#define CREATE_TRACE_POINTS
46#include <trace/events/workqueue.h>
47
48#include "workqueue_sched.h" 45#include "workqueue_sched.h"
49 46
50enum { 47enum {
@@ -82,7 +79,9 @@ enum {
82 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 79 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
83 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ 80 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
84 81
85 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ 82 MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
83 /* call for help after 10ms
84 (min two ticks) */
86 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
87 CREATE_COOLDOWN = HZ, /* time to breath after fail */ 86 CREATE_COOLDOWN = HZ, /* time to breath after fail */
88 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ 87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
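
The mayday timeout above is nominally HZ/100 (10 ms), but with a small HZ that integer division can come out below two ticks, and a one-tick timer may expire almost immediately; the conditional clamps it. A quick check for common HZ values:

    #include <stdio.h>

    static long mayday_initial_timeout(long hz)
    {
            return hz / 100 >= 2 ? hz / 100 : 2;     /* at least two ticks */
    }

    int main(void)
    {
            long hzs[] = { 100, 250, 300, 1000 };

            for (int i = 0; i < 4; i++)
                    printf("HZ=%4ld -> %ld tick(s) (plain HZ/100 would be %ld)\n",
                           hzs[i], mayday_initial_timeout(hzs[i]), hzs[i] / 100);
            return 0;
    }
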
@@ -252,10 +251,15 @@ struct workqueue_struct *system_wq __read_mostly;
252struct workqueue_struct *system_long_wq __read_mostly; 251struct workqueue_struct *system_long_wq __read_mostly;
253struct workqueue_struct *system_nrt_wq __read_mostly; 252struct workqueue_struct *system_nrt_wq __read_mostly;
254struct workqueue_struct *system_unbound_wq __read_mostly; 253struct workqueue_struct *system_unbound_wq __read_mostly;
254struct workqueue_struct *system_freezable_wq __read_mostly;
255EXPORT_SYMBOL_GPL(system_wq); 255EXPORT_SYMBOL_GPL(system_wq);
256EXPORT_SYMBOL_GPL(system_long_wq); 256EXPORT_SYMBOL_GPL(system_long_wq);
257EXPORT_SYMBOL_GPL(system_nrt_wq); 257EXPORT_SYMBOL_GPL(system_nrt_wq);
258EXPORT_SYMBOL_GPL(system_unbound_wq); 258EXPORT_SYMBOL_GPL(system_unbound_wq);
259EXPORT_SYMBOL_GPL(system_freezable_wq);
260
261#define CREATE_TRACE_POINTS
262#include <trace/events/workqueue.h>
259 263
260#define for_each_busy_worker(worker, i, pos, gcwq) \ 264#define for_each_busy_worker(worker, i, pos, gcwq) \
261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 265 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
@@ -310,25 +314,15 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
310 (cpu) < WORK_CPU_NONE; \ 314 (cpu) < WORK_CPU_NONE; \
311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) 315 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
312 316
313#ifdef CONFIG_LOCKDEP
314/**
315 * in_workqueue_context() - in context of specified workqueue?
316 * @wq: the workqueue of interest
317 *
318 * Checks lockdep state to see if the current task is executing from
319 * within a workqueue item. This function exists only if lockdep is
320 * enabled.
321 */
322int in_workqueue_context(struct workqueue_struct *wq)
323{
324 return lock_is_held(&wq->lockdep_map);
325}
326#endif
327
328#ifdef CONFIG_DEBUG_OBJECTS_WORK 317#ifdef CONFIG_DEBUG_OBJECTS_WORK
329 318
330static struct debug_obj_descr work_debug_descr; 319static struct debug_obj_descr work_debug_descr;
331 320
321static void *work_debug_hint(void *addr)
322{
323 return ((struct work_struct *) addr)->func;
324}
325
332/* 326/*
333 * fixup_init is called when: 327 * fixup_init is called when:
334 * - an active object is initialized 328 * - an active object is initialized
@@ -400,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
400 394
401static struct debug_obj_descr work_debug_descr = { 395static struct debug_obj_descr work_debug_descr = {
402 .name = "work_struct", 396 .name = "work_struct",
397 .debug_hint = work_debug_hint,
403 .fixup_init = work_fixup_init, 398 .fixup_init = work_fixup_init,
404 .fixup_activate = work_fixup_activate, 399 .fixup_activate = work_fixup_activate,
405 .fixup_free = work_fixup_free, 400 .fixup_free = work_fixup_free,
@@ -604,7 +599,9 @@ static bool keep_working(struct global_cwq *gcwq)
604{ 599{
605 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 600 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
606 601
607 return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; 602 return !list_empty(&gcwq->worklist) &&
603 (atomic_read(nr_running) <= 1 ||
604 gcwq->flags & GCWQ_HIGHPRI_PENDING);
608} 605}
609 606
610/* Do we need a new worker? Called from manager. */ 607/* Do we need a new worker? Called from manager. */
@@ -674,7 +671,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
674{ 671{
675 struct worker *worker = kthread_data(task); 672 struct worker *worker = kthread_data(task);
676 673
677 if (likely(!(worker->flags & WORKER_NOT_RUNNING))) 674 if (!(worker->flags & WORKER_NOT_RUNNING))
678 atomic_inc(get_gcwq_nr_running(cpu)); 675 atomic_inc(get_gcwq_nr_running(cpu));
679} 676}
680 677
@@ -700,7 +697,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
700 struct global_cwq *gcwq = get_gcwq(cpu); 697 struct global_cwq *gcwq = get_gcwq(cpu);
701 atomic_t *nr_running = get_gcwq_nr_running(cpu); 698 atomic_t *nr_running = get_gcwq_nr_running(cpu);
702 699
703 if (unlikely(worker->flags & WORKER_NOT_RUNNING)) 700 if (worker->flags & WORKER_NOT_RUNNING)
704 return NULL; 701 return NULL;
705 702
706 /* this can only happen on the local cpu */ 703 /* this can only happen on the local cpu */
@@ -781,7 +778,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
781 778
782 worker->flags &= ~flags; 779 worker->flags &= ~flags;
783 780
784 /* if transitioning out of NOT_RUNNING, increment nr_running */ 781 /*
782 * If transitioning out of NOT_RUNNING, increment nr_running. Note
783 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
784 * of multiple flags, not a single flag.
785 */
785 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 786 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
786 if (!(worker->flags & WORKER_NOT_RUNNING)) 787 if (!(worker->flags & WORKER_NOT_RUNNING))
787 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 788 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
@@ -945,6 +946,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
945 wake_up_worker(gcwq); 946 wake_up_worker(gcwq);
946} 947}
947 948
949/*
950 * Test whether @work is being queued from another work executing on the
951 * same workqueue. This is rather expensive and should only be used from
952 * cold paths.
953 */
954static bool is_chained_work(struct workqueue_struct *wq)
955{
956 unsigned long flags;
957 unsigned int cpu;
958
959 for_each_gcwq_cpu(cpu) {
960 struct global_cwq *gcwq = get_gcwq(cpu);
961 struct worker *worker;
962 struct hlist_node *pos;
963 int i;
964
965 spin_lock_irqsave(&gcwq->lock, flags);
966 for_each_busy_worker(worker, i, pos, gcwq) {
967 if (worker->task != current)
968 continue;
969 spin_unlock_irqrestore(&gcwq->lock, flags);
970 /*
971 * I'm @worker, no locking necessary. See if @work
972 * is headed to the same workqueue.
973 */
974 return worker->current_cwq->wq == wq;
975 }
976 spin_unlock_irqrestore(&gcwq->lock, flags);
977 }
978 return false;
979}
980
948static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 981static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
949 struct work_struct *work) 982 struct work_struct *work)
950{ 983{
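
The is_chained_work() helper added above relaxes the blanket WARN on WQ_DYING: flushing a workqueue during destruction can itself run work items that legitimately queue follow-up work on the same workqueue, so queueing on a dying workqueue is tolerated only when the caller is one of that workqueue's own workers. A userspace model of the check (single-threaded, so "current worker" is just a global):

    #include <stdio.h>
    #include <stdbool.h>

    struct workqueue { bool dying; const char *name; };
    struct worker    { struct workqueue *wq; };

    static struct worker *current_worker;   /* worker whose callback is running, if any */

    static bool is_chained_work(struct workqueue *wq)
    {
            return current_worker && current_worker->wq == wq;
    }

    static bool queue_work(struct workqueue *wq)
    {
            if (wq->dying && !is_chained_work(wq)) {
                    printf("rejected: %s is being destroyed\n", wq->name);
                    return false;
            }
            printf("queued on %s\n", wq->name);
            return true;
    }

    int main(void)
    {
            struct workqueue wq = { .dying = true, .name = "dying_wq" };
            struct worker    me = { .wq = &wq };

            queue_work(&wq);                 /* from outside: rejected             */

            current_worker = &me;            /* now pretend we run on wq's worker  */
            queue_work(&wq);                 /* chained work: allowed              */
            return 0;
    }
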
@@ -956,7 +989,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
956 989
957 debug_work_activate(work); 990 debug_work_activate(work);
958 991
959 if (WARN_ON_ONCE(wq->flags & WQ_DYING)) 992 /* if dying, only works from the same workqueue are allowed */
993 if (unlikely(wq->flags & WQ_DYING) &&
994 WARN_ON_ONCE(!is_chained_work(wq)))
960 return; 995 return;
961 996
962 /* determine gcwq to use */ 997 /* determine gcwq to use */
@@ -997,6 +1032,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
997 1032
998 /* gcwq determined, get cwq and queue */ 1033 /* gcwq determined, get cwq and queue */
999 cwq = get_cwq(gcwq->cpu, wq); 1034 cwq = get_cwq(gcwq->cpu, wq);
1035 trace_workqueue_queue_work(cpu, cwq, work);
1000 1036
1001 BUG_ON(!list_empty(&work->entry)); 1037 BUG_ON(!list_empty(&work->entry));
1002 1038
@@ -1004,6 +1040,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1004 work_flags = work_color_to_flags(cwq->work_color); 1040 work_flags = work_color_to_flags(cwq->work_color);
1005 1041
1006 if (likely(cwq->nr_active < cwq->max_active)) { 1042 if (likely(cwq->nr_active < cwq->max_active)) {
1043 trace_workqueue_activate_work(work);
1007 cwq->nr_active++; 1044 cwq->nr_active++;
1008 worklist = gcwq_determine_ins_pos(gcwq, cwq); 1045 worklist = gcwq_determine_ins_pos(gcwq, cwq);
1009 } else { 1046 } else {
@@ -1254,8 +1291,14 @@ __acquires(&gcwq->lock)
1254 return true; 1291 return true;
1255 spin_unlock_irq(&gcwq->lock); 1292 spin_unlock_irq(&gcwq->lock);
1256 1293
1257 /* CPU has come up inbetween, retry migration */ 1294 /*
1295 * We've raced with CPU hot[un]plug. Give it a breather
1296 * and retry migration. cond_resched() is required here;
1297 * otherwise, we might deadlock against cpu_stop trying to
1298 * bring down the CPU on non-preemptive kernel.
1299 */
1258 cpu_relax(); 1300 cpu_relax();
1301 cond_resched();
1259 } 1302 }
1260} 1303}
1261 1304
@@ -1329,8 +1372,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1329 worker->id = id; 1372 worker->id = id;
1330 1373
1331 if (!on_unbound_cpu) 1374 if (!on_unbound_cpu)
1332 worker->task = kthread_create(worker_thread, worker, 1375 worker->task = kthread_create_on_node(worker_thread,
1333 "kworker/%u:%d", gcwq->cpu, id); 1376 worker,
1377 cpu_to_node(gcwq->cpu),
1378 "kworker/%u:%d", gcwq->cpu, id);
1334 else 1379 else
1335 worker->task = kthread_create(worker_thread, worker, 1380 worker->task = kthread_create(worker_thread, worker,
1336 "kworker/u:%d", id); 1381 "kworker/u:%d", id);
@@ -1679,6 +1724,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1679 struct work_struct, entry); 1724 struct work_struct, entry);
1680 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); 1725 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1681 1726
1727 trace_workqueue_activate_work(work);
1682 move_linked_works(work, pos, NULL); 1728 move_linked_works(work, pos, NULL);
1683 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 1729 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1684 cwq->nr_active++; 1730 cwq->nr_active++;
@@ -1816,7 +1862,7 @@ __acquires(&gcwq->lock)
1816 spin_unlock_irq(&gcwq->lock); 1862 spin_unlock_irq(&gcwq->lock);
1817 1863
1818 work_clear_pending(work); 1864 work_clear_pending(work);
1819 lock_map_acquire(&cwq->wq->lockdep_map); 1865 lock_map_acquire_read(&cwq->wq->lockdep_map);
1820 lock_map_acquire(&lockdep_map); 1866 lock_map_acquire(&lockdep_map);
1821 trace_workqueue_execute_start(work); 1867 trace_workqueue_execute_start(work);
1822 f(work); 1868 f(work);
@@ -2019,6 +2065,15 @@ repeat:
2019 move_linked_works(work, scheduled, &n); 2065 move_linked_works(work, scheduled, &n);
2020 2066
2021 process_scheduled_works(rescuer); 2067 process_scheduled_works(rescuer);
2068
2069 /*
2070 * Leave this gcwq. If keep_working() is %true, notify a
2071 * regular worker; otherwise, we end up with 0 concurrency
2072 * and stalling the execution.
2073 */
2074 if (keep_working(gcwq))
2075 wake_up_worker(gcwq);
2076
2022 spin_unlock_irq(&gcwq->lock); 2077 spin_unlock_irq(&gcwq->lock);
2023 } 2078 }
2024 2079
@@ -2074,7 +2129,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2074 * checks and call back into the fixup functions where we 2129 * checks and call back into the fixup functions where we
2075 * might deadlock. 2130 * might deadlock.
2076 */ 2131 */
2077 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2132 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2078 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2133 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2079 init_completion(&barr->done); 2134 init_completion(&barr->done);
2080 2135
@@ -2326,27 +2381,17 @@ out_unlock:
2326} 2381}
2327EXPORT_SYMBOL_GPL(flush_workqueue); 2382EXPORT_SYMBOL_GPL(flush_workqueue);
2328 2383
2329/** 2384static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2330 * flush_work - block until a work_struct's callback has terminated 2385 bool wait_executing)
2331 * @work: the work which is to be flushed
2332 *
2333 * Returns false if @work has already terminated.
2334 *
2335 * It is expected that, prior to calling flush_work(), the caller has
2336 * arranged for the work to not be requeued, otherwise it doesn't make
2337 * sense to use this function.
2338 */
2339int flush_work(struct work_struct *work)
2340{ 2386{
2341 struct worker *worker = NULL; 2387 struct worker *worker = NULL;
2342 struct global_cwq *gcwq; 2388 struct global_cwq *gcwq;
2343 struct cpu_workqueue_struct *cwq; 2389 struct cpu_workqueue_struct *cwq;
2344 struct wq_barrier barr;
2345 2390
2346 might_sleep(); 2391 might_sleep();
2347 gcwq = get_work_gcwq(work); 2392 gcwq = get_work_gcwq(work);
2348 if (!gcwq) 2393 if (!gcwq)
2349 return 0; 2394 return false;
2350 2395
2351 spin_lock_irq(&gcwq->lock); 2396 spin_lock_irq(&gcwq->lock);
2352 if (!list_empty(&work->entry)) { 2397 if (!list_empty(&work->entry)) {
@@ -2359,28 +2404,137 @@ int flush_work(struct work_struct *work)
2359 cwq = get_work_cwq(work); 2404 cwq = get_work_cwq(work);
2360 if (unlikely(!cwq || gcwq != cwq->gcwq)) 2405 if (unlikely(!cwq || gcwq != cwq->gcwq))
2361 goto already_gone; 2406 goto already_gone;
2362 } else { 2407 } else if (wait_executing) {
2363 worker = find_worker_executing_work(gcwq, work); 2408 worker = find_worker_executing_work(gcwq, work);
2364 if (!worker) 2409 if (!worker)
2365 goto already_gone; 2410 goto already_gone;
2366 cwq = worker->current_cwq; 2411 cwq = worker->current_cwq;
2367 } 2412 } else
2413 goto already_gone;
2368 2414
2369 insert_wq_barrier(cwq, &barr, work, worker); 2415 insert_wq_barrier(cwq, barr, work, worker);
2370 spin_unlock_irq(&gcwq->lock); 2416 spin_unlock_irq(&gcwq->lock);
2371 2417
2372 lock_map_acquire(&cwq->wq->lockdep_map); 2418 /*
2419 * If @max_active is 1 or rescuer is in use, flushing another work
2420 * item on the same workqueue may lead to deadlock. Make sure the
2421 * flusher is not running on the same workqueue by verifying write
2422 * access.
2423 */
2424 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2425 lock_map_acquire(&cwq->wq->lockdep_map);
2426 else
2427 lock_map_acquire_read(&cwq->wq->lockdep_map);
2373 lock_map_release(&cwq->wq->lockdep_map); 2428 lock_map_release(&cwq->wq->lockdep_map);
2374 2429
2375 wait_for_completion(&barr.done); 2430 return true;
2376 destroy_work_on_stack(&barr.work);
2377 return 1;
2378already_gone: 2431already_gone:
2379 spin_unlock_irq(&gcwq->lock); 2432 spin_unlock_irq(&gcwq->lock);
2380 return 0; 2433 return false;
2434}
2435
2436/**
2437 * flush_work - wait for a work to finish executing the last queueing instance
2438 * @work: the work to flush
2439 *
2440 * Wait until @work has finished execution. This function considers
2441 * only the last queueing instance of @work. If @work has been
2442 * enqueued across different CPUs on a non-reentrant workqueue or on
2443 * multiple workqueues, @work might still be executing on return on
2444 * some of the CPUs from earlier queueing.
2445 *
2446 * If @work was queued only on a non-reentrant, ordered or unbound
2447 * workqueue, @work is guaranteed to be idle on return if it hasn't
2448 * been requeued since flush started.
2449 *
2450 * RETURNS:
2451 * %true if flush_work() waited for the work to finish execution,
2452 * %false if it was already idle.
2453 */
2454bool flush_work(struct work_struct *work)
2455{
2456 struct wq_barrier barr;
2457
2458 if (start_flush_work(work, &barr, true)) {
2459 wait_for_completion(&barr.done);
2460 destroy_work_on_stack(&barr.work);
2461 return true;
2462 } else
2463 return false;
2381} 2464}
2382EXPORT_SYMBOL_GPL(flush_work); 2465EXPORT_SYMBOL_GPL(flush_work);
2383 2466
2467static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2468{
2469 struct wq_barrier barr;
2470 struct worker *worker;
2471
2472 spin_lock_irq(&gcwq->lock);
2473
2474 worker = find_worker_executing_work(gcwq, work);
2475 if (unlikely(worker))
2476 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2477
2478 spin_unlock_irq(&gcwq->lock);
2479
2480 if (unlikely(worker)) {
2481 wait_for_completion(&barr.done);
2482 destroy_work_on_stack(&barr.work);
2483 return true;
2484 } else
2485 return false;
2486}
2487
2488static bool wait_on_work(struct work_struct *work)
2489{
2490 bool ret = false;
2491 int cpu;
2492
2493 might_sleep();
2494
2495 lock_map_acquire(&work->lockdep_map);
2496 lock_map_release(&work->lockdep_map);
2497
2498 for_each_gcwq_cpu(cpu)
2499 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2500 return ret;
2501}
2502
2503/**
2504 * flush_work_sync - wait until a work has finished execution
2505 * @work: the work to flush
2506 *
2507 * Wait until @work has finished execution. On return, it's
2508 * guaranteed that all queueing instances of @work which happened
2509 * before this function is called are finished. In other words, if
2510 * @work hasn't been requeued since this function was called, @work is
2511 * guaranteed to be idle on return.
2512 *
2513 * RETURNS:
2514 * %true if flush_work_sync() waited for the work to finish execution,
2515 * %false if it was already idle.
2516 */
2517bool flush_work_sync(struct work_struct *work)
2518{
2519 struct wq_barrier barr;
2520 bool pending, waited;
2521
2522 /* we'll wait for executions separately, queue barr only if pending */
2523 pending = start_flush_work(work, &barr, false);
2524
2525 /* wait for executions to finish */
2526 waited = wait_on_work(work);
2527
2528 /* wait for the pending one */
2529 if (pending) {
2530 wait_for_completion(&barr.done);
2531 destroy_work_on_stack(&barr.work);
2532 }
2533
2534 return pending || waited;
2535}
2536EXPORT_SYMBOL_GPL(flush_work_sync);
2537
2384/* 2538/*
2385 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, 2539 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2386 * so this work can't be re-armed in any way. 2540 * so this work can't be re-armed in any way.
@@ -2423,39 +2577,7 @@ static int try_to_grab_pending(struct work_struct *work)
2423 return ret; 2577 return ret;
2424} 2578}
2425 2579
2426static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) 2580static bool __cancel_work_timer(struct work_struct *work,
2427{
2428 struct wq_barrier barr;
2429 struct worker *worker;
2430
2431 spin_lock_irq(&gcwq->lock);
2432
2433 worker = find_worker_executing_work(gcwq, work);
2434 if (unlikely(worker))
2435 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2436
2437 spin_unlock_irq(&gcwq->lock);
2438
2439 if (unlikely(worker)) {
2440 wait_for_completion(&barr.done);
2441 destroy_work_on_stack(&barr.work);
2442 }
2443}
2444
2445static void wait_on_work(struct work_struct *work)
2446{
2447 int cpu;
2448
2449 might_sleep();
2450
2451 lock_map_acquire(&work->lockdep_map);
2452 lock_map_release(&work->lockdep_map);
2453
2454 for_each_gcwq_cpu(cpu)
2455 wait_on_cpu_work(get_gcwq(cpu), work);
2456}
2457
2458static int __cancel_work_timer(struct work_struct *work,
2459 struct timer_list* timer) 2581 struct timer_list* timer)
2460{ 2582{
2461 int ret; 2583 int ret;
@@ -2472,42 +2594,81 @@ static int __cancel_work_timer(struct work_struct *work,
2472} 2594}
2473 2595
2474/** 2596/**
2475 * cancel_work_sync - block until a work_struct's callback has terminated 2597 * cancel_work_sync - cancel a work and wait for it to finish
2476 * @work: the work which is to be flushed 2598 * @work: the work to cancel
2477 * 2599 *
2478 * Returns true if @work was pending. 2600 * Cancel @work and wait for its execution to finish. This function
2479 * 2601 * can be used even if the work re-queues itself or migrates to
2480 * cancel_work_sync() will cancel the work if it is queued. If the work's 2602 * another workqueue. On return from this function, @work is
2481 * callback appears to be running, cancel_work_sync() will block until it 2603 * guaranteed to be not pending or executing on any CPU.
2482 * has completed.
2483 *
2484 * It is possible to use this function if the work re-queues itself. It can
2485 * cancel the work even if it migrates to another workqueue, however in that
2486 * case it only guarantees that work->func() has completed on the last queued
2487 * workqueue.
2488 * 2604 *
2489 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not 2605 * cancel_work_sync(&delayed_work->work) must not be used for
2490 * pending, otherwise it goes into a busy-wait loop until the timer expires. 2606 * delayed_work's. Use cancel_delayed_work_sync() instead.
2491 * 2607 *
2492 * The caller must ensure that workqueue_struct on which this work was last 2608 * The caller must ensure that the workqueue on which @work was last
2493 * queued can't be destroyed before this function returns. 2609 * queued can't be destroyed before this function returns.
2610 *
2611 * RETURNS:
2612 * %true if @work was pending, %false otherwise.
2494 */ 2613 */
2495int cancel_work_sync(struct work_struct *work) 2614bool cancel_work_sync(struct work_struct *work)
2496{ 2615{
2497 return __cancel_work_timer(work, NULL); 2616 return __cancel_work_timer(work, NULL);
2498} 2617}
2499EXPORT_SYMBOL_GPL(cancel_work_sync); 2618EXPORT_SYMBOL_GPL(cancel_work_sync);
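A hedged teardown sketch (bar_ctx and bar_teardown are assumptions for illustration): cancel_work_sync() is what makes it safe to free the object embedding the work item.

    #include <linux/slab.h>
    #include <linux/workqueue.h>

    /* Assume bar->work is queued from an interrupt handler via
     * schedule_work(&bar->work). */
    static void bar_teardown(struct bar_ctx *bar)
    {
            /* After this returns, the callback is neither pending nor
             * running on any CPU, even if it used to re-queue itself. */
            cancel_work_sync(&bar->work);
            kfree(bar);
    }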
2500 2619
2501/** 2620/**
2502 * cancel_delayed_work_sync - reliably kill off a delayed work. 2621 * flush_delayed_work - wait for a dwork to finish executing the last queueing
2503 * @dwork: the delayed work struct 2622 * @dwork: the delayed work to flush
2623 *
2624 * Delayed timer is cancelled and the pending work is queued for
2625 * immediate execution. Like flush_work(), this function only
2626 * considers the last queueing instance of @dwork.
2627 *
2628 * RETURNS:
2629 * %true if flush_work() waited for the work to finish execution,
2630 * %false if it was already idle.
2631 */
2632bool flush_delayed_work(struct delayed_work *dwork)
2633{
2634 if (del_timer_sync(&dwork->timer))
2635 __queue_work(raw_smp_processor_id(),
2636 get_work_cwq(&dwork->work)->wq, &dwork->work);
2637 return flush_work(&dwork->work);
2638}
2639EXPORT_SYMBOL(flush_delayed_work);
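A sketch of the typical use (cache and cache_flush_fn are illustrative names): push a deferred write out immediately instead of waiting for its timer.

    /* Assume INIT_DELAYED_WORK(&cache->flush_dwork, cache_flush_fn). */
    static void cache_sync_now(struct cache *cache)
    {
            /* The timer is cancelled, the work is queued right away,
             * and we wait for that (last) queueing instance to finish. */
            flush_delayed_work(&cache->flush_dwork);
    }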
2640
2641/**
2642 * flush_delayed_work_sync - wait for a dwork to finish
2643 * @dwork: the delayed work to flush
2644 *
2645 * Delayed timer is cancelled and the pending work is queued for
2646 * execution immediately. Other than timer handling, its behavior
2647 * is identical to flush_work_sync().
2648 *
2649 * RETURNS:
2650 * %true if flush_work_sync() waited for the work to finish execution,
2651 * %false if it was already idle.
2652 */
2653bool flush_delayed_work_sync(struct delayed_work *dwork)
2654{
2655 if (del_timer_sync(&dwork->timer))
2656 __queue_work(raw_smp_processor_id(),
2657 get_work_cwq(&dwork->work)->wq, &dwork->work);
2658 return flush_work_sync(&dwork->work);
2659}
2660EXPORT_SYMBOL(flush_delayed_work_sync);
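The _sync variant differs only in also waiting for an instance of the callback that is already running elsewhere; a hedged suspend-path sketch using the same illustrative names:

    static int cache_suspend(struct cache *cache)
    {
            /* Like flush_delayed_work(), but also waits for a
             * cache_flush_fn() concurrently executing on another CPU. */
            flush_delayed_work_sync(&cache->flush_dwork);
            return 0;
    }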
2661
2662/**
2663 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
 2664 * @dwork: the delayed work to cancel
2504 * 2665 *
2505 * Returns true if @dwork was pending. 2666 * This is cancel_work_sync() for delayed works.
2506 * 2667 *
2507 * It is possible to use this function if @dwork rearms itself via queue_work() 2668 * RETURNS:
2508 * or queue_delayed_work(). See also the comment for cancel_work_sync(). 2669 * %true if @dwork was pending, %false otherwise.
2509 */ 2670 */
2510int cancel_delayed_work_sync(struct delayed_work *dwork) 2671bool cancel_delayed_work_sync(struct delayed_work *dwork)
2511{ 2672{
2512 return __cancel_work_timer(&dwork->work, &dwork->timer); 2673 return __cancel_work_timer(&dwork->work, &dwork->timer);
2513} 2674}
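A sketch of stopping a self-rearming poll loop on device removal (baz_dev and the poll callback are assumptions, not from this diff):

    /* The poll callback re-arms itself with queue_delayed_work(); once
     * cancel_delayed_work_sync() returns it can no longer do so. */
    static void baz_remove(struct baz_dev *baz)
    {
            cancel_delayed_work_sync(&baz->poll_dwork);
    }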
@@ -2559,23 +2720,6 @@ int schedule_delayed_work(struct delayed_work *dwork,
2559EXPORT_SYMBOL(schedule_delayed_work); 2720EXPORT_SYMBOL(schedule_delayed_work);
2560 2721
2561/** 2722/**
2562 * flush_delayed_work - block until a dwork_struct's callback has terminated
2563 * @dwork: the delayed work which is to be flushed
2564 *
2565 * Any timeout is cancelled, and any pending work is run immediately.
2566 */
2567void flush_delayed_work(struct delayed_work *dwork)
2568{
2569 if (del_timer_sync(&dwork->timer)) {
2570 __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
2571 &dwork->work);
2572 put_cpu();
2573 }
2574 flush_work(&dwork->work);
2575}
2576EXPORT_SYMBOL(flush_delayed_work);
2577
2578/**
2579 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 2723 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2580 * @cpu: cpu to use 2724 * @cpu: cpu to use
2581 * @dwork: job to be done 2725 * @dwork: job to be done
@@ -2592,13 +2736,15 @@ int schedule_delayed_work_on(int cpu,
2592EXPORT_SYMBOL(schedule_delayed_work_on); 2736EXPORT_SYMBOL(schedule_delayed_work_on);
2593 2737
2594/** 2738/**
2595 * schedule_on_each_cpu - call a function on each online CPU from keventd 2739 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2596 * @func: the function to call 2740 * @func: the function to call
2597 * 2741 *
2598 * Returns zero on success. 2742 * schedule_on_each_cpu() executes @func on each online CPU using the
2599 * Returns -ve errno on failure. 2743 * system workqueue and blocks until all CPUs have completed.
2600 *
2601 * schedule_on_each_cpu() is very slow. 2744 * schedule_on_each_cpu() is very slow.
2745 *
2746 * RETURNS:
2747 * 0 on success, -errno on failure.
2602 */ 2748 */
2603int schedule_on_each_cpu(work_func_t func) 2749int schedule_on_each_cpu(work_func_t func)
2604{ 2750{
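A short sketch of the call pattern (drain_local_cache is an illustrative callback, not from this diff); the callback runs on a per-CPU bound worker, so smp_processor_id() is valid inside it:

    #include <linux/kernel.h>
    #include <linux/smp.h>
    #include <linux/workqueue.h>

    static void drain_local_cache(struct work_struct *unused)
    {
            pr_info("draining on CPU %d\n", smp_processor_id());
    }

    static int drain_all_caches(void)
    {
            /* Blocks until every online CPU has run the callback;
             * returns 0 on success or -errno (e.g. -ENOMEM). */
            return schedule_on_each_cpu(drain_local_cache);
    }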
@@ -2764,6 +2910,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2764 unsigned int cpu; 2910 unsigned int cpu;
2765 2911
2766 /* 2912 /*
2913 * Workqueues which may be used during memory reclaim should
2914 * have a rescuer to guarantee forward progress.
2915 */
2916 if (flags & WQ_MEM_RECLAIM)
2917 flags |= WQ_RESCUER;
2918
2919 /*
2767 * Unbound workqueues aren't concurrency managed and should be 2920 * Unbound workqueues aren't concurrency managed and should be
2768 * dispatched to workers immediately. 2921 * dispatched to workers immediately.
2769 */ 2922 */
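A hedged allocation sketch ("foo_io" is an illustrative name): a workqueue used on the I/O or writeback path should pass WQ_MEM_RECLAIM so the rescuer implied above guarantees forward progress under memory pressure.

    static struct workqueue_struct *foo_io_wq;

    static int foo_init(void)
    {
            /* WQ_MEM_RECLAIM now implies WQ_RESCUER, per the check above. */
            foo_io_wq = alloc_workqueue("foo_io", WQ_MEM_RECLAIM, 1);
            return foo_io_wq ? 0 : -ENOMEM;
    }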
@@ -2828,7 +2981,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2828 */ 2981 */
2829 spin_lock(&workqueue_lock); 2982 spin_lock(&workqueue_lock);
2830 2983
2831 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) 2984 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
2832 for_each_cwq_cpu(cpu, wq) 2985 for_each_cwq_cpu(cpu, wq)
2833 get_cwq(cpu, wq)->max_active = 0; 2986 get_cwq(cpu, wq)->max_active = 0;
2834 2987
@@ -2856,11 +3009,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
2856 */ 3009 */
2857void destroy_workqueue(struct workqueue_struct *wq) 3010void destroy_workqueue(struct workqueue_struct *wq)
2858{ 3011{
3012 unsigned int flush_cnt = 0;
2859 unsigned int cpu; 3013 unsigned int cpu;
2860 3014
3015 /*
3016 * Mark @wq dying and drain all pending works. Once WQ_DYING is
3017 * set, only chain queueing is allowed. IOW, only currently
3018 * pending or running work items on @wq can queue further work
3019 * items on it. @wq is flushed repeatedly until it becomes empty.
 3021 * The number of flushes is determined by the depth of chaining and
3021 * should be relatively short. Whine if it takes too long.
3022 */
2861 wq->flags |= WQ_DYING; 3023 wq->flags |= WQ_DYING;
3024reflush:
2862 flush_workqueue(wq); 3025 flush_workqueue(wq);
2863 3026
3027 for_each_cwq_cpu(cpu, wq) {
3028 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3029
3030 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3031 continue;
3032
3033 if (++flush_cnt == 10 ||
3034 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3035 printk(KERN_WARNING "workqueue %s: flush on "
3036 "destruction isn't complete after %u tries\n",
3037 wq->name, flush_cnt);
3038 goto reflush;
3039 }
3040
2864 /* 3041 /*
2865 * wq list is used to freeze wq, remove from list after 3042 * wq list is used to freeze wq, remove from list after
2866 * flushing is complete in case freeze races us. 3043 * flushing is complete in case freeze races us.
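A sketch of the teardown order this drain loop expects (foo_dev and foo_stop_irq are assumptions): stop all external sources of new work first; work items already on the queue that chain-queue themselves are drained by the WQ_DYING reflush above.

    static void foo_exit(struct foo_dev *foo)
    {
            foo_stop_irq(foo);              /* no new external queueing */
            destroy_workqueue(foo->wq);     /* drains chained works, then frees */
    }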
@@ -2916,7 +3093,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
2916 3093
2917 spin_lock_irq(&gcwq->lock); 3094 spin_lock_irq(&gcwq->lock);
2918 3095
2919 if (!(wq->flags & WQ_FREEZEABLE) || 3096 if (!(wq->flags & WQ_FREEZABLE) ||
2920 !(gcwq->flags & GCWQ_FREEZING)) 3097 !(gcwq->flags & GCWQ_FREEZING))
2921 get_cwq(gcwq->cpu, wq)->max_active = max_active; 3098 get_cwq(gcwq->cpu, wq)->max_active = max_active;
2922 3099
@@ -3166,7 +3343,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
3166 * want to get it over with ASAP - spam rescuers, wake up as 3343 * want to get it over with ASAP - spam rescuers, wake up as
3167 * many idlers as necessary and create new ones till the 3344 * many idlers as necessary and create new ones till the
3168 * worklist is empty. Note that if the gcwq is frozen, there 3345 * worklist is empty. Note that if the gcwq is frozen, there
3169 * may be frozen works in freezeable cwqs. Don't declare 3346 * may be frozen works in freezable cwqs. Don't declare
3170 * completion while frozen. 3347 * completion while frozen.
3171 */ 3348 */
3172 while (gcwq->nr_workers != gcwq->nr_idle || 3349 while (gcwq->nr_workers != gcwq->nr_idle ||
@@ -3424,9 +3601,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3424/** 3601/**
3425 * freeze_workqueues_begin - begin freezing workqueues 3602 * freeze_workqueues_begin - begin freezing workqueues
3426 * 3603 *
3427 * Start freezing workqueues. After this function returns, all 3604 * Start freezing workqueues. After this function returns, all freezable
3428 * freezeable workqueues will queue new works to their frozen_works 3605 * workqueues will queue new works to their frozen_works list instead of
3429 * list instead of gcwq->worklist. 3606 * gcwq->worklist.
3430 * 3607 *
3431 * CONTEXT: 3608 * CONTEXT:
3432 * Grabs and releases workqueue_lock and gcwq->lock's. 3609 * Grabs and releases workqueue_lock and gcwq->lock's.
@@ -3452,7 +3629,7 @@ void freeze_workqueues_begin(void)
3452 list_for_each_entry(wq, &workqueues, list) { 3629 list_for_each_entry(wq, &workqueues, list) {
3453 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3630 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3454 3631
3455 if (cwq && wq->flags & WQ_FREEZEABLE) 3632 if (cwq && wq->flags & WQ_FREEZABLE)
3456 cwq->max_active = 0; 3633 cwq->max_active = 0;
3457 } 3634 }
3458 3635
@@ -3463,7 +3640,7 @@ void freeze_workqueues_begin(void)
3463} 3640}
3464 3641
3465/** 3642/**
3466 * freeze_workqueues_busy - are freezeable workqueues still busy? 3643 * freeze_workqueues_busy - are freezable workqueues still busy?
3467 * 3644 *
3468 * Check whether freezing is complete. This function must be called 3645 * Check whether freezing is complete. This function must be called
3469 * between freeze_workqueues_begin() and thaw_workqueues(). 3646 * between freeze_workqueues_begin() and thaw_workqueues().
@@ -3472,8 +3649,8 @@ void freeze_workqueues_begin(void)
3472 * Grabs and releases workqueue_lock. 3649 * Grabs and releases workqueue_lock.
3473 * 3650 *
3474 * RETURNS: 3651 * RETURNS:
3475 * %true if some freezeable workqueues are still busy. %false if 3652 * %true if some freezable workqueues are still busy. %false if freezing
3476 * freezing is complete. 3653 * is complete.
3477 */ 3654 */
3478bool freeze_workqueues_busy(void) 3655bool freeze_workqueues_busy(void)
3479{ 3656{
@@ -3493,7 +3670,7 @@ bool freeze_workqueues_busy(void)
3493 list_for_each_entry(wq, &workqueues, list) { 3670 list_for_each_entry(wq, &workqueues, list) {
3494 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3671 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3495 3672
3496 if (!cwq || !(wq->flags & WQ_FREEZEABLE)) 3673 if (!cwq || !(wq->flags & WQ_FREEZABLE))
3497 continue; 3674 continue;
3498 3675
3499 BUG_ON(cwq->nr_active < 0); 3676 BUG_ON(cwq->nr_active < 0);
@@ -3538,7 +3715,7 @@ void thaw_workqueues(void)
3538 list_for_each_entry(wq, &workqueues, list) { 3715 list_for_each_entry(wq, &workqueues, list) {
3539 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3716 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3540 3717
3541 if (!cwq || !(wq->flags & WQ_FREEZEABLE)) 3718 if (!cwq || !(wq->flags & WQ_FREEZABLE))
3542 continue; 3719 continue;
3543 3720
3544 /* restore max_active and repopulate worklist */ 3721 /* restore max_active and repopulate worklist */
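A simplified sketch of the sequence these CONFIG_FREEZER-only hooks are driven through by the PM freezer (the real caller, with proper timeout and abort handling, lives in kernel/power/process.c):

    freeze_workqueues_begin();

    while (freeze_workqueues_busy())
            msleep(10);             /* freezable works still draining */

    /* ... suspend/hibernation image is created here ... */

    thaw_workqueues();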
@@ -3612,7 +3789,10 @@ static int __init init_workqueues(void)
3612 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3789 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3613 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3790 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3614 WQ_UNBOUND_MAX_ACTIVE); 3791 WQ_UNBOUND_MAX_ACTIVE);
3615 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); 3792 system_freezable_wq = alloc_workqueue("events_freezable",
3793 WQ_FREEZABLE, 0);
3794 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3795 !system_unbound_wq || !system_freezable_wq);
3616 return 0; 3796 return 0;
3617} 3797}
3618early_initcall(init_workqueues); 3798early_initcall(init_workqueues);
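With system_freezable_wq in place, work that must not run while the system is freezing or suspended can be queued there instead of on system_wq; a minimal hedged example (foo is illustrative):

    /* Sketch: this work stays frozen between freeze_workqueues_begin()
     * and thaw_workqueues() instead of racing the suspend path. */
    static void foo_kick(struct foo_dev *foo)
    {
            queue_work(system_freezable_wq, &foo->work);
    }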