author    Thomas Gleixner <tglx@linutronix.de>    2011-05-14 06:06:36 -0400
committer Thomas Gleixner <tglx@linutronix.de>    2011-05-14 06:06:36 -0400
commit    a18f22a968de17b29f2310cdb7ba69163e65ec15 (patch)
tree      a7d56d88fad5e444d7661484109758a2f436129e /kernel
parent    a1c57e0fec53defe745e64417eacdbd3618c3e66 (diff)
parent    798778b8653f64b7b2162ac70eca10367cff6ce8 (diff)

Merge branch 'consolidate-clksrc-i8253' of master.kernel.org:~rmk/linux-2.6-arm into timers/clocksource

Conflicts:
	arch/ia64/kernel/cyclone.c
	arch/mips/kernel/i8253.c
	arch/x86/kernel/i8253.c

Reason: Resolve conflicts so further cleanups do not conflict further

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 8
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/audit_watch.c | 85
-rw-r--r--  kernel/auditfilter.c | 10
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/capability.c | 96
-rw-r--r--  kernel/cgroup.c | 70
-rw-r--r--  kernel/compat.c | 136
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/cpuset.c | 87
-rw-r--r--  kernel/crash_dump.c | 34
-rw-r--r--  kernel/cred.c | 8
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/gdbstub.c | 30
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 2
-rw-r--r--  kernel/exit.c | 5
-rw-r--r--  kernel/fork.c | 155
-rw-r--r--  kernel/futex.c | 160
-rw-r--r--  kernel/futex_compat.c | 11
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/gcov/Makefile | 2
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hrtimer.c | 90
-rw-r--r--  kernel/irq/Kconfig | 46
-rw-r--r--  kernel/irq/autoprobe.c | 52
-rw-r--r--  kernel/irq/chip.c | 696
-rw-r--r--  kernel/irq/debug.h | 44
-rw-r--r--  kernel/irq/dummychip.c | 9
-rw-r--r--  kernel/irq/handle.c | 129
-rw-r--r--  kernel/irq/internals.h | 161
-rw-r--r--  kernel/irq/irqdesc.c | 86
-rw-r--r--  kernel/irq/manage.c | 633
-rw-r--r--  kernel/irq/migration.c | 29
-rw-r--r--  kernel/irq/pm.c | 30
-rw-r--r--  kernel/irq/proc.c | 88
-rw-r--r--  kernel/irq/resend.c | 18
-rw-r--r--  kernel/irq/settings.h | 125
-rw-r--r--  kernel/irq/spurious.c | 162
-rw-r--r--  kernel/kallsyms.c | 58
-rw-r--r--  kernel/kexec.c | 18
-rw-r--r--  kernel/kthread.c | 33
-rw-r--r--  kernel/latencytop.c | 2
-rw-r--r--  kernel/lockdep.c | 4
-rw-r--r--  kernel/lockdep_proc.c | 9
-rw-r--r--  kernel/module.c | 10
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/nsproxy.c | 4
-rw-r--r--  kernel/padata.c | 8
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/params.c | 2
-rw-r--r--  kernel/perf_event.c | 1070
-rw-r--r--  kernel/pid.c | 7
-rw-r--r--  kernel/pid_namespace.c | 11
-rw-r--r--  kernel/pm_qos_params.c | 24
-rw-r--r--  kernel/posix-cpu-timers.c | 112
-rw-r--r--  kernel/posix-timers.c | 344
-rw-r--r--  kernel/power/Kconfig | 241
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/hibernate.c | 15
-rw-r--r--  kernel/power/main.c | 5
-rw-r--r--  kernel/power/snapshot.c | 8
-rw-r--r--  kernel/power/suspend.c | 7
-rw-r--r--  kernel/printk.c | 174
-rw-r--r--  kernel/ptrace.c | 50
-rw-r--r--  kernel/rcupdate.c | 10
-rw-r--r--  kernel/rcutiny_plugin.h | 2
-rw-r--r--  kernel/rcutorture.c | 1
-rw-r--r--  kernel/res_counter.c | 14
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 40
-rw-r--r--  kernel/rtmutex.c | 318
-rw-r--r--  kernel/rtmutex_common.h | 16
-rw-r--r--  kernel/sched.c | 397
-rw-r--r--  kernel/sched_autogroup.c | 17
-rw-r--r--  kernel/sched_autogroup.h | 5
-rw-r--r--  kernel/sched_debug.c | 2
-rw-r--r--  kernel/sched_fair.c | 430
-rw-r--r--  kernel/sched_idletask.c | 28
-rw-r--r--  kernel/sched_rt.c | 37
-rw-r--r--  kernel/sched_stoptask.c | 9
-rw-r--r--  kernel/signal.c | 201
-rw-r--r--  kernel/smp.c | 152
-rw-r--r--  kernel/softirq.c | 31
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sys.c | 81
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 66
-rw-r--r--  kernel/sysctl_binary.c | 19
-rw-r--r--  kernel/sysctl_check.c | 10
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/time.c | 35
-rw-r--r--  kernel/time/Makefile | 3
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/jiffies.c | 22
-rw-r--r--  kernel/time/ntp.c | 15
-rw-r--r--  kernel/time/posix-clock.c | 445
-rw-r--r--  kernel/time/tick-broadcast.c | 11
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 12
-rw-r--r--  kernel/time/tick-oneshot.c | 1
-rw-r--r--  kernel/time/tick-sched.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 168
-rw-r--r--  kernel/time/timer_stats.c | 2
-rw-r--r--  kernel/timer.c | 42
-rw-r--r--  kernel/trace/Kconfig | 6
-rw-r--r--  kernel/trace/blktrace.c | 64
-rw-r--r--  kernel/trace/ftrace.c | 57
-rw-r--r--  kernel/trace/ring_buffer.c | 30
-rw-r--r--  kernel/trace/trace.c | 41
-rw-r--r--  kernel/trace/trace.h | 41
-rw-r--r--  kernel/trace/trace_clock.c | 2
-rw-r--r--  kernel/trace/trace_entries.h | 8
-rw-r--r--  kernel/trace/trace_events.c | 3
-rw-r--r--  kernel/trace/trace_events_filter.c | 885
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 2
-rw-r--r--  kernel/trace/trace_kprobe.c | 113
-rw-r--r--  kernel/trace/trace_output.c | 36
-rw-r--r--  kernel/trace/trace_sched_switch.c | 48
-rw-r--r--  kernel/trace/trace_syscalls.c | 42
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user-return-notifier.c | 2
-rw-r--r--  kernel/user.c | 8
-rw-r--r--  kernel/utsname.c | 12
-rw-r--r--  kernel/wait.c | 2
-rw-r--r--  kernel/watchdog.c | 30
-rw-r--r--  kernel/workqueue.c | 26
131 files changed, 6586 insertions, 3107 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba33..85cbfb31e73e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
109obj-$(CONFIG_PADATA) += padata.o 109obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
110 111
111ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
112# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index e4956244ae50..939500317066 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
74int audit_enabled; 74int audit_enabled;
75int audit_ever_enabled; 75int audit_ever_enabled;
76 76
77EXPORT_SYMBOL_GPL(audit_enabled);
78
77/* Default state when kernel boots without any parameters. */ 79/* Default state when kernel boots without any parameters. */
78static int audit_default; 80static int audit_default;
79 81
@@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
671 673
672 pid = NETLINK_CREDS(skb)->pid; 674 pid = NETLINK_CREDS(skb)->pid;
673 uid = NETLINK_CREDS(skb)->uid; 675 uid = NETLINK_CREDS(skb)->uid;
674 loginuid = NETLINK_CB(skb).loginuid; 676 loginuid = audit_get_loginuid(current);
675 sessionid = NETLINK_CB(skb).sessionid; 677 sessionid = audit_get_sessionid(current);
676 sid = NETLINK_CB(skb).sid; 678 security_task_getsecid(current, &sid);
677 seq = nlh->nlmsg_seq; 679 seq = nlh->nlmsg_seq;
678 data = NLMSG_DATA(nlh); 680 data = NLMSG_DATA(nlh);
679 681
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 37b2bea170c8..e99dda04b126 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -607,7 +607,7 @@ void audit_trim_trees(void)
607 spin_lock(&hash_lock); 607 spin_lock(&hash_lock);
608 list_for_each_entry(node, &tree->chunks, list) { 608 list_for_each_entry(node, &tree->chunks, list) {
609 struct audit_chunk *chunk = find_chunk(node); 609 struct audit_chunk *chunk = find_chunk(node);
610 /* this could be NULL if the watch is dieing else where... */ 610 /* this could be NULL if the watch is dying else where... */
611 struct inode *inode = chunk->mark.i.inode; 611 struct inode *inode = chunk->mark.i.inode;
612 node->index |= 1U<<31; 612 node->index |= 1U<<31;
613 if (iterate_mounts(compare_root, inode, root_mnt)) 613 if (iterate_mounts(compare_root, inode, root_mnt))
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c7866460..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
144} 144}
145 145
146/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
147static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct path *path)
148{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode; 149 struct inode *inode = path->dentry->d_inode;
150 struct audit_parent *parent; 150 struct audit_parent *parent;
151 int ret; 151 int ret;
152 152
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
353} 353}
354 354
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata *ndparent, *ndwatch; 358 struct nameidata nd;
359 struct dentry *d;
359 int err; 360 int err;
360 361
361 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); 362 err = kern_path_parent(watch->path, &nd);
362 if (unlikely(!ndparent)) 363 if (err)
363 return -ENOMEM; 364 return err;
364 365
365 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); 366 if (nd.last_type != LAST_NORM) {
366 if (unlikely(!ndwatch)) { 367 path_put(&nd.path);
367 kfree(ndparent); 368 return -EINVAL;
368 return -ENOMEM;
369 } 369 }
370 370
371 err = path_lookup(path, LOOKUP_PARENT, ndparent); 371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 if (err) { 372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 kfree(ndparent); 373 if (IS_ERR(d)) {
374 kfree(ndwatch); 374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 return err; 375 path_put(&nd.path);
376 return PTR_ERR(d);
376 } 377 }
377 378 if (d->d_inode) {
378 err = path_lookup(path, 0, ndwatch); 379 /* update watch filter fields */
379 if (err) { 380 watch->dev = d->d_inode->i_sb->s_dev;
380 kfree(ndwatch); 381 watch->ino = d->d_inode->i_ino;
381 ndwatch = NULL;
382 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
383 384
384 *ndp = ndparent; 385 *parent = nd.path;
385 *ndw = ndwatch; 386 dput(d);
386
387 return 0; 387 return 0;
388} 388}
389 389
390/* Release resources used for watch path information. */
391static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
392{
393 if (ndp) {
394 path_put(&ndp->path);
395 kfree(ndp);
396 }
397 if (ndw) {
398 path_put(&ndw->path);
399 kfree(ndw);
400 }
401}
402
403/* Associate the given rule with an existing parent. 390/* Associate the given rule with an existing parent.
404 * Caller must hold audit_filter_mutex. */ 391 * Caller must hold audit_filter_mutex. */
405static void audit_add_to_parent(struct audit_krule *krule, 392static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
440{ 427{
441 struct audit_watch *watch = krule->watch; 428 struct audit_watch *watch = krule->watch;
442 struct audit_parent *parent; 429 struct audit_parent *parent;
443 struct nameidata *ndp = NULL, *ndw = NULL; 430 struct path parent_path;
444 int h, ret = 0; 431 int h, ret = 0;
445 432
446 mutex_unlock(&audit_filter_mutex); 433 mutex_unlock(&audit_filter_mutex);
447 434
448 /* Avoid calling path_lookup under audit_filter_mutex. */ 435 /* Avoid calling path_lookup under audit_filter_mutex. */
449 ret = audit_get_nd(watch->path, &ndp, &ndw); 436 ret = audit_get_nd(watch, &parent_path);
450 if (ret) {
451 /* caller expects mutex locked */
452 mutex_lock(&audit_filter_mutex);
453 goto error;
454 }
455 437
438 /* caller expects mutex locked */
456 mutex_lock(&audit_filter_mutex); 439 mutex_lock(&audit_filter_mutex);
457 440
458 /* update watch filter fields */ 441 if (ret)
459 if (ndw) { 442 return ret;
460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
461 watch->ino = ndw->path.dentry->d_inode->i_ino;
462 }
463 443
464 /* either find an old parent or attach a new one */ 444 /* either find an old parent or attach a new one */
465 parent = audit_find_parent(ndp->path.dentry->d_inode); 445 parent = audit_find_parent(parent_path.dentry->d_inode);
466 if (!parent) { 446 if (!parent) {
467 parent = audit_init_parent(ndp); 447 parent = audit_init_parent(&parent_path);
468 if (IS_ERR(parent)) { 448 if (IS_ERR(parent)) {
469 ret = PTR_ERR(parent); 449 ret = PTR_ERR(parent);
470 goto error; 450 goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
479 h = audit_hash_ino((u32)watch->ino); 459 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h]; 460 *list = &audit_inode_hash[h];
481error: 461error:
482 audit_put_nd(ndp, ndw); /* NULL args OK */ 462 path_put(&parent_path);
483 return ret; 463 return ret;
484
485} 464}
486 465
487void audit_remove_watch_rule(struct audit_krule *krule) 466void audit_remove_watch_rule(struct audit_krule *krule)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index add2819af71b..f8277c80d678 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1238 for (i = 0; i < rule->field_count; i++) { 1238 for (i = 0; i < rule->field_count; i++) {
1239 struct audit_field *f = &rule->fields[i]; 1239 struct audit_field *f = &rule->fields[i];
1240 int result = 0; 1240 int result = 0;
1241 u32 sid;
1241 1242
1242 switch (f->type) { 1243 switch (f->type) {
1243 case AUDIT_PID: 1244 case AUDIT_PID:
@@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1250 result = audit_comparator(cb->creds.gid, f->op, f->val); 1251 result = audit_comparator(cb->creds.gid, f->op, f->val);
1251 break; 1252 break;
1252 case AUDIT_LOGINUID: 1253 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1254 result = audit_comparator(audit_get_loginuid(current),
1255 f->op, f->val);
1254 break; 1256 break;
1255 case AUDIT_SUBJ_USER: 1257 case AUDIT_SUBJ_USER:
1256 case AUDIT_SUBJ_ROLE: 1258 case AUDIT_SUBJ_ROLE:
1257 case AUDIT_SUBJ_TYPE: 1259 case AUDIT_SUBJ_TYPE:
1258 case AUDIT_SUBJ_SEN: 1260 case AUDIT_SUBJ_SEN:
1259 case AUDIT_SUBJ_CLR: 1261 case AUDIT_SUBJ_CLR:
1260 if (f->lsm_rule) 1262 if (f->lsm_rule) {
1261 result = security_audit_rule_match(cb->sid, 1263 security_task_getsecid(current, &sid);
1264 result = security_audit_rule_match(sid,
1262 f->type, 1265 f->type,
1263 f->op, 1266 f->op,
1264 f->lsm_rule, 1267 f->lsm_rule,
1265 NULL); 1268 NULL);
1269 }
1266 break; 1270 break;
1267 } 1271 }
1268 1272
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f49a0318c2ed..b33513a08beb 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1011/* 1011/*
1012 * to_send and len_sent accounting are very loose estimates. We aren't 1012 * to_send and len_sent accounting are very loose estimates. We aren't
1013 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being 1013 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
1014 * within about 500 bytes (next page boundry) 1014 * within about 500 bytes (next page boundary)
1015 * 1015 *
1016 * why snprintf? an int is up to 12 digits long. if we just assumed when 1016 * why snprintf? an int is up to 12 digits long. if we just assumed when
1017 * logging that a[%d]= was going to be 16 characters long we would be wasting 1017 * logging that a[%d]= was going to be 16 characters long we would be wasting
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
9#include <linux/page-flags.h> 9#include <linux/page-flags.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h>
12 13
13void foo(void) 14void foo(void)
14{ 15{
15 /* The enum constants to put into include/generated/bounds.h */ 16 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
18 /* End of constants */ 20 /* End of constants */
19} 21}
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e9385f132c8..bf0c734d0c12 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <linux/user_namespace.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18 19
19/* 20/*
@@ -290,6 +291,60 @@ error:
290} 291}
291 292
292/** 293/**
294 * has_capability - Does a task have a capability in init_user_ns
295 * @t: The task in question
296 * @cap: The capability to be tested for
297 *
298 * Return true if the specified task has the given superior capability
299 * currently in effect to the initial user namespace, false if not.
300 *
301 * Note that this does not set PF_SUPERPRIV on the task.
302 */
303bool has_capability(struct task_struct *t, int cap)
304{
305 int ret = security_real_capable(t, &init_user_ns, cap);
306
307 return (ret == 0);
308}
309
310/**
311 * has_capability - Does a task have a capability in a specific user ns
312 * @t: The task in question
313 * @ns: target user namespace
314 * @cap: The capability to be tested for
315 *
316 * Return true if the specified task has the given superior capability
317 * currently in effect to the specified user namespace, false if not.
318 *
319 * Note that this does not set PF_SUPERPRIV on the task.
320 */
321bool has_ns_capability(struct task_struct *t,
322 struct user_namespace *ns, int cap)
323{
324 int ret = security_real_capable(t, ns, cap);
325
326 return (ret == 0);
327}
328
329/**
330 * has_capability_noaudit - Does a task have a capability (unaudited)
331 * @t: The task in question
332 * @cap: The capability to be tested for
333 *
334 * Return true if the specified task has the given superior capability
335 * currently in effect to init_user_ns, false if not. Don't write an
336 * audit message for the check.
337 *
338 * Note that this does not set PF_SUPERPRIV on the task.
339 */
340bool has_capability_noaudit(struct task_struct *t, int cap)
341{
342 int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
343
344 return (ret == 0);
345}
346
347/**
293 * capable - Determine if the current task has a superior capability in effect 348 * capable - Determine if the current task has a superior capability in effect
294 * @cap: The capability to be tested for 349 * @cap: The capability to be tested for
295 * 350 *
@@ -299,17 +354,48 @@ error:
299 * This sets PF_SUPERPRIV on the task if the capability is available on the 354 * This sets PF_SUPERPRIV on the task if the capability is available on the
300 * assumption that it's about to be used. 355 * assumption that it's about to be used.
301 */ 356 */
302int capable(int cap) 357bool capable(int cap)
358{
359 return ns_capable(&init_user_ns, cap);
360}
361EXPORT_SYMBOL(capable);
362
363/**
364 * ns_capable - Determine if the current task has a superior capability in effect
365 * @ns: The usernamespace we want the capability in
366 * @cap: The capability to be tested for
367 *
368 * Return true if the current task has the given superior capability currently
369 * available for use, false if not.
370 *
371 * This sets PF_SUPERPRIV on the task if the capability is available on the
372 * assumption that it's about to be used.
373 */
374bool ns_capable(struct user_namespace *ns, int cap)
303{ 375{
304 if (unlikely(!cap_valid(cap))) { 376 if (unlikely(!cap_valid(cap))) {
305 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); 377 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
306 BUG(); 378 BUG();
307 } 379 }
308 380
309 if (security_capable(current_cred(), cap) == 0) { 381 if (security_capable(ns, current_cred(), cap) == 0) {
310 current->flags |= PF_SUPERPRIV; 382 current->flags |= PF_SUPERPRIV;
311 return 1; 383 return true;
312 } 384 }
313 return 0; 385 return false;
314} 386}
315EXPORT_SYMBOL(capable); 387EXPORT_SYMBOL(ns_capable);
388
389/**
390 * task_ns_capable - Determine whether current task has a superior
391 * capability targeted at a specific task's user namespace.
392 * @t: The task whose user namespace is targeted.
393 * @cap: The capability in question.
394 *
395 * Return true if it does, false otherwise.
396 */
397bool task_ns_capable(struct task_struct *t, int cap)
398{
399 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
400}
401EXPORT_SYMBOL(task_ns_capable);
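
The capability.c hunk above converts capable() to return bool and adds the namespace-aware helpers has_capability(), has_ns_capability(), ns_capable() and task_ns_capable(). As a minimal illustration only (not part of the patch; example_may_signal is a hypothetical name), a caller acting on another task would typically combine the global and namespace-aware checks like this:

#include <linux/capability.h>
#include <linux/sched.h>

/* Hypothetical helper, for illustration only. */
static bool example_may_signal(struct task_struct *target)
{
	/* Global check against init_user_ns, same spelling as before. */
	if (capable(CAP_KILL))
		return true;

	/*
	 * Namespace-aware check: is current privileged with respect to
	 * the user namespace owning @target's credentials?
	 */
	return task_ns_capable(target, CAP_KILL);
}
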
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d7027b83c..25c7eb52de1a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -157,7 +157,7 @@ struct css_id {
157}; 157};
158 158
159/* 159/*
160 * cgroup_event represents events which userspace want to recieve. 160 * cgroup_event represents events which userspace want to receive.
161 */ 161 */
162struct cgroup_event { 162struct cgroup_event {
163 /* 163 /*
@@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1813 1813
1814 /* Update the css_set linked lists if we're using them */ 1814 /* Update the css_set linked lists if we're using them */
1815 write_lock(&css_set_lock); 1815 write_lock(&css_set_lock);
1816 if (!list_empty(&tsk->cg_list)) { 1816 if (!list_empty(&tsk->cg_list))
1817 list_del(&tsk->cg_list); 1817 list_move(&tsk->cg_list, &newcg->tasks);
1818 list_add(&tsk->cg_list, &newcg->tasks);
1819 }
1820 write_unlock(&css_set_lock); 1818 write_unlock(&css_set_lock);
1821 1819
1822 for_each_subsys(root, ss) { 1820 for_each_subsys(root, ss) {
@@ -3655,12 +3653,12 @@ again:
3655 spin_lock(&release_list_lock); 3653 spin_lock(&release_list_lock);
3656 set_bit(CGRP_REMOVED, &cgrp->flags); 3654 set_bit(CGRP_REMOVED, &cgrp->flags);
3657 if (!list_empty(&cgrp->release_list)) 3655 if (!list_empty(&cgrp->release_list))
3658 list_del(&cgrp->release_list); 3656 list_del_init(&cgrp->release_list);
3659 spin_unlock(&release_list_lock); 3657 spin_unlock(&release_list_lock);
3660 3658
3661 cgroup_lock_hierarchy(cgrp->root); 3659 cgroup_lock_hierarchy(cgrp->root);
3662 /* delete this cgroup from parent->children */ 3660 /* delete this cgroup from parent->children */
3663 list_del(&cgrp->sibling); 3661 list_del_init(&cgrp->sibling);
3664 cgroup_unlock_hierarchy(cgrp->root); 3662 cgroup_unlock_hierarchy(cgrp->root);
3665 3663
3666 d = dget(cgrp->dentry); 3664 d = dget(cgrp->dentry);
@@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
3879 subsys[ss->subsys_id] = NULL; 3877 subsys[ss->subsys_id] = NULL;
3880 3878
3881 /* remove subsystem from rootnode's list of subsystems */ 3879 /* remove subsystem from rootnode's list of subsystems */
3882 list_del(&ss->sibling); 3880 list_del_init(&ss->sibling);
3883 3881
3884 /* 3882 /*
3885 * disentangle the css from all css_sets attached to the dummytop. as 3883 * disentangle the css from all css_sets attached to the dummytop. as
@@ -4230,20 +4228,8 @@ void cgroup_post_fork(struct task_struct *child)
4230 */ 4228 */
4231void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4229void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4232{ 4230{
4233 int i;
4234 struct css_set *cg; 4231 struct css_set *cg;
4235 4232 int i;
4236 if (run_callbacks && need_forkexit_callback) {
4237 /*
4238 * modular subsystems can't use callbacks, so no need to lock
4239 * the subsys array
4240 */
4241 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4242 struct cgroup_subsys *ss = subsys[i];
4243 if (ss->exit)
4244 ss->exit(ss, tsk);
4245 }
4246 }
4247 4233
4248 /* 4234 /*
4249 * Unlink from the css_set task list if necessary. 4235 * Unlink from the css_set task list if necessary.
@@ -4253,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4253 if (!list_empty(&tsk->cg_list)) { 4239 if (!list_empty(&tsk->cg_list)) {
4254 write_lock(&css_set_lock); 4240 write_lock(&css_set_lock);
4255 if (!list_empty(&tsk->cg_list)) 4241 if (!list_empty(&tsk->cg_list))
4256 list_del(&tsk->cg_list); 4242 list_del_init(&tsk->cg_list);
4257 write_unlock(&css_set_lock); 4243 write_unlock(&css_set_lock);
4258 } 4244 }
4259 4245
@@ -4261,7 +4247,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4261 task_lock(tsk); 4247 task_lock(tsk);
4262 cg = tsk->cgroups; 4248 cg = tsk->cgroups;
4263 tsk->cgroups = &init_css_set; 4249 tsk->cgroups = &init_css_set;
4250
4251 if (run_callbacks && need_forkexit_callback) {
4252 /*
4253 * modular subsystems can't use callbacks, so no need to lock
4254 * the subsys array
4255 */
4256 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4257 struct cgroup_subsys *ss = subsys[i];
4258 if (ss->exit) {
4259 struct cgroup *old_cgrp =
4260 rcu_dereference_raw(cg->subsys[i])->cgroup;
4261 struct cgroup *cgrp = task_cgroup(tsk, i);
4262 ss->exit(ss, cgrp, old_cgrp, tsk);
4263 }
4264 }
4265 }
4264 task_unlock(tsk); 4266 task_unlock(tsk);
4267
4265 if (cg) 4268 if (cg)
4266 put_css_set_taskexit(cg); 4269 put_css_set_taskexit(cg);
4267} 4270}
@@ -4813,6 +4816,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
4813 return ret; 4816 return ret;
4814} 4817}
4815 4818
4819/*
4820 * get corresponding css from file open on cgroupfs directory
4821 */
4822struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
4823{
4824 struct cgroup *cgrp;
4825 struct inode *inode;
4826 struct cgroup_subsys_state *css;
4827
4828 inode = f->f_dentry->d_inode;
4829 /* check in cgroup filesystem dir */
4830 if (inode->i_op != &cgroup_dir_inode_operations)
4831 return ERR_PTR(-EBADF);
4832
4833 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
4834 return ERR_PTR(-EINVAL);
4835
4836 /* get cgroup */
4837 cgrp = __d_cgrp(f->f_dentry);
4838 css = cgrp->subsys[id];
4839 return css ? css : ERR_PTR(-ENOENT);
4840}
4841
4816#ifdef CONFIG_CGROUP_DEBUG 4842#ifdef CONFIG_CGROUP_DEBUG
4817static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 4843static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4818 struct cgroup *cont) 4844 struct cgroup *cont)
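
cgroup_css_from_dir() above resolves a subsystem state from a file opened on a cgroupfs directory. A hedged sketch of how a caller might use it with a file descriptor handed in from userspace (example_css_from_fd is a hypothetical wrapper; a real caller must keep the file, or a css reference, pinned for as long as it uses the returned state):

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/file.h>

/* Hypothetical wrapper, for illustration only. */
static struct cgroup_subsys_state *example_css_from_fd(int fd, int subsys_id)
{
	struct file *file = fget(fd);
	struct cgroup_subsys_state *css;

	if (!file)
		return ERR_PTR(-EBADF);

	css = cgroup_css_from_dir(file, subsys_id);

	/* NOTE: real callers hold the file (or a css ref) while using css. */
	fput(file);
	return css;
}
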
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a8..38b1d2c1cbe8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
53} 53}
54 54
55static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
56{
57 memset(txc, 0, sizeof(struct timex));
58
59 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
60 __get_user(txc->modes, &utp->modes) ||
61 __get_user(txc->offset, &utp->offset) ||
62 __get_user(txc->freq, &utp->freq) ||
63 __get_user(txc->maxerror, &utp->maxerror) ||
64 __get_user(txc->esterror, &utp->esterror) ||
65 __get_user(txc->status, &utp->status) ||
66 __get_user(txc->constant, &utp->constant) ||
67 __get_user(txc->precision, &utp->precision) ||
68 __get_user(txc->tolerance, &utp->tolerance) ||
69 __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
70 __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
71 __get_user(txc->tick, &utp->tick) ||
72 __get_user(txc->ppsfreq, &utp->ppsfreq) ||
73 __get_user(txc->jitter, &utp->jitter) ||
74 __get_user(txc->shift, &utp->shift) ||
75 __get_user(txc->stabil, &utp->stabil) ||
76 __get_user(txc->jitcnt, &utp->jitcnt) ||
77 __get_user(txc->calcnt, &utp->calcnt) ||
78 __get_user(txc->errcnt, &utp->errcnt) ||
79 __get_user(txc->stbcnt, &utp->stbcnt))
80 return -EFAULT;
81
82 return 0;
83}
84
85static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
86{
87 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
88 __put_user(txc->modes, &utp->modes) ||
89 __put_user(txc->offset, &utp->offset) ||
90 __put_user(txc->freq, &utp->freq) ||
91 __put_user(txc->maxerror, &utp->maxerror) ||
92 __put_user(txc->esterror, &utp->esterror) ||
93 __put_user(txc->status, &utp->status) ||
94 __put_user(txc->constant, &utp->constant) ||
95 __put_user(txc->precision, &utp->precision) ||
96 __put_user(txc->tolerance, &utp->tolerance) ||
97 __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
98 __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
99 __put_user(txc->tick, &utp->tick) ||
100 __put_user(txc->ppsfreq, &utp->ppsfreq) ||
101 __put_user(txc->jitter, &utp->jitter) ||
102 __put_user(txc->shift, &utp->shift) ||
103 __put_user(txc->stabil, &utp->stabil) ||
104 __put_user(txc->jitcnt, &utp->jitcnt) ||
105 __put_user(txc->calcnt, &utp->calcnt) ||
106 __put_user(txc->errcnt, &utp->errcnt) ||
107 __put_user(txc->stbcnt, &utp->stbcnt) ||
108 __put_user(txc->tai, &utp->tai))
109 return -EFAULT;
110 return 0;
111}
112
55asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, 113asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
56 struct timezone __user *tz) 114 struct timezone __user *tz)
57{ 115{
@@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
617 return err; 675 return err;
618} 676}
619 677
678long compat_sys_clock_adjtime(clockid_t which_clock,
679 struct compat_timex __user *utp)
680{
681 struct timex txc;
682 mm_segment_t oldfs;
683 int err, ret;
684
685 err = compat_get_timex(&txc, utp);
686 if (err)
687 return err;
688
689 oldfs = get_fs();
690 set_fs(KERNEL_DS);
691 ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
692 set_fs(oldfs);
693
694 err = compat_put_timex(utp, &txc);
695 if (err)
696 return err;
697
698 return ret;
699}
700
620long compat_sys_clock_getres(clockid_t which_clock, 701long compat_sys_clock_getres(clockid_t which_clock,
621 struct compat_timespec __user *tp) 702 struct compat_timespec __user *tp)
622{ 703{
@@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
951asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1032asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
952{ 1033{
953 struct timex txc; 1034 struct timex txc;
954 int ret; 1035 int err, ret;
955
956 memset(&txc, 0, sizeof(struct timex));
957 1036
958 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || 1037 err = compat_get_timex(&txc, utp);
959 __get_user(txc.modes, &utp->modes) || 1038 if (err)
960 __get_user(txc.offset, &utp->offset) || 1039 return err;
961 __get_user(txc.freq, &utp->freq) ||
962 __get_user(txc.maxerror, &utp->maxerror) ||
963 __get_user(txc.esterror, &utp->esterror) ||
964 __get_user(txc.status, &utp->status) ||
965 __get_user(txc.constant, &utp->constant) ||
966 __get_user(txc.precision, &utp->precision) ||
967 __get_user(txc.tolerance, &utp->tolerance) ||
968 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
969 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
970 __get_user(txc.tick, &utp->tick) ||
971 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
972 __get_user(txc.jitter, &utp->jitter) ||
973 __get_user(txc.shift, &utp->shift) ||
974 __get_user(txc.stabil, &utp->stabil) ||
975 __get_user(txc.jitcnt, &utp->jitcnt) ||
976 __get_user(txc.calcnt, &utp->calcnt) ||
977 __get_user(txc.errcnt, &utp->errcnt) ||
978 __get_user(txc.stbcnt, &utp->stbcnt))
979 return -EFAULT;
980 1040
981 ret = do_adjtimex(&txc); 1041 ret = do_adjtimex(&txc);
982 1042
983 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || 1043 err = compat_put_timex(utp, &txc);
984 __put_user(txc.modes, &utp->modes) || 1044 if (err)
985 __put_user(txc.offset, &utp->offset) || 1045 return err;
986 __put_user(txc.freq, &utp->freq) ||
987 __put_user(txc.maxerror, &utp->maxerror) ||
988 __put_user(txc.esterror, &utp->esterror) ||
989 __put_user(txc.status, &utp->status) ||
990 __put_user(txc.constant, &utp->constant) ||
991 __put_user(txc.precision, &utp->precision) ||
992 __put_user(txc.tolerance, &utp->tolerance) ||
993 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
994 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
995 __put_user(txc.tick, &utp->tick) ||
996 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
997 __put_user(txc.jitter, &utp->jitter) ||
998 __put_user(txc.shift, &utp->shift) ||
999 __put_user(txc.stabil, &utp->stabil) ||
1000 __put_user(txc.jitcnt, &utp->jitcnt) ||
1001 __put_user(txc.calcnt, &utp->calcnt) ||
1002 __put_user(txc.errcnt, &utp->errcnt) ||
1003 __put_user(txc.stbcnt, &utp->stbcnt) ||
1004 __put_user(txc.tai, &utp->tai))
1005 ret = -EFAULT;
1006 1046
1007 return ret; 1047 return ret;
1008} 1048}
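
For context on what the refactored compat path above services: a 32-bit binary calling adjtimex(2), or the new clock_adjtime(2), has its struct converted by compat_get_timex()/compat_put_timex(). A minimal userspace query, shown with plain adjtimex() and modes == 0 so nothing is actually changed:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* read-only query */
	int state = adjtimex(&tx);

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	/* offset is in microseconds (nanoseconds if STA_NANO is set) */
	printf("clock state %d, offset %ld, freq %ld\n",
	       state, tx.offset, tx.freq);
	return 0;
}
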
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc5556140..12b7458f23b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void)
126#else /* #if CONFIG_HOTPLUG_CPU */ 126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {} 127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {} 128static void cpu_hotplug_done(void) {}
129#endif /* #esle #if CONFIG_HOTPLUG_CPU */ 129#endif /* #else #if CONFIG_HOTPLUG_CPU */
130 130
131/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
132int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
160{ 160{
161 BUG_ON(cpu_notify(val, v)); 161 BUG_ON(cpu_notify(val, v));
162} 162}
163
164EXPORT_SYMBOL(register_cpu_notifier); 163EXPORT_SYMBOL(register_cpu_notifier);
165 164
166void __ref unregister_cpu_notifier(struct notifier_block *nb) 165void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param)
205 return err; 204 return err;
206 205
207 cpu_notify(CPU_DYING | param->mod, param->hcpu); 206 cpu_notify(CPU_DYING | param->mod, param->hcpu);
208
209 return 0; 207 return 0;
210} 208}
211 209
@@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
227 return -EINVAL; 225 return -EINVAL;
228 226
229 cpu_hotplug_begin(); 227 cpu_hotplug_begin();
228
230 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 229 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
231 if (err) { 230 if (err) {
232 nr_calls--; 231 nr_calls--;
@@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
304 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 303 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
305 if (ret) { 304 if (ret) {
306 nr_calls--; 305 nr_calls--;
307 printk("%s: attempt to bring up CPU %u failed\n", 306 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
308 __func__, cpu); 307 __func__, cpu);
309 goto out_notify; 308 goto out_notify;
310 } 309 }
@@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
450 if (cpumask_empty(frozen_cpus)) 449 if (cpumask_empty(frozen_cpus))
451 goto out; 450 goto out;
452 451
453 printk("Enabling non-boot CPUs ...\n"); 452 printk(KERN_INFO "Enabling non-boot CPUs ...\n");
454 453
455 arch_enable_nonboot_cpus_begin(); 454 arch_enable_nonboot_cpus_begin();
456 455
457 for_each_cpu(cpu, frozen_cpus) { 456 for_each_cpu(cpu, frozen_cpus) {
458 error = _cpu_up(cpu, 1); 457 error = _cpu_up(cpu, 1);
459 if (!error) { 458 if (!error) {
460 printk("CPU%d is up\n", cpu); 459 printk(KERN_INFO "CPU%d is up\n", cpu);
461 continue; 460 continue;
462 } 461 }
463 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 462 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
509 */ 508 */
510 509
511/* cpu_bit_bitmap[0] is empty - so we can back into it */ 510/* cpu_bit_bitmap[0] is empty - so we can back into it */
512#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) 511#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
513#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) 512#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
514#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) 513#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
515#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) 514#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935c2ad8..33eee16addb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
1015 struct cpuset *cs; 1015 struct cpuset *cs;
1016 int migrate; 1016 int migrate;
1017 const nodemask_t *oldmem = scan->data; 1017 const nodemask_t *oldmem = scan->data;
1018 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); 1018 static nodemask_t newmems; /* protected by cgroup_mutex */
1019
1020 if (!newmems)
1021 return;
1022 1019
1023 cs = cgroup_cs(scan->cg); 1020 cs = cgroup_cs(scan->cg);
1024 guarantee_online_mems(cs, newmems); 1021 guarantee_online_mems(cs, &newmems);
1025
1026 cpuset_change_task_nodemask(p, newmems);
1027 1022
1028 NODEMASK_FREE(newmems); 1023 cpuset_change_task_nodemask(p, &newmems);
1029 1024
1030 mm = get_task_mm(p); 1025 mm = get_task_mm(p);
1031 if (!mm) 1026 if (!mm)
@@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1438 struct mm_struct *mm; 1433 struct mm_struct *mm;
1439 struct cpuset *cs = cgroup_cs(cont); 1434 struct cpuset *cs = cgroup_cs(cont);
1440 struct cpuset *oldcs = cgroup_cs(oldcont); 1435 struct cpuset *oldcs = cgroup_cs(oldcont);
1441 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); 1436 static nodemask_t to; /* protected by cgroup_mutex */
1442 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1443
1444 if (from == NULL || to == NULL)
1445 goto alloc_fail;
1446 1437
1447 if (cs == &top_cpuset) { 1438 if (cs == &top_cpuset) {
1448 cpumask_copy(cpus_attach, cpu_possible_mask); 1439 cpumask_copy(cpus_attach, cpu_possible_mask);
1449 } else { 1440 } else {
1450 guarantee_online_cpus(cs, cpus_attach); 1441 guarantee_online_cpus(cs, cpus_attach);
1451 } 1442 }
1452 guarantee_online_mems(cs, to); 1443 guarantee_online_mems(cs, &to);
1453 1444
1454 /* do per-task migration stuff possibly for each in the threadgroup */ 1445 /* do per-task migration stuff possibly for each in the threadgroup */
1455 cpuset_attach_task(tsk, to, cs); 1446 cpuset_attach_task(tsk, &to, cs);
1456 if (threadgroup) { 1447 if (threadgroup) {
1457 struct task_struct *c; 1448 struct task_struct *c;
1458 rcu_read_lock(); 1449 rcu_read_lock();
1459 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1450 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1460 cpuset_attach_task(c, to, cs); 1451 cpuset_attach_task(c, &to, cs);
1461 } 1452 }
1462 rcu_read_unlock(); 1453 rcu_read_unlock();
1463 } 1454 }
1464 1455
1465 /* change mm; only needs to be done once even if threadgroup */ 1456 /* change mm; only needs to be done once even if threadgroup */
1466 *from = oldcs->mems_allowed; 1457 to = cs->mems_allowed;
1467 *to = cs->mems_allowed;
1468 mm = get_task_mm(tsk); 1458 mm = get_task_mm(tsk);
1469 if (mm) { 1459 if (mm) {
1470 mpol_rebind_mm(mm, to); 1460 mpol_rebind_mm(mm, &to);
1471 if (is_memory_migrate(cs)) 1461 if (is_memory_migrate(cs))
1472 cpuset_migrate_mm(mm, from, to); 1462 cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
1473 mmput(mm); 1463 mmput(mm);
1474 } 1464 }
1475
1476alloc_fail:
1477 NODEMASK_FREE(from);
1478 NODEMASK_FREE(to);
1479} 1465}
1480 1466
1481/* The various types of files and directories in a cpuset file system */ 1467/* The various types of files and directories in a cpuset file system */
@@ -1575,8 +1561,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1575 return -ENODEV; 1561 return -ENODEV;
1576 1562
1577 trialcs = alloc_trial_cpuset(cs); 1563 trialcs = alloc_trial_cpuset(cs);
1578 if (!trialcs) 1564 if (!trialcs) {
1579 return -ENOMEM; 1565 retval = -ENOMEM;
1566 goto out;
1567 }
1580 1568
1581 switch (cft->private) { 1569 switch (cft->private) {
1582 case FILE_CPULIST: 1570 case FILE_CPULIST:
@@ -1591,6 +1579,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1591 } 1579 }
1592 1580
1593 free_trial_cpuset(trialcs); 1581 free_trial_cpuset(trialcs);
1582out:
1594 cgroup_unlock(); 1583 cgroup_unlock();
1595 return retval; 1584 return retval;
1596} 1585}
@@ -1607,34 +1596,26 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1607 * across a page fault. 1596 * across a page fault.
1608 */ 1597 */
1609 1598
1610static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1599static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1611{ 1600{
1612 int ret; 1601 size_t count;
1613 1602
1614 mutex_lock(&callback_mutex); 1603 mutex_lock(&callback_mutex);
1615 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1604 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1616 mutex_unlock(&callback_mutex); 1605 mutex_unlock(&callback_mutex);
1617 1606
1618 return ret; 1607 return count;
1619} 1608}
1620 1609
1621static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1610static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1622{ 1611{
1623 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); 1612 size_t count;
1624 int retval;
1625
1626 if (mask == NULL)
1627 return -ENOMEM;
1628 1613
1629 mutex_lock(&callback_mutex); 1614 mutex_lock(&callback_mutex);
1630 *mask = cs->mems_allowed; 1615 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1631 mutex_unlock(&callback_mutex); 1616 mutex_unlock(&callback_mutex);
1632 1617
1633 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); 1618 return count;
1634
1635 NODEMASK_FREE(mask);
1636
1637 return retval;
1638} 1619}
1639 1620
1640static ssize_t cpuset_common_file_read(struct cgroup *cont, 1621static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1859,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1859 cs = cgroup_cs(cgroup); 1840 cs = cgroup_cs(cgroup);
1860 parent_cs = cgroup_cs(parent); 1841 parent_cs = cgroup_cs(parent);
1861 1842
1843 mutex_lock(&callback_mutex);
1862 cs->mems_allowed = parent_cs->mems_allowed; 1844 cs->mems_allowed = parent_cs->mems_allowed;
1863 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); 1845 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1846 mutex_unlock(&callback_mutex);
1864 return; 1847 return;
1865} 1848}
1866 1849
@@ -2063,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2063 struct cpuset *cp; /* scans cpusets being updated */ 2046 struct cpuset *cp; /* scans cpusets being updated */
2064 struct cpuset *child; /* scans child cpusets of cp */ 2047 struct cpuset *child; /* scans child cpusets of cp */
2065 struct cgroup *cont; 2048 struct cgroup *cont;
2066 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2049 static nodemask_t oldmems; /* protected by cgroup_mutex */
2067
2068 if (oldmems == NULL)
2069 return;
2070 2050
2071 list_add_tail((struct list_head *)&root->stack_list, &queue); 2051 list_add_tail((struct list_head *)&root->stack_list, &queue);
2072 2052
@@ -2083,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2083 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2063 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2084 continue; 2064 continue;
2085 2065
2086 *oldmems = cp->mems_allowed; 2066 oldmems = cp->mems_allowed;
2087 2067
2088 /* Remove offline cpus and mems from this cpuset. */ 2068 /* Remove offline cpus and mems from this cpuset. */
2089 mutex_lock(&callback_mutex); 2069 mutex_lock(&callback_mutex);
@@ -2099,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2099 remove_tasks_in_empty_cpuset(cp); 2079 remove_tasks_in_empty_cpuset(cp);
2100 else { 2080 else {
2101 update_tasks_cpumask(cp, NULL); 2081 update_tasks_cpumask(cp, NULL);
2102 update_tasks_nodemask(cp, oldmems, NULL); 2082 update_tasks_nodemask(cp, &oldmems, NULL);
2103 } 2083 }
2104 } 2084 }
2105 NODEMASK_FREE(oldmems);
2106} 2085}
2107 2086
2108/* 2087/*
@@ -2144,19 +2123,16 @@ void cpuset_update_active_cpus(void)
2144static int cpuset_track_online_nodes(struct notifier_block *self, 2123static int cpuset_track_online_nodes(struct notifier_block *self,
2145 unsigned long action, void *arg) 2124 unsigned long action, void *arg)
2146{ 2125{
2147 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2126 static nodemask_t oldmems; /* protected by cgroup_mutex */
2148
2149 if (oldmems == NULL)
2150 return NOTIFY_DONE;
2151 2127
2152 cgroup_lock(); 2128 cgroup_lock();
2153 switch (action) { 2129 switch (action) {
2154 case MEM_ONLINE: 2130 case MEM_ONLINE:
2155 *oldmems = top_cpuset.mems_allowed; 2131 oldmems = top_cpuset.mems_allowed;
2156 mutex_lock(&callback_mutex); 2132 mutex_lock(&callback_mutex);
2157 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2133 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2158 mutex_unlock(&callback_mutex); 2134 mutex_unlock(&callback_mutex);
2159 update_tasks_nodemask(&top_cpuset, oldmems, NULL); 2135 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2160 break; 2136 break;
2161 case MEM_OFFLINE: 2137 case MEM_OFFLINE:
2162 /* 2138 /*
@@ -2170,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2170 } 2146 }
2171 cgroup_unlock(); 2147 cgroup_unlock();
2172 2148
2173 NODEMASK_FREE(oldmems);
2174 return NOTIFY_OK; 2149 return NOTIFY_OK;
2175} 2150}
2176#endif 2151#endif
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
1#include <linux/kernel.h>
2#include <linux/crash_dump.h>
3#include <linux/init.h>
4#include <linux/errno.h>
5#include <linux/module.h>
6
7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need
9 * to know the amount of memory that the previous kernel used.
10 */
11unsigned long saved_max_pfn;
12
13/*
14 * stores the physical address of elf header of crash image
15 *
16 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
17 * is_kdump_kernel() to determine if we are booting after a panic. Hence put
18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
19 */
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21
22/*
23 * elfcorehdr= specifies the location of elf core header stored by the crashed
24 * kernel. This option will be passed by kexec loader to the capture kernel.
25 */
26static int __init setup_elfcorehdr(char *arg)
27{
28 char *end;
29 if (!arg)
30 return -EINVAL;
31 elfcorehdr_addr = memparse(arg, &end);
32 return end > arg ? 0 : -EINVAL;
33}
34early_param("elfcorehdr", setup_elfcorehdr);
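
The new kernel/crash_dump.c only parses elfcorehdr= and exports elfcorehdr_addr/saved_max_pfn; consumers key off is_kdump_kernel(), which becomes true once a valid address has been parsed. On the capture kernel's command line the kexec loader passes something like elfcorehdr=<address>, which setup_elfcorehdr() reads via memparse(). A minimal sketch of a consumer (example_detect_kdump is a hypothetical initcall, shown only to illustrate the API):

#include <linux/crash_dump.h>
#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical initcall, for illustration only. */
static int __init example_detect_kdump(void)
{
	if (is_kdump_kernel())
		pr_info("capture kernel: ELF core headers at 0x%llx\n",
			elfcorehdr_addr);
	return 0;
}
late_initcall(example_detect_kdump);
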
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a9d6dd53a6c..5557b55048df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
35static struct thread_group_cred init_tgcred = { 35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2), 36 .usage = ATOMIC_INIT(2),
37 .tgid = 0, 37 .tgid = 0,
38 .lock = SPIN_LOCK_UNLOCKED, 38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39}; 39};
40#endif 40#endif
41 41
@@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode)
741} 741}
742EXPORT_SYMBOL(set_create_files_as); 742EXPORT_SYMBOL(set_create_files_as);
743 743
744struct user_namespace *current_user_ns(void)
745{
746 return _current_user_ns();
747}
748EXPORT_SYMBOL(current_user_ns);
749
744#ifdef CONFIG_DEBUG_CREDENTIALS 750#ifdef CONFIG_DEBUG_CREDENTIALS
745 751
746bool creds_are_invalid(const struct cred *cred) 752bool creds_are_invalid(const struct cred *cred)
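
current_user_ns() is now out of line and exported, so modules can pair it with the ns_capable() checks added in kernel/capability.c. A one-line illustration (example_may_admin is a hypothetical name):

#include <linux/capability.h>
#include <linux/cred.h>

/* Hypothetical helper, for illustration only. */
static bool example_may_admin(void)
{
	return ns_capable(current_user_ns(), CAP_SYS_ADMIN);
}
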
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index cefd4a11f6d9..bad6786dee88 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -538,7 +538,7 @@ return_normal:
538 538
539 /* 539 /*
540 * For single stepping, try to only enter on the processor 540 * For single stepping, try to only enter on the processor
541 * that was single stepping. To gaurd against a deadlock, the 541 * that was single stepping. To guard against a deadlock, the
542 * kernel will only try for the value of sstep_tries before 542 * kernel will only try for the value of sstep_tries before
543 * giving up and continuing on. 543 * giving up and continuing on.
544 */ 544 */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe7..a11db956dd62 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1093 put_packet(remcom_out_buffer); 1093 put_packet(remcom_out_buffer);
1094 return 0; 1094 return 0;
1095} 1095}
1096
1097/**
1098 * gdbstub_exit - Send an exit message to GDB
1099 * @status: The exit code to report.
1100 */
1101void gdbstub_exit(int status)
1102{
1103 unsigned char checksum, ch, buffer[3];
1104 int loop;
1105
1106 buffer[0] = 'W';
1107 buffer[1] = hex_asc_hi(status);
1108 buffer[2] = hex_asc_lo(status);
1109
1110 dbg_io_ops->write_char('$');
1111 checksum = 0;
1112
1113 for (loop = 0; loop < 3; loop++) {
1114 ch = buffer[loop];
1115 checksum += ch;
1116 dbg_io_ops->write_char(ch);
1117 }
1118
1119 dbg_io_ops->write_char('#');
1120 dbg_io_ops->write_char(hex_asc_hi(checksum));
1121 dbg_io_ops->write_char(hex_asc_lo(checksum));
1122
1123 /* make sure the output is flushed, lest the bootloader clobber it */
1124 dbg_io_ops->flush();
1125}
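
gdbstub_exit() above emits a GDB remote serial protocol 'W' (process exited) packet: '$', the payload, '#', then two hex checksum digits, where the checksum is the modulo-256 sum of the payload bytes. A small host-side sketch of the same framing, for reference (for exit status 0 the bytes on the wire are "$W00#b7"):

#include <stdio.h>

int main(void)
{
	const char payload[] = "W00";	/* 'W' plus two hex digits of status */
	unsigned char csum = 0;
	const char *p;

	for (p = payload; *p; p++)
		csum += (unsigned char)*p;

	printf("$%s#%02x\n", payload, csum);	/* prints: $W00#b7 */
	return 0;
}
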
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index bd3e8e29caa3..be14779bcef6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic;
78static kdbtab_t *kdb_commands; 78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50 79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX; 80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50]; 81static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
@@ -441,9 +441,9 @@ static int kdb_check_regs(void)
441 * symbol name, and offset to the caller. 441 * symbol name, and offset to the caller.
442 * 442 *
443 * The argument may consist of a numeric value (decimal or 443 * The argument may consist of a numeric value (decimal or
444 * hexidecimal), a symbol name, a register name (preceeded by the 444 * hexidecimal), a symbol name, a register name (preceded by the
445 * percent sign), an environment variable with a numeric value 445 * percent sign), an environment variable with a numeric value
446 * (preceeded by a dollar sign) or a simple arithmetic expression 446 * (preceded by a dollar sign) or a simple arithmetic expression
447 * consisting of a symbol name, +/-, and a numeric constant value 447 * consisting of a symbol name, +/-, and a numeric constant value
448 * (offset). 448 * (offset).
449 * Parameters: 449 * Parameters:
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value)
1335 * error The hardware-defined error code 1335 * error The hardware-defined error code
1336 * reason2 kdb's current reason code. 1336 * reason2 kdb's current reason code.
1337 * Initially error but can change 1337 * Initially error but can change
1338 * acording to kdb state. 1338 * according to kdb state.
1339 * db_result Result code from break or debug point. 1339 * db_result Result code from break or debug point.
1340 * regs The exception frame at time of fault/breakpoint. 1340 * regs The exception frame at time of fault/breakpoint.
1341 * should always be valid. 1341 * should always be valid.
@@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void)
2892 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2892 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2893 kdb_register_repeat("summary", kdb_summary, "", 2893 kdb_register_repeat("summary", kdb_summary, "",
2894 "Summarize the system", 4, KDB_REPEAT_NONE); 2894 "Summarize the system", 4, KDB_REPEAT_NONE);
2895 kdb_register_repeat("per_cpu", kdb_per_cpu, "", 2895 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2896 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2896 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2897 kdb_register_repeat("grephelp", kdb_grep_help, "", 2897 kdb_register_repeat("grephelp", kdb_grep_help, "",
2898 "Display help on | grep", 0, KDB_REPEAT_NONE); 2898 "Display help on | grep", 0, KDB_REPEAT_NONE);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6b2485dcb050..5532dd37aa86 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
545 * Mask for process state. 545 * Mask for process state.
546 * Notes: 546 * Notes:
547 * The mask folds data from several sources into a single long value, so 547 * The mask folds data from several sources into a single long value, so
548 * be carefull not to overlap the bits. TASK_* bits are in the LSB, 548 * be careful not to overlap the bits. TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there 549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be 550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in 551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..8dd874181542 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
841 /* Let father know we died 841 /* Let father know we died
842 * 842 *
843 * Thread signals are configurable, but you aren't going to use 843 * Thread signals are configurable, but you aren't going to use
844 * that to send signals to arbitary processes. 844 * that to send signals to arbitrary processes.
845 * That stops right now. 845 * That stops right now.
846 * 846 *
847 * If the parent exec id doesn't match the exec id we saved 847 * If the parent exec id doesn't match the exec id we saved
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
908 profile_task_exit(tsk); 908 profile_task_exit(tsk);
909 909
910 WARN_ON(atomic_read(&tsk->fs_excl)); 910 WARN_ON(atomic_read(&tsk->fs_excl));
911 WARN_ON(blk_needs_flush_plug(tsk));
911 912
912 if (unlikely(in_interrupt())) 913 if (unlikely(in_interrupt()))
913 panic("Aiee, killing interrupt handler!"); 914 panic("Aiee, killing interrupt handler!");
@@ -1015,7 +1016,7 @@ NORET_TYPE void do_exit(long code)
1015 /* 1016 /*
1016 * FIXME: do that only when needed, using sched_exit tracepoint 1017 * FIXME: do that only when needed, using sched_exit tracepoint
1017 */ 1018 */
1018 flush_ptrace_hw_breakpoint(tsk); 1019 ptrace_put_breakpoints(tsk);
1019 1020
1020 exit_notify(tsk, group_dead); 1021 exit_notify(tsk, group_dead);
1021#ifdef CONFIG_NUMA 1022#ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152ddc..e7548dee636b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/tracehook.h> 40#include <linux/tracehook.h>
41#include <linux/futex.h> 41#include <linux/futex.h>
42#include <linux/compat.h> 42#include <linux/compat.h>
43#include <linux/kthread.h>
43#include <linux/task_io_accounting_ops.h> 44#include <linux/task_io_accounting_ops.h>
44#include <linux/rcupdate.h> 45#include <linux/rcupdate.h>
45#include <linux/ptrace.h> 46#include <linux/ptrace.h>
@@ -109,20 +110,25 @@ int nr_processes(void)
109} 110}
110 111
111#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 112#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
112# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 113# define alloc_task_struct_node(node) \
113# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 114 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
115# define free_task_struct(tsk) \
116 kmem_cache_free(task_struct_cachep, (tsk))
114static struct kmem_cache *task_struct_cachep; 117static struct kmem_cache *task_struct_cachep;
115#endif 118#endif
116 119
117#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR 120#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
118static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) 121static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
122 int node)
119{ 123{
120#ifdef CONFIG_DEBUG_STACK_USAGE 124#ifdef CONFIG_DEBUG_STACK_USAGE
121 gfp_t mask = GFP_KERNEL | __GFP_ZERO; 125 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
122#else 126#else
123 gfp_t mask = GFP_KERNEL; 127 gfp_t mask = GFP_KERNEL;
124#endif 128#endif
125 return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); 129 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
130
131 return page ? page_address(page) : NULL;
126} 132}
127 133
128static inline void free_thread_info(struct thread_info *ti) 134static inline void free_thread_info(struct thread_info *ti)
@@ -193,6 +199,7 @@ void __put_task_struct(struct task_struct *tsk)
193 if (!profile_handoff_task(tsk)) 199 if (!profile_handoff_task(tsk))
194 free_task(tsk); 200 free_task(tsk);
195} 201}
202EXPORT_SYMBOL_GPL(__put_task_struct);
196 203
197/* 204/*
198 * macro override instead of weak attribute alias, to workaround 205 * macro override instead of weak attribute alias, to workaround
@@ -248,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
248 struct task_struct *tsk; 255 struct task_struct *tsk;
249 struct thread_info *ti; 256 struct thread_info *ti;
250 unsigned long *stackend; 257 unsigned long *stackend;
251 258 int node = tsk_fork_get_node(orig);
252 int err; 259 int err;
253 260
254 prepare_to_copy(orig); 261 prepare_to_copy(orig);
255 262
256 tsk = alloc_task_struct(); 263 tsk = alloc_task_struct_node(node);
257 if (!tsk) 264 if (!tsk)
258 return NULL; 265 return NULL;
259 266
260 ti = alloc_thread_info(tsk); 267 ti = alloc_thread_info_node(tsk, node);
261 if (!ti) { 268 if (!ti) {
262 free_task_struct(tsk); 269 free_task_struct(tsk);
263 return NULL; 270 return NULL;
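
The dup_task_struct() hunk above now threads the NUMA node of the forking task (tsk_fork_get_node()) through both allocations and still unwinds cleanly if the second one fails. A minimal user-space sketch of that shape, with node awareness reduced to a plain parameter; all names (alloc_task_on_node, alloc_stack_on_node, dup_task) are invented for illustration and are not kernel APIs:

#include <stdio.h>
#include <stdlib.h>

struct task  { int node; };
struct stack { int node; };

/* Stand-ins for alloc_task_struct_node()/alloc_thread_info_node(). */
static struct task *alloc_task_on_node(int node)
{
	struct task *t = malloc(sizeof(*t));
	if (t)
		t->node = node;
	return t;
}

static struct stack *alloc_stack_on_node(int node)
{
	struct stack *s = malloc(sizeof(*s));
	if (s)
		s->node = node;
	return s;
}

/* Mirrors dup_task_struct(): both allocations use the same node, and a
 * failure of the second frees the first before returning NULL. */
static struct task *dup_task(int node, struct stack **stackp)
{
	struct task *t = alloc_task_on_node(node);
	if (!t)
		return NULL;

	*stackp = alloc_stack_on_node(node);
	if (!*stackp) {
		free(t);
		return NULL;
	}
	return t;
}

int main(void)
{
	struct stack *st = NULL;
	struct task *t = dup_task(0, &st);

	if (t)
		printf("task and stack allocated on node %d\n", t->node);
	free(st);
	free(t);
	return 0;
}
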
@@ -1180,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1180 pid = alloc_pid(p->nsproxy->pid_ns); 1187 pid = alloc_pid(p->nsproxy->pid_ns);
1181 if (!pid) 1188 if (!pid)
1182 goto bad_fork_cleanup_io; 1189 goto bad_fork_cleanup_io;
1183
1184 if (clone_flags & CLONE_NEWPID) {
1185 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1186 if (retval < 0)
1187 goto bad_fork_free_pid;
1188 }
1189 } 1190 }
1190 1191
1191 p->pid = pid_nr(pid); 1192 p->pid = pid_nr(pid);
@@ -1204,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1204 * Clear TID on mm_release()? 1205 * Clear TID on mm_release()?
1205 */ 1206 */
1206 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1207 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1208#ifdef CONFIG_BLOCK
1209 p->plug = NULL;
1210#endif
1207#ifdef CONFIG_FUTEX 1211#ifdef CONFIG_FUTEX
1208 p->robust_list = NULL; 1212 p->robust_list = NULL;
1209#ifdef CONFIG_COMPAT 1213#ifdef CONFIG_COMPAT
@@ -1289,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1289 tracehook_finish_clone(p, clone_flags, trace); 1293 tracehook_finish_clone(p, clone_flags, trace);
1290 1294
1291 if (thread_group_leader(p)) { 1295 if (thread_group_leader(p)) {
1292 if (clone_flags & CLONE_NEWPID) 1296 if (is_child_reaper(pid))
1293 p->nsproxy->pid_ns->child_reaper = p; 1297 p->nsproxy->pid_ns->child_reaper = p;
1294 1298
1295 p->signal->leader_pid = pid; 1299 p->signal->leader_pid = pid;
@@ -1512,38 +1516,24 @@ void __init proc_caches_init(void)
1512} 1516}
1513 1517
1514/* 1518/*
1515 * Check constraints on flags passed to the unshare system call and 1519 * Check constraints on flags passed to the unshare system call.
1516 * force unsharing of additional process context as appropriate.
1517 */ 1520 */
1518static void check_unshare_flags(unsigned long *flags_ptr) 1521static int check_unshare_flags(unsigned long unshare_flags)
1519{ 1522{
1523 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1524 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1525 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1526 return -EINVAL;
1520 /* 1527 /*
1521 * If unsharing a thread from a thread group, must also 1528 * Not implemented, but pretend it works if there is nothing to
1522 * unshare vm. 1529 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
1523 */ 1530 * needs to unshare vm.
1524 if (*flags_ptr & CLONE_THREAD)
1525 *flags_ptr |= CLONE_VM;
1526
1527 /*
1528 * If unsharing vm, must also unshare signal handlers.
1529 */
1530 if (*flags_ptr & CLONE_VM)
1531 *flags_ptr |= CLONE_SIGHAND;
1532
1533 /*
1534 * If unsharing namespace, must also unshare filesystem information.
1535 */ 1531 */
1536 if (*flags_ptr & CLONE_NEWNS) 1532 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
1537 *flags_ptr |= CLONE_FS; 1533 /* FIXME: get_task_mm() increments ->mm_users */
1538} 1534 if (atomic_read(&current->mm->mm_users) > 1)
1539 1535 return -EINVAL;
1540/* 1536 }
1541 * Unsharing of tasks created with CLONE_THREAD is not supported yet
1542 */
1543static int unshare_thread(unsigned long unshare_flags)
1544{
1545 if (unshare_flags & CLONE_THREAD)
1546 return -EINVAL;
1547 1537
1548 return 0; 1538 return 0;
1549} 1539}
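
The rewritten check_unshare_flags() above is pure bit-mask validation: unknown flags are rejected outright, and the thread/sighand/vm flags only "succeed" when the address space is not actually shared. A compilable user-space sketch of those two checks; the XCLONE_* constants are made-up values, not the kernel's CLONE_* definitions, and mm_users stands in for current->mm->mm_users:

#include <stdio.h>
#include <errno.h>

/* Illustrative flag values only; the real CLONE_* constants differ. */
#define XCLONE_FS      0x001
#define XCLONE_FILES   0x002
#define XCLONE_SIGHAND 0x004
#define XCLONE_VM      0x008
#define XCLONE_THREAD  0x010
#define XCLONE_NEWNS   0x020

#define SUPPORTED (XCLONE_FS | XCLONE_FILES | XCLONE_SIGHAND | \
		   XCLONE_VM | XCLONE_THREAD | XCLONE_NEWNS)

static int check_unshare_flags(unsigned long flags, int mm_users)
{
	/* Reject any bit outside the supported set. */
	if (flags & ~SUPPORTED)
		return -EINVAL;
	/* Pretend thread/sighand/vm unsharing works only when there is
	 * nothing to unshare, i.e. the mm is not shared with anyone. */
	if (flags & (XCLONE_THREAD | XCLONE_SIGHAND | XCLONE_VM)) {
		if (mm_users > 1)
			return -EINVAL;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", check_unshare_flags(XCLONE_VM, 1)); /* 0 */
	printf("%d\n", check_unshare_flags(XCLONE_VM, 3)); /* -EINVAL */
	printf("%d\n", check_unshare_flags(0x8000, 1));    /* -EINVAL */
	return 0;
}
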
@@ -1570,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1570} 1560}
1571 1561
1572/* 1562/*
1573 * Unsharing of sighand is not supported yet
1574 */
1575static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1576{
1577 struct sighand_struct *sigh = current->sighand;
1578
1579 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1580 return -EINVAL;
1581 else
1582 return 0;
1583}
1584
1585/*
1586 * Unshare vm if it is being shared
1587 */
1588static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1589{
1590 struct mm_struct *mm = current->mm;
1591
1592 if ((unshare_flags & CLONE_VM) &&
1593 (mm && atomic_read(&mm->mm_users) > 1)) {
1594 return -EINVAL;
1595 }
1596
1597 return 0;
1598}
1599
1600/*
1601 * Unshare file descriptor table if it is being shared 1563 * Unshare file descriptor table if it is being shared
1602 */ 1564 */
1603static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) 1565static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1625,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1625 */ 1587 */
1626SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) 1588SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1627{ 1589{
1628 int err = 0;
1629 struct fs_struct *fs, *new_fs = NULL; 1590 struct fs_struct *fs, *new_fs = NULL;
1630 struct sighand_struct *new_sigh = NULL;
1631 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1632 struct files_struct *fd, *new_fd = NULL; 1591 struct files_struct *fd, *new_fd = NULL;
1633 struct nsproxy *new_nsproxy = NULL; 1592 struct nsproxy *new_nsproxy = NULL;
1634 int do_sysvsem = 0; 1593 int do_sysvsem = 0;
1594 int err;
1635 1595
1636 check_unshare_flags(&unshare_flags); 1596 err = check_unshare_flags(unshare_flags);
1637 1597 if (err)
1638 /* Return -EINVAL for all unsupported flags */
1639 err = -EINVAL;
1640 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1641 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1642 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1643 goto bad_unshare_out; 1598 goto bad_unshare_out;
1644 1599
1645 /* 1600 /*
1601 * If unsharing namespace, must also unshare filesystem information.
1602 */
1603 if (unshare_flags & CLONE_NEWNS)
1604 unshare_flags |= CLONE_FS;
1605 /*
1646 * CLONE_NEWIPC must also detach from the undolist: after switching 1606 * CLONE_NEWIPC must also detach from the undolist: after switching
1647 * to a new ipc namespace, the semaphore arrays from the old 1607 * to a new ipc namespace, the semaphore arrays from the old
1648 * namespace are unreachable. 1608 * namespace are unreachable.
1649 */ 1609 */
1650 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1610 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1651 do_sysvsem = 1; 1611 do_sysvsem = 1;
1652 if ((err = unshare_thread(unshare_flags)))
1653 goto bad_unshare_out;
1654 if ((err = unshare_fs(unshare_flags, &new_fs))) 1612 if ((err = unshare_fs(unshare_flags, &new_fs)))
1655 goto bad_unshare_cleanup_thread; 1613 goto bad_unshare_out;
1656 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1657 goto bad_unshare_cleanup_fs;
1658 if ((err = unshare_vm(unshare_flags, &new_mm)))
1659 goto bad_unshare_cleanup_sigh;
1660 if ((err = unshare_fd(unshare_flags, &new_fd))) 1614 if ((err = unshare_fd(unshare_flags, &new_fd)))
1661 goto bad_unshare_cleanup_vm; 1615 goto bad_unshare_cleanup_fs;
1662 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1616 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1663 new_fs))) 1617 new_fs)))
1664 goto bad_unshare_cleanup_fd; 1618 goto bad_unshare_cleanup_fd;
1665 1619
1666 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { 1620 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
1667 if (do_sysvsem) { 1621 if (do_sysvsem) {
1668 /* 1622 /*
1669 * CLONE_SYSVSEM is equivalent to sys_exit(). 1623 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1689,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1689 spin_unlock(&fs->lock); 1643 spin_unlock(&fs->lock);
1690 } 1644 }
1691 1645
1692 if (new_mm) {
1693 mm = current->mm;
1694 active_mm = current->active_mm;
1695 current->mm = new_mm;
1696 current->active_mm = new_mm;
1697 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1698 atomic_dec(&mm->oom_disable_count);
1699 atomic_inc(&new_mm->oom_disable_count);
1700 }
1701 activate_mm(active_mm, new_mm);
1702 new_mm = mm;
1703 }
1704
1705 if (new_fd) { 1646 if (new_fd) {
1706 fd = current->files; 1647 fd = current->files;
1707 current->files = new_fd; 1648 current->files = new_fd;
@@ -1718,20 +1659,10 @@ bad_unshare_cleanup_fd:
1718 if (new_fd) 1659 if (new_fd)
1719 put_files_struct(new_fd); 1660 put_files_struct(new_fd);
1720 1661
1721bad_unshare_cleanup_vm:
1722 if (new_mm)
1723 mmput(new_mm);
1724
1725bad_unshare_cleanup_sigh:
1726 if (new_sigh)
1727 if (atomic_dec_and_test(&new_sigh->count))
1728 kmem_cache_free(sighand_cachep, new_sigh);
1729
1730bad_unshare_cleanup_fs: 1662bad_unshare_cleanup_fs:
1731 if (new_fs) 1663 if (new_fs)
1732 free_fs_struct(new_fs); 1664 free_fs_struct(new_fs);
1733 1665
1734bad_unshare_cleanup_thread:
1735bad_unshare_out: 1666bad_unshare_out:
1736 return err; 1667 return err;
1737} 1668}
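
With the vm and sighand stages removed, the unwind path of sys_unshare() collapses to two cleanup labels. A compact user-space model of that goto-ladder style of error handling; unshare_fs/unshare_fd here are placeholders that just allocate, not the kernel helpers:

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

/* Placeholder "unshare one resource" helpers: return 0 and hand back a
 * new object, or an -errno without allocating anything. */
static int unshare_fs(void **out) { *out = malloc(1); return *out ? 0 : -ENOMEM; }
static int unshare_fd(void **out) { *out = malloc(1); return *out ? 0 : -ENOMEM; }

static int do_unshare(void)
{
	void *new_fs = NULL, *new_fd = NULL;
	int err;

	err = unshare_fs(&new_fs);
	if (err)
		goto bad_unshare_out;
	err = unshare_fd(&new_fd);
	if (err)
		goto bad_unshare_cleanup_fs;

	/* ... install new_fs/new_fd into the current task here ... */
	printf("unshare succeeded\n");
	free(new_fd);
	free(new_fs);
	return 0;

bad_unshare_cleanup_fs:
	free(new_fs);
bad_unshare_out:
	return err;
}

int main(void) { return do_unshare() ? 1 : 0; }
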
diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28accd6..fe28dc282eae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
381 return NULL; 381 return NULL;
382} 382}
383 383
384static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 384static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
385 u32 uval, u32 newval)
385{ 386{
386 u32 curval; 387 int ret;
387 388
388 pagefault_disable(); 389 pagefault_disable();
389 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 390 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
390 pagefault_enable(); 391 pagefault_enable();
391 392
392 return curval; 393 return ret;
393} 394}
394 395
395static int get_futex_value_locked(u32 *dest, u32 __user *from) 396static int get_futex_value_locked(u32 *dest, u32 __user *from)
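
The hunk above changes the calling convention of cmpxchg_futex_value_locked(): the return value is now an error code and the previous futex word comes back through a pointer, so callers no longer overload the value with -EFAULT. A user-space sketch of the same convention built on GCC's __atomic_compare_exchange_n(); plain memory cannot fault, so the error path is only indicated in a comment:

#include <stdio.h>
#include <stdint.h>

/* Returns 0 on success (*curval holds the old word); a real futex
 * implementation would return -EFAULT when the user access faults. */
static int cmpxchg_futex_value(uint32_t *curval, uint32_t *uaddr,
			       uint32_t uval, uint32_t newval)
{
	uint32_t expected = uval;

	__atomic_compare_exchange_n(uaddr, &expected, newval, 0,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	*curval = expected;	/* old value, whether or not we swapped */
	return 0;
}

int main(void)
{
	uint32_t futex = 0, curval;

	if (cmpxchg_futex_value(&curval, &futex, 0, 1234))
		return 1;	/* fault path */
	if (curval != 0)
		return 1;	/* lost the race; a caller would retry */
	printf("acquired, futex=%u\n", futex);
	return 0;
}
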
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
674 struct task_struct *task, int set_waiters) 675 struct task_struct *task, int set_waiters)
675{ 676{
676 int lock_taken, ret, ownerdied = 0; 677 int lock_taken, ret, ownerdied = 0;
677 u32 uval, newval, curval; 678 u32 uval, newval, curval, vpid = task_pid_vnr(task);
678 679
679retry: 680retry:
680 ret = lock_taken = 0; 681 ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
684 * (by doing a 0 -> TID atomic cmpxchg), while holding all 685 * (by doing a 0 -> TID atomic cmpxchg), while holding all
685 * the locks. It will most likely not succeed. 686 * the locks. It will most likely not succeed.
686 */ 687 */
687 newval = task_pid_vnr(task); 688 newval = vpid;
688 if (set_waiters) 689 if (set_waiters)
689 newval |= FUTEX_WAITERS; 690 newval |= FUTEX_WAITERS;
690 691
691 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 692 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
692
693 if (unlikely(curval == -EFAULT))
694 return -EFAULT; 693 return -EFAULT;
695 694
696 /* 695 /*
697 * Detect deadlocks. 696 * Detect deadlocks.
698 */ 697 */
699 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) 698 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
700 return -EDEADLK; 699 return -EDEADLK;
701 700
702 /* 701 /*
@@ -723,14 +722,12 @@ retry:
723 */ 722 */
724 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 723 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
725 /* Keep the OWNER_DIED bit */ 724 /* Keep the OWNER_DIED bit */
726 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); 725 newval = (curval & ~FUTEX_TID_MASK) | vpid;
727 ownerdied = 0; 726 ownerdied = 0;
728 lock_taken = 1; 727 lock_taken = 1;
729 } 728 }
730 729
731 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 730 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
732
733 if (unlikely(curval == -EFAULT))
734 return -EFAULT; 731 return -EFAULT;
735 if (unlikely(curval != uval)) 732 if (unlikely(curval != uval))
736 goto retry; 733 goto retry;
@@ -775,6 +772,24 @@ retry:
775 return ret; 772 return ret;
776} 773}
777 774
775/**
776 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
777 * @q: The futex_q to unqueue
778 *
779 * The q->lock_ptr must not be NULL and must be held by the caller.
780 */
781static void __unqueue_futex(struct futex_q *q)
782{
783 struct futex_hash_bucket *hb;
784
785 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
786 || WARN_ON(plist_node_empty(&q->list)))
787 return;
788
789 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
790 plist_del(&q->list, &hb->chain);
791}
792
778/* 793/*
779 * The hash bucket lock must be held when this is called. 794 * The hash bucket lock must be held when this is called.
780 * Afterwards, the futex_q must not be accessed. 795 * Afterwards, the futex_q must not be accessed.
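
The new __unqueue_futex() above recovers the hash bucket from q->lock_ptr with container_of() instead of keeping a separate back-pointer. A small self-contained illustration of that trick; struct bucket/struct waiter and the chain_len counter are invented stand-ins for futex_hash_bucket and its plist chain:

#include <stdio.h>
#include <stddef.h>
#include <pthread.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bucket {
	int chain_len;			/* stands in for the plist chain */
	pthread_mutex_t lock;
};

struct waiter {
	pthread_mutex_t *lock_ptr;	/* points at the owning bucket's lock */
};

/* Same idea as __unqueue_futex(): the waiter only stores a pointer to
 * the bucket's lock, and container_of() recovers the bucket itself. */
static void unqueue(struct waiter *w)
{
	struct bucket *hb = container_of(w->lock_ptr, struct bucket, lock);

	hb->chain_len--;	/* plist_del(&q->list, &hb->chain) in the kernel */
}

int main(void)
{
	struct bucket hb = { .chain_len = 1, .lock = PTHREAD_MUTEX_INITIALIZER };
	struct waiter w = { .lock_ptr = &hb.lock };

	unqueue(&w);
	printf("chain length now %d\n", hb.chain_len);
	return 0;
}
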
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
792 */ 807 */
793 get_task_struct(p); 808 get_task_struct(p);
794 809
795 plist_del(&q->list, &q->list.plist); 810 __unqueue_futex(q);
796 /* 811 /*
797 * The waiting task can free the futex_q as soon as 812 * The waiting task can free the futex_q as soon as
798 * q->lock_ptr = NULL is written, without taking any locks. A 813 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
843 858
844 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 859 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
845 860
846 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 861 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
847
848 if (curval == -EFAULT)
849 ret = -EFAULT; 862 ret = -EFAULT;
850 else if (curval != uval) 863 else if (curval != uval)
851 ret = -EINVAL; 864 ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
880 * There is no waiter, so we unlock the futex. The owner died 893 * There is no waiter, so we unlock the futex. The owner died
881 * bit has not to be preserved here. We are the owner: 894 * bit has not to be preserved here. We are the owner:
882 */ 895 */
883 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 896 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
884 897 return -EFAULT;
885 if (oldval == -EFAULT)
886 return oldval;
887 if (oldval != uval) 898 if (oldval != uval)
888 return -EAGAIN; 899 return -EAGAIN;
889 900
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1071 plist_del(&q->list, &hb1->chain); 1082 plist_del(&q->list, &hb1->chain);
1072 plist_add(&q->list, &hb2->chain); 1083 plist_add(&q->list, &hb2->chain);
1073 q->lock_ptr = &hb2->lock; 1084 q->lock_ptr = &hb2->lock;
1074#ifdef CONFIG_DEBUG_PI_LIST
1075 q->list.plist.spinlock = &hb2->lock;
1076#endif
1077 } 1085 }
1078 get_futex_key_refs(key2); 1086 get_futex_key_refs(key2);
1079 q->key = *key2; 1087 q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1100 get_futex_key_refs(key); 1108 get_futex_key_refs(key);
1101 q->key = *key; 1109 q->key = *key;
1102 1110
1103 WARN_ON(plist_node_empty(&q->list)); 1111 __unqueue_futex(q);
1104 plist_del(&q->list, &q->list.plist);
1105 1112
1106 WARN_ON(!q->rt_waiter); 1113 WARN_ON(!q->rt_waiter);
1107 q->rt_waiter = NULL; 1114 q->rt_waiter = NULL;
1108 1115
1109 q->lock_ptr = &hb->lock; 1116 q->lock_ptr = &hb->lock;
1110#ifdef CONFIG_DEBUG_PI_LIST
1111 q->list.plist.spinlock = &hb->lock;
1112#endif
1113 1117
1114 wake_up_state(q->task, TASK_NORMAL); 1118 wake_up_state(q->task, TASK_NORMAL);
1115} 1119}
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1457 prio = min(current->normal_prio, MAX_RT_PRIO); 1461 prio = min(current->normal_prio, MAX_RT_PRIO);
1458 1462
1459 plist_node_init(&q->list, prio); 1463 plist_node_init(&q->list, prio);
1460#ifdef CONFIG_DEBUG_PI_LIST
1461 q->list.plist.spinlock = &hb->lock;
1462#endif
1463 plist_add(&q->list, &hb->chain); 1464 plist_add(&q->list, &hb->chain);
1464 q->task = current; 1465 q->task = current;
1465 spin_unlock(&hb->lock); 1466 spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
1504 spin_unlock(lock_ptr); 1505 spin_unlock(lock_ptr);
1505 goto retry; 1506 goto retry;
1506 } 1507 }
1507 WARN_ON(plist_node_empty(&q->list)); 1508 __unqueue_futex(q);
1508 plist_del(&q->list, &q->list.plist);
1509 1509
1510 BUG_ON(q->pi_state); 1510 BUG_ON(q->pi_state);
1511 1511
@@ -1525,8 +1525,7 @@ retry:
1525static void unqueue_me_pi(struct futex_q *q) 1525static void unqueue_me_pi(struct futex_q *q)
1526 __releases(q->lock_ptr) 1526 __releases(q->lock_ptr)
1527{ 1527{
1528 WARN_ON(plist_node_empty(&q->list)); 1528 __unqueue_futex(q);
1529 plist_del(&q->list, &q->list.plist);
1530 1529
1531 BUG_ON(!q->pi_state); 1530 BUG_ON(!q->pi_state);
1532 free_pi_state(q->pi_state); 1531 free_pi_state(q->pi_state);
@@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1556 1555
1557 /* 1556 /*
1558 * We are here either because we stole the rtmutex from the 1557 * We are here either because we stole the rtmutex from the
1559 * pending owner or we are the pending owner which failed to 1558 * previous highest priority waiter or we are the highest priority
1560 * get the rtmutex. We have to replace the pending owner TID 1559 * waiter but failed to get the rtmutex the first time.
1561 * in the user space variable. This must be atomic as we have 1560 * We have to replace the newowner TID in the user space variable.
1562 * to preserve the owner died bit here. 1561 * This must be atomic as we have to preserve the owner died bit here.
1563 * 1562 *
1564 * Note: We write the user space value _before_ changing the pi_state 1563 * Note: We write the user space value _before_ changing the pi_state
1565 * because we can fault here. Imagine swapped out pages or a fork 1564 * because we can fault here. Imagine swapped out pages or a fork
@@ -1578,9 +1577,7 @@ retry:
1578 while (1) { 1577 while (1) {
1579 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1578 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1580 1579
1581 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1580 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1582
1583 if (curval == -EFAULT)
1584 goto handle_fault; 1581 goto handle_fault;
1585 if (curval == uval) 1582 if (curval == uval)
1586 break; 1583 break;
@@ -1608,8 +1605,8 @@ retry:
1608 1605
1609 /* 1606 /*
1610 * To handle the page fault we need to drop the hash bucket 1607 * To handle the page fault we need to drop the hash bucket
1611 * lock here. That gives the other task (either the pending 1608 * lock here. That gives the other task (either the highest priority
1612 * owner itself or the task which stole the rtmutex) the 1609 * waiter itself or the task which stole the rtmutex) the
1613 * chance to try the fixup of the pi_state. So once we are 1610 * chance to try the fixup of the pi_state. So once we are
1614 * back from handling the fault we need to check the pi_state 1611 * back from handling the fault we need to check the pi_state
1615 * after reacquiring the hash bucket lock and before trying to 1612 * after reacquiring the hash bucket lock and before trying to
@@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1685 /* 1682 /*
1686 * pi_state is incorrect, some other task did a lock steal and 1683 * pi_state is incorrect, some other task did a lock steal and
1687 * we returned due to timeout or signal without taking the 1684 * we returned due to timeout or signal without taking the
1688 * rt_mutex. Too late. We can access the rt_mutex_owner without 1685 * rt_mutex. Too late.
1689 * locking, as the other task is now blocked on the hash bucket
1690 * lock. Fix the state up.
1691 */ 1686 */
1687 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1692 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1688 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1689 if (!owner)
1690 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1691 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1693 ret = fixup_pi_state_owner(uaddr, q, owner); 1692 ret = fixup_pi_state_owner(uaddr, q, owner);
1694 goto out; 1693 goto out;
1695 } 1694 }
1696 1695
1697 /* 1696 /*
1698 * Paranoia check. If we did not take the lock, then we should not be 1697 * Paranoia check. If we did not take the lock, then we should not be
1699 * the owner, nor the pending owner, of the rt_mutex. 1698 * the owner of the rt_mutex.
1700 */ 1699 */
1701 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 1700 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1702 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 1701 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1781 * 1780 *
1782 * The basic logical guarantee of a futex is that it blocks ONLY 1781 * The basic logical guarantee of a futex is that it blocks ONLY
1783 * if cond(var) is known to be true at the time of blocking, for 1782 * if cond(var) is known to be true at the time of blocking, for
1784 * any cond. If we queued after testing *uaddr, that would open 1783 * any cond. If we locked the hash-bucket after testing *uaddr, that
1785 * a race condition where we could block indefinitely with 1784 * would open a race condition where we could block indefinitely with
1786 * cond(var) false, which would violate the guarantee. 1785 * cond(var) false, which would violate the guarantee.
1787 * 1786 *
1788 * A consequence is that futex_wait() can return zero and absorb 1787 * On the other hand, we insert q and release the hash-bucket only
1789 * a wakeup when *uaddr != val on entry to the syscall. This is 1788 * after testing *uaddr. This guarantees that futex_wait() will NOT
1790 * rare, but normal. 1789 * absorb a wakeup if *uaddr does not match the desired values
1790 * while the syscall executes.
1791 */ 1791 */
1792retry: 1792retry:
1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
@@ -1886,7 +1886,7 @@ retry:
1886 restart->futex.val = val; 1886 restart->futex.val = val;
1887 restart->futex.time = abs_time->tv64; 1887 restart->futex.time = abs_time->tv64;
1888 restart->futex.bitset = bitset; 1888 restart->futex.bitset = bitset;
1889 restart->futex.flags = flags; 1889 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
1890 1890
1891 ret = -ERESTART_RESTARTBLOCK; 1891 ret = -ERESTART_RESTARTBLOCK;
1892 1892
@@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2046{ 2046{
2047 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2048 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
2049 u32 uval;
2050 struct plist_head *head; 2049 struct plist_head *head;
2051 union futex_key key = FUTEX_KEY_INIT; 2050 union futex_key key = FUTEX_KEY_INIT;
2051 u32 uval, vpid = task_pid_vnr(current);
2052 int ret; 2052 int ret;
2053 2053
2054retry: 2054retry:
@@ -2057,7 +2057,7 @@ retry:
2057 /* 2057 /*
2058 * We release only a lock we actually own: 2058 * We release only a lock we actually own:
2059 */ 2059 */
2060 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != vpid)
2061 return -EPERM; 2061 return -EPERM;
2062 2062
2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2072,14 @@ retry:
2072 * again. If it succeeds then we can return without waking 2072 * again. If it succeeds then we can return without waking
2073 * anyone else up: 2073 * anyone else up:
2074 */ 2074 */
2075 if (!(uval & FUTEX_OWNER_DIED)) 2075 if (!(uval & FUTEX_OWNER_DIED) &&
2076 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); 2076 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2077
2078
2079 if (unlikely(uval == -EFAULT))
2080 goto pi_faulted; 2077 goto pi_faulted;
2081 /* 2078 /*
2082 * Rare case: we managed to release the lock atomically, 2079 * Rare case: we managed to release the lock atomically,
2083 * no need to wake anyone else up: 2080 * no need to wake anyone else up:
2084 */ 2081 */
2085 if (unlikely(uval == task_pid_vnr(current))) 2082 if (unlikely(uval == vpid))
2086 goto out_unlock; 2083 goto out_unlock;
2087 2084
2088 /* 2085 /*
@@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2167 * We were woken prior to requeue by a timeout or a signal. 2164 * We were woken prior to requeue by a timeout or a signal.
2168 * Unqueue the futex_q and determine which it was. 2165 * Unqueue the futex_q and determine which it was.
2169 */ 2166 */
2170 plist_del(&q->list, &q->list.plist); 2167 plist_del(&q->list, &hb->chain);
2171 2168
2172 /* Handle spurious wakeups gracefully */ 2169 /* Handle spurious wakeups gracefully */
2173 ret = -EWOULDBLOCK; 2170 ret = -EWOULDBLOCK;
@@ -2421,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2421 goto err_unlock; 2418 goto err_unlock;
2422 ret = -EPERM; 2419 ret = -EPERM;
2423 pcred = __task_cred(p); 2420 pcred = __task_cred(p);
2421 /* If victim is in different user_ns, then uids are not
2422 comparable, so we must have CAP_SYS_PTRACE */
2423 if (cred->user->user_ns != pcred->user->user_ns) {
2424 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2425 goto err_unlock;
2426 goto ok;
2427 }
2428 /* If victim is in same user_ns, then uids are comparable */
2424 if (cred->euid != pcred->euid && 2429 if (cred->euid != pcred->euid &&
2425 cred->euid != pcred->uid && 2430 cred->euid != pcred->uid &&
2426 !capable(CAP_SYS_PTRACE)) 2431 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2427 goto err_unlock; 2432 goto err_unlock;
2433ok:
2428 head = p->robust_list; 2434 head = p->robust_list;
2429 rcu_read_unlock(); 2435 rcu_read_unlock();
2430 } 2436 }
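
The permission change above makes the uid comparison conditional on both tasks living in the same user namespace; across namespaces only CAP_SYS_PTRACE in the target's namespace helps. A user-space model of that decision, with simplified cred/user_ns structs and an ns_capable_ptrace() stub standing in for ns_capable(..., CAP_SYS_PTRACE):

#include <stdio.h>
#include <stdbool.h>

struct user_ns { int id; };

struct cred {
	int uid, euid;
	struct user_ns *ns;
};

/* Stub: a real check consults the caller's capability set in the
 * target namespace; here we just assume an unprivileged caller. */
static bool ns_capable_ptrace(const struct cred *caller,
			      const struct user_ns *target_ns)
{
	(void)caller; (void)target_ns;
	return false;
}

static bool may_read_robust_list(const struct cred *caller,
				 const struct cred *victim)
{
	/* Different user namespaces: uids are not comparable. */
	if (caller->ns != victim->ns)
		return ns_capable_ptrace(caller, victim->ns);

	/* Same namespace: the usual euid/uid comparison applies first. */
	if (caller->euid == victim->euid || caller->euid == victim->uid)
		return true;

	return ns_capable_ptrace(caller, victim->ns);
}

int main(void)
{
	struct user_ns ns0 = { 0 }, ns1 = { 1 };
	struct cred self  = { .uid = 1000, .euid = 1000, .ns = &ns0 };
	struct cred other = { .uid = 1000, .euid = 1000, .ns = &ns1 };

	printf("same creds, different ns: %s\n",
	       may_read_robust_list(&self, &other) ? "allowed" : "denied");
	return 0;
}
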
@@ -2463,11 +2469,20 @@ retry:
2463 * userspace. 2469 * userspace.
2464 */ 2470 */
2465 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2471 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2466 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2472 /*
2467 2473 * We are not holding a lock here, but we want to have
2468 if (nval == -EFAULT) 2474 * the pagefault_disable/enable() protection because
2469 return -1; 2475 * we want to handle the fault gracefully. If the
2470 2476 * access fails we try to fault in the futex with R/W
2477 * verification via get_user_pages. get_user() above
2478 * does not guarantee R/W access. If that fails we
2479 * give up and leave the futex locked.
2480 */
2481 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2482 if (fault_in_user_writeable(uaddr))
2483 return -1;
2484 goto retry;
2485 }
2471 if (nval != uval) 2486 if (nval != uval)
2472 goto retry; 2487 goto retry;
2473 2488
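
The handle_futex_death() change above stops treating a fault as fatal: it attempts to make the page writable and retries the cmpxchg. A schematic of that control flow in which the fault is simulated by a counter and the fix-up always "succeeds"; nothing here talks to real futexes or page tables:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define OWNER_DIED 0x40000000u

static int fake_faults = 1;	/* pretend the first access faults */

/* Returns 0 on success (*nval holds the old word) or -1 on "fault". */
static int cmpxchg_user(uint32_t *nval, uint32_t *uaddr,
			uint32_t uval, uint32_t mval)
{
	if (fake_faults > 0) {
		fake_faults--;
		return -1;
	}
	*nval = *uaddr;
	if (*uaddr == uval)
		*uaddr = mval;
	return 0;
}

static bool fault_in_writeable(uint32_t *uaddr)
{
	(void)uaddr;
	return true;	/* pretend get_user_pages() fixed the mapping */
}

static int handle_death(uint32_t *uaddr, uint32_t tid)
{
	uint32_t uval = tid, nval, mval = tid | OWNER_DIED;

retry:
	if (cmpxchg_user(&nval, uaddr, uval, mval)) {
		if (!fault_in_writeable(uaddr))
			return -1;	/* give up, leave the futex locked */
		goto retry;
	}
	if (nval != uval) {		/* value changed under us: retry */
		uval = nval;
		mval = nval | OWNER_DIED;
		goto retry;
	}
	return 0;
}

int main(void)
{
	uint32_t futex = 42;

	handle_death(&futex, 42);
	printf("futex word now %#x\n", futex);
	return 0;
}
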
@@ -2678,8 +2693,7 @@ static int __init futex_init(void)
2678 * implementation, the non-functional ones will return 2693 * implementation, the non-functional ones will return
2679 * -ENOSYS. 2694 * -ENOSYS.
2680 */ 2695 */
2681 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2696 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2682 if (curval == -EFAULT)
2683 futex_cmpxchg_enabled = 1; 2697 futex_cmpxchg_enabled = 1;
2684 2698
2685 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5b..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
153 goto err_unlock; 153 goto err_unlock;
154 ret = -EPERM; 154 ret = -EPERM;
155 pcred = __task_cred(p); 155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
156 if (cred->euid != pcred->euid && 164 if (cred->euid != pcred->euid &&
157 cred->euid != pcred->uid && 165 cred->euid != pcred->uid &&
158 !capable(CAP_SYS_PTRACE)) 166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
159 goto err_unlock; 167 goto err_unlock;
168ok:
160 head = p->compat_robust_list; 169 head = p->compat_robust_list;
161 rcu_read_unlock(); 170 rcu_read_unlock();
162 } 171 }
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da71..b8cadf70b1fb 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 37 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 38 default n
39 ---help--- 39 ---help---
40 This options activates profiling for the entire kernel. 40 This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d517..e97ca59e2520 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2 2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o 3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
233 struct group_info *group_info; 233 struct group_info *group_info;
234 int retval; 234 int retval;
235 235
236 if (!capable(CAP_SETGID)) 236 if (!nsown_capable(CAP_SETGID))
237 return -EPERM; 237 return -EPERM;
238 if ((unsigned)gidsetsize > NGROUPS_MAX) 238 if ((unsigned)gidsetsize > NGROUPS_MAX)
239 return -EINVAL; 239 return -EINVAL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0c8d7c048615..87fdb3f8db14 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -53,11 +53,10 @@
53/* 53/*
54 * The timer bases: 54 * The timer bases:
55 * 55 *
56 * Note: If we want to add new timer bases, we have to skip the two 56 * There are more clockids then hrtimer bases. Thus, we index
57 * clock ids captured by the cpu-timers. We do this by holding empty 57 * into the timer bases by the hrtimer_base_type enum. When trying
58 * entries rather than doing math adjustment of the clock ids. 58 * to reach a base using a clockid, hrtimer_clockid_to_base()
59 * This ensures that we capture erroneous accesses to these clock ids 59 * is used to convert from clockid to the proper hrtimer_base_type.
60 * rather than moving them into the range of valid clock id's.
61 */ 60 */
62DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 61DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
63{ 62{
@@ -74,30 +73,43 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
74 .get_time = &ktime_get, 73 .get_time = &ktime_get,
75 .resolution = KTIME_LOW_RES, 74 .resolution = KTIME_LOW_RES,
76 }, 75 },
76 {
77 .index = CLOCK_BOOTTIME,
78 .get_time = &ktime_get_boottime,
79 .resolution = KTIME_LOW_RES,
80 },
77 } 81 }
78}; 82};
79 83
84static int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
85 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
86 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
87 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
88};
89
90static inline int hrtimer_clockid_to_base(clockid_t clock_id)
91{
92 return hrtimer_clock_to_base_table[clock_id];
93}
94
95
80/* 96/*
81 * Get the coarse grained time at the softirq based on xtime and 97 * Get the coarse grained time at the softirq based on xtime and
82 * wall_to_monotonic. 98 * wall_to_monotonic.
83 */ 99 */
84static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 100static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
85{ 101{
86 ktime_t xtim, tomono; 102 ktime_t xtim, mono, boot;
87 struct timespec xts, tom; 103 struct timespec xts, tom, slp;
88 unsigned long seq;
89 104
90 do { 105 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time();
93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq));
95 106
96 xtim = timespec_to_ktime(xts); 107 xtim = timespec_to_ktime(xts);
97 tomono = timespec_to_ktime(tom); 108 mono = ktime_add(xtim, timespec_to_ktime(tom));
98 base->clock_base[CLOCK_REALTIME].softirq_time = xtim; 109 boot = ktime_add(mono, timespec_to_ktime(slp));
99 base->clock_base[CLOCK_MONOTONIC].softirq_time = 110 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
100 ktime_add(xtim, tomono); 111 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
112 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
101} 113}
102 114
103/* 115/*
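
The hunk above introduces a translation table because clock ids and hrtimer base indices are no longer interchangeable, and it derives the boot-time clock from the monotonic one plus the time spent asleep. A self-contained sketch of both; the XCLOCK_*/XBASE_* constants and the sample offsets are illustrative, not the kernel's values:

#include <stdio.h>

enum { XCLOCK_REALTIME, XCLOCK_MONOTONIC, XCLOCK_BOOTTIME, XMAX_CLOCKS };
enum { XBASE_MONOTONIC, XBASE_REALTIME, XBASE_BOOTTIME, XBASE_MAX };

/* Same idea as hrtimer_clock_to_base_table[]: a small array keyed by
 * clock id yields the base index used inside the per-cpu bases. */
static const int clock_to_base[XMAX_CLOCKS] = {
	[XCLOCK_REALTIME]  = XBASE_REALTIME,
	[XCLOCK_MONOTONIC] = XBASE_MONOTONIC,
	[XCLOCK_BOOTTIME]  = XBASE_BOOTTIME,
};

static int clockid_to_base(int clock_id)
{
	return clock_to_base[clock_id];
}

int main(void)
{
	/* The boot-time clock in the hunk above is built the same way:
	 * mono = xtime + wall_to_monotonic, boot = mono + time slept. */
	long long xtim = 1000, wall_to_mono = -400, slept = 50;
	long long mono = xtim + wall_to_mono;
	long long boot = mono + slept;

	printf("CLOCK_BOOTTIME maps to base %d\n",
	       clockid_to_base(XCLOCK_BOOTTIME));
	printf("mono=%lld boot=%lld\n", mono, boot);
	return 0;
}
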
@@ -184,10 +196,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
184 struct hrtimer_cpu_base *new_cpu_base; 196 struct hrtimer_cpu_base *new_cpu_base;
185 int this_cpu = smp_processor_id(); 197 int this_cpu = smp_processor_id();
186 int cpu = hrtimer_get_target(this_cpu, pinned); 198 int cpu = hrtimer_get_target(this_cpu, pinned);
199 int basenum = hrtimer_clockid_to_base(base->index);
187 200
188again: 201again:
189 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 202 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
190 new_base = &new_cpu_base->clock_base[base->index]; 203 new_base = &new_cpu_base->clock_base[basenum];
191 204
192 if (base != new_base) { 205 if (base != new_base) {
193 /* 206 /*
@@ -334,6 +347,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
334 347
335static struct debug_obj_descr hrtimer_debug_descr; 348static struct debug_obj_descr hrtimer_debug_descr;
336 349
350static void *hrtimer_debug_hint(void *addr)
351{
352 return ((struct hrtimer *) addr)->function;
353}
354
337/* 355/*
338 * fixup_init is called when: 356 * fixup_init is called when:
339 * - an active object is initialized 357 * - an active object is initialized
@@ -393,6 +411,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
393 411
394static struct debug_obj_descr hrtimer_debug_descr = { 412static struct debug_obj_descr hrtimer_debug_descr = {
395 .name = "hrtimer", 413 .name = "hrtimer",
414 .debug_hint = hrtimer_debug_hint,
396 .fixup_init = hrtimer_fixup_init, 415 .fixup_init = hrtimer_fixup_init,
397 .fixup_activate = hrtimer_fixup_activate, 416 .fixup_activate = hrtimer_fixup_activate,
398 .fixup_free = hrtimer_fixup_free, 417 .fixup_free = hrtimer_fixup_free,
@@ -611,24 +630,23 @@ static int hrtimer_reprogram(struct hrtimer *timer,
611static void retrigger_next_event(void *arg) 630static void retrigger_next_event(void *arg)
612{ 631{
613 struct hrtimer_cpu_base *base; 632 struct hrtimer_cpu_base *base;
614 struct timespec realtime_offset, wtm; 633 struct timespec realtime_offset, wtm, sleep;
615 unsigned long seq;
616 634
617 if (!hrtimer_hres_active()) 635 if (!hrtimer_hres_active())
618 return; 636 return;
619 637
620 do { 638 get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
621 seq = read_seqbegin(&xtime_lock); 639 &sleep);
622 wtm = __get_wall_to_monotonic();
623 } while (read_seqretry(&xtime_lock, seq));
624 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); 640 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
625 641
626 base = &__get_cpu_var(hrtimer_bases); 642 base = &__get_cpu_var(hrtimer_bases);
627 643
628 /* Adjust CLOCK_REALTIME offset */ 644 /* Adjust CLOCK_REALTIME offset */
629 raw_spin_lock(&base->lock); 645 raw_spin_lock(&base->lock);
630 base->clock_base[CLOCK_REALTIME].offset = 646 base->clock_base[HRTIMER_BASE_REALTIME].offset =
631 timespec_to_ktime(realtime_offset); 647 timespec_to_ktime(realtime_offset);
648 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
649 timespec_to_ktime(sleep);
632 650
633 hrtimer_force_reprogram(base, 0); 651 hrtimer_force_reprogram(base, 0);
634 raw_spin_unlock(&base->lock); 652 raw_spin_unlock(&base->lock);
@@ -673,14 +691,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
673} 691}
674 692
675/* 693/*
676 * Initialize the high resolution related parts of a hrtimer
677 */
678static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
679{
680}
681
682
683/*
684 * When High resolution timers are active, try to reprogram. Note, that in case 694 * When High resolution timers are active, try to reprogram. Note, that in case
685 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 695 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
686 * check happens. The timer gets enqueued into the rbtree. The reprogramming 696 * check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -725,8 +735,9 @@ static int hrtimer_switch_to_hres(void)
725 return 0; 735 return 0;
726 } 736 }
727 base->hres_active = 1; 737 base->hres_active = 1;
728 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; 738 base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
729 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; 739 base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
740 base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
730 741
731 tick_setup_sched_timer(); 742 tick_setup_sched_timer();
732 743
@@ -750,7 +761,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
750 return 0; 761 return 0;
751} 762}
752static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 763static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
753static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
754 764
755#endif /* CONFIG_HIGH_RES_TIMERS */ 765#endif /* CONFIG_HIGH_RES_TIMERS */
756 766
@@ -1121,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1121 enum hrtimer_mode mode) 1131 enum hrtimer_mode mode)
1122{ 1132{
1123 struct hrtimer_cpu_base *cpu_base; 1133 struct hrtimer_cpu_base *cpu_base;
1134 int base;
1124 1135
1125 memset(timer, 0, sizeof(struct hrtimer)); 1136 memset(timer, 0, sizeof(struct hrtimer));
1126 1137
@@ -1129,8 +1140,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1129 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1140 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1130 clock_id = CLOCK_MONOTONIC; 1141 clock_id = CLOCK_MONOTONIC;
1131 1142
1132 timer->base = &cpu_base->clock_base[clock_id]; 1143 base = hrtimer_clockid_to_base(clock_id);
1133 hrtimer_init_timer_hres(timer); 1144 timer->base = &cpu_base->clock_base[base];
1134 timerqueue_init(&timer->node); 1145 timerqueue_init(&timer->node);
1135 1146
1136#ifdef CONFIG_TIMER_STATS 1147#ifdef CONFIG_TIMER_STATS
@@ -1165,9 +1176,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
1165int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 1176int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1166{ 1177{
1167 struct hrtimer_cpu_base *cpu_base; 1178 struct hrtimer_cpu_base *cpu_base;
1179 int base = hrtimer_clockid_to_base(which_clock);
1168 1180
1169 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1181 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1170 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); 1182 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1171 1183
1172 return 0; 1184 return 0;
1173} 1185}
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8e42fec7686d..c574f9a12c48 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,5 +1,6 @@
1# Select this to activate the generic irq options below
1config HAVE_GENERIC_HARDIRQS 2config HAVE_GENERIC_HARDIRQS
2 def_bool n 3 bool
3 4
4if HAVE_GENERIC_HARDIRQS 5if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem" 6menu "IRQ subsystem"
@@ -9,28 +10,47 @@ menu "IRQ subsystem"
9config GENERIC_HARDIRQS 10config GENERIC_HARDIRQS
10 def_bool y 11 def_bool y
11 12
12# Select this to disable the deprecated stuff
13config GENERIC_HARDIRQS_NO_DEPRECATED
14 def_bool n
15
16# Options selectable by the architecture code 13# Options selectable by the architecture code
14
15# Make sparse irq Kconfig switch below available
17config HAVE_SPARSE_IRQ 16config HAVE_SPARSE_IRQ
18 def_bool n 17 bool
19 18
19# Enable the generic irq autoprobe mechanism
20config GENERIC_IRQ_PROBE 20config GENERIC_IRQ_PROBE
21 def_bool n 21 bool
22
23# Use the generic /proc/interrupts implementation
24config GENERIC_IRQ_SHOW
25 bool
26
27# Print level/edge extra information
28config GENERIC_IRQ_SHOW_LEVEL
29 bool
22 30
31# Support for delayed migration from interrupt context
23config GENERIC_PENDING_IRQ 32config GENERIC_PENDING_IRQ
24 def_bool n 33 bool
25 34
35# Alpha specific irq affinity mechanism
26config AUTO_IRQ_AFFINITY 36config AUTO_IRQ_AFFINITY
27 def_bool n 37 bool
28
29config IRQ_PER_CPU
30 def_bool n
31 38
39# Tasklet based software resend for pending interrupts on enable_irq()
32config HARDIRQS_SW_RESEND 40config HARDIRQS_SW_RESEND
33 def_bool n 41 bool
42
43# Preflow handler support for fasteoi (sparc64)
44config IRQ_PREFLOW_FASTEOI
45 bool
46
47# Edge style eoi based handler (cell)
48config IRQ_EDGE_EOI_HANDLER
49 bool
50
51# Support forced irq threading
52config IRQ_FORCED_THREADING
53 bool
34 54
35config SPARSE_IRQ 55config SPARSE_IRQ
36 bool "Support sparse irq numbering" 56 bool "Support sparse irq numbering"
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 505798f86c36..342d8f44e401 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
17/* 17/*
18 * Autodetection depends on the fact that any interrupt that 18 * Autodetection depends on the fact that any interrupt that
19 * comes in on to an unassigned handler will get stuck with 19 * comes in on to an unassigned handler will get stuck with
20 * "IRQ_WAITING" cleared and the interrupt disabled. 20 * "IRQS_WAITING" cleared and the interrupt disabled.
21 */ 21 */
22static DEFINE_MUTEX(probing_active); 22static DEFINE_MUTEX(probing_active);
23 23
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
32{ 32{
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 unsigned long mask = 0; 34 unsigned long mask = 0;
35 unsigned int status;
36 int i; 35 int i;
37 36
38 /* 37 /*
@@ -46,13 +45,7 @@ unsigned long probe_irq_on(void)
46 */ 45 */
47 for_each_irq_desc_reverse(i, desc) { 46 for_each_irq_desc_reverse(i, desc) {
48 raw_spin_lock_irq(&desc->lock); 47 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 48 if (!desc->action && irq_settings_can_probe(desc)) {
50 /*
51 * An old-style architecture might still have
52 * the handle_bad_irq handler there:
53 */
54 compat_irq_chip_set_default_handler(desc);
55
56 /* 49 /*
57 * Some chips need to know about probing in 50 * Some chips need to know about probing in
58 * progress: 51 * progress:
@@ -60,7 +53,7 @@ unsigned long probe_irq_on(void)
60 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
61 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data); 56 irq_startup(desc);
64 } 57 }
65 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
66 } 59 }
@@ -75,10 +68,10 @@ unsigned long probe_irq_on(void)
75 */ 68 */
76 for_each_irq_desc_reverse(i, desc) { 69 for_each_irq_desc_reverse(i, desc) {
77 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
80 if (desc->irq_data.chip->irq_startup(&desc->irq_data)) 73 if (irq_startup(desc))
81 desc->status |= IRQ_PENDING; 74 desc->istate |= IRQS_PENDING;
82 } 75 }
83 raw_spin_unlock_irq(&desc->lock); 76 raw_spin_unlock_irq(&desc->lock);
84 } 77 }
@@ -93,13 +86,12 @@ unsigned long probe_irq_on(void)
93 */ 86 */
94 for_each_irq_desc(i, desc) { 87 for_each_irq_desc(i, desc) {
95 raw_spin_lock_irq(&desc->lock); 88 raw_spin_lock_irq(&desc->lock);
96 status = desc->status;
97 89
98 if (status & IRQ_AUTODETECT) { 90 if (desc->istate & IRQS_AUTODETECT) {
99 /* It triggered already - consider it spurious. */ 91 /* It triggered already - consider it spurious. */
100 if (!(status & IRQ_WAITING)) { 92 if (!(desc->istate & IRQS_WAITING)) {
101 desc->status = status & ~IRQ_AUTODETECT; 93 desc->istate &= ~IRQS_AUTODETECT;
102 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 94 irq_shutdown(desc);
103 } else 95 } else
104 if (i < 32) 96 if (i < 32)
105 mask |= 1 << i; 97 mask |= 1 << i;
@@ -125,20 +117,18 @@ EXPORT_SYMBOL(probe_irq_on);
125 */ 117 */
126unsigned int probe_irq_mask(unsigned long val) 118unsigned int probe_irq_mask(unsigned long val)
127{ 119{
128 unsigned int status, mask = 0; 120 unsigned int mask = 0;
129 struct irq_desc *desc; 121 struct irq_desc *desc;
130 int i; 122 int i;
131 123
132 for_each_irq_desc(i, desc) { 124 for_each_irq_desc(i, desc) {
133 raw_spin_lock_irq(&desc->lock); 125 raw_spin_lock_irq(&desc->lock);
134 status = desc->status; 126 if (desc->istate & IRQS_AUTODETECT) {
135 127 if (i < 16 && !(desc->istate & IRQS_WAITING))
136 if (status & IRQ_AUTODETECT) {
137 if (i < 16 && !(status & IRQ_WAITING))
138 mask |= 1 << i; 128 mask |= 1 << i;
139 129
140 desc->status = status & ~IRQ_AUTODETECT; 130 desc->istate &= ~IRQS_AUTODETECT;
141 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 131 irq_shutdown(desc);
142 } 132 }
143 raw_spin_unlock_irq(&desc->lock); 133 raw_spin_unlock_irq(&desc->lock);
144 } 134 }
@@ -169,20 +159,18 @@ int probe_irq_off(unsigned long val)
169{ 159{
170 int i, irq_found = 0, nr_of_irqs = 0; 160 int i, irq_found = 0, nr_of_irqs = 0;
171 struct irq_desc *desc; 161 struct irq_desc *desc;
172 unsigned int status;
173 162
174 for_each_irq_desc(i, desc) { 163 for_each_irq_desc(i, desc) {
175 raw_spin_lock_irq(&desc->lock); 164 raw_spin_lock_irq(&desc->lock);
176 status = desc->status;
177 165
178 if (status & IRQ_AUTODETECT) { 166 if (desc->istate & IRQS_AUTODETECT) {
179 if (!(status & IRQ_WAITING)) { 167 if (!(desc->istate & IRQS_WAITING)) {
180 if (!nr_of_irqs) 168 if (!nr_of_irqs)
181 irq_found = i; 169 irq_found = i;
182 nr_of_irqs++; 170 nr_of_irqs++;
183 } 171 }
184 desc->status = status & ~IRQ_AUTODETECT; 172 desc->istate &= ~IRQS_AUTODETECT;
185 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 173 irq_shutdown(desc);
186 } 174 }
187 raw_spin_unlock_irq(&desc->lock); 175 raw_spin_unlock_irq(&desc->lock);
188 } 176 }
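
The autoprobe changes above move the bookkeeping from desc->status into desc->istate, using IRQS_AUTODETECT and IRQS_WAITING bits. A condensed user-space model of the probe idea: arm every candidate, let any interrupt that fires on its own clear WAITING (it is spurious), then collect the lines that are still waiting and disarm everything. Flag values, array size and the simulate_interrupt() helper are invented for the sketch:

#include <stdio.h>

#define IRQS_AUTODETECT 0x01
#define IRQS_WAITING    0x02

#define NR_IRQS 8

static unsigned int istate[NR_IRQS];

/* Phase 1: arm every probe candidate. */
static void probe_on(void)
{
	for (int i = 0; i < NR_IRQS; i++)
		istate[i] |= IRQS_AUTODETECT | IRQS_WAITING;
}

/* An interrupt that fires during probing clears WAITING for its line. */
static void simulate_interrupt(int irq)
{
	istate[irq] &= ~IRQS_WAITING;
}

/* Phase 2: lines still WAITING never fired on their own, so they stay
 * in the candidate mask; the autodetect state is cleared for all. */
static unsigned long probe_mask(void)
{
	unsigned long mask = 0;

	for (int i = 0; i < NR_IRQS; i++) {
		if ((istate[i] & IRQS_AUTODETECT) &&
		    (istate[i] & IRQS_WAITING))
			mask |= 1UL << i;
		istate[i] &= ~IRQS_AUTODETECT;
	}
	return mask;
}

int main(void)
{
	probe_on();
	simulate_interrupt(3);	/* irq 3 is spurious, drops out */
	printf("candidate mask: %#lx\n", probe_mask());
	return 0;
}
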
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index baa5c4acad83..4af1e2b244cb 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -19,140 +19,115 @@
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21/**
22 * set_irq_chip - set the irq chip for an irq 22 * irq_set_chip - set the irq chip for an irq
23 * @irq: irq number 23 * @irq: irq number
24 * @chip: pointer to irq chip description structure 24 * @chip: pointer to irq chip description structure
25 */ 25 */
26int set_irq_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 struct irq_desc *desc = irq_to_desc(irq);
29 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
30 30
31 if (!desc) { 31 if (!desc)
32 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
33 return -EINVAL; 32 return -EINVAL;
34 }
35 33
36 if (!chip) 34 if (!chip)
37 chip = &no_irq_chip; 35 chip = &no_irq_chip;
38 36
39 raw_spin_lock_irqsave(&desc->lock, flags);
40 irq_chip_set_defaults(chip);
41 desc->irq_data.chip = chip; 37 desc->irq_data.chip = chip;
42 raw_spin_unlock_irqrestore(&desc->lock, flags); 38 irq_put_desc_unlock(desc, flags);
43 39 /*
40 * For !CONFIG_SPARSE_IRQ make the irq show up in
41 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
42 * already marked, and this call is harmless.
43 */
44 irq_reserve_irq(irq);
44 return 0; 45 return 0;
45} 46}
46EXPORT_SYMBOL(set_irq_chip); 47EXPORT_SYMBOL(irq_set_chip);
47 48
48/** 49/**
49 * set_irq_type - set the irq trigger type for an irq 50 * irq_set_type - set the irq trigger type for an irq
50 * @irq: irq number 51 * @irq: irq number
51 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h 52 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
52 */ 53 */
53int set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
54{ 55{
55 struct irq_desc *desc = irq_to_desc(irq);
56 unsigned long flags; 56 unsigned long flags;
57 int ret = -ENXIO; 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
58 int ret = 0;
58 59
59 if (!desc) { 60 if (!desc)
60 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 61 return -EINVAL;
61 return -ENODEV;
62 }
63 62
64 type &= IRQ_TYPE_SENSE_MASK; 63 type &= IRQ_TYPE_SENSE_MASK;
65 if (type == IRQ_TYPE_NONE) 64 if (type != IRQ_TYPE_NONE)
66 return 0; 65 ret = __irq_set_trigger(desc, irq, type);
67 66 irq_put_desc_busunlock(desc, flags);
68 raw_spin_lock_irqsave(&desc->lock, flags);
69 ret = __irq_set_trigger(desc, irq, type);
70 raw_spin_unlock_irqrestore(&desc->lock, flags);
71 return ret; 67 return ret;
72} 68}
73EXPORT_SYMBOL(set_irq_type); 69EXPORT_SYMBOL(irq_set_irq_type);
74 70
75/** 71/**
76 * set_irq_data - set irq type data for an irq 72 * irq_set_handler_data - set irq handler data for an irq
77 * @irq: Interrupt number 73 * @irq: Interrupt number
78 * @data: Pointer to interrupt specific data 74 * @data: Pointer to interrupt specific data
79 * 75 *
80 * Set the hardware irq controller data for an irq 76 * Set the hardware irq controller data for an irq
81 */ 77 */
82int set_irq_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
83{ 79{
84 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 80 unsigned long flags;
81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
86 82
87 if (!desc) { 83 if (!desc)
88 printk(KERN_ERR
89 "Trying to install controller data for IRQ%d\n", irq);
90 return -EINVAL; 84 return -EINVAL;
91 }
92
93 raw_spin_lock_irqsave(&desc->lock, flags);
94 desc->irq_data.handler_data = data; 85 desc->irq_data.handler_data = data;
95 raw_spin_unlock_irqrestore(&desc->lock, flags); 86 irq_put_desc_unlock(desc, flags);
96 return 0; 87 return 0;
97} 88}
98EXPORT_SYMBOL(set_irq_data); 89EXPORT_SYMBOL(irq_set_handler_data);
99 90
100/** 91/**
101 * set_irq_msi - set MSI descriptor data for an irq 92 * irq_set_msi_desc - set MSI descriptor data for an irq
102 * @irq: Interrupt number 93 * @irq: Interrupt number
103 * @entry: Pointer to MSI descriptor data 94 * @entry: Pointer to MSI descriptor data
104 * 95 *
105 * Set the MSI descriptor entry for an irq 96 * Set the MSI descriptor entry for an irq
106 */ 97 */
107int set_irq_msi(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
108{ 99{
109 struct irq_desc *desc = irq_to_desc(irq);
110 unsigned long flags; 100 unsigned long flags;
101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
111 102
112 if (!desc) { 103 if (!desc)
113 printk(KERN_ERR
114 "Trying to install msi data for IRQ%d\n", irq);
115 return -EINVAL; 104 return -EINVAL;
116 }
117
118 raw_spin_lock_irqsave(&desc->lock, flags);
119 desc->irq_data.msi_desc = entry; 105 desc->irq_data.msi_desc = entry;
120 if (entry) 106 if (entry)
121 entry->irq = irq; 107 entry->irq = irq;
122 raw_spin_unlock_irqrestore(&desc->lock, flags); 108 irq_put_desc_unlock(desc, flags);
123 return 0; 109 return 0;
124} 110}
125 111
126/** 112/**
127 * set_irq_chip_data - set irq chip data for an irq 113 * irq_set_chip_data - set irq chip data for an irq
128 * @irq: Interrupt number 114 * @irq: Interrupt number
129 * @data: Pointer to chip specific data 115 * @data: Pointer to chip specific data
130 * 116 *
131 * Set the hardware irq chip data for an irq 117 * Set the hardware irq chip data for an irq
132 */ 118 */
133int set_irq_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
134{ 120{
135 struct irq_desc *desc = irq_to_desc(irq);
136 unsigned long flags; 121 unsigned long flags;
122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
137 123
138 if (!desc) { 124 if (!desc)
139 printk(KERN_ERR
140 "Trying to install chip data for IRQ%d\n", irq);
141 return -EINVAL;
142 }
143
144 if (!desc->irq_data.chip) {
145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
146 return -EINVAL; 125 return -EINVAL;
147 }
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->irq_data.chip_data = data; 126 desc->irq_data.chip_data = data;
151 raw_spin_unlock_irqrestore(&desc->lock, flags); 127 irq_put_desc_unlock(desc, flags);
152
153 return 0; 128 return 0;
154} 129}
155EXPORT_SYMBOL(set_irq_chip_data); 130EXPORT_SYMBOL(irq_set_chip_data);
156 131
157struct irq_data *irq_get_irq_data(unsigned int irq) 132struct irq_data *irq_get_irq_data(unsigned int irq)
158{ 133{
@@ -162,221 +137,71 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
162} 137}
163EXPORT_SYMBOL_GPL(irq_get_irq_data); 138EXPORT_SYMBOL_GPL(irq_get_irq_data);
164 139
165/** 140static void irq_state_clr_disabled(struct irq_desc *desc)
166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
167 *
168 * @irq: Interrupt number
169 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
170 *
171 * The IRQ_NESTED_THREAD flag indicates that on
172 * request_threaded_irq() no separate interrupt thread should be
173 * created for the irq as the handler are called nested in the
174 * context of a demultiplexing interrupt handler thread.
175 */
176void set_irq_nested_thread(unsigned int irq, int nest)
177{
178 struct irq_desc *desc = irq_to_desc(irq);
179 unsigned long flags;
180
181 if (!desc)
182 return;
183
184 raw_spin_lock_irqsave(&desc->lock, flags);
185 if (nest)
186 desc->status |= IRQ_NESTED_THREAD;
187 else
188 desc->status &= ~IRQ_NESTED_THREAD;
189 raw_spin_unlock_irqrestore(&desc->lock, flags);
190}
191EXPORT_SYMBOL_GPL(set_irq_nested_thread);
192
193/*
194 * default enable function
195 */
196static void default_enable(struct irq_data *data)
197{ 141{
198 struct irq_desc *desc = irq_data_to_desc(data); 142 irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
199
200 desc->irq_data.chip->irq_unmask(&desc->irq_data);
201 desc->status &= ~IRQ_MASKED;
202} 143}
203 144
204/* 145static void irq_state_set_disabled(struct irq_desc *desc)
205 * default disable function
206 */
207static void default_disable(struct irq_data *data)
208{
209}
210
211/*
212 * default startup function
213 */
214static unsigned int default_startup(struct irq_data *data)
215{ 146{
216 struct irq_desc *desc = irq_data_to_desc(data); 147 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
217
218 desc->irq_data.chip->irq_enable(data);
219 return 0;
220} 148}
221 149
222/* 150static void irq_state_clr_masked(struct irq_desc *desc)
223 * default shutdown function
224 */
225static void default_shutdown(struct irq_data *data)
226{ 151{
227 struct irq_desc *desc = irq_data_to_desc(data); 152 irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
228
229 desc->irq_data.chip->irq_mask(&desc->irq_data);
230 desc->status |= IRQ_MASKED;
231} 153}
232 154
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 155static void irq_state_set_masked(struct irq_desc *desc)
234/* Temporary migration helpers */
235static void compat_irq_mask(struct irq_data *data)
236{ 156{
237 data->chip->mask(data->irq); 157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
238} 158}
239 159
240static void compat_irq_unmask(struct irq_data *data) 160int irq_startup(struct irq_desc *desc)
241{ 161{
242 data->chip->unmask(data->irq); 162 irq_state_clr_disabled(desc);
243} 163 desc->depth = 0;
244 164
245static void compat_irq_ack(struct irq_data *data) 165 if (desc->irq_data.chip->irq_startup) {
246{ 166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
247 data->chip->ack(data->irq); 167 irq_state_clr_masked(desc);
248} 168 return ret;
249 169 }
250static void compat_irq_mask_ack(struct irq_data *data)
251{
252 data->chip->mask_ack(data->irq);
253}
254
255static void compat_irq_eoi(struct irq_data *data)
256{
257 data->chip->eoi(data->irq);
258}
259
260static void compat_irq_enable(struct irq_data *data)
261{
262 data->chip->enable(data->irq);
263}
264
265static void compat_irq_disable(struct irq_data *data)
266{
267 data->chip->disable(data->irq);
268}
269
270static void compat_irq_shutdown(struct irq_data *data)
271{
272 data->chip->shutdown(data->irq);
273}
274
275static unsigned int compat_irq_startup(struct irq_data *data)
276{
277 return data->chip->startup(data->irq);
278}
279
280static int compat_irq_set_affinity(struct irq_data *data,
281 const struct cpumask *dest, bool force)
282{
283 return data->chip->set_affinity(data->irq, dest);
284}
285
286static int compat_irq_set_type(struct irq_data *data, unsigned int type)
287{
288 return data->chip->set_type(data->irq, type);
289}
290
291static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
292{
293 return data->chip->set_wake(data->irq, on);
294}
295 170
296static int compat_irq_retrigger(struct irq_data *data) 171 irq_enable(desc);
297{ 172 return 0;
298 return data->chip->retrigger(data->irq);
299} 173}
300 174
301static void compat_bus_lock(struct irq_data *data) 175void irq_shutdown(struct irq_desc *desc)
302{ 176{
303 data->chip->bus_lock(data->irq); 177 irq_state_set_disabled(desc);
178 desc->depth = 1;
179 if (desc->irq_data.chip->irq_shutdown)
180 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
181 if (desc->irq_data.chip->irq_disable)
182 desc->irq_data.chip->irq_disable(&desc->irq_data);
183 else
184 desc->irq_data.chip->irq_mask(&desc->irq_data);
185 irq_state_set_masked(desc);
304} 186}
305 187
306static void compat_bus_sync_unlock(struct irq_data *data) 188void irq_enable(struct irq_desc *desc)
307{ 189{
308 data->chip->bus_sync_unlock(data->irq); 190 irq_state_clr_disabled(desc);
191 if (desc->irq_data.chip->irq_enable)
192 desc->irq_data.chip->irq_enable(&desc->irq_data);
193 else
194 desc->irq_data.chip->irq_unmask(&desc->irq_data);
195 irq_state_clr_masked(desc);
309} 196}
310#endif
311 197
312/* 198void irq_disable(struct irq_desc *desc)
313 * Fixup enable/disable function pointers
314 */
315void irq_chip_set_defaults(struct irq_chip *chip)
316{ 199{
317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 200 irq_state_set_disabled(desc);
318 /* 201 if (desc->irq_data.chip->irq_disable) {
319 * Compat fixup functions need to be before we set the 202 desc->irq_data.chip->irq_disable(&desc->irq_data);
320 * defaults for enable/disable/startup/shutdown 203 irq_state_set_masked(desc);
321 */ 204 }
322 if (chip->enable)
323 chip->irq_enable = compat_irq_enable;
324 if (chip->disable)
325 chip->irq_disable = compat_irq_disable;
326 if (chip->shutdown)
327 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup)
329 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
343 * to use default_shutdown, otherwise the irq line is not
344 * disabled on free_irq():
345 */
346 if (!chip->irq_shutdown)
347 chip->irq_shutdown = chip->irq_disable != default_disable ?
348 chip->irq_disable : default_shutdown;
349
350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
351 if (!chip->end)
352 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock)
360 chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
361 if (chip->mask)
362 chip->irq_mask = compat_irq_mask;
363 if (chip->unmask)
364 chip->irq_unmask = compat_irq_unmask;
365 if (chip->ack)
366 chip->irq_ack = compat_irq_ack;
367 if (chip->mask_ack)
368 chip->irq_mask_ack = compat_irq_mask_ack;
369 if (chip->eoi)
370 chip->irq_eoi = compat_irq_eoi;
371 if (chip->set_affinity)
372 chip->irq_set_affinity = compat_irq_set_affinity;
373 if (chip->set_type)
374 chip->irq_set_type = compat_irq_set_type;
375 if (chip->set_wake)
376 chip->irq_set_wake = compat_irq_set_wake;
377 if (chip->retrigger)
378 chip->irq_retrigger = compat_irq_retrigger;
379#endif
380} 205}
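For illustration, a minimal user-space sketch of the accessor pattern introduced above: irqd_set()/irqd_clear() on the descriptor state plus the depth handling done by irq_startup() and irq_shutdown(). The structures and bit values here are invented stand-ins, not the kernel's definitions.

/* Toy model of the IRQD_* state accessors; bit values are placeholders. */
#include <stdio.h>

#define IRQD_IRQ_DISABLED	(1u << 16)	/* invented value */
#define IRQD_IRQ_MASKED		(1u << 17)	/* invented value */

struct irq_data { unsigned int state_use_accessors; };
struct irq_desc { struct irq_data irq_data; unsigned int depth; };

static void irqd_set(struct irq_data *d, unsigned int mask)
{
	d->state_use_accessors |= mask;
}

static void irqd_clear(struct irq_data *d, unsigned int mask)
{
	d->state_use_accessors &= ~mask;
}

static int irqd_irq_disabled(struct irq_data *d)
{
	return (d->state_use_accessors & IRQD_IRQ_DISABLED) != 0;
}

int main(void)
{
	struct irq_desc desc = { { IRQD_IRQ_DISABLED | IRQD_IRQ_MASKED }, 1 };

	/* "startup": clear DISABLED and MASKED, reset the disable depth */
	irqd_clear(&desc.irq_data, IRQD_IRQ_DISABLED | IRQD_IRQ_MASKED);
	desc.depth = 0;
	printf("after startup: disabled=%d depth=%u\n",
	       irqd_irq_disabled(&desc.irq_data), desc.depth);

	/* "shutdown": set DISABLED and MASKED, depth back to 1 */
	irqd_set(&desc.irq_data, IRQD_IRQ_DISABLED | IRQD_IRQ_MASKED);
	desc.depth = 1;
	printf("after shutdown: disabled=%d depth=%u\n",
	       irqd_irq_disabled(&desc.irq_data), desc.depth);
	return 0;
}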
381 206
382static inline void mask_ack_irq(struct irq_desc *desc) 207static inline void mask_ack_irq(struct irq_desc *desc)
@@ -388,22 +213,22 @@ static inline void mask_ack_irq(struct irq_desc *desc)
388 if (desc->irq_data.chip->irq_ack) 213 if (desc->irq_data.chip->irq_ack)
389 desc->irq_data.chip->irq_ack(&desc->irq_data); 214 desc->irq_data.chip->irq_ack(&desc->irq_data);
390 } 215 }
391 desc->status |= IRQ_MASKED; 216 irq_state_set_masked(desc);
392} 217}
393 218
394static inline void mask_irq(struct irq_desc *desc) 219void mask_irq(struct irq_desc *desc)
395{ 220{
396 if (desc->irq_data.chip->irq_mask) { 221 if (desc->irq_data.chip->irq_mask) {
397 desc->irq_data.chip->irq_mask(&desc->irq_data); 222 desc->irq_data.chip->irq_mask(&desc->irq_data);
398 desc->status |= IRQ_MASKED; 223 irq_state_set_masked(desc);
399 } 224 }
400} 225}
401 226
402static inline void unmask_irq(struct irq_desc *desc) 227void unmask_irq(struct irq_desc *desc)
403{ 228{
404 if (desc->irq_data.chip->irq_unmask) { 229 if (desc->irq_data.chip->irq_unmask) {
405 desc->irq_data.chip->irq_unmask(&desc->irq_data); 230 desc->irq_data.chip->irq_unmask(&desc->irq_data);
406 desc->status &= ~IRQ_MASKED; 231 irq_state_clr_masked(desc);
407 } 232 }
408} 233}
409 234
@@ -428,10 +253,10 @@ void handle_nested_irq(unsigned int irq)
428 kstat_incr_irqs_this_cpu(irq, desc); 253 kstat_incr_irqs_this_cpu(irq, desc);
429 254
430 action = desc->action; 255 action = desc->action;
431 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 256 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
432 goto out_unlock; 257 goto out_unlock;
433 258
434 desc->status |= IRQ_INPROGRESS; 259 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
435 raw_spin_unlock_irq(&desc->lock); 260 raw_spin_unlock_irq(&desc->lock);
436 261
437 action_ret = action->thread_fn(action->irq, action->dev_id); 262 action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -439,13 +264,20 @@ void handle_nested_irq(unsigned int irq)
439 note_interrupt(irq, desc, action_ret); 264 note_interrupt(irq, desc, action_ret);
440 265
441 raw_spin_lock_irq(&desc->lock); 266 raw_spin_lock_irq(&desc->lock);
442 desc->status &= ~IRQ_INPROGRESS; 267 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
443 268
444out_unlock: 269out_unlock:
445 raw_spin_unlock_irq(&desc->lock); 270 raw_spin_unlock_irq(&desc->lock);
446} 271}
447EXPORT_SYMBOL_GPL(handle_nested_irq); 272EXPORT_SYMBOL_GPL(handle_nested_irq);
448 273
274static bool irq_check_poll(struct irq_desc *desc)
275{
276 if (!(desc->istate & IRQS_POLL_INPROGRESS))
277 return false;
278 return irq_wait_for_poll(desc);
279}
280
449/** 281/**
450 * handle_simple_irq - Simple and software-decoded IRQs. 282 * handle_simple_irq - Simple and software-decoded IRQs.
451 * @irq: the interrupt number 283 * @irq: the interrupt number
@@ -461,29 +293,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
461void 293void
462handle_simple_irq(unsigned int irq, struct irq_desc *desc) 294handle_simple_irq(unsigned int irq, struct irq_desc *desc)
463{ 295{
464 struct irqaction *action;
465 irqreturn_t action_ret;
466
467 raw_spin_lock(&desc->lock); 296 raw_spin_lock(&desc->lock);
468 297
469 if (unlikely(desc->status & IRQ_INPROGRESS)) 298 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
470 goto out_unlock; 299 if (!irq_check_poll(desc))
471 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 300 goto out_unlock;
301
302 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
472 kstat_incr_irqs_this_cpu(irq, desc); 303 kstat_incr_irqs_this_cpu(irq, desc);
473 304
474 action = desc->action; 305 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
475 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
476 goto out_unlock; 306 goto out_unlock;
477 307
478 desc->status |= IRQ_INPROGRESS; 308 handle_irq_event(desc);
479 raw_spin_unlock(&desc->lock);
480 309
481 action_ret = handle_IRQ_event(irq, action);
482 if (!noirqdebug)
483 note_interrupt(irq, desc, action_ret);
484
485 raw_spin_lock(&desc->lock);
486 desc->status &= ~IRQ_INPROGRESS;
487out_unlock: 310out_unlock:
488 raw_spin_unlock(&desc->lock); 311 raw_spin_unlock(&desc->lock);
489} 312}
@@ -501,42 +324,42 @@ out_unlock:
501void 324void
502handle_level_irq(unsigned int irq, struct irq_desc *desc) 325handle_level_irq(unsigned int irq, struct irq_desc *desc)
503{ 326{
504 struct irqaction *action;
505 irqreturn_t action_ret;
506
507 raw_spin_lock(&desc->lock); 327 raw_spin_lock(&desc->lock);
508 mask_ack_irq(desc); 328 mask_ack_irq(desc);
509 329
510 if (unlikely(desc->status & IRQ_INPROGRESS)) 330 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
511 goto out_unlock; 331 if (!irq_check_poll(desc))
512 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 332 goto out_unlock;
333
334 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
513 kstat_incr_irqs_this_cpu(irq, desc); 335 kstat_incr_irqs_this_cpu(irq, desc);
514 336
515 /* 337 /*
516 * If it's disabled or no action available 338 * If it's disabled or no action available
517 * keep it masked and get out of here 339 * keep it masked and get out of here
518 */ 340 */
519 action = desc->action; 341 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
520 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
521 goto out_unlock; 342 goto out_unlock;
522 343
523 desc->status |= IRQ_INPROGRESS; 344 handle_irq_event(desc);
524 raw_spin_unlock(&desc->lock);
525
526 action_ret = handle_IRQ_event(irq, action);
527 if (!noirqdebug)
528 note_interrupt(irq, desc, action_ret);
529
530 raw_spin_lock(&desc->lock);
531 desc->status &= ~IRQ_INPROGRESS;
532 345
533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) 346 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
534 unmask_irq(desc); 347 unmask_irq(desc);
535out_unlock: 348out_unlock:
536 raw_spin_unlock(&desc->lock); 349 raw_spin_unlock(&desc->lock);
537} 350}
538EXPORT_SYMBOL_GPL(handle_level_irq); 351EXPORT_SYMBOL_GPL(handle_level_irq);
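A stand-alone sketch of the level-triggered flow shown above: mask and ack on entry, run the handler, and unmask afterwards only when the line is neither disabled nor marked ONESHOT. The booleans and helpers are simplified stand-ins, not kernel code.

#include <stdio.h>
#include <stdbool.h>

static bool irq_disabled;	/* stand-in for irqd_irq_disabled() */
static bool oneshot;		/* stand-in for IRQS_ONESHOT */

static void mask_ack(void) { printf("mask + ack\n"); }
static void unmask(void)   { printf("unmask\n"); }

static void level_flow(void)
{
	mask_ack();
	if (irq_disabled) {		/* disabled or no action: stay masked */
		printf("left masked\n");
		return;
	}
	printf("handler runs with the line masked\n");
	if (!irq_disabled && !oneshot)
		unmask();
	else
		printf("left masked for the irq thread\n");
}

int main(void)
{
	level_flow();		/* normal level interrupt */
	oneshot = true;
	level_flow();		/* ONESHOT: unmask is deferred */
	return 0;
}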
539 352
353#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
354static inline void preflow_handler(struct irq_desc *desc)
355{
356 if (desc->preflow_handler)
357 desc->preflow_handler(&desc->irq_data);
358}
359#else
360static inline void preflow_handler(struct irq_desc *desc) { }
361#endif
362
540/** 363/**
541 * handle_fasteoi_irq - irq handler for transparent controllers 364 * handle_fasteoi_irq - irq handler for transparent controllers
542 * @irq: the interrupt number 365 * @irq: the interrupt number
@@ -550,42 +373,40 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
550void 373void
551handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 374handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
552{ 375{
553 struct irqaction *action;
554 irqreturn_t action_ret;
555
556 raw_spin_lock(&desc->lock); 376 raw_spin_lock(&desc->lock);
557 377
558 if (unlikely(desc->status & IRQ_INPROGRESS)) 378 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
559 goto out; 379 if (!irq_check_poll(desc))
380 goto out;
560 381
561 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 382 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
562 kstat_incr_irqs_this_cpu(irq, desc); 383 kstat_incr_irqs_this_cpu(irq, desc);
563 384
564 /* 385 /*
565 * If it's disabled or no action available 386 * If it's disabled or no action available
566 * then mask it and get out of here: 387 * then mask it and get out of here:
567 */ 388 */
568 action = desc->action; 389 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 390 desc->istate |= IRQS_PENDING;
570 desc->status |= IRQ_PENDING;
571 mask_irq(desc); 391 mask_irq(desc);
572 goto out; 392 goto out;
573 } 393 }
574 394
575 desc->status |= IRQ_INPROGRESS; 395 if (desc->istate & IRQS_ONESHOT)
576 desc->status &= ~IRQ_PENDING; 396 mask_irq(desc);
577 raw_spin_unlock(&desc->lock);
578 397
579 action_ret = handle_IRQ_event(irq, action); 398 preflow_handler(desc);
580 if (!noirqdebug) 399 handle_irq_event(desc);
581 note_interrupt(irq, desc, action_ret);
582 400
583 raw_spin_lock(&desc->lock); 401out_eoi:
584 desc->status &= ~IRQ_INPROGRESS;
585out:
586 desc->irq_data.chip->irq_eoi(&desc->irq_data); 402 desc->irq_data.chip->irq_eoi(&desc->irq_data);
587 403out_unlock:
588 raw_spin_unlock(&desc->lock); 404 raw_spin_unlock(&desc->lock);
405 return;
406out:
407 if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
408 goto out_eoi;
409 goto out_unlock;
589} 410}
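The restructured exit paths above still send the EOI for an unhandled interrupt unless the chip advertises IRQCHIP_EOI_IF_HANDLED. A small stand-alone model of just that decision; the flag value is invented.

#include <stdio.h>
#include <stdbool.h>

#define CHIP_EOI_IF_HANDLED	0x1	/* stand-in for IRQCHIP_EOI_IF_HANDLED */

static void send_eoi(void) { printf("EOI sent\n"); }

static void fasteoi_exit(bool handled, unsigned int chip_flags)
{
	if (handled) {
		send_eoi();			/* out_eoi path */
		return;
	}
	/* unhandled: mirror the "out:" path above */
	if (!(chip_flags & CHIP_EOI_IF_HANDLED))
		send_eoi();
	else
		printf("EOI skipped for unhandled irq\n");
}

int main(void)
{
	fasteoi_exit(true, 0);
	fasteoi_exit(false, 0);
	fasteoi_exit(false, CHIP_EOI_IF_HANDLED);
	return 0;
}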
590 411
591/** 412/**
@@ -594,7 +415,7 @@ out:
594 * @desc: the interrupt description structure for this irq 415 * @desc: the interrupt description structure for this irq
595 * 416 *
596 * Interrupt occures on the falling and/or rising edge of a hardware 417 * Interrupt occures on the falling and/or rising edge of a hardware
597 * signal. The occurence is latched into the irq controller hardware 418 * signal. The occurrence is latched into the irq controller hardware
598 * and must be acked in order to be reenabled. After the ack another 419 * and must be acked in order to be reenabled. After the ack another
599 * interrupt can happen on the same source even before the first one 420 * interrupt can happen on the same source even before the first one
600 * is handled by the associated event handler. If this happens it 421 * is handled by the associated event handler. If this happens it
@@ -609,32 +430,27 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
609{ 430{
610 raw_spin_lock(&desc->lock); 431 raw_spin_lock(&desc->lock);
611 432
612 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 433 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
613
614 /* 434 /*
615 * If we're currently running this IRQ, or its disabled, 435 * If we're currently running this IRQ, or its disabled,
616 * we shouldn't process the IRQ. Mark it pending, handle 436 * we shouldn't process the IRQ. Mark it pending, handle
617 * the necessary masking and go out 437 * the necessary masking and go out
618 */ 438 */
619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 439 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
620 !desc->action)) { 440 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
621 desc->status |= (IRQ_PENDING | IRQ_MASKED); 441 if (!irq_check_poll(desc)) {
622 mask_ack_irq(desc); 442 desc->istate |= IRQS_PENDING;
623 goto out_unlock; 443 mask_ack_irq(desc);
444 goto out_unlock;
445 }
624 } 446 }
625 kstat_incr_irqs_this_cpu(irq, desc); 447 kstat_incr_irqs_this_cpu(irq, desc);
626 448
627 /* Start handling the irq */ 449 /* Start handling the irq */
628 desc->irq_data.chip->irq_ack(&desc->irq_data); 450 desc->irq_data.chip->irq_ack(&desc->irq_data);
629 451
630 /* Mark the IRQ currently in progress.*/
631 desc->status |= IRQ_INPROGRESS;
632
633 do { 452 do {
634 struct irqaction *action = desc->action; 453 if (unlikely(!desc->action)) {
635 irqreturn_t action_ret;
636
637 if (unlikely(!action)) {
638 mask_irq(desc); 454 mask_irq(desc);
639 goto out_unlock; 455 goto out_unlock;
640 } 456 }
@@ -644,26 +460,66 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
644 * one, we could have masked the irq. 460 * one, we could have masked the irq.
645 * Re-enable it, if it was not disabled in the meantime. 461 * Re-enable it, if it was not disabled in the meantime.
646 */ 462 */
647 if (unlikely((desc->status & 463 if (unlikely(desc->istate & IRQS_PENDING)) {
648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 464 if (!irqd_irq_disabled(&desc->irq_data) &&
649 (IRQ_PENDING | IRQ_MASKED))) { 465 irqd_irq_masked(&desc->irq_data))
650 unmask_irq(desc); 466 unmask_irq(desc);
651 } 467 }
652 468
653 desc->status &= ~IRQ_PENDING; 469 handle_irq_event(desc);
654 raw_spin_unlock(&desc->lock);
655 action_ret = handle_IRQ_event(irq, action);
656 if (!noirqdebug)
657 note_interrupt(irq, desc, action_ret);
658 raw_spin_lock(&desc->lock);
659 470
660 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 471 } while ((desc->istate & IRQS_PENDING) &&
472 !irqd_irq_disabled(&desc->irq_data));
661 473
662 desc->status &= ~IRQ_INPROGRESS;
663out_unlock: 474out_unlock:
664 raw_spin_unlock(&desc->lock); 475 raw_spin_unlock(&desc->lock);
665} 476}
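A stand-alone model of the replay loop above: edges latched while the handler runs mark the interrupt pending, and the loop keeps draining events until nothing is pending. The counter and flag are stand-ins for the hardware latch and IRQS_PENDING.

#include <stdio.h>
#include <stdbool.h>

static int latched;		/* edges recorded by the "hardware" */
static bool pending;		/* stand-in for IRQS_PENDING */

static void device_fires(void) { latched++; }

static void handle_event(void)
{
	printf("handled one edge\n");
	if (latched > 0) {	/* another edge arrived while we were busy */
		latched--;
		pending = true;
	}
}

int main(void)
{
	device_fires();		/* initial edge */
	device_fires();		/* arrives during handling */
	device_fires();

	latched--;		/* ack the first edge */
	do {
		pending = false;	/* like: istate &= ~IRQS_PENDING */
		handle_event();
	} while (pending);		/* like: while (istate & IRQS_PENDING) */
	return 0;
}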
666 477
478#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
479/**
480 * handle_edge_eoi_irq - edge eoi type IRQ handler
481 * @irq: the interrupt number
482 * @desc: the interrupt description structure for this irq
483 *
484 * Similar to the above handle_edge_irq, but using eoi and w/o the
485 * mask/unmask logic.
486 */
487void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
488{
489 struct irq_chip *chip = irq_desc_get_chip(desc);
490
491 raw_spin_lock(&desc->lock);
492
493 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
494 /*
495 * If we're currently running this IRQ, or its disabled,
496 * we shouldn't process the IRQ. Mark it pending, handle
497 * the necessary masking and go out
498 */
499 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
500 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
501 if (!irq_check_poll(desc)) {
502 desc->istate |= IRQS_PENDING;
503 goto out_eoi;
504 }
505 }
506 kstat_incr_irqs_this_cpu(irq, desc);
507
508 do {
509 if (unlikely(!desc->action))
510 goto out_eoi;
511
512 handle_irq_event(desc);
513
514 } while ((desc->istate & IRQS_PENDING) &&
515 !irqd_irq_disabled(&desc->irq_data));
516
517out_eoi:
518 chip->irq_eoi(&desc->irq_data);
519 raw_spin_unlock(&desc->lock);
520}
521#endif
522
667/** 523/**
668 * handle_percpu_irq - Per CPU local irq handler 524 * handle_percpu_irq - Per CPU local irq handler
669 * @irq: the interrupt number 525 * @irq: the interrupt number
@@ -674,103 +530,145 @@ out_unlock:
674void 530void
675handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 531handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
676{ 532{
677 irqreturn_t action_ret; 533 struct irq_chip *chip = irq_desc_get_chip(desc);
678 534
679 kstat_incr_irqs_this_cpu(irq, desc); 535 kstat_incr_irqs_this_cpu(irq, desc);
680 536
681 if (desc->irq_data.chip->irq_ack) 537 if (chip->irq_ack)
682 desc->irq_data.chip->irq_ack(&desc->irq_data); 538 chip->irq_ack(&desc->irq_data);
683 539
684 action_ret = handle_IRQ_event(irq, desc->action); 540 handle_irq_event_percpu(desc, desc->action);
685 if (!noirqdebug)
686 note_interrupt(irq, desc, action_ret);
687 541
688 if (desc->irq_data.chip->irq_eoi) 542 if (chip->irq_eoi)
689 desc->irq_data.chip->irq_eoi(&desc->irq_data); 543 chip->irq_eoi(&desc->irq_data);
690} 544}
691 545
692void 546void
693__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 547__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
694 const char *name) 548 const char *name)
695{ 549{
696 struct irq_desc *desc = irq_to_desc(irq);
697 unsigned long flags; 550 unsigned long flags;
551 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
698 552
699 if (!desc) { 553 if (!desc)
700 printk(KERN_ERR
701 "Trying to install type control for IRQ%d\n", irq);
702 return; 554 return;
703 }
704 555
705 if (!handle) 556 if (!handle) {
706 handle = handle_bad_irq; 557 handle = handle_bad_irq;
707 else if (desc->irq_data.chip == &no_irq_chip) { 558 } else {
708 printk(KERN_WARNING "Trying to install %sinterrupt handler " 559 if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
709 "for IRQ%d\n", is_chained ? "chained " : "", irq); 560 goto out;
710 /*
711 * Some ARM implementations install a handler for really dumb
712 * interrupt hardware without setting an irq_chip. This worked
713 * with the ARM no_irq_chip but the check in setup_irq would
714 * prevent us to setup the interrupt at all. Switch it to
715 * dummy_irq_chip for easy transition.
716 */
717 desc->irq_data.chip = &dummy_irq_chip;
718 } 561 }
719 562
720 chip_bus_lock(desc);
721 raw_spin_lock_irqsave(&desc->lock, flags);
722
723 /* Uninstall? */ 563 /* Uninstall? */
724 if (handle == handle_bad_irq) { 564 if (handle == handle_bad_irq) {
725 if (desc->irq_data.chip != &no_irq_chip) 565 if (desc->irq_data.chip != &no_irq_chip)
726 mask_ack_irq(desc); 566 mask_ack_irq(desc);
727 desc->status |= IRQ_DISABLED; 567 irq_state_set_disabled(desc);
728 desc->depth = 1; 568 desc->depth = 1;
729 } 569 }
730 desc->handle_irq = handle; 570 desc->handle_irq = handle;
731 desc->name = name; 571 desc->name = name;
732 572
733 if (handle != handle_bad_irq && is_chained) { 573 if (handle != handle_bad_irq && is_chained) {
734 desc->status &= ~IRQ_DISABLED; 574 irq_settings_set_noprobe(desc);
735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 575 irq_settings_set_norequest(desc);
736 desc->depth = 0; 576 irq_startup(desc);
737 desc->irq_data.chip->irq_startup(&desc->irq_data);
738 } 577 }
739 raw_spin_unlock_irqrestore(&desc->lock, flags); 578out:
740 chip_bus_sync_unlock(desc); 579 irq_put_desc_busunlock(desc, flags);
741}
742EXPORT_SYMBOL_GPL(__set_irq_handler);
743
744void
745set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
746 irq_flow_handler_t handle)
747{
748 set_irq_chip(irq, chip);
749 __set_irq_handler(irq, handle, 0, NULL);
750} 580}
581EXPORT_SYMBOL_GPL(__irq_set_handler);
751 582
752void 583void
753set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, 584irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
754 irq_flow_handler_t handle, const char *name) 585 irq_flow_handler_t handle, const char *name)
755{ 586{
756 set_irq_chip(irq, chip); 587 irq_set_chip(irq, chip);
757 __set_irq_handler(irq, handle, 0, name); 588 __irq_set_handler(irq, handle, 0, name);
758} 589}
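A hypothetical irqchip-driver fragment showing how the renamed setup helpers above are typically combined. The chip, its callbacks and the way the irq number is obtained are invented; only the calls themselves come from this patch.

#include <linux/irq.h>

static void demo_mask(struct irq_data *d)   { /* mask the line in hardware */ }
static void demo_unmask(struct irq_data *d) { /* unmask the line */ }
static void demo_ack(struct irq_data *d)    { /* clear the latched edge */ }

static struct irq_chip demo_chip = {
	.name		= "demo",
	.irq_mask	= demo_mask,
	.irq_unmask	= demo_unmask,
	.irq_ack	= demo_ack,
};

/* Called once per hardware line, e.g. from the driver's probe path. */
static void demo_map_irq(unsigned int irq, void *priv)
{
	irq_set_chip_and_handler_name(irq, &demo_chip, handle_edge_irq, "edge");
	irq_set_chip_data(irq, priv);
}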
759 590
760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 591void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
761{ 592{
762 struct irq_desc *desc = irq_to_desc(irq);
763 unsigned long flags; 593 unsigned long flags;
594 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
764 595
765 if (!desc) 596 if (!desc)
766 return; 597 return;
598 irq_settings_clr_and_set(desc, clr, set);
599
600 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
601 IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
602 if (irq_settings_has_no_balance_set(desc))
603 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
604 if (irq_settings_is_per_cpu(desc))
605 irqd_set(&desc->irq_data, IRQD_PER_CPU);
606 if (irq_settings_can_move_pcntxt(desc))
607 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
608 if (irq_settings_is_level(desc))
609 irqd_set(&desc->irq_data, IRQD_LEVEL);
610
611 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
612
613 irq_put_desc_unlock(desc, flags);
614}
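irq_modify_status() above is the single entry point for tweaking the per-irq settings bits. A hedged usage sketch with an invented caller; the flag names are the ones listed in debug.h later in this patch, and only bits covered by the modify mask take effect.

#include <linux/irq.h>

/* Hypothetical helper: allow probing/requesting again and mark the
 * line as level triggered. */
static void demo_mark_level(unsigned int irq)
{
	irq_modify_status(irq, IRQ_NOREQUEST | IRQ_NOPROBE, IRQ_LEVEL);
}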
615
616/**
617 * irq_cpu_online - Invoke all irq_cpu_online functions.
618 *
619 * Iterate through all irqs and invoke the chip.irq_cpu_online()
620 * for each.
621 */
622void irq_cpu_online(void)
623{
624 struct irq_desc *desc;
625 struct irq_chip *chip;
626 unsigned long flags;
627 unsigned int irq;
628
629 for_each_active_irq(irq) {
630 desc = irq_to_desc(irq);
631 if (!desc)
632 continue;
767 633
768 /* Sanitize flags */ 634 raw_spin_lock_irqsave(&desc->lock, flags);
769 set &= IRQF_MODIFY_MASK;
770 clr &= IRQF_MODIFY_MASK;
771 635
772 raw_spin_lock_irqsave(&desc->lock, flags); 636 chip = irq_data_get_irq_chip(&desc->irq_data);
773 desc->status &= ~clr; 637 if (chip && chip->irq_cpu_online &&
774 desc->status |= set; 638 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
775 raw_spin_unlock_irqrestore(&desc->lock, flags); 639 !irqd_irq_disabled(&desc->irq_data)))
640 chip->irq_cpu_online(&desc->irq_data);
641
642 raw_spin_unlock_irqrestore(&desc->lock, flags);
643 }
644}
645
646/**
647 * irq_cpu_offline - Invoke all irq_cpu_offline functions.
648 *
649 * Iterate through all irqs and invoke the chip.irq_cpu_offline()
650 * for each.
651 */
652void irq_cpu_offline(void)
653{
654 struct irq_desc *desc;
655 struct irq_chip *chip;
656 unsigned long flags;
657 unsigned int irq;
658
659 for_each_active_irq(irq) {
660 desc = irq_to_desc(irq);
661 if (!desc)
662 continue;
663
664 raw_spin_lock_irqsave(&desc->lock, flags);
665
666 chip = irq_data_get_irq_chip(&desc->irq_data);
667 if (chip && chip->irq_cpu_offline &&
668 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
669 !irqd_irq_disabled(&desc->irq_data)))
670 chip->irq_cpu_offline(&desc->irq_data);
671
672 raw_spin_unlock_irqrestore(&desc->lock, flags);
673 }
776} 674}
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 000000000000..306cba37e9a5
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,44 @@
1/*
2 * Debugging printout:
3 */
4
5#include <linux/kallsyms.h>
6
7#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9/* FIXME */
10#define PD(f) do { } while (0)
11
12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
13{
14 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
15 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
16 printk("->handle_irq(): %p, ", desc->handle_irq);
17 print_symbol("%s\n", (unsigned long)desc->handle_irq);
18 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
19 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
20 printk("->action(): %p\n", desc->action);
21 if (desc->action) {
22 printk("->action->handler(): %p, ", desc->action->handler);
23 print_symbol("%s\n", (unsigned long)desc->action->handler);
24 }
25
26 P(IRQ_LEVEL);
27 P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST);
30 P(IRQ_NOAUTOEN);
31
32 PS(IRQS_AUTODETECT);
33 PS(IRQS_REPLAY);
34 PS(IRQS_WAITING);
35 PS(IRQS_PENDING);
36
37 PD(IRQS_INPROGRESS);
38 PD(IRQS_DISABLED);
39 PD(IRQS_MASKED);
40}
41
42#undef P
43#undef PS
44#undef PD
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 20dc5474947e..b5fcd96c7102 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data)
31 return 0; 31 return 0;
32} 32}
33 33
34#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
35static void compat_noop(unsigned int irq) { }
36#define END_INIT .end = compat_noop
37#else
38#define END_INIT
39#endif
40
41/* 34/*
42 * Generic no controller implementation 35 * Generic no controller implementation
43 */ 36 */
@@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = {
48 .irq_enable = noop, 41 .irq_enable = noop,
49 .irq_disable = noop, 42 .irq_disable = noop,
50 .irq_ack = ack_bad, 43 .irq_ack = ack_bad,
51 END_INIT
52}; 44};
53 45
54/* 46/*
@@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = {
64 .irq_ack = noop, 56 .irq_ack = noop,
65 .irq_mask = noop, 57 .irq_mask = noop,
66 .irq_unmask = noop, 58 .irq_unmask = noop,
67 END_INIT
68}; 59};
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a7190122..90cb55f6d7eb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
51 "but no thread function available.", irq, action->name); 51 "but no thread function available.", irq, action->name);
52} 52}
53 53
54/** 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55 * handle_IRQ_event - irq action chain handler 55{
56 * @irq: the interrupt number 56 /*
57 * @action: the interrupt action chain for this irq 57 * Wake up the handler thread for this action. In case the
58 * 58 * thread crashed and was killed we just pretend that we
59 * Handles the action chain of an irq event 59 * handled the interrupt. The hardirq handler has disabled the
60 */ 60 * device interrupt, so no irq storm is lurking. If the
61irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) 61 * RUNTHREAD bit is already set, nothing to do.
62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return;
66
67 /*
68 * It's safe to OR the mask lockless here. We have only two
69 * places which write to threads_oneshot: This code and the
70 * irq thread.
71 *
72 * This code is the hard irq context and can never run on two
73 * cpus in parallel. If it ever does we have more serious
74 * problems than this bitmask.
75 *
76 * The irq threads of this irq which clear their "running" bit
77 * in threads_oneshot are serialized via desc->lock against
78 * each other and they are serialized against this code by
79 * IRQS_INPROGRESS.
80 *
81 * Hard irq handler:
82 *
83 * spin_lock(desc->lock);
84 * desc->state |= IRQS_INPROGRESS;
85 * spin_unlock(desc->lock);
86 * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
87 * desc->threads_oneshot |= mask;
88 * spin_lock(desc->lock);
89 * desc->state &= ~IRQS_INPROGRESS;
90 * spin_unlock(desc->lock);
91 *
92 * irq thread:
93 *
94 * again:
95 * spin_lock(desc->lock);
96 * if (desc->state & IRQS_INPROGRESS) {
97 * spin_unlock(desc->lock);
98 * while(desc->state & IRQS_INPROGRESS)
99 * cpu_relax();
100 * goto again;
101 * }
102 * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
103 * desc->threads_oneshot &= ~mask;
104 * spin_unlock(desc->lock);
105 *
106 * So either the thread waits for us to clear IRQS_INPROGRESS
107 * or we are waiting in the flow handler for desc->lock to be
108 * released before we reach this point. The thread also checks
109 * IRQTF_RUNTHREAD under desc->lock. If set it leaves
110 * threads_oneshot untouched and runs the thread another time.
111 */
112 desc->threads_oneshot |= action->thread_mask;
113 wake_up_process(action->thread);
114}
115
116irqreturn_t
117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
62{ 118{
63 irqreturn_t ret, retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
64 unsigned int status = 0; 120 unsigned int random = 0, irq = desc->irq_data.irq;
65 121
66 do { 122 do {
123 irqreturn_t res;
124
67 trace_irq_handler_entry(irq, action); 125 trace_irq_handler_entry(irq, action);
68 ret = action->handler(irq, action->dev_id); 126 res = action->handler(irq, action->dev_id);
69 trace_irq_handler_exit(irq, action, ret); 127 trace_irq_handler_exit(irq, action, res);
128
129 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
130 irq, action->handler))
131 local_irq_disable();
70 132
71 switch (ret) { 133 switch (res) {
72 case IRQ_WAKE_THREAD: 134 case IRQ_WAKE_THREAD:
73 /* 135 /*
74 * Set result to handled so the spurious check 136 * Set result to handled so the spurious check
75 * does not trigger. 137 * does not trigger.
76 */ 138 */
77 ret = IRQ_HANDLED; 139 res = IRQ_HANDLED;
78 140
79 /* 141 /*
80 * Catch drivers which return WAKE_THREAD but 142 * Catch drivers which return WAKE_THREAD but
@@ -85,36 +147,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
85 break; 147 break;
86 } 148 }
87 149
88 /* 150 irq_wake_thread(desc, action);
89 * Wake up the handler thread for this
90 * action. In case the thread crashed and was
91 * killed we just pretend that we handled the
92 * interrupt. The hardirq handler above has
93 * disabled the device interrupt, so no irq
94 * storm is lurking.
95 */
96 if (likely(!test_bit(IRQTF_DIED,
97 &action->thread_flags))) {
98 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
99 wake_up_process(action->thread);
100 }
101 151
102 /* Fall through to add to randomness */ 152 /* Fall through to add to randomness */
103 case IRQ_HANDLED: 153 case IRQ_HANDLED:
104 status |= action->flags; 154 random |= action->flags;
105 break; 155 break;
106 156
107 default: 157 default:
108 break; 158 break;
109 } 159 }
110 160
111 retval |= ret; 161 retval |= res;
112 action = action->next; 162 action = action->next;
113 } while (action); 163 } while (action);
114 164
115 if (status & IRQF_SAMPLE_RANDOM) 165 if (random & IRQF_SAMPLE_RANDOM)
116 add_interrupt_randomness(irq); 166 add_interrupt_randomness(irq);
117 local_irq_disable();
118 167
168 if (!noirqdebug)
169 note_interrupt(irq, desc, retval);
119 return retval; 170 return retval;
120} 171}
172
173irqreturn_t handle_irq_event(struct irq_desc *desc)
174{
175 struct irqaction *action = desc->action;
176 irqreturn_t ret;
177
178 desc->istate &= ~IRQS_PENDING;
179 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
180 raw_spin_unlock(&desc->lock);
181
182 ret = handle_irq_event_percpu(desc, action);
183
184 raw_spin_lock(&desc->lock);
185 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
186 return ret;
187}
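For context, a hypothetical driver on the threaded path that irq_wake_thread() above serves: the primary handler returns IRQ_WAKE_THREAD and the real work runs in thread_fn, with IRQF_ONESHOT keeping the line masked until the thread finishes. The device and names are invented; only the request_threaded_irq() interface is assumed from the existing API.

#include <linux/interrupt.h>

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/* quick check in hard irq context, then defer to the thread */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/* heavy lifting runs in the per-action irq thread */
	return IRQ_HANDLED;
}

static int demo_setup(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
				    IRQF_ONESHOT, "demo", dev);
}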
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4571ae7e085a..6546431447d7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,27 +1,87 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 *
4 * Do not ever include this file from anything else than
5 * kernel/irq/. Do not even think about using any information outside
6 * of this file for your non core code.
3 */ 7 */
4#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
5 9
10#ifdef CONFIG_SPARSE_IRQ
11# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
12#else
13# define IRQ_BITMAP_BITS NR_IRQS
14#endif
15
16#define istate core_internal_state__do_not_mess_with_it
17
6extern int noirqdebug; 18extern int noirqdebug;
7 19
8#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) 20/*
21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
26 * IRQTF_FORCED_THREAD - irq action is force threaded
27 */
28enum {
29 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
31 IRQTF_WARNED,
32 IRQTF_AFFINITY,
33 IRQTF_FORCED_THREAD,
34};
9 35
10/* Set default functions for irq_chip structures: */ 36/*
11extern void irq_chip_set_defaults(struct irq_chip *chip); 37 * Bit masks for desc->state
38 *
39 * IRQS_AUTODETECT - autodetection in progress
40 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
41 * detection
42 * IRQS_POLL_INPROGRESS - polling in progress
43 * IRQS_ONESHOT - irq is not unmasked in primary handler
44 * IRQS_REPLAY - irq is replayed
45 * IRQS_WAITING - irq is waiting
46 * IRQS_PENDING - irq is pending and replayed later
47 * IRQS_SUSPENDED - irq is suspended
48 */
49enum {
50 IRQS_AUTODETECT = 0x00000001,
51 IRQS_SPURIOUS_DISABLED = 0x00000002,
52 IRQS_POLL_INPROGRESS = 0x00000008,
53 IRQS_ONESHOT = 0x00000020,
54 IRQS_REPLAY = 0x00000040,
55 IRQS_WAITING = 0x00000080,
56 IRQS_PENDING = 0x00000200,
57 IRQS_SUSPENDED = 0x00000800,
58};
59
60#include "debug.h"
61#include "settings.h"
12 62
13/* Set default handler: */ 63#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
14extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
15 64
16extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 65extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
17 unsigned long flags); 66 unsigned long flags);
18extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
19extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
20 69
70extern int irq_startup(struct irq_desc *desc);
71extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc);
74extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc);
76
21extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 77extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
22 78
79irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
80irqreturn_t handle_irq_event(struct irq_desc *desc);
81
23/* Resending of interrupts :*/ 82/* Resending of interrupts :*/
24void check_irq_resend(struct irq_desc *desc, unsigned int irq); 83void check_irq_resend(struct irq_desc *desc, unsigned int irq);
84bool irq_wait_for_poll(struct irq_desc *desc);
25 85
26#ifdef CONFIG_PROC_FS 86#ifdef CONFIG_PROC_FS
27extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 87extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -37,20 +97,10 @@ static inline void unregister_handler_proc(unsigned int irq,
37 struct irqaction *action) { } 97 struct irqaction *action) { }
38#endif 98#endif
39 99
40extern int irq_select_affinity_usr(unsigned int irq); 100extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
41 101
42extern void irq_set_thread_affinity(struct irq_desc *desc); 102extern void irq_set_thread_affinity(struct irq_desc *desc);
43 103
44#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
45static inline void irq_end(unsigned int irq, struct irq_desc *desc)
46{
47 if (desc->irq_data.chip && desc->irq_data.chip->end)
48 desc->irq_data.chip->end(irq);
49}
50#else
51static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
52#endif
53
54/* Inline functions for support of irq chips on slow busses */ 104/* Inline functions for support of irq chips on slow busses */
55static inline void chip_bus_lock(struct irq_desc *desc) 105static inline void chip_bus_lock(struct irq_desc *desc)
56{ 106{
@@ -64,43 +114,58 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
64 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
65} 115}
66 116
117struct irq_desc *
118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
120
121static inline struct irq_desc *
122irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
123{
124 return __irq_get_desc_lock(irq, flags, true);
125}
126
127static inline void
128irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
129{
130 __irq_put_desc_unlock(desc, flags, true);
131}
132
133static inline struct irq_desc *
134irq_get_desc_lock(unsigned int irq, unsigned long *flags)
135{
136 return __irq_get_desc_lock(irq, flags, false);
137}
138
139static inline void
140irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
141{
142 __irq_put_desc_unlock(desc, flags, false);
143}
144
67/* 145/*
68 * Debugging printout: 146 * Manipulation functions for irq_data.state
69 */ 147 */
148static inline void irqd_set_move_pending(struct irq_data *d)
149{
150 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
151}
70 152
71#include <linux/kallsyms.h> 153static inline void irqd_clr_move_pending(struct irq_data *d)
72 154{
73#define P(f) if (desc->status & f) printk("%14s set\n", #f) 155 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
156}
74 157
75static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 158static inline void irqd_clear(struct irq_data *d, unsigned int mask)
76{ 159{
77 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 160 d->state_use_accessors &= ~mask;
78 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
79 printk("->handle_irq(): %p, ", desc->handle_irq);
80 print_symbol("%s\n", (unsigned long)desc->handle_irq);
81 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
82 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
83 printk("->action(): %p\n", desc->action);
84 if (desc->action) {
85 printk("->action->handler(): %p, ", desc->action->handler);
86 print_symbol("%s\n", (unsigned long)desc->action->handler);
87 }
88
89 P(IRQ_INPROGRESS);
90 P(IRQ_DISABLED);
91 P(IRQ_PENDING);
92 P(IRQ_REPLAY);
93 P(IRQ_AUTODETECT);
94 P(IRQ_WAITING);
95 P(IRQ_LEVEL);
96 P(IRQ_MASKED);
97#ifdef CONFIG_IRQ_PER_CPU
98 P(IRQ_PER_CPU);
99#endif
100 P(IRQ_NOPROBE);
101 P(IRQ_NOREQUEST);
102 P(IRQ_NOAUTOEN);
103} 161}
104 162
105#undef P 163static inline void irqd_set(struct irq_data *d, unsigned int mask)
164{
165 d->state_use_accessors |= mask;
166}
106 167
168static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
169{
170 return d->state_use_accessors & mask;
171}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 282f20230e67..2c039c9b9383 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
79 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
80 desc->irq_data.handler_data = NULL; 80 desc->irq_data.handler_data = NULL;
81 desc->irq_data.msi_desc = NULL; 81 desc->irq_data.msi_desc = NULL;
82 desc->status = IRQ_DEFAULT_INIT_FLAGS; 82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
83 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
83 desc->handle_irq = handle_bad_irq; 84 desc->handle_irq = handle_bad_irq;
84 desc->depth = 1; 85 desc->depth = 1;
85 desc->irq_count = 0; 86 desc->irq_count = 0;
@@ -94,7 +95,7 @@ int nr_irqs = NR_IRQS;
94EXPORT_SYMBOL_GPL(nr_irqs); 95EXPORT_SYMBOL_GPL(nr_irqs);
95 96
96static DEFINE_MUTEX(sparse_irq_lock); 97static DEFINE_MUTEX(sparse_irq_lock);
97static DECLARE_BITMAP(allocated_irqs, NR_IRQS); 98static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
98 99
99#ifdef CONFIG_SPARSE_IRQ 100#ifdef CONFIG_SPARSE_IRQ
100 101
@@ -197,13 +198,12 @@ err:
197 return -ENOMEM; 198 return -ENOMEM;
198} 199}
199 200
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201static int irq_expand_nr_irqs(unsigned int nr)
201{ 202{
202 int res = irq_alloc_descs(irq, irq, 1, node); 203 if (nr > IRQ_BITMAP_BITS)
203 204 return -ENOMEM;
204 if (res == -EEXIST || res == irq) 205 nr_irqs = nr;
205 return irq_to_desc(irq); 206 return 0;
206 return NULL;
207} 207}
208 208
209int __init early_irq_init(void) 209int __init early_irq_init(void)
@@ -217,6 +217,15 @@ int __init early_irq_init(void)
217 initcnt = arch_probe_nr_irqs(); 217 initcnt = arch_probe_nr_irqs();
218 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); 218 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
219 219
220 if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
221 nr_irqs = IRQ_BITMAP_BITS;
222
223 if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
224 initcnt = IRQ_BITMAP_BITS;
225
226 if (initcnt > nr_irqs)
227 nr_irqs = initcnt;
228
220 for (i = 0; i < initcnt; i++) { 229 for (i = 0; i < initcnt; i++) {
221 desc = alloc_desc(i, node); 230 desc = alloc_desc(i, node);
222 set_bit(i, allocated_irqs); 231 set_bit(i, allocated_irqs);
@@ -229,7 +238,6 @@ int __init early_irq_init(void)
229 238
230struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 239struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
231 [0 ... NR_IRQS-1] = { 240 [0 ... NR_IRQS-1] = {
232 .status = IRQ_DEFAULT_INIT_FLAGS,
233 .handle_irq = handle_bad_irq, 241 .handle_irq = handle_bad_irq,
234 .depth = 1, 242 .depth = 1,
235 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), 243 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
@@ -251,8 +259,8 @@ int __init early_irq_init(void)
251 for (i = 0; i < count; i++) { 259 for (i = 0; i < count; i++) {
252 desc[i].irq_data.irq = i; 260 desc[i].irq_data.irq = i;
253 desc[i].irq_data.chip = &no_irq_chip; 261 desc[i].irq_data.chip = &no_irq_chip;
254 /* TODO : do this allocation on-demand ... */
255 desc[i].kstat_irqs = alloc_percpu(unsigned int); 262 desc[i].kstat_irqs = alloc_percpu(unsigned int);
263 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
256 alloc_masks(desc + i, GFP_KERNEL, node); 264 alloc_masks(desc + i, GFP_KERNEL, node);
257 desc_smp_init(desc + i, node); 265 desc_smp_init(desc + i, node);
258 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 266 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -265,11 +273,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
265 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 273 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
266} 274}
267 275
268struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
269{
270 return irq_to_desc(irq);
271}
272
273static void free_desc(unsigned int irq) 276static void free_desc(unsigned int irq)
274{ 277{
275 dynamic_irq_cleanup(irq); 278 dynamic_irq_cleanup(irq);
@@ -277,24 +280,14 @@ static void free_desc(unsigned int irq)
277 280
278static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 281static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
279{ 282{
280#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
281 struct irq_desc *desc;
282 unsigned int i;
283
284 for (i = 0; i < cnt; i++) {
285 desc = irq_to_desc(start + i);
286 if (desc && !desc->kstat_irqs) {
287 unsigned int __percpu *stats = alloc_percpu(unsigned int);
288
289 if (!stats)
290 return -1;
291 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
292 free_percpu(stats);
293 }
294 }
295#endif
296 return start; 283 return start;
297} 284}
285
286static int irq_expand_nr_irqs(unsigned int nr)
287{
288 return -ENOMEM;
289}
290
298#endif /* !CONFIG_SPARSE_IRQ */ 291#endif /* !CONFIG_SPARSE_IRQ */
299 292
300/* Dynamic interrupt handling */ 293/* Dynamic interrupt handling */
@@ -338,14 +331,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
338 331
339 mutex_lock(&sparse_irq_lock); 332 mutex_lock(&sparse_irq_lock);
340 333
341 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 334 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
335 from, cnt, 0);
342 ret = -EEXIST; 336 ret = -EEXIST;
343 if (irq >=0 && start != irq) 337 if (irq >=0 && start != irq)
344 goto err; 338 goto err;
345 339
346 ret = -ENOMEM; 340 if (start + cnt > nr_irqs) {
347 if (start >= nr_irqs) 341 ret = irq_expand_nr_irqs(start + cnt);
348 goto err; 342 if (ret)
343 goto err;
344 }
349 345
350 bitmap_set(allocated_irqs, start, cnt); 346 bitmap_set(allocated_irqs, start, cnt);
351 mutex_unlock(&sparse_irq_lock); 347 mutex_unlock(&sparse_irq_lock);
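The allocation path above searches the allocated_irqs bitmap for a free range and now grows nr_irqs on demand when the range extends past it. A toy stand-alone version of just the range search, using a plain array instead of the kernel's bitmap helpers:

#include <stdio.h>
#include <stdbool.h>

#define SLOTS 32
static bool used[SLOTS];

/* Find cnt consecutive free slots at or after from, mark them used,
 * and return the start index, or -1 (think -ENOMEM) on failure. */
static int alloc_range(unsigned int from, unsigned int cnt)
{
	for (unsigned int start = from; start + cnt <= SLOTS; start++) {
		bool free_run = true;

		for (unsigned int i = 0; i < cnt; i++) {
			if (used[start + i]) {
				free_run = false;
				break;
			}
		}
		if (free_run) {
			for (unsigned int i = 0; i < cnt; i++)
				used[start + i] = true;
			return (int)start;
		}
	}
	return -1;
}

int main(void)
{
	printf("first range:  %d\n", alloc_range(0, 4));	/* 0 */
	printf("second range: %d\n", alloc_range(0, 4));	/* 4 */
	return 0;
}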
@@ -392,6 +388,26 @@ unsigned int irq_get_next_irq(unsigned int offset)
392 return find_next_bit(allocated_irqs, nr_irqs, offset); 388 return find_next_bit(allocated_irqs, nr_irqs, offset);
393} 389}
394 390
391struct irq_desc *
392__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
393{
394 struct irq_desc *desc = irq_to_desc(irq);
395
396 if (desc) {
397 if (bus)
398 chip_bus_lock(desc);
399 raw_spin_lock_irqsave(&desc->lock, *flags);
400 }
401 return desc;
402}
403
404void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
405{
406 raw_spin_unlock_irqrestore(&desc->lock, flags);
407 if (bus)
408 chip_bus_sync_unlock(desc);
409}
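A user-space model of the lookup-and-lock pattern that __irq_get_desc_lock()/__irq_put_desc_unlock() implement above: resolve an index to a descriptor and return it locked, or NULL, and hand it back through the matching put. The bus-lock variant is omitted and a pthread mutex stands in for the raw spinlock.

#include <stdio.h>
#include <pthread.h>

struct desc {
	pthread_mutex_t lock;
	int data;
};

#define NR_DESC 4
static struct desc table[NR_DESC];

static struct desc *get_desc_lock(unsigned int idx)
{
	if (idx >= NR_DESC)
		return NULL;		/* like irq_to_desc() returning NULL */
	pthread_mutex_lock(&table[idx].lock);
	return &table[idx];
}

static void put_desc_unlock(struct desc *d)
{
	pthread_mutex_unlock(&d->lock);
}

int main(void)
{
	for (int i = 0; i < NR_DESC; i++)
		pthread_mutex_init(&table[i].lock, NULL);

	struct desc *d = get_desc_lock(2);
	if (d) {
		d->data = 42;		/* modify only while holding the lock */
		put_desc_unlock(d);
	}
	printf("desc 2 data: %d\n", table[2].data);
	return 0;
}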
410
395/** 411/**
396 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 412 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
397 * @irq: irq number to initialize 413 * @irq: irq number to initialize
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0caa59f747dd..07c1611f3899 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
17 17
18#include "internals.h" 18#include "internals.h"
19 19
20#ifdef CONFIG_IRQ_FORCED_THREADING
21__read_mostly bool force_irqthreads;
22
23static int __init setup_forced_irqthreads(char *arg)
24{
25 force_irqthreads = true;
26 return 0;
27}
28early_param("threadirqs", setup_forced_irqthreads);
29#endif
30
20/** 31/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 32 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 33 * @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
30void synchronize_irq(unsigned int irq) 41void synchronize_irq(unsigned int irq)
31{ 42{
32 struct irq_desc *desc = irq_to_desc(irq); 43 struct irq_desc *desc = irq_to_desc(irq);
33 unsigned int status; 44 bool inprogress;
34 45
35 if (!desc) 46 if (!desc)
36 return; 47 return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
42 * Wait until we're out of the critical section. This might 53 * Wait until we're out of the critical section. This might
43 * give the wrong answer due to the lack of memory barriers. 54 * give the wrong answer due to the lack of memory barriers.
44 */ 55 */
45 while (desc->status & IRQ_INPROGRESS) 56 while (irqd_irq_inprogress(&desc->irq_data))
46 cpu_relax(); 57 cpu_relax();
47 58
48 /* Ok, that indicated we're done: double-check carefully. */ 59 /* Ok, that indicated we're done: double-check carefully. */
49 raw_spin_lock_irqsave(&desc->lock, flags); 60 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 61 inprogress = irqd_irq_inprogress(&desc->irq_data);
51 raw_spin_unlock_irqrestore(&desc->lock, flags); 62 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 63
53 /* Oops, that failed? */ 64 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 65 } while (inprogress);
55 66
56 /* 67 /*
57 * We made sure that no hardirq handler is running. Now verify 68 * We made sure that no hardirq handler is running. Now verify
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 84{
74 struct irq_desc *desc = irq_to_desc(irq); 85 struct irq_desc *desc = irq_to_desc(irq);
75 86
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || 87 if (!desc || !irqd_can_balance(&desc->irq_data) ||
77 !desc->irq_data.chip->irq_set_affinity) 88 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
78 return 0; 89 return 0;
79 90
80 return 1; 91 return 1;
@@ -100,67 +111,180 @@ void irq_set_thread_affinity(struct irq_desc *desc)
100 } 111 }
101} 112}
102 113
114#ifdef CONFIG_GENERIC_PENDING_IRQ
115static inline bool irq_can_move_pcntxt(struct irq_data *data)
116{
117 return irqd_can_move_in_process_context(data);
118}
119static inline bool irq_move_pending(struct irq_data *data)
120{
121 return irqd_is_setaffinity_pending(data);
122}
123static inline void
124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
125{
126 cpumask_copy(desc->pending_mask, mask);
127}
128static inline void
129irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
130{
131 cpumask_copy(mask, desc->pending_mask);
132}
133#else
134static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
135static inline bool irq_move_pending(struct irq_data *data) { return false; }
136static inline void
137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
138static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif
141
142int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
143{
144 struct irq_chip *chip = irq_data_get_irq_chip(data);
145 struct irq_desc *desc = irq_data_to_desc(data);
146 int ret = 0;
147
148 if (!chip || !chip->irq_set_affinity)
149 return -EINVAL;
150
151 if (irq_can_move_pcntxt(data)) {
152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160 } else {
161 irqd_set_move_pending(data);
162 irq_copy_pending(desc, mask);
163 }
164
165 if (desc->affinity_notify) {
166 kref_get(&desc->affinity_notify->kref);
167 schedule_work(&desc->affinity_notify->work);
168 }
169 irqd_set(data, IRQD_AFFINITY_SET);
170
171 return ret;
172}
173
103/** 174/**
104 * irq_set_affinity - Set the irq affinity of a given irq 175 * irq_set_affinity - Set the irq affinity of a given irq
105 * @irq: Interrupt to set affinity 176 * @irq: Interrupt to set affinity
106 * @cpumask: cpumask 177 * @mask: cpumask
107 * 178 *
108 */ 179 */
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 180int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
110{ 181{
111 struct irq_desc *desc = irq_to_desc(irq); 182 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip;
113 unsigned long flags; 183 unsigned long flags;
184 int ret;
114 185
115 if (!chip->irq_set_affinity) 186 if (!desc)
116 return -EINVAL; 187 return -EINVAL;
117 188
118 raw_spin_lock_irqsave(&desc->lock, flags); 189 raw_spin_lock_irqsave(&desc->lock, flags);
119 190 ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
120#ifdef CONFIG_GENERIC_PENDING_IRQ
121 if (desc->status & IRQ_MOVE_PCNTXT) {
122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
123 cpumask_copy(desc->irq_data.affinity, cpumask);
124 irq_set_thread_affinity(desc);
125 }
126 }
127 else {
128 desc->status |= IRQ_MOVE_PENDING;
129 cpumask_copy(desc->pending_mask, cpumask);
130 }
131#else
132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
133 cpumask_copy(desc->irq_data.affinity, cpumask);
134 irq_set_thread_affinity(desc);
135 }
136#endif
137 desc->status |= IRQ_AFFINITY_SET;
138 raw_spin_unlock_irqrestore(&desc->lock, flags); 191 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return 0; 192 return ret;
140} 193}
141 194
142int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
143{ 196{
197 unsigned long flags;
198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
199
200 if (!desc)
201 return -EINVAL;
202 desc->affinity_hint = m;
203 irq_put_desc_unlock(desc, flags);
204 return 0;
205}
206EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
207
208static void irq_affinity_notify(struct work_struct *work)
209{
210 struct irq_affinity_notify *notify =
211 container_of(work, struct irq_affinity_notify, work);
212 struct irq_desc *desc = irq_to_desc(notify->irq);
213 cpumask_var_t cpumask;
214 unsigned long flags;
215
216 if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
217 goto out;
218
219 raw_spin_lock_irqsave(&desc->lock, flags);
220 if (irq_move_pending(&desc->irq_data))
221 irq_get_pending(cpumask, desc);
222 else
223 cpumask_copy(cpumask, desc->irq_data.affinity);
224 raw_spin_unlock_irqrestore(&desc->lock, flags);
225
226 notify->notify(notify, cpumask);
227
228 free_cpumask_var(cpumask);
229out:
230 kref_put(&notify->kref, notify->release);
231}
232
233/**
234 * irq_set_affinity_notifier - control notification of IRQ affinity changes
235 * @irq: Interrupt for which to enable/disable notification
236 * @notify: Context for notification, or %NULL to disable
237 * notification. Function pointers must be initialised;
238 * the other fields will be initialised by this function.
239 *
240 * Must be called in process context. Notification may only be enabled
241 * after the IRQ is allocated and must be disabled before the IRQ is
242 * freed using free_irq().
243 */
244int
245irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
246{
144 struct irq_desc *desc = irq_to_desc(irq); 247 struct irq_desc *desc = irq_to_desc(irq);
248 struct irq_affinity_notify *old_notify;
145 unsigned long flags; 249 unsigned long flags;
146 250
251 /* The release function is promised process context */
252 might_sleep();
253
147 if (!desc) 254 if (!desc)
148 return -EINVAL; 255 return -EINVAL;
149 256
257 /* Complete initialisation of *notify */
258 if (notify) {
259 notify->irq = irq;
260 kref_init(&notify->kref);
261 INIT_WORK(&notify->work, irq_affinity_notify);
262 }
263
150 raw_spin_lock_irqsave(&desc->lock, flags); 264 raw_spin_lock_irqsave(&desc->lock, flags);
151 desc->affinity_hint = m; 265 old_notify = desc->affinity_notify;
266 desc->affinity_notify = notify;
152 raw_spin_unlock_irqrestore(&desc->lock, flags); 267 raw_spin_unlock_irqrestore(&desc->lock, flags);
153 268
269 if (old_notify)
270 kref_put(&old_notify->kref, old_notify->release);
271
154 return 0; 272 return 0;
155} 273}
156EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 274EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
157 275
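
A driver-side sketch of how the new notifier interface above is meant to be consumed. The struct my_channel, the callback bodies and my_channel_setup_notify() are assumptions for illustration; only ->notify and ->release need to be filled in before calling irq_set_affinity_notifier(), the rest is initialised by the core.

#include <linux/interrupt.h>
#include <linux/kernel.h>

struct my_channel {
	struct irq_affinity_notify	notify;
	unsigned int			irq;
};

static void my_affinity_notify(struct irq_affinity_notify *notify,
			       const cpumask_t *mask)
{
	struct my_channel *ch = container_of(notify, struct my_channel, notify);

	/* re-target per-channel resources (queues, timers) to 'mask' */
	(void)ch;
}

static void my_affinity_release(struct kref *ref)
{
	/* last reference dropped, runs in process context; nothing to free here */
}

static int my_channel_setup_notify(struct my_channel *ch)
{
	ch->notify.notify  = my_affinity_notify;
	ch->notify.release = my_affinity_release;

	return irq_set_affinity_notifier(ch->irq, &ch->notify);
}
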
158#ifndef CONFIG_AUTO_IRQ_AFFINITY 276#ifndef CONFIG_AUTO_IRQ_AFFINITY
159/* 277/*
160 * Generic version of the affinity autoselector. 278 * Generic version of the affinity autoselector.
161 */ 279 */
162static int setup_affinity(unsigned int irq, struct irq_desc *desc) 280static int
281setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
163{ 282{
283 struct irq_chip *chip = irq_desc_get_chip(desc);
284 struct cpumask *set = irq_default_affinity;
285 int ret;
286
287 /* Excludes PER_CPU and NO_BALANCE interrupts */
164 if (!irq_can_set_affinity(irq)) 288 if (!irq_can_set_affinity(irq))
165 return 0; 289 return 0;
166 290
@@ -168,22 +292,27 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * Preserve an userspace affinity setup, but make sure that 292 * Preserve an userspace affinity setup, but make sure that
169 * one of the targets is online. 293 * one of the targets is online.
170 */ 294 */
171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 295 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) 296 if (cpumask_intersects(desc->irq_data.affinity,
173 < nr_cpu_ids) 297 cpu_online_mask))
174 goto set_affinity; 298 set = desc->irq_data.affinity;
175 else 299 else
176 desc->status &= ~IRQ_AFFINITY_SET; 300 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
177 } 301 }
178 302
179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); 303 cpumask_and(mask, cpu_online_mask, set);
180set_affinity: 304 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); 305 switch (ret) {
182 306 case IRQ_SET_MASK_OK:
307 cpumask_copy(desc->irq_data.affinity, mask);
308 case IRQ_SET_MASK_OK_NOCOPY:
309 irq_set_thread_affinity(desc);
310 }
183 return 0; 311 return 0;
184} 312}
185#else 313#else
186static inline int setup_affinity(unsigned int irq, struct irq_desc *d) 314static inline int
315setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
187{ 316{
188 return irq_select_affinity(irq); 317 return irq_select_affinity(irq);
189} 318}
@@ -192,23 +321,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
192/* 321/*
193 * Called when affinity is set via /proc/irq 322 * Called when affinity is set via /proc/irq
194 */ 323 */
195int irq_select_affinity_usr(unsigned int irq) 324int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
196{ 325{
197 struct irq_desc *desc = irq_to_desc(irq); 326 struct irq_desc *desc = irq_to_desc(irq);
198 unsigned long flags; 327 unsigned long flags;
199 int ret; 328 int ret;
200 329
201 raw_spin_lock_irqsave(&desc->lock, flags); 330 raw_spin_lock_irqsave(&desc->lock, flags);
202 ret = setup_affinity(irq, desc); 331 ret = setup_affinity(irq, desc, mask);
203 if (!ret)
204 irq_set_thread_affinity(desc);
205 raw_spin_unlock_irqrestore(&desc->lock, flags); 332 raw_spin_unlock_irqrestore(&desc->lock, flags);
206
207 return ret; 333 return ret;
208} 334}
209 335
210#else 336#else
211static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) 337static inline int
338setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
212{ 339{
213 return 0; 340 return 0;
214} 341}
@@ -219,13 +346,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
219 if (suspend) { 346 if (suspend) {
220 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) 347 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
221 return; 348 return;
222 desc->status |= IRQ_SUSPENDED; 349 desc->istate |= IRQS_SUSPENDED;
223 } 350 }
224 351
225 if (!desc->depth++) { 352 if (!desc->depth++)
226 desc->status |= IRQ_DISABLED; 353 irq_disable(desc);
227 desc->irq_data.chip->irq_disable(&desc->irq_data); 354}
228 } 355
356static int __disable_irq_nosync(unsigned int irq)
357{
358 unsigned long flags;
359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
360
361 if (!desc)
362 return -EINVAL;
363 __disable_irq(desc, irq, false);
364 irq_put_desc_busunlock(desc, flags);
365 return 0;
229} 366}
230 367
231/** 368/**
@@ -241,17 +378,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
241 */ 378 */
242void disable_irq_nosync(unsigned int irq) 379void disable_irq_nosync(unsigned int irq)
243{ 380{
244 struct irq_desc *desc = irq_to_desc(irq); 381 __disable_irq_nosync(irq);
245 unsigned long flags;
246
247 if (!desc)
248 return;
249
250 chip_bus_lock(desc);
251 raw_spin_lock_irqsave(&desc->lock, flags);
252 __disable_irq(desc, irq, false);
253 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 chip_bus_sync_unlock(desc);
255} 382}
256EXPORT_SYMBOL(disable_irq_nosync); 383EXPORT_SYMBOL(disable_irq_nosync);
257 384
@@ -269,21 +396,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
269 */ 396 */
270void disable_irq(unsigned int irq) 397void disable_irq(unsigned int irq)
271{ 398{
272 struct irq_desc *desc = irq_to_desc(irq); 399 if (!__disable_irq_nosync(irq))
273
274 if (!desc)
275 return;
276
277 disable_irq_nosync(irq);
278 if (desc->action)
279 synchronize_irq(irq); 400 synchronize_irq(irq);
280} 401}
281EXPORT_SYMBOL(disable_irq); 402EXPORT_SYMBOL(disable_irq);
282 403
283void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 404void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
284{ 405{
285 if (resume) 406 if (resume) {
286 desc->status &= ~IRQ_SUSPENDED; 407 if (!(desc->istate & IRQS_SUSPENDED)) {
408 if (!desc->action)
409 return;
410 if (!(desc->action->flags & IRQF_FORCE_RESUME))
411 return;
412 /* Pretend that it got disabled ! */
413 desc->depth++;
414 }
415 desc->istate &= ~IRQS_SUSPENDED;
416 }
287 417
288 switch (desc->depth) { 418 switch (desc->depth) {
289 case 0: 419 case 0:
@@ -291,12 +421,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
291 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 421 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
292 break; 422 break;
293 case 1: { 423 case 1: {
294 unsigned int status = desc->status & ~IRQ_DISABLED; 424 if (desc->istate & IRQS_SUSPENDED)
295
296 if (desc->status & IRQ_SUSPENDED)
297 goto err_out; 425 goto err_out;
298 /* Prevent probing on this irq: */ 426 /* Prevent probing on this irq: */
299 desc->status = status | IRQ_NOPROBE; 427 irq_settings_set_noprobe(desc);
428 irq_enable(desc);
300 check_irq_resend(desc, irq); 429 check_irq_resend(desc, irq);
301 /* fall-through */ 430 /* fall-through */
302 } 431 }
@@ -318,21 +447,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
318 */ 447 */
319void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
320{ 449{
321 struct irq_desc *desc = irq_to_desc(irq);
322 unsigned long flags; 450 unsigned long flags;
451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
323 452
324 if (!desc) 453 if (!desc)
325 return; 454 return;
455 if (WARN(!desc->irq_data.chip,
456 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
457 goto out;
326 458
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
331 chip_bus_lock(desc);
332 raw_spin_lock_irqsave(&desc->lock, flags);
333 __enable_irq(desc, irq, false); 459 __enable_irq(desc, irq, false);
334 raw_spin_unlock_irqrestore(&desc->lock, flags); 460out:
335 chip_bus_sync_unlock(desc); 461 irq_put_desc_busunlock(desc, flags);
336} 462}
337EXPORT_SYMBOL(enable_irq); 463EXPORT_SYMBOL(enable_irq);
338 464
@@ -348,7 +474,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
348} 474}
349 475
350/** 476/**
351 * set_irq_wake - control irq power management wakeup 477 * irq_set_irq_wake - control irq power management wakeup
352 * @irq: interrupt to control 478 * @irq: interrupt to control
353 * @on: enable/disable power management wakeup 479 * @on: enable/disable power management wakeup
354 * 480 *
@@ -359,23 +485,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
359 * Wakeup mode lets this IRQ wake the system from sleep 485 * Wakeup mode lets this IRQ wake the system from sleep
360 * states like "suspend to RAM". 486 * states like "suspend to RAM".
361 */ 487 */
362int set_irq_wake(unsigned int irq, unsigned int on) 488int irq_set_irq_wake(unsigned int irq, unsigned int on)
363{ 489{
364 struct irq_desc *desc = irq_to_desc(irq);
365 unsigned long flags; 490 unsigned long flags;
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
366 int ret = 0; 492 int ret = 0;
367 493
368 /* wakeup-capable irqs can be shared between drivers that 494 /* wakeup-capable irqs can be shared between drivers that
369 * don't need to have the same sleep mode behaviors. 495 * don't need to have the same sleep mode behaviors.
370 */ 496 */
371 raw_spin_lock_irqsave(&desc->lock, flags);
372 if (on) { 497 if (on) {
373 if (desc->wake_depth++ == 0) { 498 if (desc->wake_depth++ == 0) {
374 ret = set_irq_wake_real(irq, on); 499 ret = set_irq_wake_real(irq, on);
375 if (ret) 500 if (ret)
376 desc->wake_depth = 0; 501 desc->wake_depth = 0;
377 else 502 else
378 desc->status |= IRQ_WAKEUP; 503 irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
379 } 504 }
380 } else { 505 } else {
381 if (desc->wake_depth == 0) { 506 if (desc->wake_depth == 0) {
@@ -385,14 +510,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
385 if (ret) 510 if (ret)
386 desc->wake_depth = 1; 511 desc->wake_depth = 1;
387 else 512 else
388 desc->status &= ~IRQ_WAKEUP; 513 irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
389 } 514 }
390 } 515 }
391 516 irq_put_desc_busunlock(desc, flags);
392 raw_spin_unlock_irqrestore(&desc->lock, flags);
393 return ret; 517 return ret;
394} 518}
395EXPORT_SYMBOL(set_irq_wake); 519EXPORT_SYMBOL(irq_set_irq_wake);
396 520
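
As a usage sketch, the common pattern that sits on top of irq_set_irq_wake() via the enable_irq_wake()/disable_irq_wake() wrappers; struct my_dev and the dev_pm callbacks are assumptions here, not something this patch introduces:

#include <linux/device.h>
#include <linux/interrupt.h>

struct my_dev {
	unsigned int irq;
};

static int my_suspend(struct device *dev)
{
	struct my_dev *md = dev_get_drvdata(dev);

	if (device_may_wakeup(dev))
		enable_irq_wake(md->irq);	/* irq_set_irq_wake(irq, 1) */
	return 0;
}

static int my_resume(struct device *dev)
{
	struct my_dev *md = dev_get_drvdata(dev);

	if (device_may_wakeup(dev))
		disable_irq_wake(md->irq);	/* irq_set_irq_wake(irq, 0) */
	return 0;
}
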
397/* 521/*
398 * Internal function that tells the architecture code whether a 522 * Internal function that tells the architecture code whether a
@@ -401,43 +525,27 @@ EXPORT_SYMBOL(set_irq_wake);
401 */ 525 */
402int can_request_irq(unsigned int irq, unsigned long irqflags) 526int can_request_irq(unsigned int irq, unsigned long irqflags)
403{ 527{
404 struct irq_desc *desc = irq_to_desc(irq);
405 struct irqaction *action;
406 unsigned long flags; 528 unsigned long flags;
529 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
530 int canrequest = 0;
407 531
408 if (!desc) 532 if (!desc)
409 return 0; 533 return 0;
410 534
411 if (desc->status & IRQ_NOREQUEST) 535 if (irq_settings_can_request(desc)) {
412 return 0; 536 if (desc->action)
413 537 if (irqflags & desc->action->flags & IRQF_SHARED)
414 raw_spin_lock_irqsave(&desc->lock, flags); 538 canrequest = 1;
415 action = desc->action; 539 }
416 if (action) 540 irq_put_desc_unlock(desc, flags);
417 if (irqflags & action->flags & IRQF_SHARED) 541 return canrequest;
418 action = NULL;
419
420 raw_spin_unlock_irqrestore(&desc->lock, flags);
421
422 return !action;
423}
424
425void compat_irq_chip_set_default_handler(struct irq_desc *desc)
426{
427 /*
428 * If the architecture still has not overriden
429 * the flow handler then zap the default. This
430 * should catch incorrect flow-type setting.
431 */
432 if (desc->handle_irq == &handle_bad_irq)
433 desc->handle_irq = NULL;
434} 542}
435 543
436int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 544int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
437 unsigned long flags) 545 unsigned long flags)
438{ 546{
439 int ret;
440 struct irq_chip *chip = desc->irq_data.chip; 547 struct irq_chip *chip = desc->irq_data.chip;
548 int ret, unmask = 0;
441 549
442 if (!chip || !chip->irq_set_type) { 550 if (!chip || !chip->irq_set_type) {
443 /* 551 /*
@@ -449,23 +557,41 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
449 return 0; 557 return 0;
450 } 558 }
451 559
560 flags &= IRQ_TYPE_SENSE_MASK;
561
562 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
563 if (!irqd_irq_masked(&desc->irq_data))
564 mask_irq(desc);
565 if (!irqd_irq_disabled(&desc->irq_data))
566 unmask = 1;
567 }
568
452 /* caller masked out all except trigger mode flags */ 569 /* caller masked out all except trigger mode flags */
453 ret = chip->irq_set_type(&desc->irq_data, flags); 570 ret = chip->irq_set_type(&desc->irq_data, flags);
454 571
455 if (ret) 572 switch (ret) {
573 case IRQ_SET_MASK_OK:
574 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
575 irqd_set(&desc->irq_data, flags);
576
577 case IRQ_SET_MASK_OK_NOCOPY:
578 flags = irqd_get_trigger_type(&desc->irq_data);
579 irq_settings_set_trigger_mask(desc, flags);
580 irqd_clear(&desc->irq_data, IRQD_LEVEL);
581 irq_settings_clr_level(desc);
582 if (flags & IRQ_TYPE_LEVEL_MASK) {
583 irq_settings_set_level(desc);
584 irqd_set(&desc->irq_data, IRQD_LEVEL);
585 }
586
587 ret = 0;
588 break;
589 default:
456 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 590 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
457 flags, irq, chip->irq_set_type); 591 flags, irq, chip->irq_set_type);
458 else {
459 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
460 flags |= IRQ_LEVEL;
461 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
462 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
463 desc->status |= flags;
464
465 if (chip != desc->irq_data.chip)
466 irq_chip_set_defaults(desc->irq_data.chip);
467 } 592 }
468 593 if (unmask)
594 unmask_irq(desc);
469 return ret; 595 return ret;
470} 596}
471 597
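
To illustrate the contract the switch above establishes, a hedged sketch of a chip's ->irq_set_type() callback; my_chip_set_type() and the register write are hypothetical. Returning IRQ_SET_MASK_OK tells the core to copy the new type into irq_data, IRQ_SET_MASK_OK_NOCOPY says the callback already did.

#include <linux/irq.h>

static int my_chip_set_type(struct irq_data *d, unsigned int type)
{
	switch (type) {
	case IRQ_TYPE_EDGE_RISING:
	case IRQ_TYPE_EDGE_FALLING:
	case IRQ_TYPE_LEVEL_HIGH:
		/* program the trigger-mode register for d->irq here */
		break;
	default:
		return -EINVAL;		/* core prints the failure warning */
	}

	/* let __irq_set_trigger() update irq_data and the level settings */
	return IRQ_SET_MASK_OK;
}
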
@@ -509,8 +635,11 @@ static int irq_wait_for_interrupt(struct irqaction *action)
509 * handler finished. unmask if the interrupt has not been disabled and 635 * handler finished. unmask if the interrupt has not been disabled and
510 * is marked MASKED. 636 * is marked MASKED.
511 */ 637 */
512static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 638static void irq_finalize_oneshot(struct irq_desc *desc,
639 struct irqaction *action, bool force)
513{ 640{
641 if (!(desc->istate & IRQS_ONESHOT))
642 return;
514again: 643again:
515 chip_bus_lock(desc); 644 chip_bus_lock(desc);
516 raw_spin_lock_irq(&desc->lock); 645 raw_spin_lock_irq(&desc->lock);
@@ -522,26 +651,42 @@ again:
522 * The thread is faster done than the hard interrupt handler 651 * The thread is faster done than the hard interrupt handler
523 * on the other CPU. If we unmask the irq line then the 652 * on the other CPU. If we unmask the irq line then the
524 * interrupt can come in again and masks the line, leaves due 653 * interrupt can come in again and masks the line, leaves due
525 * to IRQ_INPROGRESS and the irq line is masked forever. 654 * to IRQS_INPROGRESS and the irq line is masked forever.
655 *
656 * This also serializes the state of shared oneshot handlers
657 * versus "desc->threads_oneshot |= action->thread_mask;" in
658 * irq_wake_thread(). See the comment there which explains the
659 * serialization.
526 */ 660 */
527 if (unlikely(desc->status & IRQ_INPROGRESS)) { 661 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
528 raw_spin_unlock_irq(&desc->lock); 662 raw_spin_unlock_irq(&desc->lock);
529 chip_bus_sync_unlock(desc); 663 chip_bus_sync_unlock(desc);
530 cpu_relax(); 664 cpu_relax();
531 goto again; 665 goto again;
532 } 666 }
533 667
534 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 668 /*
535 desc->status &= ~IRQ_MASKED; 669 * Now check again, whether the thread should run. Otherwise
536 desc->irq_data.chip->irq_unmask(&desc->irq_data); 670 * we would clear the threads_oneshot bit of this thread which
537 } 671 * was just set.
672 */
673 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
674 goto out_unlock;
675
676 desc->threads_oneshot &= ~action->thread_mask;
677
678 if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
679 irqd_irq_masked(&desc->irq_data))
680 unmask_irq(desc);
681
682out_unlock:
538 raw_spin_unlock_irq(&desc->lock); 683 raw_spin_unlock_irq(&desc->lock);
539 chip_bus_sync_unlock(desc); 684 chip_bus_sync_unlock(desc);
540} 685}
541 686
542#ifdef CONFIG_SMP 687#ifdef CONFIG_SMP
543/* 688/*
544 * Check whether we need to change the affinity of the interrupt thread. 689
545 */ 690 */
546static void 691static void
547irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 692irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -573,6 +718,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
573#endif 718#endif
574 719
575/* 720/*
721 * Interrupts which are not explicitly requested as threaded
722 * interrupts rely on the implicit bh/preempt disable of the hard irq
723 * context. So we need to disable bh here to avoid deadlocks and other
724 * side effects.
725 */
726static void
727irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
728{
729 local_bh_disable();
730 action->thread_fn(action->irq, action->dev_id);
731 irq_finalize_oneshot(desc, action, false);
732 local_bh_enable();
733}
734
735/*
736 * Interrupts explicitely requested as threaded interupts want to be
737 * preemtible - many of them need to sleep and wait for slow busses to
738 * complete.
739 */
740static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
741{
742 action->thread_fn(action->irq, action->dev_id);
743 irq_finalize_oneshot(desc, action, false);
744}
745
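
The two thread functions above back the usual request_threaded_irq() pattern; a hedged consumer-side sketch follows, where struct my_dev, my_irq_pending() and the flag choice are assumptions:

#include <linux/interrupt.h>

struct my_dev {
	void __iomem *regs;		/* placeholder for real hardware state */
};

static bool my_irq_pending(struct my_dev *md)
{
	return true;			/* stand-in for a status register read */
}

static irqreturn_t my_hardirq(int irq, void *dev_id)
{
	struct my_dev *md = dev_id;

	if (!my_irq_pending(md))
		return IRQ_NONE;	/* not ours on a shared line */
	return IRQ_WAKE_THREAD;		/* run my_thread_fn() in process context */
}

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	/* may sleep here, e.g. talk to the device over I2C or SPI */
	return IRQ_HANDLED;
}

static int my_request(struct my_dev *md, unsigned int irq)
{
	/* IRQF_ONESHOT keeps the line masked until my_thread_fn() returns */
	return request_threaded_irq(irq, my_hardirq, my_thread_fn,
				    IRQF_ONESHOT, "my-dev", md);
}
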
746/*
576 * Interrupt handler thread 747 * Interrupt handler thread
577 */ 748 */
578static int irq_thread(void *data) 749static int irq_thread(void *data)
@@ -582,7 +753,14 @@ static int irq_thread(void *data)
582 }; 753 };
583 struct irqaction *action = data; 754 struct irqaction *action = data;
584 struct irq_desc *desc = irq_to_desc(action->irq); 755 struct irq_desc *desc = irq_to_desc(action->irq);
585 int wake, oneshot = desc->status & IRQ_ONESHOT; 756 void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
757 int wake;
758
759 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
760 &action->thread_flags))
761 handler_fn = irq_forced_thread_fn;
762 else
763 handler_fn = irq_thread_fn;
586 764
587 sched_setscheduler(current, SCHED_FIFO, &param); 765 sched_setscheduler(current, SCHED_FIFO, &param);
588 current->irqaction = action; 766 current->irqaction = action;
@@ -594,23 +772,19 @@ static int irq_thread(void *data)
594 atomic_inc(&desc->threads_active); 772 atomic_inc(&desc->threads_active);
595 773
596 raw_spin_lock_irq(&desc->lock); 774 raw_spin_lock_irq(&desc->lock);
597 if (unlikely(desc->status & IRQ_DISABLED)) { 775 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
598 /* 776 /*
599 * CHECKME: We might need a dedicated 777 * CHECKME: We might need a dedicated
600 * IRQ_THREAD_PENDING flag here, which 778 * IRQ_THREAD_PENDING flag here, which
601 * retriggers the thread in check_irq_resend() 779 * retriggers the thread in check_irq_resend()
602 * but AFAICT IRQ_PENDING should be fine as it 780 * but AFAICT IRQS_PENDING should be fine as it
603 * retriggers the interrupt itself --- tglx 781 * retriggers the interrupt itself --- tglx
604 */ 782 */
605 desc->status |= IRQ_PENDING; 783 desc->istate |= IRQS_PENDING;
606 raw_spin_unlock_irq(&desc->lock); 784 raw_spin_unlock_irq(&desc->lock);
607 } else { 785 } else {
608 raw_spin_unlock_irq(&desc->lock); 786 raw_spin_unlock_irq(&desc->lock);
609 787 handler_fn(desc, action);
610 action->thread_fn(action->irq, action->dev_id);
611
612 if (oneshot)
613 irq_finalize_oneshot(action->irq, desc);
614 } 788 }
615 789
616 wake = atomic_dec_and_test(&desc->threads_active); 790 wake = atomic_dec_and_test(&desc->threads_active);
@@ -619,6 +793,9 @@ static int irq_thread(void *data)
619 wake_up(&desc->wait_for_threads); 793 wake_up(&desc->wait_for_threads);
620 } 794 }
621 795
796 /* Prevent a stale desc->threads_oneshot */
797 irq_finalize_oneshot(desc, action, true);
798
622 /* 799 /*
623 * Clear irqaction. Otherwise exit_irq_thread() would make 800 * Clear irqaction. Otherwise exit_irq_thread() would make
624 * fuzz about an active irq thread going into nirvana. 801 * fuzz about an active irq thread going into nirvana.
@@ -633,6 +810,7 @@ static int irq_thread(void *data)
633void exit_irq_thread(void) 810void exit_irq_thread(void)
634{ 811{
635 struct task_struct *tsk = current; 812 struct task_struct *tsk = current;
813 struct irq_desc *desc;
636 814
637 if (!tsk->irqaction) 815 if (!tsk->irqaction)
638 return; 816 return;
@@ -641,6 +819,14 @@ void exit_irq_thread(void)
641 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 819 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
642 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 820 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
643 821
822 desc = irq_to_desc(tsk->irqaction->irq);
823
824 /*
825 * Prevent a stale desc->threads_oneshot. Must be called
826 * before setting the IRQTF_DIED flag.
827 */
828 irq_finalize_oneshot(desc, tsk->irqaction, true);
829
644 /* 830 /*
645 * Set the THREAD DIED flag to prevent further wakeups of the 831 * Set the THREAD DIED flag to prevent further wakeups of the
646 * soon to be gone threaded handler. 832 * soon to be gone threaded handler.
@@ -648,6 +834,22 @@ void exit_irq_thread(void)
648 set_bit(IRQTF_DIED, &tsk->irqaction->flags); 834 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
649} 835}
650 836
837static void irq_setup_forced_threading(struct irqaction *new)
838{
839 if (!force_irqthreads)
840 return;
841 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
842 return;
843
844 new->flags |= IRQF_ONESHOT;
845
846 if (!new->thread_fn) {
847 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
848 new->thread_fn = new->handler;
849 new->handler = irq_default_primary_handler;
850 }
851}
852
651/* 853/*
652 * Internal function to register an irqaction - typically used to 854 * Internal function to register an irqaction - typically used to
653 * allocate special interrupts that are part of the architecture. 855 * allocate special interrupts that are part of the architecture.
@@ -657,9 +859,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657{ 859{
658 struct irqaction *old, **old_ptr; 860 struct irqaction *old, **old_ptr;
659 const char *old_name = NULL; 861 const char *old_name = NULL;
660 unsigned long flags; 862 unsigned long flags, thread_mask = 0;
661 int nested, shared = 0; 863 int ret, nested, shared = 0;
662 int ret; 864 cpumask_var_t mask;
663 865
664 if (!desc) 866 if (!desc)
665 return -EINVAL; 867 return -EINVAL;
@@ -683,15 +885,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
683 rand_initialize_irq(irq); 885 rand_initialize_irq(irq);
684 } 886 }
685 887
686 /* Oneshot interrupts are not allowed with shared */
687 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
688 return -EINVAL;
689
690 /* 888 /*
691 * Check whether the interrupt nests into another interrupt 889 * Check whether the interrupt nests into another interrupt
692 * thread. 890 * thread.
693 */ 891 */
694 nested = desc->status & IRQ_NESTED_THREAD; 892 nested = irq_settings_is_nested_thread(desc);
695 if (nested) { 893 if (nested) {
696 if (!new->thread_fn) 894 if (!new->thread_fn)
697 return -EINVAL; 895 return -EINVAL;
@@ -701,6 +899,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
701 * dummy function which warns when called. 899 * dummy function which warns when called.
702 */ 900 */
703 new->handler = irq_nested_primary_handler; 901 new->handler = irq_nested_primary_handler;
902 } else {
903 irq_setup_forced_threading(new);
704 } 904 }
705 905
706 /* 906 /*
@@ -724,6 +924,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
724 new->thread = t; 924 new->thread = t;
725 } 925 }
726 926
927 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
928 ret = -ENOMEM;
929 goto out_thread;
930 }
931
727 /* 932 /*
728 * The following block of code has to be executed atomically 933 * The following block of code has to be executed atomically
729 */ 934 */
@@ -735,32 +940,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 * Can't share interrupts unless both agree to and are 940 * Can't share interrupts unless both agree to and are
736 * the same type (level, edge, polarity). So both flag 941 * the same type (level, edge, polarity). So both flag
737 * fields must have IRQF_SHARED set and the bits which 942 * fields must have IRQF_SHARED set and the bits which
738 * set the trigger type must match. 943 * set the trigger type must match. Also all must
944 * agree on ONESHOT.
739 */ 945 */
740 if (!((old->flags & new->flags) & IRQF_SHARED) || 946 if (!((old->flags & new->flags) & IRQF_SHARED) ||
741 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { 947 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
948 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
742 old_name = old->name; 949 old_name = old->name;
743 goto mismatch; 950 goto mismatch;
744 } 951 }
745 952
746#if defined(CONFIG_IRQ_PER_CPU)
747 /* All handlers must agree on per-cpuness */ 953 /* All handlers must agree on per-cpuness */
748 if ((old->flags & IRQF_PERCPU) != 954 if ((old->flags & IRQF_PERCPU) !=
749 (new->flags & IRQF_PERCPU)) 955 (new->flags & IRQF_PERCPU))
750 goto mismatch; 956 goto mismatch;
751#endif
752 957
753 /* add new interrupt at end of irq queue */ 958 /* add new interrupt at end of irq queue */
754 do { 959 do {
960 thread_mask |= old->thread_mask;
755 old_ptr = &old->next; 961 old_ptr = &old->next;
756 old = *old_ptr; 962 old = *old_ptr;
757 } while (old); 963 } while (old);
758 shared = 1; 964 shared = 1;
759 } 965 }
760 966
761 if (!shared) { 967 /*
762 irq_chip_set_defaults(desc->irq_data.chip); 968 * Setup the thread mask for this irqaction. Unlikely to have
969 * 32 resp 64 irqs sharing one line, but who knows.
970 */
971 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
972 ret = -EBUSY;
973 goto out_mask;
974 }
975 new->thread_mask = 1 << ffz(thread_mask);
763 976
977 if (!shared) {
764 init_waitqueue_head(&desc->wait_for_threads); 978 init_waitqueue_head(&desc->wait_for_threads);
765 979
766 /* Setup the type (level, edge polarity) if configured: */ 980 /* Setup the type (level, edge polarity) if configured: */
@@ -769,42 +983,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
769 new->flags & IRQF_TRIGGER_MASK); 983 new->flags & IRQF_TRIGGER_MASK);
770 984
771 if (ret) 985 if (ret)
772 goto out_thread; 986 goto out_mask;
773 } else 987 }
774 compat_irq_chip_set_default_handler(desc); 988
775#if defined(CONFIG_IRQ_PER_CPU) 989 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
776 if (new->flags & IRQF_PERCPU) 990 IRQS_ONESHOT | IRQS_WAITING);
777 desc->status |= IRQ_PER_CPU; 991 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
778#endif
779 992
780 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | 993 if (new->flags & IRQF_PERCPU) {
781 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 994 irqd_set(&desc->irq_data, IRQD_PER_CPU);
995 irq_settings_set_per_cpu(desc);
996 }
782 997
783 if (new->flags & IRQF_ONESHOT) 998 if (new->flags & IRQF_ONESHOT)
784 desc->status |= IRQ_ONESHOT; 999 desc->istate |= IRQS_ONESHOT;
785 1000
786 if (!(desc->status & IRQ_NOAUTOEN)) { 1001 if (irq_settings_can_autoenable(desc))
787 desc->depth = 0; 1002 irq_startup(desc);
788 desc->status &= ~IRQ_DISABLED; 1003 else
789 desc->irq_data.chip->irq_startup(&desc->irq_data);
790 } else
791 /* Undo nested disables: */ 1004 /* Undo nested disables: */
792 desc->depth = 1; 1005 desc->depth = 1;
793 1006
794 /* Exclude IRQ from balancing if requested */ 1007 /* Exclude IRQ from balancing if requested */
795 if (new->flags & IRQF_NOBALANCING) 1008 if (new->flags & IRQF_NOBALANCING) {
796 desc->status |= IRQ_NO_BALANCING; 1009 irq_settings_set_no_balancing(desc);
1010 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1011 }
797 1012
798 /* Set default affinity mask once everything is setup */ 1013 /* Set default affinity mask once everything is setup */
799 setup_affinity(irq, desc); 1014 setup_affinity(irq, desc, mask);
800 1015
801 } else if ((new->flags & IRQF_TRIGGER_MASK) 1016 } else if (new->flags & IRQF_TRIGGER_MASK) {
802 && (new->flags & IRQF_TRIGGER_MASK) 1017 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
803 != (desc->status & IRQ_TYPE_SENSE_MASK)) { 1018 unsigned int omsk = irq_settings_get_trigger_mask(desc);
804 /* hope the handler works with the actual trigger mode... */ 1019
805 pr_warning("IRQ %d uses trigger mode %d; requested %d\n", 1020 if (nmsk != omsk)
806 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), 1021 /* hope the handler works with current trigger mode */
807 (int)(new->flags & IRQF_TRIGGER_MASK)); 1022 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1023 irq, nmsk, omsk);
808 } 1024 }
809 1025
810 new->irq = irq; 1026 new->irq = irq;
@@ -818,8 +1034,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
818 * Check whether we disabled the irq via the spurious handler 1034 * Check whether we disabled the irq via the spurious handler
819 * before. Reenable it and give it another chance. 1035 * before. Reenable it and give it another chance.
820 */ 1036 */
821 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 1037 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
822 desc->status &= ~IRQ_SPURIOUS_DISABLED; 1038 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
823 __enable_irq(desc, irq, false); 1039 __enable_irq(desc, irq, false);
824 } 1040 }
825 1041
@@ -835,6 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
835 register_irq_proc(irq, desc); 1051 register_irq_proc(irq, desc);
836 new->dir = NULL; 1052 new->dir = NULL;
837 register_handler_proc(irq, new); 1053 register_handler_proc(irq, new);
1054 free_cpumask_var(mask);
838 1055
839 return 0; 1056 return 0;
840 1057
@@ -849,8 +1066,11 @@ mismatch:
849#endif 1066#endif
850 ret = -EBUSY; 1067 ret = -EBUSY;
851 1068
852out_thread: 1069out_mask:
853 raw_spin_unlock_irqrestore(&desc->lock, flags); 1070 raw_spin_unlock_irqrestore(&desc->lock, flags);
1071 free_cpumask_var(mask);
1072
1073out_thread:
854 if (new->thread) { 1074 if (new->thread) {
855 struct task_struct *t = new->thread; 1075 struct task_struct *t = new->thread;
856 1076
@@ -871,9 +1091,14 @@ out_thread:
871 */ 1091 */
872int setup_irq(unsigned int irq, struct irqaction *act) 1092int setup_irq(unsigned int irq, struct irqaction *act)
873{ 1093{
1094 int retval;
874 struct irq_desc *desc = irq_to_desc(irq); 1095 struct irq_desc *desc = irq_to_desc(irq);
875 1096
876 return __setup_irq(irq, desc, act); 1097 chip_bus_lock(desc);
1098 retval = __setup_irq(irq, desc, act);
1099 chip_bus_sync_unlock(desc);
1100
1101 return retval;
877} 1102}
878EXPORT_SYMBOL_GPL(setup_irq); 1103EXPORT_SYMBOL_GPL(setup_irq);
879 1104
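
Before the teardown paths below, a small worked sketch of the thread_mask bookkeeping that __setup_irq() introduces above: every shared oneshot action gets the lowest clear bit of the OR of the masks already on the line. alloc_thread_bit() is an editor's illustration under that assumption, not kernel code.

#include <linux/bitops.h>

/* Mimics the ffz()-based allocation in __setup_irq(): 'line_mask' is the
 * OR of the thread_mask values of all actions already on the line. */
static unsigned long alloc_thread_bit(unsigned long *line_mask)
{
	unsigned long bit;

	if (*line_mask == ~0UL)
		return 0;		/* line full; __setup_irq() returns -EBUSY */

	bit = 1UL << ffz(*line_mask);	/* lowest clear bit: 1, 2, 4, ... */
	*line_mask |= bit;
	return bit;
}
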
@@ -924,13 +1149,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
924#endif 1149#endif
925 1150
926 /* If this was the last handler, shut down the IRQ line: */ 1151 /* If this was the last handler, shut down the IRQ line: */
927 if (!desc->action) { 1152 if (!desc->action)
928 desc->status |= IRQ_DISABLED; 1153 irq_shutdown(desc);
929 if (desc->irq_data.chip->irq_shutdown)
930 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
931 else
932 desc->irq_data.chip->irq_disable(&desc->irq_data);
933 }
934 1154
935#ifdef CONFIG_SMP 1155#ifdef CONFIG_SMP
936 /* make sure affinity_hint is cleaned up */ 1156 /* make sure affinity_hint is cleaned up */
@@ -1004,6 +1224,11 @@ void free_irq(unsigned int irq, void *dev_id)
1004 if (!desc) 1224 if (!desc)
1005 return; 1225 return;
1006 1226
1227#ifdef CONFIG_SMP
1228 if (WARN_ON(desc->affinity_notify))
1229 desc->affinity_notify = NULL;
1230#endif
1231
1007 chip_bus_lock(desc); 1232 chip_bus_lock(desc);
1008 kfree(__free_irq(irq, dev_id)); 1233 kfree(__free_irq(irq, dev_id));
1009 chip_bus_sync_unlock(desc); 1234 chip_bus_sync_unlock(desc);
@@ -1074,7 +1299,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1074 if (!desc) 1299 if (!desc)
1075 return -EINVAL; 1300 return -EINVAL;
1076 1301
1077 if (desc->status & IRQ_NOREQUEST) 1302 if (!irq_settings_can_request(desc))
1078 return -EINVAL; 1303 return -EINVAL;
1079 1304
1080 if (!handler) { 1305 if (!handler) {
@@ -1100,7 +1325,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1100 if (retval) 1325 if (retval)
1101 kfree(action); 1326 kfree(action);
1102 1327
1103#ifdef CONFIG_DEBUG_SHIRQ 1328#ifdef CONFIG_DEBUG_SHIRQ_FIXME
1104 if (!retval && (irqflags & IRQF_SHARED)) { 1329 if (!retval && (irqflags & IRQF_SHARED)) {
1105 /* 1330 /*
1106 * It's a shared IRQ -- the driver ought to be prepared for it 1331 * It's a shared IRQ -- the driver ought to be prepared for it
@@ -1149,7 +1374,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1149 if (!desc) 1374 if (!desc)
1150 return -EINVAL; 1375 return -EINVAL;
1151 1376
1152 if (desc->status & IRQ_NESTED_THREAD) { 1377 if (irq_settings_is_nested_thread(desc)) {
1153 ret = request_threaded_irq(irq, NULL, handler, 1378 ret = request_threaded_irq(irq, NULL, handler,
1154 flags, name, dev_id); 1379 flags, name, dev_id);
1155 return !ret ? IRQC_IS_NESTED : ret; 1380 return !ret ? IRQC_IS_NESTED : ret;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 441fd629ff04..47420908fba0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,23 +4,23 @@
4 4
5#include "internals.h" 5#include "internals.h"
6 6
7void move_masked_irq(int irq) 7void irq_move_masked_irq(struct irq_data *idata)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_data_to_desc(idata);
10 struct irq_chip *chip = desc->irq_data.chip; 10 struct irq_chip *chip = idata->chip;
11 11
12 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
13 return; 13 return;
14 14
15 /* 15 /*
16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. 16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
17 */ 17 */
18 if (CHECK_IRQ_PER_CPU(desc->status)) { 18 if (!irqd_can_balance(&desc->irq_data)) {
19 WARN_ON(1); 19 WARN_ON(1);
20 return; 20 return;
21 } 21 }
22 22
23 desc->status &= ~IRQ_MOVE_PENDING; 23 irqd_clr_move_pending(&desc->irq_data);
24 24
25 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
26 return; 26 return;
@@ -35,7 +35,7 @@ void move_masked_irq(int irq)
35 * do the disable, re-program, enable sequence. 35 * do the disable, re-program, enable sequence.
36 * This is *not* particularly important for level triggered 36 * This is *not* particularly important for level triggered
37 * but in an edge trigger case, we might be setting rte 37 * but in an edge trigger case, we might be setting rte
38 * when an active trigger is comming in. This could 38 * when an active trigger is coming in. This could
39 * cause some ioapics to mal-function. 39 * cause some ioapics to mal-function.
40 * Being paranoid I guess! 40 * Being paranoid I guess!
41 * 41 *
@@ -53,15 +53,14 @@ void move_masked_irq(int irq)
53 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
54} 54}
55 55
56void move_native_irq(int irq) 56void irq_move_irq(struct irq_data *idata)
57{ 57{
58 struct irq_desc *desc = irq_to_desc(irq);
59 bool masked; 58 bool masked;
60 59
61 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 60 if (likely(!irqd_is_setaffinity_pending(idata)))
62 return; 61 return;
63 62
64 if (unlikely(desc->status & IRQ_DISABLED)) 63 if (unlikely(irqd_irq_disabled(idata)))
65 return; 64 return;
66 65
67 /* 66 /*
@@ -69,10 +68,10 @@ void move_native_irq(int irq)
69 * threaded interrupt with ONESHOT set, we can end up with an 68 * threaded interrupt with ONESHOT set, we can end up with an
70 * interrupt storm. 69 * interrupt storm.
71 */ 70 */
72 masked = desc->status & IRQ_MASKED; 71 masked = irqd_irq_masked(idata);
73 if (!masked) 72 if (!masked)
74 desc->irq_data.chip->irq_mask(&desc->irq_data); 73 idata->chip->irq_mask(idata);
75 move_masked_irq(irq); 74 irq_move_masked_irq(idata);
76 if (!masked) 75 if (!masked)
77 desc->irq_data.chip->irq_unmask(&desc->irq_data); 76 idata->chip->irq_unmask(idata);
78} 77}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b03..f76fc00c9877 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
18 * During system-wide suspend or hibernation device drivers need to be prevented 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * from receiving interrupts and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It marks all interrupt lines in use, except for the timer ones, as disabled 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * and sets the IRQ_SUSPENDED flag for each of them. 21 * and sets the IRQS_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED) 37 if (desc->istate & IRQS_SUSPENDED)
38 synchronize_irq(irq); 38 synchronize_irq(irq);
39} 39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 40EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() 43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 * 44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that 45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set. 46 * have the IRQS_SUSPENDED flag set.
47 */ 47 */
48void resume_device_irqs(void) 48void resume_device_irqs(void)
49{ 49{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
53 for_each_irq_desc(irq, desc) { 53 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 54 unsigned long flags;
55 55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
61 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
71 struct irq_desc *desc; 68 struct irq_desc *desc;
72 int irq; 69 int irq;
73 70
74 for_each_irq_desc(irq, desc) 71 for_each_irq_desc(irq, desc) {
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) 72 if (irqd_is_wakeup_set(&desc->irq_data)) {
76 return -EBUSY; 73 if (desc->istate & IRQS_PENDING)
74 return -EBUSY;
75 continue;
76 }
77 /*
78 * Check the non wakeup interrupts whether they need
79 * to be masked before finally going into suspend
80 * state. That's for hardware which has no wakeup
81 * source configuration facility. The chip
82 * implementation indicates that with
83 * IRQCHIP_MASK_ON_SUSPEND.
84 */
85 if (desc->istate & IRQS_SUSPENDED &&
86 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
87 mask_irq(desc);
88 }
77 89
78 return 0; 90 return 0;
79} 91}
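
A sketch of the chip side that check_wakeup_irqs() above caters for: a controller with no wakeup source configuration of its own announces IRQCHIP_MASK_ON_SUSPEND so that its non-wakeup interrupts get masked before entering suspend. The my_chip_* callbacks and the register comments are assumptions.

#include <linux/irq.h>

static void my_chip_mask(struct irq_data *d)
{
	/* set the mask bit for d->irq in the controller */
}

static void my_chip_unmask(struct irq_data *d)
{
	/* clear the mask bit for d->irq in the controller */
}

/* No .irq_set_wake: the hardware cannot select wakeup sources, so the
 * core masks its non-wakeup interrupts in check_wakeup_irqs() instead. */
static struct irq_chip my_irq_chip = {
	.name		= "my-chip",
	.irq_mask	= my_chip_mask,
	.irq_unmask	= my_chip_unmask,
	.flags		= IRQCHIP_MASK_ON_SUSPEND,
};
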
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c8a2a9f8a7b..834899f2500f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
24 const struct cpumask *mask = desc->irq_data.affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
25 26
26#ifdef CONFIG_GENERIC_PENDING_IRQ 27#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
28 mask = desc->pending_mask; 29 mask = desc->pending_mask;
29#endif 30#endif
30 seq_cpumask(m, mask); 31 seq_cpumask(m, mask);
@@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 66 cpumask_var_t new_value;
66 int err; 67 int err;
67 68
68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || 69 if (!irq_can_set_affinity(irq) || no_irq_affinity)
69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
89 if (!cpumask_intersects(new_value, cpu_online_mask)) { 89 if (!cpumask_intersects(new_value, cpu_online_mask)) {
90 /* Special case for empty set - allow the architecture 90 /* Special case for empty set - allow the architecture
91 code to set default SMP affinity. */ 91 code to set default SMP affinity. */
92 err = irq_select_affinity_usr(irq) ? -EINVAL : count; 92 err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
93 } else { 93 } else {
94 irq_set_affinity(irq, new_value); 94 irq_set_affinity(irq, new_value);
95 err = count; 95 err = count;
@@ -357,3 +357,83 @@ void init_irq_proc(void)
357 } 357 }
358} 358}
359 359
360#ifdef CONFIG_GENERIC_IRQ_SHOW
361
362int __weak arch_show_interrupts(struct seq_file *p, int prec)
363{
364 return 0;
365}
366
367#ifndef ACTUAL_NR_IRQS
368# define ACTUAL_NR_IRQS nr_irqs
369#endif
370
371int show_interrupts(struct seq_file *p, void *v)
372{
373 static int prec;
374
375 unsigned long flags, any_count = 0;
376 int i = *(loff_t *) v, j;
377 struct irqaction *action;
378 struct irq_desc *desc;
379
380 if (i > ACTUAL_NR_IRQS)
381 return 0;
382
383 if (i == ACTUAL_NR_IRQS)
384 return arch_show_interrupts(p, prec);
385
386 /* print header and calculate the width of the first column */
387 if (i == 0) {
388 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
389 j *= 10;
390
391 seq_printf(p, "%*s", prec + 8, "");
392 for_each_online_cpu(j)
393 seq_printf(p, "CPU%-8d", j);
394 seq_putc(p, '\n');
395 }
396
397 desc = irq_to_desc(i);
398 if (!desc)
399 return 0;
400
401 raw_spin_lock_irqsave(&desc->lock, flags);
402 for_each_online_cpu(j)
403 any_count |= kstat_irqs_cpu(i, j);
404 action = desc->action;
405 if (!action && !any_count)
406 goto out;
407
408 seq_printf(p, "%*d: ", prec, i);
409 for_each_online_cpu(j)
410 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
411
412 if (desc->irq_data.chip) {
413 if (desc->irq_data.chip->irq_print_chip)
414 desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
415 else if (desc->irq_data.chip->name)
416 seq_printf(p, " %8s", desc->irq_data.chip->name);
417 else
418 seq_printf(p, " %8s", "-");
419 } else {
420 seq_printf(p, " %8s", "None");
421 }
422#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
423 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
424#endif
425 if (desc->name)
426 seq_printf(p, "-%-8s", desc->name);
427
428 if (action) {
429 seq_printf(p, " %s", action->name);
430 while ((action = action->next) != NULL)
431 seq_printf(p, ", %s", action->name);
432 }
433
434 seq_putc(p, '\n');
435out:
436 raw_spin_unlock_irqrestore(&desc->lock, flags);
437 return 0;
438}
439#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 891115a929aa..14dd5761e8c9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -23,7 +23,7 @@
23#ifdef CONFIG_HARDIRQS_SW_RESEND 23#ifdef CONFIG_HARDIRQS_SW_RESEND
24 24
25/* Bitmap to handle software resend of interrupts: */ 25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS); 26static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
27 27
28/* 28/*
29 * Run software resends of IRQ's 29 * Run software resends of IRQ's
@@ -55,20 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64
65 /* 58 /*
66 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
67 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
68 * active. 61 * active.
69 */ 62 */
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 63 if (irq_settings_is_level(desc))
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) {
68 desc->istate &= ~IRQS_PENDING;
69 desc->istate |= IRQS_REPLAY;
72 70
73 if (!desc->irq_data.chip->irq_retrigger || 71 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 72 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 000000000000..0d91730b6330
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,125 @@
1/*
2 * Internal header to deal with irq_desc->status which will be renamed
3 * to irq_desc->settings.
4 */
5enum {
6 _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
7 _IRQ_PER_CPU = IRQ_PER_CPU,
8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
14 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
15 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
16};
17
18#define IRQ_PER_CPU GOT_YOU_MORON
19#define IRQ_NO_BALANCING GOT_YOU_MORON
20#define IRQ_LEVEL GOT_YOU_MORON
21#define IRQ_NOPROBE GOT_YOU_MORON
22#define IRQ_NOREQUEST GOT_YOU_MORON
23#define IRQ_NOAUTOEN GOT_YOU_MORON
24#define IRQ_NESTED_THREAD GOT_YOU_MORON
25#undef IRQF_MODIFY_MASK
26#define IRQF_MODIFY_MASK GOT_YOU_MORON
27
28static inline void
29irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
30{
31 desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
32 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
33}
34
35static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
36{
37 return desc->status_use_accessors & _IRQ_PER_CPU;
38}
39
40static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
41{
42 desc->status_use_accessors |= _IRQ_PER_CPU;
43}
44
45static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
46{
47 desc->status_use_accessors |= _IRQ_NO_BALANCING;
48}
49
50static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
51{
52 return desc->status_use_accessors & _IRQ_NO_BALANCING;
53}
54
55static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
56{
57 return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
58}
59
60static inline void
61irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
62{
63 desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK;
64 desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK;
65}
66
67static inline bool irq_settings_is_level(struct irq_desc *desc)
68{
69 return desc->status_use_accessors & _IRQ_LEVEL;
70}
71
72static inline void irq_settings_clr_level(struct irq_desc *desc)
73{
74 desc->status_use_accessors &= ~_IRQ_LEVEL;
75}
76
77static inline void irq_settings_set_level(struct irq_desc *desc)
78{
79 desc->status_use_accessors |= _IRQ_LEVEL;
80}
81
82static inline bool irq_settings_can_request(struct irq_desc *desc)
83{
84 return !(desc->status_use_accessors & _IRQ_NOREQUEST);
85}
86
87static inline void irq_settings_clr_norequest(struct irq_desc *desc)
88{
89 desc->status_use_accessors &= ~_IRQ_NOREQUEST;
90}
91
92static inline void irq_settings_set_norequest(struct irq_desc *desc)
93{
94 desc->status_use_accessors |= _IRQ_NOREQUEST;
95}
96
97static inline bool irq_settings_can_probe(struct irq_desc *desc)
98{
99 return !(desc->status_use_accessors & _IRQ_NOPROBE);
100}
101
102static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
103{
104 desc->status_use_accessors &= ~_IRQ_NOPROBE;
105}
106
107static inline void irq_settings_set_noprobe(struct irq_desc *desc)
108{
109 desc->status_use_accessors |= _IRQ_NOPROBE;
110}
111
112static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
113{
114 return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
115}
116
117static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
118{
119 return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
120}
121
122static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
123{
124 return desc->status_use_accessors & _IRQ_NESTED_THREAD;
125}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3089d3b9d5f3..dfbd550401b2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,70 +21,93 @@ static int irqfixup __read_mostly;
21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
22static void poll_spurious_irqs(unsigned long dummy); 22static void poll_spurious_irqs(unsigned long dummy);
23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); 23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
24static int irq_poll_cpu;
25static atomic_t irq_poll_active;
26
27/*
28 * We wait here for a poller to finish.
29 *
30 * If the poll runs on this CPU, then we yell loudly and return
31 * false. That will leave the interrupt line disabled in the worst
32 * case, but it should never happen.
33 *
34 * We wait until the poller is done and then recheck disabled and
35 * action (about to be disabled). Only if it's still active, we return
36 * true and let the handler run.
37 */
38bool irq_wait_for_poll(struct irq_desc *desc)
39{
40 if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
41 "irq poll in progress on cpu %d for irq %d\n",
42 smp_processor_id(), desc->irq_data.irq))
43 return false;
44
45#ifdef CONFIG_SMP
46 do {
47 raw_spin_unlock(&desc->lock);
48 while (irqd_irq_inprogress(&desc->irq_data))
49 cpu_relax();
50 raw_spin_lock(&desc->lock);
51 } while (irqd_irq_inprogress(&desc->irq_data));
52 /* Might have been disabled in meantime */
53 return !irqd_irq_disabled(&desc->irq_data) && desc->action;
54#else
55 return false;
56#endif
57}
58
24 59
25/* 60/*
26 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
27 */ 62 */
28static int try_one_irq(int irq, struct irq_desc *desc) 63static int try_one_irq(int irq, struct irq_desc *desc, bool force)
29{ 64{
65 irqreturn_t ret = IRQ_NONE;
30 struct irqaction *action; 66 struct irqaction *action;
31 int ok = 0, work = 0;
32 67
33 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
34 /* Already running on another processor */
35 if (desc->status & IRQ_INPROGRESS) {
36 /*
37 * Already running: If it is shared get the other
38 * CPU to go looking for our mystery interrupt too
39 */
40 if (desc->action && (desc->action->flags & IRQF_SHARED))
41 desc->status |= IRQ_PENDING;
42 raw_spin_unlock(&desc->lock);
43 return ok;
44 }
45 /* Honour the normal IRQ locking */
46 desc->status |= IRQ_INPROGRESS;
47 action = desc->action;
48 raw_spin_unlock(&desc->lock);
49 69
50 while (action) { 70 /* PER_CPU and nested thread interrupts are never polled */
51 /* Only shared IRQ handlers are safe to call */ 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
52 if (action->flags & IRQF_SHARED) { 72 goto out;
53 if (action->handler(irq, action->dev_id) ==
54 IRQ_HANDLED)
55 ok = 1;
56 }
57 action = action->next;
58 }
59 local_irq_disable();
60 /* Now clean up the flags */
61 raw_spin_lock(&desc->lock);
62 action = desc->action;
63 73
64 /* 74 /*
65 * While we were looking for a fixup someone queued a real 75 * Do not poll disabled interrupts unless the spurious
66 * IRQ clashing with our walk: 76 * disabled poller asks explicitly.
67 */ 77 */
68 while ((desc->status & IRQ_PENDING) && action) { 78 if (irqd_irq_disabled(&desc->irq_data) && !force)
79 goto out;
80
81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well.
84 */
85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next)
88 goto out;
89
90 /* Already running on another processor */
91 if (irqd_irq_inprogress(&desc->irq_data)) {
69 /* 92 /*
70 * Perform real IRQ processing for the IRQ we deferred 93 * Already running: If it is shared get the other
94 * CPU to go looking for our mystery interrupt too
71 */ 95 */
72 work = 1; 96 desc->istate |= IRQS_PENDING;
73 raw_spin_unlock(&desc->lock); 97 goto out;
74 handle_IRQ_event(irq, action);
75 raw_spin_lock(&desc->lock);
76 desc->status &= ~IRQ_PENDING;
77 } 98 }
78 desc->status &= ~IRQ_INPROGRESS;
79 /*
80 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too
82 */
83 if (work)
84 irq_end(irq, desc);
85 raw_spin_unlock(&desc->lock);
86 99
87 return ok; 100 /* Mark it poll in progress */
101 desc->istate |= IRQS_POLL_INPROGRESS;
102 do {
103 if (handle_irq_event(desc) == IRQ_HANDLED)
104 ret = IRQ_HANDLED;
105 action = desc->action;
106 } while ((desc->istate & IRQS_PENDING) && action);
107 desc->istate &= ~IRQS_POLL_INPROGRESS;
108out:
109 raw_spin_unlock(&desc->lock);
110 return ret == IRQ_HANDLED;
88} 111}
89 112
90static int misrouted_irq(int irq) 113static int misrouted_irq(int irq)
@@ -92,6 +115,11 @@ static int misrouted_irq(int irq)
92 struct irq_desc *desc; 115 struct irq_desc *desc;
93 int i, ok = 0; 116 int i, ok = 0;
94 117
118 if (atomic_inc_return(&irq_poll_active) == 1)
119 goto out;
120
121 irq_poll_cpu = smp_processor_id();
122
95 for_each_irq_desc(i, desc) { 123 for_each_irq_desc(i, desc) {
96 if (!i) 124 if (!i)
97 continue; 125 continue;
@@ -99,9 +127,11 @@ static int misrouted_irq(int irq)
99 if (i == irq) /* Already tried */ 127 if (i == irq) /* Already tried */
100 continue; 128 continue;
101 129
102 if (try_one_irq(i, desc)) 130 if (try_one_irq(i, desc, false))
103 ok = 1; 131 ok = 1;
104 } 132 }
133out:
134 atomic_dec(&irq_poll_active);
105 /* So the caller can adjust the irq error counts */ 135 /* So the caller can adjust the irq error counts */
106 return ok; 136 return ok;
107} 137}
@@ -111,23 +141,28 @@ static void poll_spurious_irqs(unsigned long dummy)
111 struct irq_desc *desc; 141 struct irq_desc *desc;
112 int i; 142 int i;
113 143
144 if (atomic_inc_return(&irq_poll_active) != 1)
145 goto out;
146 irq_poll_cpu = smp_processor_id();
147
114 for_each_irq_desc(i, desc) { 148 for_each_irq_desc(i, desc) {
115 unsigned int status; 149 unsigned int state;
116 150
117 if (!i) 151 if (!i)
118 continue; 152 continue;
119 153
120 /* Racy but it doesn't matter */ 154 /* Racy but it doesn't matter */
121 status = desc->status; 155 state = desc->istate;
122 barrier(); 156 barrier();
123 if (!(status & IRQ_SPURIOUS_DISABLED)) 157 if (!(state & IRQS_SPURIOUS_DISABLED))
124 continue; 158 continue;
125 159
126 local_irq_disable(); 160 local_irq_disable();
127 try_one_irq(i, desc); 161 try_one_irq(i, desc, true);
128 local_irq_enable(); 162 local_irq_enable();
129 } 163 }
130 164out:
165 atomic_dec(&irq_poll_active);
131 mod_timer(&poll_spurious_irq_timer, 166 mod_timer(&poll_spurious_irq_timer,
132 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
133} 168}
@@ -139,15 +174,13 @@ static void poll_spurious_irqs(unsigned long dummy)
139 * 174 *
140 * (The other 100-of-100,000 interrupts may have been a correctly 175 * (The other 100-of-100,000 interrupts may have been a correctly
141 * functioning device sharing an IRQ with the failing one) 176 * functioning device sharing an IRQ with the failing one)
142 *
143 * Called under desc->lock
144 */ 177 */
145
146static void 178static void
147__report_bad_irq(unsigned int irq, struct irq_desc *desc, 179__report_bad_irq(unsigned int irq, struct irq_desc *desc,
148 irqreturn_t action_ret) 180 irqreturn_t action_ret)
149{ 181{
150 struct irqaction *action; 182 struct irqaction *action;
183 unsigned long flags;
151 184
152 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 185 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
153 printk(KERN_ERR "irq event %d: bogus return value %x\n", 186 printk(KERN_ERR "irq event %d: bogus return value %x\n",
@@ -159,6 +192,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
159 dump_stack(); 192 dump_stack();
160 printk(KERN_ERR "handlers:\n"); 193 printk(KERN_ERR "handlers:\n");
161 194
195 /*
196 * We need to take desc->lock here. note_interrupt() is called
197 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
198 * with something else removing an action. It's ok to take
199 * desc->lock here. See synchronize_irq().
200 */
201 raw_spin_lock_irqsave(&desc->lock, flags);
162 action = desc->action; 202 action = desc->action;
163 while (action) { 203 while (action) {
164 printk(KERN_ERR "[<%p>]", action->handler); 204 printk(KERN_ERR "[<%p>]", action->handler);
@@ -167,6 +207,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
167 printk("\n"); 207 printk("\n");
168 action = action->next; 208 action = action->next;
169 } 209 }
210 raw_spin_unlock_irqrestore(&desc->lock, flags);
170} 211}
171 212
172static void 213static void
@@ -218,6 +259,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
218void note_interrupt(unsigned int irq, struct irq_desc *desc, 259void note_interrupt(unsigned int irq, struct irq_desc *desc,
219 irqreturn_t action_ret) 260 irqreturn_t action_ret)
220{ 261{
262 if (desc->istate & IRQS_POLL_INPROGRESS)
263 return;
264
221 if (unlikely(action_ret != IRQ_HANDLED)) { 265 if (unlikely(action_ret != IRQ_HANDLED)) {
222 /* 266 /*
223 * If we are seeing only the odd spurious IRQ caused by 267 * If we are seeing only the odd spurious IRQ caused by
@@ -254,9 +298,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 * Now kill the IRQ 298 * Now kill the IRQ
255 */ 299 */
256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 300 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 301 desc->istate |= IRQS_SPURIOUS_DISABLED;
258 desc->depth++; 302 desc->depth++;
259 desc->irq_data.chip->irq_disable(&desc->irq_data); 303 irq_disable(desc);
260 304
261 mod_timer(&poll_spurious_irq_timer, 305 mod_timer(&poll_spurious_irq_timer,
262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 306 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
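A note on the spurious.c changes above: try_one_irq() now marks the descriptor with IRQS_POLL_INPROGRESS while it re-runs handlers (so note_interrupt() can ignore results produced by the poller), and misrouted_irq()/poll_spurious_irqs() bracket their walk with an atomic counter, irq_poll_active, so that only one CPU polls at a time. Below is a minimal user-space sketch of that exclusion pattern using C11 atomics instead of the kernel's atomic_t; poll_active and poll_all_lines() are illustrative names, not kernel API, and this only mirrors the intent of the guard (the first poller proceeds, everyone else backs off).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int poll_active;          /* counts "CPUs" currently trying to poll */

static bool poll_all_lines(void)
{
    /* stand-in for walking every irq_desc and retrying its handlers */
    printf("polling all interrupt lines\n");
    return true;
}

/* Run the poll only if nobody else is already doing it. */
static bool poll_once(void)
{
    bool ok = false;

    /* First caller moves the counter 0 -> 1 and wins; later callers back off. */
    if (atomic_fetch_add(&poll_active, 1) + 1 != 1)
        goto out;

    ok = poll_all_lines();
out:
    atomic_fetch_sub(&poll_active, 1);
    return ok;
}

int main(void)
{
    printf("poll ran: %d\n", poll_once());
    return 0;
}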
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b5757..079f1d39a8b8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || 64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
65 arch_is_kernel_text(addr)) 65 arch_is_kernel_text(addr))
66 return 1; 66 return 1;
67 return in_gate_area_no_task(addr); 67 return in_gate_area_no_mm(addr);
68} 68}
69 69
70static inline int is_kernel(unsigned long addr) 70static inline int is_kernel(unsigned long addr)
71{ 71{
72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) 72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
73 return 1; 73 return 1;
74 return in_gate_area_no_task(addr); 74 return in_gate_area_no_mm(addr);
75} 75}
76 76
77static int is_ksym_addr(unsigned long addr) 77static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
342} 342}
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345int sprint_symbol(char *buffer, unsigned long address) 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset)
346{ 347{
347 char *modname; 348 char *modname;
348 const char *name; 349 const char *name;
349 unsigned long offset, size; 350 unsigned long offset, size;
350 int len; 351 int len;
351 352
353 address += symbol_offset;
352 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 354 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
353 if (!name) 355 if (!name)
354 return sprintf(buffer, "0x%lx", address); 356 return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
357 strcpy(buffer, name); 359 strcpy(buffer, name);
358 len = strlen(buffer); 360 len = strlen(buffer);
359 buffer += len; 361 buffer += len;
362 offset -= symbol_offset;
360 363
361 if (modname) 364 if (modname)
362 len += sprintf(buffer, "+%#lx/%#lx [%s]", 365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
363 offset, size, modname);
364 else 366 else
365 len += sprintf(buffer, "+%#lx/%#lx", offset, size); 367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
366 368
367 return len; 369 return len;
368} 370}
371
372/**
373 * sprint_symbol - Look up a kernel symbol and return it in a text buffer
374 * @buffer: buffer to be stored
375 * @address: address to lookup
376 *
377 * This function looks up a kernel symbol with @address and stores its name,
378 * offset, size and module name to @buffer if possible. If no symbol was found,
379 * just saves its @address as is.
380 *
381 * This function returns the number of bytes stored in @buffer.
382 */
383int sprint_symbol(char *buffer, unsigned long address)
384{
385 return __sprint_symbol(buffer, address, 0);
386}
387
369EXPORT_SYMBOL_GPL(sprint_symbol); 388EXPORT_SYMBOL_GPL(sprint_symbol);
370 389
390/**
391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
392 * @buffer: buffer to be stored
393 * @address: address to lookup
394 *
395 * This function is for stack backtrace and does the same thing as
396 * sprint_symbol() but with modified/decreased @address. If there is a
397 * tail-call to the function marked "noreturn", gcc optimized out code after
398 * the call so that the stack-saved return address could point outside of the
399 * caller. This function ensures that kallsyms will find the original caller
400 * by decreasing @address.
401 *
402 * This function returns the number of bytes stored in @buffer.
403 */
404int sprint_backtrace(char *buffer, unsigned long address)
405{
406 return __sprint_symbol(buffer, address, -1);
407}
408
371/* Look up a kernel symbol and print it to the kernel messages. */ 409/* Look up a kernel symbol and print it to the kernel messages. */
372void __print_symbol(const char *fmt, unsigned long address) 410void __print_symbol(const char *fmt, unsigned long address)
373{ 411{
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
477 */ 515 */
478 type = iter->exported ? toupper(iter->type) : 516 type = iter->exported ? toupper(iter->type) :
479 tolower(iter->type); 517 tolower(iter->type);
480 seq_printf(m, "%0*lx %c %s\t[%s]\n", 518 seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
481 (int)(2 * sizeof(void *)), 519 type, iter->name, iter->module_name);
482 iter->value, type, iter->name, iter->module_name);
483 } else 520 } else
484 seq_printf(m, "%0*lx %c %s\n", 521 seq_printf(m, "%pK %c %s\n", (void *)iter->value,
485 (int)(2 * sizeof(void *)), 522 iter->type, iter->name);
486 iter->value, iter->type, iter->name);
487 return 0; 523 return 0;
488} 524}
489 525
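The kallsyms change factors sprint_symbol() into __sprint_symbol() with a symbol_offset, so sprint_backtrace() can look up address - 1: a return address saved by a tail call to a noreturn function can point one byte past the end of the caller, and shifting it back makes the lookup land inside the right function, with the offset shifted forward again for printing. A hedged, self-contained sketch of that adjustment; the toy symbol table, addresses and lookup() helper are invented for the example and are not the kallsyms API.

#include <stdio.h>

struct sym { unsigned long start, size; const char *name; };

/* Toy "symbol table"; addresses and sizes are made up. */
static const struct sym table[] = {
    { 0x1000, 0x40, "foo" },
    { 0x1040, 0x80, "bar" },
};

static const struct sym *lookup(unsigned long addr, unsigned long *offset)
{
    for (unsigned i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        if (addr >= table[i].start && addr < table[i].start + table[i].size) {
            *offset = addr - table[i].start;
            return &table[i];
        }
    }
    return NULL;
}

/* Mirrors the shape of __sprint_symbol(): shift the address before the
 * lookup, then shift the printed offset back. */
static int sprint_sym(char *buf, unsigned long address, int symbol_offset)
{
    unsigned long offset;
    const struct sym *s;

    address += symbol_offset;
    s = lookup(address, &offset);
    if (!s)
        return sprintf(buf, "0x%lx", address);
    offset -= symbol_offset;
    return sprintf(buf, "%s+%#lx/%#lx", s->name, offset, s->size);
}

int main(void)
{
    char buf[64];

    /* 0x1040 is the first byte of bar(); a saved return address that lands
     * exactly there most likely came from the very end of foo(). */
    sprint_sym(buf, 0x1040, 0);  printf("plain    : %s\n", buf);
    sprint_sym(buf, 0x1040, -1); printf("backtrace: %s\n", buf);
    return 0;
}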
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ec19b92c7ebd..87b77de03dd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h> 35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h>
36 37
37#include <asm/page.h> 38#include <asm/page.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
144 /* Initialize the list of destination pages */ 145 /* Initialize the list of destination pages */
145 INIT_LIST_HEAD(&image->dest_pages); 146 INIT_LIST_HEAD(&image->dest_pages);
146 147
147 /* Initialize the list of unuseable pages */ 148 /* Initialize the list of unusable pages */
148 INIT_LIST_HEAD(&image->unuseable_pages); 149 INIT_LIST_HEAD(&image->unuseable_pages);
149 150
150 /* Read in the segments */ 151 /* Read in the segments */
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
454 /* Deal with the destination pages I have inadvertently allocated. 455 /* Deal with the destination pages I have inadvertently allocated.
455 * 456 *
456 * Ideally I would convert multi-page allocations into single 457 * Ideally I would convert multi-page allocations into single
457 * page allocations, and add everyting to image->dest_pages. 458 * page allocations, and add everything to image->dest_pages.
458 * 459 *
459 * For now it is simpler to just free the pages. 460 * For now it is simpler to just free the pages.
460 */ 461 */
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image)
602 /* Walk through and free any extra destination pages I may have */ 603 /* Walk through and free any extra destination pages I may have */
603 kimage_free_page_list(&image->dest_pages); 604 kimage_free_page_list(&image->dest_pages);
604 605
605 /* Walk through and free any unuseable pages I have cached */ 606 /* Walk through and free any unusable pages I have cached */
606 kimage_free_page_list(&image->unuseable_pages); 607 kimage_free_page_list(&image->unuseable_pages);
607 608
608} 609}
@@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void)
1099 return size; 1100 return size;
1100} 1101}
1101 1102
1102static void free_reserved_phys_range(unsigned long begin, unsigned long end) 1103void __weak crash_free_reserved_phys_range(unsigned long begin,
1104 unsigned long end)
1103{ 1105{
1104 unsigned long addr; 1106 unsigned long addr;
1105 1107
@@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size)
1135 start = roundup(start, PAGE_SIZE); 1137 start = roundup(start, PAGE_SIZE);
1136 end = roundup(start + new_size, PAGE_SIZE); 1138 end = roundup(start + new_size, PAGE_SIZE);
1137 1139
1138 free_reserved_phys_range(end, crashk_res.end); 1140 crash_free_reserved_phys_range(end, crashk_res.end);
1139 1141
1140 if ((start == end) && (crashk_res.parent != NULL)) 1142 if ((start == end) && (crashk_res.parent != NULL))
1141 release_resource(&crashk_res); 1143 release_resource(&crashk_res);
@@ -1531,6 +1533,11 @@ int kernel_kexec(void)
1531 local_irq_disable(); 1533 local_irq_disable();
1532 /* Suspend system devices */ 1534 /* Suspend system devices */
1533 error = sysdev_suspend(PMSG_FREEZE); 1535 error = sysdev_suspend(PMSG_FREEZE);
1536 if (!error) {
1537 error = syscore_suspend();
1538 if (error)
1539 sysdev_resume();
1540 }
1534 if (error) 1541 if (error)
1535 goto Enable_irqs; 1542 goto Enable_irqs;
1536 } else 1543 } else
@@ -1545,6 +1552,7 @@ int kernel_kexec(void)
1545 1552
1546#ifdef CONFIG_KEXEC_JUMP 1553#ifdef CONFIG_KEXEC_JUMP
1547 if (kexec_image->preserve_context) { 1554 if (kexec_image->preserve_context) {
1555 syscore_resume();
1548 sysdev_resume(); 1556 sysdev_resume();
1549 Enable_irqs: 1557 Enable_irqs:
1550 local_irq_enable(); 1558 local_irq_enable();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c55afba990a3..3b34d2732bce 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
27 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
28 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
29 void *data; 29 void *data;
30 int node;
30 31
31 /* Result passed back to kthread_create() from kthreadd. */ 32 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 33 struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
98 do_exit(ret); 99 do_exit(ret);
99} 100}
100 101
102/* called from do_fork() to get node information for about to be created task */
103int tsk_fork_get_node(struct task_struct *tsk)
104{
105#ifdef CONFIG_NUMA
106 if (tsk == kthreadd_task)
107 return tsk->pref_node_fork;
108#endif
109 return numa_node_id();
110}
111
101static void create_kthread(struct kthread_create_info *create) 112static void create_kthread(struct kthread_create_info *create)
102{ 113{
103 int pid; 114 int pid;
104 115
116#ifdef CONFIG_NUMA
117 current->pref_node_fork = create->node;
118#endif
105 /* We want our own signal handler (we take no signals by default). */ 119 /* We want our own signal handler (we take no signals by default). */
106 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 120 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
107 if (pid < 0) { 121 if (pid < 0) {
@@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create)
111} 125}
112 126
113/** 127/**
114 * kthread_create - create a kthread. 128 * kthread_create_on_node - create a kthread.
115 * @threadfn: the function to run until signal_pending(current). 129 * @threadfn: the function to run until signal_pending(current).
116 * @data: data ptr for @threadfn. 130 * @data: data ptr for @threadfn.
131 * @node: memory node number.
117 * @namefmt: printf-style name for the thread. 132 * @namefmt: printf-style name for the thread.
118 * 133 *
119 * Description: This helper function creates and names a kernel 134 * Description: This helper function creates and names a kernel
120 * thread. The thread will be stopped: use wake_up_process() to start 135 * thread. The thread will be stopped: use wake_up_process() to start
121 * it. See also kthread_run(). 136 * it. See also kthread_run().
122 * 137 *
138 * If thread is going to be bound on a particular cpu, give its node
139 * in @node, to get NUMA affinity for kthread stack, or else give -1.
123 * When woken, the thread will run @threadfn() with @data as its 140 * When woken, the thread will run @threadfn() with @data as its
124 * argument. @threadfn() can either call do_exit() directly if it is a 141 * argument. @threadfn() can either call do_exit() directly if it is a
125 * standalone thread for which noone will call kthread_stop(), or 142 * standalone thread for which no one will call kthread_stop(), or
126 * return when 'kthread_should_stop()' is true (which means 143 * return when 'kthread_should_stop()' is true (which means
127 * kthread_stop() has been called). The return value should be zero 144 * kthread_stop() has been called). The return value should be zero
128 * or a negative error number; it will be passed to kthread_stop(). 145 * or a negative error number; it will be passed to kthread_stop().
129 * 146 *
130 * Returns a task_struct or ERR_PTR(-ENOMEM). 147 * Returns a task_struct or ERR_PTR(-ENOMEM).
131 */ 148 */
132struct task_struct *kthread_create(int (*threadfn)(void *data), 149struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
133 void *data, 150 void *data,
134 const char namefmt[], 151 int node,
135 ...) 152 const char namefmt[],
153 ...)
136{ 154{
137 struct kthread_create_info create; 155 struct kthread_create_info create;
138 156
139 create.threadfn = threadfn; 157 create.threadfn = threadfn;
140 create.data = data; 158 create.data = data;
159 create.node = node;
141 init_completion(&create.done); 160 init_completion(&create.done);
142 161
143 spin_lock(&kthread_create_lock); 162 spin_lock(&kthread_create_lock);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
164 } 183 }
165 return create.result; 184 return create.result;
166} 185}
167EXPORT_SYMBOL(kthread_create); 186EXPORT_SYMBOL(kthread_create_on_node);
168 187
169/** 188/**
170 * kthread_bind - bind a just-created kthread to a cpu. 189 * kthread_bind - bind a just-created kthread to a cpu.
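kthread_create_on_node() simply stashes the requested NUMA node in the kthread_create_info record that kthreadd consumes, and tsk_fork_get_node() hands that hint to the fork path so the new thread's stack can be allocated on the right node (-1 meaning "no preference"). A reduced sketch of carrying one extra field through a variadic create helper; the struct, function names and printf-style naming here are simplifications for illustration, not the kernel interface.

#include <stdarg.h>
#include <stdio.h>

struct create_info {
    int (*threadfn)(void *data);
    void *data;
    int node;          /* NUMA hint, -1 means "no preference" */
    char name[32];
};

static int kthread_create_on_node_sketch(struct create_info *ci,
                                         int (*threadfn)(void *), void *data,
                                         int node, const char *namefmt, ...)
{
    va_list args;

    ci->threadfn = threadfn;
    ci->data = data;
    ci->node = node;   /* the only new piece of state the patch adds */

    va_start(args, namefmt);
    vsnprintf(ci->name, sizeof(ci->name), namefmt, args);
    va_end(args);
    return 0;
}

static int work(void *data) { (void)data; return 0; }

int main(void)
{
    struct create_info ci;

    /* -1 keeps the old behaviour; a real node id asks for node-local stack memory */
    kthread_create_on_node_sketch(&ci, work, NULL, -1, "worker/%d", 3);
    printf("queued '%s' for node %d\n", ci.name, ci.node);
    return 0;
}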
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ee74b35e528d..376066e10413 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk,
153} 153}
154 154
155/** 155/**
156 * __account_scheduler_latency - record an occured latency 156 * __account_scheduler_latency - record an occurred latency
157 * @tsk - the task struct of the task hitting the latency 157 * @tsk - the task struct of the task hitting the latency
158 * @usecs - the duration of the latency in microseconds 158 * @usecs - the duration of the latency in microseconds
159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible 159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0d2058da80f5..53a68956f131 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2309,7 +2309,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2309 if (unlikely(curr->hardirqs_enabled)) { 2309 if (unlikely(curr->hardirqs_enabled)) {
2310 /* 2310 /*
2311 * Neither irq nor preemption are disabled here 2311 * Neither irq nor preemption are disabled here
2312 * so this is racy by nature but loosing one hit 2312 * so this is racy by nature but losing one hit
2313 * in a stat is not a big deal. 2313 * in a stat is not a big deal.
2314 */ 2314 */
2315 __debug_atomic_inc(redundant_hardirqs_on); 2315 __debug_atomic_inc(redundant_hardirqs_on);
@@ -2620,7 +2620,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2620 if (!graph_lock()) 2620 if (!graph_lock())
2621 return 0; 2621 return 0;
2622 /* 2622 /*
2623 * Make sure we didnt race: 2623 * Make sure we didn't race:
2624 */ 2624 */
2625 if (unlikely(hlock_class(this)->usage_mask & new_mask)) { 2625 if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
2626 graph_unlock(); 2626 graph_unlock();
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 1969d2fc4b36..71edd2f60c02 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, 225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, 226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, 227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
228 sum_forward_deps = 0, factor = 0; 228 sum_forward_deps = 0;
229 229
230 list_for_each_entry(class, &all_lock_classes, lock_entry) { 230 list_for_each_entry(class, &all_lock_classes, lock_entry) {
231 231
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
283 nr_hardirq_unsafe * nr_hardirq_safe + 283 nr_hardirq_unsafe * nr_hardirq_safe +
284 nr_list_entries); 284 nr_list_entries);
285 285
286 /*
287 * Estimated factor between direct and indirect
288 * dependencies:
289 */
290 if (nr_list_entries)
291 factor = sum_forward_deps / nr_list_entries;
292
293#ifdef CONFIG_PROVE_LOCKING 286#ifdef CONFIG_PROVE_LOCKING
294 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 287 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
295 nr_lock_chains, MAX_LOCKDEP_CHAINS); 288 nr_lock_chains, MAX_LOCKDEP_CHAINS);
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94bf..d5938a5c19c4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -809,7 +809,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
809 wait_for_zero_refcount(mod); 809 wait_for_zero_refcount(mod);
810 810
811 mutex_unlock(&module_mutex); 811 mutex_unlock(&module_mutex);
812 /* Final destruction now noone is using it. */ 812 /* Final destruction now no one is using it. */
813 if (mod->exit != NULL) 813 if (mod->exit != NULL)
814 mod->exit(); 814 mod->exit();
815 blocking_notifier_call_chain(&module_notify_list, 815 blocking_notifier_call_chain(&module_notify_list,
@@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1168{ 1168{
1169 struct module_sect_attr *sattr = 1169 struct module_sect_attr *sattr =
1170 container_of(mattr, struct module_sect_attr, mattr); 1170 container_of(mattr, struct module_sect_attr, mattr);
1171 return sprintf(buf, "0x%lx\n", sattr->address); 1171 return sprintf(buf, "0x%pK\n", (void *)sattr->address);
1172} 1172}
1173 1173
1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -2777,7 +2777,7 @@ static struct module *load_module(void __user *umod,
2777 mod->state = MODULE_STATE_COMING; 2777 mod->state = MODULE_STATE_COMING;
2778 2778
2779 /* Now sew it into the lists so we can get lockdep and oops 2779 /* Now sew it into the lists so we can get lockdep and oops
2780 * info during argument parsing. Noone should access us, since 2780 * info during argument parsing. No one should access us, since
2781 * strong_try_module_get() will fail. 2781 * strong_try_module_get() will fail.
2782 * lockdep/oops can run asynchronous, so use the RCU list insertion 2782 * lockdep/oops can run asynchronous, so use the RCU list insertion
2783 * function to insert in a way safe to concurrent readers. 2783 * function to insert in a way safe to concurrent readers.
@@ -2971,7 +2971,7 @@ static const char *get_ksymbol(struct module *mod,
2971 else 2971 else
2972 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2972 nextval = (unsigned long)mod->module_core+mod->core_text_size;
2973 2973
2974 /* Scan for closest preceeding symbol, and next symbol. (ELF 2974 /* Scan for closest preceding symbol, and next symbol. (ELF
2975 starts real symbols at 1). */ 2975 starts real symbols at 1). */
2976 for (i = 1; i < mod->num_symtab; i++) { 2976 for (i = 1; i < mod->num_symtab; i++) {
2977 if (mod->symtab[i].st_shndx == SHN_UNDEF) 2977 if (mod->symtab[i].st_shndx == SHN_UNDEF)
@@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p)
3224 mod->state == MODULE_STATE_COMING ? "Loading": 3224 mod->state == MODULE_STATE_COMING ? "Loading":
3225 "Live"); 3225 "Live");
3226 /* Used by oprofile and other similar tools. */ 3226 /* Used by oprofile and other similar tools. */
3227 seq_printf(m, " 0x%p", mod->module_core); 3227 seq_printf(m, " 0x%pK", mod->module_core);
3228 3228
3229 /* Taints info */ 3229 /* Taints info */
3230 if (mod->taints) 3230 if (mod->taints)
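The module.c hunks above switch several %p/%lx users to %pK, the printk extension that shows a kernel pointer as zeros to unprivileged readers when kptr_restrict is set. Ordinary printf has no such modifier, so this hedged sketch only reproduces the gating decision around a plain %p; kptr_restrict here is an ordinary global standing in for the real sysctl, and the privilege check is a stub.

#include <stdbool.h>
#include <stdio.h>

static int kptr_restrict = 1;   /* stand-in for /proc/sys/kernel/kptr_restrict */

static bool reader_is_privileged(void) { return false; }  /* pretend: no CAP_SYSLOG */

static void print_kernel_pointer(const void *ptr)
{
    /* %pK semantics, roughly: hide the value unless the reader may see it */
    if (kptr_restrict && !reader_is_privileged())
        printf("%p\n", (void *)0);
    else
        printf("%p\n", ptr);
}

int main(void)
{
    int dummy;

    print_kernel_pointer(&dummy);   /* censored: prints a null-looking pointer */
    kptr_restrict = 0;
    print_kernel_pointer(&dummy);   /* now prints the real address */
    return 0;
}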
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a5889fb28ecf..c4195fa98900 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -245,7 +245,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
245 } 245 }
246 __set_task_state(task, state); 246 __set_task_state(task, state);
247 247
248 /* didnt get the lock, go to sleep: */ 248 /* didn't get the lock, go to sleep: */
249 spin_unlock_mutex(&lock->wait_lock, flags); 249 spin_unlock_mutex(&lock->wait_lock, flags);
250 preempt_enable_no_resched(); 250 preempt_enable_no_resched();
251 schedule(); 251 schedule();
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..a05d191ffdd9 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
69 goto out_ns; 69 goto out_ns;
70 } 70 }
71 71
72 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 72 new_nsp->uts_ns = copy_utsname(flags, tsk);
73 if (IS_ERR(new_nsp->uts_ns)) { 73 if (IS_ERR(new_nsp->uts_ns)) {
74 err = PTR_ERR(new_nsp->uts_ns); 74 err = PTR_ERR(new_nsp->uts_ns);
75 goto out_uts; 75 goto out_uts;
76 } 76 }
77 77
78 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 78 new_nsp->ipc_ns = copy_ipcs(flags, tsk);
79 if (IS_ERR(new_nsp->ipc_ns)) { 79 if (IS_ERR(new_nsp->ipc_ns)) {
80 err = PTR_ERR(new_nsp->ipc_ns); 80 err = PTR_ERR(new_nsp->ipc_ns);
81 goto out_ipc; 81 goto out_ipc;
diff --git a/kernel/padata.c b/kernel/padata.c
index 751019415d23..b91941df5e63 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd)
262 /* 262 /*
263 * This cpu has to do the parallel processing of the next 263 * This cpu has to do the parallel processing of the next
264 * object. It's waiting in the cpu's parallelization queue, 264 * object. It's waiting in the cpu's parallelization queue,
265 * so exit imediately. 265 * so exit immediately.
266 */ 266 */
267 if (PTR_ERR(padata) == -ENODATA) { 267 if (PTR_ERR(padata) == -ENODATA) {
268 del_timer(&pd->timer); 268 del_timer(&pd->timer);
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd)
284 /* 284 /*
285 * The next object that needs serialization might have arrived to 285 * The next object that needs serialization might have arrived to
286 * the reorder queues in the meantime, we will be called again 286 * the reorder queues in the meantime, we will be called again
287 * from the timer function if noone else cares for it. 287 * from the timer function if no one else cares for it.
288 */ 288 */
289 if (atomic_read(&pd->reorder_objects) 289 if (atomic_read(&pd->reorder_objects)
290 && !(pinst->flags & PADATA_RESET)) 290 && !(pinst->flags & PADATA_RESET))
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst)
515 put_online_cpus(); 515 put_online_cpus();
516} 516}
517 517
518/* Replace the internal control stucture with a new one. */ 518/* Replace the internal control structure with a new one. */
519static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
520 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
521{ 521{
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
768} 768}
769 769
770 /** 770 /**
771 * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) 771 * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
772 * padata cpumasks. 772 * padata cpumasks.
773 * 773 *
774 * @pinst: padata instance 774 * @pinst: padata instance
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a1704..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
433 433
434core_param(panic, panic_timeout, int, 0644); 434core_param(panic, panic_timeout, int, 0644);
435core_param(pause_on_oops, pause_on_oops, int, 0644); 435core_param(pause_on_oops, pause_on_oops, int, 0644);
436
437static int __init oops_setup(char *s)
438{
439 if (!s)
440 return -EINVAL;
441 if (!strcmp(s, "panic"))
442 panic_on_oops = 1;
443 return 0;
444}
445early_param("oops", oops_setup);
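The new oops_setup() handler makes "oops=panic" on the kernel command line behave like setting panic_on_oops. Boot parameters of this kind are just key=value tokens, so the parse-and-set step can be sketched in user space like this (the command-line string and tokenizing loop are invented for the example; early_param() itself does the real registration):

#include <stdio.h>
#include <string.h>

static int panic_on_oops;

/* Mirrors oops_setup(): accept only the value "panic", ignore anything else. */
static int oops_setup_sketch(const char *s)
{
    if (!s)
        return -1;
    if (!strcmp(s, "panic"))
        panic_on_oops = 1;
    return 0;
}

int main(void)
{
    char cmdline[] = "quiet oops=panic loglevel=3";

    for (char *tok = strtok(cmdline, " "); tok; tok = strtok(NULL, " ")) {
        if (!strncmp(tok, "oops=", 5))
            oops_setup_sketch(tok + 5);
    }
    printf("panic_on_oops = %d\n", panic_on_oops);
    return 0;
}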
diff --git a/kernel/params.c b/kernel/params.c
index 0da1411222b9..7ab388a48a2e 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -95,7 +95,7 @@ static int parse_one(char *param,
95 /* Find parameter */ 95 /* Find parameter */
96 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
97 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
98 /* Noone handled NULL, so do it here. */ 98 /* No one handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool) 99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL; 100 return -EINVAL;
101 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 999835b6112b..8e81a9860a0d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,13 +38,96 @@
38 38
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41struct remote_function_call {
42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
69 * be on the current CPU, which just calls the function directly
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
74 */
75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
84
85 if (task_curr(p))
86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
41enum event_type_t { 118enum event_type_t {
42 EVENT_FLEXIBLE = 0x1, 119 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2, 120 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45}; 122};
46 123
47atomic_t perf_task_events __read_mostly; 124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128atomic_t perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
48static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
@@ -62,12 +145,30 @@ static struct srcu_struct pmus_srcu;
62 */ 145 */
63int sysctl_perf_event_paranoid __read_mostly = 1; 146int sysctl_perf_event_paranoid __read_mostly = 1;
64 147
65int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 148/* Minimum for 512 kiB + 1 user control page */
149int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
66 150
67/* 151/*
68 * max perf event sample rate 152 * max perf event sample rate
69 */ 153 */
70int sysctl_perf_event_sample_rate __read_mostly = 100000; 154#define DEFAULT_MAX_SAMPLE_RATE 100000
155int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
156static int max_samples_per_tick __read_mostly =
157 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
158
159int perf_proc_update_handler(struct ctl_table *table, int write,
160 void __user *buffer, size_t *lenp,
161 loff_t *ppos)
162{
163 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
164
165 if (ret || !write)
166 return ret;
167
168 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
169
170 return 0;
171}
71 172
72static atomic64_t perf_event_id; 173static atomic64_t perf_event_id;
73 174
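perf_proc_update_handler() in the hunk above re-derives max_samples_per_tick whenever the sysctl changes, as DIV_ROUND_UP(sample_rate, HZ). A quick check of that arithmetic; HZ is assumed to be 1000 here purely for the example, since the real value is a build-time config option.

#include <stdio.h>

#define HZ 1000                            /* assumption for this example */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    int sample_rate = 100000;              /* DEFAULT_MAX_SAMPLE_RATE */

    /* 100000 samples/sec spread over 1000 ticks/sec -> 100 samples per tick */
    printf("max_samples_per_tick = %d\n", DIV_ROUND_UP(sample_rate, HZ));

    sample_rate = 25000;                   /* pretend the sysctl was lowered */
    printf("max_samples_per_tick = %d\n", DIV_ROUND_UP(sample_rate, HZ));
    return 0;
}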
@@ -75,7 +176,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type); 176 enum event_type_t event_type);
76 177
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 178static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type); 179 enum event_type_t event_type,
180 struct task_struct *task);
181
182static void update_context_time(struct perf_event_context *ctx);
183static u64 perf_event_time(struct perf_event *event);
79 184
80void __weak perf_event_print_debug(void) { } 185void __weak perf_event_print_debug(void) { }
81 186
@@ -89,6 +194,361 @@ static inline u64 perf_clock(void)
89 return local_clock(); 194 return local_clock();
90} 195}
91 196
197static inline struct perf_cpu_context *
198__get_cpu_context(struct perf_event_context *ctx)
199{
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201}
202
203#ifdef CONFIG_CGROUP_PERF
204
205/*
206 * Must ensure cgroup is pinned (css_get) before calling
207 * this function. In other words, we cannot call this function
208 * if there is no cgroup event for the current CPU context.
209 */
210static inline struct perf_cgroup *
211perf_cgroup_from_task(struct task_struct *task)
212{
213 return container_of(task_subsys_state(task, perf_subsys_id),
214 struct perf_cgroup, css);
215}
216
217static inline bool
218perf_cgroup_match(struct perf_event *event)
219{
220 struct perf_event_context *ctx = event->ctx;
221 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
222
223 return !event->cgrp || event->cgrp == cpuctx->cgrp;
224}
225
226static inline void perf_get_cgroup(struct perf_event *event)
227{
228 css_get(&event->cgrp->css);
229}
230
231static inline void perf_put_cgroup(struct perf_event *event)
232{
233 css_put(&event->cgrp->css);
234}
235
236static inline void perf_detach_cgroup(struct perf_event *event)
237{
238 perf_put_cgroup(event);
239 event->cgrp = NULL;
240}
241
242static inline int is_cgroup_event(struct perf_event *event)
243{
244 return event->cgrp != NULL;
245}
246
247static inline u64 perf_cgroup_event_time(struct perf_event *event)
248{
249 struct perf_cgroup_info *t;
250
251 t = per_cpu_ptr(event->cgrp->info, event->cpu);
252 return t->time;
253}
254
255static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
256{
257 struct perf_cgroup_info *info;
258 u64 now;
259
260 now = perf_clock();
261
262 info = this_cpu_ptr(cgrp->info);
263
264 info->time += now - info->timestamp;
265 info->timestamp = now;
266}
267
268static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
269{
270 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
271 if (cgrp_out)
272 __update_cgrp_time(cgrp_out);
273}
274
275static inline void update_cgrp_time_from_event(struct perf_event *event)
276{
277 struct perf_cgroup *cgrp;
278
279 /*
280 * ensure we access cgroup data only when needed and
281 * when we know the cgroup is pinned (css_get)
282 */
283 if (!is_cgroup_event(event))
284 return;
285
286 cgrp = perf_cgroup_from_task(current);
287 /*
288 * Do not update time when cgroup is not active
289 */
290 if (cgrp == event->cgrp)
291 __update_cgrp_time(event->cgrp);
292}
293
294static inline void
295perf_cgroup_set_timestamp(struct task_struct *task,
296 struct perf_event_context *ctx)
297{
298 struct perf_cgroup *cgrp;
299 struct perf_cgroup_info *info;
300
301 /*
302 * ctx->lock held by caller
303 * ensure we do not access cgroup data
304 * unless we have the cgroup pinned (css_get)
305 */
306 if (!task || !ctx->nr_cgroups)
307 return;
308
309 cgrp = perf_cgroup_from_task(task);
310 info = this_cpu_ptr(cgrp->info);
311 info->timestamp = ctx->timestamp;
312}
313
314#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
315#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
316
317/*
318 * reschedule events based on the cgroup constraint of task.
319 *
320 * mode SWOUT : schedule out everything
321 * mode SWIN : schedule in based on cgroup for next
322 */
323void perf_cgroup_switch(struct task_struct *task, int mode)
324{
325 struct perf_cpu_context *cpuctx;
326 struct pmu *pmu;
327 unsigned long flags;
328
329 /*
330 * disable interrupts to avoid geting nr_cgroup
330	 * disable interrupts to avoid getting nr_cgroup
331 * changes via __perf_event_disable(). Also
332 * avoids preemption.
333 */
334 local_irq_save(flags);
335
336 /*
337 * we reschedule only in the presence of cgroup
338 * constrained events.
339 */
340 rcu_read_lock();
341
342 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /*
349 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events.
351 *
352 * ctx->nr_cgroups reports the number of cgroup
353 * events for a context.
354 */
355 if (cpuctx->ctx.nr_cgroups > 0) {
356
357 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
359 /*
360 * must not be done before ctxswout due
361 * to event_filter_match() in event_sched_out()
362 */
363 cpuctx->cgrp = NULL;
364 }
365
366 if (mode & PERF_CGROUP_SWIN) {
367 WARN_ON_ONCE(cpuctx->cgrp);
368 /* set cgrp before ctxsw in to
369 * allow event_filter_match() to not
370 * have to pass task around
371 */
372 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 }
375 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 }
379
380 rcu_read_unlock();
381
382 local_irq_restore(flags);
383}
384
385static inline void perf_cgroup_sched_out(struct task_struct *task)
386{
387 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
388}
389
390static inline void perf_cgroup_sched_in(struct task_struct *task)
391{
392 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
393}
394
395static inline int perf_cgroup_connect(int fd, struct perf_event *event,
396 struct perf_event_attr *attr,
397 struct perf_event *group_leader)
398{
399 struct perf_cgroup *cgrp;
400 struct cgroup_subsys_state *css;
401 struct file *file;
402 int ret = 0, fput_needed;
403
404 file = fget_light(fd, &fput_needed);
405 if (!file)
406 return -EBADF;
407
408 css = cgroup_css_from_dir(file, perf_subsys_id);
409 if (IS_ERR(css)) {
410 ret = PTR_ERR(css);
411 goto out;
412 }
413
414 cgrp = container_of(css, struct perf_cgroup, css);
415 event->cgrp = cgrp;
416
417 /* must be done before we fput() the file */
418 perf_get_cgroup(event);
419
420 /*
421 * all events in a group must monitor
422 * the same cgroup because a task belongs
423 * to only one perf cgroup at a time
424 */
425 if (group_leader && group_leader->cgrp != cgrp) {
426 perf_detach_cgroup(event);
427 ret = -EINVAL;
428 }
429out:
430 fput_light(file, fput_needed);
431 return ret;
432}
433
434static inline void
435perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
436{
437 struct perf_cgroup_info *t;
438 t = per_cpu_ptr(event->cgrp->info, event->cpu);
439 event->shadow_ctx_time = now - t->timestamp;
440}
441
442static inline void
443perf_cgroup_defer_enabled(struct perf_event *event)
444{
445 /*
446 * when the current task's perf cgroup does not match
447 * the event's, we need to remember to call the
448 * perf_mark_enable() function the first time a task with
449 * a matching perf cgroup is scheduled in.
450 */
451 if (is_cgroup_event(event) && !perf_cgroup_match(event))
452 event->cgrp_defer_enabled = 1;
453}
454
455static inline void
456perf_cgroup_mark_enabled(struct perf_event *event,
457 struct perf_event_context *ctx)
458{
459 struct perf_event *sub;
460 u64 tstamp = perf_event_time(event);
461
462 if (!event->cgrp_defer_enabled)
463 return;
464
465 event->cgrp_defer_enabled = 0;
466
467 event->tstamp_enabled = tstamp - event->total_time_enabled;
468 list_for_each_entry(sub, &event->sibling_list, group_entry) {
469 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
470 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
471 sub->cgrp_defer_enabled = 0;
472 }
473 }
474}
475#else /* !CONFIG_CGROUP_PERF */
476
477static inline bool
478perf_cgroup_match(struct perf_event *event)
479{
480 return true;
481}
482
483static inline void perf_detach_cgroup(struct perf_event *event)
484{}
485
486static inline int is_cgroup_event(struct perf_event *event)
487{
488 return 0;
489}
490
491static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
492{
493 return 0;
494}
495
496static inline void update_cgrp_time_from_event(struct perf_event *event)
497{
498}
499
500static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
501{
502}
503
504static inline void perf_cgroup_sched_out(struct task_struct *task)
505{
506}
507
508static inline void perf_cgroup_sched_in(struct task_struct *task)
509{
510}
511
512static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
513 struct perf_event_attr *attr,
514 struct perf_event *group_leader)
515{
516 return -EINVAL;
517}
518
519static inline void
520perf_cgroup_set_timestamp(struct task_struct *task,
521 struct perf_event_context *ctx)
522{
523}
524
525void
526perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
527{
528}
529
530static inline void
531perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
532{
533}
534
535static inline u64 perf_cgroup_event_time(struct perf_event *event)
536{
537 return 0;
538}
539
540static inline void
541perf_cgroup_defer_enabled(struct perf_event *event)
542{
543}
544
545static inline void
546perf_cgroup_mark_enabled(struct perf_event *event,
547 struct perf_event_context *ctx)
548{
549}
550#endif
551
92void perf_pmu_disable(struct pmu *pmu) 552void perf_pmu_disable(struct pmu *pmu)
93{ 553{
94 int *count = this_cpu_ptr(pmu->pmu_disable_count); 554 int *count = this_cpu_ptr(pmu->pmu_disable_count);
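In the cgroup hunk above, __update_cgrp_time() keeps per-cgroup time the same way the context clock works: each update folds "now - timestamp" into an accumulator and moves the timestamp forward, so time only advances while the cgroup is actually being monitored on that CPU. A stripped-down sketch of that accounting; the struct is a placeholder and POSIX clock_gettime() stands in for the kernel's perf_clock().

#include <stdio.h>
#include <time.h>

struct cgrp_time {
    unsigned long long time;        /* accumulated monitored nanoseconds */
    unsigned long long timestamp;   /* last time we folded the clock in */
};

static unsigned long long now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Equivalent of __update_cgrp_time(): accumulate, then restart the window. */
static void update_cgrp_time(struct cgrp_time *t)
{
    unsigned long long now = now_ns();

    t->time += now - t->timestamp;
    t->timestamp = now;
}

int main(void)
{
    struct cgrp_time t = { .time = 0, .timestamp = now_ns() };

    for (volatile long i = 0; i < 10000000; i++)
        ;                            /* burn a little CPU inside the "window" */
    update_cgrp_time(&t);
    printf("accumulated %llu ns\n", t.time);
    return 0;
}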
@@ -254,7 +714,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
254 raw_spin_lock_irqsave(&ctx->lock, flags); 714 raw_spin_lock_irqsave(&ctx->lock, flags);
255 --ctx->pin_count; 715 --ctx->pin_count;
256 raw_spin_unlock_irqrestore(&ctx->lock, flags); 716 raw_spin_unlock_irqrestore(&ctx->lock, flags);
257 put_ctx(ctx);
258} 717}
259 718
260/* 719/*
@@ -271,6 +730,10 @@ static void update_context_time(struct perf_event_context *ctx)
271static u64 perf_event_time(struct perf_event *event) 730static u64 perf_event_time(struct perf_event *event)
272{ 731{
273 struct perf_event_context *ctx = event->ctx; 732 struct perf_event_context *ctx = event->ctx;
733
734 if (is_cgroup_event(event))
735 return perf_cgroup_event_time(event);
736
274 return ctx ? ctx->time : 0; 737 return ctx ? ctx->time : 0;
275} 738}
276 739
@@ -285,9 +748,20 @@ static void update_event_times(struct perf_event *event)
285 if (event->state < PERF_EVENT_STATE_INACTIVE || 748 if (event->state < PERF_EVENT_STATE_INACTIVE ||
286 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 749 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
287 return; 750 return;
288 751 /*
289 if (ctx->is_active) 752 * in cgroup mode, time_enabled represents
753 * the time the event was enabled AND active
754 * tasks were in the monitored cgroup. This is
755 * independent of the activity of the context as
756 * there may be a mix of cgroup and non-cgroup events.
757 *
758 * That is why we treat cgroup events differently
759 * here.
760 */
761 if (is_cgroup_event(event))
290 run_end = perf_event_time(event); 762 run_end = perf_event_time(event);
763 else if (ctx->is_active)
764 run_end = ctx->time;
291 else 765 else
292 run_end = event->tstamp_stopped; 766 run_end = event->tstamp_stopped;
293 767
@@ -299,6 +773,7 @@ static void update_event_times(struct perf_event *event)
299 run_end = perf_event_time(event); 773 run_end = perf_event_time(event);
300 774
301 event->total_time_running = run_end - event->tstamp_running; 775 event->total_time_running = run_end - event->tstamp_running;
776
302} 777}
303 778
304/* 779/*
@@ -347,6 +822,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
347 list_add_tail(&event->group_entry, list); 822 list_add_tail(&event->group_entry, list);
348 } 823 }
349 824
825 if (is_cgroup_event(event))
826 ctx->nr_cgroups++;
827
350 list_add_rcu(&event->event_entry, &ctx->event_list); 828 list_add_rcu(&event->event_entry, &ctx->event_list);
351 if (!ctx->nr_events) 829 if (!ctx->nr_events)
352 perf_pmu_rotate_start(ctx->pmu); 830 perf_pmu_rotate_start(ctx->pmu);
@@ -465,6 +943,7 @@ static void perf_group_attach(struct perf_event *event)
465static void 943static void
466list_del_event(struct perf_event *event, struct perf_event_context *ctx) 944list_del_event(struct perf_event *event, struct perf_event_context *ctx)
467{ 945{
946 struct perf_cpu_context *cpuctx;
468 /* 947 /*
469 * We can have double detach due to exit/hot-unplug + close. 948 * We can have double detach due to exit/hot-unplug + close.
470 */ 949 */
@@ -473,6 +952,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
473 952
474 event->attach_state &= ~PERF_ATTACH_CONTEXT; 953 event->attach_state &= ~PERF_ATTACH_CONTEXT;
475 954
955 if (is_cgroup_event(event)) {
956 ctx->nr_cgroups--;
957 cpuctx = __get_cpu_context(ctx);
958 /*
959 * if there are no more cgroup events
960 * then cler cgrp to avoid stale pointer
960	 * then clear cgrp to avoid stale pointer
961 * in update_cgrp_time_from_cpuctx()
962 */
963 if (!ctx->nr_cgroups)
964 cpuctx->cgrp = NULL;
965 }
966
476 ctx->nr_events--; 967 ctx->nr_events--;
477 if (event->attr.inherit_stat) 968 if (event->attr.inherit_stat)
478 ctx->nr_stat--; 969 ctx->nr_stat--;
@@ -544,7 +1035,8 @@ out:
544static inline int 1035static inline int
545event_filter_match(struct perf_event *event) 1036event_filter_match(struct perf_event *event)
546{ 1037{
547 return event->cpu == -1 || event->cpu == smp_processor_id(); 1038 return (event->cpu == -1 || event->cpu == smp_processor_id())
1039 && perf_cgroup_match(event);
548} 1040}
549 1041
550static void 1042static void
@@ -562,7 +1054,7 @@ event_sched_out(struct perf_event *event,
562 */ 1054 */
563 if (event->state == PERF_EVENT_STATE_INACTIVE 1055 if (event->state == PERF_EVENT_STATE_INACTIVE
564 && !event_filter_match(event)) { 1056 && !event_filter_match(event)) {
565 delta = ctx->time - event->tstamp_stopped; 1057 delta = tstamp - event->tstamp_stopped;
566 event->tstamp_running += delta; 1058 event->tstamp_running += delta;
567 event->tstamp_stopped = tstamp; 1059 event->tstamp_stopped = tstamp;
568 } 1060 }
@@ -606,47 +1098,30 @@ group_sched_out(struct perf_event *group_event,
606 cpuctx->exclusive = 0; 1098 cpuctx->exclusive = 0;
607} 1099}
608 1100
609static inline struct perf_cpu_context *
610__get_cpu_context(struct perf_event_context *ctx)
611{
612 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
613}
614
615/* 1101/*
616 * Cross CPU call to remove a performance event 1102 * Cross CPU call to remove a performance event
617 * 1103 *
618 * We disable the event on the hardware level first. After that we 1104 * We disable the event on the hardware level first. After that we
619 * remove it from the context list. 1105 * remove it from the context list.
620 */ 1106 */
621static void __perf_event_remove_from_context(void *info) 1107static int __perf_remove_from_context(void *info)
622{ 1108{
623 struct perf_event *event = info; 1109 struct perf_event *event = info;
624 struct perf_event_context *ctx = event->ctx; 1110 struct perf_event_context *ctx = event->ctx;
625 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1111 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
626 1112
627 /*
628 * If this is a task context, we need to check whether it is
629 * the current task context of this cpu. If not it has been
630 * scheduled out before the smp call arrived.
631 */
632 if (ctx->task && cpuctx->task_ctx != ctx)
633 return;
634
635 raw_spin_lock(&ctx->lock); 1113 raw_spin_lock(&ctx->lock);
636
637 event_sched_out(event, cpuctx, ctx); 1114 event_sched_out(event, cpuctx, ctx);
638
639 list_del_event(event, ctx); 1115 list_del_event(event, ctx);
640
641 raw_spin_unlock(&ctx->lock); 1116 raw_spin_unlock(&ctx->lock);
1117
1118 return 0;
642} 1119}
643 1120
644 1121
645/* 1122/*
646 * Remove the event from a task's (or a CPU's) list of events. 1123 * Remove the event from a task's (or a CPU's) list of events.
647 * 1124 *
648 * Must be called with ctx->mutex held.
649 *
650 * CPU events are removed with a smp call. For task events we only 1125 * CPU events are removed with a smp call. For task events we only
651 * call when the task is on a CPU. 1126 * call when the task is on a CPU.
652 * 1127 *
@@ -657,49 +1132,48 @@ static void __perf_event_remove_from_context(void *info)
657 * When called from perf_event_exit_task, it's OK because the 1132 * When called from perf_event_exit_task, it's OK because the
658 * context has been detached from its task. 1133 * context has been detached from its task.
659 */ 1134 */
660static void perf_event_remove_from_context(struct perf_event *event) 1135static void perf_remove_from_context(struct perf_event *event)
661{ 1136{
662 struct perf_event_context *ctx = event->ctx; 1137 struct perf_event_context *ctx = event->ctx;
663 struct task_struct *task = ctx->task; 1138 struct task_struct *task = ctx->task;
664 1139
1140 lockdep_assert_held(&ctx->mutex);
1141
665 if (!task) { 1142 if (!task) {
666 /* 1143 /*
667 * Per cpu events are removed via an smp call and 1144 * Per cpu events are removed via an smp call and
668 * the removal is always successful. 1145 * the removal is always successful.
669 */ 1146 */
670 smp_call_function_single(event->cpu, 1147 cpu_function_call(event->cpu, __perf_remove_from_context, event);
671 __perf_event_remove_from_context,
672 event, 1);
673 return; 1148 return;
674 } 1149 }
675 1150
676retry: 1151retry:
677 task_oncpu_function_call(task, __perf_event_remove_from_context, 1152 if (!task_function_call(task, __perf_remove_from_context, event))
678 event); 1153 return;
679 1154
680 raw_spin_lock_irq(&ctx->lock); 1155 raw_spin_lock_irq(&ctx->lock);
681 /* 1156 /*
682 * If the context is active we need to retry the smp call. 1157 * If we failed to find a running task, but find the context active now
1158 * that we've acquired the ctx->lock, retry.
683 */ 1159 */
684 if (ctx->nr_active && !list_empty(&event->group_entry)) { 1160 if (ctx->is_active) {
685 raw_spin_unlock_irq(&ctx->lock); 1161 raw_spin_unlock_irq(&ctx->lock);
686 goto retry; 1162 goto retry;
687 } 1163 }
688 1164
689 /* 1165 /*
690 * The lock prevents that this context is scheduled in so we 1166 * Since the task isn't running, its safe to remove the event, us
691 * can remove the event safely, if the call above did not 1167 * holding the ctx->lock ensures the task won't get scheduled in.
692 * succeed.
693 */ 1168 */
694 if (!list_empty(&event->group_entry)) 1169 list_del_event(event, ctx);
695 list_del_event(event, ctx);
696 raw_spin_unlock_irq(&ctx->lock); 1170 raw_spin_unlock_irq(&ctx->lock);
697} 1171}
698 1172
699/* 1173/*
700 * Cross CPU call to disable a performance event 1174 * Cross CPU call to disable a performance event
701 */ 1175 */
702static void __perf_event_disable(void *info) 1176static int __perf_event_disable(void *info)
703{ 1177{
704 struct perf_event *event = info; 1178 struct perf_event *event = info;
705 struct perf_event_context *ctx = event->ctx; 1179 struct perf_event_context *ctx = event->ctx;
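The perf_remove_from_context() hunk above encodes a small protocol: try to run __perf_remove_from_context() on the task's CPU via task_function_call(); if the task was not running, take ctx->lock and re-check ctx->is_active, because the task may have been scheduled in meanwhile; only when it is still inactive is it safe to delete the event directly under the lock. The control flow, reduced to stubs below; the helpers, the "context" struct and the termination hack are placeholders, not the perf API.

#include <stdbool.h>
#include <stdio.h>

struct ctx { bool is_active; };

/* Pretend cross-CPU call: returns true if the remote handler actually ran. */
static bool run_on_tasks_cpu(struct ctx *c)
{
    (void)c;
    return false;                 /* simulate "task was not running" */
}

static void lock(struct ctx *c)   { (void)c; }
static void unlock(struct ctx *c) { (void)c; }

static void remove_event(struct ctx *c)
{
    int tries = 0;

retry:
    if (run_on_tasks_cpu(c))      /* fast path: the remote handler did the removal */
        return;

    lock(c);
    if (c->is_active) {           /* task got scheduled in after all: retry */
        unlock(c);
        c->is_active = (++tries < 3);   /* contrived: let the demo terminate */
        goto retry;
    }
    /* The task is not running and cannot be scheduled in while we hold the
     * lock, so the list manipulation is safe right here. */
    printf("removed directly after %d retr%s\n", tries, tries == 1 ? "y" : "ies");
    unlock(c);
}

int main(void)
{
    struct ctx c = { .is_active = true };

    remove_event(&c);
    return 0;
}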
@@ -708,9 +1182,12 @@ static void __perf_event_disable(void *info)
708 /* 1182 /*
709 * If this is a per-task event, need to check whether this 1183 * If this is a per-task event, need to check whether this
710 * event's task is the current task on this cpu. 1184 * event's task is the current task on this cpu.
1185 *
1186 * Can trigger due to concurrent perf_event_context_sched_out()
1187 * flipping contexts around.
711 */ 1188 */
712 if (ctx->task && cpuctx->task_ctx != ctx) 1189 if (ctx->task && cpuctx->task_ctx != ctx)
713 return; 1190 return -EINVAL;
714 1191
715 raw_spin_lock(&ctx->lock); 1192 raw_spin_lock(&ctx->lock);
716 1193
@@ -720,6 +1197,7 @@ static void __perf_event_disable(void *info)
720 */ 1197 */
721 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1198 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
722 update_context_time(ctx); 1199 update_context_time(ctx);
1200 update_cgrp_time_from_event(event);
723 update_group_times(event); 1201 update_group_times(event);
724 if (event == event->group_leader) 1202 if (event == event->group_leader)
725 group_sched_out(event, cpuctx, ctx); 1203 group_sched_out(event, cpuctx, ctx);
@@ -729,6 +1207,8 @@ static void __perf_event_disable(void *info)
729 } 1207 }
730 1208
731 raw_spin_unlock(&ctx->lock); 1209 raw_spin_unlock(&ctx->lock);
1210
1211 return 0;
732} 1212}
733 1213
734/* 1214/*
@@ -753,13 +1233,13 @@ void perf_event_disable(struct perf_event *event)
753 /* 1233 /*
754 * Disable the event on the cpu that it's on 1234 * Disable the event on the cpu that it's on
755 */ 1235 */
756 smp_call_function_single(event->cpu, __perf_event_disable, 1236 cpu_function_call(event->cpu, __perf_event_disable, event);
757 event, 1);
758 return; 1237 return;
759 } 1238 }
760 1239
761retry: 1240retry:
762 task_oncpu_function_call(task, __perf_event_disable, event); 1241 if (!task_function_call(task, __perf_event_disable, event))
1242 return;
763 1243
764 raw_spin_lock_irq(&ctx->lock); 1244 raw_spin_lock_irq(&ctx->lock);
765 /* 1245 /*
@@ -767,6 +1247,11 @@ retry:
767 */ 1247 */
768 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1248 if (event->state == PERF_EVENT_STATE_ACTIVE) {
769 raw_spin_unlock_irq(&ctx->lock); 1249 raw_spin_unlock_irq(&ctx->lock);
1250 /*
1251 * Reload the task pointer, it might have been changed by
1252 * a concurrent perf_event_context_sched_out().
1253 */
1254 task = ctx->task;
770 goto retry; 1255 goto retry;
771 } 1256 }
772 1257
@@ -778,10 +1263,48 @@ retry:
778 update_group_times(event); 1263 update_group_times(event);
779 event->state = PERF_EVENT_STATE_OFF; 1264 event->state = PERF_EVENT_STATE_OFF;
780 } 1265 }
781
782 raw_spin_unlock_irq(&ctx->lock); 1266 raw_spin_unlock_irq(&ctx->lock);
783} 1267}
784 1268
1269static void perf_set_shadow_time(struct perf_event *event,
1270 struct perf_event_context *ctx,
1271 u64 tstamp)
1272{
1273 /*
1274 * use the correct time source for the time snapshot
1275 *
1276 * We could get by without this by leveraging the
1277 * fact that to get to this function, the caller
1278 * has most likely already called update_context_time()
1279 * and update_cgrp_time_xx() and thus both timestamp
1280 * are identical (or very close). Given that tstamp is,
1281 * already adjusted for cgroup, we could say that:
1282 * tstamp - ctx->timestamp
1283 * is equivalent to
1284 * tstamp - cgrp->timestamp.
1285 *
1286 * Then, in perf_output_read(), the calculation would
1287 * work with no changes because:
1288 * - event is guaranteed scheduled in
1289 * - no scheduled out in between
1290 * - thus the timestamp would be the same
1291 *
1292 * But this is a bit hairy.
1293 *
1294 * So instead, we have an explicit cgroup call to remain
1295	 * within the time source all along. We believe it
1296 * is cleaner and simpler to understand.
1297 */
1298 if (is_cgroup_event(event))
1299 perf_cgroup_set_shadow_time(event, tstamp);
1300 else
1301 event->shadow_ctx_time = tstamp - ctx->timestamp;
1302}
1303
1304#define MAX_INTERRUPTS (~0ULL)
1305
1306static void perf_log_throttle(struct perf_event *event, int enable);
1307
785static int 1308static int
786event_sched_in(struct perf_event *event, 1309event_sched_in(struct perf_event *event,
787 struct perf_cpu_context *cpuctx, 1310 struct perf_cpu_context *cpuctx,
@@ -794,6 +1317,17 @@ event_sched_in(struct perf_event *event,
794 1317
795 event->state = PERF_EVENT_STATE_ACTIVE; 1318 event->state = PERF_EVENT_STATE_ACTIVE;
796 event->oncpu = smp_processor_id(); 1319 event->oncpu = smp_processor_id();
1320
1321 /*
1322 * Unthrottle events, since we scheduled we might have missed several
1323 * ticks already, also for a heavily scheduling task there is little
1324 * guarantee it'll get a tick in a timely manner.
1325 */
1326 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1327 perf_log_throttle(event, 1);
1328 event->hw.interrupts = 0;
1329 }
1330
797 /* 1331 /*
798 * The new state must be visible before we turn it on in the hardware: 1332 * The new state must be visible before we turn it on in the hardware:
799 */ 1333 */
@@ -807,7 +1341,7 @@ event_sched_in(struct perf_event *event,
807 1341
808 event->tstamp_running += tstamp - event->tstamp_stopped; 1342 event->tstamp_running += tstamp - event->tstamp_stopped;
809 1343
810 event->shadow_ctx_time = tstamp - ctx->timestamp; 1344 perf_set_shadow_time(event, ctx, tstamp);
811 1345
812 if (!is_software_event(event)) 1346 if (!is_software_event(event))
813 cpuctx->active_oncpu++; 1347 cpuctx->active_oncpu++;
@@ -928,12 +1462,15 @@ static void add_event_to_ctx(struct perf_event *event,
928 event->tstamp_stopped = tstamp; 1462 event->tstamp_stopped = tstamp;
929} 1463}
930 1464
1465static void perf_event_context_sched_in(struct perf_event_context *ctx,
1466 struct task_struct *tsk);
1467
931/* 1468/*
932 * Cross CPU call to install and enable a performance event 1469 * Cross CPU call to install and enable a performance event
933 * 1470 *
934 * Must be called with ctx->mutex held 1471 * Must be called with ctx->mutex held
935 */ 1472 */
936static void __perf_install_in_context(void *info) 1473static int __perf_install_in_context(void *info)
937{ 1474{
938 struct perf_event *event = info; 1475 struct perf_event *event = info;
939 struct perf_event_context *ctx = event->ctx; 1476 struct perf_event_context *ctx = event->ctx;
@@ -942,21 +1479,22 @@ static void __perf_install_in_context(void *info)
942 int err; 1479 int err;
943 1480
944 /* 1481 /*
945 * If this is a task context, we need to check whether it is 1482 * In case we're installing a new context to an already running task,
946 * the current task context of this cpu. If not it has been 1483 * could also happen before perf_event_task_sched_in() on architectures
947 * scheduled out before the smp call arrived. 1484 * which do context switches with IRQs enabled.
948 * Or possibly this is the right context but it isn't
949 * on this cpu because it had no events.
950 */ 1485 */
951 if (ctx->task && cpuctx->task_ctx != ctx) { 1486 if (ctx->task && !cpuctx->task_ctx)
952 if (cpuctx->task_ctx || ctx->task != current) 1487 perf_event_context_sched_in(ctx, ctx->task);
953 return;
954 cpuctx->task_ctx = ctx;
955 }
956 1488
957 raw_spin_lock(&ctx->lock); 1489 raw_spin_lock(&ctx->lock);
958 ctx->is_active = 1; 1490 ctx->is_active = 1;
959 update_context_time(ctx); 1491 update_context_time(ctx);
1492 /*
1493 * update cgrp time only if current cgrp
1494 * matches event->cgrp. Must be done before
1495 * calling add_event_to_ctx()
1496 */
1497 update_cgrp_time_from_event(event);
960 1498
961 add_event_to_ctx(event, ctx); 1499 add_event_to_ctx(event, ctx);
962 1500
@@ -997,6 +1535,8 @@ static void __perf_install_in_context(void *info)
997 1535
998unlock: 1536unlock:
999 raw_spin_unlock(&ctx->lock); 1537 raw_spin_unlock(&ctx->lock);
1538
1539 return 0;
1000} 1540}
1001 1541
1002/* 1542/*
@@ -1008,8 +1548,6 @@ unlock:
1008 * If the event is attached to a task which is on a CPU we use a smp 1548 * If the event is attached to a task which is on a CPU we use a smp
1009 * call to enable it in the task context. The task might have been 1549 * call to enable it in the task context. The task might have been
1010 * scheduled away, but we check this in the smp call again. 1550 * scheduled away, but we check this in the smp call again.
1011 *
1012 * Must be called with ctx->mutex held.
1013 */ 1551 */
1014static void 1552static void
1015perf_install_in_context(struct perf_event_context *ctx, 1553perf_install_in_context(struct perf_event_context *ctx,
@@ -1018,6 +1556,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1018{ 1556{
1019 struct task_struct *task = ctx->task; 1557 struct task_struct *task = ctx->task;
1020 1558
1559 lockdep_assert_held(&ctx->mutex);
1560
1021 event->ctx = ctx; 1561 event->ctx = ctx;
1022 1562
1023 if (!task) { 1563 if (!task) {
@@ -1025,31 +1565,29 @@ perf_install_in_context(struct perf_event_context *ctx,
1025 * Per cpu events are installed via an smp call and 1565 * Per cpu events are installed via an smp call and
1026 * the install is always successful. 1566 * the install is always successful.
1027 */ 1567 */
1028 smp_call_function_single(cpu, __perf_install_in_context, 1568 cpu_function_call(cpu, __perf_install_in_context, event);
1029 event, 1);
1030 return; 1569 return;
1031 } 1570 }
1032 1571
1033retry: 1572retry:
1034 task_oncpu_function_call(task, __perf_install_in_context, 1573 if (!task_function_call(task, __perf_install_in_context, event))
1035 event); 1574 return;
1036 1575
1037 raw_spin_lock_irq(&ctx->lock); 1576 raw_spin_lock_irq(&ctx->lock);
1038 /* 1577 /*
1039 * we need to retry the smp call. 1578 * If we failed to find a running task, but find the context active now
1579 * that we've acquired the ctx->lock, retry.
1040 */ 1580 */
1041 if (ctx->is_active && list_empty(&event->group_entry)) { 1581 if (ctx->is_active) {
1042 raw_spin_unlock_irq(&ctx->lock); 1582 raw_spin_unlock_irq(&ctx->lock);
1043 goto retry; 1583 goto retry;
1044 } 1584 }
1045 1585
1046 /* 1586 /*
1047 * The lock prevents that this context is scheduled in so we 1587 * Since the task isn't running, it's safe to add the event; holding
1048 * can add the event safely, if it the call above did not 1588 * the ctx->lock ensures the task won't get scheduled in.
1049 * succeed.
1050 */ 1589 */
1051 if (list_empty(&event->group_entry)) 1590 add_event_to_ctx(event, ctx);
1052 add_event_to_ctx(event, ctx);
1053 raw_spin_unlock_irq(&ctx->lock); 1591 raw_spin_unlock_irq(&ctx->lock);
1054} 1592}
1055 1593
@@ -1078,7 +1616,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
1078/* 1616/*
1079 * Cross CPU call to enable a performance event 1617 * Cross CPU call to enable a performance event
1080 */ 1618 */
1081static void __perf_event_enable(void *info) 1619static int __perf_event_enable(void *info)
1082{ 1620{
1083 struct perf_event *event = info; 1621 struct perf_event *event = info;
1084 struct perf_event_context *ctx = event->ctx; 1622 struct perf_event_context *ctx = event->ctx;
@@ -1086,26 +1624,27 @@ static void __perf_event_enable(void *info)
1086 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1624 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1087 int err; 1625 int err;
1088 1626
1089 /* 1627 if (WARN_ON_ONCE(!ctx->is_active))
1090 * If this is a per-task event, need to check whether this 1628 return -EINVAL;
1091 * event's task is the current task on this cpu.
1092 */
1093 if (ctx->task && cpuctx->task_ctx != ctx) {
1094 if (cpuctx->task_ctx || ctx->task != current)
1095 return;
1096 cpuctx->task_ctx = ctx;
1097 }
1098 1629
1099 raw_spin_lock(&ctx->lock); 1630 raw_spin_lock(&ctx->lock);
1100 ctx->is_active = 1;
1101 update_context_time(ctx); 1631 update_context_time(ctx);
1102 1632
1103 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1633 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1104 goto unlock; 1634 goto unlock;
1635
1636 /*
1637 * set current task's cgroup time reference point
1638 */
1639 perf_cgroup_set_timestamp(current, ctx);
1640
1105 __perf_event_mark_enabled(event, ctx); 1641 __perf_event_mark_enabled(event, ctx);
1106 1642
1107 if (!event_filter_match(event)) 1643 if (!event_filter_match(event)) {
1644 if (is_cgroup_event(event))
1645 perf_cgroup_defer_enabled(event);
1108 goto unlock; 1646 goto unlock;
1647 }
1109 1648
1110 /* 1649 /*
1111 * If the event is in a group and isn't the group leader, 1650 * If the event is in a group and isn't the group leader,
@@ -1138,6 +1677,8 @@ static void __perf_event_enable(void *info)
1138 1677
1139unlock: 1678unlock:
1140 raw_spin_unlock(&ctx->lock); 1679 raw_spin_unlock(&ctx->lock);
1680
1681 return 0;
1141} 1682}
1142 1683
1143/* 1684/*
@@ -1158,8 +1699,7 @@ void perf_event_enable(struct perf_event *event)
1158 /* 1699 /*
1159 * Enable the event on the cpu that it's on 1700 * Enable the event on the cpu that it's on
1160 */ 1701 */
1161 smp_call_function_single(event->cpu, __perf_event_enable, 1702 cpu_function_call(event->cpu, __perf_event_enable, event);
1162 event, 1);
1163 return; 1703 return;
1164 } 1704 }
1165 1705
@@ -1178,8 +1718,15 @@ void perf_event_enable(struct perf_event *event)
1178 event->state = PERF_EVENT_STATE_OFF; 1718 event->state = PERF_EVENT_STATE_OFF;
1179 1719
1180retry: 1720retry:
1721 if (!ctx->is_active) {
1722 __perf_event_mark_enabled(event, ctx);
1723 goto out;
1724 }
1725
1181 raw_spin_unlock_irq(&ctx->lock); 1726 raw_spin_unlock_irq(&ctx->lock);
1182 task_oncpu_function_call(task, __perf_event_enable, event); 1727
1728 if (!task_function_call(task, __perf_event_enable, event))
1729 return;
1183 1730
1184 raw_spin_lock_irq(&ctx->lock); 1731 raw_spin_lock_irq(&ctx->lock);
1185 1732
@@ -1187,15 +1734,14 @@ retry:
1187 * If the context is active and the event is still off, 1734 * If the context is active and the event is still off,
1188 * we need to retry the cross-call. 1735 * we need to retry the cross-call.
1189 */ 1736 */
1190 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1737 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1738 /*
1739 * task could have been flipped by a concurrent
1740 * perf_event_context_sched_out()
1741 */
1742 task = ctx->task;
1191 goto retry; 1743 goto retry;
1192 1744 }
1193 /*
1194 * Since we have the lock this context can't be scheduled
1195 * in, so we can change the state safely.
1196 */
1197 if (event->state == PERF_EVENT_STATE_OFF)
1198 __perf_event_mark_enabled(event, ctx);
1199 1745
1200out: 1746out:
1201 raw_spin_unlock_irq(&ctx->lock); 1747 raw_spin_unlock_irq(&ctx->lock);
@@ -1227,6 +1773,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1227 if (likely(!ctx->nr_events)) 1773 if (likely(!ctx->nr_events))
1228 goto out; 1774 goto out;
1229 update_context_time(ctx); 1775 update_context_time(ctx);
1776 update_cgrp_time_from_cpuctx(cpuctx);
1230 1777
1231 if (!ctx->nr_active) 1778 if (!ctx->nr_active)
1232 goto out; 1779 goto out;
@@ -1339,8 +1886,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1339 } 1886 }
1340} 1887}
1341 1888
1342void perf_event_context_sched_out(struct task_struct *task, int ctxn, 1889static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1343 struct task_struct *next) 1890 struct task_struct *next)
1344{ 1891{
1345 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 1892 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1346 struct perf_event_context *next_ctx; 1893 struct perf_event_context *next_ctx;
@@ -1416,6 +1963,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1416 1963
1417 for_each_task_context_nr(ctxn) 1964 for_each_task_context_nr(ctxn)
1418 perf_event_context_sched_out(task, ctxn, next); 1965 perf_event_context_sched_out(task, ctxn, next);
1966
1967 /*
1968 * if cgroup events exist on this CPU, then we need
1969 * to check if we have to switch out PMU state.
            1970 * cgroup events are system-wide mode only
1971 */
1972 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1973 perf_cgroup_sched_out(task);
1419} 1974}
1420 1975
1421static void task_ctx_sched_out(struct perf_event_context *ctx, 1976static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1454,6 +2009,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1454 if (!event_filter_match(event)) 2009 if (!event_filter_match(event))
1455 continue; 2010 continue;
1456 2011
2012 /* may need to reset tstamp_enabled */
2013 if (is_cgroup_event(event))
2014 perf_cgroup_mark_enabled(event, ctx);
2015
1457 if (group_can_go_on(event, cpuctx, 1)) 2016 if (group_can_go_on(event, cpuctx, 1))
1458 group_sched_in(event, cpuctx, ctx); 2017 group_sched_in(event, cpuctx, ctx);
1459 2018
@@ -1486,6 +2045,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1486 if (!event_filter_match(event)) 2045 if (!event_filter_match(event))
1487 continue; 2046 continue;
1488 2047
2048 /* may need to reset tstamp_enabled */
2049 if (is_cgroup_event(event))
2050 perf_cgroup_mark_enabled(event, ctx);
2051
1489 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2052 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1490 if (group_sched_in(event, cpuctx, ctx)) 2053 if (group_sched_in(event, cpuctx, ctx))
1491 can_add_hw = 0; 2054 can_add_hw = 0;
@@ -1496,15 +2059,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1496static void 2059static void
1497ctx_sched_in(struct perf_event_context *ctx, 2060ctx_sched_in(struct perf_event_context *ctx,
1498 struct perf_cpu_context *cpuctx, 2061 struct perf_cpu_context *cpuctx,
1499 enum event_type_t event_type) 2062 enum event_type_t event_type,
2063 struct task_struct *task)
1500{ 2064{
2065 u64 now;
2066
1501 raw_spin_lock(&ctx->lock); 2067 raw_spin_lock(&ctx->lock);
1502 ctx->is_active = 1; 2068 ctx->is_active = 1;
1503 if (likely(!ctx->nr_events)) 2069 if (likely(!ctx->nr_events))
1504 goto out; 2070 goto out;
1505 2071
1506 ctx->timestamp = perf_clock(); 2072 now = perf_clock();
1507 2073 ctx->timestamp = now;
2074 perf_cgroup_set_timestamp(task, ctx);
1508 /* 2075 /*
1509 * First go through the list and put on any pinned groups 2076 * First go through the list and put on any pinned groups
1510 * in order to give them the best chance of going on. 2077 * in order to give them the best chance of going on.
@@ -1521,11 +2088,12 @@ out:
1521} 2088}
1522 2089
1523static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2090static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1524 enum event_type_t event_type) 2091 enum event_type_t event_type,
2092 struct task_struct *task)
1525{ 2093{
1526 struct perf_event_context *ctx = &cpuctx->ctx; 2094 struct perf_event_context *ctx = &cpuctx->ctx;
1527 2095
1528 ctx_sched_in(ctx, cpuctx, event_type); 2096 ctx_sched_in(ctx, cpuctx, event_type, task);
1529} 2097}
1530 2098
1531static void task_ctx_sched_in(struct perf_event_context *ctx, 2099static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1533,15 +2101,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1533{ 2101{
1534 struct perf_cpu_context *cpuctx; 2102 struct perf_cpu_context *cpuctx;
1535 2103
1536 cpuctx = __get_cpu_context(ctx); 2104 cpuctx = __get_cpu_context(ctx);
1537 if (cpuctx->task_ctx == ctx) 2105 if (cpuctx->task_ctx == ctx)
1538 return; 2106 return;
1539 2107
1540 ctx_sched_in(ctx, cpuctx, event_type); 2108 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1541 cpuctx->task_ctx = ctx; 2109 cpuctx->task_ctx = ctx;
1542} 2110}
1543 2111
1544void perf_event_context_sched_in(struct perf_event_context *ctx) 2112static void perf_event_context_sched_in(struct perf_event_context *ctx,
2113 struct task_struct *task)
1545{ 2114{
1546 struct perf_cpu_context *cpuctx; 2115 struct perf_cpu_context *cpuctx;
1547 2116
@@ -1557,9 +2126,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
1557 */ 2126 */
1558 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2127 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1559 2128
1560 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2129 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1561 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2130 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1562 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2131 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1563 2132
1564 cpuctx->task_ctx = ctx; 2133 cpuctx->task_ctx = ctx;
1565 2134
@@ -1592,14 +2161,17 @@ void __perf_event_task_sched_in(struct task_struct *task)
1592 if (likely(!ctx)) 2161 if (likely(!ctx))
1593 continue; 2162 continue;
1594 2163
1595 perf_event_context_sched_in(ctx); 2164 perf_event_context_sched_in(ctx, task);
1596 } 2165 }
2166 /*
2167 * if cgroup events exist on this CPU, then we need
2168 * to check if we have to switch in PMU state.
2169 * cgroup event are system-wide mode only
            2170 * cgroup events are system-wide mode only
2171 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2172 perf_cgroup_sched_in(task);
1597} 2173}
1598 2174
1599#define MAX_INTERRUPTS (~0ULL)
1600
1601static void perf_log_throttle(struct perf_event *event, int enable);
1602
1603static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2175static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1604{ 2176{
1605 u64 frequency = event->attr.sample_freq; 2177 u64 frequency = event->attr.sample_freq;
@@ -1627,7 +2199,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1627 * Reduce accuracy by one bit such that @a and @b converge 2199 * Reduce accuracy by one bit such that @a and @b converge
1628 * to a similar magnitude. 2200 * to a similar magnitude.
1629 */ 2201 */
1630#define REDUCE_FLS(a, b) \ 2202#define REDUCE_FLS(a, b) \
1631do { \ 2203do { \
1632 if (a##_fls > b##_fls) { \ 2204 if (a##_fls > b##_fls) { \
1633 a >>= 1; \ 2205 a >>= 1; \
@@ -1797,7 +2369,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1797 if (ctx) 2369 if (ctx)
1798 rotate_ctx(ctx); 2370 rotate_ctx(ctx);
1799 2371
1800 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2372 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1801 if (ctx) 2373 if (ctx)
1802 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2374 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1803 2375
@@ -1852,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1852 if (!ctx || !ctx->nr_events) 2424 if (!ctx || !ctx->nr_events)
1853 goto out; 2425 goto out;
1854 2426
2427 /*
2428 * We must ctxsw out cgroup events to avoid conflict
2429 * when invoking perf_task_event_sched_in() later on
2430 * in this function. Otherwise we end up trying to
2431 * ctxswin cgroup events which are already scheduled
2432 * in.
2433 */
2434 perf_cgroup_sched_out(current);
1855 task_ctx_sched_out(ctx, EVENT_ALL); 2435 task_ctx_sched_out(ctx, EVENT_ALL);
1856 2436
1857 raw_spin_lock(&ctx->lock); 2437 raw_spin_lock(&ctx->lock);
@@ -1876,7 +2456,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1876 2456
1877 raw_spin_unlock(&ctx->lock); 2457 raw_spin_unlock(&ctx->lock);
1878 2458
1879 perf_event_context_sched_in(ctx); 2459 /*
2460 * Also calls ctxswin for cgroup events, if any:
2461 */
2462 perf_event_context_sched_in(ctx, ctx->task);
1880out: 2463out:
1881 local_irq_restore(flags); 2464 local_irq_restore(flags);
1882} 2465}
@@ -1901,8 +2484,10 @@ static void __perf_event_read(void *info)
1901 return; 2484 return;
1902 2485
1903 raw_spin_lock(&ctx->lock); 2486 raw_spin_lock(&ctx->lock);
1904 if (ctx->is_active) 2487 if (ctx->is_active) {
1905 update_context_time(ctx); 2488 update_context_time(ctx);
2489 update_cgrp_time_from_event(event);
2490 }
1906 update_event_times(event); 2491 update_event_times(event);
1907 if (event->state == PERF_EVENT_STATE_ACTIVE) 2492 if (event->state == PERF_EVENT_STATE_ACTIVE)
1908 event->pmu->read(event); 2493 event->pmu->read(event);
@@ -1933,8 +2518,10 @@ static u64 perf_event_read(struct perf_event *event)
1933 * (e.g., thread is blocked), in that case 2518 * (e.g., thread is blocked), in that case
1934 * we cannot update context time 2519 * we cannot update context time
1935 */ 2520 */
1936 if (ctx->is_active) 2521 if (ctx->is_active) {
1937 update_context_time(ctx); 2522 update_context_time(ctx);
2523 update_cgrp_time_from_event(event);
2524 }
1938 update_event_times(event); 2525 update_event_times(event);
1939 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2526 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1940 } 2527 }
@@ -2213,6 +2800,9 @@ errout:
2213 2800
2214} 2801}
2215 2802
2803/*
2804 * Returns a matching context with refcount and pincount.
2805 */
2216static struct perf_event_context * 2806static struct perf_event_context *
2217find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 2807find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2218{ 2808{
@@ -2237,6 +2827,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2237 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 2827 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2238 ctx = &cpuctx->ctx; 2828 ctx = &cpuctx->ctx;
2239 get_ctx(ctx); 2829 get_ctx(ctx);
2830 ++ctx->pin_count;
2240 2831
2241 return ctx; 2832 return ctx;
2242 } 2833 }
@@ -2250,6 +2841,7 @@ retry:
2250 ctx = perf_lock_task_context(task, ctxn, &flags); 2841 ctx = perf_lock_task_context(task, ctxn, &flags);
2251 if (ctx) { 2842 if (ctx) {
2252 unclone_ctx(ctx); 2843 unclone_ctx(ctx);
2844 ++ctx->pin_count;
2253 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2845 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2254 } 2846 }
2255 2847
@@ -2271,8 +2863,10 @@ retry:
2271 err = -ESRCH; 2863 err = -ESRCH;
2272 else if (task->perf_event_ctxp[ctxn]) 2864 else if (task->perf_event_ctxp[ctxn])
2273 err = -EAGAIN; 2865 err = -EAGAIN;
2274 else 2866 else {
2867 ++ctx->pin_count;
2275 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2868 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2869 }
2276 mutex_unlock(&task->perf_event_mutex); 2870 mutex_unlock(&task->perf_event_mutex);
2277 2871
2278 if (unlikely(err)) { 2872 if (unlikely(err)) {
@@ -2312,7 +2906,7 @@ static void free_event(struct perf_event *event)
2312 2906
2313 if (!event->parent) { 2907 if (!event->parent) {
2314 if (event->attach_state & PERF_ATTACH_TASK) 2908 if (event->attach_state & PERF_ATTACH_TASK)
2315 jump_label_dec(&perf_task_events); 2909 jump_label_dec(&perf_sched_events);
2316 if (event->attr.mmap || event->attr.mmap_data) 2910 if (event->attr.mmap || event->attr.mmap_data)
2317 atomic_dec(&nr_mmap_events); 2911 atomic_dec(&nr_mmap_events);
2318 if (event->attr.comm) 2912 if (event->attr.comm)
@@ -2321,6 +2915,10 @@ static void free_event(struct perf_event *event)
2321 atomic_dec(&nr_task_events); 2915 atomic_dec(&nr_task_events);
2322 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 2916 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2323 put_callchain_buffers(); 2917 put_callchain_buffers();
2918 if (is_cgroup_event(event)) {
2919 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2920 jump_label_dec(&perf_sched_events);
2921 }
2324 } 2922 }
2325 2923
2326 if (event->buffer) { 2924 if (event->buffer) {
@@ -2328,6 +2926,9 @@ static void free_event(struct perf_event *event)
2328 event->buffer = NULL; 2926 event->buffer = NULL;
2329 } 2927 }
2330 2928
2929 if (is_cgroup_event(event))
2930 perf_detach_cgroup(event);
2931
2331 if (event->destroy) 2932 if (event->destroy)
2332 event->destroy(event); 2933 event->destroy(event);
2333 2934
@@ -4395,26 +4996,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4395 if (unlikely(!is_sampling_event(event))) 4996 if (unlikely(!is_sampling_event(event)))
4396 return 0; 4997 return 0;
4397 4998
4398 if (!throttle) { 4999 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4399 hwc->interrupts++; 5000 if (throttle) {
4400 } else { 5001 hwc->interrupts = MAX_INTERRUPTS;
4401 if (hwc->interrupts != MAX_INTERRUPTS) { 5002 perf_log_throttle(event, 0);
4402 hwc->interrupts++;
4403 if (HZ * hwc->interrupts >
4404 (u64)sysctl_perf_event_sample_rate) {
4405 hwc->interrupts = MAX_INTERRUPTS;
4406 perf_log_throttle(event, 0);
4407 ret = 1;
4408 }
4409 } else {
4410 /*
4411 * Keep re-disabling events even though on the previous
4412 * pass we disabled it - just in case we raced with a
4413 * sched-in and the event got enabled again:
4414 */
4415 ret = 1; 5003 ret = 1;
4416 } 5004 }
4417 } 5005 } else
5006 hwc->interrupts++;
4418 5007
4419 if (event->attr.freq) { 5008 if (event->attr.freq) {
4420 u64 now = perf_clock(); 5009 u64 now = perf_clock();
@@ -4556,7 +5145,7 @@ static int perf_exclude_event(struct perf_event *event,
4556 struct pt_regs *regs) 5145 struct pt_regs *regs)
4557{ 5146{
4558 if (event->hw.state & PERF_HES_STOPPED) 5147 if (event->hw.state & PERF_HES_STOPPED)
4559 return 0; 5148 return 1;
4560 5149
4561 if (regs) { 5150 if (regs) {
4562 if (event->attr.exclude_user && user_mode(regs)) 5151 if (event->attr.exclude_user && user_mode(regs))
@@ -4912,6 +5501,8 @@ static int perf_tp_event_match(struct perf_event *event,
4912 struct perf_sample_data *data, 5501 struct perf_sample_data *data,
4913 struct pt_regs *regs) 5502 struct pt_regs *regs)
4914{ 5503{
5504 if (event->hw.state & PERF_HES_STOPPED)
5505 return 0;
4915 /* 5506 /*
4916 * All tracepoints are from kernel-space. 5507 * All tracepoints are from kernel-space.
4917 */ 5508 */
@@ -5051,6 +5642,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5051 u64 period; 5642 u64 period;
5052 5643
5053 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 5644 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5645
5646 if (event->state != PERF_EVENT_STATE_ACTIVE)
5647 return HRTIMER_NORESTART;
5648
5054 event->pmu->read(event); 5649 event->pmu->read(event);
5055 5650
5056 perf_sample_data_init(&data, 0); 5651 perf_sample_data_init(&data, 0);
@@ -5077,9 +5672,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
5077 if (!is_sampling_event(event)) 5672 if (!is_sampling_event(event))
5078 return; 5673 return;
5079 5674
5080 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5081 hwc->hrtimer.function = perf_swevent_hrtimer;
5082
5083 period = local64_read(&hwc->period_left); 5675 period = local64_read(&hwc->period_left);
5084 if (period) { 5676 if (period) {
5085 if (period < 0) 5677 if (period < 0)
@@ -5106,6 +5698,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5106 } 5698 }
5107} 5699}
5108 5700
5701static void perf_swevent_init_hrtimer(struct perf_event *event)
5702{
5703 struct hw_perf_event *hwc = &event->hw;
5704
5705 if (!is_sampling_event(event))
5706 return;
5707
5708 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5709 hwc->hrtimer.function = perf_swevent_hrtimer;
5710
5711 /*
5712 * Since hrtimers have a fixed rate, we can do a static freq->period
5713 * mapping and avoid the whole period adjust feedback stuff.
5714 */
5715 if (event->attr.freq) {
5716 long freq = event->attr.sample_freq;
5717
5718 event->attr.sample_period = NSEC_PER_SEC / freq;
5719 hwc->sample_period = event->attr.sample_period;
5720 local64_set(&hwc->period_left, hwc->sample_period);
5721 event->attr.freq = 0;
5722 }
5723}
5724
5109/* 5725/*
5110 * Software event: cpu wall time clock 5726 * Software event: cpu wall time clock
5111 */ 5727 */
@@ -5158,6 +5774,8 @@ static int cpu_clock_event_init(struct perf_event *event)
5158 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5774 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5159 return -ENOENT; 5775 return -ENOENT;
5160 5776
5777 perf_swevent_init_hrtimer(event);
5778
5161 return 0; 5779 return 0;
5162} 5780}
5163 5781
@@ -5213,16 +5831,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
5213 5831
5214static void task_clock_event_read(struct perf_event *event) 5832static void task_clock_event_read(struct perf_event *event)
5215{ 5833{
5216 u64 time; 5834 u64 now = perf_clock();
5217 5835 u64 delta = now - event->ctx->timestamp;
5218 if (!in_nmi()) { 5836 u64 time = event->ctx->time + delta;
5219 update_context_time(event->ctx);
5220 time = event->ctx->time;
5221 } else {
5222 u64 now = perf_clock();
5223 u64 delta = now - event->ctx->timestamp;
5224 time = event->ctx->time + delta;
5225 }
5226 5837
5227 task_clock_event_update(event, time); 5838 task_clock_event_update(event, time);
5228} 5839}
@@ -5235,6 +5846,8 @@ static int task_clock_event_init(struct perf_event *event)
5235 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5846 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5236 return -ENOENT; 5847 return -ENOENT;
5237 5848
5849 perf_swevent_init_hrtimer(event);
5850
5238 return 0; 5851 return 0;
5239} 5852}
5240 5853
@@ -5506,17 +6119,22 @@ struct pmu *perf_init_event(struct perf_event *event)
5506{ 6119{
5507 struct pmu *pmu = NULL; 6120 struct pmu *pmu = NULL;
5508 int idx; 6121 int idx;
6122 int ret;
5509 6123
5510 idx = srcu_read_lock(&pmus_srcu); 6124 idx = srcu_read_lock(&pmus_srcu);
5511 6125
5512 rcu_read_lock(); 6126 rcu_read_lock();
5513 pmu = idr_find(&pmu_idr, event->attr.type); 6127 pmu = idr_find(&pmu_idr, event->attr.type);
5514 rcu_read_unlock(); 6128 rcu_read_unlock();
5515 if (pmu) 6129 if (pmu) {
6130 ret = pmu->event_init(event);
6131 if (ret)
6132 pmu = ERR_PTR(ret);
5516 goto unlock; 6133 goto unlock;
6134 }
5517 6135
5518 list_for_each_entry_rcu(pmu, &pmus, entry) { 6136 list_for_each_entry_rcu(pmu, &pmus, entry) {
5519 int ret = pmu->event_init(event); 6137 ret = pmu->event_init(event);
5520 if (!ret) 6138 if (!ret)
5521 goto unlock; 6139 goto unlock;
5522 6140
@@ -5642,7 +6260,7 @@ done:
5642 6260
5643 if (!event->parent) { 6261 if (!event->parent) {
5644 if (event->attach_state & PERF_ATTACH_TASK) 6262 if (event->attach_state & PERF_ATTACH_TASK)
5645 jump_label_inc(&perf_task_events); 6263 jump_label_inc(&perf_sched_events);
5646 if (event->attr.mmap || event->attr.mmap_data) 6264 if (event->attr.mmap || event->attr.mmap_data)
5647 atomic_inc(&nr_mmap_events); 6265 atomic_inc(&nr_mmap_events);
5648 if (event->attr.comm) 6266 if (event->attr.comm)
@@ -5817,7 +6435,7 @@ SYSCALL_DEFINE5(perf_event_open,
5817 int err; 6435 int err;
5818 6436
5819 /* for future expandability... */ 6437 /* for future expandability... */
5820 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6438 if (flags & ~PERF_FLAG_ALL)
5821 return -EINVAL; 6439 return -EINVAL;
5822 6440
5823 err = perf_copy_attr(attr_uptr, &attr); 6441 err = perf_copy_attr(attr_uptr, &attr);
@@ -5834,6 +6452,15 @@ SYSCALL_DEFINE5(perf_event_open,
5834 return -EINVAL; 6452 return -EINVAL;
5835 } 6453 }
5836 6454
6455 /*
6456 * In cgroup mode, the pid argument is used to pass the fd
6457 * opened to the cgroup directory in cgroupfs. The cpu argument
6458 * designates the cpu on which to monitor threads from that
6459 * cgroup.
6460 */
6461 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6462 return -EINVAL;
6463
5837 event_fd = get_unused_fd_flags(O_RDWR); 6464 event_fd = get_unused_fd_flags(O_RDWR);
5838 if (event_fd < 0) 6465 if (event_fd < 0)
5839 return event_fd; 6466 return event_fd;
@@ -5851,7 +6478,7 @@ SYSCALL_DEFINE5(perf_event_open,
5851 group_leader = NULL; 6478 group_leader = NULL;
5852 } 6479 }
5853 6480
5854 if (pid != -1) { 6481 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5855 task = find_lively_task_by_vpid(pid); 6482 task = find_lively_task_by_vpid(pid);
5856 if (IS_ERR(task)) { 6483 if (IS_ERR(task)) {
5857 err = PTR_ERR(task); 6484 err = PTR_ERR(task);
@@ -5865,6 +6492,19 @@ SYSCALL_DEFINE5(perf_event_open,
5865 goto err_task; 6492 goto err_task;
5866 } 6493 }
5867 6494
6495 if (flags & PERF_FLAG_PID_CGROUP) {
6496 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6497 if (err)
6498 goto err_alloc;
6499 /*
6500 * one more event:
6501 * - that has cgroup constraint on event->cpu
6502 * - that may need work on context switch
6503 */
6504 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6505 jump_label_inc(&perf_sched_events);
6506 }
6507
5868 /* 6508 /*
5869 * Special case software events and allow them to be part of 6509 * Special case software events and allow them to be part of
5870 * any hardware group. 6510 * any hardware group.
@@ -5903,6 +6543,11 @@ SYSCALL_DEFINE5(perf_event_open,
5903 goto err_alloc; 6543 goto err_alloc;
5904 } 6544 }
5905 6545
6546 if (task) {
6547 put_task_struct(task);
6548 task = NULL;
6549 }
6550
5906 /* 6551 /*
5907 * Look up the group leader (we will attach this event to it): 6552 * Look up the group leader (we will attach this event to it):
5908 */ 6553 */
@@ -5950,10 +6595,10 @@ SYSCALL_DEFINE5(perf_event_open,
5950 struct perf_event_context *gctx = group_leader->ctx; 6595 struct perf_event_context *gctx = group_leader->ctx;
5951 6596
5952 mutex_lock(&gctx->mutex); 6597 mutex_lock(&gctx->mutex);
5953 perf_event_remove_from_context(group_leader); 6598 perf_remove_from_context(group_leader);
5954 list_for_each_entry(sibling, &group_leader->sibling_list, 6599 list_for_each_entry(sibling, &group_leader->sibling_list,
5955 group_entry) { 6600 group_entry) {
5956 perf_event_remove_from_context(sibling); 6601 perf_remove_from_context(sibling);
5957 put_ctx(gctx); 6602 put_ctx(gctx);
5958 } 6603 }
5959 mutex_unlock(&gctx->mutex); 6604 mutex_unlock(&gctx->mutex);
@@ -5976,6 +6621,7 @@ SYSCALL_DEFINE5(perf_event_open,
5976 6621
5977 perf_install_in_context(ctx, event, cpu); 6622 perf_install_in_context(ctx, event, cpu);
5978 ++ctx->generation; 6623 ++ctx->generation;
6624 perf_unpin_context(ctx);
5979 mutex_unlock(&ctx->mutex); 6625 mutex_unlock(&ctx->mutex);
5980 6626
5981 event->owner = current; 6627 event->owner = current;
@@ -6001,6 +6647,7 @@ SYSCALL_DEFINE5(perf_event_open,
6001 return event_fd; 6647 return event_fd;
6002 6648
6003err_context: 6649err_context:
6650 perf_unpin_context(ctx);
6004 put_ctx(ctx); 6651 put_ctx(ctx);
6005err_alloc: 6652err_alloc:
6006 free_event(event); 6653 free_event(event);
@@ -6051,6 +6698,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6051 mutex_lock(&ctx->mutex); 6698 mutex_lock(&ctx->mutex);
6052 perf_install_in_context(ctx, event, cpu); 6699 perf_install_in_context(ctx, event, cpu);
6053 ++ctx->generation; 6700 ++ctx->generation;
6701 perf_unpin_context(ctx);
6054 mutex_unlock(&ctx->mutex); 6702 mutex_unlock(&ctx->mutex);
6055 6703
6056 return event; 6704 return event;
@@ -6102,17 +6750,20 @@ __perf_event_exit_task(struct perf_event *child_event,
6102 struct perf_event_context *child_ctx, 6750 struct perf_event_context *child_ctx,
6103 struct task_struct *child) 6751 struct task_struct *child)
6104{ 6752{
6105 struct perf_event *parent_event; 6753 if (child_event->parent) {
6754 raw_spin_lock_irq(&child_ctx->lock);
6755 perf_group_detach(child_event);
6756 raw_spin_unlock_irq(&child_ctx->lock);
6757 }
6106 6758
6107 perf_event_remove_from_context(child_event); 6759 perf_remove_from_context(child_event);
6108 6760
6109 parent_event = child_event->parent;
6110 /* 6761 /*
6111 * It can happen that parent exits first, and has events 6762 * It can happen that the parent exits first, and has events
6112 * that are still around due to the child reference. These 6763 * that are still around due to the child reference. These
6113 * events need to be zapped - but otherwise linger. 6764 * events need to be zapped.
6114 */ 6765 */
6115 if (parent_event) { 6766 if (child_event->parent) {
6116 sync_child_event(child_event, child); 6767 sync_child_event(child_event, child);
6117 free_event(child_event); 6768 free_event(child_event);
6118 } 6769 }
@@ -6411,7 +7062,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
6411 return 0; 7062 return 0;
6412 } 7063 }
6413 7064
6414 child_ctx = child->perf_event_ctxp[ctxn]; 7065 child_ctx = child->perf_event_ctxp[ctxn];
6415 if (!child_ctx) { 7066 if (!child_ctx) {
6416 /* 7067 /*
6417 * This is executed from the parent task context, so 7068 * This is executed from the parent task context, so
@@ -6526,6 +7177,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6526 mutex_unlock(&parent_ctx->mutex); 7177 mutex_unlock(&parent_ctx->mutex);
6527 7178
6528 perf_unpin_context(parent_ctx); 7179 perf_unpin_context(parent_ctx);
7180 put_ctx(parent_ctx);
6529 7181
6530 return ret; 7182 return ret;
6531} 7183}
@@ -6595,9 +7247,9 @@ static void __perf_event_exit_context(void *__info)
6595 perf_pmu_rotate_stop(ctx->pmu); 7247 perf_pmu_rotate_stop(ctx->pmu);
6596 7248
6597 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7249 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
6598 __perf_event_remove_from_context(event); 7250 __perf_remove_from_context(event);
6599 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7251 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
6600 __perf_event_remove_from_context(event); 7252 __perf_remove_from_context(event);
6601} 7253}
6602 7254
6603static void perf_event_exit_cpu_context(int cpu) 7255static void perf_event_exit_cpu_context(int cpu)
@@ -6721,3 +7373,83 @@ unlock:
6721 return ret; 7373 return ret;
6722} 7374}
6723device_initcall(perf_event_sysfs_init); 7375device_initcall(perf_event_sysfs_init);
7376
7377#ifdef CONFIG_CGROUP_PERF
7378static struct cgroup_subsys_state *perf_cgroup_create(
7379 struct cgroup_subsys *ss, struct cgroup *cont)
7380{
7381 struct perf_cgroup *jc;
7382
7383 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7384 if (!jc)
7385 return ERR_PTR(-ENOMEM);
7386
7387 jc->info = alloc_percpu(struct perf_cgroup_info);
7388 if (!jc->info) {
7389 kfree(jc);
7390 return ERR_PTR(-ENOMEM);
7391 }
7392
7393 return &jc->css;
7394}
7395
7396static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7397 struct cgroup *cont)
7398{
7399 struct perf_cgroup *jc;
7400 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7401 struct perf_cgroup, css);
7402 free_percpu(jc->info);
7403 kfree(jc);
7404}
7405
7406static int __perf_cgroup_move(void *info)
7407{
7408 struct task_struct *task = info;
7409 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7410 return 0;
7411}
7412
7413static void perf_cgroup_move(struct task_struct *task)
7414{
7415 task_function_call(task, __perf_cgroup_move, task);
7416}
7417
7418static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7419 struct cgroup *old_cgrp, struct task_struct *task,
7420 bool threadgroup)
7421{
7422 perf_cgroup_move(task);
7423 if (threadgroup) {
7424 struct task_struct *c;
7425 rcu_read_lock();
7426 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7427 perf_cgroup_move(c);
7428 }
7429 rcu_read_unlock();
7430 }
7431}
7432
7433static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7434 struct cgroup *old_cgrp, struct task_struct *task)
7435{
7436 /*
7437 * cgroup_exit() is called in the copy_process() failure path.
 7438 * Ignore this case since the task hasn't run yet; this avoids
 7439 * trying to poke a half-freed task state from generic code.
7440 */
7441 if (!(task->flags & PF_EXITING))
7442 return;
7443
7444 perf_cgroup_move(task);
7445}
7446
7447struct cgroup_subsys perf_subsys = {
7448 .name = "perf_event",
7449 .subsys_id = perf_subsys_id,
7450 .create = perf_cgroup_create,
7451 .destroy = perf_cgroup_destroy,
7452 .exit = perf_cgroup_exit,
7453 .attach = perf_cgroup_attach,
7454};
7455#endif /* CONFIG_CGROUP_PERF */
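
A minimal userspace sketch of the PERF_FLAG_PID_CGROUP mode added above: as the comment in the perf_event_open() hunk notes, the pid argument carries an fd to a cgroup directory and cpu selects the CPU to monitor. The cgroup mount point and group name below are assumptions for illustration only, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	int cgrp_fd, ev_fd;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* with PERF_FLAG_PID_CGROUP, "pid" is an fd to the cgroup directory */
	cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	if (cgrp_fd < 0)
		return 1;

	/* cpu must be a concrete CPU number: cgroup events are system-wide only */
	ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0 /* cpu */,
			-1 /* group_fd */, PERF_FLAG_PID_CGROUP);
	if (ev_fd < 0)
		perror("perf_event_open");

	return ev_fd < 0 ? 1 : 0;
}
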
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b69584f..57a8346a270e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
217 return -1; 217 return -1;
218} 218}
219 219
220int next_pidmap(struct pid_namespace *pid_ns, int last) 220int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
221{ 221{
222 int offset; 222 int offset;
223 struct pidmap *map, *end; 223 struct pidmap *map, *end;
224 224
225 if (last >= PID_MAX_LIMIT)
226 return -1;
227
225 offset = (last + 1) & BITS_PER_PAGE_MASK; 228 offset = (last + 1) & BITS_PER_PAGE_MASK;
226 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; 229 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
227 end = &pid_ns->pidmap[PIDMAP_ENTRIES]; 230 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
@@ -435,6 +438,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
435 rcu_read_unlock(); 438 rcu_read_unlock();
436 return pid; 439 return pid;
437} 440}
441EXPORT_SYMBOL_GPL(get_task_pid);
438 442
439struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) 443struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
440{ 444{
@@ -446,6 +450,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
446 rcu_read_unlock(); 450 rcu_read_unlock();
447 return result; 451 return result;
448} 452}
453EXPORT_SYMBOL_GPL(get_pid_task);
449 454
450struct pid *find_get_pid(pid_t nr) 455struct pid *find_get_pid(pid_t nr)
451{ 456{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17 18
18#define BITS_PER_PAGE (PAGE_SIZE*8) 19#define BITS_PER_PAGE (PAGE_SIZE*8)
19 20
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
72{ 73{
73 struct pid_namespace *ns; 74 struct pid_namespace *ns;
74 unsigned int level = parent_pid_ns->level + 1; 75 unsigned int level = parent_pid_ns->level + 1;
75 int i; 76 int i, err = -ENOMEM;
76 77
77 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 78 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
78 if (ns == NULL) 79 if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
96 for (i = 1; i < PIDMAP_ENTRIES; i++) 97 for (i = 1; i < PIDMAP_ENTRIES; i++)
97 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 98 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
98 99
100 err = pid_ns_prepare_proc(ns);
101 if (err)
102 goto out_put_parent_pid_ns;
103
99 return ns; 104 return ns;
100 105
106out_put_parent_pid_ns:
107 put_pid_ns(parent_pid_ns);
101out_free_map: 108out_free_map:
102 kfree(ns->pidmap[0].page); 109 kfree(ns->pidmap[0].page);
103out_free: 110out_free:
104 kmem_cache_free(pid_ns_cachep, ns); 111 kmem_cache_free(pid_ns_cachep, ns);
105out: 112out:
106 return ERR_PTR(-ENOMEM); 113 return ERR_PTR(err);
107} 114}
108 115
109static void destroy_pid_namespace(struct pid_namespace *ns) 116static void destroy_pid_namespace(struct pid_namespace *ns)
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index aeaa7f846821..0da058bff8eb 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = {
103 103
104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
105 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
106static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
107 size_t count, loff_t *f_pos);
106static int pm_qos_power_open(struct inode *inode, struct file *filp); 108static int pm_qos_power_open(struct inode *inode, struct file *filp);
107static int pm_qos_power_release(struct inode *inode, struct file *filp); 109static int pm_qos_power_release(struct inode *inode, struct file *filp);
108 110
109static const struct file_operations pm_qos_power_fops = { 111static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 112 .write = pm_qos_power_write,
113 .read = pm_qos_power_read,
111 .open = pm_qos_power_open, 114 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 115 .release = pm_qos_power_release,
113 .llseek = noop_llseek, 116 .llseek = noop_llseek,
@@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
376} 379}
377 380
378 381
382static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
383 size_t count, loff_t *f_pos)
384{
385 s32 value;
386 unsigned long flags;
387 struct pm_qos_object *o;
 388	struct pm_qos_request_list *pm_qos_req = filp->private_data;
389
390 if (!pm_qos_req)
391 return -EINVAL;
392 if (!pm_qos_request_active(pm_qos_req))
393 return -EINVAL;
394
395 o = pm_qos_array[pm_qos_req->pm_qos_class];
396 spin_lock_irqsave(&pm_qos_lock, flags);
397 value = pm_qos_get_value(o);
398 spin_unlock_irqrestore(&pm_qos_lock, flags);
399
400 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
401}
402
379static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 403static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
380 size_t count, loff_t *f_pos) 404 size_t count, loff_t *f_pos)
381{ 405{
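
A hedged sketch of what the new read handler enables from userspace: a process holding a PM QoS request fd can now read back the aggregated target as a raw s32. The device name assumes the usual cpu_dma_latency misc device; other classes work the same way.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t req = 10, cur = 0;
	int fd = open("/dev/cpu_dma_latency", O_RDWR);

	if (fd < 0)
		return 1;

	/* writing a binary s32 registers/updates this fd's request (pre-existing) */
	if (write(fd, &req, sizeof(req)) != sizeof(req))
		perror("write");

	/* new with this patch: reading returns the aggregated s32 target value */
	if (read(fd, &cur, sizeof(cur)) == sizeof(cur))
		printf("current target: %d\n", (int)cur);

	close(fd);	/* closing the fd drops the request */
	return 0;
}
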
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb7173850e..0791b13df7bf 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
176 return p->utime; 176 return p->utime;
177} 177}
178 178
179int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 179static int
180posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
180{ 181{
181 int error = check_clock(which_clock); 182 int error = check_clock(which_clock);
182 if (!error) { 183 if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
194 return error; 195 return error;
195} 196}
196 197
197int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 198static int
199posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
198{ 200{
199 /* 201 /*
200 * You can never reset a CPU clock, but we check for other errors 202 * You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
317} 319}
318 320
319 321
320int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 322static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
321{ 323{
322 const pid_t pid = CPUCLOCK_PID(which_clock); 324 const pid_t pid = CPUCLOCK_PID(which_clock);
323 int error = -EINVAL; 325 int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
379 * This is called from sys_timer_create() and do_cpu_nanosleep() with the 381 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
380 * new timer already all-zeros initialized. 382 * new timer already all-zeros initialized.
381 */ 383 */
382int posix_cpu_timer_create(struct k_itimer *new_timer) 384static int posix_cpu_timer_create(struct k_itimer *new_timer)
383{ 385{
384 int ret = 0; 386 int ret = 0;
385 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); 387 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
425 * If we return TIMER_RETRY, it's necessary to release the timer's lock 427 * If we return TIMER_RETRY, it's necessary to release the timer's lock
426 * and try again. (This happens when the timer is in the middle of firing.) 428 * and try again. (This happens when the timer is in the middle of firing.)
427 */ 429 */
428int posix_cpu_timer_del(struct k_itimer *timer) 430static int posix_cpu_timer_del(struct k_itimer *timer)
429{ 431{
430 struct task_struct *p = timer->it.cpu.task; 432 struct task_struct *p = timer->it.cpu.task;
431 int ret = 0; 433 int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
665 * If we return TIMER_RETRY, it's necessary to release the timer's lock 667 * If we return TIMER_RETRY, it's necessary to release the timer's lock
666 * and try again. (This happens when the timer is in the middle of firing.) 668 * and try again. (This happens when the timer is in the middle of firing.)
667 */ 669 */
668int posix_cpu_timer_set(struct k_itimer *timer, int flags, 670static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
669 struct itimerspec *new, struct itimerspec *old) 671 struct itimerspec *new, struct itimerspec *old)
670{ 672{
671 struct task_struct *p = timer->it.cpu.task; 673 struct task_struct *p = timer->it.cpu.task;
672 union cpu_time_count old_expires, new_expires, old_incr, val; 674 union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
820 return ret; 822 return ret;
821} 823}
822 824
823void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 825static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
824{ 826{
825 union cpu_time_count now; 827 union cpu_time_count now;
826 struct task_struct *p = timer->it.cpu.task; 828 struct task_struct *p = timer->it.cpu.task;
@@ -1345,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1345 1347
1346 /* 1348 /*
1347 * Now that all the timers on our list have the firing flag, 1349 * Now that all the timers on our list have the firing flag,
1348 * noone will touch their list entries but us. We'll take 1350 * no one will touch their list entries but us. We'll take
1349 * each timer's lock before clearing its firing flag, so no 1351 * each timer's lock before clearing its firing flag, so no
1350 * timer call will interfere. 1352 * timer call will interfere.
1351 */ 1353 */
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1481 return error; 1483 return error;
1482} 1484}
1483 1485
1484int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1486static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1485 struct timespec *rqtp, struct timespec __user *rmtp) 1487
1488static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1489 struct timespec *rqtp, struct timespec __user *rmtp)
1486{ 1490{
1487 struct restart_block *restart_block = 1491 struct restart_block *restart_block =
1488 &current_thread_info()->restart_block; 1492 &current_thread_info()->restart_block;
1489 struct itimerspec it; 1493 struct itimerspec it;
1490 int error; 1494 int error;
1491 1495
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1501 1505
1502 if (error == -ERESTART_RESTARTBLOCK) { 1506 if (error == -ERESTART_RESTARTBLOCK) {
1503 1507
1504 if (flags & TIMER_ABSTIME) 1508 if (flags & TIMER_ABSTIME)
1505 return -ERESTARTNOHAND; 1509 return -ERESTARTNOHAND;
1506 /* 1510 /*
1507 * Report back to the user the time still remaining. 1511 * Report back to the user the time still remaining.
1508 */ 1512 */
1509 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1513 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1510 return -EFAULT; 1514 return -EFAULT;
1511 1515
1512 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1513 restart_block->arg0 = which_clock; 1517 restart_block->nanosleep.index = which_clock;
1514 restart_block->arg1 = (unsigned long) rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1515 restart_block->arg2 = rqtp->tv_sec; 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1516 restart_block->arg3 = rqtp->tv_nsec;
1517 } 1520 }
1518 return error; 1521 return error;
1519} 1522}
1520 1523
1521long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1522{ 1525{
1523 clockid_t which_clock = restart_block->arg0; 1526 clockid_t which_clock = restart_block->nanosleep.index;
1524 struct timespec __user *rmtp;
1525 struct timespec t; 1527 struct timespec t;
1526 struct itimerspec it; 1528 struct itimerspec it;
1527 int error; 1529 int error;
1528 1530
1529 rmtp = (struct timespec __user *) restart_block->arg1; 1531 t = ns_to_timespec(restart_block->nanosleep.expires);
1530 t.tv_sec = restart_block->arg2;
1531 t.tv_nsec = restart_block->arg3;
1532 1532
1533 restart_block->fn = do_no_restart_syscall;
1534 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1533 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1535 1534
1536 if (error == -ERESTART_RESTARTBLOCK) { 1535 if (error == -ERESTART_RESTARTBLOCK) {
1536 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1537 /* 1537 /*
1538 * Report back to the user the time still remaining. 1538 * Report back to the user the time still remaining.
1539 */ 1539 */
1540 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1540 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1541 return -EFAULT; 1541 return -EFAULT;
1542 1542
1543 restart_block->fn = posix_cpu_nsleep_restart; 1543 restart_block->nanosleep.expires = timespec_to_ns(&t);
1544 restart_block->arg0 = which_clock;
1545 restart_block->arg1 = (unsigned long) rmtp;
1546 restart_block->arg2 = t.tv_sec;
1547 restart_block->arg3 = t.tv_nsec;
1548 } 1544 }
1549 return error; 1545 return error;
1550 1546
1551} 1547}
1552 1548
1553
1554#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1549#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1555#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1550#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1556 1551
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1594 timer->it_clock = THREAD_CLOCK; 1589 timer->it_clock = THREAD_CLOCK;
1595 return posix_cpu_timer_create(timer); 1590 return posix_cpu_timer_create(timer);
1596} 1591}
1597static int thread_cpu_nsleep(const clockid_t which_clock, int flags, 1592
1598 struct timespec *rqtp, struct timespec __user *rmtp) 1593struct k_clock clock_posix_cpu = {
1599{ 1594 .clock_getres = posix_cpu_clock_getres,
1600 return -EINVAL; 1595 .clock_set = posix_cpu_clock_set,
1601} 1596 .clock_get = posix_cpu_clock_get,
1602static long thread_cpu_nsleep_restart(struct restart_block *restart_block) 1597 .timer_create = posix_cpu_timer_create,
1603{ 1598 .nsleep = posix_cpu_nsleep,
1604 return -EINVAL; 1599 .nsleep_restart = posix_cpu_nsleep_restart,
1605} 1600 .timer_set = posix_cpu_timer_set,
1601 .timer_del = posix_cpu_timer_del,
1602 .timer_get = posix_cpu_timer_get,
1603};
1606 1604
1607static __init int init_posix_cpu_timers(void) 1605static __init int init_posix_cpu_timers(void)
1608{ 1606{
1609 struct k_clock process = { 1607 struct k_clock process = {
1610 .clock_getres = process_cpu_clock_getres, 1608 .clock_getres = process_cpu_clock_getres,
1611 .clock_get = process_cpu_clock_get, 1609 .clock_get = process_cpu_clock_get,
1612 .clock_set = do_posix_clock_nosettime, 1610 .timer_create = process_cpu_timer_create,
1613 .timer_create = process_cpu_timer_create, 1611 .nsleep = process_cpu_nsleep,
1614 .nsleep = process_cpu_nsleep, 1612 .nsleep_restart = process_cpu_nsleep_restart,
1615 .nsleep_restart = process_cpu_nsleep_restart,
1616 }; 1613 };
1617 struct k_clock thread = { 1614 struct k_clock thread = {
1618 .clock_getres = thread_cpu_clock_getres, 1615 .clock_getres = thread_cpu_clock_getres,
1619 .clock_get = thread_cpu_clock_get, 1616 .clock_get = thread_cpu_clock_get,
1620 .clock_set = do_posix_clock_nosettime, 1617 .timer_create = thread_cpu_timer_create,
1621 .timer_create = thread_cpu_timer_create,
1622 .nsleep = thread_cpu_nsleep,
1623 .nsleep_restart = thread_cpu_nsleep_restart,
1624 }; 1618 };
1625 struct timespec ts; 1619 struct timespec ts;
1626 1620
1627 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1621 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1628 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1622 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1629 1623
1630 cputime_to_timespec(cputime_one_jiffy, &ts); 1624 cputime_to_timespec(cputime_one_jiffy, &ts);
1631 onecputick = ts.tv_nsec; 1625 onecputick = ts.tv_nsec;
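
For reference, the nsleep path reworked above is reached from userspace via clock_nanosleep() on the process CPU-time clock; a minimal caller (standard API usage, nothing patch-specific) looks like this. The sleep only completes once the whole process has consumed the requested CPU time, so a spare busy thread is used here to keep the clock advancing.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static void *burn(void *arg)
{
	for (;;)
		;	/* keep the process CPU clock advancing */
}

int main(void)
{
	struct timespec req = { .tv_sec = 1, .tv_nsec = 0 }, rem;
	pthread_t t;
	int err;

	pthread_create(&t, NULL, burn, NULL);

	/* flags == 0: relative sleep; with TIMER_ABSTIME no remaining time is reported */
	err = clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, 0, &req, &rem);
	if (err == EINTR)
		printf("interrupted, %ld.%09ld CPU-seconds left\n",
		       (long)rem.tv_sec, rem.tv_nsec);
	else if (!err)
		printf("process consumed one more CPU-second\n");
	return err ? 1 : 0;
}
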
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 93bd2eb2bc53..e5498d7405c3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/posix-clock.h>
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/wait.h> 47#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
81#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 82#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
82#endif 83#endif
83 84
85/*
86 * parisc wants ENOTSUP instead of EOPNOTSUPP
87 */
88#ifndef ENOTSUP
89# define ENANOSLEEP_NOTSUP EOPNOTSUPP
90#else
91# define ENANOSLEEP_NOTSUP ENOTSUP
92#endif
84 93
85/* 94/*
86 * The timer ID is turned into a timer address by idr_find(). 95 * The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
94/* 103/*
95 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us 104 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
96 * to implement others. This structure defines the various 105 * to implement others. This structure defines the various
97 * clocks and allows the possibility of adding others. We 106 * clocks.
98 * provide an interface to add clocks to the table and expect
99 * the "arch" code to add at least one clock that is high
100 * resolution. Here we define the standard CLOCK_REALTIME as a
101 * 1/HZ resolution clock.
102 * 107 *
103 * RESOLUTION: Clock resolution is used to round up timer and interval 108 * RESOLUTION: Clock resolution is used to round up timer and interval
104 * times, NOT to report clock times, which are reported with as 109 * times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
108 * necessary code is written. The standard says we should say 113 * necessary code is written. The standard says we should say
109 * something about this issue in the documentation... 114 * something about this issue in the documentation...
110 * 115 *
111 * FUNCTIONS: The CLOCKs structure defines possible functions to handle 116 * FUNCTIONS: The CLOCKs structure defines possible functions to
112 * various clock functions. For clocks that use the standard 117 * handle various clock functions.
113 * system timer code these entries should be NULL. This will
114 * allow dispatch without the overhead of indirect function
115 * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code.
121 * 118 *
122 * At this time all functions EXCEPT clock_nanosleep can be 119 * The standard POSIX timer management code assumes the
123 * redirected by the CLOCKS structure. Clock_nanosleep is in 120 * following: 1.) The k_itimer struct (sched.h) is used for
124 * there, but the code ignores it. 121 * the timer. 2.) The list, it_lock, it_clock, it_id and
122 * it_pid fields are not modified by timer code.
125 * 123 *
126 * Permissions: It is assumed that the clock_settime() function defined 124 * Permissions: It is assumed that the clock_settime() function defined
127 * for each clock will take care of permission checks. Some 125 * for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
138 */ 136 */
139static int common_nsleep(const clockid_t, int flags, struct timespec *t, 137static int common_nsleep(const clockid_t, int flags, struct timespec *t,
140 struct timespec __user *rmtp); 138 struct timespec __user *rmtp);
139static int common_timer_create(struct k_itimer *new_timer);
141static void common_timer_get(struct k_itimer *, struct itimerspec *); 140static void common_timer_get(struct k_itimer *, struct itimerspec *);
142static int common_timer_set(struct k_itimer *, int, 141static int common_timer_set(struct k_itimer *, int,
143 struct itimerspec *, struct itimerspec *); 142 struct itimerspec *, struct itimerspec *);
@@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
158 spin_unlock_irqrestore(&timr->it_lock, flags); 157 spin_unlock_irqrestore(&timr->it_lock, flags);
159} 158}
160 159
161/* 160/* Get clock_realtime */
162 * Call the k_clock hook function if non-null, or the default function. 161static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
163 */
164#define CLOCK_DISPATCH(clock, call, arglist) \
165 ((clock) < 0 ? posix_cpu_##call arglist : \
166 (posix_clocks[clock].call != NULL \
167 ? (*posix_clocks[clock].call) arglist : common_##call arglist))
168
169/*
170 * Default clock hook functions when the struct k_clock passed
171 * to register_posix_clock leaves a function pointer null.
172 *
173 * The function common_CALL is the default implementation for
174 * the function pointer CALL in struct k_clock.
175 */
176
177static inline int common_clock_getres(const clockid_t which_clock,
178 struct timespec *tp)
179{
180 tp->tv_sec = 0;
181 tp->tv_nsec = posix_clocks[which_clock].res;
182 return 0;
183}
184
185/*
186 * Get real time for posix timers
187 */
188static int common_clock_get(clockid_t which_clock, struct timespec *tp)
189{ 162{
190 ktime_get_real_ts(tp); 163 ktime_get_real_ts(tp);
191 return 0; 164 return 0;
192} 165}
193 166
194static inline int common_clock_set(const clockid_t which_clock, 167/* Set clock_realtime */
195 struct timespec *tp) 168static int posix_clock_realtime_set(const clockid_t which_clock,
169 const struct timespec *tp)
196{ 170{
197 return do_sys_settimeofday(tp, NULL); 171 return do_sys_settimeofday(tp, NULL);
198} 172}
199 173
200static int common_timer_create(struct k_itimer *new_timer) 174static int posix_clock_realtime_adj(const clockid_t which_clock,
201{ 175 struct timex *t)
202 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
203 return 0;
204}
205
206static int no_timer_create(struct k_itimer *new_timer)
207{
208 return -EOPNOTSUPP;
209}
210
211static int no_nsleep(const clockid_t which_clock, int flags,
212 struct timespec *tsave, struct timespec __user *rmtp)
213{
214 return -EOPNOTSUPP;
215}
216
217/*
218 * Return nonzero if we know a priori this clockid_t value is bogus.
219 */
220static inline int invalid_clockid(const clockid_t which_clock)
221{ 176{
222 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ 177 return do_adjtimex(t);
223 return 0;
224 if ((unsigned) which_clock >= MAX_CLOCKS)
225 return 1;
226 if (posix_clocks[which_clock].clock_getres != NULL)
227 return 0;
228 if (posix_clocks[which_clock].res != 0)
229 return 0;
230 return 1;
231} 178}
232 179
233/* 180/*
@@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
240} 187}
241 188
242/* 189/*
243 * Get monotonic time for posix timers 190 * Get monotonic-raw time for posix timers
244 */ 191 */
245static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 192static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
246{ 193{
@@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
267 *tp = ktime_to_timespec(KTIME_LOW_RES); 214 *tp = ktime_to_timespec(KTIME_LOW_RES);
268 return 0; 215 return 0;
269} 216}
217
218static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
219{
220 get_monotonic_boottime(tp);
221 return 0;
222}
223
224
270/* 225/*
271 * Initialize everything, well, just everything in Posix clocks/timers ;) 226 * Initialize everything, well, just everything in Posix clocks/timers ;)
272 */ 227 */
273static __init int init_posix_timers(void) 228static __init int init_posix_timers(void)
274{ 229{
275 struct k_clock clock_realtime = { 230 struct k_clock clock_realtime = {
276 .clock_getres = hrtimer_get_res, 231 .clock_getres = hrtimer_get_res,
232 .clock_get = posix_clock_realtime_get,
233 .clock_set = posix_clock_realtime_set,
234 .clock_adj = posix_clock_realtime_adj,
235 .nsleep = common_nsleep,
236 .nsleep_restart = hrtimer_nanosleep_restart,
237 .timer_create = common_timer_create,
238 .timer_set = common_timer_set,
239 .timer_get = common_timer_get,
240 .timer_del = common_timer_del,
277 }; 241 };
278 struct k_clock clock_monotonic = { 242 struct k_clock clock_monotonic = {
279 .clock_getres = hrtimer_get_res, 243 .clock_getres = hrtimer_get_res,
280 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
281 .clock_set = do_posix_clock_nosettime, 245 .nsleep = common_nsleep,
246 .nsleep_restart = hrtimer_nanosleep_restart,
247 .timer_create = common_timer_create,
248 .timer_set = common_timer_set,
249 .timer_get = common_timer_get,
250 .timer_del = common_timer_del,
282 }; 251 };
283 struct k_clock clock_monotonic_raw = { 252 struct k_clock clock_monotonic_raw = {
284 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
285 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
286 .clock_set = do_posix_clock_nosettime,
287 .timer_create = no_timer_create,
288 .nsleep = no_nsleep,
289 }; 255 };
290 struct k_clock clock_realtime_coarse = { 256 struct k_clock clock_realtime_coarse = {
291 .clock_getres = posix_get_coarse_res, 257 .clock_getres = posix_get_coarse_res,
292 .clock_get = posix_get_realtime_coarse, 258 .clock_get = posix_get_realtime_coarse,
293 .clock_set = do_posix_clock_nosettime,
294 .timer_create = no_timer_create,
295 .nsleep = no_nsleep,
296 }; 259 };
297 struct k_clock clock_monotonic_coarse = { 260 struct k_clock clock_monotonic_coarse = {
298 .clock_getres = posix_get_coarse_res, 261 .clock_getres = posix_get_coarse_res,
299 .clock_get = posix_get_monotonic_coarse, 262 .clock_get = posix_get_monotonic_coarse,
300 .clock_set = do_posix_clock_nosettime, 263 };
301 .timer_create = no_timer_create, 264 struct k_clock clock_boottime = {
302 .nsleep = no_nsleep, 265 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime,
267 .nsleep = common_nsleep,
268 .nsleep_restart = hrtimer_nanosleep_restart,
269 .timer_create = common_timer_create,
270 .timer_set = common_timer_set,
271 .timer_get = common_timer_get,
272 .timer_del = common_timer_del,
303 }; 273 };
304 274
305 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 275 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
306 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 276 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
307 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 277 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
308 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
309 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
310 281
311 posix_timers_cache = kmem_cache_create("posix_timers_cache", 282 posix_timers_cache = kmem_cache_create("posix_timers_cache",
312 sizeof (struct k_itimer), 0, SLAB_PANIC, 283 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -342,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr)
342 * restarted (i.e. we have flagged this in the sys_private entry of the 313 * restarted (i.e. we have flagged this in the sys_private entry of the
343 * info block). 314 * info block).
344 * 315 *
345 * To protect aginst the timer going away while the interrupt is queued, 316 * To protect against the timer going away while the interrupt is queued,
346 * we require that the it_requeue_pending flag be set. 317 * we require that the it_requeue_pending flag be set.
347 */ 318 */
348void do_schedule_next_timer(struct siginfo *info) 319void do_schedule_next_timer(struct siginfo *info)
@@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
482 return task_pid(rtn); 453 return task_pid(rtn);
483} 454}
484 455
485void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 456void posix_timers_register_clock(const clockid_t clock_id,
457 struct k_clock *new_clock)
486{ 458{
487 if ((unsigned) clock_id >= MAX_CLOCKS) { 459 if ((unsigned) clock_id >= MAX_CLOCKS) {
488 printk("POSIX clock register failed for clock_id %d\n", 460 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
461 clock_id);
462 return;
463 }
464
465 if (!new_clock->clock_get) {
466 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
467 clock_id);
468 return;
469 }
470 if (!new_clock->clock_getres) {
471 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
489 clock_id); 472 clock_id);
490 return; 473 return;
491 } 474 }
492 475
493 posix_clocks[clock_id] = *new_clock; 476 posix_clocks[clock_id] = *new_clock;
494} 477}
495EXPORT_SYMBOL_GPL(register_posix_clock); 478EXPORT_SYMBOL_GPL(posix_timers_register_clock);
496 479
497static struct k_itimer * alloc_posix_timer(void) 480static struct k_itimer * alloc_posix_timer(void)
498{ 481{
@@ -523,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
523 kmem_cache_free(posix_timers_cache, tmr); 506 kmem_cache_free(posix_timers_cache, tmr);
524} 507}
525 508
509static struct k_clock *clockid_to_kclock(const clockid_t id)
510{
511 if (id < 0)
512 return (id & CLOCKFD_MASK) == CLOCKFD ?
513 &clock_posix_dynamic : &clock_posix_cpu;
514
515 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
516 return NULL;
517 return &posix_clocks[id];
518}
519
520static int common_timer_create(struct k_itimer *new_timer)
521{
522 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
523 return 0;
524}
525
526/* Create a POSIX.1b interval timer. */ 526/* Create a POSIX.1b interval timer. */
527 527
528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, 528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
529 struct sigevent __user *, timer_event_spec, 529 struct sigevent __user *, timer_event_spec,
530 timer_t __user *, created_timer_id) 530 timer_t __user *, created_timer_id)
531{ 531{
532 struct k_clock *kc = clockid_to_kclock(which_clock);
532 struct k_itimer *new_timer; 533 struct k_itimer *new_timer;
533 int error, new_timer_id; 534 int error, new_timer_id;
534 sigevent_t event; 535 sigevent_t event;
535 int it_id_set = IT_ID_NOT_SET; 536 int it_id_set = IT_ID_NOT_SET;
536 537
537 if (invalid_clockid(which_clock)) 538 if (!kc)
538 return -EINVAL; 539 return -EINVAL;
540 if (!kc->timer_create)
541 return -EOPNOTSUPP;
539 542
540 new_timer = alloc_posix_timer(); 543 new_timer = alloc_posix_timer();
541 if (unlikely(!new_timer)) 544 if (unlikely(!new_timer))
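With clockid_to_kclock() in place, timer_create() refuses clocks whose k_clock lacks a .timer_create hook, so the per-clock no_timer_create() stubs removed earlier in this diff are no longer needed. A small user-space sketch of the visible behaviour (it assumes glibc's timer_create() wrapper, linked with -lrt on older toolchains; the fallback value 4 for CLOCK_MONOTONIC_RAW matches the kernel's assignment at the time but is an assumption):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <time.h>

#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4	/* assumed kernel value; older headers lack it */
#endif

int main(void)
{
	timer_t tid;
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGRTMIN,
	};

	/* CLOCK_MONOTONIC provides .timer_create = common_timer_create */
	if (timer_create(CLOCK_MONOTONIC, &sev, &tid) == 0)
		puts("CLOCK_MONOTONIC: timer created");

	/* CLOCK_MONOTONIC_RAW has no .timer_create hook -> EOPNOTSUPP */
	if (timer_create(CLOCK_MONOTONIC_RAW, &sev, &tid) < 0)
		printf("CLOCK_MONOTONIC_RAW: %s\n", strerror(errno));

	return 0;
}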
@@ -597,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 goto out; 600 goto out;
598 } 601 }
599 602
600 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 603 error = kc->timer_create(new_timer);
601 if (error) 604 if (error)
602 goto out; 605 goto out;
603 606
@@ -607,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
607 spin_unlock_irq(&current->sighand->siglock); 610 spin_unlock_irq(&current->sighand->siglock);
608 611
609 return 0; 612 return 0;
610 /* 613 /*
611 * In the case of the timer belonging to another task, after 614 * In the case of the timer belonging to another task, after
612 * the task is unlocked, the timer is owned by the other task 615 * the task is unlocked, the timer is owned by the other task
613 * and may cease to exist at any time. Don't use or modify 616 * and may cease to exist at any time. Don't use or modify
@@ -709,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
709SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 712SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
710 struct itimerspec __user *, setting) 713 struct itimerspec __user *, setting)
711{ 714{
712 struct k_itimer *timr;
713 struct itimerspec cur_setting; 715 struct itimerspec cur_setting;
716 struct k_itimer *timr;
717 struct k_clock *kc;
714 unsigned long flags; 718 unsigned long flags;
719 int ret = 0;
715 720
716 timr = lock_timer(timer_id, &flags); 721 timr = lock_timer(timer_id, &flags);
717 if (!timr) 722 if (!timr)
718 return -EINVAL; 723 return -EINVAL;
719 724
720 CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); 725 kc = clockid_to_kclock(timr->it_clock);
726 if (WARN_ON_ONCE(!kc || !kc->timer_get))
727 ret = -EINVAL;
728 else
729 kc->timer_get(timr, &cur_setting);
721 730
722 unlock_timer(timr, flags); 731 unlock_timer(timr, flags);
723 732
724 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 733 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
725 return -EFAULT; 734 return -EFAULT;
726 735
727 return 0; 736 return ret;
728} 737}
729 738
730/* 739/*
@@ -813,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
813 int error = 0; 822 int error = 0;
814 unsigned long flag; 823 unsigned long flag;
815 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 824 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
825 struct k_clock *kc;
816 826
817 if (!new_setting) 827 if (!new_setting)
818 return -EINVAL; 828 return -EINVAL;
@@ -828,8 +838,11 @@ retry:
828 if (!timr) 838 if (!timr)
829 return -EINVAL; 839 return -EINVAL;
830 840
831 error = CLOCK_DISPATCH(timr->it_clock, timer_set, 841 kc = clockid_to_kclock(timr->it_clock);
832 (timr, flags, &new_spec, rtn)); 842 if (WARN_ON_ONCE(!kc || !kc->timer_set))
843 error = -EINVAL;
844 else
845 error = kc->timer_set(timr, flags, &new_spec, rtn);
833 846
834 unlock_timer(timr, flag); 847 unlock_timer(timr, flag);
835 if (error == TIMER_RETRY) { 848 if (error == TIMER_RETRY) {
@@ -844,7 +857,7 @@ retry:
844 return error; 857 return error;
845} 858}
846 859
847static inline int common_timer_del(struct k_itimer *timer) 860static int common_timer_del(struct k_itimer *timer)
848{ 861{
849 timer->it.real.interval.tv64 = 0; 862 timer->it.real.interval.tv64 = 0;
850 863
@@ -855,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer)
855 868
856static inline int timer_delete_hook(struct k_itimer *timer) 869static inline int timer_delete_hook(struct k_itimer *timer)
857{ 870{
858 return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); 871 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
872
873 if (WARN_ON_ONCE(!kc || !kc->timer_del))
874 return -EINVAL;
875 return kc->timer_del(timer);
859} 876}
860 877
861/* Delete a POSIX.1b interval timer. */ 878/* Delete a POSIX.1b interval timer. */
@@ -927,69 +944,76 @@ void exit_itimers(struct signal_struct *sig)
927 } 944 }
928} 945}
929 946
930/* Not available / possible... functions */
931int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
932{
933 return -EINVAL;
934}
935EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
936
937int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
938 struct timespec *t, struct timespec __user *r)
939{
940#ifndef ENOTSUP
941 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
942#else /* parisc does define it separately. */
943 return -ENOTSUP;
944#endif
945}
946EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
947
948SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 947SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
949 const struct timespec __user *, tp) 948 const struct timespec __user *, tp)
950{ 949{
950 struct k_clock *kc = clockid_to_kclock(which_clock);
951 struct timespec new_tp; 951 struct timespec new_tp;
952 952
953 if (invalid_clockid(which_clock)) 953 if (!kc || !kc->clock_set)
954 return -EINVAL; 954 return -EINVAL;
955
955 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 956 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
956 return -EFAULT; 957 return -EFAULT;
957 958
958 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 959 return kc->clock_set(which_clock, &new_tp);
959} 960}
960 961
961SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 962SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
962 struct timespec __user *,tp) 963 struct timespec __user *,tp)
963{ 964{
965 struct k_clock *kc = clockid_to_kclock(which_clock);
964 struct timespec kernel_tp; 966 struct timespec kernel_tp;
965 int error; 967 int error;
966 968
967 if (invalid_clockid(which_clock)) 969 if (!kc)
968 return -EINVAL; 970 return -EINVAL;
969 error = CLOCK_DISPATCH(which_clock, clock_get, 971
970 (which_clock, &kernel_tp)); 972 error = kc->clock_get(which_clock, &kernel_tp);
973
971 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 974 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
972 error = -EFAULT; 975 error = -EFAULT;
973 976
974 return error; 977 return error;
978}
979
980SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
981 struct timex __user *, utx)
982{
983 struct k_clock *kc = clockid_to_kclock(which_clock);
984 struct timex ktx;
985 int err;
986
987 if (!kc)
988 return -EINVAL;
989 if (!kc->clock_adj)
990 return -EOPNOTSUPP;
991
992 if (copy_from_user(&ktx, utx, sizeof(ktx)))
993 return -EFAULT;
994
995 err = kc->clock_adj(which_clock, &ktx);
996
997 if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
998 return -EFAULT;
975 999
1000 return err;
976} 1001}
977 1002
978SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 1003SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
979 struct timespec __user *, tp) 1004 struct timespec __user *, tp)
980{ 1005{
1006 struct k_clock *kc = clockid_to_kclock(which_clock);
981 struct timespec rtn_tp; 1007 struct timespec rtn_tp;
982 int error; 1008 int error;
983 1009
984 if (invalid_clockid(which_clock)) 1010 if (!kc)
985 return -EINVAL; 1011 return -EINVAL;
986 1012
987 error = CLOCK_DISPATCH(which_clock, clock_getres, 1013 error = kc->clock_getres(which_clock, &rtn_tp);
988 (which_clock, &rtn_tp));
989 1014
990 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { 1015 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
991 error = -EFAULT; 1016 error = -EFAULT;
992 }
993 1017
994 return error; 1018 return error;
995} 1019}
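The same hunk wires up the new clock_adjtime() syscall through the optional .clock_adj hook; in this file only CLOCK_REALTIME provides one, via do_adjtimex(). A user-space sketch follows, assuming the kernel headers export __NR_clock_adjtime for the target architecture, since no libc wrapper existed yet:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <time.h>
#include <unistd.h>
#include <sys/timex.h>
#include <sys/syscall.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));	/* tx.modes == 0: read-only query */

	if (syscall(__NR_clock_adjtime, CLOCK_REALTIME, &tx) < 0) {
		/* EOPNOTSUPP means the clock has no .clock_adj hook */
		perror("clock_adjtime");
		return 1;
	}

	printf("freq offset: %ld, status: 0x%x\n", tx.freq, (unsigned)tx.status);
	return 0;
}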
@@ -1009,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1009 const struct timespec __user *, rqtp, 1033 const struct timespec __user *, rqtp,
1010 struct timespec __user *, rmtp) 1034 struct timespec __user *, rmtp)
1011{ 1035{
1036 struct k_clock *kc = clockid_to_kclock(which_clock);
1012 struct timespec t; 1037 struct timespec t;
1013 1038
1014 if (invalid_clockid(which_clock)) 1039 if (!kc)
1015 return -EINVAL; 1040 return -EINVAL;
1041 if (!kc->nsleep)
1042 return -ENANOSLEEP_NOTSUP;
1016 1043
1017 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1044 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1018 return -EFAULT; 1045 return -EFAULT;
@@ -1020,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1020 if (!timespec_valid(&t)) 1047 if (!timespec_valid(&t))
1021 return -EINVAL; 1048 return -EINVAL;
1022 1049
1023 return CLOCK_DISPATCH(which_clock, nsleep, 1050 return kc->nsleep(which_clock, flags, &t, rmtp);
1024 (which_clock, flags, &t, rmtp));
1025}
1026
1027/*
1028 * nanosleep_restart for monotonic and realtime clocks
1029 */
1030static int common_nsleep_restart(struct restart_block *restart_block)
1031{
1032 return hrtimer_nanosleep_restart(restart_block);
1033} 1051}
1034 1052
1035/* 1053/*
1036 * This will restart clock_nanosleep. This is required only by 1054 * This will restart clock_nanosleep. This is required only by
1037 * compat_clock_nanosleep_restart for now. 1055 * compat_clock_nanosleep_restart for now.
1038 */ 1056 */
1039long 1057long clock_nanosleep_restart(struct restart_block *restart_block)
1040clock_nanosleep_restart(struct restart_block *restart_block)
1041{ 1058{
1042 clockid_t which_clock = restart_block->arg0; 1059 clockid_t which_clock = restart_block->nanosleep.index;
1060 struct k_clock *kc = clockid_to_kclock(which_clock);
1061
1062 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1063 return -EINVAL;
1043 1064
1044 return CLOCK_DISPATCH(which_clock, nsleep_restart, 1065 return kc->nsleep_restart(restart_block);
1045 (restart_block));
1046} 1066}
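The file now also registers CLOCK_BOOTTIME, a monotonic clock that keeps advancing across suspend. A user-space sketch of reading it; contemporary libc headers may not define the constant yet, so the value 7 is hard-coded here as an assumption matching the kernel's clockid table:

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7	/* assumed kernel clockid at the time */
#endif

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);

	/* CLOCK_BOOTTIME counts across suspend, so it runs ahead of
	 * CLOCK_MONOTONIC by roughly the accumulated suspend time. */
	printf("monotonic: %ld.%09ld\n", (long)mono.tv_sec, mono.tv_nsec);
	printf("boottime:  %ld.%09ld\n", (long)boot.tv_sec, boot.tv_nsec);
	return 0;
}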
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 265729966ece..6de9a8fc3417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,125 +1,12 @@
1config PM
2 bool "Power Management support"
3 depends on !IA64_HP_SIM
4 ---help---
5 "Power Management" means that parts of your computer are shut
6 off or put into a power conserving "sleep" mode if they are not
7 being used. There are two competing standards for doing this: APM
8 and ACPI. If you want to use either one, say Y here and then also
9 to the requisite support below.
10
11 Power Management is most important for battery powered laptop
12 computers; if you have a laptop, check out the Linux Laptop home
13 page on the WWW at <http://www.linux-on-laptops.com/> or
14 Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
15 and the Battery Powered Linux mini-HOWTO, available from
16 <http://www.tldp.org/docs.html#howto>.
17
18 Note that, even if you say N here, Linux on the x86 architecture
19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power.
21
22config PM_DEBUG
23 bool "Power Management Debug Support"
24 depends on PM
25 ---help---
26 This option enables various debugging support in the Power Management
27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support.
29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
39config PM_VERBOSE
40 bool "Verbose Power Management debugging"
41 depends on PM_DEBUG
42 default n
43 ---help---
44 This option enables verbose messages from the Power Management code.
45
46config CAN_PM_TRACE
47 def_bool y
48 depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
49
50config PM_TRACE
51 bool
52 help
53 This enables code to save the last PM event point across
54 reboot. The architecture needs to support this, x86 for
55 example does by saving things in the RTC, see below.
56
57 The architecture specific code must provide the extern
58 functions from <linux/resume-trace.h> as well as the
59 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
60
61 The way the information is presented is architecture-
62 dependent, x86 will print the information during a
63 late_initcall.
64
65config PM_TRACE_RTC
66 bool "Suspend/resume event tracing"
67 depends on CAN_PM_TRACE
68 depends on X86
69 select PM_TRACE
70 default n
71 ---help---
72 This enables some cheesy code to save the last PM event point in the
73 RTC across reboots, so that you can debug a machine that just hangs
74 during suspend (or more commonly, during resume).
75
76 To use this debugging feature you should attempt to suspend the
77 machine, reboot it and then run
78
79 dmesg -s 1000000 | grep 'hash matches'
80
81 CAUTION: this option will cause your machine's real-time clock to be
82 set to an invalid time after a resume.
83
84config PM_SLEEP_SMP
85 bool
86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP
89 select HOTPLUG
90 select HOTPLUG_CPU
91 default y
92
93config PM_SLEEP
94 bool
95 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
96 default y
97
98config PM_SLEEP_ADVANCED_DEBUG
99 bool
100 depends on PM_ADVANCED_DEBUG
101 default n
102
103config SUSPEND 1config SUSPEND
104 bool "Suspend to RAM and standby" 2 bool "Suspend to RAM and standby"
105 depends on PM && ARCH_SUSPEND_POSSIBLE 3 depends on ARCH_SUSPEND_POSSIBLE
106 default y 4 default y
107 ---help--- 5 ---help---
108 Allow the system to enter sleep states in which main memory is 6 Allow the system to enter sleep states in which main memory is
109 powered and thus its contents are preserved, such as the 7 powered and thus its contents are preserved, such as the
110 suspend-to-RAM state (e.g. the ACPI S3 state). 8 suspend-to-RAM state (e.g. the ACPI S3 state).
111 9
112config PM_TEST_SUSPEND
113 bool "Test suspend/resume and wakealarm during bootup"
114 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
115 ---help---
116 This option will let you suspend your machine during bootup, and
117 make it wake up a few seconds later using an RTC wakeup alarm.
118 Enable this with a kernel parameter like "test_suspend=mem".
119
120 You probably want to have your system's RTC driver statically
121 linked, ensuring that it's available when this test runs.
122
123config SUSPEND_FREEZER 10config SUSPEND_FREEZER
124 bool "Enable freezer for suspend to RAM/standby" \ 11 bool "Enable freezer for suspend to RAM/standby" \
125 if ARCH_WANTS_FREEZER_CONTROL || BROKEN 12 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
@@ -131,9 +18,13 @@ config SUSPEND_FREEZER
131 18
132 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
133 20
21config HIBERNATE_CALLBACKS
22 bool
23
134config HIBERNATION 24config HIBERNATION
135 bool "Hibernation (aka 'suspend to disk')" 25 bool "Hibernation (aka 'suspend to disk')"
136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 26 depends on SWAP && ARCH_HIBERNATION_POSSIBLE
27 select HIBERNATE_CALLBACKS
137 select LZO_COMPRESS 28 select LZO_COMPRESS
138 select LZO_DECOMPRESS 29 select LZO_DECOMPRESS
139 ---help--- 30 ---help---
@@ -196,6 +87,106 @@ config PM_STD_PARTITION
196 suspended image to. It will simply pick the first available swap 87 suspended image to. It will simply pick the first available swap
197 device. 88 device.
198 89
90config PM_SLEEP
91 def_bool y
92 depends on SUSPEND || HIBERNATE_CALLBACKS
93
94config PM_SLEEP_SMP
95 def_bool y
96 depends on SMP
97 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
98 depends on PM_SLEEP
99 select HOTPLUG
100 select HOTPLUG_CPU
101
102config PM_RUNTIME
103 bool "Run-time PM core functionality"
104 depends on !IA64_HP_SIM
105 ---help---
106 Enable functionality allowing I/O devices to be put into energy-saving
107 (low power) states at run time (or autosuspended) after a specified
108 period of inactivity and woken up in response to a hardware-generated
109 wake-up event or a driver's request.
110
111 Hardware support is generally required for this functionality to work
112 and the bus type drivers of the buses the devices are on are
113 responsible for the actual handling of the autosuspend requests and
114 wake-up events.
115
116config PM
117 def_bool y
118 depends on PM_SLEEP || PM_RUNTIME
119
120config PM_DEBUG
121 bool "Power Management Debug Support"
122 depends on PM
123 ---help---
124 This option enables various debugging support in the Power Management
125 code. This is helpful when debugging and reporting PM bugs, like
126 suspend support.
127
128config PM_VERBOSE
129 bool "Verbose Power Management debugging"
130 depends on PM_DEBUG
131 ---help---
132 This option enables verbose messages from the Power Management code.
133
134config PM_ADVANCED_DEBUG
135 bool "Extra PM attributes in sysfs for low-level debugging/testing"
136 depends on PM_DEBUG
137 ---help---
138 Add extra sysfs attributes allowing one to access some Power Management
139 fields of device objects from user space. If you are not a kernel
140 developer interested in debugging/testing Power Management, say "no".
141
142config PM_TEST_SUSPEND
143 bool "Test suspend/resume and wakealarm during bootup"
144 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
145 ---help---
146 This option will let you suspend your machine during bootup, and
147 make it wake up a few seconds later using an RTC wakeup alarm.
148 Enable this with a kernel parameter like "test_suspend=mem".
149
150 You probably want to have your system's RTC driver statically
151 linked, ensuring that it's available when this test runs.
152
153config CAN_PM_TRACE
154 def_bool y
155 depends on PM_DEBUG && PM_SLEEP
156
157config PM_TRACE
158 bool
159 help
160 This enables code to save the last PM event point across
161 reboot. The architecture needs to support this, x86 for
162 example does by saving things in the RTC, see below.
163
164 The architecture specific code must provide the extern
165 functions from <linux/resume-trace.h> as well as the
166 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
167
168 The way the information is presented is architecture-
169 dependent, x86 will print the information during a
170 late_initcall.
171
172config PM_TRACE_RTC
173 bool "Suspend/resume event tracing"
174 depends on CAN_PM_TRACE
175 depends on X86
176 select PM_TRACE
177 ---help---
178 This enables some cheesy code to save the last PM event point in the
179 RTC across reboots, so that you can debug a machine that just hangs
180 during suspend (or more commonly, during resume).
181
182 To use this debugging feature you should attempt to suspend the
183 machine, reboot it and then run
184
185 dmesg -s 1000000 | grep 'hash matches'
186
187 CAUTION: this option will cause your machine's real-time clock to be
188 set to an invalid time after a resume.
189
199config APM_EMULATION 190config APM_EMULATION
200 tristate "Advanced Power Management Emulation" 191 tristate "Advanced Power Management Emulation"
201 depends on PM && SYS_SUPPORTS_APM_EMULATION 192 depends on PM && SYS_SUPPORTS_APM_EMULATION
@@ -222,31 +213,11 @@ config APM_EMULATION
222 anything, try disabling/enabling this option (or disabling/enabling 213 anything, try disabling/enabling this option (or disabling/enabling
223 APM in your BIOS). 214 APM in your BIOS).
224 215
225config PM_RUNTIME
226 bool "Run-time PM core functionality"
227 depends on PM
228 ---help---
229 Enable functionality allowing I/O devices to be put into energy-saving
230 (low power) states at run time (or autosuspended) after a specified
231 period of inactivity and woken up in response to a hardware-generated
232 wake-up event or a driver's request.
233
234 Hardware support is generally required for this functionality to work
235 and the bus type drivers of the buses the devices are on are
236 responsible for the actual handling of the autosuspend requests and
237 wake-up events.
238
239config PM_OPS
240 bool
241 depends on PM_SLEEP || PM_RUNTIME
242 default y
243
244config ARCH_HAS_OPP 216config ARCH_HAS_OPP
245 bool 217 bool
246 218
247config PM_OPP 219config PM_OPP
248 bool "Operating Performance Point (OPP) Layer library" 220 bool "Operating Performance Point (OPP) Layer library"
249 depends on PM
250 depends on ARCH_HAS_OPP 221 depends on ARCH_HAS_OPP
251 ---help--- 222 ---help---
252 SOCs have a standard set of tuples consisting of frequency and 223 SOCs have a standard set of tuples consisting of frequency and
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c350e18b53e3..c5ebc6a90643 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,4 +1,5 @@
1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2 3
3obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o
4obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_PM_SLEEP) += console.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; 31 const int bio_rw = rw | REQ_SYNC;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1832bd264219..50aae660174d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h>
26#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
27#include <asm/suspend.h> 28#include <asm/suspend.h>
28 29
@@ -272,6 +273,11 @@ static int create_image(int platform_mode)
272 local_irq_disable(); 273 local_irq_disable();
273 274
274 error = sysdev_suspend(PMSG_FREEZE); 275 error = sysdev_suspend(PMSG_FREEZE);
276 if (!error) {
277 error = syscore_suspend();
278 if (error)
279 sysdev_resume();
280 }
275 if (error) { 281 if (error) {
276 printk(KERN_ERR "PM: Some system devices failed to power down, " 282 printk(KERN_ERR "PM: Some system devices failed to power down, "
277 "aborting hibernation\n"); 283 "aborting hibernation\n");
@@ -295,6 +301,7 @@ static int create_image(int platform_mode)
295 } 301 }
296 302
297 Power_up: 303 Power_up:
304 syscore_resume();
298 sysdev_resume(); 305 sysdev_resume();
299 /* NOTE: dpm_resume_noirq() is just a resume() for devices 306 /* NOTE: dpm_resume_noirq() is just a resume() for devices
300 * that suspended with irqs off ... no overall powerup. 307 * that suspended with irqs off ... no overall powerup.
@@ -403,6 +410,11 @@ static int resume_target_kernel(bool platform_mode)
403 local_irq_disable(); 410 local_irq_disable();
404 411
405 error = sysdev_suspend(PMSG_QUIESCE); 412 error = sysdev_suspend(PMSG_QUIESCE);
413 if (!error) {
414 error = syscore_suspend();
415 if (error)
416 sysdev_resume();
417 }
406 if (error) 418 if (error)
407 goto Enable_irqs; 419 goto Enable_irqs;
408 420
@@ -429,6 +441,7 @@ static int resume_target_kernel(bool platform_mode)
429 restore_processor_state(); 441 restore_processor_state();
430 touch_softlockup_watchdog(); 442 touch_softlockup_watchdog();
431 443
444 syscore_resume();
432 sysdev_resume(); 445 sysdev_resume();
433 446
434 Enable_irqs: 447 Enable_irqs:
@@ -516,6 +529,7 @@ int hibernation_platform_enter(void)
516 529
517 local_irq_disable(); 530 local_irq_disable();
518 sysdev_suspend(PMSG_HIBERNATE); 531 sysdev_suspend(PMSG_HIBERNATE);
532 syscore_suspend();
519 if (pm_wakeup_pending()) { 533 if (pm_wakeup_pending()) {
520 error = -EAGAIN; 534 error = -EAGAIN;
521 goto Power_up; 535 goto Power_up;
@@ -526,6 +540,7 @@ int hibernation_platform_enter(void)
526 while (1); 540 while (1);
527 541
528 Power_up: 542 Power_up:
543 syscore_resume();
529 sysdev_resume(); 544 sysdev_resume();
530 local_irq_enable(); 545 local_irq_enable();
531 enable_nonboot_cpus(); 546 enable_nonboot_cpus();
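create_image(), resume_target_kernel() and hibernation_platform_enter() now call syscore_suspend()/syscore_resume() just inside the sysdev_suspend()/sysdev_resume() pair, so syscore users run with one CPU online and interrupts off. A hedged sketch of the consumer side of that interface; the callbacks and names below are illustrative, not taken from this commit:

#include <linux/init.h>
#include <linux/syscore_ops.h>

/* Save a small amount of core state late in suspend/hibernation. */
static int my_core_suspend(void)
{
	/* runs with interrupts disabled; returning non-zero aborts */
	return 0;
}

/* Undo whatever my_core_suspend() did, first thing on resume. */
static void my_core_resume(void)
{
}

static struct syscore_ops my_syscore_ops = {
	.suspend = my_core_suspend,
	.resume  = my_core_resume,
};

static int __init my_core_init(void)
{
	register_syscore_ops(&my_syscore_ops);
	return 0;
}
device_initcall(my_core_init);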
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 701853042c28..de9aef8742f4 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,9 +17,6 @@
17 17
18DEFINE_MUTEX(pm_mutex); 18DEFINE_MUTEX(pm_mutex);
19 19
20unsigned int pm_flags;
21EXPORT_SYMBOL(pm_flags);
22
23#ifdef CONFIG_PM_SLEEP 20#ifdef CONFIG_PM_SLEEP
24 21
25/* Routines for PM-transition notifications */ 22/* Routines for PM-transition notifications */
@@ -227,7 +224,7 @@ power_attr(state);
227 * writing to 'state'. It first should read from 'wakeup_count' and store 224 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system 225 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to 226 * transition to a sleep state, it should write the stored value to
230 * 'wakeup_count'. If that fails, at least one wakeup event has occured since 227 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it 228 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there 229 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to. 230 * are any wakeup events detected after 'wakeup_count' was written to.
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 64db648ff911..ca0aacc24874 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -42,15 +42,15 @@ static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 44 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, swsusp will do its best to ensure the image 45 * When it is set to N, the image creating code will do its best to
46 * size will not exceed N bytes, but if that is impossible, it will 46 * ensure the image size will not exceed N bytes, but if that is
47 * try to create the smallest image possible. 47 * impossible, it will try to create the smallest image possible.
48 */ 48 */
49unsigned long image_size; 49unsigned long image_size;
50 50
51void __init hibernate_image_size_init(void) 51void __init hibernate_image_size_init(void)
52{ 52{
53 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; 53 image_size = (totalram_pages / 3) * PAGE_SIZE;
54} 54}
55 55
56/* List of PBEs needed for restoring the pages that were allocated before 56/* List of PBEs needed for restoring the pages that were allocated before
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index de6f86bfa303..8935369d503a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <linux/syscore_ops.h>
25#include <trace/events/power.h> 26#include <trace/events/power.h>
26 27
27#include "power.h" 28#include "power.h"
@@ -164,10 +165,16 @@ static int suspend_enter(suspend_state_t state)
164 165
165 error = sysdev_suspend(PMSG_SUSPEND); 166 error = sysdev_suspend(PMSG_SUSPEND);
166 if (!error) { 167 if (!error) {
168 error = syscore_suspend();
169 if (error)
170 sysdev_resume();
171 }
172 if (!error) {
167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 173 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
168 error = suspend_ops->enter(state); 174 error = suspend_ops->enter(state);
169 events_check_enabled = false; 175 events_check_enabled = false;
170 } 176 }
177 syscore_resume();
171 sysdev_resume(); 178 sysdev_resume();
172 } 179 }
173 180
diff --git a/kernel/printk.c b/kernel/printk.c
index 36231525e22f..da8ca817eae3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
54 54
55/* printk's without a loglevel use this.. */ 55/* printk's without a loglevel use this.. */
56#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ 56#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
57 57
58/* We show everything that is MORE important than this.. */ 58/* We show everything that is MORE important than this.. */
59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -113,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ 113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
114 114
115/* 115/*
116 * If exclusive_console is non-NULL then only this console is to be printed to.
117 */
118static struct console *exclusive_console;
119
120/*
116 * Array of consoles built from command line options (console=) 121 * Array of consoles built from command line options (console=)
117 */ 122 */
118struct console_cmdline 123struct console_cmdline
@@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
476 struct console *con; 481 struct console *con;
477 482
478 for_each_console(con) { 483 for_each_console(con) {
484 if (exclusive_console && con != exclusive_console)
485 continue;
479 if ((con->flags & CON_ENABLED) && con->write && 486 if ((con->flags & CON_ENABLED) && con->write &&
480 (cpu_online(smp_processor_id()) || 487 (cpu_online(smp_processor_id()) ||
481 (con->flags & CON_ANYTIME))) 488 (con->flags & CON_ANYTIME)))
@@ -515,6 +522,71 @@ static void _call_console_drivers(unsigned start,
515} 522}
516 523
517/* 524/*
 525 * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit quantity; the
 526 * lower 3 bits are the log level and the rest is the log facility. In case
527 * userspace passes usual userspace syslog messages to /dev/kmsg or
528 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
529 * to extract the correct log level for in-kernel processing, and not mangle
530 * the original value.
531 *
532 * If a prefix is found, the length of the prefix is returned. If 'level' is
533 * passed, it will be filled in with the log level without a possible facility
534 * value. If 'special' is passed, the special printk prefix chars are accepted
535 * and returned. If no valid header is found, 0 is returned and the passed
536 * variables are not touched.
537 */
538static size_t log_prefix(const char *p, unsigned int *level, char *special)
539{
540 unsigned int lev = 0;
541 char sp = '\0';
542 size_t len;
543
544 if (p[0] != '<' || !p[1])
545 return 0;
546 if (p[2] == '>') {
547 /* usual single digit level number or special char */
548 switch (p[1]) {
549 case '0' ... '7':
550 lev = p[1] - '0';
551 break;
552 case 'c': /* KERN_CONT */
553 case 'd': /* KERN_DEFAULT */
554 sp = p[1];
555 break;
556 default:
557 return 0;
558 }
559 len = 3;
560 } else {
561 /* multi digit including the level and facility number */
562 char *endp = NULL;
563
 564 if (p[1] < '0' || p[1] > '9')
565 return 0;
566
567 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
568 if (endp == NULL || endp[0] != '>')
569 return 0;
570 len = (endp + 1) - p;
571 }
572
573 /* do not accept special char if not asked for */
574 if (sp && !special)
575 return 0;
576
577 if (special) {
578 *special = sp;
579 /* return special char, do not touch level */
580 if (sp)
581 return len;
582 }
583
584 if (level)
585 *level = lev;
586 return len;
587}
588
589/*
518 * Call the console drivers, asking them to write out 590 * Call the console drivers, asking them to write out
519 * log_buf[start] to log_buf[end - 1]. 591 * log_buf[start] to log_buf[end - 1].
520 * The console_lock must be held. 592 * The console_lock must be held.
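log_prefix() exists mainly so that records injected from user space keep their original facility. The prefix is the usual syslog priority, facility * 8 + level, so "<11>" encodes facility 1 (user) with level 3 (error). A user-space sketch of feeding such a record to the kernel log (writing /dev/kmsg normally requires root):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "<11>example: user-space error forwarded to the kernel log\n";
	int fd = open("/dev/kmsg", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}
	if (write(fd, msg, strlen(msg)) < 0)
		perror("write");
	close(fd);
	return 0;
}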
@@ -529,13 +601,9 @@ static void call_console_drivers(unsigned start, unsigned end)
529 cur_index = start; 601 cur_index = start;
530 start_print = start; 602 start_print = start;
531 while (cur_index != end) { 603 while (cur_index != end) {
532 if (msg_level < 0 && ((end - cur_index) > 2) && 604 if (msg_level < 0 && ((end - cur_index) > 2)) {
533 LOG_BUF(cur_index + 0) == '<' && 605 /* strip log prefix */
534 LOG_BUF(cur_index + 1) >= '0' && 606 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
535 LOG_BUF(cur_index + 1) <= '7' &&
536 LOG_BUF(cur_index + 2) == '>') {
537 msg_level = LOG_BUF(cur_index + 1) - '0';
538 cur_index += 3;
539 start_print = cur_index; 607 start_print = cur_index;
540 } 608 }
541 while (cur_index != end) { 609 while (cur_index != end) {
@@ -733,6 +801,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
733 unsigned long flags; 801 unsigned long flags;
734 int this_cpu; 802 int this_cpu;
735 char *p; 803 char *p;
804 size_t plen;
805 char special;
736 806
737 boot_delay_msec(); 807 boot_delay_msec();
738 printk_delay(); 808 printk_delay();
@@ -773,45 +843,52 @@ asmlinkage int vprintk(const char *fmt, va_list args)
773 printed_len += vscnprintf(printk_buf + printed_len, 843 printed_len += vscnprintf(printk_buf + printed_len,
774 sizeof(printk_buf) - printed_len, fmt, args); 844 sizeof(printk_buf) - printed_len, fmt, args);
775 845
776
777 p = printk_buf; 846 p = printk_buf;
778 847
779 /* Do we have a loglevel in the string? */ 848 /* Read log level and handle special printk prefix */
780 if (p[0] == '<') { 849 plen = log_prefix(p, &current_log_level, &special);
781 unsigned char c = p[1]; 850 if (plen) {
782 if (c && p[2] == '>') { 851 p += plen;
783 switch (c) { 852
784 case '0' ... '7': /* loglevel */ 853 switch (special) {
785 current_log_level = c - '0'; 854 case 'c': /* Strip <c> KERN_CONT, continue line */
786 /* Fallthrough - make sure we're on a new line */ 855 plen = 0;
787 case 'd': /* KERN_DEFAULT */ 856 break;
788 if (!new_text_line) { 857 case 'd': /* Strip <d> KERN_DEFAULT, start new line */
789 emit_log_char('\n'); 858 plen = 0;
790 new_text_line = 1; 859 default:
791 } 860 if (!new_text_line) {
792 /* Fallthrough - skip the loglevel */ 861 emit_log_char('\n');
793 case 'c': /* KERN_CONT */ 862 new_text_line = 1;
794 p += 3;
795 break;
796 } 863 }
797 } 864 }
798 } 865 }
799 866
800 /* 867 /*
801 * Copy the output into log_buf. If the caller didn't provide 868 * Copy the output into log_buf. If the caller didn't provide
802 * appropriate log level tags, we insert them here 869 * the appropriate log prefix, we insert them here
803 */ 870 */
804 for ( ; *p; p++) { 871 for (; *p; p++) {
805 if (new_text_line) { 872 if (new_text_line) {
806 /* Always output the token */
807 emit_log_char('<');
808 emit_log_char(current_log_level + '0');
809 emit_log_char('>');
810 printed_len += 3;
811 new_text_line = 0; 873 new_text_line = 0;
812 874
875 if (plen) {
876 /* Copy original log prefix */
877 int i;
878
879 for (i = 0; i < plen; i++)
880 emit_log_char(printk_buf[i]);
881 printed_len += plen;
882 } else {
883 /* Add log prefix */
884 emit_log_char('<');
885 emit_log_char(current_log_level + '0');
886 emit_log_char('>');
887 printed_len += 3;
888 }
889
813 if (printk_time) { 890 if (printk_time) {
814 /* Follow the token with the time */ 891 /* Add the current time stamp */
815 char tbuf[50], *tp; 892 char tbuf[50], *tp;
816 unsigned tlen; 893 unsigned tlen;
817 unsigned long long t; 894 unsigned long long t;
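After the rewrite, vprintk() copies a caller-supplied prefix through verbatim, inserts DEFAULT_MESSAGE_LOGLEVEL when none is given, and treats KERN_CONT as "no prefix, no forced newline". A short illustrative kernel-side snippet (not from this commit):

#include <linux/kernel.h>

static void example_logging(int id)
{
	/* explicit level: the "<4>" prefix is copied through unchanged */
	printk(KERN_WARNING "foo%d: device misbehaving", id);

	/* KERN_CONT: no level inserted, text continues the previous record */
	printk(KERN_CONT " (retrying)\n");

	/* no prefix at all: DEFAULT_MESSAGE_LOGLEVEL is inserted for us */
	printk("foo%d: back to normal\n", id);
}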
@@ -1160,6 +1237,11 @@ void console_unlock(void)
1160 local_irq_restore(flags); 1237 local_irq_restore(flags);
1161 } 1238 }
1162 console_locked = 0; 1239 console_locked = 0;
1240
1241 /* Release the exclusive_console once it is used */
1242 if (unlikely(exclusive_console))
1243 exclusive_console = NULL;
1244
1163 up(&console_sem); 1245 up(&console_sem);
1164 spin_unlock_irqrestore(&logbuf_lock, flags); 1246 spin_unlock_irqrestore(&logbuf_lock, flags);
1165 if (wake_klogd) 1247 if (wake_klogd)
@@ -1246,6 +1328,18 @@ void console_start(struct console *console)
1246} 1328}
1247EXPORT_SYMBOL(console_start); 1329EXPORT_SYMBOL(console_start);
1248 1330
1331static int __read_mostly keep_bootcon;
1332
1333static int __init keep_bootcon_setup(char *str)
1334{
1335 keep_bootcon = 1;
1336 printk(KERN_INFO "debug: skip boot console de-registration.\n");
1337
1338 return 0;
1339}
1340
1341early_param("keep_bootcon", keep_bootcon_setup);
1342
1249/* 1343/*
1250 * The console driver calls this routine during kernel initialization 1344 * The console driver calls this routine during kernel initialization
1251 * to register the console printing procedure with printk() and to 1345 * to register the console printing procedure with printk() and to
@@ -1382,6 +1476,12 @@ void register_console(struct console *newcon)
1382 spin_lock_irqsave(&logbuf_lock, flags); 1476 spin_lock_irqsave(&logbuf_lock, flags);
1383 con_start = log_start; 1477 con_start = log_start;
1384 spin_unlock_irqrestore(&logbuf_lock, flags); 1478 spin_unlock_irqrestore(&logbuf_lock, flags);
1479 /*
1480 * We're about to replay the log buffer. Only do this to the
1481 * just-registered console to avoid excessive message spam to
1482 * the already-registered consoles.
1483 */
1484 exclusive_console = newcon;
1385 } 1485 }
1386 console_unlock(); 1486 console_unlock();
1387 console_sysfs_notify(); 1487 console_sysfs_notify();
@@ -1393,7 +1493,9 @@ void register_console(struct console *newcon)
1393 * users know there might be something in the kernel's log buffer that 1493 * users know there might be something in the kernel's log buffer that
1394 * went to the bootconsole (that they do not see on the real console) 1494 * went to the bootconsole (that they do not see on the real console)
1395 */ 1495 */
1396 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { 1496 if (bcon &&
1497 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
1498 !keep_bootcon) {
1397 /* we need to iterate through twice, to make sure we print 1499 /* we need to iterate through twice, to make sure we print
1398 * everything out, before we unregister the console(s) 1500 * everything out, before we unregister the console(s)
1399 */ 1501 */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1708b1e2972d..dc7ab65f3b36 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h>
25 26
26 27
27/* 28/*
@@ -134,21 +135,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
134 return 0; 135 return 0;
135 rcu_read_lock(); 136 rcu_read_lock();
136 tcred = __task_cred(task); 137 tcred = __task_cred(task);
137 if ((cred->uid != tcred->euid || 138 if (cred->user->user_ns == tcred->user->user_ns &&
138 cred->uid != tcred->suid || 139 (cred->uid == tcred->euid &&
139 cred->uid != tcred->uid || 140 cred->uid == tcred->suid &&
140 cred->gid != tcred->egid || 141 cred->uid == tcred->uid &&
141 cred->gid != tcred->sgid || 142 cred->gid == tcred->egid &&
142 cred->gid != tcred->gid) && 143 cred->gid == tcred->sgid &&
143 !capable(CAP_SYS_PTRACE)) { 144 cred->gid == tcred->gid))
144 rcu_read_unlock(); 145 goto ok;
145 return -EPERM; 146 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
146 } 147 goto ok;
148 rcu_read_unlock();
149 return -EPERM;
150ok:
147 rcu_read_unlock(); 151 rcu_read_unlock();
148 smp_rmb(); 152 smp_rmb();
149 if (task->mm) 153 if (task->mm)
150 dumpable = get_dumpable(task->mm); 154 dumpable = get_dumpable(task->mm);
151 if (!dumpable && !capable(CAP_SYS_PTRACE)) 155 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
152 return -EPERM; 156 return -EPERM;
153 157
154 return security_ptrace_access_check(task, mode); 158 return security_ptrace_access_check(task, mode);
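__ptrace_may_access() is now user-namespace aware: matching credentials only count when both tasks share a user namespace, and the capability fallback checks CAP_SYS_PTRACE in the target's namespace via ns_capable()/task_ns_capable(). From user space the failure mode is still a plain EPERM, as in this sketch (not part of the commit):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(int argc, char **argv)
{
	pid_t pid;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	pid = (pid_t)atol(argv[1]);

	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) < 0) {
		/* EPERM: credentials differ, or no CAP_SYS_PTRACE in the
		 * target's user namespace */
		fprintf(stderr, "attach to %ld: %s\n", (long)pid, strerror(errno));
		return 1;
	}
	waitpid(pid, NULL, 0);		/* wait for the attach stop */
	ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return 0;
}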
@@ -163,7 +167,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
163 return !err; 167 return !err;
164} 168}
165 169
166int ptrace_attach(struct task_struct *task) 170static int ptrace_attach(struct task_struct *task)
167{ 171{
168 int retval; 172 int retval;
169 173
@@ -198,7 +202,7 @@ int ptrace_attach(struct task_struct *task)
198 goto unlock_tasklist; 202 goto unlock_tasklist;
199 203
200 task->ptrace = PT_PTRACED; 204 task->ptrace = PT_PTRACED;
201 if (capable(CAP_SYS_PTRACE)) 205 if (task_ns_capable(task, CAP_SYS_PTRACE))
202 task->ptrace |= PT_PTRACE_CAP; 206 task->ptrace |= PT_PTRACE_CAP;
203 207
204 __ptrace_link(task, current); 208 __ptrace_link(task, current);
@@ -219,7 +223,7 @@ out:
219 * Performs checks and sets PT_PTRACED. 223 * Performs checks and sets PT_PTRACED.
220 * Should be used by all ptrace implementations for PTRACE_TRACEME. 224 * Should be used by all ptrace implementations for PTRACE_TRACEME.
221 */ 225 */
222int ptrace_traceme(void) 226static int ptrace_traceme(void)
223{ 227{
224 int ret = -EPERM; 228 int ret = -EPERM;
225 229
@@ -293,7 +297,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
293 return false; 297 return false;
294} 298}
295 299
296int ptrace_detach(struct task_struct *child, unsigned int data) 300static int ptrace_detach(struct task_struct *child, unsigned int data)
297{ 301{
298 bool dead = false; 302 bool dead = false;
299 303
@@ -876,3 +880,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
876 return ret; 880 return ret;
877} 881}
878#endif /* CONFIG_COMPAT */ 882#endif /* CONFIG_COMPAT */
883
884#ifdef CONFIG_HAVE_HW_BREAKPOINT
885int ptrace_get_breakpoints(struct task_struct *tsk)
886{
887 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
888 return 0;
889
890 return -1;
891}
892
893void ptrace_put_breakpoints(struct task_struct *tsk)
894{
895 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
896 flush_ptrace_hw_breakpoint(tsk);
897}
898#endif /* CONFIG_HAVE_HW_BREAKPOINT */
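ptrace_get_breakpoints()/ptrace_put_breakpoints() give architecture ptrace code a refcount to pin a task's hardware-breakpoint state while it is being modified. A hedged sketch of a caller; example_set_hw_breakpoint() and arch_write_dr7() are invented stand-ins for the real per-arch code:

#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/sched.h>

/* Stand-in for the real architecture-specific debug-register update. */
static int arch_write_dr7(struct task_struct *child, unsigned long data)
{
	return 0;
}

static int example_set_hw_breakpoint(struct task_struct *child,
				     unsigned long data)
{
	int ret;

	/* fails once the child has started tearing down its breakpoints */
	if (ptrace_get_breakpoints(child))
		return -ESRCH;

	ret = arch_write_dr7(child, data);

	ptrace_put_breakpoints(child);
	return ret;
}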
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a976d1..f3240e987928 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -214,11 +214,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 214 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 215 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 216 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether
218 * or not we are in an RCU read-side critical section
219 * exists only in the preemptible RCU implementations
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
217 */ 222 */
218#ifndef CONFIG_PREEMPT
219 WARN_ON(1);
220 return 0;
221#else
222 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
223 irqs_disabled()) { 224 irqs_disabled()) {
224 WARN_ON(1); 225 WARN_ON(1);
@@ -229,7 +230,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
229 rcu_barrier_bh(); 230 rcu_barrier_bh();
230 debug_object_free(head, &rcuhead_debug_descr); 231 debug_object_free(head, &rcuhead_debug_descr);
231 return 1; 232 return 1;
232#endif
233 default: 233 default:
234 return 0; 234 return 0;
235 } 235 }
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 015abaea962a..3cb8e362e883 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -852,7 +852,7 @@ void exit_rcu(void)
852 if (t->rcu_read_lock_nesting == 0) 852 if (t->rcu_read_lock_nesting == 0)
853 return; 853 return;
854 t->rcu_read_lock_nesting = 1; 854 t->rcu_read_lock_nesting = 1;
855 rcu_read_unlock(); 855 __rcu_read_unlock();
856} 856}
857 857
858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 89613f97ff26..c224da41890c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,6 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
51 50
52MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
126 pos, buf, s - buf); 126 pos, buf, s - buf);
127} 127}
128 128
129#if BITS_PER_LONG == 32
130u64 res_counter_read_u64(struct res_counter *counter, int member)
131{
132 unsigned long flags;
133 u64 ret;
134
135 spin_lock_irqsave(&counter->lock, flags);
136 ret = *res_counter_member(counter, member);
137 spin_unlock_irqrestore(&counter->lock, flags);
138
139 return ret;
140}
141#else
129u64 res_counter_read_u64(struct res_counter *counter, int member) 142u64 res_counter_read_u64(struct res_counter *counter, int member)
130{ 143{
131 return *res_counter_member(counter, member); 144 return *res_counter_member(counter, member);
132} 145}
146#endif
133 147
134int res_counter_memparse_write_strategy(const char *buf, 148int res_counter_memparse_write_strategy(const char *buf,
135 unsigned long long *res) 149 unsigned long long *res)
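
The res_counter hunk above guards the u64 read with the counter's lock only when BITS_PER_LONG == 32: on a 32-bit machine a 64-bit load is two word-sized loads, so a reader racing with an updater can see a torn, half-updated value. The following is a userspace illustration of the difference using pthreads, not the kernel code; struct counter and the helper names are made up for the example.

#include <pthread.h>
#include <stdint.h>

struct counter {
    pthread_spinlock_t lock;
    uint64_t usage;
};

/* safe on any word size: reader and writer serialize on the lock */
static uint64_t counter_read_locked(struct counter *c)
{
    uint64_t val;

    pthread_spin_lock(&c->lock);
    val = c->usage;
    pthread_spin_unlock(&c->lock);
    return val;
}

/* fine on 64-bit (single aligned load), but may tear on 32-bit */
static uint64_t counter_read_plain(struct counter *c)
{
    return c->usage;
}

int main(void)
{
    struct counter c = { .usage = 0 };

    pthread_spin_init(&c.lock, PTHREAD_PROCESS_PRIVATE);
    c.usage = 1ULL << 40;
    return counter_read_locked(&c) == counter_read_plain(&c) ? 0 : 1;
}
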
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c8..3c7cbc2c33be 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
215 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 TRACE_WARN_ON(waiter->task);
219 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
220} 219}
221 220
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef1..5c9ccd380966 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
14#include <linux/sysdev.h> 13#include <linux/sysdev.h>
15#include <linux/timer.h> 14#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
27 int opcode; 26 int opcode;
28 int opdata; 27 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event; 29 int event;
32 struct sys_device sysdev; 30 struct sys_device sysdev;
33}; 31};
@@ -46,9 +44,8 @@ enum test_opcodes {
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ 44 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ 45 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ 46 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */ 47 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ 48 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ 49 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */ 50 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54}; 51};
@@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
74 td->mutexes[i] = 0; 71 td->mutexes[i] = 0;
75 } 72 }
76 } 73 }
77
78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
80 unlock_kernel();
81#endif
82 td->bkl = 0;
83 }
84 return 0; 74 return 0;
85 75
86 case RTTEST_RESETEVENT: 76 case RTTEST_RESETEVENT:
@@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
131 td->mutexes[id] = 0; 121 td->mutexes[id] = 0;
132 return 0; 122 return 0;
133 123
134 case RTTEST_LOCKBKL:
135 if (td->bkl)
136 return 0;
137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
139 lock_kernel();
140#endif
141 td->bkl = 4;
142 return 0;
143
144 case RTTEST_UNLOCKBKL:
145 if (td->bkl != 4)
146 break;
147#ifdef CONFIG_LOCK_KERNEL
148 unlock_kernel();
149#endif
150 td->bkl = 0;
151 return 0;
152
153 default: 124 default:
154 break; 125 break;
155 } 126 }
@@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
196 td->event = atomic_add_return(1, &rttest_event); 167 td->event = atomic_add_return(1, &rttest_event);
197 break; 168 break;
198 169
199 case RTTEST_LOCKBKL:
200 default: 170 default:
201 break; 171 break;
202 } 172 }
@@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
229 td->event = atomic_add_return(1, &rttest_event); 199 td->event = atomic_add_return(1, &rttest_event);
230 return; 200 return;
231 201
232 case RTTEST_LOCKBKL:
233 return;
234 default: 202 default:
235 return; 203 return;
236 } 204 }
@@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
380 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
381 349
382 curr += sprintf(curr, 350 curr += sprintf(curr,
383 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", 351 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
384 td->opcode, td->event, tsk->state, 352 td->opcode, td->event, tsk->state,
385 (MAX_RT_PRIO - 1) - tsk->prio, 353 (MAX_RT_PRIO - 1) - tsk->prio,
386 (MAX_RT_PRIO - 1) - tsk->normal_prio, 354 (MAX_RT_PRIO - 1) - tsk->normal_prio,
387 tsk->pi_blocked_on, td->bkl); 355 tsk->pi_blocked_on);
388 356
389 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) 357 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
390 curr += sprintf(curr, "%d", td->mutexes[i]); 358 curr += sprintf(curr, "%d", td->mutexes[i]);
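
One detail worth noting in the rtmutex-tester hunk: when the BKL opcodes are removed, their numeric slots (9 and 10) are left reserved and RTTEST_SIGNAL is pinned to 11, so the opcode values that existing test scripts pass in do not shift. A tiny illustrative enum of the same trick (names invented):

enum test_opcodes_sketch {
    OP_NOP = 0,
    OP_LOCK,                 /* 1 */
    /* 2, 3 - reserved for retired opcodes */
    OP_UNLOCK = 4,           /* pinned so later values stay stable */
    OP_RESET = 99,
};
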
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786a..ab449117aaf2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
20/* 20/*
21 * lock->owner state tracking: 21 * lock->owner state tracking:
22 * 22 *
23 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 23 * lock->owner holds the task_struct pointer of the owner. Bit 0
24 * are used to keep track of the "owner is pending" and "lock has 24 * is used to keep track of the "lock has waiters" state.
25 * waiters" state.
26 * 25 *
27 * owner bit1 bit0 26 * owner bit0
28 * NULL 0 0 lock is free (fast acquire possible) 27 * NULL 0 lock is free (fast acquire possible)
29 * NULL 0 1 invalid state 28 * NULL 1 lock is free and has waiters and the top waiter
30 * NULL 1 0 Transitional State* 29 * is going to take the lock*
31 * NULL 1 1 invalid state 30 * taskpointer 0 lock is held (fast release possible)
32 * taskpointer 0 0 lock is held (fast release possible) 31 * taskpointer 1 lock is held and has waiters**
33 * taskpointer 0 1 task is pending owner
34 * taskpointer 1 0 lock is held and has waiters
35 * taskpointer 1 1 task is pending owner and lock has more waiters
36 *
37 * Pending ownership is assigned to the top (highest priority)
38 * waiter of the lock, when the lock is released. The thread is woken
39 * up and can now take the lock. Until the lock is taken (bit 0
40 * cleared) a competing higher priority thread can steal the lock
41 * which puts the woken up thread back on the waiters list.
42 * 32 *
43 * The fast atomic compare exchange based acquire and release is only 33 * The fast atomic compare exchange based acquire and release is only
44 * possible when bit 0 and 1 of lock->owner are 0. 34 * possible when bit 0 of lock->owner is 0.
35 *
36 * (*) It also can be a transitional state when grabbing the lock
37 * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
38 * we need to set the bit0 before looking at the lock, and the owner may be
39 * NULL in this small time, hence this can be a transitional state.
45 * 40 *
46 * (*) There's a small time where the owner can be NULL and the 41 * (**) There is a small time when bit 0 is set but there are no
47 * "lock has waiters" bit is set. This can happen when grabbing the lock. 42 * waiters. This can happen when grabbing the lock in the slow path.
48 * To prevent a cmpxchg of the owner releasing the lock, we need to set this 43 * To prevent a cmpxchg of the owner releasing the lock, we need to
49 * bit before looking at the lock, hence the reason this is a transitional 44 * set this bit before looking at the lock.
50 * state.
51 */ 45 */
52 46
53static void 47static void
54rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 48rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
55 unsigned long mask)
56{ 49{
57 unsigned long val = (unsigned long)owner | mask; 50 unsigned long val = (unsigned long)owner;
58 51
59 if (rt_mutex_has_waiters(lock)) 52 if (rt_mutex_has_waiters(lock))
60 val |= RT_MUTEX_HAS_WAITERS; 53 val |= RT_MUTEX_HAS_WAITERS;
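
The rewritten lock->owner comment above describes a pointer-plus-flag encoding: the owner task pointer and the "has waiters" flag share one word, with bit 0 of the (aligned) pointer carrying the flag. A minimal plain-C sketch of that encoding follows; it is an illustration, not the kernel implementation, and struct task stands in for struct task_struct.

#include <stdint.h>

#define HAS_WAITERS   1UL
#define OWNER_MASKALL 1UL

struct task;                         /* stand-in for struct task_struct */

static uintptr_t encode_owner(struct task *owner, int has_waiters)
{
    return (uintptr_t)owner | (has_waiters ? HAS_WAITERS : 0);
}

static struct task *decode_owner(uintptr_t word)
{
    return (struct task *)(word & ~OWNER_MASKALL);
}

static int lock_has_waiters(uintptr_t word)
{
    return word & HAS_WAITERS;
}

int main(void)
{
    struct task *t = (struct task *)(uintptr_t)0x1000;  /* fake aligned ptr */
    uintptr_t w = encode_owner(t, 1);

    return (decode_owner(w) == t && lock_has_waiters(w)) ? 0 : 1;
}
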
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
203 * reached or the state of the chain has changed while we 196 * reached or the state of the chain has changed while we
204 * dropped the locks. 197 * dropped the locks.
205 */ 198 */
206 if (!waiter || !waiter->task) 199 if (!waiter)
207 goto out_unlock_pi; 200 goto out_unlock_pi;
208 201
209 /* 202 /*
210 * Check the orig_waiter state. After we dropped the locks, 203 * Check the orig_waiter state. After we dropped the locks,
211 * the previous owner of the lock might have released the lock 204 * the previous owner of the lock might have released the lock.
212 * and made us the pending owner:
213 */ 205 */
214 if (orig_waiter && !orig_waiter->task) 206 if (orig_waiter && !rt_mutex_owner(orig_lock))
215 goto out_unlock_pi; 207 goto out_unlock_pi;
216 208
217 /* 209 /*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 246
255 /* Release the task */ 247 /* Release the task */
256 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 248 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
249 if (!rt_mutex_owner(lock)) {
250 /*
251 * If the requeue above changed the top waiter, then we need
252 * to wake the new top waiter up to try to get the lock.
253 */
254
255 if (top_waiter != rt_mutex_top_waiter(lock))
256 wake_up_process(rt_mutex_top_waiter(lock)->task);
257 raw_spin_unlock(&lock->wait_lock);
258 goto out_put_task;
259 }
257 put_task_struct(task); 260 put_task_struct(task);
258 261
259 /* Grab the next task */ 262 /* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
296} 299}
297 300
298/* 301/*
299 * Optimization: check if we can steal the lock from the
300 * assigned pending owner [which might not have taken the
301 * lock yet]:
302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
305{
306 struct task_struct *pendowner = rt_mutex_owner(lock);
307 struct rt_mutex_waiter *next;
308 unsigned long flags;
309
310 if (!rt_mutex_owner_pending(lock))
311 return 0;
312
313 if (pendowner == task)
314 return 1;
315
316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) {
318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0;
320 }
321
322 /*
323 * Check if a waiter is enqueued on the pending owners
324 * pi_waiters list. Remove it and readjust pending owners
325 * priority.
326 */
327 if (likely(!rt_mutex_has_waiters(lock))) {
328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1;
330 }
331
332 /* No chain handling, pending owner is not blocked on anything: */
333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner);
336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337
338 /*
339 * We are going to steal the lock and a waiter was
340 * enqueued on the pending owners pi_waiters queue. So
341 * we have to enqueue this waiter into
342 * task->pi_waiters list. This covers the case,
343 * where task is boosted because it holds another
344 * lock and gets unboosted because the booster is
345 * interrupted, so we would delay a waiter with higher
346 * priority as task->normal_prio.
347 *
348 * Note: in the rare case of a SCHED_OTHER task changing
349 * its priority and thus stealing the lock, next->task
350 * might be task:
351 */
352 if (likely(next->task != task)) {
353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task);
356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 }
358 return 1;
359}
360
361/*
362 * Try to take an rt-mutex 302 * Try to take an rt-mutex
363 * 303 *
364 * This fails
365 * - when the lock has a real owner
366 * - when a different pending owner exists and has higher priority than current
367 *
368 * Must be called with lock->wait_lock held. 304 * Must be called with lock->wait_lock held.
305 *
306 * @lock: the lock to be acquired.
307 * @task: the task which wants to acquire the lock
308 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
369 */ 309 */
370static int try_to_take_rt_mutex(struct rt_mutex *lock) 310static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
311 struct rt_mutex_waiter *waiter)
371{ 312{
372 /* 313 /*
373 * We have to be careful here if the atomic speedups are 314 * We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
390 */ 331 */
391 mark_rt_mutex_waiters(lock); 332 mark_rt_mutex_waiters(lock);
392 333
393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) 334 if (rt_mutex_owner(lock))
394 return 0; 335 return 0;
395 336
337 /*
338 * It will get the lock because of one of these conditions:
339 * 1) there is no waiter
340 * 2) higher priority than waiters
341 * 3) it is top waiter
342 */
343 if (rt_mutex_has_waiters(lock)) {
344 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
345 if (!waiter || waiter != rt_mutex_top_waiter(lock))
346 return 0;
347 }
348 }
349
350 if (waiter || rt_mutex_has_waiters(lock)) {
351 unsigned long flags;
352 struct rt_mutex_waiter *top;
353
354 raw_spin_lock_irqsave(&task->pi_lock, flags);
355
356 /* remove the queued waiter. */
357 if (waiter) {
358 plist_del(&waiter->list_entry, &lock->wait_list);
359 task->pi_blocked_on = NULL;
360 }
361
362 /*
363 * We have to enqueue the top waiter(if it exists) into
364 * task->pi_waiters list.
365 */
366 if (rt_mutex_has_waiters(lock)) {
367 top = rt_mutex_top_waiter(lock);
368 top->pi_list_entry.prio = top->list_entry.prio;
369 plist_add(&top->pi_list_entry, &task->pi_waiters);
370 }
371 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
372 }
373
396 /* We got the lock. */ 374 /* We got the lock. */
397 debug_rt_mutex_lock(lock); 375 debug_rt_mutex_lock(lock);
398 376
399 rt_mutex_set_owner(lock, current, 0); 377 rt_mutex_set_owner(lock, task);
400 378
401 rt_mutex_deadlock_account_lock(lock, current); 379 rt_mutex_deadlock_account_lock(lock, task);
402 380
403 return 1; 381 return 1;
404} 382}
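
The rewritten try_to_take_rt_mutex() above reduces to a three-way gate: the task gets the lock if there are no waiters, if it has strictly better priority than the current top waiter, or if it is the top waiter itself. A condensed, illustrative version of just that decision (not kernel code; lower numeric prio means higher priority, as in the kernel):

int may_take_lock(int task_prio, int top_waiter_prio,
                  int has_waiters, int is_top_waiter)
{
    if (!has_waiters)
        return 1;                /* 1) nobody else is queued */
    if (task_prio < top_waiter_prio)
        return 1;                /* 2) strictly better than the top waiter */
    return is_top_waiter;        /* 3) it is the top waiter itself */
}
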
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
436 414
437 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 415 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 416
417 if (!owner)
418 return 0;
419
439 if (waiter == rt_mutex_top_waiter(lock)) { 420 if (waiter == rt_mutex_top_waiter(lock)) {
440 raw_spin_lock_irqsave(&owner->pi_lock, flags); 421 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 422 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
472/* 453/*
473 * Wake up the next waiter on the lock. 454 * Wake up the next waiter on the lock.
474 * 455 *
475 * Remove the top waiter from the current tasks waiter list and from 456 * Remove the top waiter from the current tasks waiter list and wake it up.
476 * the lock waiter list. Set it as pending owner. Then wake it up.
477 * 457 *
478 * Called with lock->wait_lock held. 458 * Called with lock->wait_lock held.
479 */ 459 */
480static void wakeup_next_waiter(struct rt_mutex *lock) 460static void wakeup_next_waiter(struct rt_mutex *lock)
481{ 461{
482 struct rt_mutex_waiter *waiter; 462 struct rt_mutex_waiter *waiter;
483 struct task_struct *pendowner;
484 unsigned long flags; 463 unsigned long flags;
485 464
486 raw_spin_lock_irqsave(&current->pi_lock, flags); 465 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 466
488 waiter = rt_mutex_top_waiter(lock); 467 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list);
490 468
491 /* 469 /*
492 * Remove it from current->pi_waiters. We do not adjust a 470 * Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
495 * lock->wait_lock. 473 * lock->wait_lock.
496 */ 474 */
497 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 475 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
498 pendowner = waiter->task;
499 waiter->task = NULL;
500 476
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 477 rt_mutex_set_owner(lock, NULL);
502 478
503 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 479 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 480
505 /* 481 wake_up_process(waiter->task);
506 * Clear the pi_blocked_on variable and enqueue a possible
507 * waiter into the pi_waiters list of the pending owner. This
508 * prevents that in case the pending owner gets unboosted a
509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner.
511 */
512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513
514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter);
516 WARN_ON(pendowner->pi_blocked_on->lock != lock);
517
518 pendowner->pi_blocked_on = NULL;
519
520 if (rt_mutex_has_waiters(lock)) {
521 struct rt_mutex_waiter *next;
522
523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 }
526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527
528 wake_up_process(pendowner);
529} 482}
530 483
531/* 484/*
532 * Remove a waiter from a lock 485 * Remove a waiter from a lock and give up
533 * 486 *
534 * Must be called with lock->wait_lock held 487 * Must be called with lock->wait_lock held and
488 * have just failed to try_to_take_rt_mutex().
535 */ 489 */
536static void remove_waiter(struct rt_mutex *lock, 490static void remove_waiter(struct rt_mutex *lock,
537 struct rt_mutex_waiter *waiter) 491 struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
543 497
544 raw_spin_lock_irqsave(&current->pi_lock, flags); 498 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 499 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 500 current->pi_blocked_on = NULL;
548 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 501 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 502
550 if (first && owner != current) { 503 if (!owner)
504 return;
505
506 if (first) {
551 507
552 raw_spin_lock_irqsave(&owner->pi_lock, flags); 508 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 509
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
614 * or TASK_UNINTERRUPTIBLE) 570 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none 571 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter 572 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 * 573 *
619 * lock->wait_lock must be held by the caller. 574 * lock->wait_lock must be held by the caller.
620 */ 575 */
621static int __sched 576static int __sched
622__rt_mutex_slowlock(struct rt_mutex *lock, int state, 577__rt_mutex_slowlock(struct rt_mutex *lock, int state,
623 struct hrtimer_sleeper *timeout, 578 struct hrtimer_sleeper *timeout,
624 struct rt_mutex_waiter *waiter, 579 struct rt_mutex_waiter *waiter)
625 int detect_deadlock)
626{ 580{
627 int ret = 0; 581 int ret = 0;
628 582
629 for (;;) { 583 for (;;) {
630 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
631 if (try_to_take_rt_mutex(lock)) 585 if (try_to_take_rt_mutex(lock, current, waiter))
632 break; 586 break;
633 587
634 /* 588 /*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
645 break; 599 break;
646 } 600 }
647 601
648 /*
649 * waiter->task is NULL the first time we come here and
650 * when we have been woken up by the previous owner
651 * but the lock got stolen by a higher prio task.
652 */
653 if (!waiter->task) {
654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
655 detect_deadlock);
656 /*
657 * If we got woken up by the owner then start loop
658 * all over without going into schedule to try
659 * to get the lock now:
660 */
661 if (unlikely(!waiter->task)) {
662 /*
663 * Reset the return value. We might
664 * have returned with -EDEADLK and the
665 * owner released the lock while we
666 * were walking the pi chain.
667 */
668 ret = 0;
669 continue;
670 }
671 if (unlikely(ret))
672 break;
673 }
674
675 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
676 603
677 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
678 605
679 if (waiter->task) 606 schedule_rt_mutex(lock);
680 schedule_rt_mutex(lock);
681 607
682 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 609 set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
698 int ret = 0; 624 int ret = 0;
699 625
700 debug_rt_mutex_init_waiter(&waiter); 626 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702 627
703 raw_spin_lock(&lock->wait_lock); 628 raw_spin_lock(&lock->wait_lock);
704 629
705 /* Try to acquire the lock again: */ 630 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 631 if (try_to_take_rt_mutex(lock, current, NULL)) {
707 raw_spin_unlock(&lock->wait_lock); 632 raw_spin_unlock(&lock->wait_lock);
708 return 0; 633 return 0;
709 } 634 }
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
717 timeout->task = NULL; 642 timeout->task = NULL;
718 } 643 }
719 644
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, 645 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
721 detect_deadlock); 646
647 if (likely(!ret))
648 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
722 649
723 set_current_state(TASK_RUNNING); 650 set_current_state(TASK_RUNNING);
724 651
725 if (unlikely(waiter.task)) 652 if (unlikely(ret))
726 remove_waiter(lock, &waiter); 653 remove_waiter(lock, &waiter);
727 654
728 /* 655 /*
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
737 if (unlikely(timeout)) 664 if (unlikely(timeout))
738 hrtimer_cancel(&timeout->timer); 665 hrtimer_cancel(&timeout->timer);
739 666
740 /*
741 * Readjust priority, when we did not get the lock. We might
742 * have been the pending owner and boosted. Since we did not
743 * take the lock, the PI boost has to go.
744 */
745 if (unlikely(ret))
746 rt_mutex_adjust_prio(current);
747
748 debug_rt_mutex_free_waiter(&waiter); 667 debug_rt_mutex_free_waiter(&waiter);
749 668
750 return ret; 669 return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
762 681
763 if (likely(rt_mutex_owner(lock) != current)) { 682 if (likely(rt_mutex_owner(lock) != current)) {
764 683
765 ret = try_to_take_rt_mutex(lock); 684 ret = try_to_take_rt_mutex(lock, current, NULL);
766 /* 685 /*
767 * try_to_take_rt_mutex() sets the lock waiters 686 * try_to_take_rt_mutex() sets the lock waiters
768 * bit unconditionally. Clean this up. 687 * bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
992{ 911{
993 __rt_mutex_init(lock, NULL); 912 __rt_mutex_init(lock, NULL);
994 debug_rt_mutex_proxy_lock(lock, proxy_owner); 913 debug_rt_mutex_proxy_lock(lock, proxy_owner);
995 rt_mutex_set_owner(lock, proxy_owner, 0); 914 rt_mutex_set_owner(lock, proxy_owner);
996 rt_mutex_deadlock_account_lock(lock, proxy_owner); 915 rt_mutex_deadlock_account_lock(lock, proxy_owner);
997} 916}
998 917
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1008 struct task_struct *proxy_owner) 927 struct task_struct *proxy_owner)
1009{ 928{
1010 debug_rt_mutex_proxy_unlock(lock); 929 debug_rt_mutex_proxy_unlock(lock);
1011 rt_mutex_set_owner(lock, NULL, 0); 930 rt_mutex_set_owner(lock, NULL);
1012 rt_mutex_deadlock_account_unlock(proxy_owner); 931 rt_mutex_deadlock_account_unlock(proxy_owner);
1013} 932}
1014 933
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1034 953
1035 raw_spin_lock(&lock->wait_lock); 954 raw_spin_lock(&lock->wait_lock);
1036 955
1037 mark_rt_mutex_waiters(lock); 956 if (try_to_take_rt_mutex(lock, task, NULL)) {
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 raw_spin_unlock(&lock->wait_lock); 957 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 958 return 1;
1046 } 959 }
1047 960
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 961 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049 962
1050 if (ret && !waiter->task) { 963 if (ret && !rt_mutex_owner(lock)) {
1051 /* 964 /*
1052 * Reset the return value. We might have 965 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner 966 * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 969 */
1057 ret = 0; 970 ret = 0;
1058 } 971 }
972
973 if (unlikely(ret))
974 remove_waiter(lock, waiter);
975
1059 raw_spin_unlock(&lock->wait_lock); 976 raw_spin_unlock(&lock->wait_lock);
1060 977
1061 debug_rt_mutex_print_deadlock(waiter); 978 debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1110 1027
1111 set_current_state(TASK_INTERRUPTIBLE); 1028 set_current_state(TASK_INTERRUPTIBLE);
1112 1029
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, 1030 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1114 detect_deadlock);
1115 1031
1116 set_current_state(TASK_RUNNING); 1032 set_current_state(TASK_RUNNING);
1117 1033
1118 if (unlikely(waiter->task)) 1034 if (unlikely(ret))
1119 remove_waiter(lock, waiter); 1035 remove_waiter(lock, waiter);
1120 1036
1121 /* 1037 /*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1126 1042
1127 raw_spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1128 1044
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret; 1045 return ret;
1138} 1046}
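
Taken together, the rtmutex.c changes above remove the "pending owner" hand-off: on unlock, wakeup_next_waiter() now just clears the owner (leaving the waiters bit set so the fast-path cmpxchg stays disabled) and wakes the top waiter, which then competes for the lock via try_to_take_rt_mutex(). The toy below illustrates only that single-word hand-off with C11 atomics; it is not the kernel code, which additionally manipulates the wait list and PI state under lock->wait_lock, and which re-sets the waiters bit if more waiters remain after the acquisition.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define HAS_WAITERS 1UL

static atomic_uintptr_t owner_word;

static void release_to_waiters(void)
{
    /* leave only the waiters bit: fast-path acquire/release stays off */
    atomic_store(&owner_word, HAS_WAITERS);
    /* ... then wake_up_process(top_waiter) in the real code ... */
}

static int waiter_take(uintptr_t me)
{
    uintptr_t expected = HAS_WAITERS;

    /* the woken waiter competes for the lock rather than being gifted it */
    return atomic_compare_exchange_strong(&owner_word, &expected, me);
}

int main(void)
{
    uintptr_t me = 0x2000;               /* fake, aligned task pointer value */

    atomic_store(&owner_word, 0x1000);   /* some other task owns the lock */
    release_to_waiters();
    printf("waiter got lock: %d\n", waiter_take(me));
    return 0;
}
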
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
91/* 91/*
92 * lock->owner state tracking: 92 * lock->owner state tracking:
93 */ 93 */
94#define RT_MUTEX_OWNER_PENDING 1UL 94#define RT_MUTEX_HAS_WAITERS 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL 95#define RT_MUTEX_OWNER_MASKALL 1UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97 96
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) 97static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{ 98{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); 100 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102} 101}
103 102
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/* 103/*
116 * PI-futex support (proxy locking functions, etc.): 104 * PI-futex support (proxy locking functions, etc.):
117 */ 105 */
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..312f8b95c2d4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h> 35#include <asm/mmu_context.h>
37#include <linux/interrupt.h> 36#include <linux/interrupt.h>
38#include <linux/capability.h> 37#include <linux/capability.h>
@@ -324,7 +323,7 @@ struct cfs_rq {
324 * 'curr' points to currently running entity on this cfs_rq. 323 * 'curr' points to currently running entity on this cfs_rq.
325 * It is set to NULL otherwise (i.e when none are currently running). 324 * It is set to NULL otherwise (i.e when none are currently running).
326 */ 325 */
327 struct sched_entity *curr, *next, *last; 326 struct sched_entity *curr, *next, *last, *skip;
328 327
329 unsigned int nr_spread_over; 328 unsigned int nr_spread_over;
330 329
@@ -606,9 +605,6 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct task_group *tg; 605 struct task_group *tg;
607 struct cgroup_subsys_state *css; 606 struct cgroup_subsys_state *css;
608 607
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
613 lockdep_is_held(&task_rq(p)->lock)); 609 lockdep_is_held(&task_rq(p)->lock));
614 tg = container_of(css, struct task_group, css); 610 tg = container_of(css, struct task_group, css);
@@ -664,10 +660,9 @@ static void update_rq_clock(struct rq *rq)
664#endif 660#endif
665 661
666/** 662/**
667 * runqueue_is_locked 663 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
668 * @cpu: the processor in question. 664 * @cpu: the processor in question.
669 * 665 *
670 * Returns true if the current cpu runqueue is locked.
671 * This interface allows printk to be called with the runqueue lock 666 * This interface allows printk to be called with the runqueue lock
672 * held and know whether or not it is OK to wake up the klogd. 667 * held and know whether or not it is OK to wake up the klogd.
673 */ 668 */
@@ -1686,6 +1681,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1686 __release(rq2->lock); 1681 __release(rq2->lock);
1687} 1682}
1688 1683
1684#else /* CONFIG_SMP */
1685
1686/*
1687 * double_rq_lock - safely lock two runqueues
1688 *
1689 * Note this does not disable interrupts like task_rq_lock,
1690 * you need to do so manually before calling.
1691 */
1692static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1693 __acquires(rq1->lock)
1694 __acquires(rq2->lock)
1695{
1696 BUG_ON(!irqs_disabled());
1697 BUG_ON(rq1 != rq2);
1698 raw_spin_lock(&rq1->lock);
1699 __acquire(rq2->lock); /* Fake it out ;) */
1700}
1701
1702/*
1703 * double_rq_unlock - safely unlock two runqueues
1704 *
1705 * Note this does not restore interrupts like task_rq_unlock,
1706 * you need to do so manually after calling.
1707 */
1708static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1709 __releases(rq1->lock)
1710 __releases(rq2->lock)
1711{
1712 BUG_ON(rq1 != rq2);
1713 raw_spin_unlock(&rq1->lock);
1714 __release(rq2->lock);
1715}
1716
1689#endif 1717#endif
1690 1718
1691static void calc_load_account_idle(struct rq *this_rq); 1719static void calc_load_account_idle(struct rq *this_rq);
@@ -1880,7 +1908,7 @@ void account_system_vtime(struct task_struct *curr)
1880 */ 1908 */
1881 if (hardirq_count()) 1909 if (hardirq_count())
1882 __this_cpu_add(cpu_hardirq_time, delta); 1910 __this_cpu_add(cpu_hardirq_time, delta);
1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1911 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1884 __this_cpu_add(cpu_softirq_time, delta); 1912 __this_cpu_add(cpu_softirq_time, delta);
1885 1913
1886 irq_time_write_end(); 1914 irq_time_write_end();
@@ -1920,8 +1948,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1920 sched_rt_avg_update(rq, irq_delta); 1948 sched_rt_avg_update(rq, irq_delta);
1921} 1949}
1922 1950
1951static int irqtime_account_hi_update(void)
1952{
1953 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1954 unsigned long flags;
1955 u64 latest_ns;
1956 int ret = 0;
1957
1958 local_irq_save(flags);
1959 latest_ns = this_cpu_read(cpu_hardirq_time);
1960 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1961 ret = 1;
1962 local_irq_restore(flags);
1963 return ret;
1964}
1965
1966static int irqtime_account_si_update(void)
1967{
1968 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1969 unsigned long flags;
1970 u64 latest_ns;
1971 int ret = 0;
1972
1973 local_irq_save(flags);
1974 latest_ns = this_cpu_read(cpu_softirq_time);
1975 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
1976 ret = 1;
1977 local_irq_restore(flags);
1978 return ret;
1979}
1980
1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 1981#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1924 1982
1983#define sched_clock_irqtime (0)
1984
1925static void update_rq_clock_task(struct rq *rq, s64 delta) 1985static void update_rq_clock_task(struct rq *rq, s64 delta)
1926{ 1986{
1927 rq->clock_task += delta; 1987 rq->clock_task += delta;
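
The new irqtime_account_hi_update()/irqtime_account_si_update() helpers above decide whether the next tick should be charged to the irq or softirq bucket: they compare the fine-grained per-cpu irq time against what cpustat has already absorbed and report whether any un-accounted time is outstanding. A simplified sketch of that comparison in plain 64-bit nanoseconds (the kernel converts to cputime64 and disables interrupts around the check; the struct and names here are illustrative only):

#include <stdint.h>

struct irq_accounting {
    uint64_t hardirq_time_ns;   /* accumulated by the irq entry/exit hooks */
    uint64_t accounted_ns;      /* amount already folded into cpustat->irq */
};

int hardirq_tick_pending(const struct irq_accounting *a)
{
    /* charge the next tick to the irq bucket only if time is outstanding */
    return a->hardirq_time_ns > a->accounted_ns;
}
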
@@ -2025,14 +2085,14 @@ inline int task_curr(const struct task_struct *p)
2025 2085
2026static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2086static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2027 const struct sched_class *prev_class, 2087 const struct sched_class *prev_class,
2028 int oldprio, int running) 2088 int oldprio)
2029{ 2089{
2030 if (prev_class != p->sched_class) { 2090 if (prev_class != p->sched_class) {
2031 if (prev_class->switched_from) 2091 if (prev_class->switched_from)
2032 prev_class->switched_from(rq, p, running); 2092 prev_class->switched_from(rq, p);
2033 p->sched_class->switched_to(rq, p, running); 2093 p->sched_class->switched_to(rq, p);
2034 } else 2094 } else if (oldprio != p->prio)
2035 p->sched_class->prio_changed(rq, p, oldprio, running); 2095 p->sched_class->prio_changed(rq, p, oldprio);
2036} 2096}
2037 2097
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2098static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
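
After this hunk, check_class_changed() fires the class-switch callbacks when the scheduling class changed and the priority callback only when the priority actually differs, instead of passing a "running" flag around. An illustrative dispatch with invented struct and function names (not the kernel types):

struct entity;

struct sched_ops {
    void (*switched_from)(struct entity *e);
    void (*switched_to)(struct entity *e);
    void (*prio_changed)(struct entity *e, int oldprio);
};

void notify_change(struct entity *e,
                   const struct sched_ops *prev,
                   const struct sched_ops *cur,
                   int oldprio, int newprio)
{
    if (prev != cur) {
        if (prev->switched_from)
            prev->switched_from(e);
        cur->switched_to(e);
    } else if (oldprio != newprio) {
        cur->prio_changed(e, oldprio);
    }
}
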
@@ -2224,7 +2284,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2224 * yield - it could be a while. 2284 * yield - it could be a while.
2225 */ 2285 */
2226 if (unlikely(on_rq)) { 2286 if (unlikely(on_rq)) {
2227 schedule_timeout_uninterruptible(1); 2287 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2288
2289 set_current_state(TASK_UNINTERRUPTIBLE);
2290 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2228 continue; 2291 continue;
2229 } 2292 }
2230 2293
@@ -2246,7 +2309,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2246 * Cause a process which is running on another CPU to enter 2309 * Cause a process which is running on another CPU to enter
2247 * kernel-mode, without any delay. (to get signals handled.) 2310 * kernel-mode, without any delay. (to get signals handled.)
2248 * 2311 *
2249 * NOTE: this function doesnt have to take the runqueue lock, 2312 * NOTE: this function doesn't have to take the runqueue lock,
2250 * because all it wants to ensure is that the remote task enters 2313 * because all it wants to ensure is that the remote task enters
2251 * the kernel. If the IPI races and the task has been migrated 2314 * the kernel. If the IPI races and the task has been migrated
2252 * to another CPU then no harm is done and the purpose has been 2315 * to another CPU then no harm is done and the purpose has been
@@ -2265,27 +2328,6 @@ void kick_process(struct task_struct *p)
2265EXPORT_SYMBOL_GPL(kick_process); 2328EXPORT_SYMBOL_GPL(kick_process);
2266#endif /* CONFIG_SMP */ 2329#endif /* CONFIG_SMP */
2267 2330
2268/**
2269 * task_oncpu_function_call - call a function on the cpu on which a task runs
2270 * @p: the task to evaluate
2271 * @func: the function to be called
2272 * @info: the function call argument
2273 *
2274 * Calls the function @func when the task is currently running. This might
2275 * be on the current CPU, which just calls the function directly
2276 */
2277void task_oncpu_function_call(struct task_struct *p,
2278 void (*func) (void *info), void *info)
2279{
2280 int cpu;
2281
2282 preempt_disable();
2283 cpu = task_cpu(p);
2284 if (task_curr(p))
2285 smp_call_function_single(cpu, func, info, 1);
2286 preempt_enable();
2287}
2288
2289#ifdef CONFIG_SMP 2331#ifdef CONFIG_SMP
2290/* 2332/*
2291 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2566,6 +2608,7 @@ static void __sched_fork(struct task_struct *p)
2566 p->se.sum_exec_runtime = 0; 2608 p->se.sum_exec_runtime = 0;
2567 p->se.prev_sum_exec_runtime = 0; 2609 p->se.prev_sum_exec_runtime = 0;
2568 p->se.nr_migrations = 0; 2610 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0;
2569 2612
2570#ifdef CONFIG_SCHEDSTATS 2613#ifdef CONFIG_SCHEDSTATS
2571 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2776,9 +2819,12 @@ static inline void
2776prepare_task_switch(struct rq *rq, struct task_struct *prev, 2819prepare_task_switch(struct rq *rq, struct task_struct *prev,
2777 struct task_struct *next) 2820 struct task_struct *next)
2778{ 2821{
2822 sched_info_switch(prev, next);
2823 perf_event_task_sched_out(prev, next);
2779 fire_sched_out_preempt_notifiers(prev, next); 2824 fire_sched_out_preempt_notifiers(prev, next);
2780 prepare_lock_switch(rq, next); 2825 prepare_lock_switch(rq, next);
2781 prepare_arch_switch(next); 2826 prepare_arch_switch(next);
2827 trace_sched_switch(prev, next);
2782} 2828}
2783 2829
2784/** 2830/**
@@ -2911,7 +2957,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2911 struct mm_struct *mm, *oldmm; 2957 struct mm_struct *mm, *oldmm;
2912 2958
2913 prepare_task_switch(rq, prev, next); 2959 prepare_task_switch(rq, prev, next);
2914 trace_sched_switch(prev, next); 2960
2915 mm = next->mm; 2961 mm = next->mm;
2916 oldmm = prev->active_mm; 2962 oldmm = prev->active_mm;
2917 /* 2963 /*
@@ -3568,6 +3614,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3568} 3614}
3569 3615
3570/* 3616/*
3617 * Account system cpu time to a process and desired cpustat field
3618 * @p: the process that the cpu time gets accounted to
3619 * @cputime: the cpu time spent in kernel space since the last update
3620 * @cputime_scaled: cputime scaled by cpu frequency
3621 * @target_cputime64: pointer to cpustat field that has to be updated
3622 */
3623static inline
3624void __account_system_time(struct task_struct *p, cputime_t cputime,
3625 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3626{
3627 cputime64_t tmp = cputime_to_cputime64(cputime);
3628
3629 /* Add system time to process. */
3630 p->stime = cputime_add(p->stime, cputime);
3631 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3632 account_group_system_time(p, cputime);
3633
3634 /* Add system time to cpustat. */
3635 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3636 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3637
3638 /* Account for system time used */
3639 acct_update_integrals(p);
3640}
3641
3642/*
3571 * Account system cpu time to a process. 3643 * Account system cpu time to a process.
3572 * @p: the process that the cpu time gets accounted to 3644 * @p: the process that the cpu time gets accounted to
3573 * @hardirq_offset: the offset to subtract from hardirq_count() 3645 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3650,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3578 cputime_t cputime, cputime_t cputime_scaled) 3650 cputime_t cputime, cputime_t cputime_scaled)
3579{ 3651{
3580 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3652 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3581 cputime64_t tmp; 3653 cputime64_t *target_cputime64;
3582 3654
3583 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3655 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3584 account_guest_time(p, cputime, cputime_scaled); 3656 account_guest_time(p, cputime, cputime_scaled);
3585 return; 3657 return;
3586 } 3658 }
3587 3659
3588 /* Add system time to process. */
3589 p->stime = cputime_add(p->stime, cputime);
3590 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3591 account_group_system_time(p, cputime);
3592
3593 /* Add system time to cpustat. */
3594 tmp = cputime_to_cputime64(cputime);
3595 if (hardirq_count() - hardirq_offset) 3660 if (hardirq_count() - hardirq_offset)
3596 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3661 target_cputime64 = &cpustat->irq;
3597 else if (in_serving_softirq()) 3662 else if (in_serving_softirq())
3598 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3663 target_cputime64 = &cpustat->softirq;
3599 else 3664 else
3600 cpustat->system = cputime64_add(cpustat->system, tmp); 3665 target_cputime64 = &cpustat->system;
3601
3602 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3603 3666
3604 /* Account for system time used */ 3667 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3605 acct_update_integrals(p);
3606} 3668}
3607 3669
3608/* 3670/*
3609 * Account for involuntary wait time. 3671 * Account for involuntary wait time.
3610 * @steal: the cpu time spent in involuntary wait 3672 * @cputime: the cpu time spent in involuntary wait
3611 */ 3673 */
3612void account_steal_time(cputime_t cputime) 3674void account_steal_time(cputime_t cputime)
3613{ 3675{
@@ -3635,6 +3697,73 @@ void account_idle_time(cputime_t cputime)
3635 3697
3636#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3698#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3637 3699
3700#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3701/*
3702 * Account a tick to a process and cpustat
3703 * @p: the process that the cpu time gets accounted to
3704 * @user_tick: is the tick from userspace
3705 * @rq: the pointer to rq
3706 *
3707 * Tick demultiplexing follows the order
3708 * - pending hardirq update
3709 * - pending softirq update
3710 * - user_time
3711 * - idle_time
3712 * - system time
3713 * - check for guest_time
3714 * - else account as system_time
3715 *
3716 * Check for hardirq is done both for system and user time as there is
3717 * no timer going off while we are on hardirq and hence we may never get an
3718 * opportunity to update it solely in system time.
3719 * p->stime and friends are only updated on system time and not on irq
3720 * softirq as those do not count in task exec_runtime any more.
3721 */
3722static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3723 struct rq *rq)
3724{
3725 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3726 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3727 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3728
3729 if (irqtime_account_hi_update()) {
3730 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3731 } else if (irqtime_account_si_update()) {
3732 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3733 } else if (this_cpu_ksoftirqd() == p) {
3734 /*
3735 * ksoftirqd time do not get accounted in cpu_softirq_time.
3736 * So, we have to handle it separately here.
3737 * Also, p->stime needs to be updated for ksoftirqd.
3738 */
3739 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3740 &cpustat->softirq);
3741 } else if (user_tick) {
3742 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3743 } else if (p == rq->idle) {
3744 account_idle_time(cputime_one_jiffy);
3745 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3746 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3747 } else {
3748 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3749 &cpustat->system);
3750 }
3751}
3752
3753static void irqtime_account_idle_ticks(int ticks)
3754{
3755 int i;
3756 struct rq *rq = this_rq();
3757
3758 for (i = 0; i < ticks; i++)
3759 irqtime_account_process_tick(current, 0, rq);
3760}
3761#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3762static void irqtime_account_idle_ticks(int ticks) {}
3763static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3764 struct rq *rq) {}
3765#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3766
3638/* 3767/*
3639 * Account a single tick of cpu time. 3768 * Account a single tick of cpu time.
3640 * @p: the process that the cpu time gets accounted to 3769 * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3774,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3645 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3774 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3646 struct rq *rq = this_rq(); 3775 struct rq *rq = this_rq();
3647 3776
3777 if (sched_clock_irqtime) {
3778 irqtime_account_process_tick(p, user_tick, rq);
3779 return;
3780 }
3781
3648 if (user_tick) 3782 if (user_tick)
3649 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3783 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3650 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3784 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3804,12 @@ void account_steal_ticks(unsigned long ticks)
3670 */ 3804 */
3671void account_idle_ticks(unsigned long ticks) 3805void account_idle_ticks(unsigned long ticks)
3672{ 3806{
3807
3808 if (sched_clock_irqtime) {
3809 irqtime_account_idle_ticks(ticks);
3810 return;
3811 }
3812
3673 account_idle_time(jiffies_to_cputime(ticks)); 3813 account_idle_time(jiffies_to_cputime(ticks));
3674} 3814}
3675 3815
@@ -3945,9 +4085,6 @@ need_resched:
3945 rcu_note_context_switch(cpu); 4085 rcu_note_context_switch(cpu);
3946 prev = rq->curr; 4086 prev = rq->curr;
3947 4087
3948 release_kernel_lock(prev);
3949need_resched_nonpreemptible:
3950
3951 schedule_debug(prev); 4088 schedule_debug(prev);
3952 4089
3953 if (sched_feat(HRTICK)) 4090 if (sched_feat(HRTICK))
@@ -3974,6 +4111,16 @@ need_resched_nonpreemptible:
3974 try_to_wake_up_local(to_wakeup); 4111 try_to_wake_up_local(to_wakeup);
3975 } 4112 }
3976 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4113 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4114
4115 /*
4116 * If we are going to sleep and we have plugged IO queued, make
4117 * sure to submit it to avoid deadlocks.
4118 */
4119 if (blk_needs_flush_plug(prev)) {
4120 raw_spin_unlock(&rq->lock);
4121 blk_schedule_flush_plug(prev);
4122 raw_spin_lock(&rq->lock);
4123 }
3977 } 4124 }
3978 switch_count = &prev->nvcsw; 4125 switch_count = &prev->nvcsw;
3979 } 4126 }
@@ -3989,9 +4136,6 @@ need_resched_nonpreemptible:
3989 rq->skip_clock_update = 0; 4136 rq->skip_clock_update = 0;
3990 4137
3991 if (likely(prev != next)) { 4138 if (likely(prev != next)) {
3992 sched_info_switch(prev, next);
3993 perf_event_task_sched_out(prev, next);
3994
3995 rq->nr_switches++; 4139 rq->nr_switches++;
3996 rq->curr = next; 4140 rq->curr = next;
3997 ++*switch_count; 4141 ++*switch_count;
@@ -4010,9 +4154,6 @@ need_resched_nonpreemptible:
4010 4154
4011 post_schedule(rq); 4155 post_schedule(rq);
4012 4156
4013 if (unlikely(reacquire_kernel_lock(prev)))
4014 goto need_resched_nonpreemptible;
4015
4016 preempt_enable_no_resched(); 4157 preempt_enable_no_resched();
4017 if (need_resched()) 4158 if (need_resched())
4018 goto need_resched; 4159 goto need_resched;
@@ -4213,6 +4354,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4213{ 4354{
4214 __wake_up_common(q, mode, 1, 0, key); 4355 __wake_up_common(q, mode, 1, 0, key);
4215} 4356}
4357EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4216 4358
4217/** 4359/**
4218 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4360 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4570,11 +4712,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4570 4712
4571 if (running) 4713 if (running)
4572 p->sched_class->set_curr_task(rq); 4714 p->sched_class->set_curr_task(rq);
4573 if (on_rq) { 4715 if (on_rq)
4574 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4575 4717
4576 check_class_changed(rq, p, prev_class, oldprio, running); 4718 check_class_changed(rq, p, prev_class, oldprio);
4577 }
4578 task_rq_unlock(rq, &flags); 4719 task_rq_unlock(rq, &flags);
4579} 4720}
4580 4721
@@ -4761,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p)
4761 4902
4762 rcu_read_lock(); 4903 rcu_read_lock();
4763 pcred = __task_cred(p); 4904 pcred = __task_cred(p);
4764 match = (cred->euid == pcred->euid || 4905 if (cred->user->user_ns == pcred->user->user_ns)
4765 cred->euid == pcred->uid); 4906 match = (cred->euid == pcred->euid ||
4907 cred->euid == pcred->uid);
4908 else
4909 match = false;
4766 rcu_read_unlock(); 4910 rcu_read_unlock();
4767 return match; 4911 return match;
4768} 4912}
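
The check_same_owner() hunk above adds a user-namespace guard: uid/euid values are only comparable when both tasks live in the same user namespace, and the ownership check simply fails across namespaces. A short sketch of that rule with illustrative types (not the kernel's struct cred):

#include <stdbool.h>

struct user_ns;

struct creds {
    unsigned int uid, euid;
    const struct user_ns *ns;
};

bool same_owner(const struct creds *me, const struct creds *other)
{
    if (me->ns != other->ns)
        return false;                   /* different uid number spaces */
    return me->euid == other->euid || me->euid == other->uid;
}
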
@@ -4822,12 +4966,15 @@ recheck:
4822 param->sched_priority > rlim_rtprio) 4966 param->sched_priority > rlim_rtprio)
4823 return -EPERM; 4967 return -EPERM;
4824 } 4968 }
4969
4825 /* 4970 /*
4826 * Like positive nice levels, dont allow tasks to 4971 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4827 * move out of SCHED_IDLE either: 4972 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4828 */ 4973 */
4829 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4974 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4830 return -EPERM; 4975 if (!can_nice(p, TASK_NICE(p)))
4976 return -EPERM;
4977 }
4831 4978
4832 /* can't change other user's priorities */ 4979 /* can't change other user's priorities */
4833 if (!check_same_owner(p)) 4980 if (!check_same_owner(p))
@@ -4850,7 +4997,7 @@ recheck:
4850 */ 4997 */
4851 raw_spin_lock_irqsave(&p->pi_lock, flags); 4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4852 /* 4999 /*
4853 * To be able to change p->policy safely, the apropriate 5000 * To be able to change p->policy safely, the appropriate
4854 * runqueue lock must be held. 5001 * runqueue lock must be held.
4855 */ 5002 */
4856 rq = __task_rq_lock(p); 5003 rq = __task_rq_lock(p);
@@ -4864,6 +5011,17 @@ recheck:
4864 return -EINVAL; 5011 return -EINVAL;
4865 } 5012 }
4866 5013
5014 /*
5015 * If not changing anything there's no need to proceed further:
5016 */
5017 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5018 param->sched_priority == p->rt_priority))) {
5019
5020 __task_rq_unlock(rq);
5021 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5022 return 0;
5023 }
5024
4867#ifdef CONFIG_RT_GROUP_SCHED 5025#ifdef CONFIG_RT_GROUP_SCHED
4868 if (user) { 5026 if (user) {
4869 /* 5027 /*
@@ -4902,11 +5060,10 @@ recheck:
4902 5060
4903 if (running) 5061 if (running)
4904 p->sched_class->set_curr_task(rq); 5062 p->sched_class->set_curr_task(rq);
4905 if (on_rq) { 5063 if (on_rq)
4906 activate_task(rq, p, 0); 5064 activate_task(rq, p, 0);
4907 5065
4908 check_class_changed(rq, p, prev_class, oldprio, running); 5066 check_class_changed(rq, p, prev_class, oldprio);
4909 }
4910 __task_rq_unlock(rq); 5067 __task_rq_unlock(rq);
4911 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4912 5069
@@ -5088,7 +5245,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5088 goto out_free_cpus_allowed; 5245 goto out_free_cpus_allowed;
5089 } 5246 }
5090 retval = -EPERM; 5247 retval = -EPERM;
5091 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5248 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5092 goto out_unlock; 5249 goto out_unlock;
5093 5250
5094 retval = security_task_setscheduler(p); 5251 retval = security_task_setscheduler(p);
@@ -5323,6 +5480,67 @@ void __sched yield(void)
5323} 5480}
5324EXPORT_SYMBOL(yield); 5481EXPORT_SYMBOL(yield);
5325 5482
5483/**
5484 * yield_to - yield the current processor to another thread in
5485 * your thread group, or accelerate that thread toward the
5486 * processor it's on.
5487 * @p: target task
5488 * @preempt: whether task preemption is allowed or not
5489 *
5490 * It's the caller's job to ensure that the target task struct
5491 * can't go away on us before we can do any checks.
5492 *
5493 * Returns true if we indeed boosted the target task.
5494 */
5495bool __sched yield_to(struct task_struct *p, bool preempt)
5496{
5497 struct task_struct *curr = current;
5498 struct rq *rq, *p_rq;
5499 unsigned long flags;
5500 bool yielded = 0;
5501
5502 local_irq_save(flags);
5503 rq = this_rq();
5504
5505again:
5506 p_rq = task_rq(p);
5507 double_rq_lock(rq, p_rq);
5508 while (task_rq(p) != p_rq) {
5509 double_rq_unlock(rq, p_rq);
5510 goto again;
5511 }
5512
5513 if (!curr->sched_class->yield_to_task)
5514 goto out;
5515
5516 if (curr->sched_class != p->sched_class)
5517 goto out;
5518
5519 if (task_running(p_rq, p) || p->state)
5520 goto out;
5521
5522 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5523 if (yielded) {
5524 schedstat_inc(rq, yld_count);
5525 /*
5526 * Make p's CPU reschedule; pick_next_entity takes care of
5527 * fairness.
5528 */
5529 if (preempt && rq != p_rq)
5530 resched_task(p_rq->curr);
5531 }
5532
5533out:
5534 double_rq_unlock(rq, p_rq);
5535 local_irq_restore(flags);
5536
5537 if (yielded)
5538 schedule();
5539
5540 return yielded;
5541}
5542EXPORT_SYMBOL_GPL(yield_to);
5543
5326/* 5544/*
5327 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5545 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5328 * that process accounting knows that this is a task in IO wait state. 5546 * that process accounting knows that this is a task in IO wait state.
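
The new yield_to() above uses a lock-then-recheck loop to pin down the target's runqueue: task_rq(p) can change between reading it and locking it, so both runqueues are locked, the read is repeated, and the whole step retried if the task migrated in between. The pthread-based sketch below shows only that retry idiom; it is a userspace analogue with invented types (struct rq, struct task, lock_pair), not the kernel's double_rq_lock machinery.

#include <pthread.h>
#include <stdatomic.h>

struct rq {
    pthread_mutex_t lock;
};

struct task {
    _Atomic(struct rq *) rq;        /* which runqueue the task is on now */
};

static void lock_pair(struct rq *a, struct rq *b)
{
    /* lock in address order to avoid deadlock; same queue: lock once */
    if (a == b) {
        pthread_mutex_lock(&a->lock);
    } else if (a < b) {
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
    } else {
        pthread_mutex_lock(&b->lock);
        pthread_mutex_lock(&a->lock);
    }
}

static void unlock_pair(struct rq *a, struct rq *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

static struct rq *lock_task_rq(struct rq *my_rq, struct task *p)
{
    struct rq *p_rq;

    for (;;) {
        p_rq = atomic_load(&p->rq);     /* racy read of the target's rq */
        lock_pair(my_rq, p_rq);
        if (atomic_load(&p->rq) == p_rq)
            return p_rq;                /* still on the rq we locked */
        unlock_pair(my_rq, p_rq);       /* it migrated meanwhile: retry */
    }
}

int main(void)
{
    struct rq rq0 = { PTHREAD_MUTEX_INITIALIZER };
    struct rq rq1 = { PTHREAD_MUTEX_INITIALIZER };
    struct task t = { &rq1 };
    struct rq *locked = lock_task_rq(&rq0, &t);

    unlock_pair(&rq0, locked);
    return locked == &rq1 ? 0 : 1;
}
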
@@ -5333,6 +5551,7 @@ void __sched io_schedule(void)
5333 5551
5334 delayacct_blkio_start(); 5552 delayacct_blkio_start();
5335 atomic_inc(&rq->nr_iowait); 5553 atomic_inc(&rq->nr_iowait);
5554 blk_flush_plug(current);
5336 current->in_iowait = 1; 5555 current->in_iowait = 1;
5337 schedule(); 5556 schedule();
5338 current->in_iowait = 0; 5557 current->in_iowait = 0;
@@ -5348,6 +5567,7 @@ long __sched io_schedule_timeout(long timeout)
5348 5567
5349 delayacct_blkio_start(); 5568 delayacct_blkio_start();
5350 atomic_inc(&rq->nr_iowait); 5569 atomic_inc(&rq->nr_iowait);
5570 blk_flush_plug(current);
5351 current->in_iowait = 1; 5571 current->in_iowait = 1;
5352 ret = schedule_timeout(timeout); 5572 ret = schedule_timeout(timeout);
5353 current->in_iowait = 0; 5573 current->in_iowait = 0;
@@ -5496,7 +5716,7 @@ void show_state_filter(unsigned long state_filter)
5496 do_each_thread(g, p) { 5716 do_each_thread(g, p) {
5497 /* 5717 /*
5498 * reset the NMI-timeout, listing all files on a slow 5718 * reset the NMI-timeout, listing all files on a slow
5499 * console might take alot of time: 5719 * console might take a lot of time:
5500 */ 5720 */
5501 touch_nmi_watchdog(); 5721 touch_nmi_watchdog();
5502 if (!state_filter || (p->state & state_filter)) 5722 if (!state_filter || (p->state & state_filter))
@@ -5571,7 +5791,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5571 * The idle tasks have their own, simple scheduling class: 5791 * The idle tasks have their own, simple scheduling class:
5572 */ 5792 */
5573 idle->sched_class = &idle_sched_class; 5793 idle->sched_class = &idle_sched_class;
5574 ftrace_graph_init_task(idle); 5794 ftrace_graph_init_idle_task(idle, cpu);
5575} 5795}
5576 5796
5577/* 5797/*
@@ -6111,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6111 break; 6331 break;
6112#endif 6332#endif
6113 } 6333 }
6334
6335 update_max_interval();
6336
6114 return NOTIFY_OK; 6337 return NOTIFY_OK;
6115} 6338}
6116 6339
@@ -7796,6 +8019,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7796 INIT_LIST_HEAD(&cfs_rq->tasks); 8019 INIT_LIST_HEAD(&cfs_rq->tasks);
7797#ifdef CONFIG_FAIR_GROUP_SCHED 8020#ifdef CONFIG_FAIR_GROUP_SCHED
7798 cfs_rq->rq = rq; 8021 cfs_rq->rq = rq;
8022 /* allow initial update_cfs_load() to truncate */
8023#ifdef CONFIG_SMP
8024 cfs_rq->load_stamp = 1;
8025#endif
7799#endif 8026#endif
7800 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8027 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7801} 8028}
@@ -8074,7 +8301,7 @@ static inline int preempt_count_equals(int preempt_offset)
8074{ 8301{
8075 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8302 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8076 8303
8077 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 8304 return (nested == preempt_offset);
8078} 8305}
8079 8306
8080void __might_sleep(const char *file, int line, int preempt_offset) 8307void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8109,6 +8336,8 @@ EXPORT_SYMBOL(__might_sleep);
8109#ifdef CONFIG_MAGIC_SYSRQ 8336#ifdef CONFIG_MAGIC_SYSRQ
8110static void normalize_task(struct rq *rq, struct task_struct *p) 8337static void normalize_task(struct rq *rq, struct task_struct *p)
8111{ 8338{
8339 const struct sched_class *prev_class = p->sched_class;
8340 int old_prio = p->prio;
8112 int on_rq; 8341 int on_rq;
8113 8342
8114 on_rq = p->se.on_rq; 8343 on_rq = p->se.on_rq;
@@ -8119,6 +8348,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8119 activate_task(rq, p, 0); 8348 activate_task(rq, p, 0);
8120 resched_task(rq->curr); 8349 resched_task(rq->curr);
8121 } 8350 }
8351
8352 check_class_changed(rq, p, prev_class, old_prio);
8122} 8353}
8123 8354
8124void normalize_rt_tasks(void) 8355void normalize_rt_tasks(void)
@@ -8234,7 +8465,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8234{ 8465{
8235 struct cfs_rq *cfs_rq; 8466 struct cfs_rq *cfs_rq;
8236 struct sched_entity *se; 8467 struct sched_entity *se;
8237 struct rq *rq;
8238 int i; 8468 int i;
8239 8469
8240 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8470 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8247,8 +8477,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8247 tg->shares = NICE_0_LOAD; 8477 tg->shares = NICE_0_LOAD;
8248 8478
8249 for_each_possible_cpu(i) { 8479 for_each_possible_cpu(i) {
8250 rq = cpu_rq(i);
8251
8252 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8480 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8253 GFP_KERNEL, cpu_to_node(i)); 8481 GFP_KERNEL, cpu_to_node(i));
8254 if (!cfs_rq) 8482 if (!cfs_rq)
@@ -8510,7 +8738,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8510 /* Propagate contribution to hierarchy */ 8738 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags); 8739 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se) 8740 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0); 8741 update_cfs_shares(group_cfs_rq(se));
8514 raw_spin_unlock_irqrestore(&rq->lock, flags); 8742 raw_spin_unlock_irqrestore(&rq->lock, flags);
8515 } 8743 }
8516 8744
@@ -8884,7 +9112,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8884} 9112}
8885 9113
8886static void 9114static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) 9115cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9116 struct cgroup *old_cgrp, struct task_struct *task)
8888{ 9117{
8889 /* 9118 /*
8890 * cgroup_exit() is called in the copy_process() failure path. 9119 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 9fb656283157..429242f3c484 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr;
12static void __init autogroup_init(struct task_struct *init_task) 12static void __init autogroup_init(struct task_struct *init_task)
13{ 13{
14 autogroup_default.tg = &root_task_group; 14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref); 15 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock); 16 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default; 17 init_task->signal->autogroup = &autogroup_default;
@@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
130 129
131static inline bool task_group_is_autogroup(struct task_group *tg) 130static inline bool task_group_is_autogroup(struct task_group *tg)
132{ 131{
133 return tg != &root_task_group && tg->autogroup; 132 return !!tg->autogroup;
134} 133}
135 134
136static inline struct task_group * 135static inline struct task_group *
@@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
161 160
162 p->signal->autogroup = autogroup_kref_get(ag); 161 p->signal->autogroup = autogroup_kref_get(ag);
163 162
163 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
164 goto out;
165
164 t = p; 166 t = p;
165 do { 167 do {
166 sched_move_task(t); 168 sched_move_task(t);
167 } while_each_thread(p, t); 169 } while_each_thread(p, t);
168 170
171out:
169 unlock_task_sighand(p, &flags); 172 unlock_task_sighand(p, &flags);
170 autogroup_kref_put(prev); 173 autogroup_kref_put(prev);
171} 174}
@@ -176,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p)
176 struct autogroup *ag = autogroup_create(); 179 struct autogroup *ag = autogroup_create();
177 180
178 autogroup_move_group(p, ag); 181 autogroup_move_group(p, ag);
179 /* drop extra refrence added by autogroup_create() */ 182 /* drop extra reference added by autogroup_create() */
180 autogroup_kref_put(ag); 183 autogroup_kref_put(ag);
181} 184}
182EXPORT_SYMBOL(sched_autogroup_create_attach); 185EXPORT_SYMBOL(sched_autogroup_create_attach);
@@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
247{ 250{
248 struct autogroup *ag = autogroup_task_get(p); 251 struct autogroup *ag = autogroup_task_get(p);
249 252
253 if (!task_group_is_autogroup(ag->tg))
254 goto out;
255
250 down_read(&ag->lock); 256 down_read(&ag->lock);
251 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); 257 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
252 up_read(&ag->lock); 258 up_read(&ag->lock);
253 259
260out:
254 autogroup_kref_put(ag); 261 autogroup_kref_put(ag);
255} 262}
256#endif /* CONFIG_PROC_FS */ 263#endif /* CONFIG_PROC_FS */
@@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
258#ifdef CONFIG_SCHED_DEBUG 265#ifdef CONFIG_SCHED_DEBUG
259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
260{ 267{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 268 if (!task_group_is_autogroup(tg))
262
263 if (!enabled || !tg->autogroup)
264 return 0; 269 return 0;
265 270
266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 271 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 7b859ffe5dad..05577055cfca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,6 +1,11 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3struct autogroup { 3struct autogroup {
4 /*
 5 * The reference count does not track how many threads are
 6 * attached to this autogroup right now; it is the number of
 7 * tasks that could use this autogroup.
8 */
4 struct kref kref; 9 struct kref kref;
5 struct task_group *tg; 10 struct task_group *tg;
6 struct rw_semaphore lock; 11 struct rw_semaphore lock;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd075..7bacd83a4158 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
179 179
180 raw_spin_lock_irqsave(&rq->lock, flags); 180 raw_spin_lock_irqsave(&rq->lock, flags);
181 if (cfs_rq->rb_leftmost) 181 if (cfs_rq->rb_leftmost)
182 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 182 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
183 last = __pick_last_entity(cfs_rq); 183 last = __pick_last_entity(cfs_rq);
184 if (last) 184 if (last)
185 max_vruntime = last->vruntime; 185 max_vruntime = last->vruntime;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..6fa833ab2cb8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h>
25 26
26/* 27/*
27 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 70unsigned int sysctl_sched_child_runs_first __read_mostly;
70 71
71/* 72/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 73 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 74 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 75 *
@@ -419,7 +412,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
419 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 412 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
420} 413}
421 414
422static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 415static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
423{ 416{
424 struct rb_node *left = cfs_rq->rb_leftmost; 417 struct rb_node *left = cfs_rq->rb_leftmost;
425 418
@@ -429,6 +422,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
429 return rb_entry(left, struct sched_entity, run_node); 422 return rb_entry(left, struct sched_entity, run_node);
430} 423}
431 424
425static struct sched_entity *__pick_next_entity(struct sched_entity *se)
426{
427 struct rb_node *next = rb_next(&se->run_node);
428
429 if (!next)
430 return NULL;
431
432 return rb_entry(next, struct sched_entity, run_node);
433}
434
435#ifdef CONFIG_SCHED_DEBUG
432static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 436static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
433{ 437{
434 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 438 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +447,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
443 * Scheduling class statistics methods: 447 * Scheduling class statistics methods:
444 */ 448 */
445 449
446#ifdef CONFIG_SCHED_DEBUG
447int sched_proc_update_handler(struct ctl_table *table, int write, 450int sched_proc_update_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, 451 void __user *buffer, size_t *lenp,
449 loff_t *ppos) 452 loff_t *ppos)
@@ -540,7 +543,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
540} 543}
541 544
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); 545static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); 546static void update_cfs_shares(struct cfs_rq *cfs_rq);
544 547
545/* 548/*
546 * Update the current task's runtime statistics. Skip current tasks that 549 * Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +736,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
733 now - cfs_rq->load_last > 4 * period) { 736 now - cfs_rq->load_last > 4 * period) {
734 cfs_rq->load_period = 0; 737 cfs_rq->load_period = 0;
735 cfs_rq->load_avg = 0; 738 cfs_rq->load_avg = 0;
739 delta = period - 1;
736 } 740 }
737 741
738 cfs_rq->load_stamp = now; 742 cfs_rq->load_stamp = now;
@@ -763,16 +767,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
763 list_del_leaf_cfs_rq(cfs_rq); 767 list_del_leaf_cfs_rq(cfs_rq);
764} 768}
765 769
766static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 770static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
767 long weight_delta)
768{ 771{
769 long load_weight, load, shares; 772 long load_weight, load, shares;
770 773
771 load = cfs_rq->load.weight + weight_delta; 774 load = cfs_rq->load.weight;
772 775
773 load_weight = atomic_read(&tg->load_weight); 776 load_weight = atomic_read(&tg->load_weight);
774 load_weight -= cfs_rq->load_contribution;
775 load_weight += load; 777 load_weight += load;
778 load_weight -= cfs_rq->load_contribution;
776 779
777 shares = (tg->shares * load); 780 shares = (tg->shares * load);
778 if (load_weight) 781 if (load_weight)
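
With the weight_delta argument gone, calc_cfs_shares() above works purely from the cfs_rq's current load. A worked example of the resulting arithmetic, using invented numbers rather than anything measured:

/* Invented numbers; only the formula mirrors calc_cfs_shares() above. */
#include <stdio.h>

int main(void)
{
	long tg_shares      = 1024;	/* tg->shares */
	long tg_load_weight = 3072;	/* atomic_read(&tg->load_weight) */
	long load           = 2048;	/* cfs_rq->load.weight */
	long contribution   = 1024;	/* cfs_rq->load_contribution */

	long load_weight = tg_load_weight + load - contribution;	/* 4096 */
	long shares = tg_shares * load;
	if (load_weight)
		shares /= load_weight;

	printf("shares = %ld\n", shares);	/* 1024 * 2048 / 4096 = 512 */
	return 0;
}

So a group entity backing half of the group-wide weight ends up with roughly half of tg->shares, before any clamping applied elsewhere.
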
@@ -790,7 +793,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
790{ 793{
791 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { 794 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
792 update_cfs_load(cfs_rq, 0); 795 update_cfs_load(cfs_rq, 0);
793 update_cfs_shares(cfs_rq, 0); 796 update_cfs_shares(cfs_rq);
794 } 797 }
795} 798}
796# else /* CONFIG_SMP */ 799# else /* CONFIG_SMP */
@@ -798,8 +801,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
798{ 801{
799} 802}
800 803
801static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 804static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
802 long weight_delta)
803{ 805{
804 return tg->shares; 806 return tg->shares;
805} 807}
@@ -824,7 +826,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
824 account_entity_enqueue(cfs_rq, se); 826 account_entity_enqueue(cfs_rq, se);
825} 827}
826 828
827static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 829static void update_cfs_shares(struct cfs_rq *cfs_rq)
828{ 830{
829 struct task_group *tg; 831 struct task_group *tg;
830 struct sched_entity *se; 832 struct sched_entity *se;
@@ -838,7 +840,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
838 if (likely(se->load.weight == tg->shares)) 840 if (likely(se->load.weight == tg->shares))
839 return; 841 return;
840#endif 842#endif
841 shares = calc_cfs_shares(cfs_rq, tg, weight_delta); 843 shares = calc_cfs_shares(cfs_rq, tg);
842 844
843 reweight_entity(cfs_rq_of(se), se, shares); 845 reweight_entity(cfs_rq_of(se), se, shares);
844} 846}
@@ -847,7 +849,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
847{ 849{
848} 850}
849 851
850static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 852static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
851{ 853{
852} 854}
853 855
@@ -978,8 +980,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
978 */ 980 */
979 update_curr(cfs_rq); 981 update_curr(cfs_rq);
980 update_cfs_load(cfs_rq, 0); 982 update_cfs_load(cfs_rq, 0);
981 update_cfs_shares(cfs_rq, se->load.weight);
982 account_entity_enqueue(cfs_rq, se); 983 account_entity_enqueue(cfs_rq, se);
984 update_cfs_shares(cfs_rq);
983 985
984 if (flags & ENQUEUE_WAKEUP) { 986 if (flags & ENQUEUE_WAKEUP) {
985 place_entity(cfs_rq, se, 0); 987 place_entity(cfs_rq, se, 0);
@@ -996,19 +998,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
996 list_add_leaf_cfs_rq(cfs_rq); 998 list_add_leaf_cfs_rq(cfs_rq);
997} 999}
998 1000
999static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1001static void __clear_buddies_last(struct sched_entity *se)
1000{ 1002{
1001 if (!se || cfs_rq->last == se) 1003 for_each_sched_entity(se) {
1002 cfs_rq->last = NULL; 1004 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1005 if (cfs_rq->last == se)
1006 cfs_rq->last = NULL;
1007 else
1008 break;
1009 }
1010}
1003 1011
1004 if (!se || cfs_rq->next == se) 1012static void __clear_buddies_next(struct sched_entity *se)
1005 cfs_rq->next = NULL; 1013{
1014 for_each_sched_entity(se) {
1015 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1016 if (cfs_rq->next == se)
1017 cfs_rq->next = NULL;
1018 else
1019 break;
1020 }
1021}
1022
1023static void __clear_buddies_skip(struct sched_entity *se)
1024{
1025 for_each_sched_entity(se) {
1026 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1027 if (cfs_rq->skip == se)
1028 cfs_rq->skip = NULL;
1029 else
1030 break;
1031 }
1006} 1032}
1007 1033
1008static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1034static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1009{ 1035{
1010 for_each_sched_entity(se) 1036 if (cfs_rq->last == se)
1011 __clear_buddies(cfs_rq_of(se), se); 1037 __clear_buddies_last(se);
1038
1039 if (cfs_rq->next == se)
1040 __clear_buddies_next(se);
1041
1042 if (cfs_rq->skip == se)
1043 __clear_buddies_skip(se);
1012} 1044}
1013 1045
1014static void 1046static void
@@ -1041,7 +1073,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1041 update_cfs_load(cfs_rq, 0); 1073 update_cfs_load(cfs_rq, 0);
1042 account_entity_dequeue(cfs_rq, se); 1074 account_entity_dequeue(cfs_rq, se);
1043 update_min_vruntime(cfs_rq); 1075 update_min_vruntime(cfs_rq);
1044 update_cfs_shares(cfs_rq, 0); 1076 update_cfs_shares(cfs_rq);
1045 1077
1046 /* 1078 /*
1047 * Normalize the entity after updating the min_vruntime because the 1079 * Normalize the entity after updating the min_vruntime because the
@@ -1084,7 +1116,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1084 return; 1116 return;
1085 1117
1086 if (cfs_rq->nr_running > 1) { 1118 if (cfs_rq->nr_running > 1) {
1087 struct sched_entity *se = __pick_next_entity(cfs_rq); 1119 struct sched_entity *se = __pick_first_entity(cfs_rq);
1088 s64 delta = curr->vruntime - se->vruntime; 1120 s64 delta = curr->vruntime - se->vruntime;
1089 1121
1090 if (delta < 0) 1122 if (delta < 0)
@@ -1128,13 +1160,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1128static int 1160static int
1129wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1161wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1130 1162
1163/*
1164 * Pick the next process, keeping these things in mind, in this order:
1165 * 1) keep things fair between processes/task groups
1166 * 2) pick the "next" process, since someone really wants that to run
1167 * 3) pick the "last" process, for cache locality
1168 * 4) do not run the "skip" process, if something else is available
1169 */
1131static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1170static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1132{ 1171{
1133 struct sched_entity *se = __pick_next_entity(cfs_rq); 1172 struct sched_entity *se = __pick_first_entity(cfs_rq);
1134 struct sched_entity *left = se; 1173 struct sched_entity *left = se;
1135 1174
1136 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1175 /*
1137 se = cfs_rq->next; 1176 * Avoid running the skip buddy, if running something else can
1177 * be done without getting too unfair.
1178 */
1179 if (cfs_rq->skip == se) {
1180 struct sched_entity *second = __pick_next_entity(se);
1181 if (second && wakeup_preempt_entity(second, left) < 1)
1182 se = second;
1183 }
1138 1184
1139 /* 1185 /*
1140 * Prefer last buddy, try to return the CPU to a preempted task. 1186 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1188,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1142 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1188 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1143 se = cfs_rq->last; 1189 se = cfs_rq->last;
1144 1190
1191 /*
1192 * Someone really wants this to run. If it's not unfair, run it.
1193 */
1194 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1195 se = cfs_rq->next;
1196
1145 clear_buddies(cfs_rq, se); 1197 clear_buddies(cfs_rq, se);
1146 1198
1147 return se; 1199 return se;
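
The ordering comment added above leans on the wakeup_preempt_entity(buddy, left) < 1 test, which, on the usual reading of that helper, accepts a buddy only while its vruntime leads the leftmost entity's by at most one wakeup granularity. A standalone illustration with invented numbers; close_enough() is a stand-in, not a kernel function:

/* Invented numbers; close_enough() mimics wakeup_preempt_entity(buddy, left) < 1. */
#include <stdbool.h>
#include <stdio.h>

static bool close_enough(long long buddy_vrt, long long left_vrt, long long gran)
{
	return buddy_vrt - left_vrt <= gran;	/* buddy may lead by at most one granularity */
}

int main(void)
{
	long long left = 10000000LL, gran = 4000000LL;	/* nanosecond-scale values */

	/* a next/last buddy 3ms ahead of the leftmost entity is still picked */
	printf("%d\n", close_enough(left + 3000000LL, left, gran));	/* 1 */
	/* one 9ms ahead would be too unfair, so the leftmost entity runs */
	printf("%d\n", close_enough(left + 9000000LL, left, gran));	/* 0 */
	return 0;
}
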
@@ -1282,7 +1334,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1282 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1334 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1283 1335
1284 update_cfs_load(cfs_rq, 0); 1336 update_cfs_load(cfs_rq, 0);
1285 update_cfs_shares(cfs_rq, 0); 1337 update_cfs_shares(cfs_rq);
1286 } 1338 }
1287 1339
1288 hrtick_update(rq); 1340 hrtick_update(rq);
@@ -1312,58 +1364,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1312 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1364 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1313 1365
1314 update_cfs_load(cfs_rq, 0); 1366 update_cfs_load(cfs_rq, 0);
1315 update_cfs_shares(cfs_rq, 0); 1367 update_cfs_shares(cfs_rq);
1316 } 1368 }
1317 1369
1318 hrtick_update(rq); 1370 hrtick_update(rq);
1319} 1371}
1320 1372
1321/*
1322 * sched_yield() support is very simple - we dequeue and enqueue.
1323 *
1324 * If compat_yield is turned on then we requeue to the end of the tree.
1325 */
1326static void yield_task_fair(struct rq *rq)
1327{
1328 struct task_struct *curr = rq->curr;
1329 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1330 struct sched_entity *rightmost, *se = &curr->se;
1331
1332 /*
1333 * Are we the only task in the tree?
1334 */
1335 if (unlikely(cfs_rq->nr_running == 1))
1336 return;
1337
1338 clear_buddies(cfs_rq, se);
1339
1340 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1341 update_rq_clock(rq);
1342 /*
1343 * Update run-time statistics of the 'current'.
1344 */
1345 update_curr(cfs_rq);
1346
1347 return;
1348 }
1349 /*
1350 * Find the rightmost entry in the rbtree:
1351 */
1352 rightmost = __pick_last_entity(cfs_rq);
1353 /*
1354 * Already in the rightmost position?
1355 */
1356 if (unlikely(!rightmost || entity_before(rightmost, se)))
1357 return;
1358
1359 /*
1360 * Minimally necessary key value to be last in the tree:
1361 * Upon rescheduling, sched_class::put_prev_task() will place
1362 * 'current' within the tree based on its new key value.
1363 */
1364 se->vruntime = rightmost->vruntime + 1;
1365}
1366
1367#ifdef CONFIG_SMP 1373#ifdef CONFIG_SMP
1368 1374
1369static void task_waking_fair(struct rq *rq, struct task_struct *p) 1375static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1834,6 +1840,14 @@ static void set_next_buddy(struct sched_entity *se)
1834 } 1840 }
1835} 1841}
1836 1842
1843static void set_skip_buddy(struct sched_entity *se)
1844{
1845 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1846 for_each_sched_entity(se)
1847 cfs_rq_of(se)->skip = se;
1848 }
1849}
1850
1837/* 1851/*
1838 * Preempt the current task with a newly woken task if needed: 1852 * Preempt the current task with a newly woken task if needed:
1839 */ 1853 */
@@ -1857,16 +1871,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 if (test_tsk_need_resched(curr)) 1871 if (test_tsk_need_resched(curr))
1858 return; 1872 return;
1859 1873
1874 /* Idle tasks are by definition preempted by non-idle tasks. */
1875 if (unlikely(curr->policy == SCHED_IDLE) &&
1876 likely(p->policy != SCHED_IDLE))
1877 goto preempt;
1878
1860 /* 1879 /*
1861 * Batch and idle tasks do not preempt (their preemption is driven by 1880 * Batch and idle tasks do not preempt non-idle tasks (their preemption
1862 * the tick): 1881 * is driven by the tick):
1863 */ 1882 */
1864 if (unlikely(p->policy != SCHED_NORMAL)) 1883 if (unlikely(p->policy != SCHED_NORMAL))
1865 return; 1884 return;
1866 1885
1867 /* Idle tasks are by definition preempted by everybody. */
1868 if (unlikely(curr->policy == SCHED_IDLE))
1869 goto preempt;
1870 1886
1871 if (!sched_feat(WAKEUP_PREEMPT)) 1887 if (!sched_feat(WAKEUP_PREEMPT))
1872 return; 1888 return;
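
After the reordering above, a waking SCHED_NORMAL or SCHED_BATCH task always preempts a SCHED_IDLE current task, while batch and idle wakers themselves still wait for the tick. For reference, the standard user-space way to drop a helper task into SCHED_IDLE (plain glibc API, nothing specific to this patch):

#define _GNU_SOURCE		/* SCHED_IDLE is guarded by __USE_GNU in <sched.h> */
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

/* Make pid a SCHED_IDLE task; it now gets preempted by any non-idle wakeup. */
static int make_sched_idle(pid_t pid)
{
	struct sched_param sp = { .sched_priority = 0 };	/* must be 0 for SCHED_IDLE */

	if (sched_setscheduler(pid, SCHED_IDLE, &sp) == -1) {
		perror("sched_setscheduler");
		return -1;
	}
	return 0;
}

int main(void)
{
	return make_sched_idle(0);	/* 0 == the calling process */
}
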
@@ -1932,6 +1948,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1932 } 1948 }
1933} 1949}
1934 1950
1951/*
1952 * sched_yield() is very simple
1953 *
1954 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1955 */
1956static void yield_task_fair(struct rq *rq)
1957{
1958 struct task_struct *curr = rq->curr;
1959 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1960 struct sched_entity *se = &curr->se;
1961
1962 /*
1963 * Are we the only task in the tree?
1964 */
1965 if (unlikely(rq->nr_running == 1))
1966 return;
1967
1968 clear_buddies(cfs_rq, se);
1969
1970 if (curr->policy != SCHED_BATCH) {
1971 update_rq_clock(rq);
1972 /*
1973 * Update run-time statistics of the 'current'.
1974 */
1975 update_curr(cfs_rq);
1976 }
1977
1978 set_skip_buddy(se);
1979}
1980
1981static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
1982{
1983 struct sched_entity *se = &p->se;
1984
1985 if (!se->on_rq)
1986 return false;
1987
1988 /* Tell the scheduler that we'd really like pse to run next. */
1989 set_next_buddy(se);
1990
1991 yield_task_fair(rq);
1992
1993 return true;
1994}
1995
1935#ifdef CONFIG_SMP 1996#ifdef CONFIG_SMP
1936/************************************************** 1997/**************************************************
1937 * Fair scheduling class load-balancing methods: 1998 * Fair scheduling class load-balancing methods:
@@ -2043,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2043 enum cpu_idle_type idle, int *all_pinned, 2104 enum cpu_idle_type idle, int *all_pinned,
2044 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2105 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
2045{ 2106{
2046 int loops = 0, pulled = 0, pinned = 0; 2107 int loops = 0, pulled = 0;
2047 long rem_load_move = max_load_move; 2108 long rem_load_move = max_load_move;
2048 struct task_struct *p, *n; 2109 struct task_struct *p, *n;
2049 2110
2050 if (max_load_move == 0) 2111 if (max_load_move == 0)
2051 goto out; 2112 goto out;
2052 2113
2053 pinned = 1;
2054
2055 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 2114 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2056 if (loops++ > sysctl_sched_nr_migrate) 2115 if (loops++ > sysctl_sched_nr_migrate)
2057 break; 2116 break;
2058 2117
2059 if ((p->se.load.weight >> 1) > rem_load_move || 2118 if ((p->se.load.weight >> 1) > rem_load_move ||
2060 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) 2119 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2120 all_pinned))
2061 continue; 2121 continue;
2062 2122
2063 pull_task(busiest, p, this_rq, this_cpu); 2123 pull_task(busiest, p, this_rq, this_cpu);
@@ -2092,9 +2152,6 @@ out:
2092 */ 2152 */
2093 schedstat_add(sd, lb_gained[idle], pulled); 2153 schedstat_add(sd, lb_gained[idle], pulled);
2094 2154
2095 if (all_pinned)
2096 *all_pinned = pinned;
2097
2098 return max_load_move - rem_load_move; 2155 return max_load_move - rem_load_move;
2099} 2156}
2100 2157
@@ -2123,7 +2180,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
2123 * We need to update shares after updating tg->load_weight in 2180 * We need to update shares after updating tg->load_weight in
2124 * order to adjust the weight of groups with long running tasks. 2181 * order to adjust the weight of groups with long running tasks.
2125 */ 2182 */
2126 update_cfs_shares(cfs_rq, 0); 2183 update_cfs_shares(cfs_rq);
2127 2184
2128 raw_spin_unlock_irqrestore(&rq->lock, flags); 2185 raw_spin_unlock_irqrestore(&rq->lock, flags);
2129 2186
@@ -2610,7 +2667,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2610 * @this_cpu: Cpu for which load balance is currently performed. 2667 * @this_cpu: Cpu for which load balance is currently performed.
2611 * @idle: Idle status of this_cpu 2668 * @idle: Idle status of this_cpu
2612 * @load_idx: Load index of sched_domain of this_cpu for load calc. 2669 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2613 * @sd_idle: Idle status of the sched_domain containing group.
2614 * @local_group: Does group contain this_cpu. 2670 * @local_group: Does group contain this_cpu.
2615 * @cpus: Set of cpus considered for load balancing. 2671 * @cpus: Set of cpus considered for load balancing.
2616 * @balance: Should we balance. 2672 * @balance: Should we balance.
@@ -2618,7 +2674,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2618 */ 2674 */
2619static inline void update_sg_lb_stats(struct sched_domain *sd, 2675static inline void update_sg_lb_stats(struct sched_domain *sd,
2620 struct sched_group *group, int this_cpu, 2676 struct sched_group *group, int this_cpu,
2621 enum cpu_idle_type idle, int load_idx, int *sd_idle, 2677 enum cpu_idle_type idle, int load_idx,
2622 int local_group, const struct cpumask *cpus, 2678 int local_group, const struct cpumask *cpus,
2623 int *balance, struct sg_lb_stats *sgs) 2679 int *balance, struct sg_lb_stats *sgs)
2624{ 2680{
@@ -2638,9 +2694,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2638 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2694 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2639 struct rq *rq = cpu_rq(i); 2695 struct rq *rq = cpu_rq(i);
2640 2696
2641 if (*sd_idle && rq->nr_running)
2642 *sd_idle = 0;
2643
2644 /* Bias balancing toward cpus of our domain */ 2697 /* Bias balancing toward cpus of our domain */
2645 if (local_group) { 2698 if (local_group) {
2646 if (idle_cpu(i) && !first_idle_cpu) { 2699 if (idle_cpu(i) && !first_idle_cpu) {
@@ -2685,7 +2738,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2685 2738
2686 /* 2739 /*
2687 * Consider the group unbalanced when the imbalance is larger 2740 * Consider the group unbalanced when the imbalance is larger
2688 * than the average weight of two tasks. 2741 * than the average weight of a task.
2689 * 2742 *
2690 * APZ: with cgroup the avg task weight can vary wildly and 2743 * APZ: with cgroup the avg task weight can vary wildly and
2691 * might not be a suitable number - should we keep a 2744 * might not be a suitable number - should we keep a
@@ -2695,7 +2748,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2695 if (sgs->sum_nr_running) 2748 if (sgs->sum_nr_running)
2696 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2749 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2697 2750
2698 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) 2751 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2699 sgs->group_imb = 1; 2752 sgs->group_imb = 1;
2700 2753
2701 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2754 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
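
The hunk above lowers the intra-group imbalance threshold: the spread between a group's most and least loaded CPU now only has to reach one average task weight (it previously had to exceed two) before group_imb is set, provided more than one task is running. A quick numeric check with invented loads:

/* Invented numbers; only the comparison mirrors the hunk above. */
#include <stdio.h>

int main(void)
{
	unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
	unsigned long sum_weighted_load = 4096, sum_nr_running = 4;
	unsigned long avg_load_per_task = sum_weighted_load / sum_nr_running;	/* 1024 */

	/* old test: 2048 > 2 * 1024 is false; new test: 2048 >= 1024 is true */
	int group_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task;

	printf("group_imb = %d\n", group_imb);
	return 0;
}
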
@@ -2755,15 +2808,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2755 * @sd: sched_domain whose statistics are to be updated. 2808 * @sd: sched_domain whose statistics are to be updated.
2756 * @this_cpu: Cpu for which load balance is currently performed. 2809 * @this_cpu: Cpu for which load balance is currently performed.
2757 * @idle: Idle status of this_cpu 2810 * @idle: Idle status of this_cpu
2758 * @sd_idle: Idle status of the sched_domain containing sg.
2759 * @cpus: Set of cpus considered for load balancing. 2811 * @cpus: Set of cpus considered for load balancing.
2760 * @balance: Should we balance. 2812 * @balance: Should we balance.
2761 * @sds: variable to hold the statistics for this sched_domain. 2813 * @sds: variable to hold the statistics for this sched_domain.
2762 */ 2814 */
2763static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 2815static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2764 enum cpu_idle_type idle, int *sd_idle, 2816 enum cpu_idle_type idle, const struct cpumask *cpus,
2765 const struct cpumask *cpus, int *balance, 2817 int *balance, struct sd_lb_stats *sds)
2766 struct sd_lb_stats *sds)
2767{ 2818{
2768 struct sched_domain *child = sd->child; 2819 struct sched_domain *child = sd->child;
2769 struct sched_group *sg = sd->groups; 2820 struct sched_group *sg = sd->groups;
@@ -2781,7 +2832,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2781 2832
2782 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 2833 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2783 memset(&sgs, 0, sizeof(sgs)); 2834 memset(&sgs, 0, sizeof(sgs));
2784 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, 2835 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2785 local_group, cpus, balance, &sgs); 2836 local_group, cpus, balance, &sgs);
2786 2837
2787 if (local_group && !(*balance)) 2838 if (local_group && !(*balance))
@@ -3007,7 +3058,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3007 3058
3008 /* 3059 /*
3009 * if *imbalance is less than the average load per runnable task 3060 * if *imbalance is less than the average load per runnable task
3010 * there is no gaurantee that any tasks will be moved so we'll have 3061 * there is no guarantee that any tasks will be moved so we'll have
3011 * a think about bumping its value to force at least one task to be 3062 * a think about bumping its value to force at least one task to be
3012 * moved 3063 * moved
3013 */ 3064 */
@@ -3033,7 +3084,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3033 * @imbalance: Variable which stores amount of weighted load which should 3084 * @imbalance: Variable which stores amount of weighted load which should
3034 * be moved to restore balance/put a group to idle. 3085 * be moved to restore balance/put a group to idle.
3035 * @idle: The idle status of this_cpu. 3086 * @idle: The idle status of this_cpu.
3036 * @sd_idle: The idleness of sd
3037 * @cpus: The set of CPUs under consideration for load-balancing. 3087 * @cpus: The set of CPUs under consideration for load-balancing.
3038 * @balance: Pointer to a variable indicating if this_cpu 3088 * @balance: Pointer to a variable indicating if this_cpu
3039 * is the appropriate cpu to perform load balancing at this_level. 3089 * is the appropriate cpu to perform load balancing at this_level.
@@ -3046,7 +3096,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3046static struct sched_group * 3096static struct sched_group *
3047find_busiest_group(struct sched_domain *sd, int this_cpu, 3097find_busiest_group(struct sched_domain *sd, int this_cpu,
3048 unsigned long *imbalance, enum cpu_idle_type idle, 3098 unsigned long *imbalance, enum cpu_idle_type idle,
3049 int *sd_idle, const struct cpumask *cpus, int *balance) 3099 const struct cpumask *cpus, int *balance)
3050{ 3100{
3051 struct sd_lb_stats sds; 3101 struct sd_lb_stats sds;
3052 3102
@@ -3056,22 +3106,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 3056 * Compute the various statistics relevant for load balancing at 3106 * Compute the various statistics relevant for load balancing at
3057 * this level. 3107 * this level.
3058 */ 3108 */
3059 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3109 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
3060 balance, &sds); 3110
3061 3111 /*
3062 /* Cases where imbalance does not exist from POV of this_cpu */ 3112 * this_cpu is not the appropriate cpu to perform load balancing at
3063 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3113 * this level.
3064 * at this level.
3065 * 2) There is no busy sibling group to pull from.
3066 * 3) This group is the busiest group.
3067 * 4) This group is more busy than the avg busieness at this
3068 * sched_domain.
3069 * 5) The imbalance is within the specified limit.
3070 *
3071 * Note: when doing newidle balance, if the local group has excess
3072 * capacity (i.e. nr_running < group_capacity) and the busiest group
3073 * does not have any capacity, we force a load balance to pull tasks
3074 * to the local group. In this case, we skip past checks 3, 4 and 5.
3075 */ 3114 */
3076 if (!(*balance)) 3115 if (!(*balance))
3077 goto ret; 3116 goto ret;
@@ -3080,41 +3119,56 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3080 check_asym_packing(sd, &sds, this_cpu, imbalance)) 3119 check_asym_packing(sd, &sds, this_cpu, imbalance))
3081 return sds.busiest; 3120 return sds.busiest;
3082 3121
3122 /* There is no busy sibling group to pull tasks from */
3083 if (!sds.busiest || sds.busiest_nr_running == 0) 3123 if (!sds.busiest || sds.busiest_nr_running == 0)
3084 goto out_balanced; 3124 goto out_balanced;
3085 3125
3086 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 3126 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3127
3128 /*
3129 * If the busiest group is imbalanced the below checks don't
 3130 * work because they assume all things are equal, which typically
3131 * isn't true due to cpus_allowed constraints and the like.
3132 */
3133 if (sds.group_imb)
3134 goto force_balance;
3135
3136 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
3087 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 3137 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3088 !sds.busiest_has_capacity) 3138 !sds.busiest_has_capacity)
3089 goto force_balance; 3139 goto force_balance;
3090 3140
3141 /*
3142 * If the local group is more busy than the selected busiest group
3143 * don't try and pull any tasks.
3144 */
3091 if (sds.this_load >= sds.max_load) 3145 if (sds.this_load >= sds.max_load)
3092 goto out_balanced; 3146 goto out_balanced;
3093 3147
3094 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3148 /*
3095 3149 * Don't pull any tasks if this group is already above the domain
3150 * average load.
3151 */
3096 if (sds.this_load >= sds.avg_load) 3152 if (sds.this_load >= sds.avg_load)
3097 goto out_balanced; 3153 goto out_balanced;
3098 3154
3099 /* 3155 if (idle == CPU_IDLE) {
3100 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
3101 * And to check for busy balance use !idle_cpu instead of
3102 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
3103 * even when they are idle.
3104 */
3105 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3106 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3107 goto out_balanced;
3108 } else {
3109 /* 3156 /*
3110 * This cpu is idle. If the busiest group load doesn't 3157 * This cpu is idle. If the busiest group load doesn't
3111 * have more tasks than the number of available cpu's and 3158 * have more tasks than the number of available cpu's and
3112 * there is no imbalance between this and busiest group 3159 * there is no imbalance between this and busiest group
3113 * wrt to idle cpu's, it is balanced. 3160 * wrt to idle cpu's, it is balanced.
3114 */ 3161 */
3115 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 3162 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3116 sds.busiest_nr_running <= sds.busiest_group_weight) 3163 sds.busiest_nr_running <= sds.busiest_group_weight)
3117 goto out_balanced; 3164 goto out_balanced;
3165 } else {
3166 /*
3167 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
3168 * imbalance_pct to be conservative.
3169 */
3170 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3171 goto out_balanced;
3118 } 3172 }
3119 3173
3120force_balance: 3174force_balance:
@@ -3193,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3193/* Working cpumask for load_balance and load_balance_newidle. */ 3247/* Working cpumask for load_balance and load_balance_newidle. */
3194static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 3248static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3195 3249
3196static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, 3250static int need_active_balance(struct sched_domain *sd, int idle,
3197 int busiest_cpu, int this_cpu) 3251 int busiest_cpu, int this_cpu)
3198{ 3252{
3199 if (idle == CPU_NEWLY_IDLE) { 3253 if (idle == CPU_NEWLY_IDLE) {
@@ -3225,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
3225 * move_tasks() will succeed. ld_moved will be true and this 3279 * move_tasks() will succeed. ld_moved will be true and this
3226 * active balance code will not be triggered. 3280 * active balance code will not be triggered.
3227 */ 3281 */
3228 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3229 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3230 return 0;
3231
3232 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 3282 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3233 return 0; 3283 return 0;
3234 } 3284 }
@@ -3246,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3246 struct sched_domain *sd, enum cpu_idle_type idle, 3296 struct sched_domain *sd, enum cpu_idle_type idle,
3247 int *balance) 3297 int *balance)
3248{ 3298{
3249 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3299 int ld_moved, all_pinned = 0, active_balance = 0;
3250 struct sched_group *group; 3300 struct sched_group *group;
3251 unsigned long imbalance; 3301 unsigned long imbalance;
3252 struct rq *busiest; 3302 struct rq *busiest;
@@ -3255,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3255 3305
3256 cpumask_copy(cpus, cpu_active_mask); 3306 cpumask_copy(cpus, cpu_active_mask);
3257 3307
3258 /*
3259 * When power savings policy is enabled for the parent domain, idle
3260 * sibling can pick up load irrespective of busy siblings. In this case,
3261 * let the state of idle sibling percolate up as CPU_IDLE, instead of
3262 * portraying it as CPU_NOT_IDLE.
3263 */
3264 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3265 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3266 sd_idle = 1;
3267
3268 schedstat_inc(sd, lb_count[idle]); 3308 schedstat_inc(sd, lb_count[idle]);
3269 3309
3270redo: 3310redo:
3271 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3311 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
3272 cpus, balance); 3312 cpus, balance);
3273 3313
3274 if (*balance == 0) 3314 if (*balance == 0)
@@ -3297,6 +3337,7 @@ redo:
3297 * still unbalanced. ld_moved simply stays zero, so it is 3337 * still unbalanced. ld_moved simply stays zero, so it is
3298 * correctly treated as an imbalance. 3338 * correctly treated as an imbalance.
3299 */ 3339 */
3340 all_pinned = 1;
3300 local_irq_save(flags); 3341 local_irq_save(flags);
3301 double_rq_lock(this_rq, busiest); 3342 double_rq_lock(this_rq, busiest);
3302 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3343 ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3330,8 +3371,7 @@ redo:
3330 if (idle != CPU_NEWLY_IDLE) 3371 if (idle != CPU_NEWLY_IDLE)
3331 sd->nr_balance_failed++; 3372 sd->nr_balance_failed++;
3332 3373
3333 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3374 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3334 this_cpu)) {
3335 raw_spin_lock_irqsave(&busiest->lock, flags); 3375 raw_spin_lock_irqsave(&busiest->lock, flags);
3336 3376
3337 /* don't kick the active_load_balance_cpu_stop, 3377 /* don't kick the active_load_balance_cpu_stop,
@@ -3386,10 +3426,6 @@ redo:
3386 sd->balance_interval *= 2; 3426 sd->balance_interval *= 2;
3387 } 3427 }
3388 3428
3389 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3390 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3391 ld_moved = -1;
3392
3393 goto out; 3429 goto out;
3394 3430
3395out_balanced: 3431out_balanced:
@@ -3403,11 +3439,7 @@ out_one_pinned:
3403 (sd->balance_interval < sd->max_interval)) 3439 (sd->balance_interval < sd->max_interval))
3404 sd->balance_interval *= 2; 3440 sd->balance_interval *= 2;
3405 3441
3406 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3442 ld_moved = 0;
3407 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3408 ld_moved = -1;
3409 else
3410 ld_moved = 0;
3411out: 3443out:
3412 return ld_moved; 3444 return ld_moved;
3413} 3445}
@@ -3786,6 +3818,17 @@ void select_nohz_load_balancer(int stop_tick)
3786 3818
3787static DEFINE_SPINLOCK(balancing); 3819static DEFINE_SPINLOCK(balancing);
3788 3820
3821static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3822
3823/*
3824 * Scale the max load_balance interval with the number of CPUs in the system.
3825 * This trades load-balance latency on larger machines for less cross talk.
3826 */
3827static void update_max_interval(void)
3828{
3829 max_load_balance_interval = HZ*num_online_cpus()/10;
3830}
3831
3789/* 3832/*
3790 * It checks each scheduling domain to see if it is due to be balanced, 3833 * It checks each scheduling domain to see if it is due to be balanced,
3791 * and initiates a balancing operation if so. 3834 * and initiates a balancing operation if so.
@@ -3815,10 +3858,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3815 3858
3816 /* scale ms to jiffies */ 3859 /* scale ms to jiffies */
3817 interval = msecs_to_jiffies(interval); 3860 interval = msecs_to_jiffies(interval);
3818 if (unlikely(!interval)) 3861 interval = clamp(interval, 1UL, max_load_balance_interval);
3819 interval = 1;
3820 if (interval > HZ*NR_CPUS/10)
3821 interval = HZ*NR_CPUS/10;
3822 3862
3823 need_serialize = sd->flags & SD_SERIALIZE; 3863 need_serialize = sd->flags & SD_SERIALIZE;
3824 3864
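
update_max_interval() above re-derives the balancing cap from num_online_cpus() (the sched.c hunk earlier wires it into the hotplug notifier), and rebalance_domains() now clamps the per-domain interval against it instead of the old NR_CPUS-based bound. Worked numbers, assuming HZ = 1000 purely for illustration:

/* Assumes HZ = 1000; all other numbers are invented. */
#include <stdio.h>

#define HZ 1000UL

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long online = 8;			/* num_online_cpus() */
	unsigned long max_interval = HZ * online / 10;	/* 800 jiffies */
	unsigned long interval = 2000;			/* msecs_to_jiffies() result */

	/* mirrors: interval = clamp(interval, 1UL, max_load_balance_interval); */
	printf("%lu\n", clamp_ul(interval, 1UL, max_interval));	/* 800 */
	return 0;
}
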
@@ -3831,8 +3871,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3831 if (load_balance(cpu, rq, sd, idle, &balance)) { 3871 if (load_balance(cpu, rq, sd, idle, &balance)) {
3832 /* 3872 /*
3833 * We've pulled tasks over so either we're no 3873 * We've pulled tasks over so either we're no
3834 * longer idle, or one of our SMT siblings is 3874 * longer idle.
3835 * not idle.
3836 */ 3875 */
3837 idle = CPU_NOT_IDLE; 3876 idle = CPU_NOT_IDLE;
3838 } 3877 }
@@ -4079,33 +4118,62 @@ static void task_fork_fair(struct task_struct *p)
4079 * Priority of the task has changed. Check to see if we preempt 4118 * Priority of the task has changed. Check to see if we preempt
4080 * the current task. 4119 * the current task.
4081 */ 4120 */
4082static void prio_changed_fair(struct rq *rq, struct task_struct *p, 4121static void
4083 int oldprio, int running) 4122prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
4084{ 4123{
4124 if (!p->se.on_rq)
4125 return;
4126
4085 /* 4127 /*
4086 * Reschedule if we are currently running on this runqueue and 4128 * Reschedule if we are currently running on this runqueue and
4087 * our priority decreased, or if we are not currently running on 4129 * our priority decreased, or if we are not currently running on
4088 * this runqueue and our priority is higher than the current's 4130 * this runqueue and our priority is higher than the current's
4089 */ 4131 */
4090 if (running) { 4132 if (rq->curr == p) {
4091 if (p->prio > oldprio) 4133 if (p->prio > oldprio)
4092 resched_task(rq->curr); 4134 resched_task(rq->curr);
4093 } else 4135 } else
4094 check_preempt_curr(rq, p, 0); 4136 check_preempt_curr(rq, p, 0);
4095} 4137}
4096 4138
4139static void switched_from_fair(struct rq *rq, struct task_struct *p)
4140{
4141 struct sched_entity *se = &p->se;
4142 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4143
4144 /*
4145 * Ensure the task's vruntime is normalized, so that when its
4146 * switched back to the fair class the enqueue_entity(.flags=0) will
4147 * do the right thing.
4148 *
4149 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4150 * have normalized the vruntime, if it was !on_rq, then only when
4151 * the task is sleeping will it still have non-normalized vruntime.
4152 */
4153 if (!se->on_rq && p->state != TASK_RUNNING) {
4154 /*
4155 * Fix up our vruntime so that the current sleep doesn't
4156 * cause 'unlimited' sleep bonus.
4157 */
4158 place_entity(cfs_rq, se, 0);
4159 se->vruntime -= cfs_rq->min_vruntime;
4160 }
4161}
4162
4097/* 4163/*
4098 * We switched to the sched_fair class. 4164 * We switched to the sched_fair class.
4099 */ 4165 */
4100static void switched_to_fair(struct rq *rq, struct task_struct *p, 4166static void switched_to_fair(struct rq *rq, struct task_struct *p)
4101 int running)
4102{ 4167{
4168 if (!p->se.on_rq)
4169 return;
4170
4103 /* 4171 /*
4104 * We were most likely switched from sched_rt, so 4172 * We were most likely switched from sched_rt, so
4105 * kick off the schedule if running, otherwise just see 4173 * kick off the schedule if running, otherwise just see
4106 * if we can still preempt the current task. 4174 * if we can still preempt the current task.
4107 */ 4175 */
4108 if (running) 4176 if (rq->curr == p)
4109 resched_task(rq->curr); 4177 resched_task(rq->curr);
4110 else 4178 else
4111 check_preempt_curr(rq, p, 0); 4179 check_preempt_curr(rq, p, 0);
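
switched_from_fair() above stores a sleeping task's vruntime relative to the old cfs_rq's min_vruntime, so that a later enqueue can re-base it against whatever the minimum has grown to instead of granting an unbounded sleep bonus. A numeric illustration of that subtract/re-add pattern, with all values invented:

/* Invented values; only the normalization pattern mirrors the hunk above. */
#include <stdio.h>

int main(void)
{
	unsigned long long min_vruntime_then = 5000000;	/* cfs_rq->min_vruntime at switch */
	unsigned long long se_vruntime       = 5200000;	/* task's absolute vruntime */
	unsigned long long min_vruntime_now  = 9000000;	/* min_vruntime when it rejoins */

	/* switched_from_fair(): keep only the offset above the old minimum */
	unsigned long long rel = se_vruntime - min_vruntime_then;	/* 200000 */

	/* the later enqueue re-bases against the new minimum */
	printf("%llu\n", rel + min_vruntime_now);	/* 9200000, not 5200000 */
	return 0;
}
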
@@ -4171,6 +4239,7 @@ static const struct sched_class fair_sched_class = {
4171 .enqueue_task = enqueue_task_fair, 4239 .enqueue_task = enqueue_task_fair,
4172 .dequeue_task = dequeue_task_fair, 4240 .dequeue_task = dequeue_task_fair,
4173 .yield_task = yield_task_fair, 4241 .yield_task = yield_task_fair,
4242 .yield_to_task = yield_to_task_fair,
4174 4243
4175 .check_preempt_curr = check_preempt_wakeup, 4244 .check_preempt_curr = check_preempt_wakeup,
4176 4245
@@ -4191,6 +4260,7 @@ static const struct sched_class fair_sched_class = {
4191 .task_fork = task_fork_fair, 4260 .task_fork = task_fork_fair,
4192 4261
4193 .prio_changed = prio_changed_fair, 4262 .prio_changed = prio_changed_fair,
4263 .switched_from = switched_from_fair,
4194 .switched_to = switched_to_fair, 4264 .switched_to = switched_to_fair,
4195 4265
4196 .get_rr_interval = get_rr_interval_fair, 4266 .get_rr_interval = get_rr_interval_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..a776a6396427 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
52{ 52{
53} 53}
54 54
55static void switched_to_idle(struct rq *rq, struct task_struct *p, 55static void switched_to_idle(struct rq *rq, struct task_struct *p)
56 int running)
57{ 56{
58 /* Can this actually happen?? */ 57 BUG();
59 if (running)
60 resched_task(rq->curr);
61 else
62 check_preempt_curr(rq, p, 0);
63} 58}
64 59
65static void prio_changed_idle(struct rq *rq, struct task_struct *p, 60static void
66 int oldprio, int running) 61prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
67{ 62{
68 /* This can happen for hot plug CPUS */ 63 BUG();
69
70 /*
71 * Reschedule if we are currently running on this runqueue and
72 * our priority decreased, or if we are not currently running on
73 * this runqueue and our priority is higher than the current's
74 */
75 if (running) {
76 if (p->prio > oldprio)
77 resched_task(rq->curr);
78 } else
79 check_preempt_curr(rq, p, 0);
80} 64}
81 65
82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 66static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
@@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = {
110 94
111 .prio_changed = prio_changed_idle, 95 .prio_changed = prio_changed_idle,
112 .switched_to = switched_to_idle, 96 .switched_to = switched_to_idle,
113
114 /* no .task_new for idle tasks */
115}; 97};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ad6267714c84..e7cebdc65f82 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -210,11 +210,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
210 210
211static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 211static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212{ 212{
213 int this_cpu = smp_processor_id();
214 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 213 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
215 struct sched_rt_entity *rt_se; 214 struct sched_rt_entity *rt_se;
216 215
217 rt_se = rt_rq->tg->rt_se[this_cpu]; 216 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
217
218 rt_se = rt_rq->tg->rt_se[cpu];
218 219
219 if (rt_rq->rt_nr_running) { 220 if (rt_rq->rt_nr_running) {
220 if (rt_se && !on_rt_rq(rt_se)) 221 if (rt_se && !on_rt_rq(rt_se))
@@ -226,10 +227,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
226 227
227static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 228static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
228{ 229{
229 int this_cpu = smp_processor_id();
230 struct sched_rt_entity *rt_se; 230 struct sched_rt_entity *rt_se;
231 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
231 232
232 rt_se = rt_rq->tg->rt_se[this_cpu]; 233 rt_se = rt_rq->tg->rt_se[cpu];
233 234
234 if (rt_se && on_rt_rq(rt_se)) 235 if (rt_se && on_rt_rq(rt_se))
235 dequeue_rt_entity(rt_se); 236 dequeue_rt_entity(rt_se);
@@ -565,8 +566,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
565 if (rt_rq->rt_time || rt_rq->rt_nr_running) 566 if (rt_rq->rt_time || rt_rq->rt_nr_running)
566 idle = 0; 567 idle = 0;
567 raw_spin_unlock(&rt_rq->rt_runtime_lock); 568 raw_spin_unlock(&rt_rq->rt_runtime_lock);
568 } else if (rt_rq->rt_nr_running) 569 } else if (rt_rq->rt_nr_running) {
569 idle = 0; 570 idle = 0;
571 if (!rt_rq_throttled(rt_rq))
572 enqueue = 1;
573 }
570 574
571 if (enqueue) 575 if (enqueue)
572 sched_rt_rq_enqueue(rt_rq); 576 sched_rt_rq_enqueue(rt_rq);
@@ -1374,7 +1378,7 @@ retry:
1374 task = pick_next_pushable_task(rq); 1378 task = pick_next_pushable_task(rq);
1375 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1379 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1376 /* 1380 /*
1377 * If we get here, the task hasnt moved at all, but 1381 * If we get here, the task hasn't moved at all, but
1378 * it has failed to push. We will not try again, 1382 * it has failed to push. We will not try again,
1379 * since the other cpus will pull from us when they 1383 * since the other cpus will pull from us when they
1380 * are ready. 1384 * are ready.
@@ -1484,7 +1488,7 @@ static int pull_rt_task(struct rq *this_rq)
1484 /* 1488 /*
1485 * We continue with the search, just in 1489 * We continue with the search, just in
1486 * case there's an even higher prio task 1490 * case there's an even higher prio task
1487 * in another runqueue. (low likelyhood 1491 * in another runqueue. (low likelihood
1488 * but possible) 1492 * but possible)
1489 */ 1493 */
1490 } 1494 }
@@ -1595,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq)
1595 * When switch from the rt queue, we bring ourselves to a position 1599 * When switch from the rt queue, we bring ourselves to a position
1596 * that we might want to pull RT tasks from other runqueues. 1600 * that we might want to pull RT tasks from other runqueues.
1597 */ 1601 */
1598static void switched_from_rt(struct rq *rq, struct task_struct *p, 1602static void switched_from_rt(struct rq *rq, struct task_struct *p)
1599 int running)
1600{ 1603{
1601 /* 1604 /*
1602 * If there are other RT tasks then we will reschedule 1605 * If there are other RT tasks then we will reschedule
@@ -1605,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1605 * we may need to handle the pulling of RT tasks 1608 * we may need to handle the pulling of RT tasks
1606 * now. 1609 * now.
1607 */ 1610 */
1608 if (!rq->rt.rt_nr_running) 1611 if (p->se.on_rq && !rq->rt.rt_nr_running)
1609 pull_rt_task(rq); 1612 pull_rt_task(rq);
1610} 1613}
1611 1614
@@ -1624,8 +1627,7 @@ static inline void init_sched_rt_class(void)
1624 * with RT tasks. In this case we try to push them off to 1627 * with RT tasks. In this case we try to push them off to
1625 * other runqueues. 1628 * other runqueues.
1626 */ 1629 */
1627static void switched_to_rt(struct rq *rq, struct task_struct *p, 1630static void switched_to_rt(struct rq *rq, struct task_struct *p)
1628 int running)
1629{ 1631{
1630 int check_resched = 1; 1632 int check_resched = 1;
1631 1633
@@ -1636,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1636 * If that current running task is also an RT task 1638 * If that current running task is also an RT task
1637 * then see if we can move to another run queue. 1639 * then see if we can move to another run queue.
1638 */ 1640 */
1639 if (!running) { 1641 if (p->se.on_rq && rq->curr != p) {
1640#ifdef CONFIG_SMP 1642#ifdef CONFIG_SMP
1641 if (rq->rt.overloaded && push_rt_task(rq) && 1643 if (rq->rt.overloaded && push_rt_task(rq) &&
1642 /* Don't resched if we changed runqueues */ 1644 /* Don't resched if we changed runqueues */
@@ -1652,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1652 * Priority of the task has changed. This may cause 1654 * Priority of the task has changed. This may cause
1653 * us to initiate a push or pull. 1655 * us to initiate a push or pull.
1654 */ 1656 */
1655static void prio_changed_rt(struct rq *rq, struct task_struct *p, 1657static void
1656 int oldprio, int running) 1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1657{ 1659{
1658 if (running) { 1660 if (!p->se.on_rq)
1661 return;
1662
1663 if (rq->curr == p) {
1659#ifdef CONFIG_SMP 1664#ifdef CONFIG_SMP
1660 /* 1665 /*
1661 * If our priority decreases while running, we 1666 * If our priority decreases while running, we
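The sched_rt.c hunks above and the sched_stoptask.c hunks below are part of a tree-wide change to the sched_class callbacks: switched_from/switched_to/prio_changed lose their "running" argument, and each class now derives the same facts from p->se.on_rq (is the task queued?) and rq->curr == p (is it the task currently running?). A minimal sketch of the new callback shape, assuming the private struct rq/struct task_struct layout used by kernel/sched.c; it is illustrative only, not the scheduler's actual code:

/* Illustrative only: how a class callback recovers the old "running" flag. */
static void prio_changed_example(struct rq *rq, struct task_struct *p,
				 int oldprio)
{
	if (!p->se.on_rq)	/* not queued: nothing to adjust */
		return;

	if (rq->curr == p) {
		/* this is what the removed "running" parameter used to say:
		 * e.g. resched if our priority dropped below a queued task */
	} else {
		/* queued but not running: e.g. preempt curr if we now beat it */
	}
}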
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c1..1ba2bd40fdac 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
59{ 59{
60} 60}
61 61
62static void switched_to_stop(struct rq *rq, struct task_struct *p, 62static void switched_to_stop(struct rq *rq, struct task_struct *p)
63 int running)
64{ 63{
65 BUG(); /* its impossible to change to this class */ 64 BUG(); /* its impossible to change to this class */
66} 65}
67 66
68static void prio_changed_stop(struct rq *rq, struct task_struct *p, 67static void
69 int oldprio, int running) 68prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
70{ 69{
71 BUG(); /* how!?, what priority? */ 70 BUG(); /* how!?, what priority? */
72} 71}
@@ -103,6 +102,4 @@ static const struct sched_class stop_sched_class = {
103 102
104 .prio_changed = prio_changed_stop, 103 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop, 104 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108}; 105};
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdce..7165af5f1b11 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -226,7 +226,7 @@ static inline void print_dropped_signal(int sig)
226/* 226/*
227 * allocate a new signal queue record 227 * allocate a new signal queue record
228 * - this may be called without locks if and only if t == current, otherwise an 228 * - this may be called without locks if and only if t == current, otherwise an
229 * appopriate lock must be held to stop the target task from exiting 229 * appropriate lock must be held to stop the target task from exiting
230 */ 230 */
231static struct sigqueue * 231static struct sigqueue *
232__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) 232__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
@@ -375,15 +375,15 @@ int unhandled_signal(struct task_struct *tsk, int sig)
375 return !tracehook_consider_fatal_signal(tsk, sig); 375 return !tracehook_consider_fatal_signal(tsk, sig);
376} 376}
377 377
378 378/*
379/* Notify the system that a driver wants to block all signals for this 379 * Notify the system that a driver wants to block all signals for this
380 * process, and wants to be notified if any signals at all were to be 380 * process, and wants to be notified if any signals at all were to be
381 * sent/acted upon. If the notifier routine returns non-zero, then the 381 * sent/acted upon. If the notifier routine returns non-zero, then the
382 * signal will be acted upon after all. If the notifier routine returns 0, 382 * signal will be acted upon after all. If the notifier routine returns 0,
383 * then then signal will be blocked. Only one block per process is 383 * then then signal will be blocked. Only one block per process is
384 * allowed. priv is a pointer to private data that the notifier routine 384 * allowed. priv is a pointer to private data that the notifier routine
385 * can use to determine if the signal should be blocked or not. */ 385 * can use to determine if the signal should be blocked or not.
386 386 */
387void 387void
388block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) 388block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
389{ 389{
@@ -434,9 +434,10 @@ still_pending:
434 copy_siginfo(info, &first->info); 434 copy_siginfo(info, &first->info);
435 __sigqueue_free(first); 435 __sigqueue_free(first);
436 } else { 436 } else {
437 /* Ok, it wasn't in the queue. This must be 437 /*
438 a fast-pathed signal or we must have been 438 * Ok, it wasn't in the queue. This must be
439 out of queue space. So zero out the info. 439 * a fast-pathed signal or we must have been
440 * out of queue space. So zero out the info.
440 */ 441 */
441 info->si_signo = sig; 442 info->si_signo = sig;
442 info->si_errno = 0; 443 info->si_errno = 0;
@@ -468,7 +469,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
468} 469}
469 470
470/* 471/*
471 * Dequeue a signal and return the element to the caller, which is 472 * Dequeue a signal and return the element to the caller, which is
472 * expected to free it. 473 * expected to free it.
473 * 474 *
474 * All callers have to hold the siglock. 475 * All callers have to hold the siglock.
@@ -490,7 +491,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
490 * itimers are process shared and we restart periodic 491 * itimers are process shared and we restart periodic
491 * itimers in the signal delivery path to prevent DoS 492 * itimers in the signal delivery path to prevent DoS
492 * attacks in the high resolution timer case. This is 493 * attacks in the high resolution timer case. This is
493 * compliant with the old way of self restarting 494 * compliant with the old way of self-restarting
494 * itimers, as the SIGALRM is a legacy signal and only 495 * itimers, as the SIGALRM is a legacy signal and only
495 * queued once. Changing the restart behaviour to 496 * queued once. Changing the restart behaviour to
496 * restart the timer in the signal dequeue path is 497 * restart the timer in the signal dequeue path is
@@ -636,13 +637,33 @@ static inline bool si_fromuser(const struct siginfo *info)
636} 637}
637 638
638/* 639/*
640 * called with RCU read lock from check_kill_permission()
641 */
642static int kill_ok_by_cred(struct task_struct *t)
643{
644 const struct cred *cred = current_cred();
645 const struct cred *tcred = __task_cred(t);
646
647 if (cred->user->user_ns == tcred->user->user_ns &&
648 (cred->euid == tcred->suid ||
649 cred->euid == tcred->uid ||
650 cred->uid == tcred->suid ||
651 cred->uid == tcred->uid))
652 return 1;
653
654 if (ns_capable(tcred->user->user_ns, CAP_KILL))
655 return 1;
656
657 return 0;
658}
659
660/*
639 * Bad permissions for sending the signal 661 * Bad permissions for sending the signal
640 * - the caller must hold the RCU read lock 662 * - the caller must hold the RCU read lock
641 */ 663 */
642static int check_kill_permission(int sig, struct siginfo *info, 664static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 665 struct task_struct *t)
644{ 666{
645 const struct cred *cred, *tcred;
646 struct pid *sid; 667 struct pid *sid;
647 int error; 668 int error;
648 669
@@ -656,14 +677,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 677 if (error)
657 return error; 678 return error;
658 679
659 cred = current_cred();
660 tcred = __task_cred(t);
661 if (!same_thread_group(current, t) && 680 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) && 681 !kill_ok_by_cred(t)) {
663 (cred->euid ^ tcred->uid) &&
664 (cred->uid ^ tcred->suid) &&
665 (cred->uid ^ tcred->uid) &&
666 !capable(CAP_KILL)) {
667 switch (sig) { 682 switch (sig) {
668 case SIGCONT: 683 case SIGCONT:
669 sid = task_session(t); 684 sid = task_session(t);
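The permission hunk above folds the open-coded XOR chain into kill_ok_by_cred(), adds the requirement that sender and target share a user namespace, and switches the capability fallback from capable(CAP_KILL) to ns_capable() against the target's namespace. The XOR form denied when every id pair differed; the new form allows when any pair matches, which is the same predicate written positively. A small standalone check of that equivalence (plain userspace C, names invented for the test):

#include <assert.h>
#include <stdio.h>

/* Old kernel test: deny when *all* XORs are non-zero (no id pair matches). */
static int old_deny(unsigned euid, unsigned uid, unsigned tsuid, unsigned tuid)
{
	return (euid ^ tsuid) && (euid ^ tuid) &&
	       (uid  ^ tsuid) && (uid  ^ tuid);
}

/* New kernel test: allow when *any* id pair matches. */
static int new_allow(unsigned euid, unsigned uid, unsigned tsuid, unsigned tuid)
{
	return euid == tsuid || euid == tuid ||
	       uid  == tsuid || uid  == tuid;
}

int main(void)
{
	unsigned a, b, c, d;

	for (a = 0; a < 4; a++)
		for (b = 0; b < 4; b++)
			for (c = 0; c < 4; c++)
				for (d = 0; d < 4; d++)
					assert(old_deny(a, b, c, d) ==
					       !new_allow(a, b, c, d));
	printf("XOR-chain denial == negation of equality-chain allow\n");
	return 0;
}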
@@ -909,14 +924,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
909 if (info == SEND_SIG_FORCED) 924 if (info == SEND_SIG_FORCED)
910 goto out_set; 925 goto out_set;
911 926
912 /* Real-time signals must be queued if sent by sigqueue, or 927 /*
913 some other real-time mechanism. It is implementation 928 * Real-time signals must be queued if sent by sigqueue, or
914 defined whether kill() does so. We attempt to do so, on 929 * some other real-time mechanism. It is implementation
915 the principle of least surprise, but since kill is not 930 * defined whether kill() does so. We attempt to do so, on
916 allowed to fail with EAGAIN when low on memory we just 931 * the principle of least surprise, but since kill is not
917 make sure at least one signal gets delivered and don't 932 * allowed to fail with EAGAIN when low on memory we just
918 pass on the info struct. */ 933 * make sure at least one signal gets delivered and don't
919 934 * pass on the info struct.
935 */
920 if (sig < SIGRTMIN) 936 if (sig < SIGRTMIN)
921 override_rlimit = (is_si_special(info) || info->si_code >= 0); 937 override_rlimit = (is_si_special(info) || info->si_code >= 0);
922 else 938 else
@@ -1187,8 +1203,7 @@ retry:
1187 return error; 1203 return error;
1188} 1204}
1189 1205
1190int 1206int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1191kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1192{ 1207{
1193 int error; 1208 int error;
1194 rcu_read_lock(); 1209 rcu_read_lock();
@@ -1285,8 +1300,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1285 * These are for backward compatibility with the rest of the kernel source. 1300 * These are for backward compatibility with the rest of the kernel source.
1286 */ 1301 */
1287 1302
1288int 1303int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1289send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1290{ 1304{
1291 /* 1305 /*
1292 * Make sure legacy kernel users don't send in bad values 1306 * Make sure legacy kernel users don't send in bad values
@@ -1354,7 +1368,7 @@ EXPORT_SYMBOL(kill_pid);
1354 * These functions support sending signals using preallocated sigqueue 1368 * These functions support sending signals using preallocated sigqueue
1355 * structures. This is needed "because realtime applications cannot 1369 * structures. This is needed "because realtime applications cannot
1356 * afford to lose notifications of asynchronous events, like timer 1370 * afford to lose notifications of asynchronous events, like timer
1357 * expirations or I/O completions". In the case of Posix Timers 1371 * expirations or I/O completions". In the case of POSIX Timers
1358 * we allocate the sigqueue structure from the timer_create. If this 1372 * we allocate the sigqueue structure from the timer_create. If this
1359 * allocation fails we are able to report the failure to the application 1373 * allocation fails we are able to report the failure to the application
1360 * with an EAGAIN error. 1374 * with an EAGAIN error.
@@ -1539,7 +1553,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1539 info.si_signo = SIGCHLD; 1553 info.si_signo = SIGCHLD;
1540 info.si_errno = 0; 1554 info.si_errno = 0;
1541 /* 1555 /*
1542 * see comment in do_notify_parent() abot the following 3 lines 1556 * see comment in do_notify_parent() about the following 4 lines
1543 */ 1557 */
1544 rcu_read_lock(); 1558 rcu_read_lock();
1545 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1559 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
@@ -1597,7 +1611,7 @@ static inline int may_ptrace_stop(void)
1597} 1611}
1598 1612
1599/* 1613/*
1600 * Return nonzero if there is a SIGKILL that should be waking us up. 1614 * Return non-zero if there is a SIGKILL that should be waking us up.
1601 * Called with the siglock held. 1615 * Called with the siglock held.
1602 */ 1616 */
1603static int sigkill_pending(struct task_struct *tsk) 1617static int sigkill_pending(struct task_struct *tsk)
@@ -1721,7 +1735,7 @@ void ptrace_notify(int exit_code)
1721/* 1735/*
1722 * This performs the stopping for SIGSTOP and other stop signals. 1736 * This performs the stopping for SIGSTOP and other stop signals.
1723 * We have to stop all threads in the thread group. 1737 * We have to stop all threads in the thread group.
1724 * Returns nonzero if we've actually stopped and released the siglock. 1738 * Returns non-zero if we've actually stopped and released the siglock.
1725 * Returns zero if we didn't stop and still hold the siglock. 1739 * Returns zero if we didn't stop and still hold the siglock.
1726 */ 1740 */
1727static int do_signal_stop(int signr) 1741static int do_signal_stop(int signr)
@@ -1809,10 +1823,12 @@ static int ptrace_signal(int signr, siginfo_t *info,
1809 1823
1810 current->exit_code = 0; 1824 current->exit_code = 0;
1811 1825
1812 /* Update the siginfo structure if the signal has 1826 /*
1813 changed. If the debugger wanted something 1827 * Update the siginfo structure if the signal has
1814 specific in the siginfo structure then it should 1828 * changed. If the debugger wanted something
1815 have updated *info via PTRACE_SETSIGINFO. */ 1829 * specific in the siginfo structure then it should
1830 * have updated *info via PTRACE_SETSIGINFO.
1831 */
1816 if (signr != info->si_signo) { 1832 if (signr != info->si_signo) {
1817 info->si_signo = signr; 1833 info->si_signo = signr;
1818 info->si_errno = 0; 1834 info->si_errno = 0;
@@ -1871,7 +1887,7 @@ relock:
1871 for (;;) { 1887 for (;;) {
1872 struct k_sigaction *ka; 1888 struct k_sigaction *ka;
1873 /* 1889 /*
1874 * Tracing can induce an artifical signal and choose sigaction. 1890 * Tracing can induce an artificial signal and choose sigaction.
1875 * The return value in @signr determines the default action, 1891 * The return value in @signr determines the default action,
1876 * but @info->si_signo is the signal number we will report. 1892 * but @info->si_signo is the signal number we will report.
1877 */ 1893 */
@@ -2020,7 +2036,8 @@ void exit_signals(struct task_struct *tsk)
2020 if (!signal_pending(tsk)) 2036 if (!signal_pending(tsk))
2021 goto out; 2037 goto out;
2022 2038
2023 /* It could be that __group_complete_signal() choose us to 2039 /*
2040 * It could be that __group_complete_signal() choose us to
2024 * notify about group-wide signal. Another thread should be 2041 * notify about group-wide signal. Another thread should be
2025 * woken now to take the signal since we will not. 2042 * woken now to take the signal since we will not.
2026 */ 2043 */
@@ -2058,6 +2075,9 @@ EXPORT_SYMBOL(unblock_all_signals);
2058 * System call entry points. 2075 * System call entry points.
2059 */ 2076 */
2060 2077
2078/**
2079 * sys_restart_syscall - restart a system call
2080 */
2061SYSCALL_DEFINE0(restart_syscall) 2081SYSCALL_DEFINE0(restart_syscall)
2062{ 2082{
2063 struct restart_block *restart = &current_thread_info()->restart_block; 2083 struct restart_block *restart = &current_thread_info()->restart_block;
@@ -2111,6 +2131,13 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2111 return error; 2131 return error;
2112} 2132}
2113 2133
2134/**
2135 * sys_rt_sigprocmask - change the list of currently blocked signals
2136 * @how: whether to add, remove, or set signals
2137 * @set: stores pending signals
2138 * @oset: previous value of signal mask if non-null
2139 * @sigsetsize: size of sigset_t type
2140 */
2114SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, 2141SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
2115 sigset_t __user *, oset, size_t, sigsetsize) 2142 sigset_t __user *, oset, size_t, sigsetsize)
2116{ 2143{
@@ -2169,8 +2196,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize)
2169 2196
2170out: 2197out:
2171 return error; 2198 return error;
2172} 2199}
2173 2200
2201/**
2202 * sys_rt_sigpending - examine a pending signal that has been raised
2203 * while blocked
2204 * @set: stores pending signals
2205 * @sigsetsize: size of sigset_t type or larger
2206 */
2174SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2207SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
2175{ 2208{
2176 return do_sigpending(set, sigsetsize); 2209 return do_sigpending(set, sigsetsize);
@@ -2219,9 +2252,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2219 err |= __put_user(from->si_trapno, &to->si_trapno); 2252 err |= __put_user(from->si_trapno, &to->si_trapno);
2220#endif 2253#endif
2221#ifdef BUS_MCEERR_AO 2254#ifdef BUS_MCEERR_AO
2222 /* 2255 /*
2223 * Other callers might not initialize the si_lsb field, 2256 * Other callers might not initialize the si_lsb field,
2224 * so check explicitely for the right codes here. 2257 * so check explicitly for the right codes here.
2225 */ 2258 */
2226 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2259 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2227 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2260 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
@@ -2250,6 +2283,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2250 2283
2251#endif 2284#endif
2252 2285
2286/**
2287 * sys_rt_sigtimedwait - synchronously wait for queued signals specified
2288 * in @uthese
2289 * @uthese: queued signals to wait for
2290 * @uinfo: if non-null, the signal's siginfo is returned here
2291 * @uts: upper bound on process time suspension
2292 * @sigsetsize: size of sigset_t type
2293 */
2253SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, 2294SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2254 siginfo_t __user *, uinfo, const struct timespec __user *, uts, 2295 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2255 size_t, sigsetsize) 2296 size_t, sigsetsize)
@@ -2266,7 +2307,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2266 2307
2267 if (copy_from_user(&these, uthese, sizeof(these))) 2308 if (copy_from_user(&these, uthese, sizeof(these)))
2268 return -EFAULT; 2309 return -EFAULT;
2269 2310
2270 /* 2311 /*
2271 * Invert the set of allowed signals to get those we 2312 * Invert the set of allowed signals to get those we
2272 * want to block. 2313 * want to block.
@@ -2291,9 +2332,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2291 + (ts.tv_sec || ts.tv_nsec)); 2332 + (ts.tv_sec || ts.tv_nsec));
2292 2333
2293 if (timeout) { 2334 if (timeout) {
2294 /* None ready -- temporarily unblock those we're 2335 /*
2336 * None ready -- temporarily unblock those we're
2295 * interested while we are sleeping in so that we'll 2337 * interested while we are sleeping in so that we'll
2296 * be awakened when they arrive. */ 2338 * be awakened when they arrive.
2339 */
2297 current->real_blocked = current->blocked; 2340 current->real_blocked = current->blocked;
2298 sigandsets(&current->blocked, &current->blocked, &these); 2341 sigandsets(&current->blocked, &current->blocked, &these);
2299 recalc_sigpending(); 2342 recalc_sigpending();
@@ -2325,6 +2368,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2325 return ret; 2368 return ret;
2326} 2369}
2327 2370
2371/**
2372 * sys_kill - send a signal to a process
2373 * @pid: the PID of the process
2374 * @sig: signal to be sent
2375 */
2328SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) 2376SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2329{ 2377{
2330 struct siginfo info; 2378 struct siginfo info;
@@ -2400,7 +2448,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
2400 return do_tkill(tgid, pid, sig); 2448 return do_tkill(tgid, pid, sig);
2401} 2449}
2402 2450
2403/* 2451/**
2452 * sys_tkill - send signal to one specific task
2453 * @pid: the PID of the task
2454 * @sig: signal to be sent
2455 *
2404 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2456 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2405 */ 2457 */
2406SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) 2458SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
@@ -2412,6 +2464,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2412 return do_tkill(0, pid, sig); 2464 return do_tkill(0, pid, sig);
2413} 2465}
2414 2466
2467/**
2468 * sys_rt_sigqueueinfo - send signal information to a signal
2469 * @pid: the PID of the thread
2470 * @sig: signal to be sent
2471 * @uinfo: signal info to be sent
2472 */
2415SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, 2473SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2416 siginfo_t __user *, uinfo) 2474 siginfo_t __user *, uinfo)
2417{ 2475{
@@ -2421,9 +2479,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2421 return -EFAULT; 2479 return -EFAULT;
2422 2480
2423 /* Not even root can pretend to send signals from the kernel. 2481 /* Not even root can pretend to send signals from the kernel.
2424 Nor can they impersonate a kill(), which adds source info. */ 2482 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2425 if (info.si_code >= 0) 2483 */
2484 if (info.si_code >= 0 || info.si_code == SI_TKILL) {
2485 /* We used to allow any < 0 si_code */
2486 WARN_ON_ONCE(info.si_code < 0);
2426 return -EPERM; 2487 return -EPERM;
2488 }
2427 info.si_signo = sig; 2489 info.si_signo = sig;
2428 2490
2429 /* POSIX.1b doesn't mention process groups. */ 2491 /* POSIX.1b doesn't mention process groups. */
@@ -2437,9 +2499,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2437 return -EINVAL; 2499 return -EINVAL;
2438 2500
2439 /* Not even root can pretend to send signals from the kernel. 2501 /* Not even root can pretend to send signals from the kernel.
2440 Nor can they impersonate a kill(), which adds source info. */ 2502 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2441 if (info->si_code >= 0) 2503 */
2504 if (info->si_code >= 0 || info->si_code == SI_TKILL) {
2505 /* We used to allow any < 0 si_code */
2506 WARN_ON_ONCE(info->si_code < 0);
2442 return -EPERM; 2507 return -EPERM;
2508 }
2443 info->si_signo = sig; 2509 info->si_signo = sig;
2444 2510
2445 return do_send_specific(tgid, pid, sig, info); 2511 return do_send_specific(tgid, pid, sig, info);
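Both rt_sigqueueinfo paths above now treat SI_TKILL like the non-negative si_code values: a non-negative code means "generated by the kernel" and SI_TKILL is reserved for tkill()/tgkill(), so userspace can forge neither, and the WARN_ON_ONCE flags any caller that still relied on the old "any negative si_code" rule. A hedged userspace illustration of the visible effect (the syscall is made directly because glibc's sigqueue() always uses SI_QUEUE):

#define _GNU_SOURCE
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	siginfo_t info;

	signal(SIGUSR1, SIG_IGN);
	memset(&info, 0, sizeof(info));
	info.si_signo = SIGUSR1;
	info.si_pid   = getpid();
	info.si_uid   = getuid();

	info.si_code = SI_TKILL;	/* pretend to be tkill(): now rejected */
	if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info) < 0)
		printf("SI_TKILL: %s (expected EPERM)\n", strerror(errno));

	info.si_code = SI_QUEUE;	/* ordinary sigqueue()-style code: ok */
	if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info) == 0)
		printf("SI_QUEUE: accepted\n");
	return 0;
}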
@@ -2531,12 +2597,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2531 2597
2532 error = -EINVAL; 2598 error = -EINVAL;
2533 /* 2599 /*
2534 * 2600 * Note - this code used to test ss_flags incorrectly:
2535 * Note - this code used to test ss_flags incorrectly
2536 * old code may have been written using ss_flags==0 2601 * old code may have been written using ss_flags==0
2537 * to mean ss_flags==SS_ONSTACK (as this was the only 2602 * to mean ss_flags==SS_ONSTACK (as this was the only
2538 * way that worked) - this fix preserves that older 2603 * way that worked) - this fix preserves that older
2539 * mechanism 2604 * mechanism.
2540 */ 2605 */
2541 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) 2606 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
2542 goto out; 2607 goto out;
@@ -2570,6 +2635,10 @@ out:
2570 2635
2571#ifdef __ARCH_WANT_SYS_SIGPENDING 2636#ifdef __ARCH_WANT_SYS_SIGPENDING
2572 2637
2638/**
2639 * sys_sigpending - examine pending signals
2640 * @set: where mask of pending signal is returned
2641 */
2573SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 2642SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2574{ 2643{
2575 return do_sigpending(set, sizeof(*set)); 2644 return do_sigpending(set, sizeof(*set));
@@ -2578,8 +2647,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2578#endif 2647#endif
2579 2648
2580#ifdef __ARCH_WANT_SYS_SIGPROCMASK 2649#ifdef __ARCH_WANT_SYS_SIGPROCMASK
2581/* Some platforms have their own version with special arguments others 2650/**
2582 support only sys_rt_sigprocmask. */ 2651 * sys_sigprocmask - examine and change blocked signals
2652 * @how: whether to add, remove, or set signals
2653 * @set: signals to add or remove (if non-null)
2654 * @oset: previous value of signal mask if non-null
2655 *
2656 * Some platforms have their own version with special arguments;
2657 * others support only sys_rt_sigprocmask.
2658 */
2583 2659
2584SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, 2660SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
2585 old_sigset_t __user *, oset) 2661 old_sigset_t __user *, oset)
@@ -2632,6 +2708,13 @@ out:
2632#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 2708#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2633 2709
2634#ifdef __ARCH_WANT_SYS_RT_SIGACTION 2710#ifdef __ARCH_WANT_SYS_RT_SIGACTION
2711/**
2712 * sys_rt_sigaction - alter an action taken by a process
2713 * @sig: signal to be sent
2714 * @act: new sigaction
2715 * @oact: used to save the previous sigaction
2716 * @sigsetsize: size of sigset_t type
2717 */
2635SYSCALL_DEFINE4(rt_sigaction, int, sig, 2718SYSCALL_DEFINE4(rt_sigaction, int, sig,
2636 const struct sigaction __user *, act, 2719 const struct sigaction __user *, act,
2637 struct sigaction __user *, oact, 2720 struct sigaction __user *, oact,
@@ -2718,6 +2801,12 @@ SYSCALL_DEFINE0(pause)
2718#endif 2801#endif
2719 2802
2720#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 2803#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
2804/**
2805 * sys_rt_sigsuspend - replace the signal mask for a value with the
2806 * @unewset value until a signal is received
2807 * @unewset: new signal mask value
2808 * @sigsetsize: size of sigset_t type
2809 */
2721SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) 2810SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
2722{ 2811{
2723 sigset_t newset; 2812 sigset_t newset;
diff --git a/kernel/smp.c b/kernel/smp.c
index 9910744f0856..73a195193558 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void)
194 */ 194 */
195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
196 int refs; 196 int refs;
197 void (*func) (void *info); 197 smp_call_func_t func;
198 198
199 /* 199 /*
200 * Since we walk the list without any locks, we might 200 * Since we walk the list without any locks, we might
@@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void)
214 if (atomic_read(&data->refs) == 0) 214 if (atomic_read(&data->refs) == 0)
215 continue; 215 continue;
216 216
217 func = data->csd.func; /* for later warn */ 217 func = data->csd.func; /* save for later warn */
218 data->csd.func(data->csd.info); 218 func(data->csd.info);
219 219
220 /* 220 /*
221 * If the cpu mask is not still set then it enabled interrupts, 221 * If the cpu mask is not still set then func enabled
222 * we took another smp interrupt, and executed the function 222 * interrupts (BUG), and this cpu took another smp call
223 * twice on this cpu. In theory that copy decremented refs. 223 * function interrupt and executed func(info) twice
224 * on this cpu. That nested execution decremented refs.
224 */ 225 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { 226 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n", 227 WARN(1, "%pf enabled interrupts and double executed\n", func);
227 func);
228 continue; 228 continue;
229 } 229 }
230 230
@@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask,
450{ 450{
451 struct call_function_data *data; 451 struct call_function_data *data;
452 unsigned long flags; 452 unsigned long flags;
453 int cpu, next_cpu, this_cpu = smp_processor_id(); 453 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
454 454
455 /* 455 /*
456 * Can deadlock when called with interrupts disabled. 456 * Can deadlock when called with interrupts disabled.
@@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask,
461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
462 && !oops_in_progress && !early_boot_irqs_disabled); 462 && !oops_in_progress && !early_boot_irqs_disabled);
463 463
464 /* So, what's a CPU they want? Ignoring this one. */ 464 /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
465 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
466 if (cpu == this_cpu) 466 if (cpu == this_cpu)
467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask,
483 483
484 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
485 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486
487 /* This BUG_ON verifies our reuse assertions and can be removed */
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); 488 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
487 489
490 /*
491 * The global call function queue list add and delete are protected
492 * by a lock, but the list is traversed without any lock, relying
493 * on the rcu list add and delete to allow safe concurrent traversal.
494 * We reuse the call function data without waiting for any grace
495 * period after some other cpu removes it from the global queue.
496 * This means a cpu might find our data block as it is being
497 * filled out.
498 *
499 * We hold off the interrupt handler on the other cpu by
500 * ordering our writes to the cpu mask vs our setting of the
501 * refs counter. We assert only the cpu owning the data block
502 * will set a bit in cpumask, and each bit will only be cleared
503 * by the subject cpu. Each cpu must first find its bit is
504 * set and then check that refs is set indicating the element is
505 * ready to be processed, otherwise it must skip the entry.
506 *
507 * On the previous iteration refs was set to 0 by another cpu.
508 * To avoid the use of transitivity, set the counter to 0 here
509 * so the wmb will pair with the rmb in the interrupt handler.
510 */
511 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
512
488 data->csd.func = func; 513 data->csd.func = func;
489 data->csd.info = info; 514 data->csd.info = info;
490 cpumask_and(data->cpumask, mask, cpu_online_mask);
491 cpumask_clear_cpu(this_cpu, data->cpumask);
492 515
493 /* 516 /* Ensure 0 refs is visible before mask. Also orders func and info */
494 * To ensure the interrupt handler gets an complete view
495 * we order the cpumask and refs writes and order the read
496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb(); 517 smp_wmb();
500 518
501 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 519 /* We rely on the "and" being processed before the store */
520 cpumask_and(data->cpumask, mask, cpu_online_mask);
521 cpumask_clear_cpu(this_cpu, data->cpumask);
522 refs = cpumask_weight(data->cpumask);
523
524 /* Some callers race with other cpus changing the passed mask */
525 if (unlikely(!refs)) {
526 csd_unlock(&data->csd);
527 return;
528 }
502 529
503 raw_spin_lock_irqsave(&call_function.lock, flags); 530 raw_spin_lock_irqsave(&call_function.lock, flags);
504 /* 531 /*
@@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask,
507 * will not miss any other list entries: 534 * will not miss any other list entries:
508 */ 535 */
509 list_add_rcu(&data->csd.list, &call_function.queue); 536 list_add_rcu(&data->csd.list, &call_function.queue);
537 /*
538 * We rely on the wmb() in list_add_rcu to complete our writes
539 * to the cpumask before this write to refs, which indicates
540 * data is on the list and is ready to be processed.
541 */
542 atomic_set(&data->refs, refs);
510 raw_spin_unlock_irqrestore(&call_function.lock, flags); 543 raw_spin_unlock_irqrestore(&call_function.lock, flags);
511 544
512 /* 545 /*
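The long comment added above documents the race being closed: the per-cpu call_function_data element is reused without waiting for an RCU grace period, so a cpu walking the queue can see it mid-update. The fix is a strict publication order: refs is forced to 0, func/info are written, smp_wmb() orders them before the cpumask store, list_add_rcu() provides the barrier before the final refs store, and only a non-zero refs marks the element ready; readers test their cpumask bit and then refs in the opposite order. A self-contained userspace sketch of the same "payload first, ready-flag last" idea using C11 atomics (the kernel code uses smp_wmb()/atomic_set() instead; the names here are invented):

#include <stdatomic.h>
#include <stdio.h>

struct element {
	void (*func)(void *info);
	void *info;
	atomic_uint refs;	/* 0 means "not published yet, skip me" */
};

static void publish(struct element *e, void (*func)(void *), void *info,
		    unsigned int nr_targets)
{
	atomic_store_explicit(&e->refs, 0, memory_order_relaxed);
	e->func = func;
	e->info = info;
	/* release: payload writes become visible before refs turns non-zero */
	atomic_store_explicit(&e->refs, nr_targets, memory_order_release);
}

static void consume(struct element *e)
{
	/* acquire pairs with the release store in publish() */
	if (!atomic_load_explicit(&e->refs, memory_order_acquire))
		return;		/* half-initialized element: skip it */
	e->func(e->info);
	atomic_fetch_sub_explicit(&e->refs, 1, memory_order_release);
}

static void hello(void *info)
{
	printf("called with \"%s\"\n", (const char *)info);
}

int main(void)
{
	struct element e = { 0 };

	publish(&e, hello, "payload", 1);
	consume(&e);
	return 0;
}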
@@ -571,6 +604,87 @@ void ipi_call_unlock_irq(void)
571} 604}
572#endif /* USE_GENERIC_SMP_HELPERS */ 605#endif /* USE_GENERIC_SMP_HELPERS */
573 606
607/* Setup configured maximum number of CPUs to activate */
608unsigned int setup_max_cpus = NR_CPUS;
609EXPORT_SYMBOL(setup_max_cpus);
610
611
612/*
613 * Setup routine for controlling SMP activation
614 *
615 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
616 * activation entirely (the MPS table probe still happens, though).
617 *
618 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
619 * greater than 0, limits the maximum number of CPUs activated in
620 * SMP mode to <NUM>.
621 */
622
623void __weak arch_disable_smp_support(void) { }
624
625static int __init nosmp(char *str)
626{
627 setup_max_cpus = 0;
628 arch_disable_smp_support();
629
630 return 0;
631}
632
633early_param("nosmp", nosmp);
634
635/* this is hard limit */
636static int __init nrcpus(char *str)
637{
638 int nr_cpus;
639
640 get_option(&str, &nr_cpus);
641 if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
642 nr_cpu_ids = nr_cpus;
643
644 return 0;
645}
646
647early_param("nr_cpus", nrcpus);
648
649static int __init maxcpus(char *str)
650{
651 get_option(&str, &setup_max_cpus);
652 if (setup_max_cpus == 0)
653 arch_disable_smp_support();
654
655 return 0;
656}
657
658early_param("maxcpus", maxcpus);
659
660/* Setup number of possible processor ids */
661int nr_cpu_ids __read_mostly = NR_CPUS;
662EXPORT_SYMBOL(nr_cpu_ids);
663
664/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
665void __init setup_nr_cpu_ids(void)
666{
667 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
668}
669
670/* Called by boot processor to activate the rest. */
671void __init smp_init(void)
672{
673 unsigned int cpu;
674
675 /* FIXME: This should be done in userspace --RR */
676 for_each_present_cpu(cpu) {
677 if (num_online_cpus() >= setup_max_cpus)
678 break;
679 if (!cpu_online(cpu))
680 cpu_up(cpu);
681 }
682
683 /* Any cleanup work */
684 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
685 smp_cpus_done(setup_max_cpus);
686}
687
574/* 688/*
575 * Call a function on all processors. May be used during early boot while 689 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead 690 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec388..174f976c2874 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
54 54
55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -311,9 +311,21 @@ void irq_enter(void)
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314# define invoke_softirq() __do_softirq() 314static inline void invoke_softirq(void)
315{
316 if (!force_irqthreads)
317 __do_softirq();
318 else
319 wakeup_softirqd();
320}
315#else 321#else
316# define invoke_softirq() do_softirq() 322static inline void invoke_softirq(void)
323{
324 if (!force_irqthreads)
325 do_softirq();
326 else
327 wakeup_softirqd();
328}
317#endif 329#endif
318 330
319/* 331/*
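invoke_softirq() now consults force_irqthreads: with forced interrupt threading enabled (a switch introduced by the irq rework elsewhere in this merge and controlled from the boot command line), softirqs are no longer run inline on hardirq exit but handed to ksoftirqd, and run_ksoftirqd below is adjusted to call __do_softirq() with interrupts disabled. A rough sketch of how such a boot-time switch is typically wired up with early_param(); this is an assumption for illustration, not the irq core's actual definition:

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/types.h>

/* Illustrative flag; the real one is defined by the irq core. */
bool force_irqthreads_example __read_mostly;

static int __init setup_forced_irqthreads_example(char *arg)
{
	force_irqthreads_example = true;
	return 0;
}
/* e.g. booting with "threadirqs_example" on the kernel command line */
early_param("threadirqs_example", setup_forced_irqthreads_example);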
@@ -555,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data)
555/** 567/**
556 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 568 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
557 * @ttimer: tasklet_hrtimer which is initialized 569 * @ttimer: tasklet_hrtimer which is initialized
558 * @function: hrtimer callback funtion which gets called from softirq context 570 * @function: hrtimer callback function which gets called from softirq context
559 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 571 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
560 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 572 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
561 */ 573 */
@@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu)
721{ 733{
722 set_current_state(TASK_INTERRUPTIBLE); 734 set_current_state(TASK_INTERRUPTIBLE);
723 735
724 current->flags |= PF_KSOFTIRQD;
725 while (!kthread_should_stop()) { 736 while (!kthread_should_stop()) {
726 preempt_disable(); 737 preempt_disable();
727 if (!local_softirq_pending()) { 738 if (!local_softirq_pending()) {
@@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu)
738 don't process */ 749 don't process */
739 if (cpu_is_offline((long)__bind_cpu)) 750 if (cpu_is_offline((long)__bind_cpu))
740 goto wait_to_die; 751 goto wait_to_die;
741 do_softirq(); 752 local_irq_disable();
753 if (local_softirq_pending())
754 __do_softirq();
755 local_irq_enable();
742 preempt_enable_no_resched(); 756 preempt_enable_no_resched();
743 cond_resched(); 757 cond_resched();
744 preempt_disable(); 758 preempt_disable();
@@ -831,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
831 switch (action) { 845 switch (action) {
832 case CPU_UP_PREPARE: 846 case CPU_UP_PREPARE:
833 case CPU_UP_PREPARE_FROZEN: 847 case CPU_UP_PREPARE_FROZEN:
834 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 848 p = kthread_create_on_node(run_ksoftirqd,
849 hcpu,
850 cpu_to_node(hotcpu),
851 "ksoftirqd/%d", hotcpu);
835 if (IS_ERR(p)) { 852 if (IS_ERR(p)) {
836 printk("ksoftirqd for %i failed\n", hotcpu); 853 printk("ksoftirqd for %i failed\n", hotcpu);
837 return notifier_from_errno(PTR_ERR(p)); 854 return notifier_from_errno(PTR_ERR(p));
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03beb..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
301 case CPU_UP_PREPARE: 301 case CPU_UP_PREPARE:
302 BUG_ON(stopper->thread || stopper->enabled || 302 BUG_ON(stopper->thread || stopper->enabled ||
303 !list_empty(&stopper->works)); 303 !list_empty(&stopper->works));
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create_on_node(cpu_stopper_thread,
305 cpu); 305 stopper,
306 cpu_to_node(cpu),
307 "migration/%d", cpu);
306 if (IS_ERR(p)) 308 if (IS_ERR(p))
307 return notifier_from_errno(PTR_ERR(p)); 309 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 310 get_task_struct(p);
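This hunk and the ksoftirqd hunk above make the same change: per-cpu kernel threads are created with kthread_create_on_node(), so the thread's stack and task_struct are allocated on the memory node of the CPU it will serve, with cpu_to_node() supplying that node. A minimal sketch of creating, binding and starting such a per-cpu helper (hypothetical thread function and name, error handling trimmed):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>

static int my_percpu_worker(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *start_worker_for(unsigned int cpu)
{
	struct task_struct *p;

	p = kthread_create_on_node(my_percpu_worker, NULL,
				   cpu_to_node(cpu), "myworker/%u", cpu);
	if (IS_ERR(p))
		return p;

	kthread_bind(p, cpu);	/* run only on the cpu it serves */
	wake_up_process(p);
	return p;
}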
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702ec813..af468edf096a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -37,6 +37,7 @@
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h> 39#include <linux/gfp.h>
40#include <linux/syscore_ops.h>
40 41
41#include <linux/compat.h> 42#include <linux/compat.h>
42#include <linux/syscalls.h> 43#include <linux/syscalls.h>
@@ -119,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
119void (*pm_power_off_prepare)(void); 120void (*pm_power_off_prepare)(void);
120 121
121/* 122/*
123 * Returns true if current's euid is same as p's uid or euid,
124 * or has CAP_SYS_NICE to p's user_ns.
125 *
126 * Called with rcu_read_lock, creds are safe
127 */
128static bool set_one_prio_perm(struct task_struct *p)
129{
130 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
131
132 if (pcred->user->user_ns == cred->user->user_ns &&
133 (pcred->uid == cred->euid ||
134 pcred->euid == cred->euid))
135 return true;
136 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
137 return true;
138 return false;
139}
140
141/*
122 * set the priority of a task 142 * set the priority of a task
123 * - the caller must hold the RCU read lock 143 * - the caller must hold the RCU read lock
124 */ 144 */
125static int set_one_prio(struct task_struct *p, int niceval, int error) 145static int set_one_prio(struct task_struct *p, int niceval, int error)
126{ 146{
127 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
128 int no_nice; 147 int no_nice;
129 148
130 if (pcred->uid != cred->euid && 149 if (!set_one_prio_perm(p)) {
131 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
132 error = -EPERM; 150 error = -EPERM;
133 goto out; 151 goto out;
134 } 152 }
@@ -298,6 +316,7 @@ void kernel_restart_prepare(char *cmd)
298 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
299 device_shutdown(); 317 device_shutdown();
300 sysdev_shutdown(); 318 sysdev_shutdown();
319 syscore_shutdown();
301} 320}
302 321
303/** 322/**
@@ -336,6 +355,7 @@ void kernel_halt(void)
336{ 355{
337 kernel_shutdown_prepare(SYSTEM_HALT); 356 kernel_shutdown_prepare(SYSTEM_HALT);
338 sysdev_shutdown(); 357 sysdev_shutdown();
358 syscore_shutdown();
339 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT); 360 kmsg_dump(KMSG_DUMP_HALT);
341 machine_halt(); 361 machine_halt();
@@ -355,6 +375,7 @@ void kernel_power_off(void)
355 pm_power_off_prepare(); 375 pm_power_off_prepare();
356 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
357 sysdev_shutdown(); 377 sysdev_shutdown();
378 syscore_shutdown();
358 printk(KERN_EMERG "Power down.\n"); 379 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF); 380 kmsg_dump(KMSG_DUMP_POWEROFF);
360 machine_power_off(); 381 machine_power_off();
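The restart/halt/power-off paths gain a syscore_shutdown() call right after sysdev_shutdown(): syscore_ops is the replacement for sysdev-class shutdown hooks, giving core code one last callback on the single remaining CPU. A hedged sketch of a hypothetical driver registering such a hook, so it runs from the paths touched above:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/syscore_ops.h>

static void mydev_syscore_shutdown(void)
{
	pr_info("mydev: quiescing hardware before power off\n");
}

static struct syscore_ops mydev_syscore_ops = {
	.shutdown = mydev_syscore_shutdown,
};

static int __init mydev_init(void)
{
	register_syscore_ops(&mydev_syscore_ops);
	return 0;
}
core_initcall(mydev_init);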
@@ -502,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
502 if (rgid != (gid_t) -1) { 523 if (rgid != (gid_t) -1) {
503 if (old->gid == rgid || 524 if (old->gid == rgid ||
504 old->egid == rgid || 525 old->egid == rgid ||
505 capable(CAP_SETGID)) 526 nsown_capable(CAP_SETGID))
506 new->gid = rgid; 527 new->gid = rgid;
507 else 528 else
508 goto error; 529 goto error;
@@ -511,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
511 if (old->gid == egid || 532 if (old->gid == egid ||
512 old->egid == egid || 533 old->egid == egid ||
513 old->sgid == egid || 534 old->sgid == egid ||
514 capable(CAP_SETGID)) 535 nsown_capable(CAP_SETGID))
515 new->egid = egid; 536 new->egid = egid;
516 else 537 else
517 goto error; 538 goto error;
@@ -546,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
546 old = current_cred(); 567 old = current_cred();
547 568
548 retval = -EPERM; 569 retval = -EPERM;
549 if (capable(CAP_SETGID)) 570 if (nsown_capable(CAP_SETGID))
550 new->gid = new->egid = new->sgid = new->fsgid = gid; 571 new->gid = new->egid = new->sgid = new->fsgid = gid;
551 else if (gid == old->gid || gid == old->sgid) 572 else if (gid == old->gid || gid == old->sgid)
552 new->egid = new->fsgid = gid; 573 new->egid = new->fsgid = gid;
@@ -613,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
613 new->uid = ruid; 634 new->uid = ruid;
614 if (old->uid != ruid && 635 if (old->uid != ruid &&
615 old->euid != ruid && 636 old->euid != ruid &&
616 !capable(CAP_SETUID)) 637 !nsown_capable(CAP_SETUID))
617 goto error; 638 goto error;
618 } 639 }
619 640
@@ -622,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
622 if (old->uid != euid && 643 if (old->uid != euid &&
623 old->euid != euid && 644 old->euid != euid &&
624 old->suid != euid && 645 old->suid != euid &&
625 !capable(CAP_SETUID)) 646 !nsown_capable(CAP_SETUID))
626 goto error; 647 goto error;
627 } 648 }
628 649
@@ -670,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
670 old = current_cred(); 691 old = current_cred();
671 692
672 retval = -EPERM; 693 retval = -EPERM;
673 if (capable(CAP_SETUID)) { 694 if (nsown_capable(CAP_SETUID)) {
674 new->suid = new->uid = uid; 695 new->suid = new->uid = uid;
675 if (uid != old->uid) { 696 if (uid != old->uid) {
676 retval = set_user(new); 697 retval = set_user(new);
@@ -712,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
712 old = current_cred(); 733 old = current_cred();
713 734
714 retval = -EPERM; 735 retval = -EPERM;
715 if (!capable(CAP_SETUID)) { 736 if (!nsown_capable(CAP_SETUID)) {
716 if (ruid != (uid_t) -1 && ruid != old->uid && 737 if (ruid != (uid_t) -1 && ruid != old->uid &&
717 ruid != old->euid && ruid != old->suid) 738 ruid != old->euid && ruid != old->suid)
718 goto error; 739 goto error;
@@ -776,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
776 old = current_cred(); 797 old = current_cred();
777 798
778 retval = -EPERM; 799 retval = -EPERM;
779 if (!capable(CAP_SETGID)) { 800 if (!nsown_capable(CAP_SETGID)) {
780 if (rgid != (gid_t) -1 && rgid != old->gid && 801 if (rgid != (gid_t) -1 && rgid != old->gid &&
781 rgid != old->egid && rgid != old->sgid) 802 rgid != old->egid && rgid != old->sgid)
782 goto error; 803 goto error;
@@ -836,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
836 857
837 if (uid == old->uid || uid == old->euid || 858 if (uid == old->uid || uid == old->euid ||
838 uid == old->suid || uid == old->fsuid || 859 uid == old->suid || uid == old->fsuid ||
839 capable(CAP_SETUID)) { 860 nsown_capable(CAP_SETUID)) {
840 if (uid != old_fsuid) { 861 if (uid != old_fsuid) {
841 new->fsuid = uid; 862 new->fsuid = uid;
842 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 863 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -869,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
869 890
870 if (gid == old->gid || gid == old->egid || 891 if (gid == old->gid || gid == old->egid ||
871 gid == old->sgid || gid == old->fsgid || 892 gid == old->sgid || gid == old->fsgid ||
872 capable(CAP_SETGID)) { 893 nsown_capable(CAP_SETGID)) {
873 if (gid != old_fsgid) { 894 if (gid != old_fsgid) {
874 new->fsgid = gid; 895 new->fsgid = gid;
875 goto change_okay; 896 goto change_okay;
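From setregid() through setfsgid() the privilege check changes from capable() to nsown_capable(), meaning the capability is evaluated against the caller's own user namespace rather than the initial one, so a namespace's "root" can change ids within its namespace without global privilege. Roughly, and hedged because the real helper lives in the capability code, it amounts to:

#include <linux/capability.h>
#include <linux/cred.h>

/* Rough equivalent of the helper used above, for illustration only. */
static inline bool nsown_capable_sketch(int cap)
{
	return ns_capable(current_user_ns(), cap);
}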
@@ -1177,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1177 int errno; 1198 int errno;
1178 char tmp[__NEW_UTS_LEN]; 1199 char tmp[__NEW_UTS_LEN];
1179 1200
1180 if (!capable(CAP_SYS_ADMIN)) 1201 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1181 return -EPERM; 1202 return -EPERM;
1203
1182 if (len < 0 || len > __NEW_UTS_LEN) 1204 if (len < 0 || len > __NEW_UTS_LEN)
1183 return -EINVAL; 1205 return -EINVAL;
1184 down_write(&uts_sem); 1206 down_write(&uts_sem);
@@ -1226,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1226 int errno; 1248 int errno;
1227 char tmp[__NEW_UTS_LEN]; 1249 char tmp[__NEW_UTS_LEN];
1228 1250
1229 if (!capable(CAP_SYS_ADMIN)) 1251 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1230 return -EPERM; 1252 return -EPERM;
1231 if (len < 0 || len > __NEW_UTS_LEN) 1253 if (len < 0 || len > __NEW_UTS_LEN)
1232 return -EINVAL; 1254 return -EINVAL;
@@ -1341,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1341 rlim = tsk->signal->rlim + resource; 1363 rlim = tsk->signal->rlim + resource;
1342 task_lock(tsk->group_leader); 1364 task_lock(tsk->group_leader);
1343 if (new_rlim) { 1365 if (new_rlim) {
1366 /* Keep the capable check against init_user_ns until
1367 cgroups can contain all limits */
1344 if (new_rlim->rlim_max > rlim->rlim_max && 1368 if (new_rlim->rlim_max > rlim->rlim_max &&
1345 !capable(CAP_SYS_RESOURCE)) 1369 !capable(CAP_SYS_RESOURCE))
1346 retval = -EPERM; 1370 retval = -EPERM;
@@ -1384,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task)
1384{ 1408{
1385 const struct cred *cred = current_cred(), *tcred; 1409 const struct cred *cred = current_cred(), *tcred;
1386 1410
1387 tcred = __task_cred(task); 1411 if (current == task)
1388 if (current != task && 1412 return 0;
1389 (cred->uid != tcred->euid ||
1390 cred->uid != tcred->suid ||
1391 cred->uid != tcred->uid ||
1392 cred->gid != tcred->egid ||
1393 cred->gid != tcred->sgid ||
1394 cred->gid != tcred->gid) &&
1395 !capable(CAP_SYS_RESOURCE)) {
1396 return -EPERM;
1397 }
1398 1413
1399 return 0; 1414 tcred = __task_cred(task);
1415 if (cred->user->user_ns == tcred->user->user_ns &&
1416 (cred->uid == tcred->euid &&
1417 cred->uid == tcred->suid &&
1418 cred->uid == tcred->uid &&
1419 cred->gid == tcred->egid &&
1420 cred->gid == tcred->sgid &&
1421 cred->gid == tcred->gid))
1422 return 0;
1423 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1424 return 0;
1425
1426 return -EPERM;
1400} 1427}
1401 1428
1402SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1429SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
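check_prlimit_permission() is rewritten from one negative compound condition into early returns: the caller itself is always allowed; a full uid/gid match is honored only when caller and target share a user namespace; otherwise CAP_SYS_RESOURCE is checked with ns_capable() in the target's namespace; anything else gets -EPERM. A userspace illustration of the caller-visible rule, assuming the glibc prlimit() wrapper (glibc 2.13 or later); on a foreign pid it fails unless the credential test above passes:

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 1;	/* default: init */
	struct rlimit old;

	if (prlimit(pid, RLIMIT_NOFILE, NULL, &old) != 0) {
		/* e.g. EPERM when ids differ and CAP_SYS_RESOURCE is missing */
		fprintf(stderr, "prlimit(%d): %s\n", pid, strerror(errno));
		return 1;
	}
	printf("pid %d RLIMIT_NOFILE: cur=%llu max=%llu\n", (int)pid,
	       (unsigned long long)old.rlim_cur,
	       (unsigned long long)old.rlim_max);
	return 0;
}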
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9924c7..25cc41cd8f33 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open);
186/* fanotify! */ 186/* fanotify! */
187cond_syscall(sys_fanotify_init); 187cond_syscall(sys_fanotify_init);
188cond_syscall(sys_fanotify_mark); 188cond_syscall(sys_fanotify_mark);
189
190/* open by handle */
191cond_syscall(sys_name_to_handle_at);
192cond_syscall(sys_open_by_handle_at);
193cond_syscall(compat_sys_open_by_handle_at);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0f1bd83db985..c0bb32414b17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -117,6 +117,7 @@ static int neg_one = -1;
117static int zero; 117static int zero;
118static int __maybe_unused one = 1; 118static int __maybe_unused one = 1;
119static int __maybe_unused two = 2; 119static int __maybe_unused two = 2;
120static int __maybe_unused three = 3;
120static unsigned long one_ul = 1; 121static unsigned long one_ul = 1;
121static int one_hundred = 100; 122static int one_hundred = 100;
122#ifdef CONFIG_PRINTK 123#ifdef CONFIG_PRINTK
@@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write,
169 void __user *buffer, size_t *lenp, loff_t *ppos); 170 void __user *buffer, size_t *lenp, loff_t *ppos);
170#endif 171#endif
171 172
173#ifdef CONFIG_PRINTK
174static int proc_dmesg_restrict(struct ctl_table *table, int write,
175 void __user *buffer, size_t *lenp, loff_t *ppos);
176#endif
177
172#ifdef CONFIG_MAGIC_SYSRQ 178#ifdef CONFIG_MAGIC_SYSRQ
173/* Note: sysrq code uses it's own private copy */ 179/* Note: sysrq code uses it's own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 180static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -194,9 +200,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
194static struct ctl_table root_table[]; 200static struct ctl_table root_table[];
195static struct ctl_table_root sysctl_table_root; 201static struct ctl_table_root sysctl_table_root;
196static struct ctl_table_header root_table_header = { 202static struct ctl_table_header root_table_header = {
197 .count = 1, 203 {{.count = 1,
198 .ctl_table = root_table, 204 .ctl_table = root_table,
199 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), 205 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
200 .root = &sysctl_table_root, 206 .root = &sysctl_table_root,
201 .set = &sysctl_table_root.default_set, 207 .set = &sysctl_table_root.default_set,
202}; 208};
@@ -361,20 +367,13 @@ static struct ctl_table kern_table[] = {
361 .mode = 0644, 367 .mode = 0644,
362 .proc_handler = sched_rt_handler, 368 .proc_handler = sched_rt_handler,
363 }, 369 },
364 {
365 .procname = "sched_compat_yield",
366 .data = &sysctl_sched_compat_yield,
367 .maxlen = sizeof(unsigned int),
368 .mode = 0644,
369 .proc_handler = proc_dointvec,
370 },
371#ifdef CONFIG_SCHED_AUTOGROUP 370#ifdef CONFIG_SCHED_AUTOGROUP
372 { 371 {
373 .procname = "sched_autogroup_enabled", 372 .procname = "sched_autogroup_enabled",
374 .data = &sysctl_sched_autogroup_enabled, 373 .data = &sysctl_sched_autogroup_enabled,
375 .maxlen = sizeof(unsigned int), 374 .maxlen = sizeof(unsigned int),
376 .mode = 0644, 375 .mode = 0644,
377 .proc_handler = proc_dointvec, 376 .proc_handler = proc_dointvec_minmax,
378 .extra1 = &zero, 377 .extra1 = &zero,
379 .extra2 = &one, 378 .extra2 = &one,
380 }, 379 },
@@ -713,7 +712,7 @@ static struct ctl_table kern_table[] = {
713 .data = &kptr_restrict, 712 .data = &kptr_restrict,
714 .maxlen = sizeof(int), 713 .maxlen = sizeof(int),
715 .mode = 0644, 714 .mode = 0644,
716 .proc_handler = proc_dointvec_minmax, 715 .proc_handler = proc_dmesg_restrict,
717 .extra1 = &zero, 716 .extra1 = &zero,
718 .extra2 = &two, 717 .extra2 = &two,
719 }, 718 },
@@ -948,7 +947,7 @@ static struct ctl_table kern_table[] = {
948 .data = &sysctl_perf_event_sample_rate, 947 .data = &sysctl_perf_event_sample_rate,
949 .maxlen = sizeof(sysctl_perf_event_sample_rate), 948 .maxlen = sizeof(sysctl_perf_event_sample_rate),
950 .mode = 0644, 949 .mode = 0644,
951 .proc_handler = proc_dointvec, 950 .proc_handler = perf_proc_update_handler,
952 }, 951 },
953#endif 952#endif
954#ifdef CONFIG_KMEMCHECK 953#ifdef CONFIG_KMEMCHECK
@@ -978,14 +977,18 @@ static struct ctl_table vm_table[] = {
978 .data = &sysctl_overcommit_memory, 977 .data = &sysctl_overcommit_memory,
979 .maxlen = sizeof(sysctl_overcommit_memory), 978 .maxlen = sizeof(sysctl_overcommit_memory),
980 .mode = 0644, 979 .mode = 0644,
981 .proc_handler = proc_dointvec, 980 .proc_handler = proc_dointvec_minmax,
981 .extra1 = &zero,
982 .extra2 = &two,
982 }, 983 },
983 { 984 {
984 .procname = "panic_on_oom", 985 .procname = "panic_on_oom",
985 .data = &sysctl_panic_on_oom, 986 .data = &sysctl_panic_on_oom,
986 .maxlen = sizeof(sysctl_panic_on_oom), 987 .maxlen = sizeof(sysctl_panic_on_oom),
987 .mode = 0644, 988 .mode = 0644,
988 .proc_handler = proc_dointvec, 989 .proc_handler = proc_dointvec_minmax,
990 .extra1 = &zero,
991 .extra2 = &two,
989 }, 992 },
990 { 993 {
991 .procname = "oom_kill_allocating_task", 994 .procname = "oom_kill_allocating_task",
@@ -1013,7 +1016,8 @@ static struct ctl_table vm_table[] = {
1013 .data = &page_cluster, 1016 .data = &page_cluster,
1014 .maxlen = sizeof(int), 1017 .maxlen = sizeof(int),
1015 .mode = 0644, 1018 .mode = 0644,
1016 .proc_handler = proc_dointvec, 1019 .proc_handler = proc_dointvec_minmax,
1020 .extra1 = &zero,
1017 }, 1021 },
1018 { 1022 {
1019 .procname = "dirty_background_ratio", 1023 .procname = "dirty_background_ratio",
@@ -1061,7 +1065,8 @@ static struct ctl_table vm_table[] = {
1061 .data = &dirty_expire_interval, 1065 .data = &dirty_expire_interval,
1062 .maxlen = sizeof(dirty_expire_interval), 1066 .maxlen = sizeof(dirty_expire_interval),
1063 .mode = 0644, 1067 .mode = 0644,
1064 .proc_handler = proc_dointvec, 1068 .proc_handler = proc_dointvec_minmax,
1069 .extra1 = &zero,
1065 }, 1070 },
1066 { 1071 {
1067 .procname = "nr_pdflush_threads", 1072 .procname = "nr_pdflush_threads",
@@ -1137,6 +1142,8 @@ static struct ctl_table vm_table[] = {
1137 .maxlen = sizeof(int), 1142 .maxlen = sizeof(int),
1138 .mode = 0644, 1143 .mode = 0644,
1139 .proc_handler = drop_caches_sysctl_handler, 1144 .proc_handler = drop_caches_sysctl_handler,
1145 .extra1 = &one,
1146 .extra2 = &three,
1140 }, 1147 },
1141#ifdef CONFIG_COMPACTION 1148#ifdef CONFIG_COMPACTION
1142 { 1149 {
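A recurring change in this sysctl table is swapping proc_dointvec for proc_dointvec_minmax and supplying extra1/extra2 bounds, so out-of-range writes are rejected with -EINVAL instead of being stored silently; drop_caches, for example, is now limited to 1..3. A sketch of the same pattern for a hypothetical knob in a module's own ctl_table:

#include <linux/sysctl.h>

static int my_knob = 1;
static int my_knob_min = 1;
static int my_knob_max = 3;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &my_knob_min,	/* writes below 1 -> -EINVAL */
		.extra2		= &my_knob_max,	/* writes above 3 -> -EINVAL */
	},
	{ }
};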
@@ -1567,11 +1574,16 @@ void sysctl_head_get(struct ctl_table_header *head)
1567 spin_unlock(&sysctl_lock); 1574 spin_unlock(&sysctl_lock);
1568} 1575}
1569 1576
1577static void free_head(struct rcu_head *rcu)
1578{
1579 kfree(container_of(rcu, struct ctl_table_header, rcu));
1580}
1581
1570void sysctl_head_put(struct ctl_table_header *head) 1582void sysctl_head_put(struct ctl_table_header *head)
1571{ 1583{
1572 spin_lock(&sysctl_lock); 1584 spin_lock(&sysctl_lock);
1573 if (!--head->count) 1585 if (!--head->count)
1574 kfree(head); 1586 call_rcu(&head->rcu, free_head);
1575 spin_unlock(&sysctl_lock); 1587 spin_unlock(&sysctl_lock);
1576} 1588}
1577 1589
@@ -1685,13 +1697,8 @@ static int test_perm(int mode, int op)
1685 1697
1686int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) 1698int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1687{ 1699{
1688 int error;
1689 int mode; 1700 int mode;
1690 1701
1691 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1692 if (error)
1693 return error;
1694
1695 if (root->permissions) 1702 if (root->permissions)
1696 mode = root->permissions(root, current->nsproxy, table); 1703 mode = root->permissions(root, current->nsproxy, table);
1697 else 1704 else
@@ -1948,10 +1955,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1948 start_unregistering(header); 1955 start_unregistering(header);
1949 if (!--header->parent->count) { 1956 if (!--header->parent->count) {
1950 WARN_ON(1); 1957 WARN_ON(1);
1951 kfree(header->parent); 1958 call_rcu(&header->parent->rcu, free_head);
1952 } 1959 }
1953 if (!--header->count) 1960 if (!--header->count)
1954 kfree(header); 1961 call_rcu(&header->rcu, free_head);
1955 spin_unlock(&sysctl_lock); 1962 spin_unlock(&sysctl_lock);
1956} 1963}
1957 1964
@@ -2392,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write,
2392 return err; 2399 return err;
2393} 2400}
2394 2401
2402#ifdef CONFIG_PRINTK
2403static int proc_dmesg_restrict(struct ctl_table *table, int write,
2404 void __user *buffer, size_t *lenp, loff_t *ppos)
2405{
2406 if (write && !capable(CAP_SYS_ADMIN))
2407 return -EPERM;
2408
2409 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2410}
2411#endif
2412
2395struct do_proc_dointvec_minmax_conv_param { 2413struct do_proc_dointvec_minmax_conv_param {
2396 int *min; 2414 int *min;
2397 int *max; 2415 int *max;
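The sysctl.c hunks above stop kfree()ing ctl_table_header objects directly and instead defer the free past an RCU grace period (free_head() plus call_rcu()). A minimal sketch of that deferred-free idiom; the struct and function names here are made up, only the pattern mirrors the patch:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
        int value;
        struct rcu_head rcu;            /* embedded so call_rcu() can find us */
};

static void my_obj_free_rcu(struct rcu_head *rcu)
{
        /* recover the enclosing object from its embedded rcu_head */
        kfree(container_of(rcu, struct my_obj, rcu));
}

static void my_obj_release(struct my_obj *obj)
{
        /*
         * Readers still dereferencing obj under rcu_read_lock() stay
         * safe; the memory is only returned after a grace period.
         */
        call_rcu(&obj->rcu, my_obj_free_rcu);
}

The key point is that the rcu_head lives inside the object itself, so the callback can recover the enclosing structure with container_of() before freeing it, exactly as free_head() does for struct ctl_table_header.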
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b875bedf7c9a..3b8e028b9601 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1322{ 1322{
1323 const struct bin_table *table = NULL; 1323 const struct bin_table *table = NULL;
1324 struct nameidata nd;
1325 struct vfsmount *mnt; 1324 struct vfsmount *mnt;
1326 struct file *file; 1325 struct file *file;
1327 ssize_t result; 1326 ssize_t result;
1328 char *pathname; 1327 char *pathname;
1329 int flags; 1328 int flags;
1330 int acc_mode;
1331 1329
1332 pathname = sysctl_getname(name, nlen, &table); 1330 pathname = sysctl_getname(name, nlen, &table);
1333 result = PTR_ERR(pathname); 1331 result = PTR_ERR(pathname);
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1337 /* How should the sysctl be accessed? */ 1335 /* How should the sysctl be accessed? */
1338 if (oldval && oldlen && newval && newlen) { 1336 if (oldval && oldlen && newval && newlen) {
1339 flags = O_RDWR; 1337 flags = O_RDWR;
1340 acc_mode = MAY_READ | MAY_WRITE;
1341 } else if (newval && newlen) { 1338 } else if (newval && newlen) {
1342 flags = O_WRONLY; 1339 flags = O_WRONLY;
1343 acc_mode = MAY_WRITE;
1344 } else if (oldval && oldlen) { 1340 } else if (oldval && oldlen) {
1345 flags = O_RDONLY; 1341 flags = O_RDONLY;
1346 acc_mode = MAY_READ;
1347 } else { 1342 } else {
1348 result = 0; 1343 result = 0;
1349 goto out_putname; 1344 goto out_putname;
1350 } 1345 }
1351 1346
1352 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1353 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1354 if (result)
1355 goto out_putname;
1356
1357 result = may_open(&nd.path, acc_mode, flags);
1358 if (result)
1359 goto out_putpath;
1360
1361 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1362 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1363 if (IS_ERR(file)) 1350 if (IS_ERR(file))
1364 goto out_putname; 1351 goto out_putname;
@@ -1370,10 +1357,6 @@ out_putname:
1370 putname(pathname); 1357 putname(pathname);
1371out: 1358out:
1372 return result; 1359 return result;
1373
1374out_putpath:
1375 path_put(&nd.path);
1376 goto out_putname;
1377} 1360}
1378 1361
1379 1362
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
111 const char *fail = NULL; 111 const char *fail = NULL;
112 112
113 if (table->parent) { 113 if (table->parent) {
114 if (table->procname && !table->parent->procname) 114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
116 } 116 }
117 if (!table->procname)
118 set_fail(&fail, table, "No procname");
119 if (table->child) { 117 if (table->child) {
120 if (table->data) 118 if (table->data)
121 set_fail(&fail, table, "Directory with data?"); 119 set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
144 set_fail(&fail, table, "No maxlen"); 142 set_fail(&fail, table, "No maxlen");
145 } 143 }
146#ifdef CONFIG_PROC_SYSCTL 144#ifdef CONFIG_PROC_SYSCTL
147 if (table->procname && !table->proc_handler) 145 if (!table->proc_handler)
148 set_fail(&fail, table, "No proc_handler"); 146 set_fail(&fail, table, "No proc_handler");
149#endif 147#endif
150#if 0
151 if (!table->procname && table->proc_handler)
152 set_fail(&fail, table, "proc_handler without procname");
153#endif
154 sysctl_check_leaf(namespaces, table, &fail); 148 sysctl_check_leaf(namespaces, table, &fail);
155 } 149 }
156 if (table->mode > 0777) 150 if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3971c6b9d58d..9ffea360a778 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -685,7 +685,7 @@ static int __init taskstats_init(void)
685 goto err_cgroup_ops; 685 goto err_cgroup_ops;
686 686
687 family_registered = 1; 687 family_registered = 1;
688 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 688 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
689 return 0; 689 return 0;
690err_cgroup_ops: 690err_cgroup_ops:
691 genl_unregister_ops(&family, &taskstats_ops); 691 genl_unregister_ops(&family, &taskstats_ops);
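The taskstats hunk swaps a bare printk() for pr_info(), which supplies the KERN_INFO log level implicitly. A small, hedged sketch of the idiom; the pr_fmt() prefix is optional extra context and not something this patch adds:

/* define before any include so the default pr_fmt() in printk.h is not used */
#define pr_fmt(fmt) "taskstats_example: " fmt

#include <linux/printk.h>

static void announce(int version)
{
        /* equivalent to printk(KERN_INFO "taskstats_example: registered ... %d\n", version) */
        pr_info("registered taskstats version %d\n", version);
}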
diff --git a/kernel/time.c b/kernel/time.c
index 32174359576f..8e8dc6d705c9 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
150 * various programs will get confused when the clock gets warped. 150 * various programs will get confused when the clock gets warped.
151 */ 151 */
152 152
153int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) 153int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
154{ 154{
155 static int firsttime = 1; 155 static int firsttime = 1;
156 int error = 0; 156 int error = 0;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
645} 645}
646 646
647/** 647/**
648 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 648 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
649 * 649 *
650 * @n: nsecs in u64 650 * @n: nsecs in u64
651 * 651 *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) 657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years 658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
659 */ 659 */
660unsigned long nsecs_to_jiffies(u64 n) 660u64 nsecs_to_jiffies64(u64 n)
661{ 661{
662#if (NSEC_PER_SEC % HZ) == 0 662#if (NSEC_PER_SEC % HZ) == 0
663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ 663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
674#endif 674#endif
675} 675}
676 676
677#if (BITS_PER_LONG < 64) 677/**
678u64 get_jiffies_64(void) 678 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
679 *
680 * @n: nsecs in u64
681 *
682 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
683 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
684 * for scheduler, not for use in device drivers to calculate timeout value.
685 *
686 * note:
687 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
688 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
689 */
690unsigned long nsecs_to_jiffies(u64 n)
679{ 691{
680 unsigned long seq; 692 return (unsigned long)nsecs_to_jiffies64(n);
681 u64 ret;
682
683 do {
684 seq = read_seqbegin(&xtime_lock);
685 ret = jiffies_64;
686 } while (read_seqretry(&xtime_lock, seq));
687 return ret;
688} 693}
689EXPORT_SYMBOL(get_jiffies_64);
690#endif
691
692EXPORT_SYMBOL(jiffies);
693 694
694/* 695/*
695 * Add two timespec values and do a safety check for overflow. 696 * Add two timespec values and do a safety check for overflow.
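For the new nsecs_to_jiffies64()/nsecs_to_jiffies() pair above, the common-case arithmetic is a single 64-bit division. A hedged sketch assuming NSEC_PER_SEC is an exact multiple of HZ (the real function has further branches for other HZ values):

#include <linux/jiffies.h>
#include <linux/math64.h>
#include <linux/time.h>

/* common case only: NSEC_PER_SEC % HZ == 0 (HZ = 100, 250, 1000, ...) */
static u64 ns_to_jiffies64_sketch(u64 n)
{
        return div_u64(n, NSEC_PER_SEC / HZ);
}

At HZ=250, for example, one jiffy is 4 ms, so 10,000,000 ns maps to 2 jiffies; the remainder is truncated, and as the comment above notes the helper is meant for the scheduler, not for computing driver timeout values.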
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06c..b0425991e9ac 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o
2 3
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f3..0d74b9ba90c8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a8456909..a470154e0408 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
22************************************************************************/ 22************************************************************************/
23#include <linux/clocksource.h> 23#include <linux/clocksource.h>
24#include <linux/jiffies.h> 24#include <linux/jiffies.h>
25#include <linux/module.h>
25#include <linux/init.h> 26#include <linux/init.h>
26 27
28#include "tick-internal.h"
29
27/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on 31 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as 32 * all systems. It has the same coarse resolution as
@@ -31,7 +34,7 @@
31 * inaccuracies caused by missed or lost timer 34 * inaccuracies caused by missed or lost timer
32 * interrupts and the inability for the timer 35 * interrupts and the inability for the timer
33 * interrupt hardware to accuratly tick at the 36 * interrupt hardware to accuratly tick at the
34 * requested HZ value. It is also not reccomended 37 * requested HZ value. It is also not recommended
35 * for "tick-less" systems. 38 * for "tick-less" systems.
36 */ 39 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
64 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
65}; 68};
66 69
70#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void)
72{
73 unsigned long seq;
74 u64 ret;
75
76 do {
77 seq = read_seqbegin(&xtime_lock);
78 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq));
80 return ret;
81}
82EXPORT_SYMBOL(get_jiffies_64);
83#endif
84
85EXPORT_SYMBOL(jiffies);
86
67static int __init init_jiffies_clocksource(void) 87static int __init init_jiffies_clocksource(void)
68{ 88{
69 return clocksource_register(&clocksource_jiffies); 89 return clocksource_register(&clocksource_jiffies);
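get_jiffies_64(), moved into jiffies.c here, is a standard seqlock reader for a 64-bit value that 32-bit CPUs cannot read atomically. A self-contained sketch of both sides of that pairing, with illustrative names:

#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(my_lock);
static u64 my_counter;                  /* too wide for an atomic read on 32-bit */

static void writer_add(unsigned long ticks)
{
        write_seqlock(&my_lock);
        my_counter += ticks;
        write_sequnlock(&my_lock);
}

static u64 reader_get(void)
{
        unsigned long seq;
        u64 val;

        do {                            /* retry if a writer ran while we sampled */
                seq = read_seqbegin(&my_lock);
                val = my_counter;
        } while (read_seqretry(&my_lock, seq));

        return val;
}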
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5c00242fa921..f6117a4c7cb8 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,6 +16,8 @@
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18 18
19#include "tick-internal.h"
20
19/* 21/*
20 * NTP timekeeping variables: 22 * NTP timekeeping variables:
21 */ 23 */
@@ -646,6 +648,19 @@ int do_adjtimex(struct timex *txc)
646 hrtimer_cancel(&leap_timer); 648 hrtimer_cancel(&leap_timer);
647 } 649 }
648 650
651 if (txc->modes & ADJ_SETOFFSET) {
652 struct timespec delta;
653 delta.tv_sec = txc->time.tv_sec;
654 delta.tv_nsec = txc->time.tv_usec;
655 if (!capable(CAP_SYS_TIME))
656 return -EPERM;
657 if (!(txc->modes & ADJ_NANO))
658 delta.tv_nsec *= 1000;
659 result = timekeeping_inject_offset(&delta);
660 if (result)
661 return result;
662 }
663
649 getnstimeofday(&ts); 664 getnstimeofday(&ts);
650 665
651 write_seqlock_irq(&xtime_lock); 666 write_seqlock_irq(&xtime_lock);
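The ADJ_SETOFFSET branch added to do_adjtimex() lets a privileged caller step the clock by a fixed offset via timekeeping_inject_offset(). A hedged userspace sketch of exercising it (requires CAP_SYS_TIME and a kernel with this patch; the fallback #define is only for older headers that lack the constant):

#include <stdio.h>
#include <sys/timex.h>

#ifndef ADJ_SETOFFSET
#define ADJ_SETOFFSET 0x0100            /* value used by this series */
#endif

int main(void)
{
        struct timex tx = { 0 };

        /*
         * Step the clock forward by 0.5 s. With ADJ_NANO the offset in
         * time.tv_usec is interpreted as nanoseconds, matching the
         * "delta.tv_nsec *= 1000" fallback in the hunk above.
         */
        tx.modes = ADJ_SETOFFSET | ADJ_NANO;
        tx.time.tv_sec = 0;
        tx.time.tv_usec = 500000000;

        if (adjtimex(&tx) < 0)
                perror("adjtimex");     /* EPERM without CAP_SYS_TIME */
        return 0;
}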
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 000000000000..c340ca658f37
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,445 @@
1/*
2 * posix-clock.c - support for dynamic clock devices
3 *
4 * Copyright (C) 2010 OMICRON electronics GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#include <linux/device.h>
21#include <linux/file.h>
22#include <linux/posix-clock.h>
23#include <linux/slab.h>
24#include <linux/syscalls.h>
25#include <linux/uaccess.h>
26
27static void delete_clock(struct kref *kref);
28
29/*
30 * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
31 */
32static struct posix_clock *get_posix_clock(struct file *fp)
33{
34 struct posix_clock *clk = fp->private_data;
35
36 down_read(&clk->rwsem);
37
38 if (!clk->zombie)
39 return clk;
40
41 up_read(&clk->rwsem);
42
43 return NULL;
44}
45
46static void put_posix_clock(struct posix_clock *clk)
47{
48 up_read(&clk->rwsem);
49}
50
51static ssize_t posix_clock_read(struct file *fp, char __user *buf,
52 size_t count, loff_t *ppos)
53{
54 struct posix_clock *clk = get_posix_clock(fp);
55 int err = -EINVAL;
56
57 if (!clk)
58 return -ENODEV;
59
60 if (clk->ops.read)
61 err = clk->ops.read(clk, fp->f_flags, buf, count);
62
63 put_posix_clock(clk);
64
65 return err;
66}
67
68static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
69{
70 struct posix_clock *clk = get_posix_clock(fp);
71 int result = 0;
72
73 if (!clk)
74 return -ENODEV;
75
76 if (clk->ops.poll)
77 result = clk->ops.poll(clk, fp, wait);
78
79 put_posix_clock(clk);
80
81 return result;
82}
83
84static int posix_clock_fasync(int fd, struct file *fp, int on)
85{
86 struct posix_clock *clk = get_posix_clock(fp);
87 int err = 0;
88
89 if (!clk)
90 return -ENODEV;
91
92 if (clk->ops.fasync)
93 err = clk->ops.fasync(clk, fd, fp, on);
94
95 put_posix_clock(clk);
96
97 return err;
98}
99
100static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
101{
102 struct posix_clock *clk = get_posix_clock(fp);
103 int err = -ENODEV;
104
105 if (!clk)
106 return -ENODEV;
107
108 if (clk->ops.mmap)
109 err = clk->ops.mmap(clk, vma);
110
111 put_posix_clock(clk);
112
113 return err;
114}
115
116static long posix_clock_ioctl(struct file *fp,
117 unsigned int cmd, unsigned long arg)
118{
119 struct posix_clock *clk = get_posix_clock(fp);
120 int err = -ENOTTY;
121
122 if (!clk)
123 return -ENODEV;
124
125 if (clk->ops.ioctl)
126 err = clk->ops.ioctl(clk, cmd, arg);
127
128 put_posix_clock(clk);
129
130 return err;
131}
132
133#ifdef CONFIG_COMPAT
134static long posix_clock_compat_ioctl(struct file *fp,
135 unsigned int cmd, unsigned long arg)
136{
137 struct posix_clock *clk = get_posix_clock(fp);
138 int err = -ENOTTY;
139
140 if (!clk)
141 return -ENODEV;
142
143 if (clk->ops.ioctl)
144 err = clk->ops.ioctl(clk, cmd, arg);
145
146 put_posix_clock(clk);
147
148 return err;
149}
150#endif
151
152static int posix_clock_open(struct inode *inode, struct file *fp)
153{
154 int err;
155 struct posix_clock *clk =
156 container_of(inode->i_cdev, struct posix_clock, cdev);
157
158 down_read(&clk->rwsem);
159
160 if (clk->zombie) {
161 err = -ENODEV;
162 goto out;
163 }
164 if (clk->ops.open)
165 err = clk->ops.open(clk, fp->f_mode);
166 else
167 err = 0;
168
169 if (!err) {
170 kref_get(&clk->kref);
171 fp->private_data = clk;
172 }
173out:
174 up_read(&clk->rwsem);
175 return err;
176}
177
178static int posix_clock_release(struct inode *inode, struct file *fp)
179{
180 struct posix_clock *clk = fp->private_data;
181 int err = 0;
182
183 if (clk->ops.release)
184 err = clk->ops.release(clk);
185
186 kref_put(&clk->kref, delete_clock);
187
188 fp->private_data = NULL;
189
190 return err;
191}
192
193static const struct file_operations posix_clock_file_operations = {
194 .owner = THIS_MODULE,
195 .llseek = no_llseek,
196 .read = posix_clock_read,
197 .poll = posix_clock_poll,
198 .unlocked_ioctl = posix_clock_ioctl,
199 .open = posix_clock_open,
200 .release = posix_clock_release,
201 .fasync = posix_clock_fasync,
202 .mmap = posix_clock_mmap,
203#ifdef CONFIG_COMPAT
204 .compat_ioctl = posix_clock_compat_ioctl,
205#endif
206};
207
208int posix_clock_register(struct posix_clock *clk, dev_t devid)
209{
210 int err;
211
212 kref_init(&clk->kref);
213 init_rwsem(&clk->rwsem);
214
215 cdev_init(&clk->cdev, &posix_clock_file_operations);
216 clk->cdev.owner = clk->ops.owner;
217 err = cdev_add(&clk->cdev, devid, 1);
218
219 return err;
220}
221EXPORT_SYMBOL_GPL(posix_clock_register);
222
223static void delete_clock(struct kref *kref)
224{
225 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
226
227 if (clk->release)
228 clk->release(clk);
229}
230
231void posix_clock_unregister(struct posix_clock *clk)
232{
233 cdev_del(&clk->cdev);
234
235 down_write(&clk->rwsem);
236 clk->zombie = true;
237 up_write(&clk->rwsem);
238
239 kref_put(&clk->kref, delete_clock);
240}
241EXPORT_SYMBOL_GPL(posix_clock_unregister);
242
243struct posix_clock_desc {
244 struct file *fp;
245 struct posix_clock *clk;
246};
247
248static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
249{
250 struct file *fp = fget(CLOCKID_TO_FD(id));
251 int err = -EINVAL;
252
253 if (!fp)
254 return err;
255
256 if (fp->f_op->open != posix_clock_open || !fp->private_data)
257 goto out;
258
259 cd->fp = fp;
260 cd->clk = get_posix_clock(fp);
261
262 err = cd->clk ? 0 : -ENODEV;
263out:
264 if (err)
265 fput(fp);
266 return err;
267}
268
269static void put_clock_desc(struct posix_clock_desc *cd)
270{
271 put_posix_clock(cd->clk);
272 fput(cd->fp);
273}
274
275static int pc_clock_adjtime(clockid_t id, struct timex *tx)
276{
277 struct posix_clock_desc cd;
278 int err;
279
280 err = get_clock_desc(id, &cd);
281 if (err)
282 return err;
283
284 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
285 err = -EACCES;
286 goto out;
287 }
288
289 if (cd.clk->ops.clock_adjtime)
290 err = cd.clk->ops.clock_adjtime(cd.clk, tx);
291 else
292 err = -EOPNOTSUPP;
293out:
294 put_clock_desc(&cd);
295
296 return err;
297}
298
299static int pc_clock_gettime(clockid_t id, struct timespec *ts)
300{
301 struct posix_clock_desc cd;
302 int err;
303
304 err = get_clock_desc(id, &cd);
305 if (err)
306 return err;
307
308 if (cd.clk->ops.clock_gettime)
309 err = cd.clk->ops.clock_gettime(cd.clk, ts);
310 else
311 err = -EOPNOTSUPP;
312
313 put_clock_desc(&cd);
314
315 return err;
316}
317
318static int pc_clock_getres(clockid_t id, struct timespec *ts)
319{
320 struct posix_clock_desc cd;
321 int err;
322
323 err = get_clock_desc(id, &cd);
324 if (err)
325 return err;
326
327 if (cd.clk->ops.clock_getres)
328 err = cd.clk->ops.clock_getres(cd.clk, ts);
329 else
330 err = -EOPNOTSUPP;
331
332 put_clock_desc(&cd);
333
334 return err;
335}
336
337static int pc_clock_settime(clockid_t id, const struct timespec *ts)
338{
339 struct posix_clock_desc cd;
340 int err;
341
342 err = get_clock_desc(id, &cd);
343 if (err)
344 return err;
345
346 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
347 err = -EACCES;
348 goto out;
349 }
350
351 if (cd.clk->ops.clock_settime)
352 err = cd.clk->ops.clock_settime(cd.clk, ts);
353 else
354 err = -EOPNOTSUPP;
355out:
356 put_clock_desc(&cd);
357
358 return err;
359}
360
361static int pc_timer_create(struct k_itimer *kit)
362{
363 clockid_t id = kit->it_clock;
364 struct posix_clock_desc cd;
365 int err;
366
367 err = get_clock_desc(id, &cd);
368 if (err)
369 return err;
370
371 if (cd.clk->ops.timer_create)
372 err = cd.clk->ops.timer_create(cd.clk, kit);
373 else
374 err = -EOPNOTSUPP;
375
376 put_clock_desc(&cd);
377
378 return err;
379}
380
381static int pc_timer_delete(struct k_itimer *kit)
382{
383 clockid_t id = kit->it_clock;
384 struct posix_clock_desc cd;
385 int err;
386
387 err = get_clock_desc(id, &cd);
388 if (err)
389 return err;
390
391 if (cd.clk->ops.timer_delete)
392 err = cd.clk->ops.timer_delete(cd.clk, kit);
393 else
394 err = -EOPNOTSUPP;
395
396 put_clock_desc(&cd);
397
398 return err;
399}
400
401static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
402{
403 clockid_t id = kit->it_clock;
404 struct posix_clock_desc cd;
405
406 if (get_clock_desc(id, &cd))
407 return;
408
409 if (cd.clk->ops.timer_gettime)
410 cd.clk->ops.timer_gettime(cd.clk, kit, ts);
411
412 put_clock_desc(&cd);
413}
414
415static int pc_timer_settime(struct k_itimer *kit, int flags,
416 struct itimerspec *ts, struct itimerspec *old)
417{
418 clockid_t id = kit->it_clock;
419 struct posix_clock_desc cd;
420 int err;
421
422 err = get_clock_desc(id, &cd);
423 if (err)
424 return err;
425
426 if (cd.clk->ops.timer_settime)
427 err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
428 else
429 err = -EOPNOTSUPP;
430
431 put_clock_desc(&cd);
432
433 return err;
434}
435
436struct k_clock clock_posix_dynamic = {
437 .clock_getres = pc_clock_getres,
438 .clock_set = pc_clock_settime,
439 .clock_get = pc_clock_gettime,
440 .clock_adj = pc_clock_adjtime,
441 .timer_create = pc_timer_create,
442 .timer_set = pc_timer_settime,
443 .timer_del = pc_timer_delete,
444 .timer_get = pc_timer_gettime,
445};
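posix-clock.c above is the kernel half of the dynamic clock-device API: a driver embeds a struct posix_clock, fills in its ops and registers a character device, and the pc_clock_*()/pc_timer_*() shims route the POSIX clock syscalls to it. A hedged driver-side sketch; the names, device-number handling and the current_kernel_time() stand-in are illustrative, not taken from a real driver:

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/posix-clock.h>
#include <linux/time.h>

static struct posix_clock my_clock;
static dev_t my_devt;

static int my_clock_gettime(struct posix_clock *pc, struct timespec *ts)
{
        /* a real driver would read its hardware clock here */
        *ts = current_kernel_time();
        return 0;
}

static struct posix_clock_operations my_clock_ops = {
        .owner          = THIS_MODULE,
        .clock_gettime  = my_clock_gettime,
};

static int __init my_clock_init(void)
{
        int err = alloc_chrdev_region(&my_devt, 0, 1, "myclock");

        if (err)
                return err;

        my_clock.ops = my_clock_ops;
        err = posix_clock_register(&my_clock, my_devt);
        if (err)
                unregister_chrdev_region(my_devt, 1);
        return err;
}

static void __exit my_clock_exit(void)
{
        posix_clock_unregister(&my_clock);
        unregister_chrdev_region(my_devt, 1);
}

module_init(my_clock_init);
module_exit(my_clock_exit);
MODULE_LICENSE("GPL");

Userspace then opens the character device and derives a clockid from the file descriptor (the inverse of the CLOCKID_TO_FD() used in get_clock_desc() above) to pass to clock_gettime() and friends.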
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761b5668..da800ffa810c 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -600,4 +599,14 @@ int tick_broadcast_oneshot_active(void)
600 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; 599 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
601} 600}
602 601
602/*
603 * Check whether the broadcast device supports oneshot.
604 */
605bool tick_broadcast_oneshot_available(void)
606{
607 struct clock_event_device *bc = tick_broadcast_device.evtdev;
608
609 return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
610}
611
603#endif 612#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 051bc80a0c43..119528de8235 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include <asm/irq_regs.h> 22#include <asm/irq_regs.h>
24 23
@@ -51,7 +50,11 @@ int tick_is_oneshot_available(void)
51{ 50{
52 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 51 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 52
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 53 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
54 return 0;
55 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
56 return 1;
57 return tick_broadcast_oneshot_available();
55} 58}
56 59
57/* 60/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f60..1009b06d6f89 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4#include <linux/hrtimer.h>
5#include <linux/tick.h>
6
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
4 8
5#define TICK_DO_TIMER_NONE -1 9#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2 10#define TICK_DO_TIMER_BOOT -2
@@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
36extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 40extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
37extern int tick_broadcast_oneshot_active(void); 41extern int tick_broadcast_oneshot_active(void);
38extern void tick_check_oneshot_broadcast(int cpu); 42extern void tick_check_oneshot_broadcast(int cpu);
43bool tick_broadcast_oneshot_available(void);
39# else /* BROADCAST */ 44# else /* BROADCAST */
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 45static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
41{ 46{
@@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 51static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; } 52static inline int tick_broadcast_oneshot_active(void) { return 0; }
48static inline void tick_check_oneshot_broadcast(int cpu) { } 53static inline void tick_check_oneshot_broadcast(int cpu) { }
54static inline bool tick_broadcast_oneshot_available(void) { return true; }
49# endif /* !BROADCAST */ 55# endif /* !BROADCAST */
50 56
51#else /* !ONESHOT */ 57#else /* !ONESHOT */
@@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
76 return 0; 82 return 0;
77} 83}
78static inline int tick_broadcast_oneshot_active(void) { return 0; } 84static inline int tick_broadcast_oneshot_active(void) { return 0; }
85static inline bool tick_broadcast_oneshot_available(void) { return false; }
79#endif /* !TICK_ONESHOT */ 86#endif /* !TICK_ONESHOT */
80 87
81/* 88/*
@@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
132{ 139{
133 return !(dev->features & CLOCK_EVT_FEAT_DUMMY); 140 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
134} 141}
142
143#endif
144
145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5cbc101f908b..2d04411a5f05 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c55ea2433471..d5097c44b407 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h>
23#include <linux/module.h> 22#include <linux/module.h>
24 23
25#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d27c7562902c..8ad5d576755e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -14,7 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/sysdev.h> 17#include <linux/syscore_ops.h>
18#include <linux/clocksource.h> 18#include <linux/clocksource.h>
19#include <linux/jiffies.h> 19#include <linux/jiffies.h>
20#include <linux/time.h> 20#include <linux/time.h>
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
353 * 353 *
354 * Sets the time of day to the new time and update NTP and notify hrtimers 354 * Sets the time of day to the new time and update NTP and notify hrtimers
355 */ 355 */
356int do_settimeofday(struct timespec *tv) 356int do_settimeofday(const struct timespec *tv)
357{ 357{
358 struct timespec ts_delta; 358 struct timespec ts_delta;
359 unsigned long flags; 359 unsigned long flags;
@@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
387 387
388EXPORT_SYMBOL(do_settimeofday); 388EXPORT_SYMBOL(do_settimeofday);
389 389
390
391/**
392 * timekeeping_inject_offset - Adds or subtracts from the current time.
393 * @tv: pointer to the timespec variable containing the offset
394 *
395 * Adds or subtracts an offset value from the current time.
396 */
397int timekeeping_inject_offset(struct timespec *ts)
398{
399 unsigned long flags;
400
401 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
402 return -EINVAL;
403
404 write_seqlock_irqsave(&xtime_lock, flags);
405
406 timekeeping_forward_now();
407
408 xtime = timespec_add(xtime, *ts);
409 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
410
411 timekeeper.ntp_error = 0;
412 ntp_clear();
413
414 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
415 timekeeper.mult);
416
417 write_sequnlock_irqrestore(&xtime_lock, flags);
418
419 /* signal hrtimers about time change */
420 clock_was_set();
421
422 return 0;
423}
424EXPORT_SYMBOL(timekeeping_inject_offset);
425
390/** 426/**
391 * change_clocksource - Swaps clocksources if a new one is available 427 * change_clocksource - Swaps clocksources if a new one is available
392 * 428 *
@@ -561,13 +597,12 @@ static struct timespec timekeeping_suspend_time;
561 597
562/** 598/**
563 * timekeeping_resume - Resumes the generic timekeeping subsystem. 599 * timekeeping_resume - Resumes the generic timekeeping subsystem.
564 * @dev: unused
565 * 600 *
566 * This is for the generic clocksource timekeeping. 601 * This is for the generic clocksource timekeeping.
567 * xtime/wall_to_monotonic/jiffies/etc are 602 * xtime/wall_to_monotonic/jiffies/etc are
568 * still managed by arch specific suspend/resume code. 603 * still managed by arch specific suspend/resume code.
569 */ 604 */
570static int timekeeping_resume(struct sys_device *dev) 605static void timekeeping_resume(void)
571{ 606{
572 unsigned long flags; 607 unsigned long flags;
573 struct timespec ts; 608 struct timespec ts;
@@ -596,11 +631,9 @@ static int timekeeping_resume(struct sys_device *dev)
596 631
597 /* Resume hrtimers */ 632 /* Resume hrtimers */
598 hres_timers_resume(); 633 hres_timers_resume();
599
600 return 0;
601} 634}
602 635
603static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) 636static int timekeeping_suspend(void)
604{ 637{
605 unsigned long flags; 638 unsigned long flags;
606 639
@@ -618,26 +651,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
618} 651}
619 652
620/* sysfs resume/suspend bits for timekeeping */ 653/* sysfs resume/suspend bits for timekeeping */
621static struct sysdev_class timekeeping_sysclass = { 654static struct syscore_ops timekeeping_syscore_ops = {
622 .name = "timekeeping",
623 .resume = timekeeping_resume, 655 .resume = timekeeping_resume,
624 .suspend = timekeeping_suspend, 656 .suspend = timekeeping_suspend,
625}; 657};
626 658
627static struct sys_device device_timer = { 659static int __init timekeeping_init_ops(void)
628 .id = 0,
629 .cls = &timekeeping_sysclass,
630};
631
632static int __init timekeeping_init_device(void)
633{ 660{
634 int error = sysdev_class_register(&timekeeping_sysclass); 661 register_syscore_ops(&timekeeping_syscore_ops);
635 if (!error) 662 return 0;
636 error = sysdev_register(&device_timer);
637 return error;
638} 663}
639 664
640device_initcall(timekeeping_init_device); 665device_initcall(timekeeping_init_ops);
641 666
642/* 667/*
643 * If the error is already larger, we look ahead even further 668 * If the error is already larger, we look ahead even further
@@ -779,7 +804,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
779 * 804 *
780 * Called from the timer interrupt, must hold a write on xtime_lock. 805 * Called from the timer interrupt, must hold a write on xtime_lock.
781 */ 806 */
782void update_wall_time(void) 807static void update_wall_time(void)
783{ 808{
784 struct clocksource *clock; 809 struct clocksource *clock;
785 cycle_t offset; 810 cycle_t offset;
@@ -871,7 +896,7 @@ void update_wall_time(void)
871 * getboottime - Return the real time of system boot. 896 * getboottime - Return the real time of system boot.
872 * @ts: pointer to the timespec to be set 897 * @ts: pointer to the timespec to be set
873 * 898 *
874 * Returns the time of day in a timespec. 899 * Returns the wall-time of boot in a timespec.
875 * 900 *
876 * This is based on the wall_to_monotonic offset and the total suspend 901 * This is based on the wall_to_monotonic offset and the total suspend
877 * time. Calls to settimeofday will affect the value returned (which 902 * time. Calls to settimeofday will affect the value returned (which
@@ -889,6 +914,55 @@ void getboottime(struct timespec *ts)
889} 914}
890EXPORT_SYMBOL_GPL(getboottime); 915EXPORT_SYMBOL_GPL(getboottime);
891 916
917
918/**
919 * get_monotonic_boottime - Returns monotonic time since boot
920 * @ts: pointer to the timespec to be set
921 *
922 * Returns the monotonic time since boot in a timespec.
923 *
924 * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also
925 * includes the time spent in suspend.
926 */
927void get_monotonic_boottime(struct timespec *ts)
928{
929 struct timespec tomono, sleep;
930 unsigned int seq;
931 s64 nsecs;
932
933 WARN_ON(timekeeping_suspended);
934
935 do {
936 seq = read_seqbegin(&xtime_lock);
937 *ts = xtime;
938 tomono = wall_to_monotonic;
939 sleep = total_sleep_time;
940 nsecs = timekeeping_get_ns();
941
942 } while (read_seqretry(&xtime_lock, seq));
943
944 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
945 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
946}
947EXPORT_SYMBOL_GPL(get_monotonic_boottime);
948
949/**
950 * ktime_get_boottime - Returns monotonic time since boot in a ktime
951 *
952 * Returns the monotonic time since boot in a ktime
953 *
954 * This is similar to CLOCK_MONTONIC/ktime_get, but also
955 * includes the time spent in suspend.
956 */
957ktime_t ktime_get_boottime(void)
958{
959 struct timespec ts;
960
961 get_monotonic_boottime(&ts);
962 return timespec_to_ktime(ts);
963}
964EXPORT_SYMBOL_GPL(ktime_get_boottime);
965
892/** 966/**
893 * monotonic_to_bootbased - Convert the monotonic time to boot based. 967 * monotonic_to_bootbased - Convert the monotonic time to boot based.
894 * @ts: pointer to the timespec to be converted 968 * @ts: pointer to the timespec to be converted
@@ -910,11 +984,6 @@ struct timespec __current_kernel_time(void)
910 return xtime; 984 return xtime;
911} 985}
912 986
913struct timespec __get_wall_to_monotonic(void)
914{
915 return wall_to_monotonic;
916}
917
918struct timespec current_kernel_time(void) 987struct timespec current_kernel_time(void)
919{ 988{
920 struct timespec now; 989 struct timespec now;
@@ -946,3 +1015,48 @@ struct timespec get_monotonic_coarse(void)
946 now.tv_nsec + mono.tv_nsec); 1015 now.tv_nsec + mono.tv_nsec);
947 return now; 1016 return now;
948} 1017}
1018
1019/*
1020 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1021 * without sampling the sequence number in xtime_lock.
1022 * jiffies is defined in the linker script...
1023 */
1024void do_timer(unsigned long ticks)
1025{
1026 jiffies_64 += ticks;
1027 update_wall_time();
1028 calc_global_load(ticks);
1029}
1030
1031/**
1032 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
1033 * and sleep offsets.
1034 * @xtim: pointer to timespec to be set with xtime
1035 * @wtom: pointer to timespec to be set with wall_to_monotonic
1036 * @sleep: pointer to timespec to be set with time in suspend
1037 */
1038void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1039 struct timespec *wtom, struct timespec *sleep)
1040{
1041 unsigned long seq;
1042
1043 do {
1044 seq = read_seqbegin(&xtime_lock);
1045 *xtim = xtime;
1046 *wtom = wall_to_monotonic;
1047 *sleep = total_sleep_time;
1048 } while (read_seqretry(&xtime_lock, seq));
1049}
1050
1051/**
1052 * xtime_update() - advances the timekeeping infrastructure
1053 * @ticks: number of ticks, that have elapsed since the last call.
1054 *
1055 * Must be called with interrupts disabled.
1056 */
1057void xtime_update(unsigned long ticks)
1058{
1059 write_seqlock(&xtime_lock);
1060 do_timer(ticks);
1061 write_sequnlock(&xtime_lock);
1062}
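timekeeping.c drops its sysdev class/device pair in favour of syscore_ops, whose callbacks run late in suspend and early in resume with a single CPU online and interrupts disabled. A minimal sketch of that registration pattern, with illustrative names:

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int my_suspend(void)
{
        /* save state; a non-zero return aborts the suspend */
        return 0;
}

static void my_resume(void)
{
        /* restore state */
}

static struct syscore_ops my_syscore_ops = {
        .suspend = my_suspend,
        .resume  = my_resume,
};

static int __init my_syscore_init(void)
{
        register_syscore_ops(&my_syscore_ops);
        return 0;
}
device_initcall(my_syscore_init);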
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 2f3b585b8d7d..a5d0a3a85dd8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
236 unsigned int timer_flag) 236 unsigned int timer_flag)
237{ 237{
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesn't matter which lock we take:
240 */ 240 */
241 raw_spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
diff --git a/kernel/timer.c b/kernel/timer.c
index d6459923d245..fd6198692b57 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
404 404
405static struct debug_obj_descr timer_debug_descr; 405static struct debug_obj_descr timer_debug_descr;
406 406
407static void *timer_debug_hint(void *addr)
408{
409 return ((struct timer_list *) addr)->function;
410}
411
407/* 412/*
408 * fixup_init is called when: 413 * fixup_init is called when:
409 * - an active object is initialized 414 * - an active object is initialized
@@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
477 482
478static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
479 .name = "timer_list", 484 .name = "timer_list",
485 .debug_hint = timer_debug_hint,
480 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
481 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
482 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
@@ -964,6 +970,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
964 * add_timer_on(). Upon exit the timer is not queued and the handler is 970 * add_timer_on(). Upon exit the timer is not queued and the handler is
965 * not running on any CPU. 971 * not running on any CPU.
966 * 972 *
973 * Note: You must not hold locks that are held in interrupt context
974 * while calling this function. Even if the lock has nothing to do
975 * with the timer in question. Here's why:
976 *
977 * CPU0 CPU1
978 * ---- ----
979 * <SOFTIRQ>
980 * call_timer_fn();
981 * base->running_timer = mytimer;
982 * spin_lock_irq(somelock);
983 * <IRQ>
984 * spin_lock(somelock);
985 * del_timer_sync(mytimer);
986 * while (base->running_timer == mytimer);
987 *
988 * Now del_timer_sync() will never return and never release somelock.
989 * The interrupt on the other CPU is waiting to grab somelock but
990 * it has interrupted the softirq that CPU0 is waiting to finish.
991 *
967 * The function returns whether it has deactivated a pending timer or not. 992 * The function returns whether it has deactivated a pending timer or not.
968 */ 993 */
969int del_timer_sync(struct timer_list *timer) 994int del_timer_sync(struct timer_list *timer)
@@ -971,6 +996,10 @@ int del_timer_sync(struct timer_list *timer)
971#ifdef CONFIG_LOCKDEP 996#ifdef CONFIG_LOCKDEP
972 unsigned long flags; 997 unsigned long flags;
973 998
999 /*
1000 * If lockdep gives a backtrace here, please reference
1001 * the synchronization rules above.
1002 */
974 local_irq_save(flags); 1003 local_irq_save(flags);
975 lock_map_acquire(&timer->lockdep_map); 1004 lock_map_acquire(&timer->lockdep_map);
976 lock_map_release(&timer->lockdep_map); 1005 lock_map_release(&timer->lockdep_map);
@@ -1295,19 +1324,6 @@ void run_local_timers(void)
1295 raise_softirq(TIMER_SOFTIRQ); 1324 raise_softirq(TIMER_SOFTIRQ);
1296} 1325}
1297 1326
1298/*
1299 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1300 * without sampling the sequence number in xtime_lock.
1301 * jiffies is defined in the linker script...
1302 */
1303
1304void do_timer(unsigned long ticks)
1305{
1306 jiffies_64 += ticks;
1307 update_wall_time();
1308 calc_global_load(ticks);
1309}
1310
1311#ifdef __ARCH_WANT_SYS_ALARM 1327#ifdef __ARCH_WANT_SYS_ALARM
1312 1328
1313/* 1329/*
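The new comment on del_timer_sync() documents a deadlock rule rather than changing behaviour: never call it while holding a lock that the timer callback, or an interrupt on another CPU, might also take. A hedged sketch of the safe teardown ordering; the timer and lock are assumed to be initialised elsewhere:

#include <linux/spinlock.h>
#include <linux/timer.h>

static DEFINE_SPINLOCK(somelock);       /* also taken from interrupt context */
static struct timer_list mytimer;       /* set up elsewhere */

static void teardown(void)
{
        unsigned long flags;

        spin_lock_irqsave(&somelock, flags);
        /* ... unhook the timer's data from shared structures ... */
        spin_unlock_irqrestore(&somelock, flags);

        /* only now, with no contended locks held, wait for the callback */
        del_timer_sync(&mytimer);
}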
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 14674dce77a6..2ad39e556cb4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,7 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !S390 144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
145 select KALLSYMS 145 select KALLSYMS
146 select GENERIC_TRACER 146 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
@@ -275,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES
275 This tracer profiles all the the likely and unlikely macros 275 This tracer profiles all the the likely and unlikely macros
276 in the kernel. It will display the results in: 276 in the kernel. It will display the results in:
277 277
278 /sys/kernel/debug/tracing/profile_annotated_branch 278 /sys/kernel/debug/tracing/trace_stat/branch_annotated
279 279
280 Note: this will add a significant overhead; only turn this 280 Note: this will add a significant overhead; only turn this
281 on if you need to profile the system's use of these macros. 281 on if you need to profile the system's use of these macros.
@@ -288,7 +288,7 @@ config PROFILE_ALL_BRANCHES
288 taken in the kernel is recorded whether it hit or miss. 288 taken in the kernel is recorded whether it hit or miss.
289 The results will be displayed in: 289 The results will be displayed in:
290 290
291 /sys/kernel/debug/tracing/profile_branch 291 /sys/kernel/debug/tracing/trace_stat/branch_all
292 292
293 This option also enables the likely/unlikely profiler. 293 This option also enables the likely/unlikely profiler.
294 294
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d95721f33702..6957aa298dfa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
703 * 703 *
704 **/ 704 **/
705static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 705static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
706 u32 what) 706 u32 what)
707{ 707{
708 struct blk_trace *bt = q->blk_trace; 708 struct blk_trace *bt = q->blk_trace;
709 int rw = rq->cmd_flags & 0x03;
710 709
711 if (likely(!bt)) 710 if (likely(!bt))
712 return; 711 return;
713 712
714 if (rq->cmd_flags & REQ_DISCARD)
715 rw |= REQ_DISCARD;
716
717 if (rq->cmd_flags & REQ_SECURE)
718 rw |= REQ_SECURE;
719
720 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
721 what |= BLK_TC_ACT(BLK_TC_PC); 714 what |= BLK_TC_ACT(BLK_TC_PC);
722 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
723 what, rq->errors, rq->cmd_len, rq->cmd); 716 what, rq->errors, rq->cmd_len, rq->cmd);
724 } else { 717 } else {
725 what |= BLK_TC_ACT(BLK_TC_FS); 718 what |= BLK_TC_ACT(BLK_TC_FS);
726 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, 719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
727 what, rq->errors, 0, NULL); 720 rq->cmd_flags, what, rq->errors, 0, NULL);
728 } 721 }
729} 722}
730 723
@@ -857,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
857 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
858} 851}
859 852
860static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) 853static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
854 unsigned int depth, bool explicit)
861{ 855{
862 struct blk_trace *bt = q->blk_trace; 856 struct blk_trace *bt = q->blk_trace;
863 857
864 if (bt) { 858 if (bt) {
865 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; 859 __be64 rpdu = cpu_to_be64(depth);
866 __be64 rpdu = cpu_to_be64(pdu); 860 u32 what;
867 861
868 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, 862 if (explicit)
869 sizeof(rpdu), &rpdu); 863 what = BLK_TA_UNPLUG_IO;
870 } 864 else
871} 865 what = BLK_TA_UNPLUG_TIMER;
872
873static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
874{
875 struct blk_trace *bt = q->blk_trace;
876
877 if (bt) {
878 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
879 __be64 rpdu = cpu_to_be64(pdu);
880 866
881 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, 867 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
882 sizeof(rpdu), &rpdu);
883 } 868 }
884} 869}
885 870
@@ -1022,9 +1007,7 @@ static void blk_register_tracepoints(void)
1022 WARN_ON(ret); 1007 WARN_ON(ret);
1023 ret = register_trace_block_plug(blk_add_trace_plug, NULL); 1008 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
1024 WARN_ON(ret); 1009 WARN_ON(ret);
1025 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1010 ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
1026 WARN_ON(ret);
1027 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1028 WARN_ON(ret); 1011 WARN_ON(ret);
1029 ret = register_trace_block_split(blk_add_trace_split, NULL); 1012 ret = register_trace_block_split(blk_add_trace_split, NULL);
1030 WARN_ON(ret); 1013 WARN_ON(ret);
@@ -1039,8 +1022,7 @@ static void blk_unregister_tracepoints(void)
1039 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1022 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1040 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); 1023 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1041 unregister_trace_block_split(blk_add_trace_split, NULL); 1024 unregister_trace_block_split(blk_add_trace_split, NULL);
1042 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1025 unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
1043 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
1044 unregister_trace_block_plug(blk_add_trace_plug, NULL); 1026 unregister_trace_block_plug(blk_add_trace_plug, NULL);
1045 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); 1027 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
1046 unregister_trace_block_getrq(blk_add_trace_getrq, NULL); 1028 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
@@ -1827,21 +1809,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1827 rwbs[i] = '\0'; 1809 rwbs[i] = '\0';
1828} 1810}
1829 1811
1830void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1831{
1832 int rw = rq->cmd_flags & 0x03;
1833 int bytes;
1834
1835 if (rq->cmd_flags & REQ_DISCARD)
1836 rw |= REQ_DISCARD;
1837
1838 if (rq->cmd_flags & REQ_SECURE)
1839 rw |= REQ_SECURE;
1840
1841 bytes = blk_rq_bytes(rq);
1842
1843 blk_fill_rwbs(rwbs, rw, bytes);
1844}
1845
1846#endif /* CONFIG_EVENT_TRACING */ 1812#endif /* CONFIG_EVENT_TRACING */
1847 1813
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae83883..ee24fa1935ac 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1268,7 +1268,7 @@ static int ftrace_update_code(struct module *mod)
1268 p->flags = 0L; 1268 p->flags = 0L;
1269 1269
1270 /* 1270 /*
1271 * Do the initial record convertion from mcount jump 1271 * Do the initial record conversion from mcount jump
1272 * to the NOP instructions. 1272 * to the NOP instructions.
1273 */ 1273 */
1274 if (!ftrace_code_disable(mod, p)) { 1274 if (!ftrace_code_disable(mod, p)) {
@@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1467 return t_hash_next(m, pos); 1467 return t_hash_next(m, pos);
1468 1468
1469 (*pos)++; 1469 (*pos)++;
1470 iter->pos = *pos; 1470 iter->pos = iter->func_pos = *pos;
1471 1471
1472 if (iter->flags & FTRACE_ITER_PRINTALL) 1472 if (iter->flags & FTRACE_ITER_PRINTALL)
1473 return t_hash_start(m, pos); 1473 return t_hash_start(m, pos);
@@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1502 if (!rec) 1502 if (!rec)
1503 return t_hash_start(m, pos); 1503 return t_hash_start(m, pos);
1504 1504
1505 iter->func_pos = *pos;
1506 iter->func = rec; 1505 iter->func = rec;
1507 1506
1508 return iter; 1507 return iter;
@@ -3328,7 +3327,7 @@ static int start_graph_tracing(void)
3328 /* The cpu_boot init_task->ret_stack will never be freed */ 3327 /* The cpu_boot init_task->ret_stack will never be freed */
3329 for_each_online_cpu(cpu) { 3328 for_each_online_cpu(cpu) {
3330 if (!idle_task(cpu)->ret_stack) 3329 if (!idle_task(cpu)->ret_stack)
3331 ftrace_graph_init_task(idle_task(cpu)); 3330 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
3332 } 3331 }
3333 3332
3334 do { 3333 do {
@@ -3418,6 +3417,49 @@ void unregister_ftrace_graph(void)
3418 mutex_unlock(&ftrace_lock); 3417 mutex_unlock(&ftrace_lock);
3419} 3418}
3420 3419
3420static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
3421
3422static void
3423graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
3424{
3425 atomic_set(&t->tracing_graph_pause, 0);
3426 atomic_set(&t->trace_overrun, 0);
3427 t->ftrace_timestamp = 0;
3428 /* make curr_ret_stack visible before we add the ret_stack */
3429 smp_wmb();
3430 t->ret_stack = ret_stack;
3431}
3432
3433/*
3434 * Allocate a return stack for the idle task. May be the first
3435 * time through, or it may be done by CPU hotplug online.
3436 */
3437void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
3438{
3439 t->curr_ret_stack = -1;
3440 /*
3441 * The idle task has no parent, it either has its own
3442 * stack or no stack at all.
3443 */
3444 if (t->ret_stack)
3445 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
3446
3447 if (ftrace_graph_active) {
3448 struct ftrace_ret_stack *ret_stack;
3449
3450 ret_stack = per_cpu(idle_ret_stack, cpu);
3451 if (!ret_stack) {
3452 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
3453 * sizeof(struct ftrace_ret_stack),
3454 GFP_KERNEL);
3455 if (!ret_stack)
3456 return;
3457 per_cpu(idle_ret_stack, cpu) = ret_stack;
3458 }
3459 graph_init_task(t, ret_stack);
3460 }
3461}
3462
3421/* Allocate a return stack for newly created task */ 3463/* Allocate a return stack for newly created task */
3422void ftrace_graph_init_task(struct task_struct *t) 3464void ftrace_graph_init_task(struct task_struct *t)
3423{ 3465{
@@ -3433,12 +3475,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3433 GFP_KERNEL); 3475 GFP_KERNEL);
3434 if (!ret_stack) 3476 if (!ret_stack)
3435 return; 3477 return;
3436 atomic_set(&t->tracing_graph_pause, 0); 3478 graph_init_task(t, ret_stack);
3437 atomic_set(&t->trace_overrun, 0);
3438 t->ftrace_timestamp = 0;
3439 /* make curr_ret_stack visable before we add the ret_stack */
3440 smp_wmb();
3441 t->ret_stack = ret_stack;
3442 } 3479 }
3443} 3480}
3444 3481
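The ftrace_graph_init_idle_task() addition keeps one lazily allocated ret_stack per CPU so idle tasks survive repeated CPU hotplug without leaking or reallocating. A stripped-down sketch of that per-CPU lazy-allocation pattern, with made-up names and a plain int buffer standing in for the ret_stack array:

#include <linux/percpu.h>
#include <linux/slab.h>

static DEFINE_PER_CPU(int *, my_buf);   /* one buffer pointer per CPU */

static int *get_cpu_buf(int cpu)
{
        int *buf = per_cpu(my_buf, cpu);

        if (!buf) {
                /* first use on this CPU: allocate once, keep it forever */
                buf = kmalloc(64 * sizeof(int), GFP_KERNEL);
                if (!buf)
                        return NULL;
                per_cpu(my_buf, cpu) = buf;
        }
        return buf;
}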
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a4fbcc..0ef7b4b2a1f7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -669,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list)
669 * the reader page). But if the next page is a header page, 668 * the reader page). But if the next page is a header page,
670 * its flags will be non zero. 669 * its flags will be non zero.
671 */ 670 */
672static int inline 671static inline int
673rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, 672rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
674 struct buffer_page *page, struct list_head *list) 673 struct buffer_page *page, struct list_head *list)
675{ 674{
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1429} 1428}
1430EXPORT_SYMBOL_GPL(ring_buffer_resize); 1429EXPORT_SYMBOL_GPL(ring_buffer_resize);
1431 1430
1431void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1432{
1433 mutex_lock(&buffer->mutex);
1434 if (val)
1435 buffer->flags |= RB_FL_OVERWRITE;
1436 else
1437 buffer->flags &= ~RB_FL_OVERWRITE;
1438 mutex_unlock(&buffer->mutex);
1439}
1440EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
1441
1432static inline void * 1442static inline void *
1433__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 1443__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1434{ 1444{
@@ -1468,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1468 return local_read(&bpage->entries) & RB_WRITE_MASK; 1478 return local_read(&bpage->entries) & RB_WRITE_MASK;
1469} 1479}
1470 1480
1471/* Size is determined by what has been commited */ 1481/* Size is determined by what has been committed */
1472static inline unsigned rb_page_size(struct buffer_page *bpage) 1482static inline unsigned rb_page_size(struct buffer_page *bpage)
1473{ 1483{
1474 return rb_page_commit(bpage); 1484 return rb_page_commit(bpage);
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2162 if (likely(ts >= cpu_buffer->write_stamp)) { 2172 if (likely(ts >= cpu_buffer->write_stamp)) {
2163 delta = diff; 2173 delta = diff;
2164 if (unlikely(test_time_stamp(delta))) { 2174 if (unlikely(test_time_stamp(delta))) {
2175 int local_clock_stable = 1;
2176#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2177 local_clock_stable = sched_clock_stable;
2178#endif
2165 WARN_ONCE(delta > (1ULL << 59), 2179 WARN_ONCE(delta > (1ULL << 59),
2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", 2180 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2167 (unsigned long long)delta, 2181 (unsigned long long)delta,
2168 (unsigned long long)ts, 2182 (unsigned long long)ts,
2169 (unsigned long long)cpu_buffer->write_stamp); 2183 (unsigned long long)cpu_buffer->write_stamp,
2184 local_clock_stable ? "" :
2185 "If you just came from a suspend/resume,\n"
2186 "please switch to the trace global clock:\n"
2187 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2170 add_timestamp = 1; 2188 add_timestamp = 1;
2171 } 2189 }
2172 } 2190 }
@@ -2914,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2914 /* 2932 /*
2915 * cpu_buffer->pages just needs to point to the buffer, it 2933 * cpu_buffer->pages just needs to point to the buffer, it
2916 * has no specific buffer page to point to. Lets move it out 2934 * has no specific buffer page to point to. Lets move it out
2917 * of our way so we don't accidently swap it. 2935 * of our way so we don't accidentally swap it.
2918 */ 2936 */
2919 cpu_buffer->pages = reader->list.prev; 2937 cpu_buffer->pages = reader->list.prev;
2920 2938
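
The exported ring_buffer_change_overwrite() added above lets a buffer's overwrite policy be changed after allocation; it only sets or clears RB_FL_OVERWRITE under buffer->mutex, so it cannot race with a resize. A minimal sketch of a caller, hypothetical module code rather than anything in this patch:

#include <linux/errno.h>
#include <linux/ring_buffer.h>

static struct ring_buffer *example_buf;

static int example_setup(void)
{
	/* Start in "overwrite oldest events" mode. */
	example_buf = ring_buffer_alloc(64 * 1024, RB_FL_OVERWRITE);
	if (!example_buf)
		return -ENOMEM;
	return 0;
}

static void example_stop_when_full(void)
{
	/* Flip to "drop new events when full"; the helper takes buffer->mutex itself. */
	ring_buffer_change_overwrite(example_buf, 0);
}

trace.c (next file) is the first user: it forwards the new "overwrite" trace option to this helper.
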
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc53ecb80589..1cb49be7c7fb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
41#include "trace.h" 41#include "trace.h"
42#include "trace_output.h" 42#include "trace_output.h"
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45
46/* 44/*
47 * On boot up, the ring buffer is set to the minimum size, so that 45 * On boot up, the ring buffer is set to the minimum size, so that
48 * we do not waste memory on systems that are not using tracing. 46 * we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
340/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
341unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
342 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
343 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
344 342
345static int trace_stop_count; 343static int trace_stop_count;
346static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
425 "sleep-time", 423 "sleep-time",
426 "graph-time", 424 "graph-time",
427 "record-cmd", 425 "record-cmd",
426 "overwrite",
428 NULL 427 NULL
429}; 428};
430 429
@@ -780,6 +779,11 @@ __acquires(kernel_lock)
780 tracing_reset_online_cpus(tr); 779 tracing_reset_online_cpus(tr);
781 780
782 current_trace = type; 781 current_trace = type;
782
783 /* If we expanded the buffers, make sure the max is expanded too */
784 if (ring_buffer_expanded && type->use_max_tr)
785 ring_buffer_resize(max_tr.buffer, trace_buf_size);
786
783 /* the test is responsible for initializing and enabling */ 787 /* the test is responsible for initializing and enabling */
784 pr_info("Testing tracer %s: ", type->name); 788 pr_info("Testing tracer %s: ", type->name);
785 ret = type->selftest(type, tr); 789 ret = type->selftest(type, tr);
@@ -792,6 +796,10 @@ __acquires(kernel_lock)
792 /* Only reset on passing, to avoid touching corrupted buffers */ 796 /* Only reset on passing, to avoid touching corrupted buffers */
793 tracing_reset_online_cpus(tr); 797 tracing_reset_online_cpus(tr);
794 798
799 /* Shrink the max buffer again */
800 if (ring_buffer_expanded && type->use_max_tr)
801 ring_buffer_resize(max_tr.buffer, 1);
802
795 printk(KERN_CONT "PASSED\n"); 803 printk(KERN_CONT "PASSED\n");
796 } 804 }
797#endif 805#endif
@@ -1102,7 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1102 1110
1103 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1104 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1105 entry->lock_depth = (tsk) ? tsk->lock_depth : 0; 1113 entry->padding = 0;
1106 entry->flags = 1114 entry->flags =
1107#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1115#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1108 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1116 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1749,10 +1757,9 @@ static void print_lat_help_header(struct seq_file *m)
1749 seq_puts(m, "# | / _----=> need-resched \n"); 1757 seq_puts(m, "# | / _----=> need-resched \n");
1750 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1758 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1751 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1759 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1752 seq_puts(m, "# |||| /_--=> lock-depth \n"); 1760 seq_puts(m, "# |||| / delay \n");
1753 seq_puts(m, "# |||||/ delay \n"); 1761 seq_puts(m, "# cmd pid ||||| time | caller \n");
1754 seq_puts(m, "# cmd pid |||||| time | caller \n"); 1762 seq_puts(m, "# \\ / ||||| \\ | / \n");
1755 seq_puts(m, "# \\ / |||||| \\ | / \n");
1756} 1763}
1757 1764
1758static void print_func_help_header(struct seq_file *m) 1765static void print_func_help_header(struct seq_file *m)
@@ -2529,6 +2536,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2529 2536
2530 if (mask == TRACE_ITER_RECORD_CMD) 2537 if (mask == TRACE_ITER_RECORD_CMD)
2531 trace_event_enable_cmd_record(enabled); 2538 trace_event_enable_cmd_record(enabled);
2539
2540 if (mask == TRACE_ITER_OVERWRITE)
2541 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2532} 2542}
2533 2543
2534static ssize_t 2544static ssize_t
@@ -2710,6 +2720,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2710 2720
2711 mutex_lock(&trace_types_lock); 2721 mutex_lock(&trace_types_lock);
2712 if (tracer_enabled ^ val) { 2722 if (tracer_enabled ^ val) {
2723
2724 /* Only need to warn if this is used to change the state */
2725 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2726
2713 if (val) { 2727 if (val) {
2714 tracer_enabled = 1; 2728 tracer_enabled = 1;
2715 if (current_trace->start) 2729 if (current_trace->start)
@@ -3226,7 +3240,7 @@ waitagain:
3226 trace_seq_init(&iter->seq); 3240 trace_seq_init(&iter->seq);
3227 3241
3228 /* 3242 /*
3229 * If there was nothing to send to user, inspite of consuming trace 3243 * If there was nothing to send to user, in spite of consuming trace
3230 * entries, go back to wait for more entries. 3244 * entries, go back to wait for more entries.
3231 */ 3245 */
3232 if (sret == -EBUSY) 3246 if (sret == -EBUSY)
@@ -4551,9 +4565,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4551__init static int tracer_alloc_buffers(void) 4565__init static int tracer_alloc_buffers(void)
4552{ 4566{
4553 int ring_buf_size; 4567 int ring_buf_size;
4568 enum ring_buffer_flags rb_flags;
4554 int i; 4569 int i;
4555 int ret = -ENOMEM; 4570 int ret = -ENOMEM;
4556 4571
4572
4557 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 4573 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
4558 goto out; 4574 goto out;
4559 4575
@@ -4566,12 +4582,13 @@ __init static int tracer_alloc_buffers(void)
4566 else 4582 else
4567 ring_buf_size = 1; 4583 ring_buf_size = 1;
4568 4584
4585 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
4586
4569 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4587 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4570 cpumask_copy(tracing_cpumask, cpu_all_mask); 4588 cpumask_copy(tracing_cpumask, cpu_all_mask);
4571 4589
4572 /* TODO: make the number of buffers hot pluggable with CPUS */ 4590 /* TODO: make the number of buffers hot pluggable with CPUS */
4573 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4591 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
4574 TRACE_BUFFER_FLAGS);
4575 if (!global_trace.buffer) { 4592 if (!global_trace.buffer) {
4576 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4593 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
4577 WARN_ON(1); 4594 WARN_ON(1);
@@ -4581,7 +4598,7 @@ __init static int tracer_alloc_buffers(void)
4581 4598
4582 4599
4583#ifdef CONFIG_TRACER_MAX_TRACE 4600#ifdef CONFIG_TRACER_MAX_TRACE
4584 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); 4601 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
4585 if (!max_tr.buffer) { 4602 if (!max_tr.buffer) {
4586 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4603 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4587 WARN_ON(1); 4604 WARN_ON(1);
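
With TRACE_BUFFER_FLAGS gone, trace.c derives the ring-buffer flags from the new TRACE_ITER_OVERWRITE option bit (on by default in trace_flags above), and set_tracer_flags() forwards later toggles to ring_buffer_change_overwrite(). The mapping done in tracer_alloc_buffers() boils down to the following; RB_FL_OVERWRITE's value is assumed here for illustration, its real definition lives in include/linux/ring_buffer.h:

#define TRACE_ITER_OVERWRITE	0x200000	/* value added to trace.h below */
#define RB_FL_OVERWRITE		(1 << 0)	/* assumed, for illustration only */

static unsigned rb_flags_from_trace_flags(unsigned long trace_flags)
{
	return (trace_flags & TRACE_ITER_OVERWRITE) ? RB_FL_OVERWRITE : 0;
}

Because "overwrite" is also a trace_options[] entry, flipping it from user space (for example, echo 0 > /sys/kernel/debug/tracing/options/overwrite with the usual debugfs mount) ends up in the same helper.
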
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c0c0c3..5e9dfc6286dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
272 /* If you handled the flag setting, return 0 */ 272 /* If you handled the flag setting, return 0 */
273 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
274 struct tracer *next; 274 struct tracer *next;
275 int print_max;
276 struct tracer_flags *flags; 275 struct tracer_flags *flags;
276 int print_max;
277 int use_max_tr; 277 int use_max_tr;
278}; 278};
279 279
@@ -606,6 +606,7 @@ enum trace_iterator_flags {
606 TRACE_ITER_SLEEP_TIME = 0x40000, 606 TRACE_ITER_SLEEP_TIME = 0x40000,
607 TRACE_ITER_GRAPH_TIME = 0x80000, 607 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000, 608 TRACE_ITER_RECORD_CMD = 0x100000,
609 TRACE_ITER_OVERWRITE = 0x200000,
609}; 610};
610 611
611/* 612/*
@@ -661,8 +662,10 @@ struct ftrace_event_field {
661}; 662};
662 663
663struct event_filter { 664struct event_filter {
664 int n_preds; 665 int n_preds; /* Number assigned */
665 struct filter_pred **preds; 666 int a_preds; /* allocated */
667 struct filter_pred *preds;
668 struct filter_pred *root;
666 char *filter_string; 669 char *filter_string;
667}; 670};
668 671
@@ -674,11 +677,23 @@ struct event_subsystem {
674 int nr_events; 677 int nr_events;
675}; 678};
676 679
680#define FILTER_PRED_INVALID ((unsigned short)-1)
681#define FILTER_PRED_IS_RIGHT (1 << 15)
682#define FILTER_PRED_FOLD (1 << 15)
683
684/*
685 * The max preds is the size of unsigned short with
686 * two flags at the MSBs. One bit is used for both the IS_RIGHT
687 * and FOLD flags. The other is reserved.
688 *
689 * 2^14 preds is way more than enough.
690 */
691#define MAX_FILTER_PRED 16384
692
677struct filter_pred; 693struct filter_pred;
678struct regex; 694struct regex;
679 695
680typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 696typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
681 int val1, int val2);
682 697
683typedef int (*regex_match_func)(char *str, struct regex *r, int len); 698typedef int (*regex_match_func)(char *str, struct regex *r, int len);
684 699
@@ -700,11 +715,23 @@ struct filter_pred {
700 filter_pred_fn_t fn; 715 filter_pred_fn_t fn;
701 u64 val; 716 u64 val;
702 struct regex regex; 717 struct regex regex;
703 char *field_name; 718 /*
719 * Leaf nodes use field_name, ops is used by AND and OR
720 * nodes. The field_name is always freed when freeing a pred.
721 * We can overload field_name for ops and have it freed
722 * as well.
723 */
724 union {
725 char *field_name;
726 unsigned short *ops;
727 };
704 int offset; 728 int offset;
705 int not; 729 int not;
706 int op; 730 int op;
707 int pop_n; 731 unsigned short index;
732 unsigned short parent;
733 unsigned short left;
734 unsigned short right;
708}; 735};
709 736
710extern struct list_head ftrace_common_fields; 737extern struct list_head ftrace_common_fields;
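
The reworked struct filter_pred above drops child pointers: every predicate lives in the flat filter->preds array, and left, right and parent are unsigned short indices into it. Bit 15 does double duty (FILTER_PRED_IS_RIGHT when stored in parent, FILTER_PRED_FOLD when stored in index), FILTER_PRED_INVALID in left marks a leaf, and that is why MAX_FILTER_PRED stops at 2^14. A reduced stand-alone model of the layout, a field subset only and not the kernel definition:

#define FILTER_PRED_INVALID	((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT	(1 << 15)

struct pred_node {			/* stand-in for struct filter_pred */
	unsigned short index;		/* my slot in the preds[] array */
	unsigned short parent;		/* parent slot, IS_RIGHT set if I am its right child */
	unsigned short left;		/* FILTER_PRED_INVALID for a leaf */
	unsigned short right;
};

/* Mirror of get_pred_parent(): strip the flag and report which side we came from. */
static unsigned short pred_parent(const struct pred_node *p, int *came_from_right)
{
	*came_from_right = !!(p->parent & FILTER_PRED_IS_RIGHT);
	return p->parent & ~FILTER_PRED_IS_RIGHT;
}
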
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 685a67d55db0..6302747a1398 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void)
46} 46}
47 47
48/* 48/*
49 * trace_clock(): 'inbetween' trace clock. Not completely serialized, 49 * trace_clock(): 'between' trace clock. Not completely serialized,
50 * but not completely incorrect when crossing CPUs either. 50 * but not completely incorrect when crossing CPUs either.
51 * 51 *
52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of 52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 6cf223764be8..e32744c84d94 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -27,7 +27,7 @@
27 * in the structure. 27 * in the structure.
28 * 28 *
29 * * for structures within structures, the format of the internal 29 * * for structures within structures, the format of the internal
30 * structure is layed out. This allows the internal structure 30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros 31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they 32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the 33 * will create a compile error if it happens. Since the
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
109 */ 109 */
110#define FTRACE_CTX_FIELDS \ 110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \ 111 __field( unsigned int, prev_pid ) \
112 __field( unsigned int, next_pid ) \
113 __field( unsigned int, next_cpu ) \
112 __field( unsigned char, prev_prio ) \ 114 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \ 115 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \ 116 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \ 117 __field( unsigned char, next_state )
117 __field( unsigned int, next_cpu )
118 118
119FTRACE_ENTRY(context_switch, ctx_switch_entry, 119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120 120
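
The FTRACE_CTX_FIELDS shuffle above is purely a packing fix: grouping the three unsigned int fields ahead of the four unsigned char fields removes the alignment holes that the interleaved order produced. A stand-alone illustration (the real entry is additionally prefixed by the common trace_entry header):

#include <stdio.h>

struct ctx_old {
	unsigned int  prev_pid;
	unsigned char prev_prio, prev_state;
	unsigned int  next_pid;
	unsigned char next_prio, next_state;
	unsigned int  next_cpu;
};

struct ctx_new {
	unsigned int  prev_pid, next_pid, next_cpu;
	unsigned char prev_prio, prev_state, next_prio, next_state;
};

int main(void)
{
	/* Typically prints 20 and 16 with natural alignment. */
	printf("%zu %zu\n", sizeof(struct ctx_old), sizeof(struct ctx_new));
	return 0;
}
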
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5f499e0438a4..2fe110341359 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,7 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, lock_depth); 119 __common_field(int, padding);
120 120
121 return ret; 121 return ret;
122} 122}
@@ -326,6 +326,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
326{ 326{
327 return __ftrace_set_clr_event(NULL, system, event, set); 327 return __ftrace_set_clr_event(NULL, system, event, set);
328} 328}
329EXPORT_SYMBOL_GPL(trace_set_clr_event);
329 330
330/* 128 should be much more than enough */ 331/* 128 should be much more than enough */
331#define EVENT_BUF_SIZE 127 332#define EVENT_BUF_SIZE 127
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17f..8008ddcfbf20 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
123 } operand; 123 } operand;
124}; 124};
125 125
126struct pred_stack {
127 struct filter_pred **preds;
128 int index;
129};
130
126#define DEFINE_COMPARISON_PRED(type) \ 131#define DEFINE_COMPARISON_PRED(type) \
127static int filter_pred_##type(struct filter_pred *pred, void *event, \ 132static int filter_pred_##type(struct filter_pred *pred, void *event) \
128 int val1, int val2) \
129{ \ 133{ \
130 type *addr = (type *)(event + pred->offset); \ 134 type *addr = (type *)(event + pred->offset); \
131 type val = (type)pred->val; \ 135 type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
152} 156}
153 157
154#define DEFINE_EQUALITY_PRED(size) \ 158#define DEFINE_EQUALITY_PRED(size) \
155static int filter_pred_##size(struct filter_pred *pred, void *event, \ 159static int filter_pred_##size(struct filter_pred *pred, void *event) \
156 int val1, int val2) \
157{ \ 160{ \
158 u##size *addr = (u##size *)(event + pred->offset); \ 161 u##size *addr = (u##size *)(event + pred->offset); \
159 u##size val = (u##size)pred->val; \ 162 u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
178DEFINE_EQUALITY_PRED(16); 181DEFINE_EQUALITY_PRED(16);
179DEFINE_EQUALITY_PRED(8); 182DEFINE_EQUALITY_PRED(8);
180 183
181static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
182 void *event __attribute((unused)),
183 int val1, int val2)
184{
185 return val1 && val2;
186}
187
188static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
189 void *event __attribute((unused)),
190 int val1, int val2)
191{
192 return val1 || val2;
193}
194
195/* Filter predicate for fixed sized arrays of characters */ 184/* Filter predicate for fixed sized arrays of characters */
196static int filter_pred_string(struct filter_pred *pred, void *event, 185static int filter_pred_string(struct filter_pred *pred, void *event)
197 int val1, int val2)
198{ 186{
199 char *addr = (char *)(event + pred->offset); 187 char *addr = (char *)(event + pred->offset);
200 int cmp, match; 188 int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
207} 195}
208 196
209/* Filter predicate for char * pointers */ 197/* Filter predicate for char * pointers */
210static int filter_pred_pchar(struct filter_pred *pred, void *event, 198static int filter_pred_pchar(struct filter_pred *pred, void *event)
211 int val1, int val2)
212{ 199{
213 char **addr = (char **)(event + pred->offset); 200 char **addr = (char **)(event + pred->offset);
214 int cmp, match; 201 int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
231 * and add it to the address of the entry, and at last we have 218 * and add it to the address of the entry, and at last we have
232 * the address of the string. 219 * the address of the string.
233 */ 220 */
234static int filter_pred_strloc(struct filter_pred *pred, void *event, 221static int filter_pred_strloc(struct filter_pred *pred, void *event)
235 int val1, int val2)
236{ 222{
237 u32 str_item = *(u32 *)(event + pred->offset); 223 u32 str_item = *(u32 *)(event + pred->offset);
238 int str_loc = str_item & 0xffff; 224 int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
247 return match; 233 return match;
248} 234}
249 235
250static int filter_pred_none(struct filter_pred *pred, void *event, 236static int filter_pred_none(struct filter_pred *pred, void *event)
251 int val1, int val2)
252{ 237{
253 return 0; 238 return 0;
254} 239}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
377 pred->not ^= not; 362 pred->not ^= not;
378} 363}
379 364
365enum move_type {
366 MOVE_DOWN,
367 MOVE_UP_FROM_LEFT,
368 MOVE_UP_FROM_RIGHT
369};
370
371static struct filter_pred *
372get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
373 int index, enum move_type *move)
374{
375 if (pred->parent & FILTER_PRED_IS_RIGHT)
376 *move = MOVE_UP_FROM_RIGHT;
377 else
378 *move = MOVE_UP_FROM_LEFT;
379 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
380
381 return pred;
382}
383
384/*
385 * A series of ANDs or ORs were found together. Instead of
386 * climbing up and down the tree branches, an array of the
387 * ops was made in order of checks. We can just move across
388 * the array and short circuit if needed.
389 */
390static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec)
392{
393 struct filter_pred *pred;
394 int match = 0;
395 int type;
396 int i;
397
398 /*
399 * Micro-optimization: We set type to true if op
400 * is an OR and false otherwise (AND). Then we
401 * just need to test if the match is equal to
402 * the type, and if it is, we can short circuit the
403 * rest of the checks:
404 *
405 * if ((match && op->op == OP_OR) ||
406 * (!match && op->op == OP_AND))
407 * return match;
408 */
409 type = op->op == OP_OR;
410
411 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec);
414 if (!!match == type)
415 return match;
416 }
417 return match;
418}
419
380/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
381int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
382{ 422{
383 int match, top = 0, val1 = 0, val2 = 0; 423 int match = -1;
384 int stack[MAX_FILTER_PRED]; 424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds;
385 struct filter_pred *pred; 426 struct filter_pred *pred;
386 int i; 427 struct filter_pred *root;
428 int n_preds;
429 int done = 0;
430
431 /* no filter is considered a match */
432 if (!filter)
433 return 1;
434
435 n_preds = filter->n_preds;
436
437 if (!n_preds)
438 return 1;
439
440 /*
441 * n_preds, root and filter->preds are protected with preemption disabled.
442 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root);
445 if (!root)
446 return 1;
447
448 pred = root;
387 449
388 for (i = 0; i < filter->n_preds; i++) { 450 /* match is currently meaningless */
389 pred = filter->preds[i]; 451 match = -1;
390 if (!pred->pop_n) { 452
391 match = pred->fn(pred, rec, val1, val2); 453 do {
392 stack[top++] = match; 454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
393 continue; 500 continue;
394 } 501 }
395 if (pred->pop_n > top) { 502 done = 1;
396 WARN_ON_ONCE(1); 503 } while (!done);
397 return 0;
398 }
399 val1 = stack[--top];
400 val2 = stack[--top];
401 match = pred->fn(pred, rec, val1, val2);
402 stack[top++] = match;
403 }
404 504
405 return stack[--top]; 505 return match;
406} 506}
407EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
408 508
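
filter_match_preds() above replaces the old postfix evaluation (the removed stack[] loop on the left) with a non-recursive walk of the index-linked tree: descend the left side first, then on the way back up short-circuit an OR whose left branch was true or an AND whose left branch was false. The same control flow, reduced to a user-space sketch that compiles on its own; leaf values stand in for pred->fn(pred, rec) and folding is left out:

#include <stdio.h>

#define INVALID		((unsigned short)-1)
#define IS_RIGHT	(1 << 15)

enum { OP_AND, OP_OR };
enum move { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };

struct node {
	int op;			/* OP_AND / OP_OR for inner nodes */
	int leaf_val;		/* "predicate result" for leaves */
	unsigned short parent;	/* parent index, IS_RIGHT if we are its right child */
	unsigned short left;	/* INVALID marks a leaf */
	unsigned short right;
};

static int eval(struct node *n, unsigned short root)
{
	enum move dir = MOVE_DOWN;
	unsigned short i = root;
	int match = 0;

	for (;;) {
		struct node *p = &n[i];

		switch (dir) {
		case MOVE_DOWN:
			if (p->left != INVALID) {	/* inner node: left side first */
				i = p->left;
				continue;
			}
			match = p->leaf_val;		/* leaf: run the "predicate" */
			break;
		case MOVE_UP_FROM_LEFT:
			/* Short circuit: OR with a true left, AND with a false left. */
			if (!!match == (p->op == OP_OR))
				break;
			i = p->right;			/* otherwise do the right side */
			dir = MOVE_DOWN;
			continue;
		case MOVE_UP_FROM_RIGHT:
			break;				/* both sides are done */
		}
		if (i == root)
			return match;
		dir = (n[i].parent & IS_RIGHT) ? MOVE_UP_FROM_RIGHT : MOVE_UP_FROM_LEFT;
		i = n[i].parent & ~IS_RIGHT;
	}
}

int main(void)
{
	/* (A || B) && C with A=0, B=1, C=1: prints 1 */
	struct node n[] = {
		[0] = { .op = OP_AND, .left = 1, .right = 4 },
		[1] = { .op = OP_OR,  .left = 2, .right = 3, .parent = 0 },
		[2] = { .leaf_val = 0, .left = INVALID, .parent = 1 },
		[3] = { .leaf_val = 1, .left = INVALID, .parent = 1 | IS_RIGHT },
		[4] = { .leaf_val = 1, .left = INVALID, .parent = 0 | IS_RIGHT },
	};

	printf("%d\n", eval(n, 0));
	return 0;
}
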
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
414 514
415static void remove_filter_string(struct event_filter *filter) 515static void remove_filter_string(struct event_filter *filter)
416{ 516{
517 if (!filter)
518 return;
519
417 kfree(filter->filter_string); 520 kfree(filter->filter_string);
418 filter->filter_string = NULL; 521 filter->filter_string = NULL;
419} 522}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
473 576
474void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 577void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
475{ 578{
476 struct event_filter *filter = call->filter; 579 struct event_filter *filter;
477 580
478 mutex_lock(&event_mutex); 581 mutex_lock(&event_mutex);
582 filter = call->filter;
479 if (filter && filter->filter_string) 583 if (filter && filter->filter_string)
480 trace_seq_printf(s, "%s\n", filter->filter_string); 584 trace_seq_printf(s, "%s\n", filter->filter_string);
481 else 585 else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
486void print_subsystem_event_filter(struct event_subsystem *system, 590void print_subsystem_event_filter(struct event_subsystem *system,
487 struct trace_seq *s) 591 struct trace_seq *s)
488{ 592{
489 struct event_filter *filter = system->filter; 593 struct event_filter *filter;
490 594
491 mutex_lock(&event_mutex); 595 mutex_lock(&event_mutex);
596 filter = system->filter;
492 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
493 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
494 else 599 else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
539 pred->regex.len = 0; 644 pred->regex.len = 0;
540} 645}
541 646
542static int filter_set_pred(struct filter_pred *dest, 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
650 if (!stack->preds)
651 return -ENOMEM;
652 stack->index = n_preds;
653 return 0;
654}
655
656static void __free_pred_stack(struct pred_stack *stack)
657{
658 kfree(stack->preds);
659 stack->index = 0;
660}
661
662static int __push_pred_stack(struct pred_stack *stack,
663 struct filter_pred *pred)
664{
665 int index = stack->index;
666
667 if (WARN_ON(index == 0))
668 return -ENOSPC;
669
670 stack->preds[--index] = pred;
671 stack->index = index;
672 return 0;
673}
674
675static struct filter_pred *
676__pop_pred_stack(struct pred_stack *stack)
677{
678 struct filter_pred *pred;
679 int index = stack->index;
680
681 pred = stack->preds[index++];
682 if (!pred)
683 return NULL;
684
685 stack->index = index;
686 return pred;
687}
688
689static int filter_set_pred(struct event_filter *filter,
690 int idx,
691 struct pred_stack *stack,
543 struct filter_pred *src, 692 struct filter_pred *src,
544 filter_pred_fn_t fn) 693 filter_pred_fn_t fn)
545{ 694{
695 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left;
697 struct filter_pred *right;
698
546 *dest = *src; 699 *dest = *src;
547 if (src->field_name) { 700 if (src->field_name) {
548 dest->field_name = kstrdup(src->field_name, GFP_KERNEL); 701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
550 return -ENOMEM; 703 return -ENOMEM;
551 } 704 }
552 dest->fn = fn; 705 dest->fn = fn;
706 dest->index = idx;
553 707
554 return 0; 708 if (dest->op == OP_OR || dest->op == OP_AND) {
709 right = __pop_pred_stack(stack);
710 left = __pop_pred_stack(stack);
711 if (!left || !right)
712 return -EINVAL;
713 /*
714 * If both children can be folded
715 * and they are the same op as this op or a leaf,
716 * then this op can be folded.
717 */
718 if (left->index & FILTER_PRED_FOLD &&
719 (left->op == dest->op ||
720 left->left == FILTER_PRED_INVALID) &&
721 right->index & FILTER_PRED_FOLD &&
722 (right->op == dest->op ||
723 right->left == FILTER_PRED_INVALID))
724 dest->index |= FILTER_PRED_FOLD;
725
726 dest->left = left->index & ~FILTER_PRED_FOLD;
727 dest->right = right->index & ~FILTER_PRED_FOLD;
728 left->parent = dest->index & ~FILTER_PRED_FOLD;
729 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
730 } else {
731 /*
732 * Make dest->left invalid to be used as a quick
733 * way to know this is a leaf node.
734 */
735 dest->left = FILTER_PRED_INVALID;
736
737 /* All leafs allow folding the parent ops. */
738 dest->index |= FILTER_PRED_FOLD;
739 }
740
741 return __push_pred_stack(stack, dest);
555} 742}
556 743
557static void filter_disable_preds(struct ftrace_event_call *call) 744static void __free_preds(struct event_filter *filter)
558{ 745{
559 struct event_filter *filter = call->filter;
560 int i; 746 int i;
561 747
562 call->flags &= ~TRACE_EVENT_FL_FILTERED; 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds);
752 filter->preds = NULL;
753 }
754 filter->a_preds = 0;
563 filter->n_preds = 0; 755 filter->n_preds = 0;
564
565 for (i = 0; i < MAX_FILTER_PRED; i++)
566 filter->preds[i]->fn = filter_pred_none;
567} 756}
568 757
569static void __free_preds(struct event_filter *filter) 758static void filter_disable(struct ftrace_event_call *call)
570{ 759{
571 int i; 760 call->flags &= ~TRACE_EVENT_FL_FILTERED;
761}
572 762
763static void __free_filter(struct event_filter *filter)
764{
573 if (!filter) 765 if (!filter)
574 return; 766 return;
575 767
576 for (i = 0; i < MAX_FILTER_PRED; i++) { 768 __free_preds(filter);
577 if (filter->preds[i])
578 filter_free_pred(filter->preds[i]);
579 }
580 kfree(filter->preds);
581 kfree(filter->filter_string); 769 kfree(filter->filter_string);
582 kfree(filter); 770 kfree(filter);
583} 771}
584 772
773/*
774 * Called when destroying the ftrace_event_call.
775 * The call is being freed, so we do not need to worry about
776 * the call being currently used. This is for module code removing
777 * the tracepoints from within it.
778 */
585void destroy_preds(struct ftrace_event_call *call) 779void destroy_preds(struct ftrace_event_call *call)
586{ 780{
587 __free_preds(call->filter); 781 __free_filter(call->filter);
588 call->filter = NULL; 782 call->filter = NULL;
589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
590} 783}
591 784
592static struct event_filter *__alloc_preds(void) 785static struct event_filter *__alloc_filter(void)
593{ 786{
594 struct event_filter *filter; 787 struct event_filter *filter;
788
789 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
790 return filter;
791}
792
793static int __alloc_preds(struct event_filter *filter, int n_preds)
794{
595 struct filter_pred *pred; 795 struct filter_pred *pred;
596 int i; 796 int i;
597 797
598 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 798 if (filter->preds)
599 if (!filter) 799 __free_preds(filter);
600 return ERR_PTR(-ENOMEM);
601 800
602 filter->n_preds = 0; 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
603 803
604 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
605 if (!filter->preds) 804 if (!filter->preds)
606 goto oom; 805 return -ENOMEM;
607 806
608 for (i = 0; i < MAX_FILTER_PRED; i++) { 807 filter->a_preds = n_preds;
609 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 808 filter->n_preds = 0;
610 if (!pred) 809
611 goto oom; 810 for (i = 0; i < n_preds; i++) {
811 pred = &filter->preds[i];
612 pred->fn = filter_pred_none; 812 pred->fn = filter_pred_none;
613 filter->preds[i] = pred;
614 } 813 }
615 814
616 return filter;
617
618oom:
619 __free_preds(filter);
620 return ERR_PTR(-ENOMEM);
621}
622
623static int init_preds(struct ftrace_event_call *call)
624{
625 if (call->filter)
626 return 0;
627
628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
629 call->filter = __alloc_preds();
630 if (IS_ERR(call->filter))
631 return PTR_ERR(call->filter);
632
633 return 0; 815 return 0;
634} 816}
635 817
636static int init_subsystem_preds(struct event_subsystem *system) 818static void filter_free_subsystem_preds(struct event_subsystem *system)
637{ 819{
638 struct ftrace_event_call *call; 820 struct ftrace_event_call *call;
639 int err;
640 821
641 list_for_each_entry(call, &ftrace_events, list) { 822 list_for_each_entry(call, &ftrace_events, list) {
642 if (strcmp(call->class->system, system->name) != 0) 823 if (strcmp(call->class->system, system->name) != 0)
643 continue; 824 continue;
644 825
645 err = init_preds(call); 826 filter_disable(call);
646 if (err) 827 remove_filter_string(call->filter);
647 return err;
648 } 828 }
649
650 return 0;
651} 829}
652 830
653static void filter_free_subsystem_preds(struct event_subsystem *system) 831static void filter_free_subsystem_filters(struct event_subsystem *system)
654{ 832{
655 struct ftrace_event_call *call; 833 struct ftrace_event_call *call;
656 834
657 list_for_each_entry(call, &ftrace_events, list) { 835 list_for_each_entry(call, &ftrace_events, list) {
658 if (strcmp(call->class->system, system->name) != 0) 836 if (strcmp(call->class->system, system->name) != 0)
659 continue; 837 continue;
660 838 __free_filter(call->filter);
661 filter_disable_preds(call); 839 call->filter = NULL;
662 remove_filter_string(call->filter);
663 } 840 }
664} 841}
665 842
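
filter_set_pred() above is also where the flat array gets its tree shape: leaves are pushed on the pred_stack, and each AND/OR pops its right and left children, wires up left/right/parent (and marks itself foldable when both children allow it), then pushes itself back. A worked example for (a || b) && c, whose postfix order is a b || c &&; the operand names are invented:

	push a                            stack: a
	push b                            stack: a, b
	|| : pop b, pop a, link, push     stack: OR(a, b)
	push c                            stack: OR(a, b), c
	&& : pop c, pop OR, link, push    stack: AND(OR(a, b), c)

replace_preds() later pops the single surviving entry as filter->root, and treats an empty stack, or one with more than a single entry left, as a malformed expression.
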
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
667 struct ftrace_event_call *call, 844 struct ftrace_event_call *call,
668 struct event_filter *filter, 845 struct event_filter *filter,
669 struct filter_pred *pred, 846 struct filter_pred *pred,
847 struct pred_stack *stack,
670 filter_pred_fn_t fn) 848 filter_pred_fn_t fn)
671{ 849{
672 int idx, err; 850 int idx, err;
673 851
674 if (filter->n_preds == MAX_FILTER_PRED) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
675 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
676 return -ENOSPC; 854 return -ENOSPC;
677 } 855 }
678 856
679 idx = filter->n_preds; 857 idx = filter->n_preds;
680 filter_clear_pred(filter->preds[idx]); 858 filter_clear_pred(&filter->preds[idx]);
681 err = filter_set_pred(filter->preds[idx], pred, fn); 859 err = filter_set_pred(filter, idx, stack, pred, fn);
682 if (err) 860 if (err)
683 return err; 861 return err;
684 862
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
763 struct ftrace_event_call *call, 941 struct ftrace_event_call *call,
764 struct event_filter *filter, 942 struct event_filter *filter,
765 struct filter_pred *pred, 943 struct filter_pred *pred,
944 struct pred_stack *stack,
766 bool dry_run) 945 bool dry_run)
767{ 946{
768 struct ftrace_event_field *field; 947 struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
770 unsigned long long val; 949 unsigned long long val;
771 int ret; 950 int ret;
772 951
773 pred->fn = filter_pred_none; 952 fn = pred->fn = filter_pred_none;
774 953
775 if (pred->op == OP_AND) { 954 if (pred->op == OP_AND)
776 pred->pop_n = 2;
777 fn = filter_pred_and;
778 goto add_pred_fn; 955 goto add_pred_fn;
779 } else if (pred->op == OP_OR) { 956 else if (pred->op == OP_OR)
780 pred->pop_n = 2;
781 fn = filter_pred_or;
782 goto add_pred_fn; 957 goto add_pred_fn;
783 }
784 958
785 field = find_event_field(call, pred->field_name); 959 field = find_event_field(call, pred->field_name);
786 if (!field) { 960 if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
829 1003
830add_pred_fn: 1004add_pred_fn:
831 if (!dry_run) 1005 if (!dry_run)
832 return filter_add_pred_fn(ps, call, filter, pred, fn); 1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
833 return 0; 1007 return 0;
834} 1008}
835 1009
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
1187 return 0; 1361 return 0;
1188} 1362}
1189 1363
1364static int count_preds(struct filter_parse_state *ps)
1365{
1366 struct postfix_elt *elt;
1367 int n_preds = 0;
1368
1369 list_for_each_entry(elt, &ps->postfix, list) {
1370 if (elt->op == OP_NONE)
1371 continue;
1372 n_preds++;
1373 }
1374
1375 return n_preds;
1376}
1377
1378/*
1379 * The tree is walked when filtering an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does
1381 * indeed terminate.
1382 */
1383static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root)
1385{
1386 struct filter_pred *preds;
1387 struct filter_pred *pred;
1388 enum move_type move = MOVE_DOWN;
1389 int count = 0;
1390 int done = 0;
1391 int max;
1392
1393 /*
1394 * The maximum number of times we can hit a node is three.
1395 * Once going down, once coming up from left, and
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400
1401 preds = filter->preds;
1402 if (!preds)
1403 return -EINVAL;
1404 pred = root;
1405
1406 do {
1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1409
1410 switch (move) {
1411 case MOVE_DOWN:
1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435
1436 /* We are fine. */
1437 return 0;
1438}
1439
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{
1442 struct filter_pred *pred;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446
1447 pred = root;
1448
1449 do {
1450 switch (move) {
1451 case MOVE_DOWN:
1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476
1477 return count;
1478}
1479
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{
1482 struct filter_pred *pred;
1483 enum move_type move = MOVE_DOWN;
1484 int count = 0;
1485 int children;
1486 int done = 0;
1487
1488 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD;
1490
1491 /* If the root is a leaf then do nothing */
1492 if (root->left == FILTER_PRED_INVALID)
1493 return 0;
1494
1495 /* count the children */
1496 children = count_leafs(preds, &preds[root->left]);
1497 children += count_leafs(preds, &preds[root->right]);
1498
1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1500 if (!root->ops)
1501 return -ENOMEM;
1502
1503 root->val = children;
1504
1505 pred = root;
1506 do {
1507 switch (move) {
1508 case MOVE_DOWN:
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533
1534 return 0;
1535}
1536
1537/*
1538 * To optimize the processing of the ops, if we have several "ors" or
1539 * "ands" together, we can put them in an array and process them all
1540 * together, speeding up the filter logic.
1541 */
1542static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root)
1544{
1545 struct filter_pred *preds;
1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590}
1591
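
fold_pred() and fold_pred_tree() above implement the shortcut announced in the process_ops() comment earlier: a subtree whose inner nodes all share one operator is flattened into an ops[] array of leaf indices, so matching scans a short array with a single short-circuit test instead of bouncing through parent links. A self-contained sketch of that folded evaluation, with leaf[] standing in for the predicate results:

#include <stdio.h>

enum { OP_AND, OP_OR };

static int eval_folded(int op, const int *leaf, const unsigned short *ops, int nr)
{
	int type = (op == OP_OR);	/* OR stops on the first true, AND on the first false */
	int match = 0;
	int i;

	for (i = 0; i < nr; i++) {
		match = leaf[ops[i]];
		if (!!match == type)
			return match;
	}
	return match;
}

int main(void)
{
	int leaf[] = { 0, 1, 0 };
	unsigned short ops[] = { 0, 1, 2 };

	/* a || b || c with a=0, b=1, c=0: stops after b and prints 1 */
	printf("%d\n", eval_folded(OP_OR, leaf, ops, 3));
	return 0;
}
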
1190static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
1191 struct event_filter *filter, 1593 struct event_filter *filter,
1192 struct filter_parse_state *ps, 1594 struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
1195{ 1597{
1196 char *operand1 = NULL, *operand2 = NULL; 1598 char *operand1 = NULL, *operand2 = NULL;
1197 struct filter_pred *pred; 1599 struct filter_pred *pred;
1600 struct filter_pred *root;
1198 struct postfix_elt *elt; 1601 struct postfix_elt *elt;
1602 struct pred_stack stack = { }; /* init to NULL */
1199 int err; 1603 int err;
1200 int n_preds = 0; 1604 int n_preds = 0;
1201 1605
1606 n_preds = count_preds(ps);
1607 if (n_preds >= MAX_FILTER_PRED) {
1608 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1609 return -ENOSPC;
1610 }
1611
1202 err = check_preds(ps); 1612 err = check_preds(ps);
1203 if (err) 1613 if (err)
1204 return err; 1614 return err;
1205 1615
1616 if (!dry_run) {
1617 err = __alloc_pred_stack(&stack, n_preds);
1618 if (err)
1619 return err;
1620 err = __alloc_preds(filter, n_preds);
1621 if (err)
1622 goto fail;
1623 }
1624
1625 n_preds = 0;
1206 list_for_each_entry(elt, &ps->postfix, list) { 1626 list_for_each_entry(elt, &ps->postfix, list) {
1207 if (elt->op == OP_NONE) { 1627 if (elt->op == OP_NONE) {
1208 if (!operand1) 1628 if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
1211 operand2 = elt->operand; 1631 operand2 = elt->operand;
1212 else { 1632 else {
1213 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); 1633 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1214 return -EINVAL; 1634 err = -EINVAL;
1635 goto fail;
1215 } 1636 }
1216 continue; 1637 continue;
1217 } 1638 }
1218 1639
1219 if (n_preds++ == MAX_FILTER_PRED) { 1640 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1220 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1641 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1221 return -ENOSPC; 1642 err = -ENOSPC;
1643 goto fail;
1222 } 1644 }
1223 1645
1224 if (elt->op == OP_AND || elt->op == OP_OR) { 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
1228 1650
1229 if (!operand1 || !operand2) { 1651 if (!operand1 || !operand2) {
1230 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1231 return -EINVAL; 1653 err = -EINVAL;
1654 goto fail;
1232 } 1655 }
1233 1656
1234 pred = create_pred(elt->op, operand1, operand2); 1657 pred = create_pred(elt->op, operand1, operand2);
1235add_pred: 1658add_pred:
1236 if (!pred) 1659 if (!pred) {
1237 return -ENOMEM; 1660 err = -ENOMEM;
1238 err = filter_add_pred(ps, call, filter, pred, dry_run); 1661 goto fail;
1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1239 filter_free_pred(pred); 1664 filter_free_pred(pred);
1240 if (err) 1665 if (err)
1241 return err; 1666 goto fail;
1242 1667
1243 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1244 } 1669 }
1245 1670
1246 return 0; 1671 if (!dry_run) {
1672 /* We should have one item left on the stack */
1673 pred = __pop_pred_stack(&stack);
1674 if (!pred)
1675 return -EINVAL;
1676 /* This item is where we start from in matching */
1677 root = pred;
1678 /* Make sure the stack is empty */
1679 pred = __pop_pred_stack(&stack);
1680 if (WARN_ON(pred)) {
1681 err = -EINVAL;
1682 filter->root = NULL;
1683 goto fail;
1684 }
1685 err = check_pred_tree(filter, root);
1686 if (err)
1687 goto fail;
1688
1689 /* Optimize the tree */
1690 err = fold_pred_tree(filter, root);
1691 if (err)
1692 goto fail;
1693
1694 /* We don't set root until we know it works */
1695 barrier();
1696 filter->root = root;
1697 }
1698
1699 err = 0;
1700fail:
1701 __free_pred_stack(&stack);
1702 return err;
1247} 1703}
1248 1704
1705struct filter_list {
1706 struct list_head list;
1707 struct event_filter *filter;
1708};
1709
1249static int replace_system_preds(struct event_subsystem *system, 1710static int replace_system_preds(struct event_subsystem *system,
1250 struct filter_parse_state *ps, 1711 struct filter_parse_state *ps,
1251 char *filter_string) 1712 char *filter_string)
1252{ 1713{
1253 struct ftrace_event_call *call; 1714 struct ftrace_event_call *call;
1715 struct filter_list *filter_item;
1716 struct filter_list *tmp;
1717 LIST_HEAD(filter_list);
1254 bool fail = true; 1718 bool fail = true;
1255 int err; 1719 int err;
1256 1720
1257 list_for_each_entry(call, &ftrace_events, list) { 1721 list_for_each_entry(call, &ftrace_events, list) {
1258 struct event_filter *filter = call->filter;
1259 1722
1260 if (strcmp(call->class->system, system->name) != 0) 1723 if (strcmp(call->class->system, system->name) != 0)
1261 continue; 1724 continue;
1262 1725
1263 /* try to see if the filter can be applied */ 1726 /*
1264 err = replace_preds(call, filter, ps, filter_string, true); 1727 * Try to see if the filter can be applied
1728 * (filter arg is ignored on dry_run)
1729 */
1730 err = replace_preds(call, NULL, ps, filter_string, true);
1265 if (err) 1731 if (err)
1732 goto fail;
1733 }
1734
1735 list_for_each_entry(call, &ftrace_events, list) {
1736 struct event_filter *filter;
1737
1738 if (strcmp(call->class->system, system->name) != 0)
1266 continue; 1739 continue;
1267 1740
1268 /* really apply the filter */ 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1269 filter_disable_preds(call); 1742 if (!filter_item)
1270 err = replace_preds(call, filter, ps, filter_string, false); 1743 goto fail_mem;
1744
1745 list_add_tail(&filter_item->list, &filter_list);
1746
1747 filter_item->filter = __alloc_filter();
1748 if (!filter_item->filter)
1749 goto fail_mem;
1750 filter = filter_item->filter;
1751
1752 /* Can only fail on no memory */
1753 err = replace_filter_string(filter, filter_string);
1271 if (err) 1754 if (err)
1272 filter_disable_preds(call); 1755 goto fail_mem;
1273 else { 1756
1757 err = replace_preds(call, filter, ps, filter_string, false);
1758 if (err) {
1759 filter_disable(call);
1760 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1761 append_filter_err(ps, filter);
1762 } else
1274 call->flags |= TRACE_EVENT_FL_FILTERED; 1763 call->flags |= TRACE_EVENT_FL_FILTERED;
1275 replace_filter_string(filter, filter_string); 1764 /*
1276 } 1765 * Regardless of if this returned an error, we still
1766 * replace the filter for the call.
1767 */
1768 filter = call->filter;
1769 call->filter = filter_item->filter;
1770 filter_item->filter = filter;
1771
1277 fail = false; 1772 fail = false;
1278 } 1773 }
1279 1774
1280 if (fail) { 1775 if (fail)
1281 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 goto fail;
1282 return -EINVAL; 1777
1778 /*
1779 * The calls can still be using the old filters.
1780 * Do a synchronize_sched() to ensure all calls are
1781 * done with them before we free them.
1782 */
1783 synchronize_sched();
1784 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1785 __free_filter(filter_item->filter);
1786 list_del(&filter_item->list);
1787 kfree(filter_item);
1283 } 1788 }
1284 return 0; 1789 return 0;
1790 fail:
1791 /* No call succeeded */
1792 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1793 list_del(&filter_item->list);
1794 kfree(filter_item);
1795 }
1796 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1797 return -EINVAL;
1798 fail_mem:
1799 /* If any call succeeded, we still need to sync */
1800 if (!fail)
1801 synchronize_sched();
1802 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1803 __free_filter(filter_item->filter);
1804 list_del(&filter_item->list);
1805 kfree(filter_item);
1806 }
1807 return -ENOMEM;
1285} 1808}
1286 1809
1287int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1288{ 1811{
1289 int err;
1290 struct filter_parse_state *ps; 1812 struct filter_parse_state *ps;
1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1815 int err = 0;
1291 1816
1292 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
1293 1818
1294 err = init_preds(call);
1295 if (err)
1296 goto out_unlock;
1297
1298 if (!strcmp(strstrip(filter_string), "0")) { 1819 if (!strcmp(strstrip(filter_string), "0")) {
1299 filter_disable_preds(call); 1820 filter_disable(call);
1300 remove_filter_string(call->filter); 1821 filter = call->filter;
1822 if (!filter)
1823 goto out_unlock;
1824 call->filter = NULL;
1825 /* Make sure the filter is not being used */
1826 synchronize_sched();
1827 __free_filter(filter);
1301 goto out_unlock; 1828 goto out_unlock;
1302 } 1829 }
1303 1830
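
replace_system_preds() above (and apply_event_filter() in the following hunk) now share one lifetime rule: build a complete new event_filter off to the side, publish it by swapping the call->filter pointer, and free the old one only after synchronize_sched(), since filter_match_preds() walks the filter with nothing more than preemption disabled. Reduced to its core, as a sketch of the pattern rather than a drop-in kernel function:

static void example_swap_filter(struct ftrace_event_call *call,
				struct event_filter *new_filter)
{
	struct event_filter *old = call->filter;

	call->filter = new_filter;	/* readers may still be walking 'old' */
	synchronize_sched();		/* wait out all preempt-disabled readers */
	__free_filter(old);		/* now nothing can reference it */
}
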
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 if (!ps) 1833 if (!ps)
1307 goto out_unlock; 1834 goto out_unlock;
1308 1835
1309 filter_disable_preds(call); 1836 filter = __alloc_filter();
1310 replace_filter_string(call->filter, filter_string); 1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1311 1843
1312 parse_init(ps, filter_ops, filter_string); 1844 parse_init(ps, filter_ops, filter_string);
1313 err = filter_parse(ps); 1845 err = filter_parse(ps);
1314 if (err) { 1846 if (err) {
1315 append_filter_err(ps, call->filter); 1847 append_filter_err(ps, filter);
1316 goto out; 1848 goto out;
1317 } 1849 }
1318 1850
1319 err = replace_preds(call, call->filter, ps, filter_string, false); 1851 err = replace_preds(call, filter, ps, filter_string, false);
1320 if (err) 1852 if (err) {
1321 append_filter_err(ps, call->filter); 1853 filter_disable(call);
1322 else 1854 append_filter_err(ps, filter);
1855 } else
1323 call->flags |= TRACE_EVENT_FL_FILTERED; 1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1324out: 1857out:
1858 /*
1859 * Always swap the call filter with the new filter
1860 * even if there was an error. If there was an error
1861 * in the filter, we disable the filter and show the error
1862 * string
1863 */
1864 tmp = call->filter;
1865 call->filter = filter;
1866 if (tmp) {
1867 /* Make sure the call is done with the filter */
1868 synchronize_sched();
1869 __free_filter(tmp);
1870 }
1325 filter_opstack_clear(ps); 1871 filter_opstack_clear(ps);
1326 postfix_clear(ps); 1872 postfix_clear(ps);
1327 kfree(ps); 1873 kfree(ps);
@@ -1334,18 +1880,21 @@ out_unlock:
1334int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1335 char *filter_string) 1881 char *filter_string)
1336{ 1882{
1337 int err;
1338 struct filter_parse_state *ps; 1883 struct filter_parse_state *ps;
1884 struct event_filter *filter;
1885 int err = 0;
1339 1886
1340 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1341 1888
1342 err = init_subsystem_preds(system);
1343 if (err)
1344 goto out_unlock;
1345
1346 if (!strcmp(strstrip(filter_string), "0")) { 1889 if (!strcmp(strstrip(filter_string), "0")) {
1347 filter_free_subsystem_preds(system); 1890 filter_free_subsystem_preds(system);
1348 remove_filter_string(system->filter); 1891 remove_filter_string(system->filter);
1892 filter = system->filter;
1893 system->filter = NULL;
1894 /* Ensure all filters are no longer used */
1895 synchronize_sched();
1896 filter_free_subsystem_filters(system);
1897 __free_filter(filter);
1349 goto out_unlock; 1898 goto out_unlock;
1350 } 1899 }
1351 1900
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 if (!ps) 1903 if (!ps)
1355 goto out_unlock; 1904 goto out_unlock;
1356 1905
1357 replace_filter_string(system->filter, filter_string); 1906 filter = __alloc_filter();
1907 if (!filter)
1908 goto out;
1909
1910 replace_filter_string(filter, filter_string);
1911 /*
1912 * No event actually uses the system filter
1913 * we can free it without synchronize_sched().
1914 */
1915 __free_filter(system->filter);
1916 system->filter = filter;
1358 1917
1359 parse_init(ps, filter_ops, filter_string); 1918 parse_init(ps, filter_ops, filter_string);
1360 err = filter_parse(ps); 1919 err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
1384 struct event_filter *filter = event->filter; 1943 struct event_filter *filter = event->filter;
1385 1944
1386 event->filter = NULL; 1945 event->filter = NULL;
1387 __free_preds(filter); 1946 __free_filter(filter);
1388} 1947}
1389 1948
1390int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1949int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1410 if (event->filter) 1969 if (event->filter)
1411 goto out_unlock; 1970 goto out_unlock;
1412 1971
1413 filter = __alloc_preds(); 1972 filter = __alloc_filter();
1414 if (IS_ERR(filter)) { 1973 if (!filter) {
1415 err = PTR_ERR(filter); 1974 err = PTR_ERR(filter);
1416 goto out_unlock; 1975 goto out_unlock;
1417 } 1976 }
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1419 err = -ENOMEM; 1978 err = -ENOMEM;
1420 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1979 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1421 if (!ps) 1980 if (!ps)
1422 goto free_preds; 1981 goto free_filter;
1423 1982
1424 parse_init(ps, filter_ops, filter_str); 1983 parse_init(ps, filter_ops, filter_str);
1425 err = filter_parse(ps); 1984 err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
1435 postfix_clear(ps); 1994 postfix_clear(ps);
1436 kfree(ps); 1995 kfree(ps);
1437 1996
1438free_preds: 1997free_filter:
1439 if (err) 1998 if (err)
1440 __free_preds(filter); 1999 __free_filter(filter);
1441 2000
1442out_unlock: 2001out_unlock:
1443 mutex_unlock(&event_mutex); 2002 mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 76b05980225c..962cdb24ed81 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
905 * 905 *
906 * returns 1 if 906 * returns 1 if
907 * - we are inside irq code 907 * - we are inside irq code
908 * - we just extered irq code 908 * - we just entered irq code
909 * 909 *
910 * returns 0 if 910 * returns 0 if
911 * - funcgraph-interrupts option is set 911 * - funcgraph-interrupts option is set
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 92b6e1e12d98..a4969b47afc1 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = {
80 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
81 * did a maximum and could disturb our measurement with serial console 81 * did a maximum and could disturb our measurement with serial console
82 * printouts, etc. Truly coinciding maximum latencies should be rare 82 * printouts, etc. Truly coinciding maximum latencies should be rare
83 * and what happens together happens separately as well, so this doesnt 83 * and what happens together happens separately as well, so this doesn't
84 * decrease the validity of the maximum found: 84 * decrease the validity of the maximum found:
85 */ 85 */
86static __cacheline_aligned_in_smp unsigned long max_sequence; 86static __cacheline_aligned_in_smp unsigned long max_sequence;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bcde8b4..35d55a386145 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
353 kfree(data); 353 kfree(data);
354} 354}
355 355
356/* Bitfield fetch function */
357struct bitfield_fetch_param {
358 struct fetch_param orig;
359 unsigned char hi_shift;
360 unsigned char low_shift;
361};
362
363#define DEFINE_FETCH_bitfield(type) \
364static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
365 void *data, void *dest) \
366{ \
367 struct bitfield_fetch_param *bprm = data; \
368 type buf = 0; \
369 call_fetch(&bprm->orig, regs, &buf); \
370 if (buf) { \
371 buf <<= bprm->hi_shift; \
372 buf >>= bprm->low_shift; \
373 } \
374 *(type *)dest = buf; \
375}
376DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string NULL
378#define fetch_bitfield_string_size NULL
379
380static __kprobes void
381free_bitfield_fetch_param(struct bitfield_fetch_param *data)
382{
383 /*
384 * Don't check the bitfield itself, because this must be the
385 * last fetch function.
386 */
387 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
388 free_deref_fetch_param(data->orig.data);
389 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
390 free_symbol_cache(data->orig.data);
391 kfree(data);
392}
356/* Default (unsigned long) fetch type */ 393/* Default (unsigned long) fetch type */
357#define __DEFAULT_FETCH_TYPE(t) u##t 394#define __DEFAULT_FETCH_TYPE(t) u##t
358#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 395#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
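
The bitfield fetch function added above extracts a field with two shifts: shift left to drop the bits above the field, then shift right to drop the bits below it. A stand-alone user-space sketch (not part of the patch; the values are chosen for illustration), worked for a 4-bit field at bit offset 8 of a 32-bit word:

#include <stdio.h>
#include <stdint.h>

/* Same two-shift trick as DEFINE_FETCH_bitfield(): for a field of width bw
 * at bit offset bo (counted from the LSB) in a 32-bit container,
 * hi_shift = 32 - (bw + bo) and low_shift = hi_shift + bo. */
static uint32_t extract_bitfield32(uint32_t val, unsigned bw, unsigned bo)
{
	unsigned hi_shift  = 32 - (bw + bo);   /* drop the bits above the field */
	unsigned low_shift = hi_shift + bo;    /* then the bits below it        */

	return (uint32_t)(val << hi_shift) >> low_shift;
}

int main(void)
{
	uint32_t word = 0x00000A00;            /* bits 8..11 hold the value 0xA */

	/* corresponds to a "b4@8/32" argument: 4 bits at offset 8 in 32 bits */
	printf("field = %#x\n", (unsigned)extract_bitfield32(word, 4, 8));
	return 0;
}
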
@@ -367,6 +404,7 @@ enum {
367 FETCH_MTD_memory, 404 FETCH_MTD_memory,
368 FETCH_MTD_symbol, 405 FETCH_MTD_symbol,
369 FETCH_MTD_deref, 406 FETCH_MTD_deref,
407 FETCH_MTD_bitfield,
370 FETCH_MTD_END, 408 FETCH_MTD_END,
371}; 409};
372 410
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
387ASSIGN_FETCH_FUNC(memory, ftype), \ 425ASSIGN_FETCH_FUNC(memory, ftype), \
388ASSIGN_FETCH_FUNC(symbol, ftype), \ 426ASSIGN_FETCH_FUNC(symbol, ftype), \
389ASSIGN_FETCH_FUNC(deref, ftype), \ 427ASSIGN_FETCH_FUNC(deref, ftype), \
428ASSIGN_FETCH_FUNC(bitfield, ftype), \
390 } \ 429 } \
391 } 430 }
392 431
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
430 if (!type) 469 if (!type)
431 type = DEFAULT_FETCH_TYPE_STR; 470 type = DEFAULT_FETCH_TYPE_STR;
432 471
472 /* Special case: bitfield */
473 if (*type == 'b') {
474 unsigned long bs;
475 type = strchr(type, '/');
476 if (!type)
477 goto fail;
478 type++;
479 if (strict_strtoul(type, 0, &bs))
480 goto fail;
481 switch (bs) {
482 case 8:
483 return find_fetch_type("u8");
484 case 16:
485 return find_fetch_type("u16");
486 case 32:
487 return find_fetch_type("u32");
488 case 64:
489 return find_fetch_type("u64");
490 default:
491 goto fail;
492 }
493 }
494
433 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 495 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
434 if (strcmp(type, fetch_type_table[i].name) == 0) 496 if (strcmp(type, fetch_type_table[i].name) == 0)
435 return &fetch_type_table[i]; 497 return &fetch_type_table[i];
498fail:
436 return NULL; 499 return NULL;
437} 500}
438 501
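
In find_fetch_type(), the new special case only looks at the container size after the '/': that alone selects which unsigned fetch type backs the bitfield, while the width and offset are consumed later by __parse_bitfield_probe_arg(). A user-space sketch of the same resolution (not from the patch; the type string is an assumed example):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Map a bitfield type string such as "b4@8/32" to the name of the
 * underlying unsigned fetch type, mirroring the switch added above. */
static const char *bitfield_container_type(const char *type)
{
	const char *slash;
	unsigned long bs;

	if (*type != 'b')
		return NULL;
	slash = strchr(type, '/');
	if (!slash)
		return NULL;
	bs = strtoul(slash + 1, NULL, 0);
	switch (bs) {
	case 8:  return "u8";
	case 16: return "u16";
	case 32: return "u32";
	case 64: return "u64";
	default: return NULL;
	}
}

int main(void)
{
	printf("%s\n", bitfield_container_type("b4@8/32"));   /* prints u32 */
	return 0;
}
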
@@ -586,7 +649,9 @@ error:
586 649
587static void free_probe_arg(struct probe_arg *arg) 650static void free_probe_arg(struct probe_arg *arg)
588{ 651{
589 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) 652 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
653 free_bitfield_fetch_param(arg->fetch.data);
654 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
590 free_deref_fetch_param(arg->fetch.data); 655 free_deref_fetch_param(arg->fetch.data);
591 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) 656 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
592 free_symbol_cache(arg->fetch.data); 657 free_symbol_cache(arg->fetch.data);
@@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
767 } 832 }
768 break; 833 break;
769 case '+': /* deref memory */ 834 case '+': /* deref memory */
835 arg++; /* Skip '+', because strict_strtol() rejects it. */
770 case '-': 836 case '-':
771 tmp = strchr(arg, '('); 837 tmp = strchr(arg, '(');
772 if (!tmp) 838 if (!tmp)
773 break; 839 break;
774 *tmp = '\0'; 840 *tmp = '\0';
775 ret = strict_strtol(arg + 1, 0, &offset); 841 ret = strict_strtol(arg, 0, &offset);
776 if (ret) 842 if (ret)
777 break; 843 break;
778 if (arg[0] == '-')
779 offset = -offset;
780 arg = tmp + 1; 844 arg = tmp + 1;
781 tmp = strrchr(arg, ')'); 845 tmp = strrchr(arg, ')');
782 if (tmp) { 846 if (tmp) {
@@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
807 return ret; 871 return ret;
808} 872}
809 873
874#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
875
876/* Bitfield type needs to be parsed into a fetch function */
877static int __parse_bitfield_probe_arg(const char *bf,
878 const struct fetch_type *t,
879 struct fetch_param *f)
880{
881 struct bitfield_fetch_param *bprm;
882 unsigned long bw, bo;
883 char *tail;
884
885 if (*bf != 'b')
886 return 0;
887
888 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
889 if (!bprm)
890 return -ENOMEM;
891 bprm->orig = *f;
892 f->fn = t->fetch[FETCH_MTD_bitfield];
893 f->data = (void *)bprm;
894
895 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
896 if (bw == 0 || *tail != '@')
897 return -EINVAL;
898
899 bf = tail + 1;
900 bo = simple_strtoul(bf, &tail, 0);
901 if (tail == bf || *tail != '/')
902 return -EINVAL;
903
904 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
905 bprm->low_shift = bprm->hi_shift + bo;
906 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
907}
908
810/* String length checking wrapper */ 909/* String length checking wrapper */
811static int parse_probe_arg(char *arg, struct trace_probe *tp, 910static int parse_probe_arg(char *arg, struct trace_probe *tp,
812 struct probe_arg *parg, int is_return) 911 struct probe_arg *parg, int is_return)
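
For reference, the arithmetic in __parse_bitfield_probe_arg() worked through in plain user-space C for an assumed spec "b4@8/32" (bit width 4 at bit offset 8 in a 32-bit container); strtoul() stands in for the kernel's simple_strtoul():

#include <stdio.h>
#include <stdlib.h>

/* Parse "b<bit-width>@<bit-offset>/<container-size>" and print the two
 * shift amounts the kernel would store, mirroring the checks above. */
int main(void)
{
	const char *spec = "b4@8/32";          /* assumed example spec */
	char *tail;
	unsigned long bw, bo, size;

	bw = strtoul(spec + 1, &tail, 0);      /* bit width  -> 4  */
	if (bw == 0 || *tail != '@')
		return 1;
	bo = strtoul(tail + 1, &tail, 0);      /* bit offset -> 8  */
	if (*tail != '/')
		return 1;
	size = strtoul(tail + 1, NULL, 0);     /* container  -> 32 */
	if (size < bw + bo)
		return 1;                      /* the field must fit */

	printf("hi_shift=%lu low_shift=%lu\n",
	       size - (bw + bo),               /* 32 - 12 = 20 */
	       size - (bw + bo) + bo);         /* 20 +  8 = 28 */
	return 0;
}
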
@@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
836 parg->offset = tp->size; 935 parg->offset = tp->size;
837 tp->size += parg->type->size; 936 tp->size += parg->type->size;
838 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 937 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
938 if (ret >= 0 && t != NULL)
939 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
839 if (ret >= 0) { 940 if (ret >= 0) {
840 parg->fetch_size.fn = get_fetch_size_function(parg->type, 941 parg->fetch_size.fn = get_fetch_size_function(parg->type,
841 parg->fetch.fn); 942 parg->fetch.fn);
@@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf)
1130 return ret; 1231 return ret;
1131} 1232}
1132 1233
1133#define WRITE_BUFSIZE 128 1234#define WRITE_BUFSIZE 4096
1134 1235
1135static ssize_t probes_write(struct file *file, const char __user *buffer, 1236static ssize_t probes_write(struct file *file, const char __user *buffer,
1136 size_t count, loff_t *ppos) 1237 size_t count, loff_t *ppos)
@@ -1738,7 +1839,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1738 kfree(tp->call.print_fmt); 1839 kfree(tp->call.print_fmt);
1739} 1840}
1740 1841
1741/* Make a debugfs interface for controling probe points */ 1842/* Make a debugfs interface for controlling probe points */
1742static __init int init_kprobe_trace(void) 1843static __init int init_kprobe_trace(void)
1743{ 1844{
1744 struct dentry *d_tracer; 1845 struct dentry *d_tracer;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa2206..456be9063c2d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
529 * @entry: The trace entry field from the ring buffer 529 * @entry: The trace entry field from the ring buffer
530 * 530 *
531 * Prints the generic fields of irqs off, in hard or softirq, preempt 531 * Prints the generic fields of irqs off, in hard or softirq, preempt
532 * count and lock depth. 532 * count.
533 */ 533 */
534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) 534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
535{ 535{
536 int hardirq, softirq; 536 char hardsoft_irq;
537 char need_resched;
538 char irqs_off;
539 int hardirq;
540 int softirq;
537 int ret; 541 int ret;
538 542
539 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 543 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
540 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 544 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
541 545
546 irqs_off =
547 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
548 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
549 '.';
550 need_resched =
551 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
552 hardsoft_irq =
553 (hardirq && softirq) ? 'H' :
554 hardirq ? 'h' :
555 softirq ? 's' :
556 '.';
557
542 if (!trace_seq_printf(s, "%c%c%c", 558 if (!trace_seq_printf(s, "%c%c%c",
543 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 559 irqs_off, need_resched, hardsoft_irq))
544 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
545 'X' : '.',
546 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
547 'N' : '.',
548 (hardirq && softirq) ? 'H' :
549 hardirq ? 'h' : softirq ? 's' : '.'))
550 return 0; 560 return 0;
551 561
552 if (entry->preempt_count) 562 if (entry->preempt_count)
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
554 else 564 else
555 ret = trace_seq_putc(s, '.'); 565 ret = trace_seq_putc(s, '.');
556 566
557 if (!ret) 567 return ret;
558 return 0;
559
560 if (entry->lock_depth < 0)
561 return trace_seq_putc(s, '.');
562
563 return trace_seq_printf(s, "%d", entry->lock_depth);
564} 568}
565 569
566static int 570static int
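
Not from the patch: a compact user-space rendering of the three latency columns that trace_print_lat_fmt() now builds in local variables. The flag bit values below are placeholders chosen for the demo, not the kernel's definitions:

#include <stdio.h>

/* Placeholder flag bits for the demo only. */
#define F_IRQS_OFF        0x01
#define F_IRQS_NOSUPPORT  0x02
#define F_NEED_RESCHED    0x04
#define F_HARDIRQ         0x08
#define F_SOFTIRQ         0x10

static void print_lat_fmt(unsigned flags)
{
	int hardirq = flags & F_HARDIRQ;
	int softirq = flags & F_SOFTIRQ;

	char irqs_off     = (flags & F_IRQS_OFF)       ? 'd' :
			    (flags & F_IRQS_NOSUPPORT) ? 'X' : '.';
	char need_resched = (flags & F_NEED_RESCHED)   ? 'N' : '.';
	char hardsoft_irq = (hardirq && softirq) ? 'H' :
			    hardirq ? 'h' : softirq ? 's' : '.';

	printf("%c%c%c\n", irqs_off, need_resched, hardsoft_irq);
}

int main(void)
{
	print_lat_fmt(F_IRQS_OFF | F_NEED_RESCHED | F_HARDIRQ);  /* prints dNh */
	print_lat_fmt(F_SOFTIRQ);                                /* prints ..s */
	return 0;
}
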
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
247 ctx_trace = tr; 247 ctx_trace = tr;
248} 248}
249 249
250static void stop_sched_trace(struct trace_array *tr)
251{
252 tracing_stop_sched_switch_record();
253}
254
255static int sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258 tracing_reset_online_cpus(tr);
259 tracing_start_sched_switch_record();
260 return 0;
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (sched_ref)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_start(struct trace_array *tr)
270{
271 sched_stopped = 0;
272}
273
274static void sched_switch_trace_stop(struct trace_array *tr)
275{
276 sched_stopped = 1;
277}
278
279static struct tracer sched_switch_trace __read_mostly =
280{
281 .name = "sched_switch",
282 .init = sched_switch_trace_init,
283 .reset = sched_switch_trace_reset,
284 .start = sched_switch_trace_start,
285 .stop = sched_switch_trace_stop,
286 .wait_pipe = poll_wait_pipe,
287#ifdef CONFIG_FTRACE_SELFTEST
288 .selftest = trace_selftest_startup_sched_switch,
289#endif
290};
291
292__init static int init_sched_switch_trace(void)
293{
294 return register_tracer(&sched_switch_trace);
295}
296device_initcall(init_sched_switch_trace);
297
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5c9fe08d2093..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];
60 60
61static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
62 62
63#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
64static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
65{
66 /*
67 * Only compare after the "sys" prefix. Archs that use
68 * syscall wrappers may have syscalls symbols aliases prefixed
69 * with "SyS" instead of "sys", leading to an unwanted
70 * mismatch.
71 */
72 return !strcmp(sym + 3, name + 3);
73}
74#endif
75
63static __init struct syscall_metadata * 76static __init struct syscall_metadata *
64find_syscall_meta(unsigned long syscall) 77find_syscall_meta(unsigned long syscall)
65{ 78{
@@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall)
72 stop = __stop_syscalls_metadata; 85 stop = __stop_syscalls_metadata;
73 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
74 87
88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89 return NULL;
90
75 for ( ; start < stop; start++) { 91 for ( ; start < stop; start++) {
76 /* 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
77 * Only compare after the "sys" prefix. Archs that use
78 * syscall wrappers may have syscalls symbols aliases prefixed
79 * with "SyS" instead of "sys", leading to an unwanted
80 * mismatch.
81 */
82 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
83 return *start; 93 return *start;
84 } 94 }
85 return NULL; 95 return NULL;
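
The default arch_syscall_match_sym_name() added above simply ignores the first three characters, so a wrapped symbol such as "SyS_read" still matches the metadata name "sys_read". A user-space illustration (the symbol names are examples only):

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Default match: skip the "sys"/"SyS" prefix, compare the rest. */
static bool syscall_match_sym_name(const char *sym, const char *name)
{
	return strcmp(sym + 3, name + 3) == 0;
}

int main(void)
{
	printf("%d\n", syscall_match_sym_name("SyS_read", "sys_read"));   /* 1 */
	printf("%d\n", syscall_match_sym_name("sys_read", "sys_write"));  /* 0 */
	return 0;
}
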
@@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
359 int num; 369 int num;
360 370
361 num = ((struct syscall_metadata *)call->data)->syscall_nr; 371 num = ((struct syscall_metadata *)call->data)->syscall_nr;
362 if (num < 0 || num >= NR_syscalls) 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
363 return -ENOSYS; 373 return -ENOSYS;
364 mutex_lock(&syscall_trace_lock); 374 mutex_lock(&syscall_trace_lock);
365 if (!sys_refcount_enter) 375 if (!sys_refcount_enter)
@@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
377 int num; 387 int num;
378 388
379 num = ((struct syscall_metadata *)call->data)->syscall_nr; 389 num = ((struct syscall_metadata *)call->data)->syscall_nr;
380 if (num < 0 || num >= NR_syscalls) 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
381 return; 391 return;
382 mutex_lock(&syscall_trace_lock); 392 mutex_lock(&syscall_trace_lock);
383 sys_refcount_enter--; 393 sys_refcount_enter--;
@@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
393 int num; 403 int num;
394 404
395 num = ((struct syscall_metadata *)call->data)->syscall_nr; 405 num = ((struct syscall_metadata *)call->data)->syscall_nr;
396 if (num < 0 || num >= NR_syscalls) 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
397 return -ENOSYS; 407 return -ENOSYS;
398 mutex_lock(&syscall_trace_lock); 408 mutex_lock(&syscall_trace_lock);
399 if (!sys_refcount_exit) 409 if (!sys_refcount_exit)
@@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
411 int num; 421 int num;
412 422
413 num = ((struct syscall_metadata *)call->data)->syscall_nr; 423 num = ((struct syscall_metadata *)call->data)->syscall_nr;
414 if (num < 0 || num >= NR_syscalls) 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
415 return; 425 return;
416 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
417 sys_refcount_exit--; 427 sys_refcount_exit--;
@@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
424int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
425{ 435{
426 int id; 436 int id;
437 int num;
438
439 num = ((struct syscall_metadata *)call->data)->syscall_nr;
440 if (num < 0 || num >= NR_syscalls) {
441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442 ((struct syscall_metadata *)call->data)->name);
443 return -ENOSYS;
444 }
427 445
428 if (set_syscall_print_fmt(call) < 0) 446 if (set_syscall_print_fmt(call) < 0)
429 return -ENOMEM; 447 return -ENOMEM;
@@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
438 return id; 456 return id;
439} 457}
440 458
441unsigned long __init arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
442{ 460{
443 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
444} 462}
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
189 struct group_info *group_info; 189 struct group_info *group_info;
190 int retval; 190 int retval;
191 191
192 if (!capable(CAP_SETGID)) 192 if (!nsown_capable(CAP_SETGID))
193 return -EPERM; 193 return -EPERM;
194 if ((unsigned)gidsetsize > NGROUPS_MAX) 194 if ((unsigned)gidsetsize > NGROUPS_MAX)
195 return -EINVAL; 195 return -EINVAL;
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index eb27fd3430a2..92cb706c7fc8 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
20 20
21/* 21/*
22 * Removes a registered user return notifier. Must be called from atomic 22 * Removes a registered user return notifier. Must be called from atomic
23 * context, and from the same cpu registration occured in. 23 * context, and from the same cpu registration occurred in.
24 */ 24 */
25void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{ 26{
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca781df..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/*
21 * userns count is 1 for root user, 1 for init_uts_ns,
22 * and 1 for... ?
23 */
20struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
21 .kref = { 25 .kref = {
22 .refcount = ATOMIC_INIT(2), 26 .refcount = ATOMIC_INIT(3),
23 }, 27 },
24 .creator = &root_user, 28 .creator = &root_user,
25}; 29};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
47 */ 51 */
48static DEFINE_SPINLOCK(uidhash_lock); 52static DEFINE_SPINLOCK(uidhash_lock);
49 53
50/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ 54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
51struct user_struct root_user = { 55struct user_struct root_user = {
52 .__count = ATOMIC_INIT(2), 56 .__count = ATOMIC_INIT(2),
53 .processes = ATOMIC_INIT(1), 57 .processes = ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..44646179eaba 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h>
17 18
18static struct uts_namespace *create_uts_ns(void) 19static struct uts_namespace *create_uts_ns(void)
19{ 20{
@@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void)
30 * @old_ns: namespace to clone 31 * @old_ns: namespace to clone
31 * Return NULL on error (failure to kmalloc), new ns otherwise 32 * Return NULL on error (failure to kmalloc), new ns otherwise
32 */ 33 */
33static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) 34static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
35 struct uts_namespace *old_ns)
34{ 36{
35 struct uts_namespace *ns; 37 struct uts_namespace *ns;
36 38
@@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
40 42
41 down_read(&uts_sem); 43 down_read(&uts_sem);
42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 44 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
45 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
43 up_read(&uts_sem); 46 up_read(&uts_sem);
44 return ns; 47 return ns;
45} 48}
@@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
50 * utsname of this process won't be seen by parent, and vice 53 * utsname of this process won't be seen by parent, and vice
51 * versa. 54 * versa.
52 */ 55 */
53struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) 56struct uts_namespace *copy_utsname(unsigned long flags,
57 struct task_struct *tsk)
54{ 58{
59 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
55 struct uts_namespace *new_ns; 60 struct uts_namespace *new_ns;
56 61
57 BUG_ON(!old_ns); 62 BUG_ON(!old_ns);
@@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
60 if (!(flags & CLONE_NEWUTS)) 65 if (!(flags & CLONE_NEWUTS))
61 return old_ns; 66 return old_ns;
62 67
63 new_ns = clone_uts_ns(old_ns); 68 new_ns = clone_uts_ns(tsk, old_ns);
64 69
65 put_uts_ns(old_ns); 70 put_uts_ns(old_ns);
66 return new_ns; 71 return new_ns;
@@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref)
71 struct uts_namespace *ns; 76 struct uts_namespace *ns;
72 77
73 ns = container_of(kref, struct uts_namespace, kref); 78 ns = container_of(kref, struct uts_namespace, kref);
79 put_user_ns(ns->user_ns);
74 kfree(ns); 80 kfree(ns);
75} 81}
diff --git a/kernel/wait.c b/kernel/wait.c
index b0310eb6cc1e..f45ea8d2a1ce 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait);
142 * woken up through the queue. 142 * woken up through the queue.
143 * 143 *
144 * This prevents waiter starvation where an exclusive waiter 144 * This prevents waiter starvation where an exclusive waiter
145 * aborts and is woken up concurrently and noone wakes up 145 * aborts and is woken up concurrently and no one wakes up
146 * the next waiter. 146 * the next waiter.
147 */ 147 */
148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, 148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18bb15776c57..14733d4d156b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
48 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
49 */ 49 */
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 50#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static int hardlockup_panic; 51static int hardlockup_panic =
52 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
52 53
53static int __init hardlockup_panic_setup(char *str) 54static int __init hardlockup_panic_setup(char *str)
54{ 55{
55 if (!strncmp(str, "panic", 5)) 56 if (!strncmp(str, "panic", 5))
56 hardlockup_panic = 1; 57 hardlockup_panic = 1;
58 else if (!strncmp(str, "nopanic", 7))
59 hardlockup_panic = 0;
57 else if (!strncmp(str, "0", 1)) 60 else if (!strncmp(str, "0", 1))
58 watchdog_enabled = 0; 61 watchdog_enabled = 0;
59 return 1; 62 return 1;
@@ -415,19 +418,25 @@ static int watchdog_prepare_cpu(int cpu)
415static int watchdog_enable(int cpu) 418static int watchdog_enable(int cpu)
416{ 419{
417 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 420 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
418 int err; 421 int err = 0;
419 422
420 /* enable the perf event */ 423 /* enable the perf event */
421 err = watchdog_nmi_enable(cpu); 424 err = watchdog_nmi_enable(cpu);
422 if (err) 425
423 return err; 426 /* Regardless of err above, fall through and start softlockup */
424 427
425 /* create the watchdog thread */ 428 /* create the watchdog thread */
426 if (!p) { 429 if (!p) {
427 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 430 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
428 if (IS_ERR(p)) { 431 if (IS_ERR(p)) {
429 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 432 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
430 return PTR_ERR(p); 433 if (!err) {
434 /* if hardlockup hasn't already set this */
435 err = PTR_ERR(p);
436 /* and disable the perf event */
437 watchdog_nmi_disable(cpu);
438 }
439 goto out;
431 } 440 }
432 kthread_bind(p, cpu); 441 kthread_bind(p, cpu);
433 per_cpu(watchdog_touch_ts, cpu) = 0; 442 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -435,7 +444,8 @@ static int watchdog_enable(int cpu)
435 wake_up_process(p); 444 wake_up_process(p);
436 } 445 }
437 446
438 return 0; 447out:
448 return err;
439} 449}
440 450
441static void watchdog_disable(int cpu) 451static void watchdog_disable(int cpu)
@@ -547,7 +557,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
547 break; 557 break;
548#endif /* CONFIG_HOTPLUG_CPU */ 558#endif /* CONFIG_HOTPLUG_CPU */
549 } 559 }
550 return notifier_from_errno(err); 560
561 /*
562 * hardlockup and softlockup are not important enough
563 * to block cpu bring up. Just always succeed and
564 * rely on printk output to flag problems.
565 */
566 return NOTIFY_OK;
551} 567}
552 568
553static struct notifier_block __cpuinitdata cpu_nfb = { 569static struct notifier_block __cpuinitdata cpu_nfb = {
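
For reference, the extended boot-parameter handling above in stand-alone form: "panic" and the new "nopanic" flip the hard-lockup panic setting, while "0" still disables the watchdog. The harness and the initial values are illustrative, not the kernel's:

#include <stdio.h>
#include <string.h>

static int hardlockup_panic = 1;   /* stands in for the Kconfig default */
static int watchdog_enabled = 1;

static void parse_hardlockup_opt(const char *str)
{
	if (!strncmp(str, "panic", 5))
		hardlockup_panic = 1;
	else if (!strncmp(str, "nopanic", 7))
		hardlockup_panic = 0;
	else if (!strncmp(str, "0", 1))
		watchdog_enabled = 0;
}

int main(void)
{
	parse_hardlockup_opt("nopanic");
	printf("panic=%d enabled=%d\n", hardlockup_panic, watchdog_enabled); /* 0 1 */
	return 0;
}
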
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee6578b578ad..e3378e8d3a5c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -251,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly;
251struct workqueue_struct *system_long_wq __read_mostly; 251struct workqueue_struct *system_long_wq __read_mostly;
252struct workqueue_struct *system_nrt_wq __read_mostly; 252struct workqueue_struct *system_nrt_wq __read_mostly;
253struct workqueue_struct *system_unbound_wq __read_mostly; 253struct workqueue_struct *system_unbound_wq __read_mostly;
254struct workqueue_struct *system_freezable_wq __read_mostly;
254EXPORT_SYMBOL_GPL(system_wq); 255EXPORT_SYMBOL_GPL(system_wq);
255EXPORT_SYMBOL_GPL(system_long_wq); 256EXPORT_SYMBOL_GPL(system_long_wq);
256EXPORT_SYMBOL_GPL(system_nrt_wq); 257EXPORT_SYMBOL_GPL(system_nrt_wq);
257EXPORT_SYMBOL_GPL(system_unbound_wq); 258EXPORT_SYMBOL_GPL(system_unbound_wq);
259EXPORT_SYMBOL_GPL(system_freezable_wq);
258 260
259#define CREATE_TRACE_POINTS 261#define CREATE_TRACE_POINTS
260#include <trace/events/workqueue.h> 262#include <trace/events/workqueue.h>
@@ -316,6 +318,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
316 318
317static struct debug_obj_descr work_debug_descr; 319static struct debug_obj_descr work_debug_descr;
318 320
321static void *work_debug_hint(void *addr)
322{
323 return ((struct work_struct *) addr)->func;
324}
325
319/* 326/*
320 * fixup_init is called when: 327 * fixup_init is called when:
321 * - an active object is initialized 328 * - an active object is initialized
@@ -387,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
387 394
388static struct debug_obj_descr work_debug_descr = { 395static struct debug_obj_descr work_debug_descr = {
389 .name = "work_struct", 396 .name = "work_struct",
397 .debug_hint = work_debug_hint,
390 .fixup_init = work_fixup_init, 398 .fixup_init = work_fixup_init,
391 .fixup_activate = work_fixup_activate, 399 .fixup_activate = work_fixup_activate,
392 .fixup_free = work_fixup_free, 400 .fixup_free = work_fixup_free,
@@ -1283,8 +1291,14 @@ __acquires(&gcwq->lock)
1283 return true; 1291 return true;
1284 spin_unlock_irq(&gcwq->lock); 1292 spin_unlock_irq(&gcwq->lock);
1285 1293
1286 /* CPU has come up inbetween, retry migration */ 1294 /*
1295 * We've raced with CPU hot[un]plug. Give it a breather
1296 * and retry migration. cond_resched() is required here;
1297 * otherwise, we might deadlock against cpu_stop trying to
1298 * bring down the CPU on non-preemptive kernel.
1299 */
1287 cpu_relax(); 1300 cpu_relax();
1301 cond_resched();
1288 } 1302 }
1289} 1303}
1290 1304
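
The comment added above describes a general retry idiom; a sketch of its shape follows, with gcwq_claimed() as a hypothetical predicate standing in for the real rebind check. This is a fragment for illustration, not a drop-in:

#include <linux/sched.h>            /* cond_resched() */
#include <asm/processor.h>          /* cpu_relax()    */

static bool gcwq_claimed(void);     /* hypothetical predicate, not in the patch */

static void wait_until_claimed(void)
{
	while (!gcwq_claimed()) {
		cpu_relax();        /* polite busy-wait hint to the CPU        */
		cond_resched();     /* lets cpu_stop run on a non-preemptive
				     * kernel, avoiding the deadlock above     */
	}
}
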
@@ -1358,8 +1372,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1358 worker->id = id; 1372 worker->id = id;
1359 1373
1360 if (!on_unbound_cpu) 1374 if (!on_unbound_cpu)
1361 worker->task = kthread_create(worker_thread, worker, 1375 worker->task = kthread_create_on_node(worker_thread,
1362 "kworker/%u:%d", gcwq->cpu, id); 1376 worker,
1377 cpu_to_node(gcwq->cpu),
1378 "kworker/%u:%d", gcwq->cpu, id);
1363 else 1379 else
1364 worker->task = kthread_create(worker_thread, worker, 1380 worker->task = kthread_create(worker_thread, worker,
1365 "kworker/u:%d", id); 1381 "kworker/u:%d", id);
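
create_worker() now uses kthread_create_on_node() so the worker's task_struct and stack are allocated on the memory node of the CPU it will serve. A hypothetical driver fragment (not from the patch) using the same call pattern:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/topology.h>        /* cpu_to_node() */

/* Create a per-CPU thread whose stack lives on that CPU's node. */
static int start_percpu_thread(int (*fn)(void *), void *data, int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create_on_node(fn, data, cpu_to_node(cpu),
				     "mydrv/%d", cpu);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	kthread_bind(tsk, cpu);    /* pin it to the CPU it was created for */
	wake_up_process(tsk);
	return 0;
}
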
@@ -3775,8 +3791,10 @@ static int __init init_workqueues(void)
3775 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3791 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3776 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3792 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3777 WQ_UNBOUND_MAX_ACTIVE); 3793 WQ_UNBOUND_MAX_ACTIVE);
3794 system_freezable_wq = alloc_workqueue("events_freezable",
3795 WQ_FREEZABLE, 0);
3778 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || 3796 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3779 !system_unbound_wq); 3797 !system_unbound_wq || !system_freezable_wq);
3780 return 0; 3798 return 0;
3781} 3799}
3782early_initcall(init_workqueues); 3800early_initcall(init_workqueues);
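
Finally, the new system_freezable_wq: work queued on it is held back while tasks are frozen for suspend/hibernate and runs after thaw. A hypothetical minimal module sketch (not part of the patch) showing the intended usage:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static void freezable_demo_fn(struct work_struct *work)
{
	pr_info("freezable demo work ran\n");
}
static DECLARE_WORK(freezable_demo_work, freezable_demo_fn);

static int __init freezable_demo_init(void)
{
	queue_work(system_freezable_wq, &freezable_demo_work);
	return 0;
}

static void __exit freezable_demo_exit(void)
{
	cancel_work_sync(&freezable_demo_work);
}

module_init(freezable_demo_init);
module_exit(freezable_demo_exit);
MODULE_LICENSE("GPL");
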