aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c3
-rw-r--r--kernel/audit_tree.c13
-rw-r--r--kernel/auditsc.c1
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/cgroup.c8
-rw-r--r--kernel/cpu.c41
-rw-r--r--kernel/cpuset.c18
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/exit.c40
-rw-r--r--kernel/fork.c30
-rw-r--r--kernel/futex.c117
-rw-r--r--kernel/hrtimer.c170
-rw-r--r--kernel/hw_breakpoint.c212
-rw-r--r--kernel/irq/autoprobe.c20
-rw-r--r--kernel/irq/chip.c86
-rw-r--r--kernel/irq/handle.c22
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/manage.c52
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/numa_migrate.c8
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/irq/spurious.c16
-rw-r--r--kernel/itimer.c7
-rw-r--r--kernel/kexec.c65
-rw-r--r--kernel/kfifo.c410
-rw-r--r--kernel/kgdb.c65
-rw-r--r--kernel/kmod.c12
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/ksysfs.c21
-rw-r--r--kernel/kthread.c23
-rw-r--r--kernel/lockdep.c49
-rw-r--r--kernel/module.c208
-rw-r--r--kernel/mutex-debug.h12
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/params.c8
-rw-r--r--kernel/perf_event.c246
-rw-r--r--kernel/pid.c12
-rw-r--r--kernel/pm_qos_params.c20
-rw-r--r--kernel/posix-cpu-timers.c5
-rw-r--r--kernel/power/console.c7
-rw-r--r--kernel/printk.c120
-rw-r--r--kernel/rcutorture.c8
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/resource.c30
-rw-r--r--kernel/rtmutex-debug.c4
-rw-r--r--kernel/rtmutex.c106
-rw-r--r--kernel/sched.c766
-rw-r--r--kernel/sched_clock.c23
-rw-r--r--kernel/sched_cpupri.c10
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_debug.c17
-rw-r--r--kernel/sched_fair.c210
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_idletask.c6
-rw-r--r--kernel/sched_rt.c66
-rw-r--r--kernel/signal.c66
-rw-r--r--kernel/smp.c37
-rw-r--r--kernel/softirq.c19
-rw-r--r--kernel/softlockup.c69
-rw-r--r--kernel/spinlock.c306
-rw-r--r--kernel/sys.c24
-rw-r--r--kernel/sysctl.c50
-rw-r--r--kernel/sysctl_binary.c38
-rw-r--r--kernel/time/clockevents.c46
-rw-r--r--kernel/time/clocksource.c117
-rw-r--r--kernel/time/tick-broadcast.c42
-rw-r--r--kernel/time/tick-common.c20
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/tick-oneshot.c4
-rw-r--r--kernel/time/tick-sched.c141
-rw-r--r--kernel/time/timecompare.c2
-rw-r--r--kernel/time/timekeeping.c104
-rw-r--r--kernel/time/timer_list.c25
-rw-r--r--kernel/time/timer_stats.c18
-rw-r--r--kernel/timer.c5
-rw-r--r--kernel/trace/Kconfig116
-rw-r--r--kernel/trace/ftrace.c36
-rw-r--r--kernel/trace/power-traces.c2
-rw-r--r--kernel/trace/ring_buffer.c73
-rw-r--r--kernel/trace/trace.c312
-rw-r--r--kernel/trace/trace.h27
-rw-r--r--kernel/trace/trace_clock.c8
-rw-r--r--kernel/trace/trace_event_profile.c6
-rw-r--r--kernel/trace/trace_events.c41
-rw-r--r--kernel/trace/trace_events_filter.c29
-rw-r--r--kernel/trace/trace_export.c11
-rw-r--r--kernel/trace/trace_functions_graph.c169
-rw-r--r--kernel/trace/trace_hw_branches.c51
-rw-r--r--kernel/trace/trace_irqsoff.c2
-rw-r--r--kernel/trace/trace_kprobe.c86
-rw-r--r--kernel/trace/trace_ksym.c193
-rw-r--r--kernel/trace/trace_output.c75
-rw-r--r--kernel/trace/trace_sched_wakeup.c16
-rw-r--r--kernel/trace/trace_selftest.c4
-rw-r--r--kernel/trace/trace_stack.c40
-rw-r--r--kernel/trace/trace_syscalls.c18
-rw-r--r--kernel/trace/trace_sysprof.c1
-rw-r--r--kernel/user-return-notifier.c6
-rw-r--r--kernel/workqueue.c131
100 files changed, 3642 insertions, 2373 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 9a4715a2f6b..a6605ca921b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -536,7 +536,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
536 do_div(elapsed, AHZ); 536 do_div(elapsed, AHZ);
537 ac.ac_btime = get_seconds() - elapsed; 537 ac.ac_btime = get_seconds() - elapsed;
538 /* we really need to bite the bullet and change layout */ 538 /* we really need to bite the bullet and change layout */
539 current_uid_gid(&ac.ac_uid, &ac.ac_gid); 539 ac.ac_uid = orig_cred->uid;
540 ac.ac_gid = orig_cred->gid;
540#if ACCT_VERSION==2 541#if ACCT_VERSION==2
541 ac.ac_ahz = AHZ; 542 ac.ac_ahz = AHZ;
542#endif 543#endif
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2451dc6f328..4b05bd9479d 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -277,7 +277,7 @@ static void untag_chunk(struct node *p)
277 owner->root = NULL; 277 owner->root = NULL;
278 } 278 }
279 279
280 for (i = j = 0; i < size; i++, j++) { 280 for (i = j = 0; j <= size; i++, j++) {
281 struct audit_tree *s; 281 struct audit_tree *s;
282 if (&chunk->owners[j] == p) { 282 if (&chunk->owners[j] == p) {
283 list_del_init(&p->list); 283 list_del_init(&p->list);
@@ -290,7 +290,7 @@ static void untag_chunk(struct node *p)
290 if (!s) /* result of earlier fallback */ 290 if (!s) /* result of earlier fallback */
291 continue; 291 continue;
292 get_tree(s); 292 get_tree(s);
293 list_replace_init(&chunk->owners[i].list, &new->owners[j].list); 293 list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
294 } 294 }
295 295
296 list_replace_rcu(&chunk->hash, &new->hash); 296 list_replace_rcu(&chunk->hash, &new->hash);
@@ -373,15 +373,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
373 for (n = 0; n < old->count; n++) { 373 for (n = 0; n < old->count; n++) {
374 if (old->owners[n].owner == tree) { 374 if (old->owners[n].owner == tree) {
375 spin_unlock(&hash_lock); 375 spin_unlock(&hash_lock);
376 put_inotify_watch(watch); 376 put_inotify_watch(&old->watch);
377 return 0; 377 return 0;
378 } 378 }
379 } 379 }
380 spin_unlock(&hash_lock); 380 spin_unlock(&hash_lock);
381 381
382 chunk = alloc_chunk(old->count + 1); 382 chunk = alloc_chunk(old->count + 1);
383 if (!chunk) 383 if (!chunk) {
384 put_inotify_watch(&old->watch);
384 return -ENOMEM; 385 return -ENOMEM;
386 }
385 387
386 mutex_lock(&inode->inotify_mutex); 388 mutex_lock(&inode->inotify_mutex);
387 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 389 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
@@ -425,7 +427,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
425 spin_unlock(&hash_lock); 427 spin_unlock(&hash_lock);
426 inotify_evict_watch(&old->watch); 428 inotify_evict_watch(&old->watch);
427 mutex_unlock(&inode->inotify_mutex); 429 mutex_unlock(&inode->inotify_mutex);
428 put_inotify_watch(&old->watch); 430 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
431 put_inotify_watch(&old->watch); /* and kill it */
429 return 0; 432 return 0;
430} 433}
431 434
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 267e484f019..fc0f928167e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -250,7 +250,6 @@ struct audit_context {
250#endif 250#endif
251}; 251};
252 252
253#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
254static inline int open_arg(int flags, int mask) 253static inline int open_arg(int flags, int mask)
255{ 254{
256 int n = ACC_MODE(flags); 255 int n = ACC_MODE(flags);
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 3c530138183..98a51f26c13 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -12,7 +12,7 @@
12 12
13void foo(void) 13void foo(void)
14{ 14{
15 /* The enum constants to put into include/linux/bounds.h */ 15 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
18 /* End of constants */ 18 /* End of constants */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0249f4be9b5..aa3bee56644 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2468,7 +2468,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2468 /* make sure l doesn't vanish out from under us */ 2468 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2469 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2470 mutex_unlock(&cgrp->pidlist_mutex);
2471 l->use_count++;
2472 return l; 2471 return l;
2473 } 2472 }
2474 } 2473 }
@@ -2937,14 +2936,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2937 2936
2938 for_each_subsys(root, ss) { 2937 for_each_subsys(root, ss) {
2939 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 2938 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2939
2940 if (IS_ERR(css)) { 2940 if (IS_ERR(css)) {
2941 err = PTR_ERR(css); 2941 err = PTR_ERR(css);
2942 goto err_destroy; 2942 goto err_destroy;
2943 } 2943 }
2944 init_cgroup_css(css, ss, cgrp); 2944 init_cgroup_css(css, ss, cgrp);
2945 if (ss->use_id) 2945 if (ss->use_id) {
2946 if (alloc_css_id(ss, parent, cgrp)) 2946 err = alloc_css_id(ss, parent, cgrp);
2947 if (err)
2947 goto err_destroy; 2948 goto err_destroy;
2949 }
2948 /* At error, ->destroy() callback has to free assigned ID. */ 2950 /* At error, ->destroy() callback has to free assigned ID. */
2949 } 2951 }
2950 2952
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb21..677f25376a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu)
151 151
152 write_lock_irq(&tasklist_lock); 152 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 153 for_each_process(p) {
154 if (task_cpu(p) == cpu && 154 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 155 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 156 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 158 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 159 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 160 p->state, p->flags);
161 } 161 }
162 write_unlock_irq(&tasklist_lock); 162 write_unlock_irq(&tasklist_lock);
163} 163}
@@ -209,9 +209,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
209 return -ENOMEM; 209 return -ENOMEM;
210 210
211 cpu_hotplug_begin(); 211 cpu_hotplug_begin();
212 set_cpu_active(cpu, false);
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 213 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 214 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 215 if (err == NOTIFY_BAD) {
216 set_cpu_active(cpu, true);
217
215 nr_calls--; 218 nr_calls--;
216 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 219 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
217 hcpu, nr_calls, NULL); 220 hcpu, nr_calls, NULL);
@@ -223,11 +226,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 226
224 /* Ensure that we are not runnable on dying cpu */ 227 /* Ensure that we are not runnable on dying cpu */
225 cpumask_copy(old_allowed, &current->cpus_allowed); 228 cpumask_copy(old_allowed, &current->cpus_allowed);
226 set_cpus_allowed_ptr(current, 229 set_cpus_allowed_ptr(current, cpu_active_mask);
227 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
228 230
229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 231 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
230 if (err) { 232 if (err) {
233 set_cpu_active(cpu, true);
231 /* CPU didn't die: tell everyone. Can't complain. */ 234 /* CPU didn't die: tell everyone. Can't complain. */
232 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
233 hcpu) == NOTIFY_BAD) 236 hcpu) == NOTIFY_BAD)
@@ -278,23 +281,8 @@ int __ref cpu_down(unsigned int cpu)
278 goto out; 281 goto out;
279 } 282 }
280 283
281 set_cpu_active(cpu, false);
282
283 /*
284 * Make sure the all cpus did the reschedule and are not
285 * using stale version of the cpu_active_mask.
286 * This is not strictly necessary becuase stop_machine()
287 * that we run down the line already provides the required
288 * synchronization. But it's really a side effect and we do not
289 * want to depend on the innards of the stop_machine here.
290 */
291 synchronize_sched();
292
293 err = _cpu_down(cpu, 0); 284 err = _cpu_down(cpu, 0);
294 285
295 if (cpu_online(cpu))
296 set_cpu_active(cpu, true);
297
298out: 286out:
299 cpu_maps_update_done(); 287 cpu_maps_update_done();
300 stop_machine_destroy(); 288 stop_machine_destroy();
@@ -383,19 +371,20 @@ int disable_nonboot_cpus(void)
383 return error; 371 return error;
384 cpu_maps_update_begin(); 372 cpu_maps_update_begin();
385 first_cpu = cpumask_first(cpu_online_mask); 373 first_cpu = cpumask_first(cpu_online_mask);
386 /* We take down all of the non-boot CPUs in one shot to avoid races 374 /*
375 * We take down all of the non-boot CPUs in one shot to avoid races
387 * with the userspace trying to use the CPU hotplug at the same time 376 * with the userspace trying to use the CPU hotplug at the same time
388 */ 377 */
389 cpumask_clear(frozen_cpus); 378 cpumask_clear(frozen_cpus);
379
390 printk("Disabling non-boot CPUs ...\n"); 380 printk("Disabling non-boot CPUs ...\n");
391 for_each_online_cpu(cpu) { 381 for_each_online_cpu(cpu) {
392 if (cpu == first_cpu) 382 if (cpu == first_cpu)
393 continue; 383 continue;
394 error = _cpu_down(cpu, 1); 384 error = _cpu_down(cpu, 1);
395 if (!error) { 385 if (!error)
396 cpumask_set_cpu(cpu, frozen_cpus); 386 cpumask_set_cpu(cpu, frozen_cpus);
397 printk("CPU%d is down\n", cpu); 387 else {
398 } else {
399 printk(KERN_ERR "Error taking CPU%d down: %d\n", 388 printk(KERN_ERR "Error taking CPU%d down: %d\n",
400 cpu, error); 389 cpu, error);
401 break; 390 break;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3cf2183b472..ba401fab459 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -737,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
737{ 737{
738} 738}
739 739
740static int generate_sched_domains(struct cpumask **domains, 740static int generate_sched_domains(cpumask_var_t **domains,
741 struct sched_domain_attr **attributes) 741 struct sched_domain_attr **attributes)
742{ 742{
743 *domains = NULL; 743 *domains = NULL;
@@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
872 if (retval < 0) 872 if (retval < 0)
873 return retval; 873 return retval;
874 874
875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) 875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
876 return -EINVAL; 876 return -EINVAL;
877 } 877 }
878 retval = validate_change(cs, trialcs); 878 retval = validate_change(cs, trialcs);
@@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2010 } 2010 }
2011 2011
2012 /* Continue past cpusets with all cpus, mems online */ 2012 /* Continue past cpusets with all cpus, mems online */
2013 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && 2013 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2015 continue;
2016 2016
@@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2019 /* Remove offline cpus and mems from this cpuset. */ 2019 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2020 mutex_lock(&callback_mutex);
2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2022 cpu_online_mask); 2022 cpu_active_mask);
2023 nodes_and(cp->mems_allowed, cp->mems_allowed, 2023 nodes_and(cp->mems_allowed, cp->mems_allowed,
2024 node_states[N_HIGH_MEMORY]); 2024 node_states[N_HIGH_MEMORY]);
2025 mutex_unlock(&callback_mutex); 2025 mutex_unlock(&callback_mutex);
@@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2057 switch (phase) { 2057 switch (phase) {
2058 case CPU_ONLINE: 2058 case CPU_ONLINE:
2059 case CPU_ONLINE_FROZEN: 2059 case CPU_ONLINE_FROZEN:
2060 case CPU_DEAD: 2060 case CPU_DOWN_PREPARE:
2061 case CPU_DEAD_FROZEN: 2061 case CPU_DOWN_PREPARE_FROZEN:
2062 case CPU_DOWN_FAILED:
2063 case CPU_DOWN_FAILED_FROZEN:
2062 break; 2064 break;
2063 2065
2064 default: 2066 default:
@@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2067 2069
2068 cgroup_lock(); 2070 cgroup_lock();
2069 mutex_lock(&callback_mutex); 2071 mutex_lock(&callback_mutex);
2070 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2071 mutex_unlock(&callback_mutex); 2073 mutex_unlock(&callback_mutex);
2072 scan_for_empty_cpusets(&top_cpuset); 2074 scan_for_empty_cpusets(&top_cpuset);
2073 ndoms = generate_sched_domains(&doms, &attr); 2075 ndoms = generate_sched_domains(&doms, &attr);
@@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2114 2116
2115void __init cpuset_init_smp(void) 2117void __init cpuset_init_smp(void)
2116{ 2118{
2117 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2119 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2118 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2120 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2119 2121
2120 hotcpu_notifier(cpuset_track_online_cpus, 0); 2122 hotcpu_notifier(cpuset_track_online_cpus, 0);
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b..1ed8ca18790 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 226 if (!new->tgcred) {
227 kfree(new); 227 kmem_cache_free(cred_jar, new);
228 return NULL; 228 return NULL;
229 } 229 }
230 atomic_set(&new->tgcred->usage, 1); 230 atomic_set(&new->tgcred->usage, 1);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1143012951e..546774a31a6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,10 +68,10 @@ static void __unhash_process(struct task_struct *p)
68 detach_pid(p, PIDTYPE_SID); 68 detach_pid(p, PIDTYPE_SID);
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling);
71 __get_cpu_var(process_counts)--; 72 __get_cpu_var(process_counts)--;
72 } 73 }
73 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
74 list_del_init(&p->sibling);
75} 75}
76 76
77/* 77/*
@@ -736,12 +736,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
736/* 736/*
737* Any that need to be release_task'd are put on the @dead list. 737* Any that need to be release_task'd are put on the @dead list.
738 */ 738 */
739static void reparent_thread(struct task_struct *father, struct task_struct *p, 739static void reparent_leader(struct task_struct *father, struct task_struct *p,
740 struct list_head *dead) 740 struct list_head *dead)
741{ 741{
742 if (p->pdeath_signal)
743 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
744
745 list_move_tail(&p->sibling, &p->real_parent->children); 742 list_move_tail(&p->sibling, &p->real_parent->children);
746 743
747 if (task_detached(p)) 744 if (task_detached(p))
@@ -780,12 +777,18 @@ static void forget_original_parent(struct task_struct *father)
780 reaper = find_new_reaper(father); 777 reaper = find_new_reaper(father);
781 778
782 list_for_each_entry_safe(p, n, &father->children, sibling) { 779 list_for_each_entry_safe(p, n, &father->children, sibling) {
783 p->real_parent = reaper; 780 struct task_struct *t = p;
784 if (p->parent == father) { 781 do {
785 BUG_ON(task_ptrace(p)); 782 t->real_parent = reaper;
786 p->parent = p->real_parent; 783 if (t->parent == father) {
787 } 784 BUG_ON(task_ptrace(t));
788 reparent_thread(father, p, &dead_children); 785 t->parent = t->real_parent;
786 }
787 if (t->pdeath_signal)
788 group_send_sig_info(t->pdeath_signal,
789 SEND_SIG_NOINFO, t);
790 } while_each_thread(p, t);
791 reparent_leader(father, p, &dead_children);
789 } 792 }
790 write_unlock_irq(&tasklist_lock); 793 write_unlock_irq(&tasklist_lock);
791 794
@@ -933,7 +936,7 @@ NORET_TYPE void do_exit(long code)
933 * an exiting task cleaning up the robust pi futexes. 936 * an exiting task cleaning up the robust pi futexes.
934 */ 937 */
935 smp_mb(); 938 smp_mb();
936 spin_unlock_wait(&tsk->pi_lock); 939 raw_spin_unlock_wait(&tsk->pi_lock);
937 940
938 if (unlikely(in_atomic())) 941 if (unlikely(in_atomic()))
939 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 942 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -971,7 +974,7 @@ NORET_TYPE void do_exit(long code)
971 exit_thread(); 974 exit_thread();
972 cgroup_exit(tsk, 1); 975 cgroup_exit(tsk, 1);
973 976
974 if (group_dead && tsk->signal->leader) 977 if (group_dead)
975 disassociate_ctty(1); 978 disassociate_ctty(1);
976 979
977 module_put(task_thread_info(tsk)->exec_domain->module); 980 module_put(task_thread_info(tsk)->exec_domain->module);
@@ -1551,14 +1554,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1551 struct task_struct *p; 1554 struct task_struct *p;
1552 1555
1553 list_for_each_entry(p, &tsk->children, sibling) { 1556 list_for_each_entry(p, &tsk->children, sibling) {
1554 /* 1557 int ret = wait_consider_task(wo, 0, p);
1555 * Do not consider detached threads. 1558 if (ret)
1556 */ 1559 return ret;
1557 if (!task_detached(p)) {
1558 int ret = wait_consider_task(wo, 0, p);
1559 if (ret)
1560 return ret;
1561 }
1562 } 1560 }
1563 1561
1564 return 0; 1562 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1415dc4598a..f88bd984df3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -939,9 +939,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
939 939
940static void rt_mutex_init_task(struct task_struct *p) 940static void rt_mutex_init_task(struct task_struct *p)
941{ 941{
942 spin_lock_init(&p->pi_lock); 942 raw_spin_lock_init(&p->pi_lock);
943#ifdef CONFIG_RT_MUTEXES 943#ifdef CONFIG_RT_MUTEXES
944 plist_head_init(&p->pi_waiters, &p->pi_lock); 944 plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
945 p->pi_blocked_on = NULL; 945 p->pi_blocked_on = NULL;
946#endif 946#endif
947} 947}
@@ -1127,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1127#ifdef CONFIG_DEBUG_MUTEXES 1127#ifdef CONFIG_DEBUG_MUTEXES
1128 p->blocked_on = NULL; /* not blocked yet */ 1128 p->blocked_on = NULL; /* not blocked yet */
1129#endif 1129#endif
1130#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1131 p->memcg_batch.do_batch = 0;
1132 p->memcg_batch.memcg = NULL;
1133#endif
1130 1134
1131 p->bts = NULL; 1135 p->bts = NULL;
1132 1136
@@ -1206,9 +1210,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1206 p->sas_ss_sp = p->sas_ss_size = 0; 1210 p->sas_ss_sp = p->sas_ss_size = 0;
1207 1211
1208 /* 1212 /*
1209 * Syscall tracing should be turned off in the child regardless 1213 * Syscall tracing and stepping should be turned off in the
1210 * of CLONE_PTRACE. 1214 * child regardless of CLONE_PTRACE.
1211 */ 1215 */
1216 user_disable_single_step(p);
1212 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 1217 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1213#ifdef TIF_SYSCALL_EMU 1218#ifdef TIF_SYSCALL_EMU
1214 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1219 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
@@ -1236,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1236 /* Need tasklist lock for parent etc handling! */ 1241 /* Need tasklist lock for parent etc handling! */
1237 write_lock_irq(&tasklist_lock); 1242 write_lock_irq(&tasklist_lock);
1238 1243
1239 /*
1240 * The task hasn't been attached yet, so its cpus_allowed mask will
1241 * not be changed, nor will its assigned CPU.
1242 *
1243 * The cpus_allowed mask of the parent may have changed after it was
1244 * copied first time - so re-copy it here, then check the child's CPU
1245 * to ensure it is on a valid CPU (and if not, just force it back to
1246 * parent's CPU). This avoids alot of nasty races.
1247 */
1248 p->cpus_allowed = current->cpus_allowed;
1249 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1250 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1251 !cpu_online(task_cpu(p))))
1252 set_task_cpu(p, smp_processor_id());
1253
1254 /* CLONE_PARENT re-uses the old parent */ 1244 /* CLONE_PARENT re-uses the old parent */
1255 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1245 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1256 p->real_parent = current->real_parent; 1246 p->real_parent = current->real_parent;
@@ -1286,7 +1276,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1286 } 1276 }
1287 1277
1288 if (likely(p->pid)) { 1278 if (likely(p->pid)) {
1289 list_add_tail(&p->sibling, &p->real_parent->children);
1290 tracehook_finish_clone(p, clone_flags, trace); 1279 tracehook_finish_clone(p, clone_flags, trace);
1291 1280
1292 if (thread_group_leader(p)) { 1281 if (thread_group_leader(p)) {
@@ -1298,6 +1287,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1298 p->signal->tty = tty_kref_get(current->signal->tty); 1287 p->signal->tty = tty_kref_get(current->signal->tty);
1299 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1288 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1300 attach_pid(p, PIDTYPE_SID, task_session(current)); 1289 attach_pid(p, PIDTYPE_SID, task_session(current));
1290 list_add_tail(&p->sibling, &p->real_parent->children);
1301 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1291 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1302 __get_cpu_var(process_counts)++; 1292 __get_cpu_var(process_counts)++;
1303 } 1293 }
diff --git a/kernel/futex.c b/kernel/futex.c
index fb65e822fc4..e7a35f1039e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
203 * @uaddr: virtual address of the futex 203 * @uaddr: virtual address of the futex
204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
205 * @key: address where result is stored. 205 * @key: address where result is stored.
206 * @rw: mapping needs to be read/write (values: VERIFY_READ,
207 * VERIFY_WRITE)
208 * 206 *
209 * Returns a negative error code or 0 207 * Returns a negative error code or 0
210 * The key words are stored in *key on success. 208 * The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
216 * lock_page() might sleep, the caller should not hold a spinlock. 214 * lock_page() might sleep, the caller should not hold a spinlock.
217 */ 215 */
218static int 216static int
219get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 217get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
220{ 218{
221 unsigned long address = (unsigned long)uaddr; 219 unsigned long address = (unsigned long)uaddr;
222 struct mm_struct *mm = current->mm; 220 struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
239 * but access_ok() should be faster than find_vma() 237 * but access_ok() should be faster than find_vma()
240 */ 238 */
241 if (!fshared) { 239 if (!fshared) {
242 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 240 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
243 return -EFAULT; 241 return -EFAULT;
244 key->private.mm = mm; 242 key->private.mm = mm;
245 key->private.address = address; 243 key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
248 } 246 }
249 247
250again: 248again:
251 err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); 249 err = get_user_pages_fast(address, 1, 1, &page);
252 if (err < 0) 250 if (err < 0)
253 return err; 251 return err;
254 252
@@ -304,8 +302,14 @@ void put_futex_key(int fshared, union futex_key *key)
304 */ 302 */
305static int fault_in_user_writeable(u32 __user *uaddr) 303static int fault_in_user_writeable(u32 __user *uaddr)
306{ 304{
307 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, 305 struct mm_struct *mm = current->mm;
308 1, 1, 0, NULL, NULL); 306 int ret;
307
308 down_read(&mm->mmap_sem);
309 ret = get_user_pages(current, mm, (unsigned long)uaddr,
310 1, 1, 0, NULL, NULL);
311 up_read(&mm->mmap_sem);
312
309 return ret < 0 ? ret : 0; 313 return ret < 0 ? ret : 0;
310} 314}
311 315
@@ -397,9 +401,9 @@ static void free_pi_state(struct futex_pi_state *pi_state)
397 * and has cleaned up the pi_state already 401 * and has cleaned up the pi_state already
398 */ 402 */
399 if (pi_state->owner) { 403 if (pi_state->owner) {
400 spin_lock_irq(&pi_state->owner->pi_lock); 404 raw_spin_lock_irq(&pi_state->owner->pi_lock);
401 list_del_init(&pi_state->list); 405 list_del_init(&pi_state->list);
402 spin_unlock_irq(&pi_state->owner->pi_lock); 406 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
403 407
404 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 408 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
405 } 409 }
@@ -464,18 +468,18 @@ void exit_pi_state_list(struct task_struct *curr)
464 * pi_state_list anymore, but we have to be careful 468 * pi_state_list anymore, but we have to be careful
465 * versus waiters unqueueing themselves: 469 * versus waiters unqueueing themselves:
466 */ 470 */
467 spin_lock_irq(&curr->pi_lock); 471 raw_spin_lock_irq(&curr->pi_lock);
468 while (!list_empty(head)) { 472 while (!list_empty(head)) {
469 473
470 next = head->next; 474 next = head->next;
471 pi_state = list_entry(next, struct futex_pi_state, list); 475 pi_state = list_entry(next, struct futex_pi_state, list);
472 key = pi_state->key; 476 key = pi_state->key;
473 hb = hash_futex(&key); 477 hb = hash_futex(&key);
474 spin_unlock_irq(&curr->pi_lock); 478 raw_spin_unlock_irq(&curr->pi_lock);
475 479
476 spin_lock(&hb->lock); 480 spin_lock(&hb->lock);
477 481
478 spin_lock_irq(&curr->pi_lock); 482 raw_spin_lock_irq(&curr->pi_lock);
479 /* 483 /*
480 * We dropped the pi-lock, so re-check whether this 484 * We dropped the pi-lock, so re-check whether this
481 * task still owns the PI-state: 485 * task still owns the PI-state:
@@ -489,15 +493,15 @@ void exit_pi_state_list(struct task_struct *curr)
489 WARN_ON(list_empty(&pi_state->list)); 493 WARN_ON(list_empty(&pi_state->list));
490 list_del_init(&pi_state->list); 494 list_del_init(&pi_state->list);
491 pi_state->owner = NULL; 495 pi_state->owner = NULL;
492 spin_unlock_irq(&curr->pi_lock); 496 raw_spin_unlock_irq(&curr->pi_lock);
493 497
494 rt_mutex_unlock(&pi_state->pi_mutex); 498 rt_mutex_unlock(&pi_state->pi_mutex);
495 499
496 spin_unlock(&hb->lock); 500 spin_unlock(&hb->lock);
497 501
498 spin_lock_irq(&curr->pi_lock); 502 raw_spin_lock_irq(&curr->pi_lock);
499 } 503 }
500 spin_unlock_irq(&curr->pi_lock); 504 raw_spin_unlock_irq(&curr->pi_lock);
501} 505}
502 506
503static int 507static int
@@ -526,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
526 return -EINVAL; 530 return -EINVAL;
527 531
528 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
529 WARN_ON(pid && pi_state->owner && 533
530 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
531 552
532 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
533 *ps = pi_state; 554 *ps = pi_state;
@@ -552,7 +573,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
552 * change of the task flags, we do this protected by 573 * change of the task flags, we do this protected by
553 * p->pi_lock: 574 * p->pi_lock:
554 */ 575 */
555 spin_lock_irq(&p->pi_lock); 576 raw_spin_lock_irq(&p->pi_lock);
556 if (unlikely(p->flags & PF_EXITING)) { 577 if (unlikely(p->flags & PF_EXITING)) {
557 /* 578 /*
558 * The task is on the way out. When PF_EXITPIDONE is 579 * The task is on the way out. When PF_EXITPIDONE is
@@ -561,7 +582,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
561 */ 582 */
562 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 583 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
563 584
564 spin_unlock_irq(&p->pi_lock); 585 raw_spin_unlock_irq(&p->pi_lock);
565 put_task_struct(p); 586 put_task_struct(p);
566 return ret; 587 return ret;
567 } 588 }
@@ -580,7 +601,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
580 WARN_ON(!list_empty(&pi_state->list)); 601 WARN_ON(!list_empty(&pi_state->list));
581 list_add(&pi_state->list, &p->pi_state_list); 602 list_add(&pi_state->list, &p->pi_state_list);
582 pi_state->owner = p; 603 pi_state->owner = p;
583 spin_unlock_irq(&p->pi_lock); 604 raw_spin_unlock_irq(&p->pi_lock);
584 605
585 put_task_struct(p); 606 put_task_struct(p);
586 607
@@ -754,7 +775,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
754 if (!pi_state) 775 if (!pi_state)
755 return -EINVAL; 776 return -EINVAL;
756 777
757 spin_lock(&pi_state->pi_mutex.wait_lock); 778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
758 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
759 787
760 /* 788 /*
@@ -783,23 +811,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
783 else if (curval != uval) 811 else if (curval != uval)
784 ret = -EINVAL; 812 ret = -EINVAL;
785 if (ret) { 813 if (ret) {
786 spin_unlock(&pi_state->pi_mutex.wait_lock); 814 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
787 return ret; 815 return ret;
788 } 816 }
789 } 817 }
790 818
791 spin_lock_irq(&pi_state->owner->pi_lock); 819 raw_spin_lock_irq(&pi_state->owner->pi_lock);
792 WARN_ON(list_empty(&pi_state->list)); 820 WARN_ON(list_empty(&pi_state->list));
793 list_del_init(&pi_state->list); 821 list_del_init(&pi_state->list);
794 spin_unlock_irq(&pi_state->owner->pi_lock); 822 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
795 823
796 spin_lock_irq(&new_owner->pi_lock); 824 raw_spin_lock_irq(&new_owner->pi_lock);
797 WARN_ON(!list_empty(&pi_state->list)); 825 WARN_ON(!list_empty(&pi_state->list));
798 list_add(&pi_state->list, &new_owner->pi_state_list); 826 list_add(&pi_state->list, &new_owner->pi_state_list);
799 pi_state->owner = new_owner; 827 pi_state->owner = new_owner;
800 spin_unlock_irq(&new_owner->pi_lock); 828 raw_spin_unlock_irq(&new_owner->pi_lock);
801 829
802 spin_unlock(&pi_state->pi_mutex.wait_lock); 830 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
803 rt_mutex_unlock(&pi_state->pi_mutex); 831 rt_mutex_unlock(&pi_state->pi_mutex);
804 832
805 return 0; 833 return 0;
@@ -861,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
861 if (!bitset) 889 if (!bitset)
862 return -EINVAL; 890 return -EINVAL;
863 891
864 ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); 892 ret = get_futex_key(uaddr, fshared, &key);
865 if (unlikely(ret != 0)) 893 if (unlikely(ret != 0))
866 goto out; 894 goto out;
867 895
@@ -907,10 +935,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
907 int ret, op_ret; 935 int ret, op_ret;
908 936
909retry: 937retry:
910 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 938 ret = get_futex_key(uaddr1, fshared, &key1);
911 if (unlikely(ret != 0)) 939 if (unlikely(ret != 0))
912 goto out; 940 goto out;
913 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 941 ret = get_futex_key(uaddr2, fshared, &key2);
914 if (unlikely(ret != 0)) 942 if (unlikely(ret != 0))
915 goto out_put_key1; 943 goto out_put_key1;
916 944
@@ -1004,7 +1032,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1004 plist_add(&q->list, &hb2->chain); 1032 plist_add(&q->list, &hb2->chain);
1005 q->lock_ptr = &hb2->lock; 1033 q->lock_ptr = &hb2->lock;
1006#ifdef CONFIG_DEBUG_PI_LIST 1034#ifdef CONFIG_DEBUG_PI_LIST
1007 q->list.plist.lock = &hb2->lock; 1035 q->list.plist.spinlock = &hb2->lock;
1008#endif 1036#endif
1009 } 1037 }
1010 get_futex_key_refs(key2); 1038 get_futex_key_refs(key2);
@@ -1040,7 +1068,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1040 1068
1041 q->lock_ptr = &hb->lock; 1069 q->lock_ptr = &hb->lock;
1042#ifdef CONFIG_DEBUG_PI_LIST 1070#ifdef CONFIG_DEBUG_PI_LIST
1043 q->list.plist.lock = &hb->lock; 1071 q->list.plist.spinlock = &hb->lock;
1044#endif 1072#endif
1045 1073
1046 wake_up_state(q->task, TASK_NORMAL); 1074 wake_up_state(q->task, TASK_NORMAL);
@@ -1169,11 +1197,10 @@ retry:
1169 pi_state = NULL; 1197 pi_state = NULL;
1170 } 1198 }
1171 1199
1172 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1200 ret = get_futex_key(uaddr1, fshared, &key1);
1173 if (unlikely(ret != 0)) 1201 if (unlikely(ret != 0))
1174 goto out; 1202 goto out;
1175 ret = get_futex_key(uaddr2, fshared, &key2, 1203 ret = get_futex_key(uaddr2, fshared, &key2);
1176 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1177 if (unlikely(ret != 0)) 1204 if (unlikely(ret != 0))
1178 goto out_put_key1; 1205 goto out_put_key1;
1179 1206
@@ -1388,7 +1415,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1388 1415
1389 plist_node_init(&q->list, prio); 1416 plist_node_init(&q->list, prio);
1390#ifdef CONFIG_DEBUG_PI_LIST 1417#ifdef CONFIG_DEBUG_PI_LIST
1391 q->list.plist.lock = &hb->lock; 1418 q->list.plist.spinlock = &hb->lock;
1392#endif 1419#endif
1393 plist_add(&q->list, &hb->chain); 1420 plist_add(&q->list, &hb->chain);
1394 q->task = current; 1421 q->task = current;
@@ -1523,18 +1550,18 @@ retry:
1523 * itself. 1550 * itself.
1524 */ 1551 */
1525 if (pi_state->owner != NULL) { 1552 if (pi_state->owner != NULL) {
1526 spin_lock_irq(&pi_state->owner->pi_lock); 1553 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1527 WARN_ON(list_empty(&pi_state->list)); 1554 WARN_ON(list_empty(&pi_state->list));
1528 list_del_init(&pi_state->list); 1555 list_del_init(&pi_state->list);
1529 spin_unlock_irq(&pi_state->owner->pi_lock); 1556 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1530 } 1557 }
1531 1558
1532 pi_state->owner = newowner; 1559 pi_state->owner = newowner;
1533 1560
1534 spin_lock_irq(&newowner->pi_lock); 1561 raw_spin_lock_irq(&newowner->pi_lock);
1535 WARN_ON(!list_empty(&pi_state->list)); 1562 WARN_ON(!list_empty(&pi_state->list));
1536 list_add(&pi_state->list, &newowner->pi_state_list); 1563 list_add(&pi_state->list, &newowner->pi_state_list);
1537 spin_unlock_irq(&newowner->pi_lock); 1564 raw_spin_unlock_irq(&newowner->pi_lock);
1538 return 0; 1565 return 0;
1539 1566
1540 /* 1567 /*
@@ -1732,7 +1759,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1732 */ 1759 */
1733retry: 1760retry:
1734 q->key = FUTEX_KEY_INIT; 1761 q->key = FUTEX_KEY_INIT;
1735 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); 1762 ret = get_futex_key(uaddr, fshared, &q->key);
1736 if (unlikely(ret != 0)) 1763 if (unlikely(ret != 0))
1737 return ret; 1764 return ret;
1738 1765
@@ -1898,7 +1925,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1898 q.requeue_pi_key = NULL; 1925 q.requeue_pi_key = NULL;
1899retry: 1926retry:
1900 q.key = FUTEX_KEY_INIT; 1927 q.key = FUTEX_KEY_INIT;
1901 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1928 ret = get_futex_key(uaddr, fshared, &q.key);
1902 if (unlikely(ret != 0)) 1929 if (unlikely(ret != 0))
1903 goto out; 1930 goto out;
1904 1931
@@ -1968,7 +1995,7 @@ retry_private:
1968 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1969 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1970 1997
1971 goto out; 1998 goto out_put_key;
1972 1999
1973out_unlock_put_key: 2000out_unlock_put_key:
1974 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
@@ -2017,7 +2044,7 @@ retry:
2017 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2044 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2018 return -EPERM; 2045 return -EPERM;
2019 2046
2020 ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); 2047 ret = get_futex_key(uaddr, fshared, &key);
2021 if (unlikely(ret != 0)) 2048 if (unlikely(ret != 0))
2022 goto out; 2049 goto out;
2023 2050
@@ -2209,7 +2236,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2209 rt_waiter.task = NULL; 2236 rt_waiter.task = NULL;
2210 2237
2211 key2 = FUTEX_KEY_INIT; 2238 key2 = FUTEX_KEY_INIT;
2212 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2239 ret = get_futex_key(uaddr2, fshared, &key2);
2213 if (unlikely(ret != 0)) 2240 if (unlikely(ret != 0))
2214 goto out; 2241 goto out;
2215 2242
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3e1c36e7998..0086628b6e9 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -127,11 +127,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
127 for (;;) { 127 for (;;) {
128 base = timer->base; 128 base = timer->base;
129 if (likely(base != NULL)) { 129 if (likely(base != NULL)) {
130 spin_lock_irqsave(&base->cpu_base->lock, *flags); 130 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
131 if (likely(base == timer->base)) 131 if (likely(base == timer->base))
132 return base; 132 return base;
133 /* The timer has migrated to another CPU: */ 133 /* The timer has migrated to another CPU: */
134 spin_unlock_irqrestore(&base->cpu_base->lock, *flags); 134 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
135 } 135 }
136 cpu_relax(); 136 cpu_relax();
137 } 137 }
@@ -208,13 +208,13 @@ again:
208 208
209 /* See the comment in lock_timer_base() */ 209 /* See the comment in lock_timer_base() */
210 timer->base = NULL; 210 timer->base = NULL;
211 spin_unlock(&base->cpu_base->lock); 211 raw_spin_unlock(&base->cpu_base->lock);
212 spin_lock(&new_base->cpu_base->lock); 212 raw_spin_lock(&new_base->cpu_base->lock);
213 213
214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { 214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
215 cpu = this_cpu; 215 cpu = this_cpu;
216 spin_unlock(&new_base->cpu_base->lock); 216 raw_spin_unlock(&new_base->cpu_base->lock);
217 spin_lock(&base->cpu_base->lock); 217 raw_spin_lock(&base->cpu_base->lock);
218 timer->base = base; 218 timer->base = base;
219 goto again; 219 goto again;
220 } 220 }
@@ -230,7 +230,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
230{ 230{
231 struct hrtimer_clock_base *base = timer->base; 231 struct hrtimer_clock_base *base = timer->base;
232 232
233 spin_lock_irqsave(&base->cpu_base->lock, *flags); 233 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
234 234
235 return base; 235 return base;
236} 236}
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
557static int hrtimer_reprogram(struct hrtimer *timer, 557static int hrtimer_reprogram(struct hrtimer *timer,
558 struct hrtimer_clock_base *base) 558 struct hrtimer_clock_base *base)
559{ 559{
560 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 560 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
562 int res; 562 int res;
563 563
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
582 if (expires.tv64 < 0) 582 if (expires.tv64 < 0)
583 return -ETIME; 583 return -ETIME;
584 584
585 if (expires.tv64 >= expires_next->tv64) 585 if (expires.tv64 >= cpu_base->expires_next.tv64)
586 return 0;
587
588 /*
589 * If a hang was detected in the last timer interrupt then we
590 * do not schedule a timer which is earlier than the expiry
591 * which we enforced in the hang detection. We want the system
592 * to make progress.
593 */
594 if (cpu_base->hang_detected)
586 return 0; 595 return 0;
587 596
588 /* 597 /*
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
590 */ 599 */
591 res = tick_program_event(expires, 0); 600 res = tick_program_event(expires, 0);
592 if (!IS_ERR_VALUE(res)) 601 if (!IS_ERR_VALUE(res))
593 *expires_next = expires; 602 cpu_base->expires_next = expires;
594 return res; 603 return res;
595} 604}
596 605
@@ -619,12 +628,12 @@ static void retrigger_next_event(void *arg)
619 base = &__get_cpu_var(hrtimer_bases); 628 base = &__get_cpu_var(hrtimer_bases);
620 629
621 /* Adjust CLOCK_REALTIME offset */ 630 /* Adjust CLOCK_REALTIME offset */
622 spin_lock(&base->lock); 631 raw_spin_lock(&base->lock);
623 base->clock_base[CLOCK_REALTIME].offset = 632 base->clock_base[CLOCK_REALTIME].offset =
624 timespec_to_ktime(realtime_offset); 633 timespec_to_ktime(realtime_offset);
625 634
626 hrtimer_force_reprogram(base, 0); 635 hrtimer_force_reprogram(base, 0);
627 spin_unlock(&base->lock); 636 raw_spin_unlock(&base->lock);
628} 637}
629 638
630/* 639/*
@@ -685,9 +694,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
685{ 694{
686 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 695 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
687 if (wakeup) { 696 if (wakeup) {
688 spin_unlock(&base->cpu_base->lock); 697 raw_spin_unlock(&base->cpu_base->lock);
689 raise_softirq_irqoff(HRTIMER_SOFTIRQ); 698 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
690 spin_lock(&base->cpu_base->lock); 699 raw_spin_lock(&base->cpu_base->lock);
691 } else 700 } else
692 __raise_softirq_irqoff(HRTIMER_SOFTIRQ); 701 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
693 702
@@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
747 756
748#endif /* CONFIG_HIGH_RES_TIMERS */ 757#endif /* CONFIG_HIGH_RES_TIMERS */
749 758
750#ifdef CONFIG_TIMER_STATS 759static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
751void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
752{ 760{
761#ifdef CONFIG_TIMER_STATS
753 if (timer->start_site) 762 if (timer->start_site)
754 return; 763 return;
755 764 timer->start_site = __builtin_return_address(0);
756 timer->start_site = addr;
757 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 765 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
758 timer->start_pid = current->pid; 766 timer->start_pid = current->pid;
767#endif
759} 768}
769
770static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
771{
772#ifdef CONFIG_TIMER_STATS
773 timer->start_site = NULL;
774#endif
775}
776
777static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
778{
779#ifdef CONFIG_TIMER_STATS
780 if (likely(!timer_stats_active))
781 return;
782 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
783 timer->function, timer->start_comm, 0);
760#endif 784#endif
785}
761 786
762/* 787/*
763 * Counterpart to lock_hrtimer_base above: 788 * Counterpart to lock_hrtimer_base above:
@@ -765,7 +790,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
765static inline 790static inline
766void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 791void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
767{ 792{
768 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); 793 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
769} 794}
770 795
771/** 796/**
@@ -1098,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void)
1098 unsigned long flags; 1123 unsigned long flags;
1099 int i; 1124 int i;
1100 1125
1101 spin_lock_irqsave(&cpu_base->lock, flags); 1126 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1102 1127
1103 if (!hrtimer_hres_active()) { 1128 if (!hrtimer_hres_active()) {
1104 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
@@ -1115,7 +1140,7 @@ ktime_t hrtimer_get_next_event(void)
1115 } 1140 }
1116 } 1141 }
1117 1142
1118 spin_unlock_irqrestore(&cpu_base->lock, flags); 1143 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1119 1144
1120 if (mindelta.tv64 < 0) 1145 if (mindelta.tv64 < 0)
1121 mindelta.tv64 = 0; 1146 mindelta.tv64 = 0;
@@ -1197,11 +1222,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1197 * they get migrated to another cpu, therefore its safe to unlock 1222 * they get migrated to another cpu, therefore its safe to unlock
1198 * the timer base. 1223 * the timer base.
1199 */ 1224 */
1200 spin_unlock(&cpu_base->lock); 1225 raw_spin_unlock(&cpu_base->lock);
1201 trace_hrtimer_expire_entry(timer, now); 1226 trace_hrtimer_expire_entry(timer, now);
1202 restart = fn(timer); 1227 restart = fn(timer);
1203 trace_hrtimer_expire_exit(timer); 1228 trace_hrtimer_expire_exit(timer);
1204 spin_lock(&cpu_base->lock); 1229 raw_spin_lock(&cpu_base->lock);
1205 1230
1206 /* 1231 /*
1207 * Note: We clear the CALLBACK bit after enqueue_hrtimer and 1232 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
@@ -1217,29 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1217 1242
1218#ifdef CONFIG_HIGH_RES_TIMERS 1243#ifdef CONFIG_HIGH_RES_TIMERS
1219 1244
1220static int force_clock_reprogram;
1221
1222/*
1223 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1224 * is hanging, which could happen with something that slows the interrupt
1225 * such as the tracing. Then we force the clock reprogramming for each future
1226 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1227 * threshold that we will overwrite.
1228 * The next tick event will be scheduled to 3 times we currently spend on
1229 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1230 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1231 * let it running without serious starvation.
1232 */
1233
1234static inline void
1235hrtimer_interrupt_hanging(struct clock_event_device *dev,
1236 ktime_t try_time)
1237{
1238 force_clock_reprogram = 1;
1239 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1240 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1241 "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
1242}
1243/* 1245/*
1244 * High resolution timer interrupt 1246 * High resolution timer interrupt
1245 * Called with interrupts disabled 1247 * Called with interrupts disabled
@@ -1248,24 +1250,18 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1248{ 1250{
1249 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1251 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1250 struct hrtimer_clock_base *base; 1252 struct hrtimer_clock_base *base;
1251 ktime_t expires_next, now; 1253 ktime_t expires_next, now, entry_time, delta;
1252 int nr_retries = 0; 1254 int i, retries = 0;
1253 int i;
1254 1255
1255 BUG_ON(!cpu_base->hres_active); 1256 BUG_ON(!cpu_base->hres_active);
1256 cpu_base->nr_events++; 1257 cpu_base->nr_events++;
1257 dev->next_event.tv64 = KTIME_MAX; 1258 dev->next_event.tv64 = KTIME_MAX;
1258 1259
1259 retry: 1260 entry_time = now = ktime_get();
1260 /* 5 retries is enough to notice a hang */ 1261retry:
1261 if (!(++nr_retries % 5))
1262 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1263
1264 now = ktime_get();
1265
1266 expires_next.tv64 = KTIME_MAX; 1262 expires_next.tv64 = KTIME_MAX;
1267 1263
1268 spin_lock(&cpu_base->lock); 1264 raw_spin_lock(&cpu_base->lock);
1269 /* 1265 /*
1270 * We set expires_next to KTIME_MAX here with cpu_base->lock 1266 * We set expires_next to KTIME_MAX here with cpu_base->lock
1271 * held to prevent that a timer is enqueued in our queue via 1267 * held to prevent that a timer is enqueued in our queue via
@@ -1321,13 +1317,51 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1321 * against it. 1317 * against it.
1322 */ 1318 */
1323 cpu_base->expires_next = expires_next; 1319 cpu_base->expires_next = expires_next;
1324 spin_unlock(&cpu_base->lock); 1320 raw_spin_unlock(&cpu_base->lock);
1325 1321
1326 /* Reprogramming necessary ? */ 1322 /* Reprogramming necessary ? */
1327 if (expires_next.tv64 != KTIME_MAX) { 1323 if (expires_next.tv64 == KTIME_MAX ||
1328 if (tick_program_event(expires_next, force_clock_reprogram)) 1324 !tick_program_event(expires_next, 0)) {
1329 goto retry; 1325 cpu_base->hang_detected = 0;
1326 return;
1330 } 1327 }
1328
1329 /*
1330 * The next timer was already expired due to:
1331 * - tracing
1332 * - long lasting callbacks
1333 * - being scheduled away when running in a VM
1334 *
1335 * We need to prevent that we loop forever in the hrtimer
1336 * interrupt routine. We give it 3 attempts to avoid
1337 * overreacting on some spurious event.
1338 */
1339 now = ktime_get();
1340 cpu_base->nr_retries++;
1341 if (++retries < 3)
1342 goto retry;
1343 /*
1344 * Give the system a chance to do something else than looping
1345 * here. We stored the entry time, so we know exactly how long
1346 * we spent here. We schedule the next event this amount of
1347 * time away.
1348 */
1349 cpu_base->nr_hangs++;
1350 cpu_base->hang_detected = 1;
1351 delta = ktime_sub(now, entry_time);
1352 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1353 cpu_base->max_hang_time = delta;
1354 /*
1355 * Limit it to a sensible value as we enforce a longer
1356 * delay. Give the CPU at least 100ms to catch up.
1357 */
1358 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1359 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1360 else
1361 expires_next = ktime_add(now, delta);
1362 tick_program_event(expires_next, 1);
1363 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1364 ktime_to_ns(delta));
1331} 1365}
1332 1366
1333/* 1367/*
@@ -1423,7 +1457,7 @@ void hrtimer_run_queues(void)
1423 gettime = 0; 1457 gettime = 0;
1424 } 1458 }
1425 1459
1426 spin_lock(&cpu_base->lock); 1460 raw_spin_lock(&cpu_base->lock);
1427 1461
1428 while ((node = base->first)) { 1462 while ((node = base->first)) {
1429 struct hrtimer *timer; 1463 struct hrtimer *timer;
@@ -1435,7 +1469,7 @@ void hrtimer_run_queues(void)
1435 1469
1436 __run_hrtimer(timer, &base->softirq_time); 1470 __run_hrtimer(timer, &base->softirq_time);
1437 } 1471 }
1438 spin_unlock(&cpu_base->lock); 1472 raw_spin_unlock(&cpu_base->lock);
1439 } 1473 }
1440} 1474}
1441 1475
@@ -1591,7 +1625,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1591 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1625 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1592 int i; 1626 int i;
1593 1627
1594 spin_lock_init(&cpu_base->lock); 1628 raw_spin_lock_init(&cpu_base->lock);
1595 1629
1596 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1630 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1597 cpu_base->clock_base[i].cpu_base = cpu_base; 1631 cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1649,16 +1683,16 @@ static void migrate_hrtimers(int scpu)
1649 * The caller is globally serialized and nobody else 1683 * The caller is globally serialized and nobody else
1650 * takes two locks at once, deadlock is not possible. 1684 * takes two locks at once, deadlock is not possible.
1651 */ 1685 */
1652 spin_lock(&new_base->lock); 1686 raw_spin_lock(&new_base->lock);
1653 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1687 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1654 1688
1655 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1689 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1656 migrate_hrtimer_list(&old_base->clock_base[i], 1690 migrate_hrtimer_list(&old_base->clock_base[i],
1657 &new_base->clock_base[i]); 1691 &new_base->clock_base[i]);
1658 } 1692 }
1659 1693
1660 spin_unlock(&old_base->lock); 1694 raw_spin_unlock(&old_base->lock);
1661 spin_unlock(&new_base->lock); 1695 raw_spin_unlock(&new_base->lock);
1662 1696
1663 /* Check, if we got expired work to do */ 1697 /* Check, if we got expired work to do */
1664 __hrtimer_peek_ahead_timers(); 1698 __hrtimer_peek_ahead_timers();
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index cf5ee162841..967e66143e1 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,6 +40,7 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/cpu.h>
43#include <linux/smp.h> 44#include <linux/smp.h>
44 45
45#include <linux/hw_breakpoint.h> 46#include <linux/hw_breakpoint.h>
@@ -52,7 +53,7 @@
52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
53 54
54/* Number of pinned task breakpoints in a cpu */ 55/* Number of pinned task breakpoints in a cpu */
55static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); 56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
56 57
57/* Number of non-pinned cpu/task breakpoints in a cpu */ 58/* Number of non-pinned cpu/task breakpoints in a cpu */
58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
@@ -73,7 +74,7 @@ static DEFINE_MUTEX(nr_bp_mutex);
73static unsigned int max_task_bp_pinned(int cpu) 74static unsigned int max_task_bp_pinned(int cpu)
74{ 75{
75 int i; 76 int i;
76 unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); 77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
77 78
78 for (i = HBP_NUM -1; i >= 0; i--) { 79 for (i = HBP_NUM -1; i >= 0; i--) {
79 if (tsk_pinned[i] > 0) 80 if (tsk_pinned[i] > 0)
@@ -83,15 +84,51 @@ static unsigned int max_task_bp_pinned(int cpu)
83 return 0; 84 return 0;
84} 85}
85 86
87static int task_bp_pinned(struct task_struct *tsk)
88{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list;
91 struct perf_event *bp;
92 unsigned long flags;
93 int count = 0;
94
95 if (WARN_ONCE(!ctx, "No perf context for this task"))
96 return 0;
97
98 list = &ctx->event_list;
99
100 raw_spin_lock_irqsave(&ctx->lock, flags);
101
102 /*
103 * The current breakpoint counter is not included in the list
104 * at the open() callback time
105 */
106 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++;
109 }
110
111 raw_spin_unlock_irqrestore(&ctx->lock, flags);
112
113 return count;
114}
115
86/* 116/*
87 * Report the number of pinned/un-pinned breakpoints we have in 117 * Report the number of pinned/un-pinned breakpoints we have in
88 * a given cpu (cpu > -1) or in all of them (cpu = -1). 118 * a given cpu (cpu > -1) or in all of them (cpu = -1).
89 */ 119 */
90static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) 120static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
91{ 122{
123 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task;
125
92 if (cpu >= 0) { 126 if (cpu >= 0) {
93 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
94 slots->pinned += max_task_bp_pinned(cpu); 128 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu);
130 else
131 slots->pinned += task_bp_pinned(tsk);
95 slots->flexible = per_cpu(nr_bp_flexible, cpu); 132 slots->flexible = per_cpu(nr_bp_flexible, cpu);
96 133
97 return; 134 return;
@@ -101,7 +138,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
101 unsigned int nr; 138 unsigned int nr;
102 139
103 nr = per_cpu(nr_cpu_bp_pinned, cpu); 140 nr = per_cpu(nr_cpu_bp_pinned, cpu);
104 nr += max_task_bp_pinned(cpu); 141 if (!tsk)
142 nr += max_task_bp_pinned(cpu);
143 else
144 nr += task_bp_pinned(tsk);
105 145
106 if (nr > slots->pinned) 146 if (nr > slots->pinned)
107 slots->pinned = nr; 147 slots->pinned = nr;
@@ -118,35 +158,12 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
118 */ 158 */
119static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
120{ 160{
121 int count = 0;
122 struct perf_event *bp;
123 struct perf_event_context *ctx = tsk->perf_event_ctxp;
124 unsigned int *tsk_pinned; 161 unsigned int *tsk_pinned;
125 struct list_head *list; 162 int count = 0;
126 unsigned long flags;
127
128 if (WARN_ONCE(!ctx, "No perf context for this task"))
129 return;
130
131 list = &ctx->event_list;
132
133 spin_lock_irqsave(&ctx->lock, flags);
134
135 /*
136 * The current breakpoint counter is not included in the list
137 * at the open() callback time
138 */
139 list_for_each_entry(bp, list, event_entry) {
140 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
141 count++;
142 }
143 163
144 spin_unlock_irqrestore(&ctx->lock, flags); 164 count = task_bp_pinned(tsk);
145 165
146 if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) 166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
147 return;
148
149 tsk_pinned = per_cpu(task_bp_pinned, cpu);
150 if (enable) { 167 if (enable) {
151 tsk_pinned[count]++; 168 tsk_pinned[count]++;
152 if (count > 0) 169 if (count > 0)
@@ -193,7 +210,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
193 * - If attached to a single cpu, check: 210 * - If attached to a single cpu, check:
194 * 211 *
195 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) 212 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
196 * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM 213 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
197 * 214 *
198 * -> If there are already non-pinned counters in this cpu, it means 215 * -> If there are already non-pinned counters in this cpu, it means
199 * there is already a free slot for them. 216 * there is already a free slot for them.
@@ -204,7 +221,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
204 * - If attached to every cpus, check: 221 * - If attached to every cpus, check:
205 * 222 *
206 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) 223 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
207 * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM 224 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
208 * 225 *
209 * -> This is roughly the same, except we check the number of per cpu 226 * -> This is roughly the same, except we check the number of per cpu
210 * bp for every cpu and we keep the max one. Same for the per tasks 227 * bp for every cpu and we keep the max one. Same for the per tasks
@@ -216,7 +233,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
216 * - If attached to a single cpu, check: 233 * - If attached to a single cpu, check:
217 * 234 *
218 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) 235 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
219 * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM 236 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
220 * 237 *
221 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep 238 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
222 * one register at least (or they will never be fed). 239 * one register at least (or they will never be fed).
@@ -224,42 +241,74 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
224 * - If attached to every cpus, check: 241 * - If attached to every cpus, check:
225 * 242 *
226 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
227 * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM 244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
228 */ 245 */
229int reserve_bp_slot(struct perf_event *bp) 246static int __reserve_bp_slot(struct perf_event *bp)
230{ 247{
231 struct bp_busy_slots slots = {0}; 248 struct bp_busy_slots slots = {0};
232 int ret = 0;
233
234 mutex_lock(&nr_bp_mutex);
235 249
236 fetch_bp_busy_slots(&slots, bp->cpu); 250 fetch_bp_busy_slots(&slots, bp);
237 251
238 /* Flexible counters need to keep at least one slot */ 252 /* Flexible counters need to keep at least one slot */
239 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 253 if (slots.pinned + (!!slots.flexible) == HBP_NUM)
240 ret = -ENOSPC; 254 return -ENOSPC;
241 goto end;
242 }
243 255
244 toggle_bp_slot(bp, true); 256 toggle_bp_slot(bp, true);
245 257
246end: 258 return 0;
259}
260
261int reserve_bp_slot(struct perf_event *bp)
262{
263 int ret;
264
265 mutex_lock(&nr_bp_mutex);
266
267 ret = __reserve_bp_slot(bp);
268
247 mutex_unlock(&nr_bp_mutex); 269 mutex_unlock(&nr_bp_mutex);
248 270
249 return ret; 271 return ret;
250} 272}
251 273
274static void __release_bp_slot(struct perf_event *bp)
275{
276 toggle_bp_slot(bp, false);
277}
278
252void release_bp_slot(struct perf_event *bp) 279void release_bp_slot(struct perf_event *bp)
253{ 280{
254 mutex_lock(&nr_bp_mutex); 281 mutex_lock(&nr_bp_mutex);
255 282
256 toggle_bp_slot(bp, false); 283 __release_bp_slot(bp);
257 284
258 mutex_unlock(&nr_bp_mutex); 285 mutex_unlock(&nr_bp_mutex);
259} 286}
260 287
288/*
289 * Allow the kernel debugger to reserve breakpoint slots without
290 * taking a lock using the dbg_* variant of for the reserve and
291 * release breakpoint slots.
292 */
293int dbg_reserve_bp_slot(struct perf_event *bp)
294{
295 if (mutex_is_locked(&nr_bp_mutex))
296 return -1;
297
298 return __reserve_bp_slot(bp);
299}
300
301int dbg_release_bp_slot(struct perf_event *bp)
302{
303 if (mutex_is_locked(&nr_bp_mutex))
304 return -1;
305
306 __release_bp_slot(bp);
261 307
262int __register_perf_hw_breakpoint(struct perf_event *bp) 308 return 0;
309}
310
311int register_perf_hw_breakpoint(struct perf_event *bp)
263{ 312{
264 int ret; 313 int ret;
265 314
@@ -276,17 +325,14 @@ int __register_perf_hw_breakpoint(struct perf_event *bp)
276 * This is a quick hack that will be removed soon, once we remove 325 * This is a quick hack that will be removed soon, once we remove
277 * the tmp breakpoints from ptrace 326 * the tmp breakpoints from ptrace
278 */ 327 */
279 if (!bp->attr.disabled || bp->callback == perf_bp_event) 328 if (!bp->attr.disabled || !bp->overflow_handler)
280 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
281 330
282 return ret; 331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
283} 332 if (ret)
284 333 release_bp_slot(bp);
285int register_perf_hw_breakpoint(struct perf_event *bp)
286{
287 bp->callback = perf_bp_event;
288 334
289 return __register_perf_hw_breakpoint(bp); 335 return ret;
290} 336}
291 337
292/** 338/**
@@ -297,7 +343,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
297 */ 343 */
298struct perf_event * 344struct perf_event *
299register_user_hw_breakpoint(struct perf_event_attr *attr, 345register_user_hw_breakpoint(struct perf_event_attr *attr,
300 perf_callback_t triggered, 346 perf_overflow_handler_t triggered,
301 struct task_struct *tsk) 347 struct task_struct *tsk)
302{ 348{
303 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 349 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
@@ -311,19 +357,40 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
311 * @triggered: callback to trigger when we hit the breakpoint 357 * @triggered: callback to trigger when we hit the breakpoint
312 * @tsk: pointer to 'task_struct' of the process to which the address belongs 358 * @tsk: pointer to 'task_struct' of the process to which the address belongs
313 */ 359 */
314struct perf_event * 360int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
315modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
316 perf_callback_t triggered,
317 struct task_struct *tsk)
318{ 361{
319 /* 362 u64 old_addr = bp->attr.bp_addr;
320 * FIXME: do it without unregistering 363 u64 old_len = bp->attr.bp_len;
321 * - We don't want to lose our slot 364 int old_type = bp->attr.bp_type;
322 * - If the new bp is incorrect, don't lose the older one 365 int err = 0;
323 */
324 unregister_hw_breakpoint(bp);
325 366
326 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 367 perf_event_disable(bp);
368
369 bp->attr.bp_addr = attr->bp_addr;
370 bp->attr.bp_type = attr->bp_type;
371 bp->attr.bp_len = attr->bp_len;
372
373 if (attr->disabled)
374 goto end;
375
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
377 if (!err)
378 perf_event_enable(bp);
379
380 if (err) {
381 bp->attr.bp_addr = old_addr;
382 bp->attr.bp_type = old_type;
383 bp->attr.bp_len = old_len;
384 if (!bp->attr.disabled)
385 perf_event_enable(bp);
386
387 return err;
388 }
389
390end:
391 bp->attr.disabled = attr->disabled;
392
393 return 0;
327} 394}
328EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); 395EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
329 396
@@ -348,7 +415,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
348 */ 415 */
349struct perf_event ** 416struct perf_event **
350register_wide_hw_breakpoint(struct perf_event_attr *attr, 417register_wide_hw_breakpoint(struct perf_event_attr *attr,
351 perf_callback_t triggered) 418 perf_overflow_handler_t triggered)
352{ 419{
353 struct perf_event **cpu_events, **pevent, *bp; 420 struct perf_event **cpu_events, **pevent, *bp;
354 long err; 421 long err;
@@ -358,7 +425,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
358 if (!cpu_events) 425 if (!cpu_events)
359 return ERR_PTR(-ENOMEM); 426 return ERR_PTR(-ENOMEM);
360 427
361 for_each_possible_cpu(cpu) { 428 get_online_cpus();
429 for_each_online_cpu(cpu) {
362 pevent = per_cpu_ptr(cpu_events, cpu); 430 pevent = per_cpu_ptr(cpu_events, cpu);
363 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 431 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
364 432
@@ -369,18 +437,20 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
369 goto fail; 437 goto fail;
370 } 438 }
371 } 439 }
440 put_online_cpus();
372 441
373 return cpu_events; 442 return cpu_events;
374 443
375fail: 444fail:
376 for_each_possible_cpu(cpu) { 445 for_each_online_cpu(cpu) {
377 pevent = per_cpu_ptr(cpu_events, cpu); 446 pevent = per_cpu_ptr(cpu_events, cpu);
378 if (IS_ERR(*pevent)) 447 if (IS_ERR(*pevent))
379 break; 448 break;
380 unregister_hw_breakpoint(*pevent); 449 unregister_hw_breakpoint(*pevent);
381 } 450 }
451 put_online_cpus();
452
382 free_percpu(cpu_events); 453 free_percpu(cpu_events);
383 /* return the error if any */
384 return ERR_PTR(err); 454 return ERR_PTR(err);
385} 455}
386EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 1de9700f416..2295a31ef11 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -45,7 +45,7 @@ unsigned long probe_irq_on(void)
45 * flush such a longstanding irq before considering it as spurious. 45 * flush such a longstanding irq before considering it as spurious.
46 */ 46 */
47 for_each_irq_desc_reverse(i, desc) { 47 for_each_irq_desc_reverse(i, desc) {
48 spin_lock_irq(&desc->lock); 48 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
50 /* 50 /*
51 * An old-style architecture might still have 51 * An old-style architecture might still have
@@ -61,7 +61,7 @@ unsigned long probe_irq_on(void)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->chip->set_type(i, IRQ_TYPE_PROBE);
62 desc->chip->startup(i); 62 desc->chip->startup(i);
63 } 63 }
64 spin_unlock_irq(&desc->lock); 64 raw_spin_unlock_irq(&desc->lock);
65 } 65 }
66 66
67 /* Wait for longstanding interrupts to trigger. */ 67 /* Wait for longstanding interrupts to trigger. */
@@ -73,13 +73,13 @@ unsigned long probe_irq_on(void)
73 * happened in the previous stage, it may have masked itself) 73 * happened in the previous stage, it may have masked itself)
74 */ 74 */
75 for_each_irq_desc_reverse(i, desc) { 75 for_each_irq_desc_reverse(i, desc) {
76 spin_lock_irq(&desc->lock); 76 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 79 if (desc->chip->startup(i))
80 desc->status |= IRQ_PENDING; 80 desc->status |= IRQ_PENDING;
81 } 81 }
82 spin_unlock_irq(&desc->lock); 82 raw_spin_unlock_irq(&desc->lock);
83 } 83 }
84 84
85 /* 85 /*
@@ -91,7 +91,7 @@ unsigned long probe_irq_on(void)
91 * Now filter out any obviously spurious interrupts 91 * Now filter out any obviously spurious interrupts
92 */ 92 */
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 spin_lock_irq(&desc->lock); 94 raw_spin_lock_irq(&desc->lock);
95 status = desc->status; 95 status = desc->status;
96 96
97 if (status & IRQ_AUTODETECT) { 97 if (status & IRQ_AUTODETECT) {
@@ -103,7 +103,7 @@ unsigned long probe_irq_on(void)
103 if (i < 32) 103 if (i < 32)
104 mask |= 1 << i; 104 mask |= 1 << i;
105 } 105 }
106 spin_unlock_irq(&desc->lock); 106 raw_spin_unlock_irq(&desc->lock);
107 } 107 }
108 108
109 return mask; 109 return mask;
@@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val)
129 int i; 129 int i;
130 130
131 for_each_irq_desc(i, desc) { 131 for_each_irq_desc(i, desc) {
132 spin_lock_irq(&desc->lock); 132 raw_spin_lock_irq(&desc->lock);
133 status = desc->status; 133 status = desc->status;
134 134
135 if (status & IRQ_AUTODETECT) { 135 if (status & IRQ_AUTODETECT) {
@@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val)
139 desc->status = status & ~IRQ_AUTODETECT; 139 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 140 desc->chip->shutdown(i);
141 } 141 }
142 spin_unlock_irq(&desc->lock); 142 raw_spin_unlock_irq(&desc->lock);
143 } 143 }
144 mutex_unlock(&probing_active); 144 mutex_unlock(&probing_active);
145 145
@@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val)
171 unsigned int status; 171 unsigned int status;
172 172
173 for_each_irq_desc(i, desc) { 173 for_each_irq_desc(i, desc) {
174 spin_lock_irq(&desc->lock); 174 raw_spin_lock_irq(&desc->lock);
175 status = desc->status; 175 status = desc->status;
176 176
177 if (status & IRQ_AUTODETECT) { 177 if (status & IRQ_AUTODETECT) {
@@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val)
183 desc->status = status & ~IRQ_AUTODETECT; 183 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 184 desc->chip->shutdown(i);
185 } 185 }
186 spin_unlock_irq(&desc->lock); 186 raw_spin_unlock_irq(&desc->lock);
187 } 187 }
188 mutex_unlock(&probing_active); 188 mutex_unlock(&probing_active);
189 189
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ba566c261ad..ecc3fa28f66 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -34,7 +34,7 @@ void dynamic_irq_init(unsigned int irq)
34 } 34 }
35 35
36 /* Ensure we don't have left over values from a previous use of this irq */ 36 /* Ensure we don't have left over values from a previous use of this irq */
37 spin_lock_irqsave(&desc->lock, flags); 37 raw_spin_lock_irqsave(&desc->lock, flags);
38 desc->status = IRQ_DISABLED; 38 desc->status = IRQ_DISABLED;
39 desc->chip = &no_irq_chip; 39 desc->chip = &no_irq_chip;
40 desc->handle_irq = handle_bad_irq; 40 desc->handle_irq = handle_bad_irq;
@@ -51,7 +51,7 @@ void dynamic_irq_init(unsigned int irq)
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
52#endif 52#endif
53#endif 53#endif
54 spin_unlock_irqrestore(&desc->lock, flags); 54 raw_spin_unlock_irqrestore(&desc->lock, flags);
55} 55}
56 56
57/** 57/**
@@ -68,9 +68,9 @@ void dynamic_irq_cleanup(unsigned int irq)
68 return; 68 return;
69 } 69 }
70 70
71 spin_lock_irqsave(&desc->lock, flags); 71 raw_spin_lock_irqsave(&desc->lock, flags);
72 if (desc->action) { 72 if (desc->action) {
73 spin_unlock_irqrestore(&desc->lock, flags); 73 raw_spin_unlock_irqrestore(&desc->lock, flags);
74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", 74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
75 irq); 75 irq);
76 return; 76 return;
@@ -82,7 +82,7 @@ void dynamic_irq_cleanup(unsigned int irq)
82 desc->chip = &no_irq_chip; 82 desc->chip = &no_irq_chip;
83 desc->name = NULL; 83 desc->name = NULL;
84 clear_kstat_irqs(desc); 84 clear_kstat_irqs(desc);
85 spin_unlock_irqrestore(&desc->lock, flags); 85 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 86}
87 87
88 88
@@ -104,10 +104,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
104 if (!chip) 104 if (!chip)
105 chip = &no_irq_chip; 105 chip = &no_irq_chip;
106 106
107 spin_lock_irqsave(&desc->lock, flags); 107 raw_spin_lock_irqsave(&desc->lock, flags);
108 irq_chip_set_defaults(chip); 108 irq_chip_set_defaults(chip);
109 desc->chip = chip; 109 desc->chip = chip;
110 spin_unlock_irqrestore(&desc->lock, flags); 110 raw_spin_unlock_irqrestore(&desc->lock, flags);
111 111
112 return 0; 112 return 0;
113} 113}
@@ -133,9 +133,9 @@ int set_irq_type(unsigned int irq, unsigned int type)
133 if (type == IRQ_TYPE_NONE) 133 if (type == IRQ_TYPE_NONE)
134 return 0; 134 return 0;
135 135
136 spin_lock_irqsave(&desc->lock, flags); 136 raw_spin_lock_irqsave(&desc->lock, flags);
137 ret = __irq_set_trigger(desc, irq, type); 137 ret = __irq_set_trigger(desc, irq, type);
138 spin_unlock_irqrestore(&desc->lock, flags); 138 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return ret; 139 return ret;
140} 140}
141EXPORT_SYMBOL(set_irq_type); 141EXPORT_SYMBOL(set_irq_type);
@@ -158,9 +158,9 @@ int set_irq_data(unsigned int irq, void *data)
158 return -EINVAL; 158 return -EINVAL;
159 } 159 }
160 160
161 spin_lock_irqsave(&desc->lock, flags); 161 raw_spin_lock_irqsave(&desc->lock, flags);
162 desc->handler_data = data; 162 desc->handler_data = data;
163 spin_unlock_irqrestore(&desc->lock, flags); 163 raw_spin_unlock_irqrestore(&desc->lock, flags);
164 return 0; 164 return 0;
165} 165}
166EXPORT_SYMBOL(set_irq_data); 166EXPORT_SYMBOL(set_irq_data);
@@ -183,11 +183,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
183 return -EINVAL; 183 return -EINVAL;
184 } 184 }
185 185
186 spin_lock_irqsave(&desc->lock, flags); 186 raw_spin_lock_irqsave(&desc->lock, flags);
187 desc->msi_desc = entry; 187 desc->msi_desc = entry;
188 if (entry) 188 if (entry)
189 entry->irq = irq; 189 entry->irq = irq;
190 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
191 return 0; 191 return 0;
192} 192}
193 193
@@ -214,9 +214,9 @@ int set_irq_chip_data(unsigned int irq, void *data)
214 return -EINVAL; 214 return -EINVAL;
215 } 215 }
216 216
217 spin_lock_irqsave(&desc->lock, flags); 217 raw_spin_lock_irqsave(&desc->lock, flags);
218 desc->chip_data = data; 218 desc->chip_data = data;
219 spin_unlock_irqrestore(&desc->lock, flags); 219 raw_spin_unlock_irqrestore(&desc->lock, flags);
220 220
221 return 0; 221 return 0;
222} 222}
@@ -241,12 +241,12 @@ void set_irq_nested_thread(unsigned int irq, int nest)
241 if (!desc) 241 if (!desc)
242 return; 242 return;
243 243
244 spin_lock_irqsave(&desc->lock, flags); 244 raw_spin_lock_irqsave(&desc->lock, flags);
245 if (nest) 245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD; 246 desc->status |= IRQ_NESTED_THREAD;
247 else 247 else
248 desc->status &= ~IRQ_NESTED_THREAD; 248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags); 249 raw_spin_unlock_irqrestore(&desc->lock, flags);
250} 250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread); 251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252 252
@@ -343,7 +343,7 @@ void handle_nested_irq(unsigned int irq)
343 343
344 might_sleep(); 344 might_sleep();
345 345
346 spin_lock_irq(&desc->lock); 346 raw_spin_lock_irq(&desc->lock);
347 347
348 kstat_incr_irqs_this_cpu(irq, desc); 348 kstat_incr_irqs_this_cpu(irq, desc);
349 349
@@ -352,17 +352,17 @@ void handle_nested_irq(unsigned int irq)
352 goto out_unlock; 352 goto out_unlock;
353 353
354 desc->status |= IRQ_INPROGRESS; 354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock); 355 raw_spin_unlock_irq(&desc->lock);
356 356
357 action_ret = action->thread_fn(action->irq, action->dev_id); 357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug) 358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret); 359 note_interrupt(irq, desc, action_ret);
360 360
361 spin_lock_irq(&desc->lock); 361 raw_spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS; 362 desc->status &= ~IRQ_INPROGRESS;
363 363
364out_unlock: 364out_unlock:
365 spin_unlock_irq(&desc->lock); 365 raw_spin_unlock_irq(&desc->lock);
366} 366}
367EXPORT_SYMBOL_GPL(handle_nested_irq); 367EXPORT_SYMBOL_GPL(handle_nested_irq);
368 368
@@ -384,7 +384,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
384 struct irqaction *action; 384 struct irqaction *action;
385 irqreturn_t action_ret; 385 irqreturn_t action_ret;
386 386
387 spin_lock(&desc->lock); 387 raw_spin_lock(&desc->lock);
388 388
389 if (unlikely(desc->status & IRQ_INPROGRESS)) 389 if (unlikely(desc->status & IRQ_INPROGRESS))
390 goto out_unlock; 390 goto out_unlock;
@@ -396,16 +396,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
396 goto out_unlock; 396 goto out_unlock;
397 397
398 desc->status |= IRQ_INPROGRESS; 398 desc->status |= IRQ_INPROGRESS;
399 spin_unlock(&desc->lock); 399 raw_spin_unlock(&desc->lock);
400 400
401 action_ret = handle_IRQ_event(irq, action); 401 action_ret = handle_IRQ_event(irq, action);
402 if (!noirqdebug) 402 if (!noirqdebug)
403 note_interrupt(irq, desc, action_ret); 403 note_interrupt(irq, desc, action_ret);
404 404
405 spin_lock(&desc->lock); 405 raw_spin_lock(&desc->lock);
406 desc->status &= ~IRQ_INPROGRESS; 406 desc->status &= ~IRQ_INPROGRESS;
407out_unlock: 407out_unlock:
408 spin_unlock(&desc->lock); 408 raw_spin_unlock(&desc->lock);
409} 409}
410 410
411/** 411/**
@@ -424,7 +424,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
424 struct irqaction *action; 424 struct irqaction *action;
425 irqreturn_t action_ret; 425 irqreturn_t action_ret;
426 426
427 spin_lock(&desc->lock); 427 raw_spin_lock(&desc->lock);
428 mask_ack_irq(desc, irq); 428 mask_ack_irq(desc, irq);
429 429
430 if (unlikely(desc->status & IRQ_INPROGRESS)) 430 if (unlikely(desc->status & IRQ_INPROGRESS))
@@ -441,13 +441,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
441 goto out_unlock; 441 goto out_unlock;
442 442
443 desc->status |= IRQ_INPROGRESS; 443 desc->status |= IRQ_INPROGRESS;
444 spin_unlock(&desc->lock); 444 raw_spin_unlock(&desc->lock);
445 445
446 action_ret = handle_IRQ_event(irq, action); 446 action_ret = handle_IRQ_event(irq, action);
447 if (!noirqdebug) 447 if (!noirqdebug)
448 note_interrupt(irq, desc, action_ret); 448 note_interrupt(irq, desc, action_ret);
449 449
450 spin_lock(&desc->lock); 450 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
452 452
453 if (unlikely(desc->status & IRQ_ONESHOT)) 453 if (unlikely(desc->status & IRQ_ONESHOT))
@@ -455,7 +455,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
457out_unlock: 457out_unlock:
458 spin_unlock(&desc->lock); 458 raw_spin_unlock(&desc->lock);
459} 459}
460EXPORT_SYMBOL_GPL(handle_level_irq); 460EXPORT_SYMBOL_GPL(handle_level_irq);
461 461
@@ -475,7 +475,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
475 struct irqaction *action; 475 struct irqaction *action;
476 irqreturn_t action_ret; 476 irqreturn_t action_ret;
477 477
478 spin_lock(&desc->lock); 478 raw_spin_lock(&desc->lock);
479 479
480 if (unlikely(desc->status & IRQ_INPROGRESS)) 480 if (unlikely(desc->status & IRQ_INPROGRESS))
481 goto out; 481 goto out;
@@ -497,18 +497,18 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
497 497
498 desc->status |= IRQ_INPROGRESS; 498 desc->status |= IRQ_INPROGRESS;
499 desc->status &= ~IRQ_PENDING; 499 desc->status &= ~IRQ_PENDING;
500 spin_unlock(&desc->lock); 500 raw_spin_unlock(&desc->lock);
501 501
502 action_ret = handle_IRQ_event(irq, action); 502 action_ret = handle_IRQ_event(irq, action);
503 if (!noirqdebug) 503 if (!noirqdebug)
504 note_interrupt(irq, desc, action_ret); 504 note_interrupt(irq, desc, action_ret);
505 505
506 spin_lock(&desc->lock); 506 raw_spin_lock(&desc->lock);
507 desc->status &= ~IRQ_INPROGRESS; 507 desc->status &= ~IRQ_INPROGRESS;
508out: 508out:
509 desc->chip->eoi(irq); 509 desc->chip->eoi(irq);
510 510
511 spin_unlock(&desc->lock); 511 raw_spin_unlock(&desc->lock);
512} 512}
513 513
514/** 514/**
@@ -530,7 +530,7 @@ out:
530void 530void
531handle_edge_irq(unsigned int irq, struct irq_desc *desc) 531handle_edge_irq(unsigned int irq, struct irq_desc *desc)
532{ 532{
533 spin_lock(&desc->lock); 533 raw_spin_lock(&desc->lock);
534 534
535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
536 536
@@ -576,17 +576,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
576 } 576 }
577 577
578 desc->status &= ~IRQ_PENDING; 578 desc->status &= ~IRQ_PENDING;
579 spin_unlock(&desc->lock); 579 raw_spin_unlock(&desc->lock);
580 action_ret = handle_IRQ_event(irq, action); 580 action_ret = handle_IRQ_event(irq, action);
581 if (!noirqdebug) 581 if (!noirqdebug)
582 note_interrupt(irq, desc, action_ret); 582 note_interrupt(irq, desc, action_ret);
583 spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
584 584
585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
586 586
587 desc->status &= ~IRQ_INPROGRESS; 587 desc->status &= ~IRQ_INPROGRESS;
588out_unlock: 588out_unlock:
589 spin_unlock(&desc->lock); 589 raw_spin_unlock(&desc->lock);
590} 590}
591 591
592/** 592/**
@@ -643,7 +643,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
643 } 643 }
644 644
645 chip_bus_lock(irq, desc); 645 chip_bus_lock(irq, desc);
646 spin_lock_irqsave(&desc->lock, flags); 646 raw_spin_lock_irqsave(&desc->lock, flags);
647 647
648 /* Uninstall? */ 648 /* Uninstall? */
649 if (handle == handle_bad_irq) { 649 if (handle == handle_bad_irq) {
@@ -661,7 +661,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
661 desc->depth = 0; 661 desc->depth = 0;
662 desc->chip->startup(irq); 662 desc->chip->startup(irq);
663 } 663 }
664 spin_unlock_irqrestore(&desc->lock, flags); 664 raw_spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc); 665 chip_bus_sync_unlock(irq, desc);
666} 666}
667EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
@@ -692,9 +692,9 @@ void __init set_irq_noprobe(unsigned int irq)
692 return; 692 return;
693 } 693 }
694 694
695 spin_lock_irqsave(&desc->lock, flags); 695 raw_spin_lock_irqsave(&desc->lock, flags);
696 desc->status |= IRQ_NOPROBE; 696 desc->status |= IRQ_NOPROBE;
697 spin_unlock_irqrestore(&desc->lock, flags); 697 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 698}
699 699
700void __init set_irq_probe(unsigned int irq) 700void __init set_irq_probe(unsigned int irq)
@@ -707,7 +707,7 @@ void __init set_irq_probe(unsigned int irq)
707 return; 707 return;
708 } 708 }
709 709
710 spin_lock_irqsave(&desc->lock, flags); 710 raw_spin_lock_irqsave(&desc->lock, flags);
711 desc->status &= ~IRQ_NOPROBE; 711 desc->status &= ~IRQ_NOPROBE;
712 spin_unlock_irqrestore(&desc->lock, flags); 712 raw_spin_unlock_irqrestore(&desc->lock, flags);
713} 713}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 17c71bb565c..814940e7f48 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -80,7 +80,7 @@ static struct irq_desc irq_desc_init = {
80 .chip = &no_irq_chip, 80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq, 81 .handle_irq = handle_bad_irq,
82 .depth = 1, 82 .depth = 1,
83 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84}; 84};
85 85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) 86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
@@ -108,7 +108,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
108{ 108{
109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
110 110
111 spin_lock_init(&desc->lock); 111 raw_spin_lock_init(&desc->lock);
112 desc->irq = irq; 112 desc->irq = irq;
113#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
114 desc->node = node; 114 desc->node = node;
@@ -130,7 +130,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
130/* 130/*
131 * Protect the sparse_irqs: 131 * Protect the sparse_irqs:
132 */ 132 */
133DEFINE_SPINLOCK(sparse_irq_lock); 133DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 134
135struct irq_desc **irq_desc_ptrs __read_mostly; 135struct irq_desc **irq_desc_ptrs __read_mostly;
136 136
@@ -141,7 +141,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
141 .chip = &no_irq_chip, 141 .chip = &no_irq_chip,
142 .handle_irq = handle_bad_irq, 142 .handle_irq = handle_bad_irq,
143 .depth = 1, 143 .depth = 1,
144 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 144 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
145 } 145 }
146}; 146};
147 147
@@ -212,7 +212,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
212 if (desc) 212 if (desc)
213 return desc; 213 return desc;
214 214
215 spin_lock_irqsave(&sparse_irq_lock, flags); 215 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 216
217 /* We have to check it to avoid races with another CPU */ 217 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 218 desc = irq_desc_ptrs[irq];
@@ -234,7 +234,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
234 irq_desc_ptrs[irq] = desc; 234 irq_desc_ptrs[irq] = desc;
235 235
236out_unlock: 236out_unlock:
237 spin_unlock_irqrestore(&sparse_irq_lock, flags); 237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
238 238
239 return desc; 239 return desc;
240} 240}
@@ -247,7 +247,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
247 .chip = &no_irq_chip, 247 .chip = &no_irq_chip,
248 .handle_irq = handle_bad_irq, 248 .handle_irq = handle_bad_irq,
249 .depth = 1, 249 .depth = 1,
250 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 250 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
251 } 251 }
252}; 252};
253 253
@@ -473,7 +473,7 @@ unsigned int __do_IRQ(unsigned int irq)
473 return 1; 473 return 1;
474 } 474 }
475 475
476 spin_lock(&desc->lock); 476 raw_spin_lock(&desc->lock);
477 if (desc->chip->ack) 477 if (desc->chip->ack)
478 desc->chip->ack(irq); 478 desc->chip->ack(irq);
479 /* 479 /*
@@ -517,13 +517,13 @@ unsigned int __do_IRQ(unsigned int irq)
517 for (;;) { 517 for (;;) {
518 irqreturn_t action_ret; 518 irqreturn_t action_ret;
519 519
520 spin_unlock(&desc->lock); 520 raw_spin_unlock(&desc->lock);
521 521
522 action_ret = handle_IRQ_event(irq, action); 522 action_ret = handle_IRQ_event(irq, action);
523 if (!noirqdebug) 523 if (!noirqdebug)
524 note_interrupt(irq, desc, action_ret); 524 note_interrupt(irq, desc, action_ret);
525 525
526 spin_lock(&desc->lock); 526 raw_spin_lock(&desc->lock);
527 if (likely(!(desc->status & IRQ_PENDING))) 527 if (likely(!(desc->status & IRQ_PENDING)))
528 break; 528 break;
529 desc->status &= ~IRQ_PENDING; 529 desc->status &= ~IRQ_PENDING;
@@ -536,7 +536,7 @@ out:
536 * disabled while the handler was running. 536 * disabled while the handler was running.
537 */ 537 */
538 desc->chip->end(irq); 538 desc->chip->end(irq);
539 spin_unlock(&desc->lock); 539 raw_spin_unlock(&desc->lock);
540 540
541 return 1; 541 return 1;
542} 542}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 1b5d742c6a7..b2821f070a3 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,7 +18,7 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24/* irq_desc_ptrs allocated at boot time */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c667d24..eb6078ca60c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq)
46 cpu_relax(); 46 cpu_relax();
47 47
48 /* Ok, that indicated we're done: double-check carefully. */ 48 /* Ok, that indicated we're done: double-check carefully. */
49 spin_lock_irqsave(&desc->lock, flags); 49 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 50 status = desc->status;
51 spin_unlock_irqrestore(&desc->lock, flags); 51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 52
53 /* Oops, that failed? */ 53 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 54 } while (status & IRQ_INPROGRESS);
@@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
114 if (!desc->chip->set_affinity) 114 if (!desc->chip->set_affinity)
115 return -EINVAL; 115 return -EINVAL;
116 116
117 spin_lock_irqsave(&desc->lock, flags); 117 raw_spin_lock_irqsave(&desc->lock, flags);
118 118
119#ifdef CONFIG_GENERIC_PENDING_IRQ 119#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
@@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
134 } 134 }
135#endif 135#endif
136 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
137 spin_unlock_irqrestore(&desc->lock, flags); 137 raw_spin_unlock_irqrestore(&desc->lock, flags);
138 return 0; 138 return 0;
139} 139}
140 140
@@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq)
181 unsigned long flags; 181 unsigned long flags;
182 int ret; 182 int ret;
183 183
184 spin_lock_irqsave(&desc->lock, flags); 184 raw_spin_lock_irqsave(&desc->lock, flags);
185 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
186 if (!ret) 186 if (!ret)
187 irq_set_thread_affinity(desc); 187 irq_set_thread_affinity(desc);
188 spin_unlock_irqrestore(&desc->lock, flags); 188 raw_spin_unlock_irqrestore(&desc->lock, flags);
189 189
190 return ret; 190 return ret;
191} 191}
@@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc); 233 chip_bus_lock(irq, desc);
234 spin_lock_irqsave(&desc->lock, flags); 234 raw_spin_lock_irqsave(&desc->lock, flags);
235 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
236 spin_unlock_irqrestore(&desc->lock, flags); 236 raw_spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc); 237 chip_bus_sync_unlock(irq, desc);
238} 238}
239EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
@@ -308,9 +308,9 @@ void enable_irq(unsigned int irq)
308 return; 308 return;
309 309
310 chip_bus_lock(irq, desc); 310 chip_bus_lock(irq, desc);
311 spin_lock_irqsave(&desc->lock, flags); 311 raw_spin_lock_irqsave(&desc->lock, flags);
312 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
313 spin_unlock_irqrestore(&desc->lock, flags); 313 raw_spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc); 314 chip_bus_sync_unlock(irq, desc);
315} 315}
316EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
@@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
347 /* wakeup-capable irqs can be shared between drivers that 347 /* wakeup-capable irqs can be shared between drivers that
348 * don't need to have the same sleep mode behaviors. 348 * don't need to have the same sleep mode behaviors.
349 */ 349 */
350 spin_lock_irqsave(&desc->lock, flags); 350 raw_spin_lock_irqsave(&desc->lock, flags);
351 if (on) { 351 if (on) {
352 if (desc->wake_depth++ == 0) { 352 if (desc->wake_depth++ == 0) {
353 ret = set_irq_wake_real(irq, on); 353 ret = set_irq_wake_real(irq, on);
@@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
368 } 368 }
369 } 369 }
370 370
371 spin_unlock_irqrestore(&desc->lock, flags); 371 raw_spin_unlock_irqrestore(&desc->lock, flags);
372 return ret; 372 return ret;
373} 373}
374EXPORT_SYMBOL(set_irq_wake); 374EXPORT_SYMBOL(set_irq_wake);
@@ -484,12 +484,12 @@ static int irq_wait_for_interrupt(struct irqaction *action)
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 485{
486 chip_bus_lock(irq, desc); 486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock); 487 raw_spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 490 desc->chip->unmask(irq);
491 } 491 }
492 spin_unlock_irq(&desc->lock); 492 raw_spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc); 493 chip_bus_sync_unlock(irq, desc);
494} 494}
495 495
@@ -514,9 +514,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
514 return; 514 return;
515 } 515 }
516 516
517 spin_lock_irq(&desc->lock); 517 raw_spin_lock_irq(&desc->lock);
518 cpumask_copy(mask, desc->affinity); 518 cpumask_copy(mask, desc->affinity);
519 spin_unlock_irq(&desc->lock); 519 raw_spin_unlock_irq(&desc->lock);
520 520
521 set_cpus_allowed_ptr(current, mask); 521 set_cpus_allowed_ptr(current, mask);
522 free_cpumask_var(mask); 522 free_cpumask_var(mask);
@@ -545,7 +545,7 @@ static int irq_thread(void *data)
545 545
546 atomic_inc(&desc->threads_active); 546 atomic_inc(&desc->threads_active);
547 547
548 spin_lock_irq(&desc->lock); 548 raw_spin_lock_irq(&desc->lock);
549 if (unlikely(desc->status & IRQ_DISABLED)) { 549 if (unlikely(desc->status & IRQ_DISABLED)) {
550 /* 550 /*
551 * CHECKME: We might need a dedicated 551 * CHECKME: We might need a dedicated
@@ -555,9 +555,9 @@ static int irq_thread(void *data)
555 * retriggers the interrupt itself --- tglx 555 * retriggers the interrupt itself --- tglx
556 */ 556 */
557 desc->status |= IRQ_PENDING; 557 desc->status |= IRQ_PENDING;
558 spin_unlock_irq(&desc->lock); 558 raw_spin_unlock_irq(&desc->lock);
559 } else { 559 } else {
560 spin_unlock_irq(&desc->lock); 560 raw_spin_unlock_irq(&desc->lock);
561 561
562 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563 563
@@ -679,7 +679,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
679 /* 679 /*
680 * The following block of code has to be executed atomically 680 * The following block of code has to be executed atomically
681 */ 681 */
682 spin_lock_irqsave(&desc->lock, flags); 682 raw_spin_lock_irqsave(&desc->lock, flags);
683 old_ptr = &desc->action; 683 old_ptr = &desc->action;
684 old = *old_ptr; 684 old = *old_ptr;
685 if (old) { 685 if (old) {
@@ -775,7 +775,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
775 __enable_irq(desc, irq, false); 775 __enable_irq(desc, irq, false);
776 } 776 }
777 777
778 spin_unlock_irqrestore(&desc->lock, flags); 778 raw_spin_unlock_irqrestore(&desc->lock, flags);
779 779
780 /* 780 /*
781 * Strictly no need to wake it up, but hung_task complains 781 * Strictly no need to wake it up, but hung_task complains
@@ -802,7 +802,7 @@ mismatch:
802 ret = -EBUSY; 802 ret = -EBUSY;
803 803
804out_thread: 804out_thread:
805 spin_unlock_irqrestore(&desc->lock, flags); 805 raw_spin_unlock_irqrestore(&desc->lock, flags);
806 if (new->thread) { 806 if (new->thread) {
807 struct task_struct *t = new->thread; 807 struct task_struct *t = new->thread;
808 808
@@ -844,7 +844,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
844 if (!desc) 844 if (!desc)
845 return NULL; 845 return NULL;
846 846
847 spin_lock_irqsave(&desc->lock, flags); 847 raw_spin_lock_irqsave(&desc->lock, flags);
848 848
849 /* 849 /*
850 * There can be multiple actions per IRQ descriptor, find the right 850 * There can be multiple actions per IRQ descriptor, find the right
@@ -856,7 +856,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
856 856
857 if (!action) { 857 if (!action) {
858 WARN(1, "Trying to free already-free IRQ %d\n", irq); 858 WARN(1, "Trying to free already-free IRQ %d\n", irq);
859 spin_unlock_irqrestore(&desc->lock, flags); 859 raw_spin_unlock_irqrestore(&desc->lock, flags);
860 860
861 return NULL; 861 return NULL;
862 } 862 }
@@ -884,7 +884,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
884 desc->chip->disable(irq); 884 desc->chip->disable(irq);
885 } 885 }
886 886
887 spin_unlock_irqrestore(&desc->lock, flags); 887 raw_spin_unlock_irqrestore(&desc->lock, flags);
888 888
889 unregister_handler_proc(irq, action); 889 unregister_handler_proc(irq, action);
890 890
@@ -1067,7 +1067,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1067 kfree(action); 1067 kfree(action);
1068 1068
1069#ifdef CONFIG_DEBUG_SHIRQ 1069#ifdef CONFIG_DEBUG_SHIRQ
1070 if (irqflags & IRQF_SHARED) { 1070 if (!retval && (irqflags & IRQF_SHARED)) {
1071 /* 1071 /*
1072 * It's a shared IRQ -- the driver ought to be prepared for it 1072 * It's a shared IRQ -- the driver ought to be prepared for it
1073 * to happen immediately, so let's make sure.... 1073 * to happen immediately, so let's make sure....
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index fcb6c96f262..24196228083 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -27,7 +27,7 @@ void move_masked_irq(int irq)
27 if (!desc->chip->set_affinity) 27 if (!desc->chip->set_affinity)
28 return; 28 return;
29 29
30 assert_spin_locked(&desc->lock); 30 assert_raw_spin_locked(&desc->lock);
31 31
32 /* 32 /*
33 * If there was a valid mask to work with, please 33 * If there was a valid mask to work with, please
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 3fd30197da2..26bac9d8f86 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -42,7 +42,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 raw_spin_lock_init(&desc->lock);
46 desc->node = node; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
@@ -67,7 +67,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
67 67
68 irq = old_desc->irq; 68 irq = old_desc->irq;
69 69
70 spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_desc_ptrs[irq];
@@ -91,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 irq_desc_ptrs[irq] = desc;
94 spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
97 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
@@ -100,7 +100,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
100 return desc; 100 return desc;
101 101
102out_unlock: 102out_unlock:
103 spin_unlock_irqrestore(&sparse_irq_lock, flags); 103 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
104 104
105 return desc; 105 return desc;
106} 106}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index a0bb09e7986..0d4005d85b0 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -28,9 +28,9 @@ void suspend_device_irqs(void)
28 for_each_irq_desc(irq, desc) { 28 for_each_irq_desc(irq, desc) {
29 unsigned long flags; 29 unsigned long flags;
30 30
31 spin_lock_irqsave(&desc->lock, flags); 31 raw_spin_lock_irqsave(&desc->lock, flags);
32 __disable_irq(desc, irq, true); 32 __disable_irq(desc, irq, true);
33 spin_unlock_irqrestore(&desc->lock, flags); 33 raw_spin_unlock_irqrestore(&desc->lock, flags);
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
@@ -56,9 +56,9 @@ void resume_device_irqs(void)
56 if (!(desc->status & IRQ_SUSPENDED)) 56 if (!(desc->status & IRQ_SUSPENDED))
57 continue; 57 continue;
58 58
59 spin_lock_irqsave(&desc->lock, flags); 59 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 60 __enable_irq(desc, irq, true);
61 spin_unlock_irqrestore(&desc->lock, flags); 61 raw_spin_unlock_irqrestore(&desc->lock, flags);
62 } 62 }
63} 63}
64EXPORT_SYMBOL_GPL(resume_device_irqs); 64EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0832145fea9..6f50eccc79c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -179,7 +179,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
179 unsigned long flags; 179 unsigned long flags;
180 int ret = 1; 180 int ret = 1;
181 181
182 spin_lock_irqsave(&desc->lock, flags); 182 raw_spin_lock_irqsave(&desc->lock, flags);
183 for (action = desc->action ; action; action = action->next) { 183 for (action = desc->action ; action; action = action->next) {
184 if ((action != new_action) && action->name && 184 if ((action != new_action) && action->name &&
185 !strcmp(new_action->name, action->name)) { 185 !strcmp(new_action->name, action->name)) {
@@ -187,7 +187,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
187 break; 187 break;
188 } 188 }
189 } 189 }
190 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
191 return ret; 191 return ret;
192} 192}
193 193
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 22b0a6eedf2..89fb90ae534 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
28 struct irqaction *action; 28 struct irqaction *action;
29 int ok = 0, work = 0; 29 int ok = 0, work = 0;
30 30
31 spin_lock(&desc->lock); 31 raw_spin_lock(&desc->lock);
32 /* Already running on another processor */ 32 /* Already running on another processor */
33 if (desc->status & IRQ_INPROGRESS) { 33 if (desc->status & IRQ_INPROGRESS) {
34 /* 34 /*
@@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc)
37 */ 37 */
38 if (desc->action && (desc->action->flags & IRQF_SHARED)) 38 if (desc->action && (desc->action->flags & IRQF_SHARED))
39 desc->status |= IRQ_PENDING; 39 desc->status |= IRQ_PENDING;
40 spin_unlock(&desc->lock); 40 raw_spin_unlock(&desc->lock);
41 return ok; 41 return ok;
42 } 42 }
43 /* Honour the normal IRQ locking */ 43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS; 44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action; 45 action = desc->action;
46 spin_unlock(&desc->lock); 46 raw_spin_unlock(&desc->lock);
47 47
48 while (action) { 48 while (action) {
49 /* Only shared IRQ handlers are safe to call */ 49 /* Only shared IRQ handlers are safe to call */
@@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
56 } 56 }
57 local_irq_disable(); 57 local_irq_disable();
58 /* Now clean up the flags */ 58 /* Now clean up the flags */
59 spin_lock(&desc->lock); 59 raw_spin_lock(&desc->lock);
60 action = desc->action; 60 action = desc->action;
61 61
62 /* 62 /*
@@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc)
68 * Perform real IRQ processing for the IRQ we deferred 68 * Perform real IRQ processing for the IRQ we deferred
69 */ 69 */
70 work = 1; 70 work = 1;
71 spin_unlock(&desc->lock); 71 raw_spin_unlock(&desc->lock);
72 handle_IRQ_event(irq, action); 72 handle_IRQ_event(irq, action);
73 spin_lock(&desc->lock); 73 raw_spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING; 74 desc->status &= ~IRQ_PENDING;
75 } 75 }
76 desc->status &= ~IRQ_INPROGRESS; 76 desc->status &= ~IRQ_INPROGRESS;
@@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
80 */ 80 */
81 if (work && desc->chip && desc->chip->end) 81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq); 82 desc->chip->end(irq);
83 spin_unlock(&desc->lock); 83 raw_spin_unlock(&desc->lock);
84 84
85 return ok; 85 return ok;
86} 86}
@@ -220,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
220 /* 220 /*
221 * If we are seeing only the odd spurious IRQ caused by 221 * If we are seeing only the odd spurious IRQ caused by
222 * bus asynchronicity then don't eventually trigger an error, 222 * bus asynchronicity then don't eventually trigger an error,
223 * otherwise the couter becomes a doomsday timer for otherwise 223 * otherwise the counter becomes a doomsday timer for otherwise
224 * working systems 224 * working systems
225 */ 225 */
226 if (time_after(jiffies, desc->last_unhandled + HZ/10)) 226 if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede52..d802883153d 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
146{ 146{
147 cputime_t cval, nval, cinterval, ninterval; 147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval; 148 s64 ns_ninterval, ns_nval;
149 u32 error, incr_error;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id]; 150 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150 151
151 nval = timeval_to_cputime(&value->it_value); 152 nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
153 ninterval = timeval_to_cputime(&value->it_interval); 154 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval); 155 ns_ninterval = timeval_to_ns(&value->it_interval);
155 156
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); 157 error = cputime_sub_ns(nval, ns_nval);
157 it->error = cputime_sub_ns(nval, ns_nval); 158 incr_error = cputime_sub_ns(ninterval, ns_ninterval);
158 159
159 spin_lock_irq(&tsk->sighand->siglock); 160 spin_lock_irq(&tsk->sighand->siglock);
160 161
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
168 } 169 }
169 it->expires = nval; 170 it->expires = nval;
170 it->incr = ninterval; 171 it->incr = ninterval;
172 it->error = error;
173 it->incr_error = incr_error;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ? 174 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval); 175 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173 176
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f336e2107f9..ef077fb7315 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,7 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <linux/utsrelease.h> 24#include <generated/utsrelease.h>
25#include <linux/utsname.h> 25#include <linux/utsname.h>
26#include <linux/numa.h> 26#include <linux/numa.h>
27#include <linux/suspend.h> 27#include <linux/suspend.h>
@@ -31,6 +31,8 @@
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
34 36
35#include <asm/page.h> 37#include <asm/page.h>
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
@@ -1073,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs)
1073 if (mutex_trylock(&kexec_mutex)) { 1075 if (mutex_trylock(&kexec_mutex)) {
1074 if (kexec_crash_image) { 1076 if (kexec_crash_image) {
1075 struct pt_regs fixed_regs; 1077 struct pt_regs fixed_regs;
1078
1079 kmsg_dump(KMSG_DUMP_KEXEC);
1080
1076 crash_setup_regs(&fixed_regs, regs); 1081 crash_setup_regs(&fixed_regs, regs);
1077 crash_save_vmcoreinfo(); 1082 crash_save_vmcoreinfo();
1078 machine_crash_shutdown(&fixed_regs); 1083 machine_crash_shutdown(&fixed_regs);
@@ -1082,6 +1087,64 @@ void crash_kexec(struct pt_regs *regs)
1082 } 1087 }
1083} 1088}
1084 1089
1090size_t crash_get_memory_size(void)
1091{
1092 size_t size;
1093 mutex_lock(&kexec_mutex);
1094 size = crashk_res.end - crashk_res.start + 1;
1095 mutex_unlock(&kexec_mutex);
1096 return size;
1097}
1098
1099static void free_reserved_phys_range(unsigned long begin, unsigned long end)
1100{
1101 unsigned long addr;
1102
1103 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1104 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
1105 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1106 free_page((unsigned long)__va(addr));
1107 totalram_pages++;
1108 }
1109}
1110
1111int crash_shrink_memory(unsigned long new_size)
1112{
1113 int ret = 0;
1114 unsigned long start, end;
1115
1116 mutex_lock(&kexec_mutex);
1117
1118 if (kexec_crash_image) {
1119 ret = -ENOENT;
1120 goto unlock;
1121 }
1122 start = crashk_res.start;
1123 end = crashk_res.end;
1124
1125 if (new_size >= end - start + 1) {
1126 ret = -EINVAL;
1127 if (new_size == end - start + 1)
1128 ret = 0;
1129 goto unlock;
1130 }
1131
1132 start = roundup(start, PAGE_SIZE);
1133 end = roundup(start + new_size, PAGE_SIZE);
1134
1135 free_reserved_phys_range(end, crashk_res.end);
1136
1137 if (start == end) {
1138 crashk_res.end = end;
1139 release_resource(&crashk_res);
1140 } else
1141 crashk_res.end = end - 1;
1142
1143unlock:
1144 mutex_unlock(&kexec_mutex);
1145 return ret;
1146}
1147
1085static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1148static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1086 size_t data_len) 1149 size_t data_len)
1087{ 1150{
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 3765ff3c1bb..35edbe22e9a 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * A simple kernel FIFO implementation. 2 * A generic kernel FIFO implementation.
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
4 * Copyright (C) 2004 Stelian Pop <stelian@popies.net> 5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
5 * 6 *
6 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -25,50 +26,48 @@
25#include <linux/err.h> 26#include <linux/err.h>
26#include <linux/kfifo.h> 27#include <linux/kfifo.h>
27#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/uaccess.h>
30
31static void _kfifo_init(struct kfifo *fifo, void *buffer,
32 unsigned int size)
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
28 39
29/** 40/**
30 * kfifo_init - allocates a new FIFO using a preallocated buffer 41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
31 * @buffer: the preallocated buffer to be used. 43 * @buffer: the preallocated buffer to be used.
32 * @size: the size of the internal buffer, this have to be a power of 2. 44 * @size: the size of the internal buffer, this has to be a power of 2.
33 * @gfp_mask: get_free_pages mask, passed to kmalloc()
34 * @lock: the lock to be used to protect the fifo buffer
35 * 45 *
36 * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
37 * &struct kfifo with kfree().
38 */ 46 */
39struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
40 gfp_t gfp_mask, spinlock_t *lock)
41{ 48{
42 struct kfifo *fifo;
43
44 /* size must be a power of 2 */ 49 /* size must be a power of 2 */
45 BUG_ON(!is_power_of_2(size)); 50 BUG_ON(!is_power_of_2(size));
46 51
47 fifo = kmalloc(sizeof(struct kfifo), gfp_mask); 52 _kfifo_init(fifo, buffer, size);
48 if (!fifo)
49 return ERR_PTR(-ENOMEM);
50
51 fifo->buffer = buffer;
52 fifo->size = size;
53 fifo->in = fifo->out = 0;
54 fifo->lock = lock;
55
56 return fifo;
57} 53}
58EXPORT_SYMBOL(kfifo_init); 54EXPORT_SYMBOL(kfifo_init);
59 55
60/** 56/**
61 * kfifo_alloc - allocates a new FIFO and its internal buffer 57 * kfifo_alloc - allocates a new FIFO internal buffer
 62 * @size: the size of the internal buffer to be allocated. 58 * @fifo: the fifo to assign the new buffer
 59 * @size: the size of the buffer to be allocated, this has to be a power of 2.
63 * @gfp_mask: get_free_pages mask, passed to kmalloc() 60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
64 * @lock: the lock to be used to protect the fifo buffer 61 *
62 * This function dynamically allocates a new fifo internal buffer
65 * 63 *
66 * The size will be rounded-up to a power of 2. 64 * The size will be rounded-up to a power of 2.
 65 * The buffer will be released with kfifo_free().
 66 * Return 0 if no error, otherwise an error code
67 */ 67 */
68struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) 68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 69{
70 unsigned char *buffer; 70 unsigned char *buffer;
71 struct kfifo *ret;
72 71
73 /* 72 /*
74 * round up to the next power of 2, since our 'let the indices 73 * round up to the next power of 2, since our 'let the indices
@@ -80,48 +79,93 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
80 } 79 }
81 80
82 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
83 if (!buffer) 82 if (!buffer) {
84 return ERR_PTR(-ENOMEM); 83 _kfifo_init(fifo, NULL, 0);
85 84 return -ENOMEM;
86 ret = kfifo_init(buffer, size, gfp_mask, lock); 85 }
87 86
88 if (IS_ERR(ret)) 87 _kfifo_init(fifo, buffer, size);
89 kfree(buffer);
90 88
91 return ret; 89 return 0;
92} 90}
93EXPORT_SYMBOL(kfifo_alloc); 91EXPORT_SYMBOL(kfifo_alloc);
94 92
95/** 93/**
96 * kfifo_free - frees the FIFO 94 * kfifo_free - frees the FIFO internal buffer
97 * @fifo: the fifo to be freed. 95 * @fifo: the fifo to be freed.
98 */ 96 */
99void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
100{ 98{
101 kfree(fifo->buffer); 99 kfree(fifo->buffer);
102 kfree(fifo); 100 _kfifo_init(fifo, NULL, 0);
103} 101}
104EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
105 103
106/** 104/**
107 * __kfifo_put - puts some data into the FIFO, no locking version 105 * kfifo_skip - skip output data
108 * @fifo: the fifo to be used. 106 * @fifo: the fifo to be used.
109 * @buffer: the data to be added. 107 * @len: number of bytes to skip
110 * @len: the length of the data to be added.
111 *
112 * This function copies at most @len bytes from the @buffer into
113 * the FIFO depending on the free space, and returns the number of
114 * bytes copied.
115 *
116 * Note that with only one concurrent reader and one concurrent
117 * writer, you don't need extra locking to use these functions.
118 */ 108 */
119unsigned int __kfifo_put(struct kfifo *fifo, 109void kfifo_skip(struct kfifo *fifo, unsigned int len)
120 const unsigned char *buffer, unsigned int len) 110{
111 if (len < kfifo_len(fifo)) {
112 __kfifo_add_out(fifo, len);
113 return;
114 }
115 kfifo_reset_out(fifo);
116}
117EXPORT_SYMBOL(kfifo_skip);
118
119static inline void __kfifo_in_data(struct kfifo *fifo,
120 const void *from, unsigned int len, unsigned int off)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
124 len = min(len, fifo->size - fifo->in + fifo->out); 124 /*
125 * Ensure that we sample the fifo->out index -before- we
126 * start putting bytes into the kfifo.
127 */
128
129 smp_mb();
130
131 off = __kfifo_off(fifo, fifo->in + off);
132
133 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + off, from, l);
136
137 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, from + l, len - l);
139}
140
141static inline void __kfifo_out_data(struct kfifo *fifo,
142 void *to, unsigned int len, unsigned int off)
143{
144 unsigned int l;
145
146 /*
147 * Ensure that we sample the fifo->in index -before- we
148 * start removing bytes from the kfifo.
149 */
150
151 smp_rmb();
152
153 off = __kfifo_off(fifo, fifo->out + off);
154
155 /* first get the data from fifo->out until the end of the buffer */
156 l = min(len, fifo->size - off);
157 memcpy(to, fifo->buffer + off, l);
158
159 /* then get the rest (if any) from the beginning of the buffer */
160 memcpy(to + l, fifo->buffer, len - l);
161}
162
163static inline int __kfifo_from_user_data(struct kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout)
166{
167 unsigned int l;
168 int ret;
125 169
126 /* 170 /*
127 * Ensure that we sample the fifo->out index -before- we 171 * Ensure that we sample the fifo->out index -before- we
@@ -130,68 +174,272 @@ unsigned int __kfifo_put(struct kfifo *fifo,
130 174
131 smp_mb(); 175 smp_mb();
132 176
177 off = __kfifo_off(fifo, fifo->in + off);
178
133 /* first put the data starting from fifo->in to buffer end */ 179 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); 180 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); 181 ret = copy_from_user(fifo->buffer + off, from, l);
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
136 187
137 /* then put the rest (if any) at the beginning of the buffer */ 188 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, buffer + l, len - l); 189 ret = copy_from_user(fifo->buffer, from + l, len - l);
190 *lenout += ret ? ret : len - l;
191 return ret ? -EFAULT : 0;
192}
193
194static inline int __kfifo_to_user_data(struct kfifo *fifo,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout)
196{
197 unsigned int l;
198 int ret;
139 199
140 /* 200 /*
141 * Ensure that we add the bytes to the kfifo -before- 201 * Ensure that we sample the fifo->in index -before- we
142 * we update the fifo->in index. 202 * start removing bytes from the kfifo.
143 */ 203 */
144 204
145 smp_wmb(); 205 smp_rmb();
206
207 off = __kfifo_off(fifo, fifo->out + off);
208
209 /* first get the data from fifo->out until the end of the buffer */
210 l = min(len, fifo->size - off);
211 ret = copy_to_user(to, fifo->buffer + off, l);
212 *lenout = l;
213 if (unlikely(ret)) {
214 *lenout -= ret;
215 return -EFAULT;
216 }
217
218 /* then get the rest (if any) from the beginning of the buffer */
219 len -= l;
220 ret = copy_to_user(to + l, fifo->buffer, len);
221 if (unlikely(ret)) {
222 *lenout += len - ret;
223 return -EFAULT;
224 }
225 *lenout += len;
226 return 0;
227}
228
229unsigned int __kfifo_in_n(struct kfifo *fifo,
230 const void *from, unsigned int len, unsigned int recsize)
231{
232 if (kfifo_avail(fifo) < len + recsize)
233 return len + 1;
234
235 __kfifo_in_data(fifo, from, len, recsize);
236 return 0;
237}
238EXPORT_SYMBOL(__kfifo_in_n);
146 239
147 fifo->in += len; 240/**
241 * kfifo_in - puts some data into the FIFO
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{
256 len = min(kfifo_avail(fifo), len);
148 257
258 __kfifo_in_data(fifo, from, len, 0);
259 __kfifo_add_in(fifo, len);
149 return len; 260 return len;
150} 261}
151EXPORT_SYMBOL(__kfifo_put); 262EXPORT_SYMBOL(kfifo_in);
263
264unsigned int __kfifo_in_generic(struct kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize)
266{
267 return __kfifo_in_rec(fifo, from, len, recsize);
268}
269EXPORT_SYMBOL(__kfifo_in_generic);
270
271unsigned int __kfifo_out_n(struct kfifo *fifo,
272 void *to, unsigned int len, unsigned int recsize)
273{
274 if (kfifo_len(fifo) < len + recsize)
275 return len;
276
277 __kfifo_out_data(fifo, to, len, recsize);
278 __kfifo_add_out(fifo, len + recsize);
279 return 0;
280}
281EXPORT_SYMBOL(__kfifo_out_n);
152 282
153/** 283/**
154 * __kfifo_get - gets some data from the FIFO, no locking version 284 * kfifo_out - gets some data from the FIFO
155 * @fifo: the fifo to be used. 285 * @fifo: the fifo to be used.
156 * @buffer: where the data must be copied. 286 * @to: where the data must be copied.
157 * @len: the size of the destination buffer. 287 * @len: the size of the destination buffer.
158 * 288 *
159 * This function copies at most @len bytes from the FIFO into the 289 * This function copies at most @len bytes from the FIFO into the
160 * @buffer and returns the number of copied bytes. 290 * @to buffer and returns the number of copied bytes.
161 * 291 *
162 * Note that with only one concurrent reader and one concurrent 292 * Note that with only one concurrent reader and one concurrent
163 * writer, you don't need extra locking to use these functions. 293 * writer, you don't need extra locking to use these functions.
164 */ 294 */
165unsigned int __kfifo_get(struct kfifo *fifo, 295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len)
166 unsigned char *buffer, unsigned int len)
167{ 296{
168 unsigned int l; 297 len = min(kfifo_len(fifo), len);
169 298
170 len = min(len, fifo->in - fifo->out); 299 __kfifo_out_data(fifo, to, len, 0);
300 __kfifo_add_out(fifo, len);
171 301
172 /* 302 return len;
173 * Ensure that we sample the fifo->in index -before- we 303}
174 * start removing bytes from the kfifo. 304EXPORT_SYMBOL(kfifo_out);
175 */
176 305
177 smp_rmb(); 306/**
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it
308 * @fifo: the fifo to be used.
309 * @to: where the data must be copied.
310 * @len: the size of the destination buffer.
311 * @offset: offset into the fifo
312 *
313 * This function copies at most @len bytes at @offset from the FIFO
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len,
318 unsigned offset)
319{
320 len = min(kfifo_len(fifo), len + offset);
178 321
179 /* first get the data from fifo->out until the end of the buffer */ 322 __kfifo_out_data(fifo, to, len, offset);
180 l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); 323 return len;
181 memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); 324}
325EXPORT_SYMBOL(kfifo_out_peek);
182 326
183 /* then get the rest (if any) from the beginning of the buffer */ 327unsigned int __kfifo_out_generic(struct kfifo *fifo,
184 memcpy(buffer + l, fifo->buffer, len - l); 328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{
331 return __kfifo_out_rec(fifo, to, len, recsize, total);
332}
333EXPORT_SYMBOL(__kfifo_out_generic);
185 334
186 /* 335unsigned int __kfifo_from_user_n(struct kfifo *fifo,
187 * Ensure that we remove the bytes from the kfifo -before- 336 const void __user *from, unsigned int len, unsigned int recsize)
188 * we update the fifo->out index. 337{
189 */ 338 unsigned total;
190 339
191 smp_mb(); 340 if (kfifo_avail(fifo) < len + recsize)
341 return len + 1;
192 342
193 fifo->out += len; 343 __kfifo_from_user_data(fifo, from, len, recsize, &total);
344 return total;
345}
346EXPORT_SYMBOL(__kfifo_from_user_n);
194 347
195 return len; 348/**
349 * kfifo_from_user - puts some data from user space into the FIFO
350 * @fifo: the fifo to be used.
351 * @from: pointer to the data to be added.
352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
354 *
355 * This function copies at most @len bytes from the @from into the
356 * FIFO depending and returns -EFAULT/0.
357 *
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
196} 371}
197EXPORT_SYMBOL(__kfifo_get); 372EXPORT_SYMBOL(kfifo_from_user);
373
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
375 const void __user *from, unsigned int len, unsigned int recsize)
376{
377 return __kfifo_from_user_rec(fifo, from, len, recsize);
378}
379EXPORT_SYMBOL(__kfifo_from_user_generic);
380
381unsigned int __kfifo_to_user_n(struct kfifo *fifo,
382 void __user *to, unsigned int len, unsigned int reclen,
383 unsigned int recsize)
384{
385 unsigned int ret, total;
386
387 if (kfifo_len(fifo) < reclen + recsize)
388 return len;
389
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total);
391
392 if (likely(ret == 0))
393 __kfifo_add_out(fifo, reclen + recsize);
394
395 return total;
396}
397EXPORT_SYMBOL(__kfifo_to_user_n);
398
399/**
400 * kfifo_to_user - gets data from the FIFO and write it to user space
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{
415 int ret;
416 len = min(kfifo_len(fifo), len);
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout);
418 __kfifo_add_out(fifo, *lenout);
419 return ret;
420}
421EXPORT_SYMBOL(kfifo_to_user);
422
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
424 void __user *to, unsigned int len, unsigned int recsize,
425 unsigned int *total)
426{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total);
428}
429EXPORT_SYMBOL(__kfifo_to_user_generic);
430
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize)
432{
433 if (recsize == 0)
434 return kfifo_avail(fifo);
435
436 return __kfifo_peek_n(fifo, recsize);
437}
438EXPORT_SYMBOL(__kfifo_peek_generic);
439
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize)
441{
442 __kfifo_skip_rec(fifo, recsize);
443}
444EXPORT_SYMBOL(__kfifo_skip_generic);
445
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 7d701463402..761fdd2b303 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread; 129struct task_struct *kgdb_contthread;
130 130
131int kgdb_single_step; 131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
132 133
133/* Our I/O buffers. */ 134/* Our I/O buffers. */
134static char remcom_in_buffer[BUFMAX]; 135static char remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
541 */ 542 */
542 if (tid == 0 || tid == -1) 543 if (tid == 0 || tid == -1)
543 tid = -atomic_read(&kgdb_active) - 2; 544 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) { 545 if (tid < -1 && tid > -NR_CPUS - 2) {
545 if (kgdb_info[-tid - 2].task) 546 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task; 547 return kgdb_info[-tid - 2].task;
547 else 548 else
548 return idle_task(-tid - 2); 549 return idle_task(-tid - 2);
549 } 550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
550 556
551 /* 557 /*
552 * find_task_by_pid_ns() does not take the tasklist lock anymore 558 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -577,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs)
577 smp_wmb(); 583 smp_wmb();
578 atomic_set(&cpu_in_kgdb[cpu], 1); 584 atomic_set(&cpu_in_kgdb[cpu], 1);
579 585
586 /* Disable any cpu specific hw breakpoints */
587 kgdb_disable_hw_debug(regs);
588
580 /* Wait till primary CPU is done with debugging */ 589 /* Wait till primary CPU is done with debugging */
581 while (atomic_read(&passive_cpu_wait[cpu])) 590 while (atomic_read(&passive_cpu_wait[cpu]))
582 cpu_relax(); 591 cpu_relax();
@@ -590,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs)
590 599
591 /* Signal the primary CPU that we are done: */ 600 /* Signal the primary CPU that we are done: */
592 atomic_set(&cpu_in_kgdb[cpu], 0); 601 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog(); 602 touch_softlockup_watchdog_sync();
594 clocksource_touch_watchdog(); 603 clocksource_touch_watchdog();
595 local_irq_restore(flags); 604 local_irq_restore(flags);
596} 605}
@@ -619,7 +628,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
619static int kgdb_activate_sw_breakpoints(void) 628static int kgdb_activate_sw_breakpoints(void)
620{ 629{
621 unsigned long addr; 630 unsigned long addr;
622 int error = 0; 631 int error;
632 int ret = 0;
623 int i; 633 int i;
624 634
625 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 635 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +639,16 @@ static int kgdb_activate_sw_breakpoints(void)
629 addr = kgdb_break[i].bpt_addr; 639 addr = kgdb_break[i].bpt_addr;
630 error = kgdb_arch_set_breakpoint(addr, 640 error = kgdb_arch_set_breakpoint(addr,
631 kgdb_break[i].saved_instr); 641 kgdb_break[i].saved_instr);
632 if (error) 642 if (error) {
633 return error; 643 ret = error;
644 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
645 continue;
646 }
634 647
635 kgdb_flush_swbreak_addr(addr); 648 kgdb_flush_swbreak_addr(addr);
636 kgdb_break[i].state = BP_ACTIVE; 649 kgdb_break[i].state = BP_ACTIVE;
637 } 650 }
638 return 0; 651 return ret;
639} 652}
640 653
641static int kgdb_set_sw_break(unsigned long addr) 654static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +695,8 @@ static int kgdb_set_sw_break(unsigned long addr)
682static int kgdb_deactivate_sw_breakpoints(void) 695static int kgdb_deactivate_sw_breakpoints(void)
683{ 696{
684 unsigned long addr; 697 unsigned long addr;
685 int error = 0; 698 int error;
699 int ret = 0;
686 int i; 700 int i;
687 701
688 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 702 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +705,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
691 addr = kgdb_break[i].bpt_addr; 705 addr = kgdb_break[i].bpt_addr;
692 error = kgdb_arch_remove_breakpoint(addr, 706 error = kgdb_arch_remove_breakpoint(addr,
693 kgdb_break[i].saved_instr); 707 kgdb_break[i].saved_instr);
694 if (error) 708 if (error) {
695 return error; 709 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
710 ret = error;
711 }
696 712
697 kgdb_flush_swbreak_addr(addr); 713 kgdb_flush_swbreak_addr(addr);
698 kgdb_break[i].state = BP_SET; 714 kgdb_break[i].state = BP_SET;
699 } 715 }
700 return 0; 716 return ret;
701} 717}
702 718
703static int kgdb_remove_sw_break(unsigned long addr) 719static int kgdb_remove_sw_break(unsigned long addr)
@@ -1204,8 +1220,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1204 return 1; 1220 return 1;
1205 1221
1206 } else { 1222 } else {
1207 error_packet(remcom_out_buffer, -EINVAL); 1223 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1208 return 0; 1224 " and 15 (pass and disconnect)\n"
1225 "Executing a continue without signal passing\n", 0);
1226 remcom_in_buffer[0] = 'c';
1209 } 1227 }
1210 1228
1211 /* Indicate fall through */ 1229 /* Indicate fall through */
@@ -1395,6 +1413,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1395 struct kgdb_state kgdb_var; 1413 struct kgdb_state kgdb_var;
1396 struct kgdb_state *ks = &kgdb_var; 1414 struct kgdb_state *ks = &kgdb_var;
1397 unsigned long flags; 1415 unsigned long flags;
1416 int sstep_tries = 100;
1398 int error = 0; 1417 int error = 0;
1399 int i, cpu; 1418 int i, cpu;
1400 1419
@@ -1425,15 +1444,16 @@ acquirelock:
1425 cpu_relax(); 1444 cpu_relax();
1426 1445
1427 /* 1446 /*
1428 * Do not start the debugger connection on this CPU if the last 1447 * For single stepping, try to only enter on the processor
1429 * instance of the exception handler wanted to come into the 1448 * that was single stepping. To gaurd against a deadlock, the
1430 * debugger on a different CPU via a single step 1449 * kernel will only try for the value of sstep_tries before
1450 * giving up and continuing on.
1431 */ 1451 */
1432 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && 1452 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1453 (kgdb_info[cpu].task &&
1434 1454 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1435 atomic_set(&kgdb_active, -1); 1455 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog(); 1456 touch_softlockup_watchdog_sync();
1437 clocksource_touch_watchdog(); 1457 clocksource_touch_watchdog();
1438 local_irq_restore(flags); 1458 local_irq_restore(flags);
1439 1459
@@ -1524,9 +1544,16 @@ acquirelock:
1524 } 1544 }
1525 1545
1526kgdb_restore: 1546kgdb_restore:
1547 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1548 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1549 if (kgdb_info[sstep_cpu].task)
1550 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1551 else
1552 kgdb_sstep_pid = 0;
1553 }
1527 /* Free kgdb_active */ 1554 /* Free kgdb_active */
1528 atomic_set(&kgdb_active, -1); 1555 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog(); 1556 touch_softlockup_watchdog_sync();
1530 clocksource_touch_watchdog(); 1557 clocksource_touch_watchdog();
1531 local_irq_restore(flags); 1558 local_irq_restore(flags);
1532 1559
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 25b10319036..bf0e231d970 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
520 return -ENOMEM; 520 return -ENOMEM;
521 521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp); 522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) 523 if (ret < 0) {
524 goto out; 524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
525 527
526 return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); 528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
527 531
528 out:
529 call_usermodehelper_freeinfo(sub_info);
530 return ret; 532 return ret;
531} 533}
532EXPORT_SYMBOL(call_usermodehelper_pipe); 534EXPORT_SYMBOL(call_usermodehelper_pipe);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5342a344c4..b7df302a020 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1035,7 +1035,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1035 /* Pre-allocate memory for max kretprobe instances */ 1035 /* Pre-allocate memory for max kretprobe instances */
1036 if (rp->maxactive <= 0) { 1036 if (rp->maxactive <= 0) {
1037#ifdef CONFIG_PREEMPT 1037#ifdef CONFIG_PREEMPT
1038 rp->maxactive = max(10, 2 * num_possible_cpus()); 1038 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
1039#else 1039#else
1040 rp->maxactive = num_possible_cpus(); 1040 rp->maxactive = num_possible_cpus();
1041#endif 1041#endif
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 528dd78e7e7..3feaf5a7451 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
100} 100}
101KERNEL_ATTR_RO(kexec_crash_loaded); 101KERNEL_ATTR_RO(kexec_crash_loaded);
102 102
103static ssize_t kexec_crash_size_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf)
105{
106 return sprintf(buf, "%zu\n", crash_get_memory_size());
107}
108static ssize_t kexec_crash_size_store(struct kobject *kobj,
109 struct kobj_attribute *attr,
110 const char *buf, size_t count)
111{
112 unsigned long cnt;
113 int ret;
114
115 if (strict_strtoul(buf, 0, &cnt))
116 return -EINVAL;
117
118 ret = crash_shrink_memory(cnt);
119 return ret < 0 ? ret : count;
120}
121KERNEL_ATTR_RW(kexec_crash_size);
122
103static ssize_t vmcoreinfo_show(struct kobject *kobj, 123static ssize_t vmcoreinfo_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf) 124 struct kobj_attribute *attr, char *buf)
105{ 125{
@@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = {
147#ifdef CONFIG_KEXEC 167#ifdef CONFIG_KEXEC
148 &kexec_loaded_attr.attr, 168 &kexec_loaded_attr.attr,
149 &kexec_crash_loaded_attr.attr, 169 &kexec_crash_loaded_attr.attr,
170 &kexec_crash_size_attr.attr,
150 &vmcoreinfo_attr.attr, 171 &vmcoreinfo_attr.attr,
151#endif 172#endif
152 NULL 173 NULL
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ab7ae57773e..fbb6222fe7e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -150,6 +150,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
150EXPORT_SYMBOL(kthread_create); 150EXPORT_SYMBOL(kthread_create);
151 151
152/** 152/**
153 * kthread_bind - bind a just-created kthread to a cpu.
154 * @p: thread created by kthread_create().
155 * @cpu: cpu (might not be online, must be possible) for @k to run on.
156 *
157 * Description: This function is equivalent to set_cpus_allowed(),
158 * except that @cpu doesn't need to be online, and the thread must be
159 * stopped (i.e., just returned from kthread_create()).
160 */
161void kthread_bind(struct task_struct *p, unsigned int cpu)
162{
163 /* Must have done schedule() in kthread() before we set_task_cpu */
164 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
165 WARN_ON(1);
166 return;
167 }
168
169 p->cpus_allowed = cpumask_of_cpu(cpu);
170 p->rt.nr_cpus_allowed = 1;
171 p->flags |= PF_THREAD_BOUND;
172}
173EXPORT_SYMBOL(kthread_bind);
174
175/**
153 * kthread_stop - stop a thread created by kthread_create(). 176 * kthread_stop - stop a thread created by kthread_create().
154 * @k: thread created by kthread_create(). 177 * @k: thread created by kthread_create().
155 * 178 *
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f5dcd36d315..c62ec14609b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -73,11 +73,11 @@ module_param(lock_stat, int, 0644);
73 * to use a raw spinlock - we really dont want the spinlock 73 * to use a raw spinlock - we really dont want the spinlock
74 * code to recurse back into the lockdep code... 74 * code to recurse back into the lockdep code...
75 */ 75 */
76static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 76static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
77 77
78static int graph_lock(void) 78static int graph_lock(void)
79{ 79{
80 __raw_spin_lock(&lockdep_lock); 80 arch_spin_lock(&lockdep_lock);
81 /* 81 /*
82 * Make sure that if another CPU detected a bug while 82 * Make sure that if another CPU detected a bug while
83 * walking the graph we dont change it (while the other 83 * walking the graph we dont change it (while the other
@@ -85,7 +85,7 @@ static int graph_lock(void)
85 * dropped already) 85 * dropped already)
86 */ 86 */
87 if (!debug_locks) { 87 if (!debug_locks) {
88 __raw_spin_unlock(&lockdep_lock); 88 arch_spin_unlock(&lockdep_lock);
89 return 0; 89 return 0;
90 } 90 }
91 /* prevent any recursions within lockdep from causing deadlocks */ 91 /* prevent any recursions within lockdep from causing deadlocks */
@@ -95,11 +95,11 @@ static int graph_lock(void)
95 95
96static inline int graph_unlock(void) 96static inline int graph_unlock(void)
97{ 97{
98 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) 98 if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
99 return DEBUG_LOCKS_WARN_ON(1); 99 return DEBUG_LOCKS_WARN_ON(1);
100 100
101 current->lockdep_recursion--; 101 current->lockdep_recursion--;
102 __raw_spin_unlock(&lockdep_lock); 102 arch_spin_unlock(&lockdep_lock);
103 return 0; 103 return 0;
104} 104}
105 105
@@ -111,7 +111,7 @@ static inline int debug_locks_off_graph_unlock(void)
111{ 111{
112 int ret = debug_locks_off(); 112 int ret = debug_locks_off();
113 113
114 __raw_spin_unlock(&lockdep_lock); 114 arch_spin_unlock(&lockdep_lock);
115 115
116 return ret; 116 return ret;
117} 117}
@@ -140,7 +140,8 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
140} 140}
141 141
142#ifdef CONFIG_LOCK_STAT 142#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
144 cpu_lock_stats);
144 145
145static inline u64 lockstat_clock(void) 146static inline u64 lockstat_clock(void)
146{ 147{
@@ -168,7 +169,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
168 if (time > lt->max) 169 if (time > lt->max)
169 lt->max = time; 170 lt->max = time;
170 171
171 if (time < lt->min || !lt->min) 172 if (time < lt->min || !lt->nr)
172 lt->min = time; 173 lt->min = time;
173 174
174 lt->total += time; 175 lt->total += time;
@@ -177,8 +178,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
177 178
178static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) 179static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
179{ 180{
180 dst->min += src->min; 181 if (!src->nr)
181 dst->max += src->max; 182 return;
183
184 if (src->max > dst->max)
185 dst->max = src->max;
186
187 if (src->min < dst->min || !dst->nr)
188 dst->min = src->min;
189
182 dst->total += src->total; 190 dst->total += src->total;
183 dst->nr += src->nr; 191 dst->nr += src->nr;
184} 192}
@@ -191,7 +199,7 @@ struct lock_class_stats lock_stats(struct lock_class *class)
191 memset(&stats, 0, sizeof(struct lock_class_stats)); 199 memset(&stats, 0, sizeof(struct lock_class_stats));
192 for_each_possible_cpu(cpu) { 200 for_each_possible_cpu(cpu) {
193 struct lock_class_stats *pcs = 201 struct lock_class_stats *pcs =
194 &per_cpu(lock_stats, cpu)[class - lock_classes]; 202 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
195 203
196 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 204 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
197 stats.contention_point[i] += pcs->contention_point[i]; 205 stats.contention_point[i] += pcs->contention_point[i];
@@ -218,7 +226,7 @@ void clear_lock_stats(struct lock_class *class)
218 226
219 for_each_possible_cpu(cpu) { 227 for_each_possible_cpu(cpu) {
220 struct lock_class_stats *cpu_stats = 228 struct lock_class_stats *cpu_stats =
221 &per_cpu(lock_stats, cpu)[class - lock_classes]; 229 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
222 230
223 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 231 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
224 } 232 }
@@ -228,12 +236,12 @@ void clear_lock_stats(struct lock_class *class)
228 236
229static struct lock_class_stats *get_lock_stats(struct lock_class *class) 237static struct lock_class_stats *get_lock_stats(struct lock_class *class)
230{ 238{
231 return &get_cpu_var(lock_stats)[class - lock_classes]; 239 return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
232} 240}
233 241
234static void put_lock_stats(struct lock_class_stats *stats) 242static void put_lock_stats(struct lock_class_stats *stats)
235{ 243{
236 put_cpu_var(lock_stats); 244 put_cpu_var(cpu_lock_stats);
237} 245}
238 246
239static void lock_release_holdtime(struct held_lock *hlock) 247static void lock_release_holdtime(struct held_lock *hlock)
@@ -379,7 +387,8 @@ static int save_trace(struct stack_trace *trace)
379 * complete trace that maxes out the entries provided will be reported 387 * complete trace that maxes out the entries provided will be reported
380 * as incomplete, friggin useless </rant> 388 * as incomplete, friggin useless </rant>
381 */ 389 */
382 if (trace->entries[trace->nr_entries-1] == ULONG_MAX) 390 if (trace->nr_entries != 0 &&
391 trace->entries[trace->nr_entries-1] == ULONG_MAX)
383 trace->nr_entries--; 392 trace->nr_entries--;
384 393
385 trace->max_entries = trace->nr_entries; 394 trace->max_entries = trace->nr_entries;
@@ -1161,9 +1170,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
1161 this.class = class; 1170 this.class = class;
1162 1171
1163 local_irq_save(flags); 1172 local_irq_save(flags);
1164 __raw_spin_lock(&lockdep_lock); 1173 arch_spin_lock(&lockdep_lock);
1165 ret = __lockdep_count_forward_deps(&this); 1174 ret = __lockdep_count_forward_deps(&this);
1166 __raw_spin_unlock(&lockdep_lock); 1175 arch_spin_unlock(&lockdep_lock);
1167 local_irq_restore(flags); 1176 local_irq_restore(flags);
1168 1177
1169 return ret; 1178 return ret;
@@ -1188,9 +1197,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1188 this.class = class; 1197 this.class = class;
1189 1198
1190 local_irq_save(flags); 1199 local_irq_save(flags);
1191 __raw_spin_lock(&lockdep_lock); 1200 arch_spin_lock(&lockdep_lock);
1192 ret = __lockdep_count_backward_deps(&this); 1201 ret = __lockdep_count_backward_deps(&this);
1193 __raw_spin_unlock(&lockdep_lock); 1202 arch_spin_unlock(&lockdep_lock);
1194 local_irq_restore(flags); 1203 local_irq_restore(flags);
1195 1204
1196 return ret; 1205 return ret;
@@ -2138,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2138 return ret; 2147 return ret;
2139 2148
2140 return print_irq_inversion_bug(curr, &root, target_entry, 2149 return print_irq_inversion_bug(curr, &root, target_entry,
2141 this, 1, irqclass); 2150 this, 0, irqclass);
2142} 2151}
2143 2152
2144void print_irqtrace_events(struct task_struct *curr) 2153void print_irqtrace_events(struct task_struct *curr)
diff --git a/kernel/module.c b/kernel/module.c
index 5842a71cf05..f82386bd9ee 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,8 +370,6 @@ EXPORT_SYMBOL_GPL(find_module);
370 370
371#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
372 372
373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
374
375static void *percpu_modalloc(unsigned long size, unsigned long align, 373static void *percpu_modalloc(unsigned long size, unsigned long align,
376 const char *name) 374 const char *name)
377{ 375{
@@ -395,154 +393,6 @@ static void percpu_modfree(void *freeme)
395 free_percpu(freeme); 393 free_percpu(freeme);
396} 394}
397 395
398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
399
400/* Number of blocks used and allocated. */
401static unsigned int pcpu_num_used, pcpu_num_allocated;
402/* Size of each block. -ve means used. */
403static int *pcpu_size;
404
405static int split_block(unsigned int i, unsigned short size)
406{
407 /* Reallocation required? */
408 if (pcpu_num_used + 1 > pcpu_num_allocated) {
409 int *new;
410
411 new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
412 GFP_KERNEL);
413 if (!new)
414 return 0;
415
416 pcpu_num_allocated *= 2;
417 pcpu_size = new;
418 }
419
420 /* Insert a new subblock */
421 memmove(&pcpu_size[i+1], &pcpu_size[i],
422 sizeof(pcpu_size[0]) * (pcpu_num_used - i));
423 pcpu_num_used++;
424
425 pcpu_size[i+1] -= size;
426 pcpu_size[i] = size;
427 return 1;
428}
429
430static inline unsigned int block_size(int val)
431{
432 if (val < 0)
433 return -val;
434 return val;
435}
436
437static void *percpu_modalloc(unsigned long size, unsigned long align,
438 const char *name)
439{
440 unsigned long extra;
441 unsigned int i;
442 void *ptr;
443 int cpu;
444
445 if (align > PAGE_SIZE) {
446 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
447 name, align, PAGE_SIZE);
448 align = PAGE_SIZE;
449 }
450
451 ptr = __per_cpu_start;
452 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
453 /* Extra for alignment requirement. */
454 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
455 BUG_ON(i == 0 && extra != 0);
456
457 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
458 continue;
459
460 /* Transfer extra to previous block. */
461 if (pcpu_size[i-1] < 0)
462 pcpu_size[i-1] -= extra;
463 else
464 pcpu_size[i-1] += extra;
465 pcpu_size[i] -= extra;
466 ptr += extra;
467
468 /* Split block if warranted */
469 if (pcpu_size[i] - size > sizeof(unsigned long))
470 if (!split_block(i, size))
471 return NULL;
472
473 /* add the per-cpu scanning areas */
474 for_each_possible_cpu(cpu)
475 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
476 GFP_KERNEL);
477
478 /* Mark allocated */
479 pcpu_size[i] = -pcpu_size[i];
480 return ptr;
481 }
482
483 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
484 size);
485 return NULL;
486}
487
488static void percpu_modfree(void *freeme)
489{
490 unsigned int i;
491 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
492 int cpu;
493
494 /* First entry is core kernel percpu data. */
495 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
496 if (ptr == freeme) {
497 pcpu_size[i] = -pcpu_size[i];
498 goto free;
499 }
500 }
501 BUG();
502
503 free:
504 /* remove the per-cpu scanning areas */
505 for_each_possible_cpu(cpu)
506 kmemleak_free(freeme + per_cpu_offset(cpu));
507
508 /* Merge with previous? */
509 if (pcpu_size[i-1] >= 0) {
510 pcpu_size[i-1] += pcpu_size[i];
511 pcpu_num_used--;
512 memmove(&pcpu_size[i], &pcpu_size[i+1],
513 (pcpu_num_used - i) * sizeof(pcpu_size[0]));
514 i--;
515 }
516 /* Merge with next? */
517 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
518 pcpu_size[i] += pcpu_size[i+1];
519 pcpu_num_used--;
520 memmove(&pcpu_size[i+1], &pcpu_size[i+2],
521 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
522 }
523}
524
525static int percpu_modinit(void)
526{
527 pcpu_num_used = 2;
528 pcpu_num_allocated = 2;
529 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
530 GFP_KERNEL);
531 /* Static in-kernel percpu data (used). */
532 pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
533 /* Free room. */
534 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
535 if (pcpu_size[1] < 0) {
536 printk(KERN_ERR "No per-cpu room for modules.\n");
537 pcpu_num_used = 1;
538 }
539
540 return 0;
541}
542__initcall(percpu_modinit);
543
544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
545
546static unsigned int find_pcpusec(Elf_Ehdr *hdr, 396static unsigned int find_pcpusec(Elf_Ehdr *hdr,
547 Elf_Shdr *sechdrs, 397 Elf_Shdr *sechdrs,
548 const char *secstrings) 398 const char *secstrings)
@@ -1030,11 +880,23 @@ static int try_to_force_load(struct module *mod, const char *reason)
1030} 880}
1031 881
1032#ifdef CONFIG_MODVERSIONS 882#ifdef CONFIG_MODVERSIONS
883/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */
884static unsigned long maybe_relocated(unsigned long crc,
885 const struct module *crc_owner)
886{
887#ifdef ARCH_RELOCATES_KCRCTAB
888 if (crc_owner == NULL)
889 return crc - (unsigned long)reloc_start;
890#endif
891 return crc;
892}
893
1033static int check_version(Elf_Shdr *sechdrs, 894static int check_version(Elf_Shdr *sechdrs,
1034 unsigned int versindex, 895 unsigned int versindex,
1035 const char *symname, 896 const char *symname,
1036 struct module *mod, 897 struct module *mod,
1037 const unsigned long *crc) 898 const unsigned long *crc,
899 const struct module *crc_owner)
1038{ 900{
1039 unsigned int i, num_versions; 901 unsigned int i, num_versions;
1040 struct modversion_info *versions; 902 struct modversion_info *versions;
@@ -1055,10 +917,10 @@ static int check_version(Elf_Shdr *sechdrs,
1055 if (strcmp(versions[i].name, symname) != 0) 917 if (strcmp(versions[i].name, symname) != 0)
1056 continue; 918 continue;
1057 919
1058 if (versions[i].crc == *crc) 920 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 921 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 922 DEBUGP("Found checksum %lX vs module %lX\n",
1061 *crc, versions[i].crc); 923 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 924 goto bad_version;
1063 } 925 }
1064 926
@@ -1081,7 +943,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1081 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 943 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1082 &crc, true, false)) 944 &crc, true, false))
1083 BUG(); 945 BUG();
1084 return check_version(sechdrs, versindex, "module_layout", mod, crc); 946 return check_version(sechdrs, versindex, "module_layout", mod, crc,
947 NULL);
1085} 948}
1086 949
1087/* First part is kernel version, which we ignore if module has crcs. */ 950/* First part is kernel version, which we ignore if module has crcs. */
@@ -1099,7 +962,8 @@ static inline int check_version(Elf_Shdr *sechdrs,
1099 unsigned int versindex, 962 unsigned int versindex,
1100 const char *symname, 963 const char *symname,
1101 struct module *mod, 964 struct module *mod,
1102 const unsigned long *crc) 965 const unsigned long *crc,
966 const struct module *crc_owner)
1103{ 967{
1104 return 1; 968 return 1;
1105} 969}
@@ -1134,8 +998,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1134 /* use_module can fail due to OOM, 998 /* use_module can fail due to OOM,
1135 or module initialization or unloading */ 999 or module initialization or unloading */
1136 if (sym) { 1000 if (sym) {
1137 if (!check_version(sechdrs, versindex, name, mod, crc) || 1001 if (!check_version(sechdrs, versindex, name, mod, crc, owner)
1138 !use_module(mod, owner)) 1002 || !use_module(mod, owner))
1139 sym = NULL; 1003 sym = NULL;
1140 } 1004 }
1141 return sym; 1005 return sym;
@@ -1146,6 +1010,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1146 * J. Corbet <corbet@lwn.net> 1010 * J. Corbet <corbet@lwn.net>
1147 */ 1011 */
1148#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1012#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1013
1014static inline bool sect_empty(const Elf_Shdr *sect)
1015{
1016 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1017}
1018
1149struct module_sect_attr 1019struct module_sect_attr
1150{ 1020{
1151 struct module_attribute mattr; 1021 struct module_attribute mattr;
@@ -1187,8 +1057,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1187 1057
1188 /* Count loaded sections and allocate structures */ 1058 /* Count loaded sections and allocate structures */
1189 for (i = 0; i < nsect; i++) 1059 for (i = 0; i < nsect; i++)
1190 if (sechdrs[i].sh_flags & SHF_ALLOC 1060 if (!sect_empty(&sechdrs[i]))
1191 && sechdrs[i].sh_size)
1192 nloaded++; 1061 nloaded++;
1193 size[0] = ALIGN(sizeof(*sect_attrs) 1062 size[0] = ALIGN(sizeof(*sect_attrs)
1194 + nloaded * sizeof(sect_attrs->attrs[0]), 1063 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1206,9 +1075,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1206 sattr = &sect_attrs->attrs[0]; 1075 sattr = &sect_attrs->attrs[0];
1207 gattr = &sect_attrs->grp.attrs[0]; 1076 gattr = &sect_attrs->grp.attrs[0];
1208 for (i = 0; i < nsect; i++) { 1077 for (i = 0; i < nsect; i++) {
1209 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1078 if (sect_empty(&sechdrs[i]))
1210 continue;
1211 if (!sechdrs[i].sh_size)
1212 continue; 1079 continue;
1213 sattr->address = sechdrs[i].sh_addr; 1080 sattr->address = sechdrs[i].sh_addr;
1214 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1081 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
@@ -1292,7 +1159,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1292 /* Count notes sections and allocate structures. */ 1159 /* Count notes sections and allocate structures. */
1293 notes = 0; 1160 notes = 0;
1294 for (i = 0; i < nsect; i++) 1161 for (i = 0; i < nsect; i++)
1295 if ((sechdrs[i].sh_flags & SHF_ALLOC) && 1162 if (!sect_empty(&sechdrs[i]) &&
1296 (sechdrs[i].sh_type == SHT_NOTE)) 1163 (sechdrs[i].sh_type == SHT_NOTE))
1297 ++notes; 1164 ++notes;
1298 1165
@@ -1308,7 +1175,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1308 notes_attrs->notes = notes; 1175 notes_attrs->notes = notes;
1309 nattr = &notes_attrs->attrs[0]; 1176 nattr = &notes_attrs->attrs[0];
1310 for (loaded = i = 0; i < nsect; ++i) { 1177 for (loaded = i = 0; i < nsect; ++i) {
1311 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1178 if (sect_empty(&sechdrs[i]))
1312 continue; 1179 continue;
1313 if (sechdrs[i].sh_type == SHT_NOTE) { 1180 if (sechdrs[i].sh_type == SHT_NOTE) {
1314 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
@@ -2046,9 +1913,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
2046 unsigned int i; 1913 unsigned int i;
2047 1914
2048 /* only scan the sections containing data */ 1915 /* only scan the sections containing data */
2049 kmemleak_scan_area(mod->module_core, (unsigned long)mod - 1916 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2050 (unsigned long)mod->module_core,
2051 sizeof(struct module), GFP_KERNEL);
2052 1917
2053 for (i = 1; i < hdr->e_shnum; i++) { 1918 for (i = 1; i < hdr->e_shnum; i++) {
2054 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1919 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -2057,8 +1922,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
2057 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) 1922 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2058 continue; 1923 continue;
2059 1924
2060 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - 1925 kmemleak_scan_area((void *)sechdrs[i].sh_addr,
2061 (unsigned long)mod->module_core,
2062 sechdrs[i].sh_size, GFP_KERNEL); 1926 sechdrs[i].sh_size, GFP_KERNEL);
2063 } 1927 }
2064} 1928}
@@ -2386,6 +2250,12 @@ static noinline struct module *load_module(void __user *umod,
2386 "_ftrace_events", 2250 "_ftrace_events",
2387 sizeof(*mod->trace_events), 2251 sizeof(*mod->trace_events),
2388 &mod->num_trace_events); 2252 &mod->num_trace_events);
2253 /*
2254 * This section contains pointers to allocated objects in the trace
2255 * code and not scanning it leads to false positives.
2256 */
2257 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2258 mod->num_trace_events, GFP_KERNEL);
2389#endif 2259#endif
2390#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2260#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2391 /* sechdrs[0].sh_size is always zero */ 2261 /* sechdrs[0].sh_size is always zero */
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 6b2d735846a..57d527a16f9 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock)
43 \ 43 \
44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \ 44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \
45 local_irq_save(flags); \ 45 local_irq_save(flags); \
46 __raw_spin_lock(&(lock)->raw_lock); \ 46 arch_spin_lock(&(lock)->rlock.raw_lock);\
47 DEBUG_LOCKS_WARN_ON(l->magic != l); \ 47 DEBUG_LOCKS_WARN_ON(l->magic != l); \
48 } while (0) 48 } while (0)
49 49
50#define spin_unlock_mutex(lock, flags) \ 50#define spin_unlock_mutex(lock, flags) \
51 do { \ 51 do { \
52 __raw_spin_unlock(&(lock)->raw_lock); \ 52 arch_spin_unlock(&(lock)->rlock.raw_lock); \
53 local_irq_restore(flags); \ 53 local_irq_restore(flags); \
54 preempt_check_resched(); \ 54 preempt_check_resched(); \
55 } while (0) 55 } while (0)
diff --git a/kernel/panic.c b/kernel/panic.c
index 96b45d0b4ba..c787333282b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/debug_locks.h> 11#include <linux/debug_locks.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/kmsg_dump.h>
13#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
14#include <linux/notifier.h> 15#include <linux/notifier.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -81,6 +82,8 @@ NORET_TYPE void panic(const char * fmt, ...)
81 */ 82 */
82 crash_kexec(NULL); 83 crash_kexec(NULL);
83 84
85 kmsg_dump(KMSG_DUMP_PANIC);
86
84 /* 87 /*
85 * Note smp_send_stop is the usual smp shutdown function, which 88 * Note smp_send_stop is the usual smp shutdown function, which
86 * unfortunately means it may not be hardened to work in a panic 89 * unfortunately means it may not be hardened to work in a panic
@@ -339,6 +342,7 @@ void oops_exit(void)
339{ 342{
340 do_oops_enter_exit(); 343 do_oops_enter_exit();
341 print_oops_end_marker(); 344 print_oops_end_marker();
345 kmsg_dump(KMSG_DUMP_OOPS);
342} 346}
343 347
344#ifdef WANT_WARN_ON_SLOWPATH 348#ifdef WANT_WARN_ON_SLOWPATH
diff --git a/kernel/params.c b/kernel/params.c
index d656c276508..cf1b6918312 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,7 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
27 28
28#if 0 29#if 0
29#define DEBUGP printk 30#define DEBUGP printk
@@ -122,9 +123,7 @@ static char *next_arg(char *args, char **param, char **val)
122 next = args + i; 123 next = args + i;
123 124
124 /* Chew up trailing spaces. */ 125 /* Chew up trailing spaces. */
125 while (isspace(*next)) 126 return skip_spaces(next);
126 next++;
127 return next;
128} 127}
129 128
130/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 129/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
@@ -139,8 +138,7 @@ int parse_args(const char *name,
139 DEBUGP("Parsing ARGS: %s\n", args); 138 DEBUGP("Parsing ARGS: %s\n", args);
140 139
141 /* Chew leading spaces */ 140 /* Chew leading spaces */
142 while (isspace(*args)) 141 args = skip_spaces(args);
143 args++;
144 142
145 while (*args) { 143 while (*args) {
146 int ret; 144 int ret;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6b7ddba1dd6..2b19297742c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -36,7 +36,7 @@
36/* 36/*
37 * Each CPU has a list of per CPU events: 37 * Each CPU has a list of per CPU events:
38 */ 38 */
39DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 39static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
40 40
41int perf_max_events __read_mostly = 1; 41int perf_max_events __read_mostly = 1;
42static int perf_reserved_percpu __read_mostly; 42static int perf_reserved_percpu __read_mostly;
@@ -203,14 +203,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
203 * if so. If we locked the right context, then it 203 * if so. If we locked the right context, then it
204 * can't get swapped on us any more. 204 * can't get swapped on us any more.
205 */ 205 */
206 spin_lock_irqsave(&ctx->lock, *flags); 206 raw_spin_lock_irqsave(&ctx->lock, *flags);
207 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 207 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
208 spin_unlock_irqrestore(&ctx->lock, *flags); 208 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
209 goto retry; 209 goto retry;
210 } 210 }
211 211
212 if (!atomic_inc_not_zero(&ctx->refcount)) { 212 if (!atomic_inc_not_zero(&ctx->refcount)) {
213 spin_unlock_irqrestore(&ctx->lock, *flags); 213 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
214 ctx = NULL; 214 ctx = NULL;
215 } 215 }
216 } 216 }
@@ -231,7 +231,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task
231 ctx = perf_lock_task_context(task, &flags); 231 ctx = perf_lock_task_context(task, &flags);
232 if (ctx) { 232 if (ctx) {
233 ++ctx->pin_count; 233 ++ctx->pin_count;
234 spin_unlock_irqrestore(&ctx->lock, flags); 234 raw_spin_unlock_irqrestore(&ctx->lock, flags);
235 } 235 }
236 return ctx; 236 return ctx;
237} 237}
@@ -240,9 +240,9 @@ static void perf_unpin_context(struct perf_event_context *ctx)
240{ 240{
241 unsigned long flags; 241 unsigned long flags;
242 242
243 spin_lock_irqsave(&ctx->lock, flags); 243 raw_spin_lock_irqsave(&ctx->lock, flags);
244 --ctx->pin_count; 244 --ctx->pin_count;
245 spin_unlock_irqrestore(&ctx->lock, flags); 245 raw_spin_unlock_irqrestore(&ctx->lock, flags);
246 put_ctx(ctx); 246 put_ctx(ctx);
247} 247}
248 248
@@ -427,7 +427,7 @@ static void __perf_event_remove_from_context(void *info)
427 if (ctx->task && cpuctx->task_ctx != ctx) 427 if (ctx->task && cpuctx->task_ctx != ctx)
428 return; 428 return;
429 429
430 spin_lock(&ctx->lock); 430 raw_spin_lock(&ctx->lock);
431 /* 431 /*
432 * Protect the list operation against NMI by disabling the 432 * Protect the list operation against NMI by disabling the
433 * events on a global level. 433 * events on a global level.
@@ -449,7 +449,7 @@ static void __perf_event_remove_from_context(void *info)
449 } 449 }
450 450
451 perf_enable(); 451 perf_enable();
452 spin_unlock(&ctx->lock); 452 raw_spin_unlock(&ctx->lock);
453} 453}
454 454
455 455
@@ -476,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
476 if (!task) { 476 if (!task) {
477 /* 477 /*
478 * Per cpu events are removed via an smp call and 478 * Per cpu events are removed via an smp call and
479 * the removal is always sucessful. 479 * the removal is always successful.
480 */ 480 */
481 smp_call_function_single(event->cpu, 481 smp_call_function_single(event->cpu,
482 __perf_event_remove_from_context, 482 __perf_event_remove_from_context,
@@ -488,12 +488,12 @@ retry:
488 task_oncpu_function_call(task, __perf_event_remove_from_context, 488 task_oncpu_function_call(task, __perf_event_remove_from_context,
489 event); 489 event);
490 490
491 spin_lock_irq(&ctx->lock); 491 raw_spin_lock_irq(&ctx->lock);
492 /* 492 /*
493 * If the context is active we need to retry the smp call. 493 * If the context is active we need to retry the smp call.
494 */ 494 */
495 if (ctx->nr_active && !list_empty(&event->group_entry)) { 495 if (ctx->nr_active && !list_empty(&event->group_entry)) {
496 spin_unlock_irq(&ctx->lock); 496 raw_spin_unlock_irq(&ctx->lock);
497 goto retry; 497 goto retry;
498 } 498 }
499 499
@@ -504,7 +504,7 @@ retry:
504 */ 504 */
505 if (!list_empty(&event->group_entry)) 505 if (!list_empty(&event->group_entry))
506 list_del_event(event, ctx); 506 list_del_event(event, ctx);
507 spin_unlock_irq(&ctx->lock); 507 raw_spin_unlock_irq(&ctx->lock);
508} 508}
509 509
510/* 510/*
@@ -535,7 +535,7 @@ static void __perf_event_disable(void *info)
535 if (ctx->task && cpuctx->task_ctx != ctx) 535 if (ctx->task && cpuctx->task_ctx != ctx)
536 return; 536 return;
537 537
538 spin_lock(&ctx->lock); 538 raw_spin_lock(&ctx->lock);
539 539
540 /* 540 /*
541 * If the event is on, turn it off. 541 * If the event is on, turn it off.
@@ -551,7 +551,7 @@ static void __perf_event_disable(void *info)
551 event->state = PERF_EVENT_STATE_OFF; 551 event->state = PERF_EVENT_STATE_OFF;
552 } 552 }
553 553
554 spin_unlock(&ctx->lock); 554 raw_spin_unlock(&ctx->lock);
555} 555}
556 556
557/* 557/*
@@ -567,7 +567,7 @@ static void __perf_event_disable(void *info)
567 * is the current context on this CPU and preemption is disabled, 567 * is the current context on this CPU and preemption is disabled,
568 * hence we can't get into perf_event_task_sched_out for this context. 568 * hence we can't get into perf_event_task_sched_out for this context.
569 */ 569 */
570static void perf_event_disable(struct perf_event *event) 570void perf_event_disable(struct perf_event *event)
571{ 571{
572 struct perf_event_context *ctx = event->ctx; 572 struct perf_event_context *ctx = event->ctx;
573 struct task_struct *task = ctx->task; 573 struct task_struct *task = ctx->task;
@@ -584,12 +584,12 @@ static void perf_event_disable(struct perf_event *event)
584 retry: 584 retry:
585 task_oncpu_function_call(task, __perf_event_disable, event); 585 task_oncpu_function_call(task, __perf_event_disable, event);
586 586
587 spin_lock_irq(&ctx->lock); 587 raw_spin_lock_irq(&ctx->lock);
588 /* 588 /*
589 * If the event is still active, we need to retry the cross-call. 589 * If the event is still active, we need to retry the cross-call.
590 */ 590 */
591 if (event->state == PERF_EVENT_STATE_ACTIVE) { 591 if (event->state == PERF_EVENT_STATE_ACTIVE) {
592 spin_unlock_irq(&ctx->lock); 592 raw_spin_unlock_irq(&ctx->lock);
593 goto retry; 593 goto retry;
594 } 594 }
595 595
@@ -602,7 +602,7 @@ static void perf_event_disable(struct perf_event *event)
602 event->state = PERF_EVENT_STATE_OFF; 602 event->state = PERF_EVENT_STATE_OFF;
603 } 603 }
604 604
605 spin_unlock_irq(&ctx->lock); 605 raw_spin_unlock_irq(&ctx->lock);
606} 606}
607 607
608static int 608static int
@@ -770,7 +770,7 @@ static void __perf_install_in_context(void *info)
770 cpuctx->task_ctx = ctx; 770 cpuctx->task_ctx = ctx;
771 } 771 }
772 772
773 spin_lock(&ctx->lock); 773 raw_spin_lock(&ctx->lock);
774 ctx->is_active = 1; 774 ctx->is_active = 1;
775 update_context_time(ctx); 775 update_context_time(ctx);
776 776
@@ -782,6 +782,9 @@ static void __perf_install_in_context(void *info)
782 782
783 add_event_to_ctx(event, ctx); 783 add_event_to_ctx(event, ctx);
784 784
785 if (event->cpu != -1 && event->cpu != smp_processor_id())
786 goto unlock;
787
785 /* 788 /*
786 * Don't put the event on if it is disabled or if 789 * Don't put the event on if it is disabled or if
787 * it is in a group and the group isn't on. 790 * it is in a group and the group isn't on.
@@ -820,7 +823,7 @@ static void __perf_install_in_context(void *info)
820 unlock: 823 unlock:
821 perf_enable(); 824 perf_enable();
822 825
823 spin_unlock(&ctx->lock); 826 raw_spin_unlock(&ctx->lock);
824} 827}
825 828
826/* 829/*
@@ -845,7 +848,7 @@ perf_install_in_context(struct perf_event_context *ctx,
845 if (!task) { 848 if (!task) {
846 /* 849 /*
847 * Per cpu events are installed via an smp call and 850 * Per cpu events are installed via an smp call and
848 * the install is always sucessful. 851 * the install is always successful.
849 */ 852 */
850 smp_call_function_single(cpu, __perf_install_in_context, 853 smp_call_function_single(cpu, __perf_install_in_context,
851 event, 1); 854 event, 1);
@@ -856,12 +859,12 @@ retry:
856 task_oncpu_function_call(task, __perf_install_in_context, 859 task_oncpu_function_call(task, __perf_install_in_context,
857 event); 860 event);
858 861
859 spin_lock_irq(&ctx->lock); 862 raw_spin_lock_irq(&ctx->lock);
860 /* 863 /*
861 * we need to retry the smp call. 864 * we need to retry the smp call.
862 */ 865 */
863 if (ctx->is_active && list_empty(&event->group_entry)) { 866 if (ctx->is_active && list_empty(&event->group_entry)) {
864 spin_unlock_irq(&ctx->lock); 867 raw_spin_unlock_irq(&ctx->lock);
865 goto retry; 868 goto retry;
866 } 869 }
867 870
@@ -872,7 +875,7 @@ retry:
872 */ 875 */
873 if (list_empty(&event->group_entry)) 876 if (list_empty(&event->group_entry))
874 add_event_to_ctx(event, ctx); 877 add_event_to_ctx(event, ctx);
875 spin_unlock_irq(&ctx->lock); 878 raw_spin_unlock_irq(&ctx->lock);
876} 879}
877 880
878/* 881/*
@@ -917,7 +920,7 @@ static void __perf_event_enable(void *info)
917 cpuctx->task_ctx = ctx; 920 cpuctx->task_ctx = ctx;
918 } 921 }
919 922
920 spin_lock(&ctx->lock); 923 raw_spin_lock(&ctx->lock);
921 ctx->is_active = 1; 924 ctx->is_active = 1;
922 update_context_time(ctx); 925 update_context_time(ctx);
923 926
@@ -925,6 +928,9 @@ static void __perf_event_enable(void *info)
925 goto unlock; 928 goto unlock;
926 __perf_event_mark_enabled(event, ctx); 929 __perf_event_mark_enabled(event, ctx);
927 930
931 if (event->cpu != -1 && event->cpu != smp_processor_id())
932 goto unlock;
933
928 /* 934 /*
929 * If the event is in a group and isn't the group leader, 935 * If the event is in a group and isn't the group leader,
930 * then don't put it on unless the group is on. 936 * then don't put it on unless the group is on.
@@ -959,7 +965,7 @@ static void __perf_event_enable(void *info)
959 } 965 }
960 966
961 unlock: 967 unlock:
962 spin_unlock(&ctx->lock); 968 raw_spin_unlock(&ctx->lock);
963} 969}
964 970
965/* 971/*
@@ -971,7 +977,7 @@ static void __perf_event_enable(void *info)
971 * perf_event_for_each_child or perf_event_for_each as described 977 * perf_event_for_each_child or perf_event_for_each as described
972 * for perf_event_disable. 978 * for perf_event_disable.
973 */ 979 */
974static void perf_event_enable(struct perf_event *event) 980void perf_event_enable(struct perf_event *event)
975{ 981{
976 struct perf_event_context *ctx = event->ctx; 982 struct perf_event_context *ctx = event->ctx;
977 struct task_struct *task = ctx->task; 983 struct task_struct *task = ctx->task;
@@ -985,7 +991,7 @@ static void perf_event_enable(struct perf_event *event)
985 return; 991 return;
986 } 992 }
987 993
988 spin_lock_irq(&ctx->lock); 994 raw_spin_lock_irq(&ctx->lock);
989 if (event->state >= PERF_EVENT_STATE_INACTIVE) 995 if (event->state >= PERF_EVENT_STATE_INACTIVE)
990 goto out; 996 goto out;
991 997
@@ -1000,10 +1006,10 @@ static void perf_event_enable(struct perf_event *event)
1000 event->state = PERF_EVENT_STATE_OFF; 1006 event->state = PERF_EVENT_STATE_OFF;
1001 1007
1002 retry: 1008 retry:
1003 spin_unlock_irq(&ctx->lock); 1009 raw_spin_unlock_irq(&ctx->lock);
1004 task_oncpu_function_call(task, __perf_event_enable, event); 1010 task_oncpu_function_call(task, __perf_event_enable, event);
1005 1011
1006 spin_lock_irq(&ctx->lock); 1012 raw_spin_lock_irq(&ctx->lock);
1007 1013
1008 /* 1014 /*
1009 * If the context is active and the event is still off, 1015 * If the context is active and the event is still off,
@@ -1020,7 +1026,7 @@ static void perf_event_enable(struct perf_event *event)
1020 __perf_event_mark_enabled(event, ctx); 1026 __perf_event_mark_enabled(event, ctx);
1021 1027
1022 out: 1028 out:
1023 spin_unlock_irq(&ctx->lock); 1029 raw_spin_unlock_irq(&ctx->lock);
1024} 1030}
1025 1031
1026static int perf_event_refresh(struct perf_event *event, int refresh) 1032static int perf_event_refresh(struct perf_event *event, int refresh)
@@ -1042,7 +1048,7 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1042{ 1048{
1043 struct perf_event *event; 1049 struct perf_event *event;
1044 1050
1045 spin_lock(&ctx->lock); 1051 raw_spin_lock(&ctx->lock);
1046 ctx->is_active = 0; 1052 ctx->is_active = 0;
1047 if (likely(!ctx->nr_events)) 1053 if (likely(!ctx->nr_events))
1048 goto out; 1054 goto out;
@@ -1055,7 +1061,7 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1055 } 1061 }
1056 perf_enable(); 1062 perf_enable();
1057 out: 1063 out:
1058 spin_unlock(&ctx->lock); 1064 raw_spin_unlock(&ctx->lock);
1059} 1065}
1060 1066
1061/* 1067/*
@@ -1193,8 +1199,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1193 * order we take the locks because no other cpu could 1199 * order we take the locks because no other cpu could
1194 * be trying to lock both of these tasks. 1200 * be trying to lock both of these tasks.
1195 */ 1201 */
1196 spin_lock(&ctx->lock); 1202 raw_spin_lock(&ctx->lock);
1197 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 1203 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1198 if (context_equiv(ctx, next_ctx)) { 1204 if (context_equiv(ctx, next_ctx)) {
1199 /* 1205 /*
1200 * XXX do we need a memory barrier of sorts 1206 * XXX do we need a memory barrier of sorts
@@ -1208,8 +1214,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1208 1214
1209 perf_event_sync_stat(ctx, next_ctx); 1215 perf_event_sync_stat(ctx, next_ctx);
1210 } 1216 }
1211 spin_unlock(&next_ctx->lock); 1217 raw_spin_unlock(&next_ctx->lock);
1212 spin_unlock(&ctx->lock); 1218 raw_spin_unlock(&ctx->lock);
1213 } 1219 }
1214 rcu_read_unlock(); 1220 rcu_read_unlock();
1215 1221
@@ -1251,7 +1257,7 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1251 struct perf_event *event; 1257 struct perf_event *event;
1252 int can_add_hw = 1; 1258 int can_add_hw = 1;
1253 1259
1254 spin_lock(&ctx->lock); 1260 raw_spin_lock(&ctx->lock);
1255 ctx->is_active = 1; 1261 ctx->is_active = 1;
1256 if (likely(!ctx->nr_events)) 1262 if (likely(!ctx->nr_events))
1257 goto out; 1263 goto out;
@@ -1306,7 +1312,7 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1306 } 1312 }
1307 perf_enable(); 1313 perf_enable();
1308 out: 1314 out:
1309 spin_unlock(&ctx->lock); 1315 raw_spin_unlock(&ctx->lock);
1310} 1316}
1311 1317
1312/* 1318/*
@@ -1370,11 +1376,14 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1370 struct hw_perf_event *hwc; 1376 struct hw_perf_event *hwc;
1371 u64 interrupts, freq; 1377 u64 interrupts, freq;
1372 1378
1373 spin_lock(&ctx->lock); 1379 raw_spin_lock(&ctx->lock);
1374 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1375 if (event->state != PERF_EVENT_STATE_ACTIVE) 1381 if (event->state != PERF_EVENT_STATE_ACTIVE)
1376 continue; 1382 continue;
1377 1383
1384 if (event->cpu != -1 && event->cpu != smp_processor_id())
1385 continue;
1386
1378 hwc = &event->hw; 1387 hwc = &event->hw;
1379 1388
1380 interrupts = hwc->interrupts; 1389 interrupts = hwc->interrupts;
@@ -1425,7 +1434,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1425 perf_enable(); 1434 perf_enable();
1426 } 1435 }
1427 } 1436 }
1428 spin_unlock(&ctx->lock); 1437 raw_spin_unlock(&ctx->lock);
1429} 1438}
1430 1439
1431/* 1440/*
@@ -1438,7 +1447,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
1438 if (!ctx->nr_events) 1447 if (!ctx->nr_events)
1439 return; 1448 return;
1440 1449
1441 spin_lock(&ctx->lock); 1450 raw_spin_lock(&ctx->lock);
1442 /* 1451 /*
1443 * Rotate the first entry last (works just fine for group events too): 1452 * Rotate the first entry last (works just fine for group events too):
1444 */ 1453 */
@@ -1449,7 +1458,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
1449 } 1458 }
1450 perf_enable(); 1459 perf_enable();
1451 1460
1452 spin_unlock(&ctx->lock); 1461 raw_spin_unlock(&ctx->lock);
1453} 1462}
1454 1463
1455void perf_event_task_tick(struct task_struct *curr, int cpu) 1464void perf_event_task_tick(struct task_struct *curr, int cpu)
@@ -1498,7 +1507,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1498 1507
1499 __perf_event_task_sched_out(ctx); 1508 __perf_event_task_sched_out(ctx);
1500 1509
1501 spin_lock(&ctx->lock); 1510 raw_spin_lock(&ctx->lock);
1502 1511
1503 list_for_each_entry(event, &ctx->group_list, group_entry) { 1512 list_for_each_entry(event, &ctx->group_list, group_entry) {
1504 if (!event->attr.enable_on_exec) 1513 if (!event->attr.enable_on_exec)
@@ -1516,7 +1525,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1516 if (enabled) 1525 if (enabled)
1517 unclone_ctx(ctx); 1526 unclone_ctx(ctx);
1518 1527
1519 spin_unlock(&ctx->lock); 1528 raw_spin_unlock(&ctx->lock);
1520 1529
1521 perf_event_task_sched_in(task, smp_processor_id()); 1530 perf_event_task_sched_in(task, smp_processor_id());
1522 out: 1531 out:
@@ -1542,10 +1551,10 @@ static void __perf_event_read(void *info)
1542 if (ctx->task && cpuctx->task_ctx != ctx) 1551 if (ctx->task && cpuctx->task_ctx != ctx)
1543 return; 1552 return;
1544 1553
1545 spin_lock(&ctx->lock); 1554 raw_spin_lock(&ctx->lock);
1546 update_context_time(ctx); 1555 update_context_time(ctx);
1547 update_event_times(event); 1556 update_event_times(event);
1548 spin_unlock(&ctx->lock); 1557 raw_spin_unlock(&ctx->lock);
1549 1558
1550 event->pmu->read(event); 1559 event->pmu->read(event);
1551} 1560}
@@ -1563,10 +1572,10 @@ static u64 perf_event_read(struct perf_event *event)
1563 struct perf_event_context *ctx = event->ctx; 1572 struct perf_event_context *ctx = event->ctx;
1564 unsigned long flags; 1573 unsigned long flags;
1565 1574
1566 spin_lock_irqsave(&ctx->lock, flags); 1575 raw_spin_lock_irqsave(&ctx->lock, flags);
1567 update_context_time(ctx); 1576 update_context_time(ctx);
1568 update_event_times(event); 1577 update_event_times(event);
1569 spin_unlock_irqrestore(&ctx->lock, flags); 1578 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1570 } 1579 }
1571 1580
1572 return atomic64_read(&event->count); 1581 return atomic64_read(&event->count);
@@ -1579,8 +1588,7 @@ static void
1579__perf_event_init_context(struct perf_event_context *ctx, 1588__perf_event_init_context(struct perf_event_context *ctx,
1580 struct task_struct *task) 1589 struct task_struct *task)
1581{ 1590{
1582 memset(ctx, 0, sizeof(*ctx)); 1591 raw_spin_lock_init(&ctx->lock);
1583 spin_lock_init(&ctx->lock);
1584 mutex_init(&ctx->mutex); 1592 mutex_init(&ctx->mutex);
1585 INIT_LIST_HEAD(&ctx->group_list); 1593 INIT_LIST_HEAD(&ctx->group_list);
1586 INIT_LIST_HEAD(&ctx->event_list); 1594 INIT_LIST_HEAD(&ctx->event_list);
@@ -1596,15 +1604,12 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1596 unsigned long flags; 1604 unsigned long flags;
1597 int err; 1605 int err;
1598 1606
1599 /* 1607 if (pid == -1 && cpu != -1) {
1600 * If cpu is not a wildcard then this is a percpu event:
1601 */
1602 if (cpu != -1) {
1603 /* Must be root to operate on a CPU event: */ 1608 /* Must be root to operate on a CPU event: */
1604 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 1609 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1605 return ERR_PTR(-EACCES); 1610 return ERR_PTR(-EACCES);
1606 1611
1607 if (cpu < 0 || cpu > num_possible_cpus()) 1612 if (cpu < 0 || cpu >= nr_cpumask_bits)
1608 return ERR_PTR(-EINVAL); 1613 return ERR_PTR(-EINVAL);
1609 1614
1610 /* 1615 /*
@@ -1612,7 +1617,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1612 * offline CPU and activate it when the CPU comes up, but 1617 * offline CPU and activate it when the CPU comes up, but
1613 * that's for later. 1618 * that's for later.
1614 */ 1619 */
1615 if (!cpu_isset(cpu, cpu_online_map)) 1620 if (!cpu_online(cpu))
1616 return ERR_PTR(-ENODEV); 1621 return ERR_PTR(-ENODEV);
1617 1622
1618 cpuctx = &per_cpu(perf_cpu_context, cpu); 1623 cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -1650,11 +1655,11 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1650 ctx = perf_lock_task_context(task, &flags); 1655 ctx = perf_lock_task_context(task, &flags);
1651 if (ctx) { 1656 if (ctx) {
1652 unclone_ctx(ctx); 1657 unclone_ctx(ctx);
1653 spin_unlock_irqrestore(&ctx->lock, flags); 1658 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1654 } 1659 }
1655 1660
1656 if (!ctx) { 1661 if (!ctx) {
1657 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); 1662 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1658 err = -ENOMEM; 1663 err = -ENOMEM;
1659 if (!ctx) 1664 if (!ctx)
1660 goto errout; 1665 goto errout;
@@ -1988,7 +1993,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1988 if (!value) 1993 if (!value)
1989 return -EINVAL; 1994 return -EINVAL;
1990 1995
1991 spin_lock_irq(&ctx->lock); 1996 raw_spin_lock_irq(&ctx->lock);
1992 if (event->attr.freq) { 1997 if (event->attr.freq) {
1993 if (value > sysctl_perf_event_sample_rate) { 1998 if (value > sysctl_perf_event_sample_rate) {
1994 ret = -EINVAL; 1999 ret = -EINVAL;
@@ -2001,7 +2006,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2001 event->hw.sample_period = value; 2006 event->hw.sample_period = value;
2002 } 2007 }
2003unlock: 2008unlock:
2004 spin_unlock_irq(&ctx->lock); 2009 raw_spin_unlock_irq(&ctx->lock);
2005 2010
2006 return ret; 2011 return ret;
2007} 2012}
@@ -3263,6 +3268,12 @@ static void perf_event_task_output(struct perf_event *event,
3263 3268
3264static int perf_event_task_match(struct perf_event *event) 3269static int perf_event_task_match(struct perf_event *event)
3265{ 3270{
3271 if (event->state != PERF_EVENT_STATE_ACTIVE)
3272 return 0;
3273
3274 if (event->cpu != -1 && event->cpu != smp_processor_id())
3275 return 0;
3276
3266 if (event->attr.comm || event->attr.mmap || event->attr.task) 3277 if (event->attr.comm || event->attr.mmap || event->attr.task)
3267 return 1; 3278 return 1;
3268 3279
@@ -3288,12 +3299,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3288 rcu_read_lock(); 3299 rcu_read_lock();
3289 cpuctx = &get_cpu_var(perf_cpu_context); 3300 cpuctx = &get_cpu_var(perf_cpu_context);
3290 perf_event_task_ctx(&cpuctx->ctx, task_event); 3301 perf_event_task_ctx(&cpuctx->ctx, task_event);
3291 put_cpu_var(perf_cpu_context);
3292
3293 if (!ctx) 3302 if (!ctx)
3294 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3303 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3295 if (ctx) 3304 if (ctx)
3296 perf_event_task_ctx(ctx, task_event); 3305 perf_event_task_ctx(ctx, task_event);
3306 put_cpu_var(perf_cpu_context);
3297 rcu_read_unlock(); 3307 rcu_read_unlock();
3298} 3308}
3299 3309
@@ -3370,6 +3380,12 @@ static void perf_event_comm_output(struct perf_event *event,
3370 3380
3371static int perf_event_comm_match(struct perf_event *event) 3381static int perf_event_comm_match(struct perf_event *event)
3372{ 3382{
3383 if (event->state != PERF_EVENT_STATE_ACTIVE)
3384 return 0;
3385
3386 if (event->cpu != -1 && event->cpu != smp_processor_id())
3387 return 0;
3388
3373 if (event->attr.comm) 3389 if (event->attr.comm)
3374 return 1; 3390 return 1;
3375 3391
@@ -3406,15 +3422,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3406 rcu_read_lock(); 3422 rcu_read_lock();
3407 cpuctx = &get_cpu_var(perf_cpu_context); 3423 cpuctx = &get_cpu_var(perf_cpu_context);
3408 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3424 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3409 put_cpu_var(perf_cpu_context);
3410
3411 /*
3412 * doesn't really matter which of the child contexts the
3413 * events ends up in.
3414 */
3415 ctx = rcu_dereference(current->perf_event_ctxp); 3425 ctx = rcu_dereference(current->perf_event_ctxp);
3416 if (ctx) 3426 if (ctx)
3417 perf_event_comm_ctx(ctx, comm_event); 3427 perf_event_comm_ctx(ctx, comm_event);
3428 put_cpu_var(perf_cpu_context);
3418 rcu_read_unlock(); 3429 rcu_read_unlock();
3419} 3430}
3420 3431
@@ -3489,6 +3500,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3489static int perf_event_mmap_match(struct perf_event *event, 3500static int perf_event_mmap_match(struct perf_event *event,
3490 struct perf_mmap_event *mmap_event) 3501 struct perf_mmap_event *mmap_event)
3491{ 3502{
3503 if (event->state != PERF_EVENT_STATE_ACTIVE)
3504 return 0;
3505
3506 if (event->cpu != -1 && event->cpu != smp_processor_id())
3507 return 0;
3508
3492 if (event->attr.mmap) 3509 if (event->attr.mmap)
3493 return 1; 3510 return 1;
3494 3511
@@ -3562,15 +3579,10 @@ got_name:
3562 rcu_read_lock(); 3579 rcu_read_lock();
3563 cpuctx = &get_cpu_var(perf_cpu_context); 3580 cpuctx = &get_cpu_var(perf_cpu_context);
3564 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3581 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3565 put_cpu_var(perf_cpu_context);
3566
3567 /*
3568 * doesn't really matter which of the child contexts the
3569 * events ends up in.
3570 */
3571 ctx = rcu_dereference(current->perf_event_ctxp); 3582 ctx = rcu_dereference(current->perf_event_ctxp);
3572 if (ctx) 3583 if (ctx)
3573 perf_event_mmap_ctx(ctx, mmap_event); 3584 perf_event_mmap_ctx(ctx, mmap_event);
3585 put_cpu_var(perf_cpu_context);
3574 rcu_read_unlock(); 3586 rcu_read_unlock();
3575 3587
3576 kfree(buf); 3588 kfree(buf);
@@ -3861,6 +3873,9 @@ static int perf_swevent_match(struct perf_event *event,
3861 struct perf_sample_data *data, 3873 struct perf_sample_data *data,
3862 struct pt_regs *regs) 3874 struct pt_regs *regs)
3863{ 3875{
3876 if (event->cpu != -1 && event->cpu != smp_processor_id())
3877 return 0;
3878
3864 if (!perf_swevent_is_counting(event)) 3879 if (!perf_swevent_is_counting(event))
3865 return 0; 3880 return 0;
3866 3881
@@ -4011,6 +4026,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4011 event->pmu->read(event); 4026 event->pmu->read(event);
4012 4027
4013 data.addr = 0; 4028 data.addr = 0;
4029 data.raw = NULL;
4014 data.period = event->hw.last_period; 4030 data.period = event->hw.last_period;
4015 regs = get_irq_regs(); 4031 regs = get_irq_regs();
4016 /* 4032 /*
@@ -4080,8 +4096,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4080 u64 now; 4096 u64 now;
4081 4097
4082 now = cpu_clock(cpu); 4098 now = cpu_clock(cpu);
4083 prev = atomic64_read(&event->hw.prev_count); 4099 prev = atomic64_xchg(&event->hw.prev_count, now);
4084 atomic64_set(&event->hw.prev_count, now);
4085 atomic64_add(now - prev, &event->count); 4100 atomic64_add(now - prev, &event->count);
4086} 4101}
4087 4102
@@ -4286,15 +4301,8 @@ static void bp_perf_event_destroy(struct perf_event *event)
4286static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4301static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4287{ 4302{
4288 int err; 4303 int err;
4289 /* 4304
4290 * The breakpoint is already filled if we haven't created the counter 4305 err = register_perf_hw_breakpoint(bp);
4291 * through perf syscall
4292 * FIXME: manage to get trigerred to NULL if it comes from syscalls
4293 */
4294 if (!bp->callback)
4295 err = register_perf_hw_breakpoint(bp);
4296 else
4297 err = __register_perf_hw_breakpoint(bp);
4298 if (err) 4306 if (err)
4299 return ERR_PTR(err); 4307 return ERR_PTR(err);
4300 4308
@@ -4308,6 +4316,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4308 struct perf_sample_data sample; 4316 struct perf_sample_data sample;
4309 struct pt_regs *regs = data; 4317 struct pt_regs *regs = data;
4310 4318
4319 sample.raw = NULL;
4311 sample.addr = bp->attr.bp_addr; 4320 sample.addr = bp->attr.bp_addr;
4312 4321
4313 if (!perf_exclude_event(bp, regs)) 4322 if (!perf_exclude_event(bp, regs))
@@ -4390,7 +4399,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4390 struct perf_event_context *ctx, 4399 struct perf_event_context *ctx,
4391 struct perf_event *group_leader, 4400 struct perf_event *group_leader,
4392 struct perf_event *parent_event, 4401 struct perf_event *parent_event,
4393 perf_callback_t callback, 4402 perf_overflow_handler_t overflow_handler,
4394 gfp_t gfpflags) 4403 gfp_t gfpflags)
4395{ 4404{
4396 const struct pmu *pmu; 4405 const struct pmu *pmu;
@@ -4433,10 +4442,10 @@ perf_event_alloc(struct perf_event_attr *attr,
4433 4442
4434 event->state = PERF_EVENT_STATE_INACTIVE; 4443 event->state = PERF_EVENT_STATE_INACTIVE;
4435 4444
4436 if (!callback && parent_event) 4445 if (!overflow_handler && parent_event)
4437 callback = parent_event->callback; 4446 overflow_handler = parent_event->overflow_handler;
4438 4447
4439 event->callback = callback; 4448 event->overflow_handler = overflow_handler;
4440 4449
4441 if (attr->disabled) 4450 if (attr->disabled)
4442 event->state = PERF_EVENT_STATE_OFF; 4451 event->state = PERF_EVENT_STATE_OFF;
@@ -4571,7 +4580,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4571 if (attr->type >= PERF_TYPE_MAX) 4580 if (attr->type >= PERF_TYPE_MAX)
4572 return -EINVAL; 4581 return -EINVAL;
4573 4582
4574 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) 4583 if (attr->__reserved_1)
4575 return -EINVAL; 4584 return -EINVAL;
4576 4585
4577 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4586 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4724,7 +4733,7 @@ SYSCALL_DEFINE5(perf_event_open,
4724 if (IS_ERR(event)) 4733 if (IS_ERR(event))
4725 goto err_put_context; 4734 goto err_put_context;
4726 4735
4727 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); 4736 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4728 if (err < 0) 4737 if (err < 0)
4729 goto err_free_put_context; 4738 goto err_free_put_context;
4730 4739
@@ -4776,7 +4785,8 @@ err_put_context:
4776 */ 4785 */
4777struct perf_event * 4786struct perf_event *
4778perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 4787perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4779 pid_t pid, perf_callback_t callback) 4788 pid_t pid,
4789 perf_overflow_handler_t overflow_handler)
4780{ 4790{
4781 struct perf_event *event; 4791 struct perf_event *event;
4782 struct perf_event_context *ctx; 4792 struct perf_event_context *ctx;
@@ -4793,7 +4803,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4793 } 4803 }
4794 4804
4795 event = perf_event_alloc(attr, cpu, ctx, NULL, 4805 event = perf_event_alloc(attr, cpu, ctx, NULL,
4796 NULL, callback, GFP_KERNEL); 4806 NULL, overflow_handler, GFP_KERNEL);
4797 if (IS_ERR(event)) { 4807 if (IS_ERR(event)) {
4798 err = PTR_ERR(event); 4808 err = PTR_ERR(event);
4799 goto err_put_context; 4809 goto err_put_context;
@@ -4998,7 +5008,7 @@ void perf_event_exit_task(struct task_struct *child)
4998 * reading child->perf_event_ctxp, we wait until it has 5008 * reading child->perf_event_ctxp, we wait until it has
4999 * incremented the context's refcount before we do put_ctx below. 5009 * incremented the context's refcount before we do put_ctx below.
5000 */ 5010 */
5001 spin_lock(&child_ctx->lock); 5011 raw_spin_lock(&child_ctx->lock);
5002 child->perf_event_ctxp = NULL; 5012 child->perf_event_ctxp = NULL;
5003 /* 5013 /*
5004 * If this context is a clone; unclone it so it can't get 5014 * If this context is a clone; unclone it so it can't get
@@ -5007,7 +5017,7 @@ void perf_event_exit_task(struct task_struct *child)
5007 */ 5017 */
5008 unclone_ctx(child_ctx); 5018 unclone_ctx(child_ctx);
5009 update_context_time(child_ctx); 5019 update_context_time(child_ctx);
5010 spin_unlock_irqrestore(&child_ctx->lock, flags); 5020 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
5011 5021
5012 /* 5022 /*
5013 * Report the task dead after unscheduling the events so that we 5023 * Report the task dead after unscheduling the events so that we
@@ -5090,7 +5100,7 @@ again:
5090 */ 5100 */
5091int perf_event_init_task(struct task_struct *child) 5101int perf_event_init_task(struct task_struct *child)
5092{ 5102{
5093 struct perf_event_context *child_ctx, *parent_ctx; 5103 struct perf_event_context *child_ctx = NULL, *parent_ctx;
5094 struct perf_event_context *cloned_ctx; 5104 struct perf_event_context *cloned_ctx;
5095 struct perf_event *event; 5105 struct perf_event *event;
5096 struct task_struct *parent = current; 5106 struct task_struct *parent = current;
@@ -5106,20 +5116,6 @@ int perf_event_init_task(struct task_struct *child)
5106 return 0; 5116 return 0;
5107 5117
5108 /* 5118 /*
5109 * This is executed from the parent task context, so inherit
5110 * events that have been marked for cloning.
5111 * First allocate and initialize a context for the child.
5112 */
5113
5114 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
5115 if (!child_ctx)
5116 return -ENOMEM;
5117
5118 __perf_event_init_context(child_ctx, child);
5119 child->perf_event_ctxp = child_ctx;
5120 get_task_struct(child);
5121
5122 /*
5123 * If the parent's context is a clone, pin it so it won't get 5119 * If the parent's context is a clone, pin it so it won't get
5124 * swapped under us. 5120 * swapped under us.
5125 */ 5121 */
@@ -5149,6 +5145,26 @@ int perf_event_init_task(struct task_struct *child)
5149 continue; 5145 continue;
5150 } 5146 }
5151 5147
5148 if (!child->perf_event_ctxp) {
5149 /*
5150 * This is executed from the parent task context, so
5151 * inherit events that have been marked for cloning.
5152 * First allocate and initialize a context for the
5153 * child.
5154 */
5155
5156 child_ctx = kzalloc(sizeof(struct perf_event_context),
5157 GFP_KERNEL);
5158 if (!child_ctx) {
5159 ret = -ENOMEM;
5160 break;
5161 }
5162
5163 __perf_event_init_context(child_ctx, child);
5164 child->perf_event_ctxp = child_ctx;
5165 get_task_struct(child);
5166 }
5167
5152 ret = inherit_group(event, parent, parent_ctx, 5168 ret = inherit_group(event, parent, parent_ctx,
5153 child, child_ctx); 5169 child, child_ctx);
5154 if (ret) { 5170 if (ret) {
@@ -5157,7 +5173,7 @@ int perf_event_init_task(struct task_struct *child)
5157 } 5173 }
5158 } 5174 }
5159 5175
5160 if (inherited_all) { 5176 if (child_ctx && inherited_all) {
5161 /* 5177 /*
5162 * Mark the child context as a clone of the parent 5178 * Mark the child context as a clone of the parent
5163 * context, or of whatever the parent is a clone of. 5179 * context, or of whatever the parent is a clone of.
@@ -5291,11 +5307,11 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5291 perf_reserved_percpu = val; 5307 perf_reserved_percpu = val;
5292 for_each_online_cpu(cpu) { 5308 for_each_online_cpu(cpu) {
5293 cpuctx = &per_cpu(perf_cpu_context, cpu); 5309 cpuctx = &per_cpu(perf_cpu_context, cpu);
5294 spin_lock_irq(&cpuctx->ctx.lock); 5310 raw_spin_lock_irq(&cpuctx->ctx.lock);
5295 mpt = min(perf_max_events - cpuctx->ctx.nr_events, 5311 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5296 perf_max_events - perf_reserved_percpu); 5312 perf_max_events - perf_reserved_percpu);
5297 cpuctx->max_pertask = mpt; 5313 cpuctx->max_pertask = mpt;
5298 spin_unlock_irq(&cpuctx->ctx.lock); 5314 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5299 } 5315 }
5300 spin_unlock(&perf_resource_lock); 5316 spin_unlock(&perf_resource_lock);
5301 5317
diff --git a/kernel/pid.c b/kernel/pid.c
index d3f722d20f9..2e17c9c92cb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
141 * installing it: 141 * installing it:
142 */ 142 */
143 spin_lock_irq(&pidmap_lock); 143 spin_lock_irq(&pidmap_lock);
144 if (map->page) 144 if (!map->page) {
145 kfree(page);
146 else
147 map->page = page; 145 map->page = page;
146 page = NULL;
147 }
148 spin_unlock_irq(&pidmap_lock); 148 spin_unlock_irq(&pidmap_lock);
149 kfree(page);
149 if (unlikely(!map->page)) 150 if (unlikely(!map->page))
150 break; 151 break;
151 } 152 }
@@ -268,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
268 for (type = 0; type < PIDTYPE_MAX; ++type) 269 for (type = 0; type < PIDTYPE_MAX; ++type)
269 INIT_HLIST_HEAD(&pid->tasks[type]); 270 INIT_HLIST_HEAD(&pid->tasks[type]);
270 271
272 upid = pid->numbers + ns->level;
271 spin_lock_irq(&pidmap_lock); 273 spin_lock_irq(&pidmap_lock);
272 for (i = ns->level; i >= 0; i--) { 274 for ( ; upid >= pid->numbers; --upid)
273 upid = &pid->numbers[i];
274 hlist_add_head_rcu(&upid->pid_chain, 275 hlist_add_head_rcu(&upid->pid_chain,
275 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 276 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
276 }
277 spin_unlock_irq(&pidmap_lock); 277 spin_unlock_irq(&pidmap_lock);
278 278
279out: 279out:
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b..3db49b9ca37 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/time.h> 34#include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344} 343}
345EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
346 345
347#define PID_NAME_LEN sizeof("process_1234567890") 346#define PID_NAME_LEN 32
348static char name[PID_NAME_LEN];
349 347
350static int pm_qos_power_open(struct inode *inode, struct file *filp) 348static int pm_qos_power_open(struct inode *inode, struct file *filp)
351{ 349{
352 int ret; 350 int ret;
353 long pm_qos_class; 351 long pm_qos_class;
352 char name[PID_NAME_LEN];
354 353
355 lock_kernel();
356 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
357 if (pm_qos_class >= 0) { 355 if (pm_qos_class >= 0) {
358 filp->private_data = (void *)pm_qos_class; 356 filp->private_data = (void *)pm_qos_class;
359 sprintf(name, "process_%d", current->pid); 357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
360 ret = pm_qos_add_requirement(pm_qos_class, name, 358 ret = pm_qos_add_requirement(pm_qos_class, name,
361 PM_QOS_DEFAULT_VALUE); 359 PM_QOS_DEFAULT_VALUE);
362 if (ret >= 0) { 360 if (ret >= 0)
363 unlock_kernel();
364 return 0; 361 return 0;
365 }
366 } 362 }
367 unlock_kernel();
368
369 return -EPERM; 363 return -EPERM;
370} 364}
371 365
372static int pm_qos_power_release(struct inode *inode, struct file *filp) 366static int pm_qos_power_release(struct inode *inode, struct file *filp)
373{ 367{
374 int pm_qos_class; 368 int pm_qos_class;
369 char name[PID_NAME_LEN];
375 370
376 pm_qos_class = (long)filp->private_data; 371 pm_qos_class = (long)filp->private_data;
377 sprintf(name, "process_%d", current->pid); 372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
378 pm_qos_remove_requirement(pm_qos_class, name); 373 pm_qos_remove_requirement(pm_qos_class, name);
379 374
380 return 0; 375 return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
385{ 380{
386 s32 value; 381 s32 value;
387 int pm_qos_class; 382 int pm_qos_class;
383 char name[PID_NAME_LEN];
388 384
389 pm_qos_class = (long)filp->private_data; 385 pm_qos_class = (long)filp->private_data;
390 if (count != sizeof(s32)) 386 if (count != sizeof(s32))
391 return -EINVAL; 387 return -EINVAL;
392 if (copy_from_user(&value, buf, sizeof(s32))) 388 if (copy_from_user(&value, buf, sizeof(s32)))
393 return -EFAULT; 389 return -EFAULT;
394 sprintf(name, "process_%d", current->pid); 390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
395 pm_qos_update_requirement(pm_qos_class, name, value); 391 pm_qos_update_requirement(pm_qos_class, name, value);
396 392
397 return sizeof(s32); 393 return sizeof(s32);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747..438ff452351 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
384 384
385/* 385/*
386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. 386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
387 * This is called from sys_timer_create with the new timer already locked. 387 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
388 * new timer already all-zeros initialized.
388 */ 389 */
389int posix_cpu_timer_create(struct k_itimer *new_timer) 390int posix_cpu_timer_create(struct k_itimer *new_timer)
390{ 391{
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
396 return -EINVAL; 397 return -EINVAL;
397 398
398 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 399 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
399 new_timer->it.cpu.incr.sched = 0;
400 new_timer->it.cpu.expires.sched = 0;
401 400
402 read_lock(&tasklist_lock); 401 read_lock(&tasklist_lock);
403 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 402 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 5187136fe1d..218e5af9015 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/vt_kern.h> 7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 8#include <linux/kbd_kern.h>
9#include <linux/console.h> 9#include <linux/vt.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
@@ -21,8 +21,7 @@ int pm_prepare_console(void)
21 if (orig_fgconsole < 0) 21 if (orig_fgconsole < 0)
22 return 1; 22 return 1;
23 23
24 orig_kmsg = kmsg_redirect; 24 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
25 kmsg_redirect = SUSPEND_CONSOLE;
26 return 0; 25 return 0;
27} 26}
28 27
@@ -30,7 +29,7 @@ void pm_restore_console(void)
30{ 29{
31 if (orig_fgconsole >= 0) { 30 if (orig_fgconsole >= 0) {
32 vt_move_to_console(orig_fgconsole, 0); 31 vt_move_to_console(orig_fgconsole, 0);
33 kmsg_redirect = orig_kmsg; 32 vt_kmsg_redirect(orig_kmsg);
34 } 33 }
35} 34}
36#endif 35#endif
diff --git a/kernel/printk.c b/kernel/printk.c
index b5ac4d99c66..1751c456b71 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h>
37 38
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39 40
@@ -1405,4 +1406,123 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1405 return false; 1406 return false;
1406} 1407}
1407EXPORT_SYMBOL(printk_timed_ratelimit); 1408EXPORT_SYMBOL(printk_timed_ratelimit);
1409
1410static DEFINE_SPINLOCK(dump_list_lock);
1411static LIST_HEAD(dump_list);
1412
1413/**
1414 * kmsg_dump_register - register a kernel log dumper.
1415 * @dumper: pointer to the kmsg_dumper structure
1416 *
1417 * Adds a kernel log dumper to the system. The dump callback in the
1418 * structure will be called when the kernel oopses or panics and must be
1419 * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
1420 */
1421int kmsg_dump_register(struct kmsg_dumper *dumper)
1422{
1423 unsigned long flags;
1424 int err = -EBUSY;
1425
1426 /* The dump callback needs to be set */
1427 if (!dumper->dump)
1428 return -EINVAL;
1429
1430 spin_lock_irqsave(&dump_list_lock, flags);
1431 /* Don't allow registering multiple times */
1432 if (!dumper->registered) {
1433 dumper->registered = 1;
1434 list_add_tail(&dumper->list, &dump_list);
1435 err = 0;
1436 }
1437 spin_unlock_irqrestore(&dump_list_lock, flags);
1438
1439 return err;
1440}
1441EXPORT_SYMBOL_GPL(kmsg_dump_register);
1442
1443/**
1444 * kmsg_dump_unregister - unregister a kmsg dumper.
1445 * @dumper: pointer to the kmsg_dumper structure
1446 *
1447 * Removes a dump device from the system. Returns zero on success and
1448 * %-EINVAL otherwise.
1449 */
1450int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1451{
1452 unsigned long flags;
1453 int err = -EINVAL;
1454
1455 spin_lock_irqsave(&dump_list_lock, flags);
1456 if (dumper->registered) {
1457 dumper->registered = 0;
1458 list_del(&dumper->list);
1459 err = 0;
1460 }
1461 spin_unlock_irqrestore(&dump_list_lock, flags);
1462
1463 return err;
1464}
1465EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1466
1467static const char const *kmsg_reasons[] = {
1468 [KMSG_DUMP_OOPS] = "oops",
1469 [KMSG_DUMP_PANIC] = "panic",
1470 [KMSG_DUMP_KEXEC] = "kexec",
1471};
1472
1473static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1474{
1475 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1476 return "unknown";
1477
1478 return kmsg_reasons[reason];
1479}
1480
1481/**
1482 * kmsg_dump - dump kernel log to kernel message dumpers.
1483 * @reason: the reason (oops, panic etc) for dumping
1484 *
1485 * Iterate through each of the dump devices and call the oops/panic
1486 * callbacks with the log buffer.
1487 */
1488void kmsg_dump(enum kmsg_dump_reason reason)
1489{
1490 unsigned long end;
1491 unsigned chars;
1492 struct kmsg_dumper *dumper;
1493 const char *s1, *s2;
1494 unsigned long l1, l2;
1495 unsigned long flags;
1496
1497 /* Theoretically, the log could move on after we do this, but
1498 there's not a lot we can do about that. The new messages
1499 will overwrite the start of what we dump. */
1500 spin_lock_irqsave(&logbuf_lock, flags);
1501 end = log_end & LOG_BUF_MASK;
1502 chars = logged_chars;
1503 spin_unlock_irqrestore(&logbuf_lock, flags);
1504
1505 if (logged_chars > end) {
1506 s1 = log_buf + log_buf_len - logged_chars + end;
1507 l1 = logged_chars - end;
1508
1509 s2 = log_buf;
1510 l2 = end;
1511 } else {
1512 s1 = "";
1513 l1 = 0;
1514
1515 s2 = log_buf + end - logged_chars;
1516 l2 = logged_chars;
1517 }
1518
1519 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
1520 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n",
1521 kmsg_to_str(reason));
1522 return;
1523 }
1524 list_for_each_entry(dumper, &dump_list, list)
1525 dumper->dump(dumper, reason, s1, l1, s2, l2);
1526 spin_unlock_irqrestore(&dump_list_lock, flags);
1527}
1408#endif 1528#endif
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a621a67ef4e..9bb52177af0 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -763,13 +763,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 763 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 764 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 765 }
766 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
767 completed = cur_ops->completed() - completed; 767 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 768 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 769 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 770 completed = RCU_TORTURE_PIPE_LEN;
771 } 771 }
772 ++__get_cpu_var(rcu_torture_batch)[completed]; 772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
773 preempt_enable(); 773 preempt_enable();
774 cur_ops->readunlock(idx); 774 cur_ops->readunlock(idx);
775} 775}
@@ -818,13 +818,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 818 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 820 }
821 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
822 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 824 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
826 } 826 }
827 ++__get_cpu_var(rcu_torture_batch)[completed]; 827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
828 preempt_enable(); 828 preempt_enable();
829 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
830 schedule(); 830 schedule();
diff --git a/kernel/relay.c b/kernel/relay.c
index 760c26209a3..c705a41b4ba 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1198 relay_consume_bytes(rbuf, buf->private); 1198 relay_consume_bytes(rbuf, buf->private);
1199} 1199}
1200 1200
1201static struct pipe_buf_operations relay_pipe_buf_ops = { 1201static const struct pipe_buf_operations relay_pipe_buf_ops = {
1202 .can_merge = 0, 1202 .can_merge = 0,
1203 .map = generic_pipe_buf_map, 1203 .map = generic_pipe_buf_map,
1204 .unmap = generic_pipe_buf_unmap, 1204 .unmap = generic_pipe_buf_unmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b959..af96c1e4b54 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new,
308 void *alignf_data) 308 void *alignf_data)
309{ 309{
310 struct resource *this = root->child; 310 struct resource *this = root->child;
311 struct resource tmp = *new;
311 312
312 new->start = root->start; 313 tmp.start = root->start;
313 /* 314 /*
314 * Skip past an allocated resource that starts at 0, since the assignment 315 * Skip past an allocated resource that starts at 0, since the assignment
315 * of this->start - 1 to new->end below would cause an underflow. 316 * of this->start - 1 to tmp->end below would cause an underflow.
316 */ 317 */
317 if (this && this->start == 0) { 318 if (this && this->start == 0) {
318 new->start = this->end + 1; 319 tmp.start = this->end + 1;
319 this = this->sibling; 320 this = this->sibling;
320 } 321 }
321 for(;;) { 322 for(;;) {
322 if (this) 323 if (this)
323 new->end = this->start - 1; 324 tmp.end = this->start - 1;
324 else 325 else
325 new->end = root->end; 326 tmp.end = root->end;
326 if (new->start < min) 327 if (tmp.start < min)
327 new->start = min; 328 tmp.start = min;
328 if (new->end > max) 329 if (tmp.end > max)
329 new->end = max; 330 tmp.end = max;
330 new->start = ALIGN(new->start, align); 331 tmp.start = ALIGN(tmp.start, align);
331 if (alignf) 332 if (alignf)
332 alignf(alignf_data, new, size, align); 333 alignf(alignf_data, &tmp, size, align);
333 if (new->start < new->end && new->end - new->start >= size - 1) { 334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
334 new->end = new->start + size - 1; 335 new->start = tmp.start;
336 new->end = tmp.start + size - 1;
335 return 0; 337 return 0;
336 } 338 }
337 if (!this) 339 if (!this)
338 break; 340 break;
339 new->start = this->end + 1; 341 tmp.start = this->end + 1;
340 this = this->sibling; 342 this = this->sibling;
341 } 343 }
342 return -EBUSY; 344 return -EBUSY;
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 5fcb4fe645e..ddabb54bb5c 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -37,8 +37,8 @@ do { \
37 if (rt_trace_on) { \ 37 if (rt_trace_on) { \
38 rt_trace_on = 0; \ 38 rt_trace_on = 0; \
39 console_verbose(); \ 39 console_verbose(); \
40 if (spin_is_locked(&current->pi_lock)) \ 40 if (raw_spin_is_locked(&current->pi_lock)) \
41 spin_unlock(&current->pi_lock); \ 41 raw_spin_unlock(&current->pi_lock); \
42 } \ 42 } \
43} while (0) 43} while (0)
44 44
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 29bd4baf9e7..a9604815786 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
138{ 138{
139 unsigned long flags; 139 unsigned long flags;
140 140
141 spin_lock_irqsave(&task->pi_lock, flags); 141 raw_spin_lock_irqsave(&task->pi_lock, flags);
142 __rt_mutex_adjust_prio(task); 142 __rt_mutex_adjust_prio(task);
143 spin_unlock_irqrestore(&task->pi_lock, flags); 143 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
144} 144}
145 145
146/* 146/*
@@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
195 /* 195 /*
196 * Task can not go away as we did a get_task() before ! 196 * Task can not go away as we did a get_task() before !
197 */ 197 */
198 spin_lock_irqsave(&task->pi_lock, flags); 198 raw_spin_lock_irqsave(&task->pi_lock, flags);
199 199
200 waiter = task->pi_blocked_on; 200 waiter = task->pi_blocked_on;
201 /* 201 /*
@@ -231,8 +231,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
231 goto out_unlock_pi; 231 goto out_unlock_pi;
232 232
233 lock = waiter->lock; 233 lock = waiter->lock;
234 if (!spin_trylock(&lock->wait_lock)) { 234 if (!raw_spin_trylock(&lock->wait_lock)) {
235 spin_unlock_irqrestore(&task->pi_lock, flags); 235 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
236 cpu_relax(); 236 cpu_relax();
237 goto retry; 237 goto retry;
238 } 238 }
@@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
240 /* Deadlock detection */ 240 /* Deadlock detection */
241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
243 spin_unlock(&lock->wait_lock); 243 raw_spin_unlock(&lock->wait_lock);
244 ret = deadlock_detect ? -EDEADLK : 0; 244 ret = deadlock_detect ? -EDEADLK : 0;
245 goto out_unlock_pi; 245 goto out_unlock_pi;
246 } 246 }
@@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
253 plist_add(&waiter->list_entry, &lock->wait_list); 253 plist_add(&waiter->list_entry, &lock->wait_list);
254 254
255 /* Release the task */ 255 /* Release the task */
256 spin_unlock_irqrestore(&task->pi_lock, flags); 256 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
257 put_task_struct(task); 257 put_task_struct(task);
258 258
259 /* Grab the next task */ 259 /* Grab the next task */
260 task = rt_mutex_owner(lock); 260 task = rt_mutex_owner(lock);
261 get_task_struct(task); 261 get_task_struct(task);
262 spin_lock_irqsave(&task->pi_lock, flags); 262 raw_spin_lock_irqsave(&task->pi_lock, flags);
263 263
264 if (waiter == rt_mutex_top_waiter(lock)) { 264 if (waiter == rt_mutex_top_waiter(lock)) {
265 /* Boost the owner */ 265 /* Boost the owner */
@@ -277,10 +277,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
277 __rt_mutex_adjust_prio(task); 277 __rt_mutex_adjust_prio(task);
278 } 278 }
279 279
280 spin_unlock_irqrestore(&task->pi_lock, flags); 280 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
281 281
282 top_waiter = rt_mutex_top_waiter(lock); 282 top_waiter = rt_mutex_top_waiter(lock);
283 spin_unlock(&lock->wait_lock); 283 raw_spin_unlock(&lock->wait_lock);
284 284
285 if (!detect_deadlock && waiter != top_waiter) 285 if (!detect_deadlock && waiter != top_waiter)
286 goto out_put_task; 286 goto out_put_task;
@@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
288 goto again; 288 goto again;
289 289
290 out_unlock_pi: 290 out_unlock_pi:
291 spin_unlock_irqrestore(&task->pi_lock, flags); 291 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
292 out_put_task: 292 out_put_task:
293 put_task_struct(task); 293 put_task_struct(task);
294 294
@@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
313 if (pendowner == task) 313 if (pendowner == task)
314 return 1; 314 return 1;
315 315
316 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0; 319 return 0;
320 } 320 }
321 321
@@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
325 * priority. 325 * priority.
326 */ 326 */
327 if (likely(!rt_mutex_has_waiters(lock))) { 327 if (likely(!rt_mutex_has_waiters(lock))) {
328 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1; 329 return 1;
330 } 330 }
331 331
@@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
333 next = rt_mutex_top_waiter(lock); 333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters); 334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner); 335 __rt_mutex_adjust_prio(pendowner);
336 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337 337
338 /* 338 /*
339 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
@@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
350 * might be task: 350 * might be task:
351 */ 351 */
352 if (likely(next->task != task)) { 352 if (likely(next->task != task)) {
353 spin_lock_irqsave(&task->pi_lock, flags); 353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task); 355 __rt_mutex_adjust_prio(task);
356 spin_unlock_irqrestore(&task->pi_lock, flags); 356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 } 357 }
358 return 1; 358 return 1;
359} 359}
@@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
420 unsigned long flags; 420 unsigned long flags;
421 int chain_walk = 0, res; 421 int chain_walk = 0, res;
422 422
423 spin_lock_irqsave(&task->pi_lock, flags); 423 raw_spin_lock_irqsave(&task->pi_lock, flags);
424 __rt_mutex_adjust_prio(task); 424 __rt_mutex_adjust_prio(task);
425 waiter->task = task; 425 waiter->task = task;
426 waiter->lock = lock; 426 waiter->lock = lock;
@@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
434 434
435 task->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
436 436
437 spin_unlock_irqrestore(&task->pi_lock, flags); 437 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 438
439 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
440 spin_lock_irqsave(&owner->pi_lock, flags); 440 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
443 443
444 __rt_mutex_adjust_prio(owner); 444 __rt_mutex_adjust_prio(owner);
445 if (owner->pi_blocked_on) 445 if (owner->pi_blocked_on)
446 chain_walk = 1; 446 chain_walk = 1;
447 spin_unlock_irqrestore(&owner->pi_lock, flags); 447 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
448 } 448 }
449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) 449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
450 chain_walk = 1; 450 chain_walk = 1;
@@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
459 */ 459 */
460 get_task_struct(owner); 460 get_task_struct(owner);
461 461
462 spin_unlock(&lock->wait_lock); 462 raw_spin_unlock(&lock->wait_lock);
463 463
464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
465 task); 465 task);
466 466
467 spin_lock(&lock->wait_lock); 467 raw_spin_lock(&lock->wait_lock);
468 468
469 return res; 469 return res;
470} 470}
@@ -483,7 +483,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
483 struct task_struct *pendowner; 483 struct task_struct *pendowner;
484 unsigned long flags; 484 unsigned long flags;
485 485
486 spin_lock_irqsave(&current->pi_lock, flags); 486 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 487
488 waiter = rt_mutex_top_waiter(lock); 488 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list); 489 plist_del(&waiter->list_entry, &lock->wait_list);
@@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
500 500
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
502 502
503 spin_unlock_irqrestore(&current->pi_lock, flags); 503 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 504
505 /* 505 /*
506 * Clear the pi_blocked_on variable and enqueue a possible 506 * Clear the pi_blocked_on variable and enqueue a possible
@@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
509 * waiter with higher priority than pending-owner->normal_prio 509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner. 510 * is blocked on the unboosted (pending) owner.
511 */ 511 */
512 spin_lock_irqsave(&pendowner->pi_lock, flags); 512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513 513
514 WARN_ON(!pendowner->pi_blocked_on); 514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter); 515 WARN_ON(pendowner->pi_blocked_on != waiter);
@@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
523 next = rt_mutex_top_waiter(lock); 523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters); 524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 } 525 }
526 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527 527
528 wake_up_process(pendowner); 528 wake_up_process(pendowner);
529} 529}
@@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock,
541 unsigned long flags; 541 unsigned long flags;
542 int chain_walk = 0; 542 int chain_walk = 0;
543 543
544 spin_lock_irqsave(&current->pi_lock, flags); 544 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 545 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL; 546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 547 current->pi_blocked_on = NULL;
548 spin_unlock_irqrestore(&current->pi_lock, flags); 548 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 549
550 if (first && owner != current) { 550 if (first && owner != current) {
551 551
552 spin_lock_irqsave(&owner->pi_lock, flags); 552 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 553
554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
555 555
@@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock,
564 if (owner->pi_blocked_on) 564 if (owner->pi_blocked_on)
565 chain_walk = 1; 565 chain_walk = 1;
566 566
567 spin_unlock_irqrestore(&owner->pi_lock, flags); 567 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
568 } 568 }
569 569
570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock,
575 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 575 /* gets dropped in rt_mutex_adjust_prio_chain()! */
576 get_task_struct(owner); 576 get_task_struct(owner);
577 577
578 spin_unlock(&lock->wait_lock); 578 raw_spin_unlock(&lock->wait_lock);
579 579
580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
581 581
582 spin_lock(&lock->wait_lock); 582 raw_spin_lock(&lock->wait_lock);
583} 583}
584 584
585/* 585/*
@@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task)
592 struct rt_mutex_waiter *waiter; 592 struct rt_mutex_waiter *waiter;
593 unsigned long flags; 593 unsigned long flags;
594 594
595 spin_lock_irqsave(&task->pi_lock, flags); 595 raw_spin_lock_irqsave(&task->pi_lock, flags);
596 596
597 waiter = task->pi_blocked_on; 597 waiter = task->pi_blocked_on;
598 if (!waiter || waiter->list_entry.prio == task->prio) { 598 if (!waiter || waiter->list_entry.prio == task->prio) {
599 spin_unlock_irqrestore(&task->pi_lock, flags); 599 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
600 return; 600 return;
601 } 601 }
602 602
603 spin_unlock_irqrestore(&task->pi_lock, flags); 603 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
604 604
605 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 605 /* gets dropped in rt_mutex_adjust_prio_chain()! */
606 get_task_struct(task); 606 get_task_struct(task);
@@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
672 break; 672 break;
673 } 673 }
674 674
675 spin_unlock(&lock->wait_lock); 675 raw_spin_unlock(&lock->wait_lock);
676 676
677 debug_rt_mutex_print_deadlock(waiter); 677 debug_rt_mutex_print_deadlock(waiter);
678 678
679 if (waiter->task) 679 if (waiter->task)
680 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
681 681
682 spin_lock(&lock->wait_lock); 682 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 683 set_current_state(state);
684 } 684 }
685 685
@@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
700 debug_rt_mutex_init_waiter(&waiter); 700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL; 701 waiter.task = NULL;
702 702
703 spin_lock(&lock->wait_lock); 703 raw_spin_lock(&lock->wait_lock);
704 704
705 /* Try to acquire the lock again: */ 705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock); 707 raw_spin_unlock(&lock->wait_lock);
708 return 0; 708 return 0;
709 } 709 }
710 710
@@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
731 */ 731 */
732 fixup_rt_mutex_waiters(lock); 732 fixup_rt_mutex_waiters(lock);
733 733
734 spin_unlock(&lock->wait_lock); 734 raw_spin_unlock(&lock->wait_lock);
735 735
736 /* Remove pending timer: */ 736 /* Remove pending timer: */
737 if (unlikely(timeout)) 737 if (unlikely(timeout))
@@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
758{ 758{
759 int ret = 0; 759 int ret = 0;
760 760
761 spin_lock(&lock->wait_lock); 761 raw_spin_lock(&lock->wait_lock);
762 762
763 if (likely(rt_mutex_owner(lock) != current)) { 763 if (likely(rt_mutex_owner(lock) != current)) {
764 764
@@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
770 fixup_rt_mutex_waiters(lock); 770 fixup_rt_mutex_waiters(lock);
771 } 771 }
772 772
773 spin_unlock(&lock->wait_lock); 773 raw_spin_unlock(&lock->wait_lock);
774 774
775 return ret; 775 return ret;
776} 776}
@@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
781static void __sched 781static void __sched
782rt_mutex_slowunlock(struct rt_mutex *lock) 782rt_mutex_slowunlock(struct rt_mutex *lock)
783{ 783{
784 spin_lock(&lock->wait_lock); 784 raw_spin_lock(&lock->wait_lock);
785 785
786 debug_rt_mutex_unlock(lock); 786 debug_rt_mutex_unlock(lock);
787 787
@@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
789 789
790 if (!rt_mutex_has_waiters(lock)) { 790 if (!rt_mutex_has_waiters(lock)) {
791 lock->owner = NULL; 791 lock->owner = NULL;
792 spin_unlock(&lock->wait_lock); 792 raw_spin_unlock(&lock->wait_lock);
793 return; 793 return;
794 } 794 }
795 795
796 wakeup_next_waiter(lock); 796 wakeup_next_waiter(lock);
797 797
798 spin_unlock(&lock->wait_lock); 798 raw_spin_unlock(&lock->wait_lock);
799 799
800 /* Undo pi boosting if necessary: */ 800 /* Undo pi boosting if necessary: */
801 rt_mutex_adjust_prio(current); 801 rt_mutex_adjust_prio(current);
@@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
970void __rt_mutex_init(struct rt_mutex *lock, const char *name) 970void __rt_mutex_init(struct rt_mutex *lock, const char *name)
971{ 971{
972 lock->owner = NULL; 972 lock->owner = NULL;
973 spin_lock_init(&lock->wait_lock); 973 raw_spin_lock_init(&lock->wait_lock);
974 plist_head_init(&lock->wait_list, &lock->wait_lock); 974 plist_head_init_raw(&lock->wait_list, &lock->wait_lock);
975 975
976 debug_rt_mutex_init(lock, name); 976 debug_rt_mutex_init(lock, name);
977} 977}
@@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1032{ 1032{
1033 int ret; 1033 int ret;
1034 1034
1035 spin_lock(&lock->wait_lock); 1035 raw_spin_lock(&lock->wait_lock);
1036 1036
1037 mark_rt_mutex_waiters(lock); 1037 mark_rt_mutex_waiters(lock);
1038 1038
@@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1043 spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 1045 return 1;
1046 } 1046 }
@@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 1056 */
1057 ret = 0; 1057 ret = 0;
1058 } 1058 }
1059 spin_unlock(&lock->wait_lock); 1059 raw_spin_unlock(&lock->wait_lock);
1060 1060
1061 debug_rt_mutex_print_deadlock(waiter); 1061 debug_rt_mutex_print_deadlock(waiter);
1062 1062
@@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1106{ 1106{
1107 int ret; 1107 int ret;
1108 1108
1109 spin_lock(&lock->wait_lock); 1109 raw_spin_lock(&lock->wait_lock);
1110 1110
1111 set_current_state(TASK_INTERRUPTIBLE); 1111 set_current_state(TASK_INTERRUPTIBLE);
1112 1112
@@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1124 */ 1124 */
1125 fixup_rt_mutex_waiters(lock); 1125 fixup_rt_mutex_waiters(lock);
1126 1126
1127 spin_unlock(&lock->wait_lock); 1127 raw_spin_unlock(&lock->wait_lock);
1128 1128
1129 /* 1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been 1130 * Readjust priority, when we did not get the lock. We might have been
diff --git a/kernel/sched.c b/kernel/sched.c
index e7f2cfa6a25..3a8fb30a91b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -141,7 +141,7 @@ struct rt_prio_array {
141 141
142struct rt_bandwidth { 142struct rt_bandwidth {
143 /* nests inside the rq lock: */ 143 /* nests inside the rq lock: */
144 spinlock_t rt_runtime_lock; 144 raw_spinlock_t rt_runtime_lock;
145 ktime_t rt_period; 145 ktime_t rt_period;
146 u64 rt_runtime; 146 u64 rt_runtime;
147 struct hrtimer rt_period_timer; 147 struct hrtimer rt_period_timer;
@@ -178,7 +178,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
178 rt_b->rt_period = ns_to_ktime(period); 178 rt_b->rt_period = ns_to_ktime(period);
179 rt_b->rt_runtime = runtime; 179 rt_b->rt_runtime = runtime;
180 180
181 spin_lock_init(&rt_b->rt_runtime_lock); 181 raw_spin_lock_init(&rt_b->rt_runtime_lock);
182 182
183 hrtimer_init(&rt_b->rt_period_timer, 183 hrtimer_init(&rt_b->rt_period_timer,
184 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 184 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -200,7 +200,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
200 if (hrtimer_active(&rt_b->rt_period_timer)) 200 if (hrtimer_active(&rt_b->rt_period_timer))
201 return; 201 return;
202 202
203 spin_lock(&rt_b->rt_runtime_lock); 203 raw_spin_lock(&rt_b->rt_runtime_lock);
204 for (;;) { 204 for (;;) {
205 unsigned long delta; 205 unsigned long delta;
206 ktime_t soft, hard; 206 ktime_t soft, hard;
@@ -217,7 +217,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
218 HRTIMER_MODE_ABS_PINNED, 0); 218 HRTIMER_MODE_ABS_PINNED, 0);
219 } 219 }
220 spin_unlock(&rt_b->rt_runtime_lock); 220 raw_spin_unlock(&rt_b->rt_runtime_lock);
221} 221}
222 222
223#ifdef CONFIG_RT_GROUP_SCHED 223#ifdef CONFIG_RT_GROUP_SCHED
@@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
298 298
299#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -470,7 +470,7 @@ struct rt_rq {
470 u64 rt_time; 470 u64 rt_time;
471 u64 rt_runtime; 471 u64 rt_runtime;
472 /* Nests inside the rq lock: */ 472 /* Nests inside the rq lock: */
473 spinlock_t rt_runtime_lock; 473 raw_spinlock_t rt_runtime_lock;
474 474
475#ifdef CONFIG_RT_GROUP_SCHED 475#ifdef CONFIG_RT_GROUP_SCHED
476 unsigned long rt_nr_boosted; 476 unsigned long rt_nr_boosted;
@@ -525,7 +525,7 @@ static struct root_domain def_root_domain;
525 */ 525 */
526struct rq { 526struct rq {
527 /* runqueue lock: */ 527 /* runqueue lock: */
528 spinlock_t lock; 528 raw_spinlock_t lock;
529 529
530 /* 530 /*
531 * nr_running and cpu_load should be in the same cacheline because 531 * nr_running and cpu_load should be in the same cacheline because
@@ -685,7 +685,7 @@ inline void update_rq_clock(struct rq *rq)
685 */ 685 */
686int runqueue_is_locked(int cpu) 686int runqueue_is_locked(int cpu)
687{ 687{
688 return spin_is_locked(&cpu_rq(cpu)->lock); 688 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
689} 689}
690 690
691/* 691/*
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
814 * default: 0.25ms 814 * default: 0.25ms
815 */ 815 */
816unsigned int sysctl_sched_shares_ratelimit = 250000; 816unsigned int sysctl_sched_shares_ratelimit = 250000;
817unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
817 818
818/* 819/*
819 * Inject some fuzzyness into changing the per-cpu group shares 820 * Inject some fuzzyness into changing the per-cpu group shares
@@ -892,7 +893,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
892 */ 893 */
893 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 894 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
894 895
895 spin_unlock_irq(&rq->lock); 896 raw_spin_unlock_irq(&rq->lock);
896} 897}
897 898
898#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 899#else /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -916,9 +917,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
916 next->oncpu = 1; 917 next->oncpu = 1;
917#endif 918#endif
918#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 919#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
919 spin_unlock_irq(&rq->lock); 920 raw_spin_unlock_irq(&rq->lock);
920#else 921#else
921 spin_unlock(&rq->lock); 922 raw_spin_unlock(&rq->lock);
922#endif 923#endif
923} 924}
924 925
@@ -948,10 +949,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
948{ 949{
949 for (;;) { 950 for (;;) {
950 struct rq *rq = task_rq(p); 951 struct rq *rq = task_rq(p);
951 spin_lock(&rq->lock); 952 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 953 if (likely(rq == task_rq(p)))
953 return rq; 954 return rq;
954 spin_unlock(&rq->lock); 955 raw_spin_unlock(&rq->lock);
955 } 956 }
956} 957}
957 958
@@ -968,10 +969,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
968 for (;;) { 969 for (;;) {
969 local_irq_save(*flags); 970 local_irq_save(*flags);
970 rq = task_rq(p); 971 rq = task_rq(p);
971 spin_lock(&rq->lock); 972 raw_spin_lock(&rq->lock);
972 if (likely(rq == task_rq(p))) 973 if (likely(rq == task_rq(p)))
973 return rq; 974 return rq;
974 spin_unlock_irqrestore(&rq->lock, *flags); 975 raw_spin_unlock_irqrestore(&rq->lock, *flags);
975 } 976 }
976} 977}
977 978
@@ -980,19 +981,19 @@ void task_rq_unlock_wait(struct task_struct *p)
980 struct rq *rq = task_rq(p); 981 struct rq *rq = task_rq(p);
981 982
982 smp_mb(); /* spin-unlock-wait is not a full memory barrier */ 983 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
983 spin_unlock_wait(&rq->lock); 984 raw_spin_unlock_wait(&rq->lock);
984} 985}
985 986
986static void __task_rq_unlock(struct rq *rq) 987static void __task_rq_unlock(struct rq *rq)
987 __releases(rq->lock) 988 __releases(rq->lock)
988{ 989{
989 spin_unlock(&rq->lock); 990 raw_spin_unlock(&rq->lock);
990} 991}
991 992
992static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 993static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
993 __releases(rq->lock) 994 __releases(rq->lock)
994{ 995{
995 spin_unlock_irqrestore(&rq->lock, *flags); 996 raw_spin_unlock_irqrestore(&rq->lock, *flags);
996} 997}
997 998
998/* 999/*
@@ -1005,7 +1006,7 @@ static struct rq *this_rq_lock(void)
1005 1006
1006 local_irq_disable(); 1007 local_irq_disable();
1007 rq = this_rq(); 1008 rq = this_rq();
1008 spin_lock(&rq->lock); 1009 raw_spin_lock(&rq->lock);
1009 1010
1010 return rq; 1011 return rq;
1011} 1012}
@@ -1052,10 +1053,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1052 1053
1053 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1054 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1054 1055
1055 spin_lock(&rq->lock); 1056 raw_spin_lock(&rq->lock);
1056 update_rq_clock(rq); 1057 update_rq_clock(rq);
1057 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1058 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1058 spin_unlock(&rq->lock); 1059 raw_spin_unlock(&rq->lock);
1059 1060
1060 return HRTIMER_NORESTART; 1061 return HRTIMER_NORESTART;
1061} 1062}
@@ -1068,10 +1069,10 @@ static void __hrtick_start(void *arg)
1068{ 1069{
1069 struct rq *rq = arg; 1070 struct rq *rq = arg;
1070 1071
1071 spin_lock(&rq->lock); 1072 raw_spin_lock(&rq->lock);
1072 hrtimer_restart(&rq->hrtick_timer); 1073 hrtimer_restart(&rq->hrtick_timer);
1073 rq->hrtick_csd_pending = 0; 1074 rq->hrtick_csd_pending = 0;
1074 spin_unlock(&rq->lock); 1075 raw_spin_unlock(&rq->lock);
1075} 1076}
1076 1077
1077/* 1078/*
@@ -1178,7 +1179,7 @@ static void resched_task(struct task_struct *p)
1178{ 1179{
1179 int cpu; 1180 int cpu;
1180 1181
1181 assert_spin_locked(&task_rq(p)->lock); 1182 assert_raw_spin_locked(&task_rq(p)->lock);
1182 1183
1183 if (test_tsk_need_resched(p)) 1184 if (test_tsk_need_resched(p))
1184 return; 1185 return;
@@ -1200,10 +1201,10 @@ static void resched_cpu(int cpu)
1200 struct rq *rq = cpu_rq(cpu); 1201 struct rq *rq = cpu_rq(cpu);
1201 unsigned long flags; 1202 unsigned long flags;
1202 1203
1203 if (!spin_trylock_irqsave(&rq->lock, flags)) 1204 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1204 return; 1205 return;
1205 resched_task(cpu_curr(cpu)); 1206 resched_task(cpu_curr(cpu));
1206 spin_unlock_irqrestore(&rq->lock, flags); 1207 raw_spin_unlock_irqrestore(&rq->lock, flags);
1207} 1208}
1208 1209
1209#ifdef CONFIG_NO_HZ 1210#ifdef CONFIG_NO_HZ
@@ -1272,7 +1273,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1272#else /* !CONFIG_SMP */ 1273#else /* !CONFIG_SMP */
1273static void resched_task(struct task_struct *p) 1274static void resched_task(struct task_struct *p)
1274{ 1275{
1275 assert_spin_locked(&task_rq(p)->lock); 1276 assert_raw_spin_locked(&task_rq(p)->lock);
1276 set_tsk_need_resched(p); 1277 set_tsk_need_resched(p);
1277} 1278}
1278 1279
@@ -1599,11 +1600,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1599 struct rq *rq = cpu_rq(cpu); 1600 struct rq *rq = cpu_rq(cpu);
1600 unsigned long flags; 1601 unsigned long flags;
1601 1602
1602 spin_lock_irqsave(&rq->lock, flags); 1603 raw_spin_lock_irqsave(&rq->lock, flags);
1603 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1604 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1604 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1605 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1605 __set_se_shares(tg->se[cpu], shares); 1606 __set_se_shares(tg->se[cpu], shares);
1606 spin_unlock_irqrestore(&rq->lock, flags); 1607 raw_spin_unlock_irqrestore(&rq->lock, flags);
1607 } 1608 }
1608} 1609}
1609 1610
@@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1614 */ 1615 */
1615static int tg_shares_up(struct task_group *tg, void *data) 1616static int tg_shares_up(struct task_group *tg, void *data)
1616{ 1617{
1617 unsigned long weight, rq_weight = 0, shares = 0; 1618 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1618 unsigned long *usd_rq_weight; 1619 unsigned long *usd_rq_weight;
1619 struct sched_domain *sd = data; 1620 struct sched_domain *sd = data;
1620 unsigned long flags; 1621 unsigned long flags;
@@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1630 weight = tg->cfs_rq[i]->load.weight; 1631 weight = tg->cfs_rq[i]->load.weight;
1631 usd_rq_weight[i] = weight; 1632 usd_rq_weight[i] = weight;
1632 1633
1634 rq_weight += weight;
1633 /* 1635 /*
1634 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1635 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
@@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1638 if (!weight) 1640 if (!weight)
1639 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1640 1642
1641 rq_weight += weight; 1643 sum_weight += weight;
1642 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1643 } 1645 }
1644 1646
1647 if (!rq_weight)
1648 rq_weight = sum_weight;
1649
1645 if ((!shares && rq_weight) || shares > tg->shares) 1650 if ((!shares && rq_weight) || shares > tg->shares)
1646 shares = tg->shares; 1651 shares = tg->shares;
1647 1652
@@ -1701,9 +1706,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1701 if (root_task_group_empty()) 1706 if (root_task_group_empty())
1702 return; 1707 return;
1703 1708
1704 spin_unlock(&rq->lock); 1709 raw_spin_unlock(&rq->lock);
1705 update_shares(sd); 1710 update_shares(sd);
1706 spin_lock(&rq->lock); 1711 raw_spin_lock(&rq->lock);
1707} 1712}
1708 1713
1709static void update_h_load(long cpu) 1714static void update_h_load(long cpu)
@@ -1743,7 +1748,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1743 __acquires(busiest->lock) 1748 __acquires(busiest->lock)
1744 __acquires(this_rq->lock) 1749 __acquires(this_rq->lock)
1745{ 1750{
1746 spin_unlock(&this_rq->lock); 1751 raw_spin_unlock(&this_rq->lock);
1747 double_rq_lock(this_rq, busiest); 1752 double_rq_lock(this_rq, busiest);
1748 1753
1749 return 1; 1754 return 1;
@@ -1764,14 +1769,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1764{ 1769{
1765 int ret = 0; 1770 int ret = 0;
1766 1771
1767 if (unlikely(!spin_trylock(&busiest->lock))) { 1772 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1768 if (busiest < this_rq) { 1773 if (busiest < this_rq) {
1769 spin_unlock(&this_rq->lock); 1774 raw_spin_unlock(&this_rq->lock);
1770 spin_lock(&busiest->lock); 1775 raw_spin_lock(&busiest->lock);
1771 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1776 raw_spin_lock_nested(&this_rq->lock,
1777 SINGLE_DEPTH_NESTING);
1772 ret = 1; 1778 ret = 1;
1773 } else 1779 } else
1774 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1780 raw_spin_lock_nested(&busiest->lock,
1781 SINGLE_DEPTH_NESTING);
1775 } 1782 }
1776 return ret; 1783 return ret;
1777} 1784}
@@ -1785,7 +1792,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1785{ 1792{
1786 if (unlikely(!irqs_disabled())) { 1793 if (unlikely(!irqs_disabled())) {
1787 /* printk() doesn't work good under rq->lock */ 1794 /* printk() doesn't work good under rq->lock */
1788 spin_unlock(&this_rq->lock); 1795 raw_spin_unlock(&this_rq->lock);
1789 BUG_ON(1); 1796 BUG_ON(1);
1790 } 1797 }
1791 1798
@@ -1795,7 +1802,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1795static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1802static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1796 __releases(busiest->lock) 1803 __releases(busiest->lock)
1797{ 1804{
1798 spin_unlock(&busiest->lock); 1805 raw_spin_unlock(&busiest->lock);
1799 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1800} 1807}
1801#endif 1808#endif
@@ -1810,6 +1817,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1810#endif 1817#endif
1811 1818
1812static void calc_load_account_active(struct rq *this_rq); 1819static void calc_load_account_active(struct rq *this_rq);
1820static void update_sysctl(void);
1821static int get_update_sysctl_factor(void);
1822
1823static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1824{
1825 set_task_rq(p, cpu);
1826#ifdef CONFIG_SMP
1827 /*
1828 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1829 * successfuly executed on another CPU. We must ensure that updates of
1830 * per-task data have been completed by this moment.
1831 */
1832 smp_wmb();
1833 task_thread_info(p)->cpu = cpu;
1834#endif
1835}
1813 1836
1814#include "sched_stats.h" 1837#include "sched_stats.h"
1815#include "sched_idletask.c" 1838#include "sched_idletask.c"
@@ -1967,20 +1990,6 @@ inline int task_curr(const struct task_struct *p)
1967 return cpu_curr(task_cpu(p)) == p; 1990 return cpu_curr(task_cpu(p)) == p;
1968} 1991}
1969 1992
1970static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1971{
1972 set_task_rq(p, cpu);
1973#ifdef CONFIG_SMP
1974 /*
1975 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1976 * successfuly executed on another CPU. We must ensure that updates of
1977 * per-task data have been completed by this moment.
1978 */
1979 smp_wmb();
1980 task_thread_info(p)->cpu = cpu;
1981#endif
1982}
1983
1984static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1993static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1985 const struct sched_class *prev_class, 1994 const struct sched_class *prev_class,
1986 int oldprio, int running) 1995 int oldprio, int running)
@@ -1993,39 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1993 p->sched_class->prio_changed(rq, p, oldprio, running); 2002 p->sched_class->prio_changed(rq, p, oldprio, running);
1994} 2003}
1995 2004
1996/**
1997 * kthread_bind - bind a just-created kthread to a cpu.
1998 * @p: thread created by kthread_create().
1999 * @cpu: cpu (might not be online, must be possible) for @k to run on.
2000 *
2001 * Description: This function is equivalent to set_cpus_allowed(),
2002 * except that @cpu doesn't need to be online, and the thread must be
2003 * stopped (i.e., just returned from kthread_create()).
2004 *
2005 * Function lives here instead of kthread.c because it messes with
2006 * scheduler internals which require locking.
2007 */
2008void kthread_bind(struct task_struct *p, unsigned int cpu)
2009{
2010 struct rq *rq = cpu_rq(cpu);
2011 unsigned long flags;
2012
2013 /* Must have done schedule() in kthread() before we set_task_cpu */
2014 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2015 WARN_ON(1);
2016 return;
2017 }
2018
2019 spin_lock_irqsave(&rq->lock, flags);
2020 update_rq_clock(rq);
2021 set_task_cpu(p, cpu);
2022 p->cpus_allowed = cpumask_of_cpu(cpu);
2023 p->rt.nr_cpus_allowed = 1;
2024 p->flags |= PF_THREAD_BOUND;
2025 spin_unlock_irqrestore(&rq->lock, flags);
2026}
2027EXPORT_SYMBOL(kthread_bind);
2028
2029#ifdef CONFIG_SMP 2005#ifdef CONFIG_SMP
2030/* 2006/*
2031 * Is this task likely cache-hot: 2007 * Is this task likely cache-hot:
@@ -2035,6 +2011,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2035{ 2011{
2036 s64 delta; 2012 s64 delta;
2037 2013
2014 if (p->sched_class != &fair_sched_class)
2015 return 0;
2016
2038 /* 2017 /*
2039 * Buddy candidates are cache hot: 2018 * Buddy candidates are cache hot:
2040 */ 2019 */
@@ -2043,9 +2022,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2043 &p->se == cfs_rq_of(&p->se)->last)) 2022 &p->se == cfs_rq_of(&p->se)->last))
2044 return 1; 2023 return 1;
2045 2024
2046 if (p->sched_class != &fair_sched_class)
2047 return 0;
2048
2049 if (sysctl_sched_migration_cost == -1) 2025 if (sysctl_sched_migration_cost == -1)
2050 return 1; 2026 return 1;
2051 if (sysctl_sched_migration_cost == 0) 2027 if (sysctl_sched_migration_cost == 0)
@@ -2056,38 +2032,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2056 return delta < (s64)sysctl_sched_migration_cost; 2032 return delta < (s64)sysctl_sched_migration_cost;
2057} 2033}
2058 2034
2059
2060void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2035void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2061{ 2036{
2062 int old_cpu = task_cpu(p); 2037#ifdef CONFIG_SCHED_DEBUG
2063 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 2038 /*
2064 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2039 * We should never call set_task_cpu() on a blocked task,
2065 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2040 * ttwu() will sort out the placement.
2066 u64 clock_offset; 2041 */
2067 2042 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2068 clock_offset = old_rq->clock - new_rq->clock; 2043 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2044#endif
2069 2045
2070 trace_sched_migrate_task(p, new_cpu); 2046 trace_sched_migrate_task(p, new_cpu);
2071 2047
2072#ifdef CONFIG_SCHEDSTATS 2048 if (task_cpu(p) != new_cpu) {
2073 if (p->se.wait_start)
2074 p->se.wait_start -= clock_offset;
2075 if (p->se.sleep_start)
2076 p->se.sleep_start -= clock_offset;
2077 if (p->se.block_start)
2078 p->se.block_start -= clock_offset;
2079#endif
2080 if (old_cpu != new_cpu) {
2081 p->se.nr_migrations++; 2049 p->se.nr_migrations++;
2082#ifdef CONFIG_SCHEDSTATS 2050 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2083 if (task_hot(p, old_rq->clock, NULL))
2084 schedstat_inc(p, se.nr_forced2_migrations);
2085#endif
2086 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2087 1, 1, NULL, 0);
2088 } 2051 }
2089 p->se.vruntime -= old_cfsrq->min_vruntime -
2090 new_cfsrq->min_vruntime;
2091 2052
2092 __set_task_cpu(p, new_cpu); 2053 __set_task_cpu(p, new_cpu);
2093} 2054}
@@ -2112,13 +2073,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2112 2073
2113 /* 2074 /*
2114 * If the task is not on a runqueue (and not running), then 2075 * If the task is not on a runqueue (and not running), then
2115 * it is sufficient to simply update the task's cpu field. 2076 * the next wake-up will properly place the task.
2116 */ 2077 */
2117 if (!p->se.on_rq && !task_running(rq, p)) { 2078 if (!p->se.on_rq && !task_running(rq, p))
2118 update_rq_clock(rq);
2119 set_task_cpu(p, dest_cpu);
2120 return 0; 2079 return 0;
2121 }
2122 2080
2123 init_completion(&req->done); 2081 init_completion(&req->done);
2124 req->task = p; 2082 req->task = p;
@@ -2323,6 +2281,75 @@ void task_oncpu_function_call(struct task_struct *p,
2323 preempt_enable(); 2281 preempt_enable();
2324} 2282}
2325 2283
2284#ifdef CONFIG_SMP
2285static int select_fallback_rq(int cpu, struct task_struct *p)
2286{
2287 int dest_cpu;
2288 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2289
2290 /* Look for allowed, online CPU in same node. */
2291 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2292 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2293 return dest_cpu;
2294
2295 /* Any allowed, online CPU? */
2296 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2297 if (dest_cpu < nr_cpu_ids)
2298 return dest_cpu;
2299
2300 /* No more Mr. Nice Guy. */
2301 if (dest_cpu >= nr_cpu_ids) {
2302 rcu_read_lock();
2303 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2304 rcu_read_unlock();
2305 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2306
2307 /*
2308 * Don't tell them about moving exiting tasks or
2309 * kernel threads (both mm NULL), since they never
2310 * leave kernel.
2311 */
2312 if (p->mm && printk_ratelimit()) {
2313 printk(KERN_INFO "process %d (%s) no "
2314 "longer affine to cpu%d\n",
2315 task_pid_nr(p), p->comm, cpu);
2316 }
2317 }
2318
2319 return dest_cpu;
2320}
2321
2322/*
2323 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2324 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2325 * by:
2326 *
2327 * exec: is unstable, retry loop
2328 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2329 */
2330static inline
2331int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2332{
2333 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2334
2335 /*
2336 * In order not to call set_task_cpu() on a blocking task we need
2337 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2338 * cpu.
2339 *
2340 * Since this is common to all placement strategies, this lives here.
2341 *
2342 * [ this allows ->select_task() to simply return task_cpu(p) and
2343 * not worry about this generic constraint ]
2344 */
2345 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2346 !cpu_online(cpu)))
2347 cpu = select_fallback_rq(task_cpu(p), p);
2348
2349 return cpu;
2350}
2351#endif
2352
2326/*** 2353/***
2327 * try_to_wake_up - wake up a thread 2354 * try_to_wake_up - wake up a thread
2328 * @p: the to-be-woken-up thread 2355 * @p: the to-be-woken-up thread
@@ -2374,17 +2401,18 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2374 if (task_contributes_to_load(p)) 2401 if (task_contributes_to_load(p))
2375 rq->nr_uninterruptible--; 2402 rq->nr_uninterruptible--;
2376 p->state = TASK_WAKING; 2403 p->state = TASK_WAKING;
2377 task_rq_unlock(rq, &flags);
2378 2404
2379 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2405 if (p->sched_class->task_waking)
2380 if (cpu != orig_cpu) { 2406 p->sched_class->task_waking(rq, p);
2381 local_irq_save(flags); 2407
2382 rq = cpu_rq(cpu); 2408 __task_rq_unlock(rq);
2383 update_rq_clock(rq); 2409
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2411 if (cpu != orig_cpu)
2384 set_task_cpu(p, cpu); 2412 set_task_cpu(p, cpu);
2385 local_irq_restore(flags); 2413
2386 } 2414 rq = __task_rq_lock(p);
2387 rq = task_rq_lock(p, &flags); 2415 update_rq_clock(rq);
2388 2416
2389 WARN_ON(p->state != TASK_WAKING); 2417 WARN_ON(p->state != TASK_WAKING);
2390 cpu = task_cpu(p); 2418 cpu = task_cpu(p);
@@ -2440,8 +2468,8 @@ out_running:
2440 2468
2441 p->state = TASK_RUNNING; 2469 p->state = TASK_RUNNING;
2442#ifdef CONFIG_SMP 2470#ifdef CONFIG_SMP
2443 if (p->sched_class->task_wake_up) 2471 if (p->sched_class->task_woken)
2444 p->sched_class->task_wake_up(rq, p); 2472 p->sched_class->task_woken(rq, p);
2445 2473
2446 if (unlikely(rq->idle_stamp)) { 2474 if (unlikely(rq->idle_stamp)) {
2447 u64 delta = rq->clock - rq->idle_stamp; 2475 u64 delta = rq->clock - rq->idle_stamp;
@@ -2499,7 +2527,6 @@ static void __sched_fork(struct task_struct *p)
2499 p->se.avg_overlap = 0; 2527 p->se.avg_overlap = 0;
2500 p->se.start_runtime = 0; 2528 p->se.start_runtime = 0;
2501 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2529 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2502 p->se.avg_running = 0;
2503 2530
2504#ifdef CONFIG_SCHEDSTATS 2531#ifdef CONFIG_SCHEDSTATS
2505 p->se.wait_start = 0; 2532 p->se.wait_start = 0;
@@ -2521,7 +2548,6 @@ static void __sched_fork(struct task_struct *p)
2521 p->se.nr_failed_migrations_running = 0; 2548 p->se.nr_failed_migrations_running = 0;
2522 p->se.nr_failed_migrations_hot = 0; 2549 p->se.nr_failed_migrations_hot = 0;
2523 p->se.nr_forced_migrations = 0; 2550 p->se.nr_forced_migrations = 0;
2524 p->se.nr_forced2_migrations = 0;
2525 2551
2526 p->se.nr_wakeups = 0; 2552 p->se.nr_wakeups = 0;
2527 p->se.nr_wakeups_sync = 0; 2553 p->se.nr_wakeups_sync = 0;
@@ -2542,14 +2568,6 @@ static void __sched_fork(struct task_struct *p)
2542#ifdef CONFIG_PREEMPT_NOTIFIERS 2568#ifdef CONFIG_PREEMPT_NOTIFIERS
2543 INIT_HLIST_HEAD(&p->preempt_notifiers); 2569 INIT_HLIST_HEAD(&p->preempt_notifiers);
2544#endif 2570#endif
2545
2546 /*
2547 * We mark the process as running here, but have not actually
2548 * inserted it onto the runqueue yet. This guarantees that
2549 * nobody will actually run it, and a signal or other external
2550 * event cannot wake it up and insert it on the runqueue either.
2551 */
2552 p->state = TASK_RUNNING;
2553} 2571}
2554 2572
2555/* 2573/*
@@ -2558,9 +2576,14 @@ static void __sched_fork(struct task_struct *p)
2558void sched_fork(struct task_struct *p, int clone_flags) 2576void sched_fork(struct task_struct *p, int clone_flags)
2559{ 2577{
2560 int cpu = get_cpu(); 2578 int cpu = get_cpu();
2561 unsigned long flags;
2562 2579
2563 __sched_fork(p); 2580 __sched_fork(p);
2581 /*
2582 * We mark the process as waking here. This guarantees that
2583 * nobody will actually run it, and a signal or other external
2584 * event cannot wake it up and insert it on the runqueue either.
2585 */
2586 p->state = TASK_WAKING;
2564 2587
2565 /* 2588 /*
2566 * Revert to default priority/policy on fork if requested. 2589 * Revert to default priority/policy on fork if requested.
@@ -2592,13 +2615,10 @@ void sched_fork(struct task_struct *p, int clone_flags)
2592 if (!rt_prio(p->prio)) 2615 if (!rt_prio(p->prio))
2593 p->sched_class = &fair_sched_class; 2616 p->sched_class = &fair_sched_class;
2594 2617
2595#ifdef CONFIG_SMP 2618 if (p->sched_class->task_fork)
2596 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2619 p->sched_class->task_fork(p);
2597#endif 2620
2598 local_irq_save(flags);
2599 update_rq_clock(cpu_rq(cpu));
2600 set_task_cpu(p, cpu); 2621 set_task_cpu(p, cpu);
2601 local_irq_restore(flags);
2602 2622
2603#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2623#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2604 if (likely(sched_info_on())) 2624 if (likely(sched_info_on()))
@@ -2627,28 +2647,35 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2627{ 2647{
2628 unsigned long flags; 2648 unsigned long flags;
2629 struct rq *rq; 2649 struct rq *rq;
2650 int cpu = get_cpu();
2651
2652#ifdef CONFIG_SMP
2653 /*
2654 * Fork balancing, do it here and not earlier because:
2655 * - cpus_allowed can change in the fork path
2656 * - any previously selected cpu might disappear through hotplug
2657 *
2658 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2659 * ->cpus_allowed is stable, we have preemption disabled, meaning
2660 * cpu_online_mask is stable.
2661 */
2662 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2663 set_task_cpu(p, cpu);
2664#endif
2630 2665
2631 rq = task_rq_lock(p, &flags); 2666 rq = task_rq_lock(p, &flags);
2632 BUG_ON(p->state != TASK_RUNNING); 2667 BUG_ON(p->state != TASK_WAKING);
2668 p->state = TASK_RUNNING;
2633 update_rq_clock(rq); 2669 update_rq_clock(rq);
2634 2670 activate_task(rq, p, 0);
2635 if (!p->sched_class->task_new || !current->se.on_rq) {
2636 activate_task(rq, p, 0);
2637 } else {
2638 /*
2639 * Let the scheduling class do new task startup
2640 * management (if any):
2641 */
2642 p->sched_class->task_new(rq, p);
2643 inc_nr_running(rq);
2644 }
2645 trace_sched_wakeup_new(rq, p, 1); 2671 trace_sched_wakeup_new(rq, p, 1);
2646 check_preempt_curr(rq, p, WF_FORK); 2672 check_preempt_curr(rq, p, WF_FORK);
2647#ifdef CONFIG_SMP 2673#ifdef CONFIG_SMP
2648 if (p->sched_class->task_wake_up) 2674 if (p->sched_class->task_woken)
2649 p->sched_class->task_wake_up(rq, p); 2675 p->sched_class->task_woken(rq, p);
2650#endif 2676#endif
2651 task_rq_unlock(rq, &flags); 2677 task_rq_unlock(rq, &flags);
2678 put_cpu();
2652} 2679}
2653 2680
2654#ifdef CONFIG_PREEMPT_NOTIFIERS 2681#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2798,10 +2825,10 @@ static inline void post_schedule(struct rq *rq)
2798 if (rq->post_schedule) { 2825 if (rq->post_schedule) {
2799 unsigned long flags; 2826 unsigned long flags;
2800 2827
2801 spin_lock_irqsave(&rq->lock, flags); 2828 raw_spin_lock_irqsave(&rq->lock, flags);
2802 if (rq->curr->sched_class->post_schedule) 2829 if (rq->curr->sched_class->post_schedule)
2803 rq->curr->sched_class->post_schedule(rq); 2830 rq->curr->sched_class->post_schedule(rq);
2804 spin_unlock_irqrestore(&rq->lock, flags); 2831 raw_spin_unlock_irqrestore(&rq->lock, flags);
2805 2832
2806 rq->post_schedule = 0; 2833 rq->post_schedule = 0;
2807 } 2834 }
@@ -3083,15 +3110,15 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3083{ 3110{
3084 BUG_ON(!irqs_disabled()); 3111 BUG_ON(!irqs_disabled());
3085 if (rq1 == rq2) { 3112 if (rq1 == rq2) {
3086 spin_lock(&rq1->lock); 3113 raw_spin_lock(&rq1->lock);
3087 __acquire(rq2->lock); /* Fake it out ;) */ 3114 __acquire(rq2->lock); /* Fake it out ;) */
3088 } else { 3115 } else {
3089 if (rq1 < rq2) { 3116 if (rq1 < rq2) {
3090 spin_lock(&rq1->lock); 3117 raw_spin_lock(&rq1->lock);
3091 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 3118 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3092 } else { 3119 } else {
3093 spin_lock(&rq2->lock); 3120 raw_spin_lock(&rq2->lock);
3094 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 3121 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3095 } 3122 }
3096 } 3123 }
3097 update_rq_clock(rq1); 3124 update_rq_clock(rq1);
@@ -3108,29 +3135,44 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3108 __releases(rq1->lock) 3135 __releases(rq1->lock)
3109 __releases(rq2->lock) 3136 __releases(rq2->lock)
3110{ 3137{
3111 spin_unlock(&rq1->lock); 3138 raw_spin_unlock(&rq1->lock);
3112 if (rq1 != rq2) 3139 if (rq1 != rq2)
3113 spin_unlock(&rq2->lock); 3140 raw_spin_unlock(&rq2->lock);
3114 else 3141 else
3115 __release(rq2->lock); 3142 __release(rq2->lock);
3116} 3143}
3117 3144
3118/* 3145/*
3119 * If dest_cpu is allowed for this process, migrate the task to it. 3146 * sched_exec - execve() is a valuable balancing opportunity, because at
3120 * This is accomplished by forcing the cpu_allowed mask to only 3147 * this point the task has the smallest effective memory and cache footprint.
3121 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3122 * the cpu_allowed mask is restored.
3123 */ 3148 */
3124static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3149void sched_exec(void)
3125{ 3150{
3151 struct task_struct *p = current;
3126 struct migration_req req; 3152 struct migration_req req;
3153 int dest_cpu, this_cpu;
3127 unsigned long flags; 3154 unsigned long flags;
3128 struct rq *rq; 3155 struct rq *rq;
3129 3156
3157again:
3158 this_cpu = get_cpu();
3159 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3160 if (dest_cpu == this_cpu) {
3161 put_cpu();
3162 return;
3163 }
3164
3130 rq = task_rq_lock(p, &flags); 3165 rq = task_rq_lock(p, &flags);
3166 put_cpu();
3167
3168 /*
3169 * select_task_rq() can race against ->cpus_allowed
3170 */
3131 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3171 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3132 || unlikely(!cpu_active(dest_cpu))) 3172 || unlikely(!cpu_active(dest_cpu))) {
3133 goto out; 3173 task_rq_unlock(rq, &flags);
3174 goto again;
3175 }
3134 3176
3135 /* force the process onto the specified CPU */ 3177 /* force the process onto the specified CPU */
3136 if (migrate_task(p, dest_cpu, &req)) { 3178 if (migrate_task(p, dest_cpu, &req)) {
@@ -3145,24 +3187,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3145 3187
3146 return; 3188 return;
3147 } 3189 }
3148out:
3149 task_rq_unlock(rq, &flags); 3190 task_rq_unlock(rq, &flags);
3150} 3191}
3151 3192
3152/* 3193/*
3153 * sched_exec - execve() is a valuable balancing opportunity, because at
3154 * this point the task has the smallest effective memory and cache footprint.
3155 */
3156void sched_exec(void)
3157{
3158 int new_cpu, this_cpu = get_cpu();
3159 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3160 put_cpu();
3161 if (new_cpu != this_cpu)
3162 sched_migrate_task(current, new_cpu);
3163}
3164
3165/*
3166 * pull_task - move a task from a remote runqueue to the local runqueue. 3194 * pull_task - move a task from a remote runqueue to the local runqueue.
3167 * Both runqueues must be locked. 3195 * Both runqueues must be locked.
3168 */ 3196 */
@@ -3172,10 +3200,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3172 deactivate_task(src_rq, p, 0); 3200 deactivate_task(src_rq, p, 0);
3173 set_task_cpu(p, this_cpu); 3201 set_task_cpu(p, this_cpu);
3174 activate_task(this_rq, p, 0); 3202 activate_task(this_rq, p, 0);
3175 /*
3176 * Note that idle threads have a prio of MAX_PRIO, for this test
3177 * to be always true for them.
3178 */
3179 check_preempt_curr(this_rq, p, 0); 3203 check_preempt_curr(this_rq, p, 0);
3180} 3204}
3181 3205
@@ -4134,7 +4158,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4134 unsigned long flags; 4158 unsigned long flags;
4135 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4159 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4136 4160
4137 cpumask_copy(cpus, cpu_online_mask); 4161 cpumask_copy(cpus, cpu_active_mask);
4138 4162
4139 /* 4163 /*
4140 * When power savings policy is enabled for the parent domain, idle 4164 * When power savings policy is enabled for the parent domain, idle
@@ -4207,14 +4231,15 @@ redo:
4207 4231
4208 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 4232 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4209 4233
4210 spin_lock_irqsave(&busiest->lock, flags); 4234 raw_spin_lock_irqsave(&busiest->lock, flags);
4211 4235
4212 /* don't kick the migration_thread, if the curr 4236 /* don't kick the migration_thread, if the curr
4213 * task on busiest cpu can't be moved to this_cpu 4237 * task on busiest cpu can't be moved to this_cpu
4214 */ 4238 */
4215 if (!cpumask_test_cpu(this_cpu, 4239 if (!cpumask_test_cpu(this_cpu,
4216 &busiest->curr->cpus_allowed)) { 4240 &busiest->curr->cpus_allowed)) {
4217 spin_unlock_irqrestore(&busiest->lock, flags); 4241 raw_spin_unlock_irqrestore(&busiest->lock,
4242 flags);
4218 all_pinned = 1; 4243 all_pinned = 1;
4219 goto out_one_pinned; 4244 goto out_one_pinned;
4220 } 4245 }
@@ -4224,7 +4249,7 @@ redo:
4224 busiest->push_cpu = this_cpu; 4249 busiest->push_cpu = this_cpu;
4225 active_balance = 1; 4250 active_balance = 1;
4226 } 4251 }
4227 spin_unlock_irqrestore(&busiest->lock, flags); 4252 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4228 if (active_balance) 4253 if (active_balance)
4229 wake_up_process(busiest->migration_thread); 4254 wake_up_process(busiest->migration_thread);
4230 4255
@@ -4297,7 +4322,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4297 int all_pinned = 0; 4322 int all_pinned = 0;
4298 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4323 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4299 4324
4300 cpumask_copy(cpus, cpu_online_mask); 4325 cpumask_copy(cpus, cpu_active_mask);
4301 4326
4302 /* 4327 /*
4303 * When power savings policy is enabled for the parent domain, idle 4328 * When power savings policy is enabled for the parent domain, idle
@@ -4406,10 +4431,10 @@ redo:
4406 /* 4431 /*
4407 * Should not call ttwu while holding a rq->lock 4432 * Should not call ttwu while holding a rq->lock
4408 */ 4433 */
4409 spin_unlock(&this_rq->lock); 4434 raw_spin_unlock(&this_rq->lock);
4410 if (active_balance) 4435 if (active_balance)
4411 wake_up_process(busiest->migration_thread); 4436 wake_up_process(busiest->migration_thread);
4412 spin_lock(&this_rq->lock); 4437 raw_spin_lock(&this_rq->lock);
4413 4438
4414 } else 4439 } else
4415 sd->nr_balance_failed = 0; 4440 sd->nr_balance_failed = 0;
@@ -4694,7 +4719,7 @@ int select_nohz_load_balancer(int stop_tick)
4694 cpumask_set_cpu(cpu, nohz.cpu_mask); 4719 cpumask_set_cpu(cpu, nohz.cpu_mask);
4695 4720
4696 /* time for ilb owner also to sleep */ 4721 /* time for ilb owner also to sleep */
4697 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4722 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4698 if (atomic_read(&nohz.load_balancer) == cpu) 4723 if (atomic_read(&nohz.load_balancer) == cpu)
4699 atomic_set(&nohz.load_balancer, -1); 4724 atomic_set(&nohz.load_balancer, -1);
4700 return 0; 4725 return 0;
@@ -5278,11 +5303,11 @@ void scheduler_tick(void)
5278 5303
5279 sched_clock_tick(); 5304 sched_clock_tick();
5280 5305
5281 spin_lock(&rq->lock); 5306 raw_spin_lock(&rq->lock);
5282 update_rq_clock(rq); 5307 update_rq_clock(rq);
5283 update_cpu_load(rq); 5308 update_cpu_load(rq);
5284 curr->sched_class->task_tick(rq, curr, 0); 5309 curr->sched_class->task_tick(rq, curr, 0);
5285 spin_unlock(&rq->lock); 5310 raw_spin_unlock(&rq->lock);
5286 5311
5287 perf_event_task_tick(curr, cpu); 5312 perf_event_task_tick(curr, cpu);
5288 5313
@@ -5396,13 +5421,14 @@ static inline void schedule_debug(struct task_struct *prev)
5396#endif 5421#endif
5397} 5422}
5398 5423
5399static void put_prev_task(struct rq *rq, struct task_struct *p) 5424static void put_prev_task(struct rq *rq, struct task_struct *prev)
5400{ 5425{
5401 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5426 if (prev->state == TASK_RUNNING) {
5427 u64 runtime = prev->se.sum_exec_runtime;
5402 5428
5403 update_avg(&p->se.avg_running, runtime); 5429 runtime -= prev->se.prev_sum_exec_runtime;
5430 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5404 5431
5405 if (p->state == TASK_RUNNING) {
5406 /* 5432 /*
5407 * In order to avoid avg_overlap growing stale when we are 5433 * In order to avoid avg_overlap growing stale when we are
5408 * indeed overlapping and hence not getting put to sleep, grow 5434 * indeed overlapping and hence not getting put to sleep, grow
@@ -5412,12 +5438,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5412 * correlates to the amount of cache footprint a task can 5438 * correlates to the amount of cache footprint a task can
5413 * build up. 5439 * build up.
5414 */ 5440 */
5415 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5441 update_avg(&prev->se.avg_overlap, runtime);
5416 update_avg(&p->se.avg_overlap, runtime);
5417 } else {
5418 update_avg(&p->se.avg_running, 0);
5419 } 5442 }
5420 p->sched_class->put_prev_task(rq, p); 5443 prev->sched_class->put_prev_task(rq, prev);
5421} 5444}
5422 5445
5423/* 5446/*
@@ -5478,7 +5501,7 @@ need_resched_nonpreemptible:
5478 if (sched_feat(HRTICK)) 5501 if (sched_feat(HRTICK))
5479 hrtick_clear(rq); 5502 hrtick_clear(rq);
5480 5503
5481 spin_lock_irq(&rq->lock); 5504 raw_spin_lock_irq(&rq->lock);
5482 update_rq_clock(rq); 5505 update_rq_clock(rq);
5483 clear_tsk_need_resched(prev); 5506 clear_tsk_need_resched(prev);
5484 5507
@@ -5514,12 +5537,15 @@ need_resched_nonpreemptible:
5514 cpu = smp_processor_id(); 5537 cpu = smp_processor_id();
5515 rq = cpu_rq(cpu); 5538 rq = cpu_rq(cpu);
5516 } else 5539 } else
5517 spin_unlock_irq(&rq->lock); 5540 raw_spin_unlock_irq(&rq->lock);
5518 5541
5519 post_schedule(rq); 5542 post_schedule(rq);
5520 5543
5521 if (unlikely(reacquire_kernel_lock(current) < 0)) 5544 if (unlikely(reacquire_kernel_lock(current) < 0)) {
5545 prev = rq->curr;
5546 switch_count = &prev->nivcsw;
5522 goto need_resched_nonpreemptible; 5547 goto need_resched_nonpreemptible;
5548 }
5523 5549
5524 preempt_enable_no_resched(); 5550 preempt_enable_no_resched();
5525 if (need_resched()) 5551 if (need_resched())
@@ -5931,14 +5957,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
5931 */ 5957 */
5932bool try_wait_for_completion(struct completion *x) 5958bool try_wait_for_completion(struct completion *x)
5933{ 5959{
5960 unsigned long flags;
5934 int ret = 1; 5961 int ret = 1;
5935 5962
5936 spin_lock_irq(&x->wait.lock); 5963 spin_lock_irqsave(&x->wait.lock, flags);
5937 if (!x->done) 5964 if (!x->done)
5938 ret = 0; 5965 ret = 0;
5939 else 5966 else
5940 x->done--; 5967 x->done--;
5941 spin_unlock_irq(&x->wait.lock); 5968 spin_unlock_irqrestore(&x->wait.lock, flags);
5942 return ret; 5969 return ret;
5943} 5970}
5944EXPORT_SYMBOL(try_wait_for_completion); 5971EXPORT_SYMBOL(try_wait_for_completion);
@@ -5953,12 +5980,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
5953 */ 5980 */
5954bool completion_done(struct completion *x) 5981bool completion_done(struct completion *x)
5955{ 5982{
5983 unsigned long flags;
5956 int ret = 1; 5984 int ret = 1;
5957 5985
5958 spin_lock_irq(&x->wait.lock); 5986 spin_lock_irqsave(&x->wait.lock, flags);
5959 if (!x->done) 5987 if (!x->done)
5960 ret = 0; 5988 ret = 0;
5961 spin_unlock_irq(&x->wait.lock); 5989 spin_unlock_irqrestore(&x->wait.lock, flags);
5962 return ret; 5990 return ret;
5963} 5991}
5964EXPORT_SYMBOL(completion_done); 5992EXPORT_SYMBOL(completion_done);
@@ -6343,7 +6371,7 @@ recheck:
6343 * make sure no PI-waiters arrive (or leave) while we are 6371 * make sure no PI-waiters arrive (or leave) while we are
6344 * changing the priority of the task: 6372 * changing the priority of the task:
6345 */ 6373 */
6346 spin_lock_irqsave(&p->pi_lock, flags); 6374 raw_spin_lock_irqsave(&p->pi_lock, flags);
6347 /* 6375 /*
6348 * To be able to change p->policy safely, the apropriate 6376 * To be able to change p->policy safely, the apropriate
6349 * runqueue lock must be held. 6377 * runqueue lock must be held.
@@ -6353,7 +6381,7 @@ recheck:
6353 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 6381 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6354 policy = oldpolicy = -1; 6382 policy = oldpolicy = -1;
6355 __task_rq_unlock(rq); 6383 __task_rq_unlock(rq);
6356 spin_unlock_irqrestore(&p->pi_lock, flags); 6384 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6357 goto recheck; 6385 goto recheck;
6358 } 6386 }
6359 update_rq_clock(rq); 6387 update_rq_clock(rq);
@@ -6377,7 +6405,7 @@ recheck:
6377 check_class_changed(rq, p, prev_class, oldprio, running); 6405 check_class_changed(rq, p, prev_class, oldprio, running);
6378 } 6406 }
6379 __task_rq_unlock(rq); 6407 __task_rq_unlock(rq);
6380 spin_unlock_irqrestore(&p->pi_lock, flags); 6408 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6381 6409
6382 rt_mutex_adjust_pi(p); 6410 rt_mutex_adjust_pi(p);
6383 6411
@@ -6477,7 +6505,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6477 return -EINVAL; 6505 return -EINVAL;
6478 6506
6479 retval = -ESRCH; 6507 retval = -ESRCH;
6480 read_lock(&tasklist_lock); 6508 rcu_read_lock();
6481 p = find_process_by_pid(pid); 6509 p = find_process_by_pid(pid);
6482 if (p) { 6510 if (p) {
6483 retval = security_task_getscheduler(p); 6511 retval = security_task_getscheduler(p);
@@ -6485,7 +6513,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6485 retval = p->policy 6513 retval = p->policy
6486 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 6514 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6487 } 6515 }
6488 read_unlock(&tasklist_lock); 6516 rcu_read_unlock();
6489 return retval; 6517 return retval;
6490} 6518}
6491 6519
@@ -6503,7 +6531,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6503 if (!param || pid < 0) 6531 if (!param || pid < 0)
6504 return -EINVAL; 6532 return -EINVAL;
6505 6533
6506 read_lock(&tasklist_lock); 6534 rcu_read_lock();
6507 p = find_process_by_pid(pid); 6535 p = find_process_by_pid(pid);
6508 retval = -ESRCH; 6536 retval = -ESRCH;
6509 if (!p) 6537 if (!p)
@@ -6514,7 +6542,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6514 goto out_unlock; 6542 goto out_unlock;
6515 6543
6516 lp.sched_priority = p->rt_priority; 6544 lp.sched_priority = p->rt_priority;
6517 read_unlock(&tasklist_lock); 6545 rcu_read_unlock();
6518 6546
6519 /* 6547 /*
6520 * This one might sleep, we cannot do it with a spinlock held ... 6548 * This one might sleep, we cannot do it with a spinlock held ...
@@ -6524,7 +6552,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6524 return retval; 6552 return retval;
6525 6553
6526out_unlock: 6554out_unlock:
6527 read_unlock(&tasklist_lock); 6555 rcu_read_unlock();
6528 return retval; 6556 return retval;
6529} 6557}
6530 6558
@@ -6535,22 +6563,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6535 int retval; 6563 int retval;
6536 6564
6537 get_online_cpus(); 6565 get_online_cpus();
6538 read_lock(&tasklist_lock); 6566 rcu_read_lock();
6539 6567
6540 p = find_process_by_pid(pid); 6568 p = find_process_by_pid(pid);
6541 if (!p) { 6569 if (!p) {
6542 read_unlock(&tasklist_lock); 6570 rcu_read_unlock();
6543 put_online_cpus(); 6571 put_online_cpus();
6544 return -ESRCH; 6572 return -ESRCH;
6545 } 6573 }
6546 6574
6547 /* 6575 /* Prevent p going away */
6548 * It is not safe to call set_cpus_allowed with the
6549 * tasklist_lock held. We will bump the task_struct's
6550 * usage count and then drop tasklist_lock.
6551 */
6552 get_task_struct(p); 6576 get_task_struct(p);
6553 read_unlock(&tasklist_lock); 6577 rcu_read_unlock();
6554 6578
6555 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 6579 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6556 retval = -ENOMEM; 6580 retval = -ENOMEM;
@@ -6631,10 +6655,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6631long sched_getaffinity(pid_t pid, struct cpumask *mask) 6655long sched_getaffinity(pid_t pid, struct cpumask *mask)
6632{ 6656{
6633 struct task_struct *p; 6657 struct task_struct *p;
6658 unsigned long flags;
6659 struct rq *rq;
6634 int retval; 6660 int retval;
6635 6661
6636 get_online_cpus(); 6662 get_online_cpus();
6637 read_lock(&tasklist_lock); 6663 rcu_read_lock();
6638 6664
6639 retval = -ESRCH; 6665 retval = -ESRCH;
6640 p = find_process_by_pid(pid); 6666 p = find_process_by_pid(pid);
@@ -6645,10 +6671,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6645 if (retval) 6671 if (retval)
6646 goto out_unlock; 6672 goto out_unlock;
6647 6673
6674 rq = task_rq_lock(p, &flags);
6648 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6675 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6676 task_rq_unlock(rq, &flags);
6649 6677
6650out_unlock: 6678out_unlock:
6651 read_unlock(&tasklist_lock); 6679 rcu_read_unlock();
6652 put_online_cpus(); 6680 put_online_cpus();
6653 6681
6654 return retval; 6682 return retval;
@@ -6703,7 +6731,7 @@ SYSCALL_DEFINE0(sched_yield)
6703 */ 6731 */
6704 __release(rq->lock); 6732 __release(rq->lock);
6705 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 6733 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6706 _raw_spin_unlock(&rq->lock); 6734 do_raw_spin_unlock(&rq->lock);
6707 preempt_enable_no_resched(); 6735 preempt_enable_no_resched();
6708 6736
6709 schedule(); 6737 schedule();
@@ -6883,6 +6911,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6883{ 6911{
6884 struct task_struct *p; 6912 struct task_struct *p;
6885 unsigned int time_slice; 6913 unsigned int time_slice;
6914 unsigned long flags;
6915 struct rq *rq;
6886 int retval; 6916 int retval;
6887 struct timespec t; 6917 struct timespec t;
6888 6918
@@ -6890,7 +6920,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6890 return -EINVAL; 6920 return -EINVAL;
6891 6921
6892 retval = -ESRCH; 6922 retval = -ESRCH;
6893 read_lock(&tasklist_lock); 6923 rcu_read_lock();
6894 p = find_process_by_pid(pid); 6924 p = find_process_by_pid(pid);
6895 if (!p) 6925 if (!p)
6896 goto out_unlock; 6926 goto out_unlock;
@@ -6899,15 +6929,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6899 if (retval) 6929 if (retval)
6900 goto out_unlock; 6930 goto out_unlock;
6901 6931
6902 time_slice = p->sched_class->get_rr_interval(p); 6932 rq = task_rq_lock(p, &flags);
6933 time_slice = p->sched_class->get_rr_interval(rq, p);
6934 task_rq_unlock(rq, &flags);
6903 6935
6904 read_unlock(&tasklist_lock); 6936 rcu_read_unlock();
6905 jiffies_to_timespec(time_slice, &t); 6937 jiffies_to_timespec(time_slice, &t);
6906 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6938 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6907 return retval; 6939 return retval;
6908 6940
6909out_unlock: 6941out_unlock:
6910 read_unlock(&tasklist_lock); 6942 rcu_read_unlock();
6911 return retval; 6943 return retval;
6912} 6944}
6913 6945
@@ -6995,12 +7027,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6995 struct rq *rq = cpu_rq(cpu); 7027 struct rq *rq = cpu_rq(cpu);
6996 unsigned long flags; 7028 unsigned long flags;
6997 7029
6998 spin_lock_irqsave(&rq->lock, flags); 7030 raw_spin_lock_irqsave(&rq->lock, flags);
6999 7031
7000 __sched_fork(idle); 7032 __sched_fork(idle);
7033 idle->state = TASK_RUNNING;
7001 idle->se.exec_start = sched_clock(); 7034 idle->se.exec_start = sched_clock();
7002 7035
7003 idle->prio = idle->normal_prio = MAX_PRIO;
7004 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 7036 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
7005 __set_task_cpu(idle, cpu); 7037 __set_task_cpu(idle, cpu);
7006 7038
@@ -7008,7 +7040,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
7008#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 7040#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
7009 idle->oncpu = 1; 7041 idle->oncpu = 1;
7010#endif 7042#endif
7011 spin_unlock_irqrestore(&rq->lock, flags); 7043 raw_spin_unlock_irqrestore(&rq->lock, flags);
7012 7044
7013 /* Set the preempt count _outside_ the spinlocks! */ 7045 /* Set the preempt count _outside_ the spinlocks! */
7014#if defined(CONFIG_PREEMPT) 7046#if defined(CONFIG_PREEMPT)
@@ -7041,22 +7073,43 @@ cpumask_var_t nohz_cpu_mask;
7041 * 7073 *
7042 * This idea comes from the SD scheduler of Con Kolivas: 7074 * This idea comes from the SD scheduler of Con Kolivas:
7043 */ 7075 */
7044static inline void sched_init_granularity(void) 7076static int get_update_sysctl_factor(void)
7045{ 7077{
7046 unsigned int factor = 1 + ilog2(num_online_cpus()); 7078 unsigned int cpus = min_t(int, num_online_cpus(), 8);
7047 const unsigned long limit = 200000000; 7079 unsigned int factor;
7048 7080
7049 sysctl_sched_min_granularity *= factor; 7081 switch (sysctl_sched_tunable_scaling) {
7050 if (sysctl_sched_min_granularity > limit) 7082 case SCHED_TUNABLESCALING_NONE:
7051 sysctl_sched_min_granularity = limit; 7083 factor = 1;
7084 break;
7085 case SCHED_TUNABLESCALING_LINEAR:
7086 factor = cpus;
7087 break;
7088 case SCHED_TUNABLESCALING_LOG:
7089 default:
7090 factor = 1 + ilog2(cpus);
7091 break;
7092 }
7052 7093
7053 sysctl_sched_latency *= factor; 7094 return factor;
7054 if (sysctl_sched_latency > limit) 7095}
7055 sysctl_sched_latency = limit;
7056 7096
7057 sysctl_sched_wakeup_granularity *= factor; 7097static void update_sysctl(void)
7098{
7099 unsigned int factor = get_update_sysctl_factor();
7058 7100
7059 sysctl_sched_shares_ratelimit *= factor; 7101#define SET_SYSCTL(name) \
7102 (sysctl_##name = (factor) * normalized_sysctl_##name)
7103 SET_SYSCTL(sched_min_granularity);
7104 SET_SYSCTL(sched_latency);
7105 SET_SYSCTL(sched_wakeup_granularity);
7106 SET_SYSCTL(sched_shares_ratelimit);
7107#undef SET_SYSCTL
7108}
7109
7110static inline void sched_init_granularity(void)
7111{
7112 update_sysctl();
7060} 7113}
7061 7114
7062#ifdef CONFIG_SMP 7115#ifdef CONFIG_SMP
@@ -7092,8 +7145,28 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7092 struct rq *rq; 7145 struct rq *rq;
7093 int ret = 0; 7146 int ret = 0;
7094 7147
7148 /*
7149 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7150 * the ->cpus_allowed mask from under waking tasks, which would be
7151 * possible when we change rq->lock in ttwu(), so synchronize against
7152 * TASK_WAKING to avoid that.
7153 *
7154 * Make an exception for freshly cloned tasks, since cpuset namespaces
7155 * might move the task about, we have to validate the target in
7156 * wake_up_new_task() anyway since the cpu might have gone away.
7157 */
7158again:
7159 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7160 cpu_relax();
7161
7095 rq = task_rq_lock(p, &flags); 7162 rq = task_rq_lock(p, &flags);
7096 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7163
7164 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7165 task_rq_unlock(rq, &flags);
7166 goto again;
7167 }
7168
7169 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7097 ret = -EINVAL; 7170 ret = -EINVAL;
7098 goto out; 7171 goto out;
7099 } 7172 }
@@ -7115,7 +7188,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7115 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7188 if (cpumask_test_cpu(task_cpu(p), new_mask))
7116 goto out; 7189 goto out;
7117 7190
7118 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7191 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7119 /* Need help from migration thread: drop lock and wait. */ 7192 /* Need help from migration thread: drop lock and wait. */
7120 struct task_struct *mt = rq->migration_thread; 7193 struct task_struct *mt = rq->migration_thread;
7121 7194
@@ -7148,7 +7221,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7148static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 7221static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7149{ 7222{
7150 struct rq *rq_dest, *rq_src; 7223 struct rq *rq_dest, *rq_src;
7151 int ret = 0, on_rq; 7224 int ret = 0;
7152 7225
7153 if (unlikely(!cpu_active(dest_cpu))) 7226 if (unlikely(!cpu_active(dest_cpu)))
7154 return ret; 7227 return ret;
@@ -7164,12 +7237,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7164 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7237 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7165 goto fail; 7238 goto fail;
7166 7239
7167 on_rq = p->se.on_rq; 7240 /*
7168 if (on_rq) 7241 * If we're not on a rq, the next wake-up will ensure we're
7242 * placed properly.
7243 */
7244 if (p->se.on_rq) {
7169 deactivate_task(rq_src, p, 0); 7245 deactivate_task(rq_src, p, 0);
7170 7246 set_task_cpu(p, dest_cpu);
7171 set_task_cpu(p, dest_cpu);
7172 if (on_rq) {
7173 activate_task(rq_dest, p, 0); 7247 activate_task(rq_dest, p, 0);
7174 check_preempt_curr(rq_dest, p, 0); 7248 check_preempt_curr(rq_dest, p, 0);
7175 } 7249 }
@@ -7204,10 +7278,10 @@ static int migration_thread(void *data)
7204 struct migration_req *req; 7278 struct migration_req *req;
7205 struct list_head *head; 7279 struct list_head *head;
7206 7280
7207 spin_lock_irq(&rq->lock); 7281 raw_spin_lock_irq(&rq->lock);
7208 7282
7209 if (cpu_is_offline(cpu)) { 7283 if (cpu_is_offline(cpu)) {
7210 spin_unlock_irq(&rq->lock); 7284 raw_spin_unlock_irq(&rq->lock);
7211 break; 7285 break;
7212 } 7286 }
7213 7287
@@ -7219,7 +7293,7 @@ static int migration_thread(void *data)
7219 head = &rq->migration_queue; 7293 head = &rq->migration_queue;
7220 7294
7221 if (list_empty(head)) { 7295 if (list_empty(head)) {
7222 spin_unlock_irq(&rq->lock); 7296 raw_spin_unlock_irq(&rq->lock);
7223 schedule(); 7297 schedule();
7224 set_current_state(TASK_INTERRUPTIBLE); 7298 set_current_state(TASK_INTERRUPTIBLE);
7225 continue; 7299 continue;
@@ -7228,14 +7302,14 @@ static int migration_thread(void *data)
7228 list_del_init(head->next); 7302 list_del_init(head->next);
7229 7303
7230 if (req->task != NULL) { 7304 if (req->task != NULL) {
7231 spin_unlock(&rq->lock); 7305 raw_spin_unlock(&rq->lock);
7232 __migrate_task(req->task, cpu, req->dest_cpu); 7306 __migrate_task(req->task, cpu, req->dest_cpu);
7233 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 7307 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7234 req->dest_cpu = RCU_MIGRATION_GOT_QS; 7308 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7235 spin_unlock(&rq->lock); 7309 raw_spin_unlock(&rq->lock);
7236 } else { 7310 } else {
7237 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 7311 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7238 spin_unlock(&rq->lock); 7312 raw_spin_unlock(&rq->lock);
7239 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 7313 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7240 } 7314 }
7241 local_irq_enable(); 7315 local_irq_enable();
@@ -7265,37 +7339,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7265static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 7339static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7266{ 7340{
7267 int dest_cpu; 7341 int dest_cpu;
7268 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7269 7342
7270again: 7343again:
7271 /* Look for allowed, online CPU in same node. */ 7344 dest_cpu = select_fallback_rq(dead_cpu, p);
7272 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
7273 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7274 goto move;
7275
7276 /* Any allowed, online CPU? */
7277 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
7278 if (dest_cpu < nr_cpu_ids)
7279 goto move;
7280
7281 /* No more Mr. Nice Guy. */
7282 if (dest_cpu >= nr_cpu_ids) {
7283 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7284 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
7285
7286 /*
7287 * Don't tell them about moving exiting tasks or
7288 * kernel threads (both mm NULL), since they never
7289 * leave kernel.
7290 */
7291 if (p->mm && printk_ratelimit()) {
7292 printk(KERN_INFO "process %d (%s) no "
7293 "longer affine to cpu%d\n",
7294 task_pid_nr(p), p->comm, dead_cpu);
7295 }
7296 }
7297 7345
7298move:
7299 /* It can have affinity changed while we were choosing. */ 7346 /* It can have affinity changed while we were choosing. */
7300 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 7347 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7301 goto again; 7348 goto again;
@@ -7310,7 +7357,7 @@ move:
7310 */ 7357 */
7311static void migrate_nr_uninterruptible(struct rq *rq_src) 7358static void migrate_nr_uninterruptible(struct rq *rq_src)
7312{ 7359{
7313 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7360 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7314 unsigned long flags; 7361 unsigned long flags;
7315 7362
7316 local_irq_save(flags); 7363 local_irq_save(flags);
@@ -7358,14 +7405,14 @@ void sched_idle_next(void)
7358 * Strictly not necessary since rest of the CPUs are stopped by now 7405 * Strictly not necessary since rest of the CPUs are stopped by now
7359 * and interrupts disabled on the current cpu. 7406 * and interrupts disabled on the current cpu.
7360 */ 7407 */
7361 spin_lock_irqsave(&rq->lock, flags); 7408 raw_spin_lock_irqsave(&rq->lock, flags);
7362 7409
7363 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7410 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7364 7411
7365 update_rq_clock(rq); 7412 update_rq_clock(rq);
7366 activate_task(rq, p, 0); 7413 activate_task(rq, p, 0);
7367 7414
7368 spin_unlock_irqrestore(&rq->lock, flags); 7415 raw_spin_unlock_irqrestore(&rq->lock, flags);
7369} 7416}
7370 7417
7371/* 7418/*
@@ -7401,9 +7448,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7401 * that's OK. No task can be added to this CPU, so iteration is 7448 * that's OK. No task can be added to this CPU, so iteration is
7402 * fine. 7449 * fine.
7403 */ 7450 */
7404 spin_unlock_irq(&rq->lock); 7451 raw_spin_unlock_irq(&rq->lock);
7405 move_task_off_dead_cpu(dead_cpu, p); 7452 move_task_off_dead_cpu(dead_cpu, p);
7406 spin_lock_irq(&rq->lock); 7453 raw_spin_lock_irq(&rq->lock);
7407 7454
7408 put_task_struct(p); 7455 put_task_struct(p);
7409} 7456}
@@ -7563,7 +7610,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7563static struct ctl_table_header *sd_sysctl_header; 7610static struct ctl_table_header *sd_sysctl_header;
7564static void register_sched_domain_sysctl(void) 7611static void register_sched_domain_sysctl(void)
7565{ 7612{
7566 int i, cpu_num = num_online_cpus(); 7613 int i, cpu_num = num_possible_cpus();
7567 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7614 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7568 char buf[32]; 7615 char buf[32];
7569 7616
@@ -7573,7 +7620,7 @@ static void register_sched_domain_sysctl(void)
7573 if (entry == NULL) 7620 if (entry == NULL)
7574 return; 7621 return;
7575 7622
7576 for_each_online_cpu(i) { 7623 for_each_possible_cpu(i) {
7577 snprintf(buf, 32, "cpu%d", i); 7624 snprintf(buf, 32, "cpu%d", i);
7578 entry->procname = kstrdup(buf, GFP_KERNEL); 7625 entry->procname = kstrdup(buf, GFP_KERNEL);
7579 entry->mode = 0555; 7626 entry->mode = 0555;
@@ -7669,13 +7716,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7669 7716
7670 /* Update our root-domain */ 7717 /* Update our root-domain */
7671 rq = cpu_rq(cpu); 7718 rq = cpu_rq(cpu);
7672 spin_lock_irqsave(&rq->lock, flags); 7719 raw_spin_lock_irqsave(&rq->lock, flags);
7673 if (rq->rd) { 7720 if (rq->rd) {
7674 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7721 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7675 7722
7676 set_rq_online(rq); 7723 set_rq_online(rq);
7677 } 7724 }
7678 spin_unlock_irqrestore(&rq->lock, flags); 7725 raw_spin_unlock_irqrestore(&rq->lock, flags);
7679 break; 7726 break;
7680 7727
7681#ifdef CONFIG_HOTPLUG_CPU 7728#ifdef CONFIG_HOTPLUG_CPU
@@ -7700,14 +7747,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7700 put_task_struct(rq->migration_thread); 7747 put_task_struct(rq->migration_thread);
7701 rq->migration_thread = NULL; 7748 rq->migration_thread = NULL;
7702 /* Idle task back to normal (off runqueue, low prio) */ 7749 /* Idle task back to normal (off runqueue, low prio) */
7703 spin_lock_irq(&rq->lock); 7750 raw_spin_lock_irq(&rq->lock);
7704 update_rq_clock(rq); 7751 update_rq_clock(rq);
7705 deactivate_task(rq, rq->idle, 0); 7752 deactivate_task(rq, rq->idle, 0);
7706 rq->idle->static_prio = MAX_PRIO;
7707 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7753 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7708 rq->idle->sched_class = &idle_sched_class; 7754 rq->idle->sched_class = &idle_sched_class;
7709 migrate_dead_tasks(cpu); 7755 migrate_dead_tasks(cpu);
7710 spin_unlock_irq(&rq->lock); 7756 raw_spin_unlock_irq(&rq->lock);
7711 cpuset_unlock(); 7757 cpuset_unlock();
7712 migrate_nr_uninterruptible(rq); 7758 migrate_nr_uninterruptible(rq);
7713 BUG_ON(rq->nr_running != 0); 7759 BUG_ON(rq->nr_running != 0);
@@ -7717,30 +7763,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7717 * they didn't take sched_hotcpu_mutex. Just wake up 7763 * they didn't take sched_hotcpu_mutex. Just wake up
7718 * the requestors. 7764 * the requestors.
7719 */ 7765 */
7720 spin_lock_irq(&rq->lock); 7766 raw_spin_lock_irq(&rq->lock);
7721 while (!list_empty(&rq->migration_queue)) { 7767 while (!list_empty(&rq->migration_queue)) {
7722 struct migration_req *req; 7768 struct migration_req *req;
7723 7769
7724 req = list_entry(rq->migration_queue.next, 7770 req = list_entry(rq->migration_queue.next,
7725 struct migration_req, list); 7771 struct migration_req, list);
7726 list_del_init(&req->list); 7772 list_del_init(&req->list);
7727 spin_unlock_irq(&rq->lock); 7773 raw_spin_unlock_irq(&rq->lock);
7728 complete(&req->done); 7774 complete(&req->done);
7729 spin_lock_irq(&rq->lock); 7775 raw_spin_lock_irq(&rq->lock);
7730 } 7776 }
7731 spin_unlock_irq(&rq->lock); 7777 raw_spin_unlock_irq(&rq->lock);
7732 break; 7778 break;
7733 7779
7734 case CPU_DYING: 7780 case CPU_DYING:
7735 case CPU_DYING_FROZEN: 7781 case CPU_DYING_FROZEN:
7736 /* Update our root-domain */ 7782 /* Update our root-domain */
7737 rq = cpu_rq(cpu); 7783 rq = cpu_rq(cpu);
7738 spin_lock_irqsave(&rq->lock, flags); 7784 raw_spin_lock_irqsave(&rq->lock, flags);
7739 if (rq->rd) { 7785 if (rq->rd) {
7740 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7786 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7741 set_rq_offline(rq); 7787 set_rq_offline(rq);
7742 } 7788 }
7743 spin_unlock_irqrestore(&rq->lock, flags); 7789 raw_spin_unlock_irqrestore(&rq->lock, flags);
7744 break; 7790 break;
7745#endif 7791#endif
7746 } 7792 }
@@ -7970,7 +8016,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7970 struct root_domain *old_rd = NULL; 8016 struct root_domain *old_rd = NULL;
7971 unsigned long flags; 8017 unsigned long flags;
7972 8018
7973 spin_lock_irqsave(&rq->lock, flags); 8019 raw_spin_lock_irqsave(&rq->lock, flags);
7974 8020
7975 if (rq->rd) { 8021 if (rq->rd) {
7976 old_rd = rq->rd; 8022 old_rd = rq->rd;
@@ -7996,7 +8042,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7996 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 8042 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7997 set_rq_online(rq); 8043 set_rq_online(rq);
7998 8044
7999 spin_unlock_irqrestore(&rq->lock, flags); 8045 raw_spin_unlock_irqrestore(&rq->lock, flags);
8000 8046
8001 if (old_rd) 8047 if (old_rd)
8002 free_rootdomain(old_rd); 8048 free_rootdomain(old_rd);
@@ -8282,14 +8328,14 @@ enum s_alloc {
8282 */ 8328 */
8283#ifdef CONFIG_SCHED_SMT 8329#ifdef CONFIG_SCHED_SMT
8284static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 8330static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8285static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 8331static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
8286 8332
8287static int 8333static int
8288cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 8334cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8289 struct sched_group **sg, struct cpumask *unused) 8335 struct sched_group **sg, struct cpumask *unused)
8290{ 8336{
8291 if (sg) 8337 if (sg)
8292 *sg = &per_cpu(sched_group_cpus, cpu).sg; 8338 *sg = &per_cpu(sched_groups, cpu).sg;
8293 return cpu; 8339 return cpu;
8294} 8340}
8295#endif /* CONFIG_SCHED_SMT */ 8341#endif /* CONFIG_SCHED_SMT */
@@ -9099,7 +9145,7 @@ match1:
9099 if (doms_new == NULL) { 9145 if (doms_new == NULL) {
9100 ndoms_cur = 0; 9146 ndoms_cur = 0;
9101 doms_new = &fallback_doms; 9147 doms_new = &fallback_doms;
9102 cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); 9148 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
9103 WARN_ON_ONCE(dattr_new); 9149 WARN_ON_ONCE(dattr_new);
9104 } 9150 }
9105 9151
@@ -9230,8 +9276,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9230 switch (action) { 9276 switch (action) {
9231 case CPU_ONLINE: 9277 case CPU_ONLINE:
9232 case CPU_ONLINE_FROZEN: 9278 case CPU_ONLINE_FROZEN:
9233 case CPU_DEAD: 9279 case CPU_DOWN_PREPARE:
9234 case CPU_DEAD_FROZEN: 9280 case CPU_DOWN_PREPARE_FROZEN:
9281 case CPU_DOWN_FAILED:
9282 case CPU_DOWN_FAILED_FROZEN:
9235 partition_sched_domains(1, NULL, NULL); 9283 partition_sched_domains(1, NULL, NULL);
9236 return NOTIFY_OK; 9284 return NOTIFY_OK;
9237 9285
@@ -9278,7 +9326,7 @@ void __init sched_init_smp(void)
9278#endif 9326#endif
9279 get_online_cpus(); 9327 get_online_cpus();
9280 mutex_lock(&sched_domains_mutex); 9328 mutex_lock(&sched_domains_mutex);
9281 arch_init_sched_domains(cpu_online_mask); 9329 arch_init_sched_domains(cpu_active_mask);
9282 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9330 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9283 if (cpumask_empty(non_isolated_cpus)) 9331 if (cpumask_empty(non_isolated_cpus))
9284 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9332 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9351,13 +9399,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9351#ifdef CONFIG_SMP 9399#ifdef CONFIG_SMP
9352 rt_rq->rt_nr_migratory = 0; 9400 rt_rq->rt_nr_migratory = 0;
9353 rt_rq->overloaded = 0; 9401 rt_rq->overloaded = 0;
9354 plist_head_init(&rt_rq->pushable_tasks, &rq->lock); 9402 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
9355#endif 9403#endif
9356 9404
9357 rt_rq->rt_time = 0; 9405 rt_rq->rt_time = 0;
9358 rt_rq->rt_throttled = 0; 9406 rt_rq->rt_throttled = 0;
9359 rt_rq->rt_runtime = 0; 9407 rt_rq->rt_runtime = 0;
9360 spin_lock_init(&rt_rq->rt_runtime_lock); 9408 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
9361 9409
9362#ifdef CONFIG_RT_GROUP_SCHED 9410#ifdef CONFIG_RT_GROUP_SCHED
9363 rt_rq->rt_nr_boosted = 0; 9411 rt_rq->rt_nr_boosted = 0;
@@ -9517,7 +9565,7 @@ void __init sched_init(void)
9517 struct rq *rq; 9565 struct rq *rq;
9518 9566
9519 rq = cpu_rq(i); 9567 rq = cpu_rq(i);
9520 spin_lock_init(&rq->lock); 9568 raw_spin_lock_init(&rq->lock);
9521 rq->nr_running = 0; 9569 rq->nr_running = 0;
9522 rq->calc_load_active = 0; 9570 rq->calc_load_active = 0;
9523 rq->calc_load_update = jiffies + LOAD_FREQ; 9571 rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -9577,7 +9625,7 @@ void __init sched_init(void)
9577#elif defined CONFIG_USER_SCHED 9625#elif defined CONFIG_USER_SCHED
9578 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); 9626 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9579 init_tg_rt_entry(&init_task_group, 9627 init_tg_rt_entry(&init_task_group,
9580 &per_cpu(init_rt_rq, i), 9628 &per_cpu(init_rt_rq_var, i),
9581 &per_cpu(init_sched_rt_entity, i), i, 1, 9629 &per_cpu(init_sched_rt_entity, i), i, 1,
9582 root_task_group.rt_se[i]); 9630 root_task_group.rt_se[i]);
9583#endif 9631#endif
@@ -9615,7 +9663,7 @@ void __init sched_init(void)
9615#endif 9663#endif
9616 9664
9617#ifdef CONFIG_RT_MUTEXES 9665#ifdef CONFIG_RT_MUTEXES
9618 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 9666 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
9619#endif 9667#endif
9620 9668
9621 /* 9669 /*
@@ -9659,7 +9707,7 @@ void __init sched_init(void)
9659#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9707#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9660static inline int preempt_count_equals(int preempt_offset) 9708static inline int preempt_count_equals(int preempt_offset)
9661{ 9709{
9662 int nested = preempt_count() & ~PREEMPT_ACTIVE; 9710 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
9663 9711
9664 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 9712 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9665} 9713}
@@ -9740,13 +9788,13 @@ void normalize_rt_tasks(void)
9740 continue; 9788 continue;
9741 } 9789 }
9742 9790
9743 spin_lock(&p->pi_lock); 9791 raw_spin_lock(&p->pi_lock);
9744 rq = __task_rq_lock(p); 9792 rq = __task_rq_lock(p);
9745 9793
9746 normalize_task(rq, p); 9794 normalize_task(rq, p);
9747 9795
9748 __task_rq_unlock(rq); 9796 __task_rq_unlock(rq);
9749 spin_unlock(&p->pi_lock); 9797 raw_spin_unlock(&p->pi_lock);
9750 } while_each_thread(g, p); 9798 } while_each_thread(g, p);
9751 9799
9752 read_unlock_irqrestore(&tasklist_lock, flags); 9800 read_unlock_irqrestore(&tasklist_lock, flags);
@@ -9842,13 +9890,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9842 se = kzalloc_node(sizeof(struct sched_entity), 9890 se = kzalloc_node(sizeof(struct sched_entity),
9843 GFP_KERNEL, cpu_to_node(i)); 9891 GFP_KERNEL, cpu_to_node(i));
9844 if (!se) 9892 if (!se)
9845 goto err; 9893 goto err_free_rq;
9846 9894
9847 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9895 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9848 } 9896 }
9849 9897
9850 return 1; 9898 return 1;
9851 9899
9900 err_free_rq:
9901 kfree(cfs_rq);
9852 err: 9902 err:
9853 return 0; 9903 return 0;
9854} 9904}
@@ -9930,13 +9980,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9930 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9980 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9931 GFP_KERNEL, cpu_to_node(i)); 9981 GFP_KERNEL, cpu_to_node(i));
9932 if (!rt_se) 9982 if (!rt_se)
9933 goto err; 9983 goto err_free_rq;
9934 9984
9935 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9985 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9936 } 9986 }
9937 9987
9938 return 1; 9988 return 1;
9939 9989
9990 err_free_rq:
9991 kfree(rt_rq);
9940 err: 9992 err:
9941 return 0; 9993 return 0;
9942} 9994}
@@ -10070,7 +10122,7 @@ void sched_move_task(struct task_struct *tsk)
10070 10122
10071#ifdef CONFIG_FAIR_GROUP_SCHED 10123#ifdef CONFIG_FAIR_GROUP_SCHED
10072 if (tsk->sched_class->moved_group) 10124 if (tsk->sched_class->moved_group)
10073 tsk->sched_class->moved_group(tsk); 10125 tsk->sched_class->moved_group(tsk, on_rq);
10074#endif 10126#endif
10075 10127
10076 if (unlikely(running)) 10128 if (unlikely(running))
@@ -10105,9 +10157,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
10105 struct rq *rq = cfs_rq->rq; 10157 struct rq *rq = cfs_rq->rq;
10106 unsigned long flags; 10158 unsigned long flags;
10107 10159
10108 spin_lock_irqsave(&rq->lock, flags); 10160 raw_spin_lock_irqsave(&rq->lock, flags);
10109 __set_se_shares(se, shares); 10161 __set_se_shares(se, shares);
10110 spin_unlock_irqrestore(&rq->lock, flags); 10162 raw_spin_unlock_irqrestore(&rq->lock, flags);
10111} 10163}
10112 10164
10113static DEFINE_MUTEX(shares_mutex); 10165static DEFINE_MUTEX(shares_mutex);
@@ -10292,18 +10344,18 @@ static int tg_set_bandwidth(struct task_group *tg,
10292 if (err) 10344 if (err)
10293 goto unlock; 10345 goto unlock;
10294 10346
10295 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10347 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10296 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 10348 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10297 tg->rt_bandwidth.rt_runtime = rt_runtime; 10349 tg->rt_bandwidth.rt_runtime = rt_runtime;
10298 10350
10299 for_each_possible_cpu(i) { 10351 for_each_possible_cpu(i) {
10300 struct rt_rq *rt_rq = tg->rt_rq[i]; 10352 struct rt_rq *rt_rq = tg->rt_rq[i];
10301 10353
10302 spin_lock(&rt_rq->rt_runtime_lock); 10354 raw_spin_lock(&rt_rq->rt_runtime_lock);
10303 rt_rq->rt_runtime = rt_runtime; 10355 rt_rq->rt_runtime = rt_runtime;
10304 spin_unlock(&rt_rq->rt_runtime_lock); 10356 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10305 } 10357 }
10306 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10358 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10307 unlock: 10359 unlock:
10308 read_unlock(&tasklist_lock); 10360 read_unlock(&tasklist_lock);
10309 mutex_unlock(&rt_constraints_mutex); 10361 mutex_unlock(&rt_constraints_mutex);
@@ -10408,15 +10460,15 @@ static int sched_rt_global_constraints(void)
10408 if (sysctl_sched_rt_runtime == 0) 10460 if (sysctl_sched_rt_runtime == 0)
10409 return -EBUSY; 10461 return -EBUSY;
10410 10462
10411 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10463 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10412 for_each_possible_cpu(i) { 10464 for_each_possible_cpu(i) {
10413 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10465 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10414 10466
10415 spin_lock(&rt_rq->rt_runtime_lock); 10467 raw_spin_lock(&rt_rq->rt_runtime_lock);
10416 rt_rq->rt_runtime = global_rt_runtime(); 10468 rt_rq->rt_runtime = global_rt_runtime();
10417 spin_unlock(&rt_rq->rt_runtime_lock); 10469 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10418 } 10470 }
10419 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 10471 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10420 10472
10421 return 0; 10473 return 0;
10422} 10474}
@@ -10707,9 +10759,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10707 /* 10759 /*
10708 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 10760 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10709 */ 10761 */
10710 spin_lock_irq(&cpu_rq(cpu)->lock); 10762 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10711 data = *cpuusage; 10763 data = *cpuusage;
10712 spin_unlock_irq(&cpu_rq(cpu)->lock); 10764 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10713#else 10765#else
10714 data = *cpuusage; 10766 data = *cpuusage;
10715#endif 10767#endif
@@ -10725,9 +10777,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10725 /* 10777 /*
10726 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 10778 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10727 */ 10779 */
10728 spin_lock_irq(&cpu_rq(cpu)->lock); 10780 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10729 *cpuusage = val; 10781 *cpuusage = val;
10730 spin_unlock_irq(&cpu_rq(cpu)->lock); 10782 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10731#else 10783#else
10732 *cpuusage = val; 10784 *cpuusage = val;
10733#endif 10785#endif
@@ -10961,9 +11013,9 @@ void synchronize_sched_expedited(void)
10961 init_completion(&req->done); 11013 init_completion(&req->done);
10962 req->task = NULL; 11014 req->task = NULL;
10963 req->dest_cpu = RCU_MIGRATION_NEED_QS; 11015 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10964 spin_lock_irqsave(&rq->lock, flags); 11016 raw_spin_lock_irqsave(&rq->lock, flags);
10965 list_add(&req->list, &rq->migration_queue); 11017 list_add(&req->list, &rq->migration_queue);
10966 spin_unlock_irqrestore(&rq->lock, flags); 11018 raw_spin_unlock_irqrestore(&rq->lock, flags);
10967 wake_up_process(rq->migration_thread); 11019 wake_up_process(rq->migration_thread);
10968 } 11020 }
10969 for_each_online_cpu(cpu) { 11021 for_each_online_cpu(cpu) {
@@ -10971,11 +11023,11 @@ void synchronize_sched_expedited(void)
10971 req = &per_cpu(rcu_migration_req, cpu); 11023 req = &per_cpu(rcu_migration_req, cpu);
10972 rq = cpu_rq(cpu); 11024 rq = cpu_rq(cpu);
10973 wait_for_completion(&req->done); 11025 wait_for_completion(&req->done);
10974 spin_lock_irqsave(&rq->lock, flags); 11026 raw_spin_lock_irqsave(&rq->lock, flags);
10975 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 11027 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10976 need_full_sync = 1; 11028 need_full_sync = 1;
10977 req->dest_cpu = RCU_MIGRATION_IDLE; 11029 req->dest_cpu = RCU_MIGRATION_IDLE;
10978 spin_unlock_irqrestore(&rq->lock, flags); 11030 raw_spin_unlock_irqrestore(&rq->lock, flags);
10979 } 11031 }
10980 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 11032 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10981 synchronize_sched_expedited_count++; 11033 synchronize_sched_expedited_count++;
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 479ce5682d7..5b496132c28 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
236} 236}
237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
238 238
239unsigned long long cpu_clock(int cpu)
240{
241 unsigned long long clock;
242 unsigned long flags;
243
244 local_irq_save(flags);
245 clock = sched_clock_cpu(cpu);
246 local_irq_restore(flags);
247
248 return clock;
249}
250
239#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 251#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
240 252
241void sched_clock_init(void) 253void sched_clock_init(void)
@@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu)
251 return sched_clock(); 263 return sched_clock();
252} 264}
253 265
254#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
255 266
256unsigned long long cpu_clock(int cpu) 267unsigned long long cpu_clock(int cpu)
257{ 268{
258 unsigned long long clock; 269 return sched_clock_cpu(cpu);
259 unsigned long flags; 270}
260 271
261 local_irq_save(flags); 272#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
262 clock = sched_clock_cpu(cpu);
263 local_irq_restore(flags);
264 273
265 return clock;
266}
267EXPORT_SYMBOL_GPL(cpu_clock); 274EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 0f052fc674d..597b33099df 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -135,26 +135,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
135 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
137 137
138 spin_lock_irqsave(&vec->lock, flags); 138 raw_spin_lock_irqsave(&vec->lock, flags);
139 139
140 cpumask_set_cpu(cpu, vec->mask); 140 cpumask_set_cpu(cpu, vec->mask);
141 vec->count++; 141 vec->count++;
142 if (vec->count == 1) 142 if (vec->count == 1)
143 set_bit(newpri, cp->pri_active); 143 set_bit(newpri, cp->pri_active);
144 144
145 spin_unlock_irqrestore(&vec->lock, flags); 145 raw_spin_unlock_irqrestore(&vec->lock, flags);
146 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) { 147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149 149
150 spin_lock_irqsave(&vec->lock, flags); 150 raw_spin_lock_irqsave(&vec->lock, flags);
151 151
152 vec->count--; 152 vec->count--;
153 if (!vec->count) 153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active); 154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask); 155 cpumask_clear_cpu(cpu, vec->mask);
156 156
157 spin_unlock_irqrestore(&vec->lock, flags); 157 raw_spin_unlock_irqrestore(&vec->lock, flags);
158 } 158 }
159 159
160 *currpri = newpri; 160 *currpri = newpri;
@@ -180,7 +180,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
181 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 181 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
182 182
183 spin_lock_init(&vec->lock); 183 raw_spin_lock_init(&vec->lock);
184 vec->count = 0; 184 vec->count = 0;
185 if (!zalloc_cpumask_var(&vec->mask, gfp)) 185 if (!zalloc_cpumask_var(&vec->mask, gfp))
186 goto cleanup; 186 goto cleanup;
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9a7e859b8fb..7cb5bb6b95b 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -12,7 +12,7 @@
12/* values 2-101 are RT priorities 0-99 */ 12/* values 2-101 are RT priorities 0-99 */
13 13
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 raw_spinlock_t lock;
16 int count; 16 int count;
17 cpumask_var_t mask; 17 cpumask_var_t mask;
18}; 18};
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6988cf08f70..67f95aada4b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -184,7 +184,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
185 SPLIT_NS(cfs_rq->exec_clock)); 185 SPLIT_NS(cfs_rq->exec_clock));
186 186
187 spin_lock_irqsave(&rq->lock, flags); 187 raw_spin_lock_irqsave(&rq->lock, flags);
188 if (cfs_rq->rb_leftmost) 188 if (cfs_rq->rb_leftmost)
189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
190 last = __pick_last_entity(cfs_rq); 190 last = __pick_last_entity(cfs_rq);
@@ -192,7 +192,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 raw_spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", 198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
309 print_rq(m, rq, cpu); 309 print_rq(m, rq, cpu);
310} 310}
311 311
312static const char *sched_tunable_scaling_names[] = {
313 "none",
314 "logaritmic",
315 "linear"
316};
317
312static int sched_debug_show(struct seq_file *m, void *v) 318static int sched_debug_show(struct seq_file *m, void *v)
313{ 319{
314 u64 now = ktime_to_ns(ktime_get()); 320 u64 now = ktime_to_ns(ktime_get());
@@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
334#undef PN 340#undef PN
335#undef P 341#undef P
336 342
343 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
344 sysctl_sched_tunable_scaling,
345 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
346
337 for_each_online_cpu(cpu) 347 for_each_online_cpu(cpu)
338 print_cpu(m, cpu); 348 print_cpu(m, cpu);
339 349
@@ -399,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
399 PN(se.sum_exec_runtime); 409 PN(se.sum_exec_runtime);
400 PN(se.avg_overlap); 410 PN(se.avg_overlap);
401 PN(se.avg_wakeup); 411 PN(se.avg_wakeup);
402 PN(se.avg_running);
403 412
404 nr_switches = p->nvcsw + p->nivcsw; 413 nr_switches = p->nvcsw + p->nivcsw;
405 414
@@ -423,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
423 P(se.nr_failed_migrations_running); 432 P(se.nr_failed_migrations_running);
424 P(se.nr_failed_migrations_hot); 433 P(se.nr_failed_migrations_hot);
425 P(se.nr_forced_migrations); 434 P(se.nr_forced_migrations);
426 P(se.nr_forced2_migrations);
427 P(se.nr_wakeups); 435 P(se.nr_wakeups);
428 P(se.nr_wakeups_sync); 436 P(se.nr_wakeups_sync);
429 P(se.nr_wakeups_migrate); 437 P(se.nr_wakeups_migrate);
@@ -499,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
499 p->se.nr_failed_migrations_running = 0; 507 p->se.nr_failed_migrations_running = 0;
500 p->se.nr_failed_migrations_hot = 0; 508 p->se.nr_failed_migrations_hot = 0;
501 p->se.nr_forced_migrations = 0; 509 p->se.nr_forced_migrations = 0;
502 p->se.nr_forced2_migrations = 0;
503 p->se.nr_wakeups = 0; 510 p->se.nr_wakeups = 0;
504 p->se.nr_wakeups_sync = 0; 511 p->se.nr_wakeups_sync = 0;
505 p->se.nr_wakeups_migrate = 0; 512 p->se.nr_wakeups_migrate = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f61837ad336..8fe7ee81c55 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h>
24 25
25/* 26/*
26 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
36 */ 37 */
37unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 5000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL;
40
41/*
42 * The initial- and re-scaling of tunables is configurable
43 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
44 *
45 * Options are:
46 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
48 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
49 */
50enum sched_tunable_scaling sysctl_sched_tunable_scaling
51 = SCHED_TUNABLESCALING_LOG;
38 52
39/* 53/*
40 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 56 */
43unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 1000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44 59
45/* 60/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70 * have immediate wakeup/sleep latencies. 85 * have immediate wakeup/sleep latencies.
71 */ 86 */
72unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73 89
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 91
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
383 */ 399 */
384 400
385#ifdef CONFIG_SCHED_DEBUG 401#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 402int sched_proc_update_handler(struct ctl_table *table, int write,
387 void __user *buffer, size_t *lenp, 403 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 404 loff_t *ppos)
389{ 405{
390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 406 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
407 int factor = get_update_sysctl_factor();
391 408
392 if (ret || !write) 409 if (ret || !write)
393 return ret; 410 return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, 412 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396 sysctl_sched_min_granularity); 413 sysctl_sched_min_granularity);
397 414
415#define WRT_SYSCTL(name) \
416 (normalized_sysctl_##name = sysctl_##name / (factor))
417 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL
422
398 return 0; 423 return 0;
399} 424}
400#endif 425#endif
@@ -485,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
485 curr->sum_exec_runtime += delta_exec; 510 curr->sum_exec_runtime += delta_exec;
486 schedstat_add(cfs_rq, exec_clock, delta_exec); 511 schedstat_add(cfs_rq, exec_clock, delta_exec);
487 delta_exec_weighted = calc_delta_fair(delta_exec, curr); 512 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
513
488 curr->vruntime += delta_exec_weighted; 514 curr->vruntime += delta_exec_weighted;
489 update_min_vruntime(cfs_rq); 515 update_min_vruntime(cfs_rq);
490} 516}
@@ -740,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
740 se->vruntime = vruntime; 766 se->vruntime = vruntime;
741} 767}
742 768
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
743static void 772static void
744enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) 773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
745{ 774{
746 /* 775 /*
776 * Update the normalized vruntime before updating min_vruntime
777 * through callig update_curr().
778 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
780 se->vruntime += cfs_rq->min_vruntime;
781
782 /*
747 * Update run-time statistics of the 'current'. 783 * Update run-time statistics of the 'current'.
748 */ 784 */
749 update_curr(cfs_rq); 785 update_curr(cfs_rq);
750 account_entity_enqueue(cfs_rq, se); 786 account_entity_enqueue(cfs_rq, se);
751 787
752 if (wakeup) { 788 if (flags & ENQUEUE_WAKEUP) {
753 place_entity(cfs_rq, se, 0); 789 place_entity(cfs_rq, se, 0);
754 enqueue_sleeper(cfs_rq, se); 790 enqueue_sleeper(cfs_rq, se);
755 } 791 }
@@ -803,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
803 __dequeue_entity(cfs_rq, se); 839 __dequeue_entity(cfs_rq, se);
804 account_entity_dequeue(cfs_rq, se); 840 account_entity_dequeue(cfs_rq, se);
805 update_min_vruntime(cfs_rq); 841 update_min_vruntime(cfs_rq);
842
843 /*
844 * Normalize the entity after updating the min_vruntime because the
845 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position.
847 */
848 if (!sleep)
849 se->vruntime -= cfs_rq->min_vruntime;
806} 850}
807 851
808/* 852/*
@@ -1013,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
1013{ 1057{
1014 struct cfs_rq *cfs_rq; 1058 struct cfs_rq *cfs_rq;
1015 struct sched_entity *se = &p->se; 1059 struct sched_entity *se = &p->se;
1060 int flags = 0;
1061
1062 if (wakeup)
1063 flags |= ENQUEUE_WAKEUP;
1064 if (p->state == TASK_WAKING)
1065 flags |= ENQUEUE_MIGRATE;
1016 1066
1017 for_each_sched_entity(se) { 1067 for_each_sched_entity(se) {
1018 if (se->on_rq) 1068 if (se->on_rq)
1019 break; 1069 break;
1020 cfs_rq = cfs_rq_of(se); 1070 cfs_rq = cfs_rq_of(se);
1021 enqueue_entity(cfs_rq, se, wakeup); 1071 enqueue_entity(cfs_rq, se, flags);
1022 wakeup = 1; 1072 flags = ENQUEUE_WAKEUP;
1023 } 1073 }
1024 1074
1025 hrtick_update(rq); 1075 hrtick_update(rq);
@@ -1095,6 +1145,14 @@ static void yield_task_fair(struct rq *rq)
1095 1145
1096#ifdef CONFIG_SMP 1146#ifdef CONFIG_SMP
1097 1147
1148static void task_waking_fair(struct rq *rq, struct task_struct *p)
1149{
1150 struct sched_entity *se = &p->se;
1151 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1152
1153 se->vruntime -= cfs_rq->min_vruntime;
1154}
1155
1098#ifdef CONFIG_FAIR_GROUP_SCHED 1156#ifdef CONFIG_FAIR_GROUP_SCHED
1099/* 1157/*
1100 * effective_load() calculates the load change as seen from the root_task_group 1158 * effective_load() calculates the load change as seen from the root_task_group
@@ -1403,8 +1461,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1403 new_cpu = prev_cpu; 1461 new_cpu = prev_cpu;
1404 } 1462 }
1405 1463
1406 rcu_read_lock();
1407 for_each_domain(cpu, tmp) { 1464 for_each_domain(cpu, tmp) {
1465 if (!(tmp->flags & SD_LOAD_BALANCE))
1466 continue;
1467
1408 /* 1468 /*
1409 * If power savings logic is enabled for a domain, see if we 1469 * If power savings logic is enabled for a domain, see if we
1410 * are not overloaded, if so, don't balance wider. 1470 * are not overloaded, if so, don't balance wider.
@@ -1448,7 +1508,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1448 * If there's an idle sibling in this domain, make that 1508 * If there's an idle sibling in this domain, make that
1449 * the wake_affine target instead of the current cpu. 1509 * the wake_affine target instead of the current cpu.
1450 */ 1510 */
1451 if (tmp->flags & SD_PREFER_SIBLING) 1511 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1452 target = select_idle_sibling(p, tmp, target); 1512 target = select_idle_sibling(p, tmp, target);
1453 1513
1454 if (target >= 0) { 1514 if (target >= 0) {
@@ -1484,10 +1544,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1484 update_shares(tmp); 1544 update_shares(tmp);
1485 } 1545 }
1486 1546
1487 if (affine_sd && wake_affine(affine_sd, p, sync)) { 1547 if (affine_sd && wake_affine(affine_sd, p, sync))
1488 new_cpu = cpu; 1548 return cpu;
1489 goto out;
1490 }
1491 1549
1492 while (sd) { 1550 while (sd) {
1493 int load_idx = sd->forkexec_idx; 1551 int load_idx = sd->forkexec_idx;
@@ -1528,8 +1586,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1528 /* while loop will break here if sd == NULL */ 1586 /* while loop will break here if sd == NULL */
1529 } 1587 }
1530 1588
1531out:
1532 rcu_read_unlock();
1533 return new_cpu; 1589 return new_cpu;
1534} 1590}
1535#endif /* CONFIG_SMP */ 1591#endif /* CONFIG_SMP */
@@ -1651,12 +1707,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1651 int sync = wake_flags & WF_SYNC; 1707 int sync = wake_flags & WF_SYNC;
1652 int scale = cfs_rq->nr_running >= sched_nr_latency; 1708 int scale = cfs_rq->nr_running >= sched_nr_latency;
1653 1709
1654 update_curr(cfs_rq); 1710 if (unlikely(rt_prio(p->prio)))
1655 1711 goto preempt;
1656 if (unlikely(rt_prio(p->prio))) {
1657 resched_task(curr);
1658 return;
1659 }
1660 1712
1661 if (unlikely(p->sched_class != &fair_sched_class)) 1713 if (unlikely(p->sched_class != &fair_sched_class))
1662 return; 1714 return;
@@ -1682,50 +1734,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1682 return; 1734 return;
1683 1735
1684 /* Idle tasks are by definition preempted by everybody. */ 1736 /* Idle tasks are by definition preempted by everybody. */
1685 if (unlikely(curr->policy == SCHED_IDLE)) { 1737 if (unlikely(curr->policy == SCHED_IDLE))
1686 resched_task(curr); 1738 goto preempt;
1687 return;
1688 }
1689 1739
1690 if ((sched_feat(WAKEUP_SYNC) && sync) || 1740 if (sched_feat(WAKEUP_SYNC) && sync)
1691 (sched_feat(WAKEUP_OVERLAP) && 1741 goto preempt;
1692 (se->avg_overlap < sysctl_sched_migration_cost &&
1693 pse->avg_overlap < sysctl_sched_migration_cost))) {
1694 resched_task(curr);
1695 return;
1696 }
1697 1742
1698 if (sched_feat(WAKEUP_RUNNING)) { 1743 if (sched_feat(WAKEUP_OVERLAP) &&
1699 if (pse->avg_running < se->avg_running) { 1744 se->avg_overlap < sysctl_sched_migration_cost &&
1700 set_next_buddy(pse); 1745 pse->avg_overlap < sysctl_sched_migration_cost)
1701 resched_task(curr); 1746 goto preempt;
1702 return;
1703 }
1704 }
1705 1747
1706 if (!sched_feat(WAKEUP_PREEMPT)) 1748 if (!sched_feat(WAKEUP_PREEMPT))
1707 return; 1749 return;
1708 1750
1751 update_curr(cfs_rq);
1709 find_matching_se(&se, &pse); 1752 find_matching_se(&se, &pse);
1710
1711 BUG_ON(!pse); 1753 BUG_ON(!pse);
1754 if (wakeup_preempt_entity(se, pse) == 1)
1755 goto preempt;
1712 1756
1713 if (wakeup_preempt_entity(se, pse) == 1) { 1757 return;
1714 resched_task(curr); 1758
1715 /* 1759preempt:
1716 * Only set the backward buddy when the current task is still 1760 resched_task(curr);
1717 * on the rq. This can happen when a wakeup gets interleaved 1761 /*
1718 * with schedule on the ->pre_schedule() or idle_balance() 1762 * Only set the backward buddy when the current task is still
1719 * point, either of which can * drop the rq lock. 1763 * on the rq. This can happen when a wakeup gets interleaved
1720 * 1764 * with schedule on the ->pre_schedule() or idle_balance()
1721 * Also, during early boot the idle thread is in the fair class, 1765 * point, either of which can * drop the rq lock.
1722 * for obvious reasons its a bad idea to schedule back to it. 1766 *
1723 */ 1767 * Also, during early boot the idle thread is in the fair class,
1724 if (unlikely(!se->on_rq || curr == rq->idle)) 1768 * for obvious reasons its a bad idea to schedule back to it.
1725 return; 1769 */
1726 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) 1770 if (unlikely(!se->on_rq || curr == rq->idle))
1727 set_last_buddy(se); 1771 return;
1728 } 1772
1773 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1774 set_last_buddy(se);
1729} 1775}
1730 1776
1731static struct task_struct *pick_next_task_fair(struct rq *rq) 1777static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1905,6 +1951,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1905 1951
1906 return 0; 1952 return 0;
1907} 1953}
1954
1955static void rq_online_fair(struct rq *rq)
1956{
1957 update_sysctl();
1958}
1959
1960static void rq_offline_fair(struct rq *rq)
1961{
1962 update_sysctl();
1963}
1964
1908#endif /* CONFIG_SMP */ 1965#endif /* CONFIG_SMP */
1909 1966
1910/* 1967/*
@@ -1922,28 +1979,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1922} 1979}
1923 1980
1924/* 1981/*
1925 * Share the fairness runtime between parent and child, thus the 1982 * called on fork with the child task as argument from the parent's context
1926 * total amount of pressure for CPU stays equal - new tasks 1983 * - child not yet on the tasklist
1927 * get a chance to run but frequent forkers are not allowed to 1984 * - preemption disabled
1928 * monopolize the CPU. Note: the parent runqueue is locked,
1929 * the child is not running yet.
1930 */ 1985 */
1931static void task_new_fair(struct rq *rq, struct task_struct *p) 1986static void task_fork_fair(struct task_struct *p)
1932{ 1987{
1933 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1988 struct cfs_rq *cfs_rq = task_cfs_rq(current);
1934 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 1989 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1935 int this_cpu = smp_processor_id(); 1990 int this_cpu = smp_processor_id();
1991 struct rq *rq = this_rq();
1992 unsigned long flags;
1936 1993
1937 sched_info_queued(p); 1994 raw_spin_lock_irqsave(&rq->lock, flags);
1995
1996 if (unlikely(task_cpu(p) != this_cpu))
1997 __set_task_cpu(p, this_cpu);
1938 1998
1939 update_curr(cfs_rq); 1999 update_curr(cfs_rq);
2000
1940 if (curr) 2001 if (curr)
1941 se->vruntime = curr->vruntime; 2002 se->vruntime = curr->vruntime;
1942 place_entity(cfs_rq, se, 1); 2003 place_entity(cfs_rq, se, 1);
1943 2004
1944 /* 'curr' will be NULL if the child belongs to a different group */ 2005 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1945 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1946 curr && entity_before(curr, se)) {
1947 /* 2006 /*
1948 * Upon rescheduling, sched_class::put_prev_task() will place 2007 * Upon rescheduling, sched_class::put_prev_task() will place
1949 * 'current' within the tree based on its new key value. 2008 * 'current' within the tree based on its new key value.
@@ -1952,7 +2011,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1952 resched_task(rq->curr); 2011 resched_task(rq->curr);
1953 } 2012 }
1954 2013
1955 enqueue_task_fair(rq, p, 0); 2014 se->vruntime -= cfs_rq->min_vruntime;
2015
2016 raw_spin_unlock_irqrestore(&rq->lock, flags);
1956} 2017}
1957 2018
1958/* 2019/*
@@ -2005,30 +2066,27 @@ static void set_curr_task_fair(struct rq *rq)
2005} 2066}
2006 2067
2007#ifdef CONFIG_FAIR_GROUP_SCHED 2068#ifdef CONFIG_FAIR_GROUP_SCHED
2008static void moved_group_fair(struct task_struct *p) 2069static void moved_group_fair(struct task_struct *p, int on_rq)
2009{ 2070{
2010 struct cfs_rq *cfs_rq = task_cfs_rq(p); 2071 struct cfs_rq *cfs_rq = task_cfs_rq(p);
2011 2072
2012 update_curr(cfs_rq); 2073 update_curr(cfs_rq);
2013 place_entity(cfs_rq, &p->se, 1); 2074 if (!on_rq)
2075 place_entity(cfs_rq, &p->se, 1);
2014} 2076}
2015#endif 2077#endif
2016 2078
2017unsigned int get_rr_interval_fair(struct task_struct *task) 2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2018{ 2080{
2019 struct sched_entity *se = &task->se; 2081 struct sched_entity *se = &task->se;
2020 unsigned long flags;
2021 struct rq *rq;
2022 unsigned int rr_interval = 0; 2082 unsigned int rr_interval = 0;
2023 2083
2024 /* 2084 /*
2025 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 2085 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
2026 * idle runqueue: 2086 * idle runqueue:
2027 */ 2087 */
2028 rq = task_rq_lock(task, &flags);
2029 if (rq->cfs.load.weight) 2088 if (rq->cfs.load.weight)
2030 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 2089 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
2031 task_rq_unlock(rq, &flags);
2032 2090
2033 return rr_interval; 2091 return rr_interval;
2034} 2092}
@@ -2052,11 +2110,15 @@ static const struct sched_class fair_sched_class = {
2052 2110
2053 .load_balance = load_balance_fair, 2111 .load_balance = load_balance_fair,
2054 .move_one_task = move_one_task_fair, 2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair,
2115
2116 .task_waking = task_waking_fair,
2055#endif 2117#endif
2056 2118
2057 .set_curr_task = set_curr_task_fair, 2119 .set_curr_task = set_curr_task_fair,
2058 .task_tick = task_tick_fair, 2120 .task_tick = task_tick_fair,
2059 .task_new = task_new_fair, 2121 .task_fork = task_fork_fair,
2060 2122
2061 .prio_changed = prio_changed_fair, 2123 .prio_changed = prio_changed_fair,
2062 .switched_to = switched_to_fair, 2124 .switched_to = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c..d5059fd761d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
54SCHED_FEAT(WAKEUP_OVERLAP, 0) 54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55 55
56/* 56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate 57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and 58 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see 59 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde..5f93b570d38 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -34,10 +34,10 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
36{ 36{
37 spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
39 dump_stack(); 39 dump_stack();
40 spin_lock_irq(&rq->lock); 40 raw_spin_lock_irq(&rq->lock);
41} 41}
42 42
43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
@@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task) 100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 101{
102 return 0; 102 return 0;
103} 103}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5c5fef37841..f48328ac216 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -327,7 +327,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
327 327
328 weight = cpumask_weight(rd->span); 328 weight = cpumask_weight(rd->span);
329 329
330 spin_lock(&rt_b->rt_runtime_lock); 330 raw_spin_lock(&rt_b->rt_runtime_lock);
331 rt_period = ktime_to_ns(rt_b->rt_period); 331 rt_period = ktime_to_ns(rt_b->rt_period);
332 for_each_cpu(i, rd->span) { 332 for_each_cpu(i, rd->span) {
333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
@@ -336,7 +336,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
336 if (iter == rt_rq) 336 if (iter == rt_rq)
337 continue; 337 continue;
338 338
339 spin_lock(&iter->rt_runtime_lock); 339 raw_spin_lock(&iter->rt_runtime_lock);
340 /* 340 /*
341 * Either all rqs have inf runtime and there's nothing to steal 341 * Either all rqs have inf runtime and there's nothing to steal
342 * or __disable_runtime() below sets a specific rq to inf to 342 * or __disable_runtime() below sets a specific rq to inf to
@@ -358,14 +358,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
358 rt_rq->rt_runtime += diff; 358 rt_rq->rt_runtime += diff;
359 more = 1; 359 more = 1;
360 if (rt_rq->rt_runtime == rt_period) { 360 if (rt_rq->rt_runtime == rt_period) {
361 spin_unlock(&iter->rt_runtime_lock); 361 raw_spin_unlock(&iter->rt_runtime_lock);
362 break; 362 break;
363 } 363 }
364 } 364 }
365next: 365next:
366 spin_unlock(&iter->rt_runtime_lock); 366 raw_spin_unlock(&iter->rt_runtime_lock);
367 } 367 }
368 spin_unlock(&rt_b->rt_runtime_lock); 368 raw_spin_unlock(&rt_b->rt_runtime_lock);
369 369
370 return more; 370 return more;
371} 371}
@@ -386,8 +386,8 @@ static void __disable_runtime(struct rq *rq)
386 s64 want; 386 s64 want;
387 int i; 387 int i;
388 388
389 spin_lock(&rt_b->rt_runtime_lock); 389 raw_spin_lock(&rt_b->rt_runtime_lock);
390 spin_lock(&rt_rq->rt_runtime_lock); 390 raw_spin_lock(&rt_rq->rt_runtime_lock);
391 /* 391 /*
392 * Either we're all inf and nobody needs to borrow, or we're 392 * Either we're all inf and nobody needs to borrow, or we're
393 * already disabled and thus have nothing to do, or we have 393 * already disabled and thus have nothing to do, or we have
@@ -396,7 +396,7 @@ static void __disable_runtime(struct rq *rq)
396 if (rt_rq->rt_runtime == RUNTIME_INF || 396 if (rt_rq->rt_runtime == RUNTIME_INF ||
397 rt_rq->rt_runtime == rt_b->rt_runtime) 397 rt_rq->rt_runtime == rt_b->rt_runtime)
398 goto balanced; 398 goto balanced;
399 spin_unlock(&rt_rq->rt_runtime_lock); 399 raw_spin_unlock(&rt_rq->rt_runtime_lock);
400 400
401 /* 401 /*
402 * Calculate the difference between what we started out with 402 * Calculate the difference between what we started out with
@@ -418,7 +418,7 @@ static void __disable_runtime(struct rq *rq)
418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
419 continue; 419 continue;
420 420
421 spin_lock(&iter->rt_runtime_lock); 421 raw_spin_lock(&iter->rt_runtime_lock);
422 if (want > 0) { 422 if (want > 0) {
423 diff = min_t(s64, iter->rt_runtime, want); 423 diff = min_t(s64, iter->rt_runtime, want);
424 iter->rt_runtime -= diff; 424 iter->rt_runtime -= diff;
@@ -427,13 +427,13 @@ static void __disable_runtime(struct rq *rq)
427 iter->rt_runtime -= want; 427 iter->rt_runtime -= want;
428 want -= want; 428 want -= want;
429 } 429 }
430 spin_unlock(&iter->rt_runtime_lock); 430 raw_spin_unlock(&iter->rt_runtime_lock);
431 431
432 if (!want) 432 if (!want)
433 break; 433 break;
434 } 434 }
435 435
436 spin_lock(&rt_rq->rt_runtime_lock); 436 raw_spin_lock(&rt_rq->rt_runtime_lock);
437 /* 437 /*
438 * We cannot be left wanting - that would mean some runtime 438 * We cannot be left wanting - that would mean some runtime
439 * leaked out of the system. 439 * leaked out of the system.
@@ -445,8 +445,8 @@ balanced:
445 * runtime - in which case borrowing doesn't make sense. 445 * runtime - in which case borrowing doesn't make sense.
446 */ 446 */
447 rt_rq->rt_runtime = RUNTIME_INF; 447 rt_rq->rt_runtime = RUNTIME_INF;
448 spin_unlock(&rt_rq->rt_runtime_lock); 448 raw_spin_unlock(&rt_rq->rt_runtime_lock);
449 spin_unlock(&rt_b->rt_runtime_lock); 449 raw_spin_unlock(&rt_b->rt_runtime_lock);
450 } 450 }
451} 451}
452 452
@@ -454,9 +454,9 @@ static void disable_runtime(struct rq *rq)
454{ 454{
455 unsigned long flags; 455 unsigned long flags;
456 456
457 spin_lock_irqsave(&rq->lock, flags); 457 raw_spin_lock_irqsave(&rq->lock, flags);
458 __disable_runtime(rq); 458 __disable_runtime(rq);
459 spin_unlock_irqrestore(&rq->lock, flags); 459 raw_spin_unlock_irqrestore(&rq->lock, flags);
460} 460}
461 461
462static void __enable_runtime(struct rq *rq) 462static void __enable_runtime(struct rq *rq)
@@ -472,13 +472,13 @@ static void __enable_runtime(struct rq *rq)
472 for_each_leaf_rt_rq(rt_rq, rq) { 472 for_each_leaf_rt_rq(rt_rq, rq) {
473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
474 474
475 spin_lock(&rt_b->rt_runtime_lock); 475 raw_spin_lock(&rt_b->rt_runtime_lock);
476 spin_lock(&rt_rq->rt_runtime_lock); 476 raw_spin_lock(&rt_rq->rt_runtime_lock);
477 rt_rq->rt_runtime = rt_b->rt_runtime; 477 rt_rq->rt_runtime = rt_b->rt_runtime;
478 rt_rq->rt_time = 0; 478 rt_rq->rt_time = 0;
479 rt_rq->rt_throttled = 0; 479 rt_rq->rt_throttled = 0;
480 spin_unlock(&rt_rq->rt_runtime_lock); 480 raw_spin_unlock(&rt_rq->rt_runtime_lock);
481 spin_unlock(&rt_b->rt_runtime_lock); 481 raw_spin_unlock(&rt_b->rt_runtime_lock);
482 } 482 }
483} 483}
484 484
@@ -486,9 +486,9 @@ static void enable_runtime(struct rq *rq)
486{ 486{
487 unsigned long flags; 487 unsigned long flags;
488 488
489 spin_lock_irqsave(&rq->lock, flags); 489 raw_spin_lock_irqsave(&rq->lock, flags);
490 __enable_runtime(rq); 490 __enable_runtime(rq);
491 spin_unlock_irqrestore(&rq->lock, flags); 491 raw_spin_unlock_irqrestore(&rq->lock, flags);
492} 492}
493 493
494static int balance_runtime(struct rt_rq *rt_rq) 494static int balance_runtime(struct rt_rq *rt_rq)
@@ -496,9 +496,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
496 int more = 0; 496 int more = 0;
497 497
498 if (rt_rq->rt_time > rt_rq->rt_runtime) { 498 if (rt_rq->rt_time > rt_rq->rt_runtime) {
499 spin_unlock(&rt_rq->rt_runtime_lock); 499 raw_spin_unlock(&rt_rq->rt_runtime_lock);
500 more = do_balance_runtime(rt_rq); 500 more = do_balance_runtime(rt_rq);
501 spin_lock(&rt_rq->rt_runtime_lock); 501 raw_spin_lock(&rt_rq->rt_runtime_lock);
502 } 502 }
503 503
504 return more; 504 return more;
@@ -524,11 +524,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
525 struct rq *rq = rq_of_rt_rq(rt_rq); 525 struct rq *rq = rq_of_rt_rq(rt_rq);
526 526
527 spin_lock(&rq->lock); 527 raw_spin_lock(&rq->lock);
528 if (rt_rq->rt_time) { 528 if (rt_rq->rt_time) {
529 u64 runtime; 529 u64 runtime;
530 530
531 spin_lock(&rt_rq->rt_runtime_lock); 531 raw_spin_lock(&rt_rq->rt_runtime_lock);
532 if (rt_rq->rt_throttled) 532 if (rt_rq->rt_throttled)
533 balance_runtime(rt_rq); 533 balance_runtime(rt_rq);
534 runtime = rt_rq->rt_runtime; 534 runtime = rt_rq->rt_runtime;
@@ -539,13 +539,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
539 } 539 }
540 if (rt_rq->rt_time || rt_rq->rt_nr_running) 540 if (rt_rq->rt_time || rt_rq->rt_nr_running)
541 idle = 0; 541 idle = 0;
542 spin_unlock(&rt_rq->rt_runtime_lock); 542 raw_spin_unlock(&rt_rq->rt_runtime_lock);
543 } else if (rt_rq->rt_nr_running) 543 } else if (rt_rq->rt_nr_running)
544 idle = 0; 544 idle = 0;
545 545
546 if (enqueue) 546 if (enqueue)
547 sched_rt_rq_enqueue(rt_rq); 547 sched_rt_rq_enqueue(rt_rq);
548 spin_unlock(&rq->lock); 548 raw_spin_unlock(&rq->lock);
549 } 549 }
550 550
551 return idle; 551 return idle;
@@ -624,11 +624,11 @@ static void update_curr_rt(struct rq *rq)
624 rt_rq = rt_rq_of_se(rt_se); 624 rt_rq = rt_rq_of_se(rt_se);
625 625
626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
627 spin_lock(&rt_rq->rt_runtime_lock); 627 raw_spin_lock(&rt_rq->rt_runtime_lock);
628 rt_rq->rt_time += delta_exec; 628 rt_rq->rt_time += delta_exec;
629 if (sched_rt_runtime_exceeded(rt_rq)) 629 if (sched_rt_runtime_exceeded(rt_rq))
630 resched_task(curr); 630 resched_task(curr);
631 spin_unlock(&rt_rq->rt_runtime_lock); 631 raw_spin_unlock(&rt_rq->rt_runtime_lock);
632 } 632 }
633 } 633 }
634} 634}
@@ -1246,7 +1246,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1246 task_running(rq, task) || 1246 task_running(rq, task) ||
1247 !task->se.on_rq)) { 1247 !task->se.on_rq)) {
1248 1248
1249 spin_unlock(&lowest_rq->lock); 1249 raw_spin_unlock(&lowest_rq->lock);
1250 lowest_rq = NULL; 1250 lowest_rq = NULL;
1251 break; 1251 break;
1252 } 1252 }
@@ -1472,7 +1472,7 @@ static void post_schedule_rt(struct rq *rq)
1472 * If we are not running and we are not going to reschedule soon, we should 1472 * If we are not running and we are not going to reschedule soon, we should
1473 * try to push tasks away now 1473 * try to push tasks away now
1474 */ 1474 */
1475static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1475static void task_woken_rt(struct rq *rq, struct task_struct *p)
1476{ 1476{
1477 if (!task_running(rq, p) && 1477 if (!task_running(rq, p) &&
1478 !test_tsk_need_resched(rq->curr) && 1478 !test_tsk_need_resched(rq->curr) &&
@@ -1721,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1721 dequeue_pushable_task(rq, p);
1722} 1722}
1723 1723
1724unsigned int get_rr_interval_rt(struct task_struct *task) 1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1725{
1726 /* 1726 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1727 * Time slice is 0 for SCHED_FIFO tasks
@@ -1753,7 +1753,7 @@ static const struct sched_class rt_sched_class = {
1753 .rq_offline = rq_offline_rt, 1753 .rq_offline = rq_offline_rt,
1754 .pre_schedule = pre_schedule_rt, 1754 .pre_schedule = pre_schedule_rt,
1755 .post_schedule = post_schedule_rt, 1755 .post_schedule = post_schedule_rt,
1756 .task_wake_up = task_wake_up_rt, 1756 .task_woken = task_woken_rt,
1757 .switched_from = switched_from_rt, 1757 .switched_from = switched_from_rt,
1758#endif 1758#endif
1759 1759
diff --git a/kernel/signal.c b/kernel/signal.c
index 6b982f2cf52..934ae5e687b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -218,13 +218,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
218 struct user_struct *user; 218 struct user_struct *user;
219 219
220 /* 220 /*
221 * We won't get problems with the target's UID changing under us 221 * Protect access to @t credentials. This can go away when all
222 * because changing it requires RCU be used, and if t != current, the 222 * callers hold rcu read lock.
223 * caller must be holding the RCU readlock (by way of a spinlock) and
224 * we use RCU protection here
225 */ 223 */
224 rcu_read_lock();
226 user = get_uid(__task_cred(t)->user); 225 user = get_uid(__task_cred(t)->user);
227 atomic_inc(&user->sigpending); 226 atomic_inc(&user->sigpending);
227 rcu_read_unlock();
228 228
229 if (override_rlimit || 229 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 230 atomic_read(&user->sigpending) <=
@@ -423,7 +423,7 @@ still_pending:
423 */ 423 */
424 info->si_signo = sig; 424 info->si_signo = sig;
425 info->si_errno = 0; 425 info->si_errno = 0;
426 info->si_code = 0; 426 info->si_code = SI_USER;
427 info->si_pid = 0; 427 info->si_pid = 0;
428 info->si_uid = 0; 428 info->si_uid = 0;
429 } 429 }
@@ -607,6 +607,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
607 return 1; 607 return 1;
608} 608}
609 609
610static inline int is_si_special(const struct siginfo *info)
611{
612 return info <= SEND_SIG_FORCED;
613}
614
615static inline bool si_fromuser(const struct siginfo *info)
616{
617 return info == SEND_SIG_NOINFO ||
618 (!is_si_special(info) && SI_FROMUSER(info));
619}
620
610/* 621/*
611 * Bad permissions for sending the signal 622 * Bad permissions for sending the signal
612 * - the caller must hold at least the RCU read lock 623 * - the caller must hold at least the RCU read lock
@@ -621,7 +632,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
621 if (!valid_signal(sig)) 632 if (!valid_signal(sig))
622 return -EINVAL; 633 return -EINVAL;
623 634
624 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) 635 if (!si_fromuser(info))
625 return 0; 636 return 0;
626 637
627 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 638 error = audit_signal_info(sig, t); /* Let audit system see the signal */
@@ -949,9 +960,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
949 int from_ancestor_ns = 0; 960 int from_ancestor_ns = 0;
950 961
951#ifdef CONFIG_PID_NS 962#ifdef CONFIG_PID_NS
952 if (!is_si_special(info) && SI_FROMUSER(info) && 963 from_ancestor_ns = si_fromuser(info) &&
953 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) 964 !task_pid_nr_ns(current, task_active_pid_ns(t));
954 from_ancestor_ns = 1;
955#endif 965#endif
956 966
957 return __send_signal(sig, info, t, group, from_ancestor_ns); 967 return __send_signal(sig, info, t, group, from_ancestor_ns);
@@ -969,7 +979,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
969 for (i = 0; i < 16; i++) { 979 for (i = 0; i < 16; i++) {
970 unsigned char insn; 980 unsigned char insn;
971 981
972 __get_user(insn, (unsigned char *)(regs->ip + i)); 982 if (get_user(insn, (unsigned char *)(regs->ip + i)))
983 break;
973 printk("%02x ", insn); 984 printk("%02x ", insn);
974 } 985 }
975 } 986 }
@@ -1052,12 +1063,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1052 return ret; 1063 return ret;
1053} 1064}
1054 1065
1055void
1056force_sig_specific(int sig, struct task_struct *t)
1057{
1058 force_sig_info(sig, SEND_SIG_FORCED, t);
1059}
1060
1061/* 1066/*
1062 * Nuke all other threads in the group. 1067 * Nuke all other threads in the group.
1063 */ 1068 */
@@ -1175,19 +1180,19 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1175 int ret = -EINVAL; 1180 int ret = -EINVAL;
1176 struct task_struct *p; 1181 struct task_struct *p;
1177 const struct cred *pcred; 1182 const struct cred *pcred;
1183 unsigned long flags;
1178 1184
1179 if (!valid_signal(sig)) 1185 if (!valid_signal(sig))
1180 return ret; 1186 return ret;
1181 1187
1182 read_lock(&tasklist_lock); 1188 rcu_read_lock();
1183 p = pid_task(pid, PIDTYPE_PID); 1189 p = pid_task(pid, PIDTYPE_PID);
1184 if (!p) { 1190 if (!p) {
1185 ret = -ESRCH; 1191 ret = -ESRCH;
1186 goto out_unlock; 1192 goto out_unlock;
1187 } 1193 }
1188 pcred = __task_cred(p); 1194 pcred = __task_cred(p);
1189 if ((info == SEND_SIG_NOINFO || 1195 if (si_fromuser(info) &&
1190 (!is_si_special(info) && SI_FROMUSER(info))) &&
1191 euid != pcred->suid && euid != pcred->uid && 1196 euid != pcred->suid && euid != pcred->uid &&
1192 uid != pcred->suid && uid != pcred->uid) { 1197 uid != pcred->suid && uid != pcred->uid) {
1193 ret = -EPERM; 1198 ret = -EPERM;
@@ -1196,14 +1201,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1196 ret = security_task_kill(p, info, sig, secid); 1201 ret = security_task_kill(p, info, sig, secid);
1197 if (ret) 1202 if (ret)
1198 goto out_unlock; 1203 goto out_unlock;
1199 if (sig && p->sighand) { 1204
1200 unsigned long flags; 1205 if (sig) {
1201 spin_lock_irqsave(&p->sighand->siglock, flags); 1206 if (lock_task_sighand(p, &flags)) {
1202 ret = __send_signal(sig, info, p, 1, 0); 1207 ret = __send_signal(sig, info, p, 1, 0);
1203 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1208 unlock_task_sighand(p, &flags);
1209 } else
1210 ret = -ESRCH;
1204 } 1211 }
1205out_unlock: 1212out_unlock:
1206 read_unlock(&tasklist_lock); 1213 rcu_read_unlock();
1207 return ret; 1214 return ret;
1208} 1215}
1209EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1216EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
@@ -1837,11 +1844,6 @@ relock:
1837 1844
1838 for (;;) { 1845 for (;;) {
1839 struct k_sigaction *ka; 1846 struct k_sigaction *ka;
1840
1841 if (unlikely(signal->group_stop_count > 0) &&
1842 do_signal_stop(0))
1843 goto relock;
1844
1845 /* 1847 /*
1846 * Tracing can induce an artifical signal and choose sigaction. 1848 * Tracing can induce an artifical signal and choose sigaction.
1847 * The return value in @signr determines the default action, 1849 * The return value in @signr determines the default action,
@@ -1853,6 +1855,10 @@ relock:
1853 if (unlikely(signr != 0)) 1855 if (unlikely(signr != 0))
1854 ka = return_ka; 1856 ka = return_ka;
1855 else { 1857 else {
1858 if (unlikely(signal->group_stop_count > 0) &&
1859 do_signal_stop(0))
1860 goto relock;
1861
1856 signr = dequeue_signal(current, &current->blocked, 1862 signr = dequeue_signal(current, &current->blocked,
1857 info); 1863 info);
1858 1864
diff --git a/kernel/smp.c b/kernel/smp.c
index a8c76069cf5..f1040842244 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,11 +16,11 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16 16
17static struct { 17static struct {
18 struct list_head queue; 18 struct list_head queue;
19 spinlock_t lock; 19 raw_spinlock_t lock;
20} call_function __cacheline_aligned_in_smp = 20} call_function __cacheline_aligned_in_smp =
21 { 21 {
22 .queue = LIST_HEAD_INIT(call_function.queue), 22 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), 23 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
24 }; 24 };
25 25
26enum { 26enum {
@@ -35,7 +35,7 @@ struct call_function_data {
35 35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU(struct call_function_data, cfd_data);
@@ -80,7 +80,7 @@ static int __cpuinit init_call_single_data(void)
80 for_each_possible_cpu(i) { 80 for_each_possible_cpu(i) {
81 struct call_single_queue *q = &per_cpu(call_single_queue, i); 81 struct call_single_queue *q = &per_cpu(call_single_queue, i);
82 82
83 spin_lock_init(&q->lock); 83 raw_spin_lock_init(&q->lock);
84 INIT_LIST_HEAD(&q->list); 84 INIT_LIST_HEAD(&q->list);
85 } 85 }
86 86
@@ -141,10 +141,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
141 unsigned long flags; 141 unsigned long flags;
142 int ipi; 142 int ipi;
143 143
144 spin_lock_irqsave(&dst->lock, flags); 144 raw_spin_lock_irqsave(&dst->lock, flags);
145 ipi = list_empty(&dst->list); 145 ipi = list_empty(&dst->list);
146 list_add_tail(&data->list, &dst->list); 146 list_add_tail(&data->list, &dst->list);
147 spin_unlock_irqrestore(&dst->lock, flags); 147 raw_spin_unlock_irqrestore(&dst->lock, flags);
148 148
149 /* 149 /*
150 * The list addition should be visible before sending the IPI 150 * The list addition should be visible before sending the IPI
@@ -171,7 +171,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
171void generic_smp_call_function_interrupt(void) 171void generic_smp_call_function_interrupt(void)
172{ 172{
173 struct call_function_data *data; 173 struct call_function_data *data;
174 int cpu = get_cpu(); 174 int cpu = smp_processor_id();
175 175
176 /* 176 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online. 177 * Shouldn't receive this interrupt on a cpu that is not yet online.
@@ -201,9 +201,9 @@ void generic_smp_call_function_interrupt(void)
201 refs = atomic_dec_return(&data->refs); 201 refs = atomic_dec_return(&data->refs);
202 WARN_ON(refs < 0); 202 WARN_ON(refs < 0);
203 if (!refs) { 203 if (!refs) {
204 spin_lock(&call_function.lock); 204 raw_spin_lock(&call_function.lock);
205 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
206 spin_unlock(&call_function.lock); 206 raw_spin_unlock(&call_function.lock);
207 } 207 }
208 208
209 if (refs) 209 if (refs)
@@ -212,7 +212,6 @@ void generic_smp_call_function_interrupt(void)
212 csd_unlock(&data->csd); 212 csd_unlock(&data->csd);
213 } 213 }
214 214
215 put_cpu();
216} 215}
217 216
218/* 217/*
@@ -230,9 +229,9 @@ void generic_smp_call_function_single_interrupt(void)
230 */ 229 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id())); 230 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232 231
233 spin_lock(&q->lock); 232 raw_spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 233 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 234 raw_spin_unlock(&q->lock);
236 235
237 while (!list_empty(&list)) { 236 while (!list_empty(&list)) {
238 struct call_single_data *data; 237 struct call_single_data *data;
@@ -348,7 +347,7 @@ int smp_call_function_any(const struct cpumask *mask,
348 goto call; 347 goto call;
349 348
350 /* Try for same node. */ 349 /* Try for same node. */
351 nodemask = cpumask_of_node(cpu); 350 nodemask = cpumask_of_node(cpu_to_node(cpu));
352 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; 351 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
353 cpu = cpumask_next_and(cpu, nodemask, mask)) { 352 cpu = cpumask_next_and(cpu, nodemask, mask)) {
354 if (cpu_online(cpu)) 353 if (cpu_online(cpu))
@@ -449,14 +448,14 @@ void smp_call_function_many(const struct cpumask *mask,
449 cpumask_clear_cpu(this_cpu, data->cpumask); 448 cpumask_clear_cpu(this_cpu, data->cpumask);
450 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 449 atomic_set(&data->refs, cpumask_weight(data->cpumask));
451 450
452 spin_lock_irqsave(&call_function.lock, flags); 451 raw_spin_lock_irqsave(&call_function.lock, flags);
453 /* 452 /*
454 * Place entry at the _HEAD_ of the list, so that any cpu still 453 * Place entry at the _HEAD_ of the list, so that any cpu still
455 * observing the entry in generic_smp_call_function_interrupt() 454 * observing the entry in generic_smp_call_function_interrupt()
456 * will not miss any other list entries: 455 * will not miss any other list entries:
457 */ 456 */
458 list_add_rcu(&data->csd.list, &call_function.queue); 457 list_add_rcu(&data->csd.list, &call_function.queue);
459 spin_unlock_irqrestore(&call_function.lock, flags); 458 raw_spin_unlock_irqrestore(&call_function.lock, flags);
460 459
461 /* 460 /*
462 * Make the list addition visible before sending the ipi. 461 * Make the list addition visible before sending the ipi.
@@ -501,20 +500,20 @@ EXPORT_SYMBOL(smp_call_function);
501 500
502void ipi_call_lock(void) 501void ipi_call_lock(void)
503{ 502{
504 spin_lock(&call_function.lock); 503 raw_spin_lock(&call_function.lock);
505} 504}
506 505
507void ipi_call_unlock(void) 506void ipi_call_unlock(void)
508{ 507{
509 spin_unlock(&call_function.lock); 508 raw_spin_unlock(&call_function.lock);
510} 509}
511 510
512void ipi_call_lock_irq(void) 511void ipi_call_lock_irq(void)
513{ 512{
514 spin_lock_irq(&call_function.lock); 513 raw_spin_lock_irq(&call_function.lock);
515} 514}
516 515
517void ipi_call_unlock_irq(void) 516void ipi_call_unlock_irq(void)
518{ 517{
519 spin_unlock_irq(&call_function.lock); 518 raw_spin_unlock_irq(&call_function.lock);
520} 519}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 21939d9e830..7c1a67ef027 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
@@ -697,7 +692,7 @@ void __init softirq_init(void)
697 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 692 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
698} 693}
699 694
700static int ksoftirqd(void * __bind_cpu) 695static int run_ksoftirqd(void * __bind_cpu)
701{ 696{
702 set_current_state(TASK_INTERRUPTIBLE); 697 set_current_state(TASK_INTERRUPTIBLE);
703 698
@@ -810,7 +805,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
810 switch (action) { 805 switch (action) {
811 case CPU_UP_PREPARE: 806 case CPU_UP_PREPARE:
812 case CPU_UP_PREPARE_FROZEN: 807 case CPU_UP_PREPARE_FROZEN:
813 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
814 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
815 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
816 return NOTIFY_BAD; 811 return NOTIFY_BAD;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 81324d12eb3..0d4c7898ab8 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -22,9 +22,10 @@
22 22
23static DEFINE_SPINLOCK(print_lock); 23static DEFINE_SPINLOCK(print_lock);
24 24
25static DEFINE_PER_CPU(unsigned long, touch_timestamp); 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, print_timestamp); 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -70,22 +71,28 @@ static void __touch_softlockup_watchdog(void)
70{ 71{
71 int this_cpu = raw_smp_processor_id(); 72 int this_cpu = raw_smp_processor_id();
72 73
73 __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); 74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
74} 75}
75 76
76void touch_softlockup_watchdog(void) 77void touch_softlockup_watchdog(void)
77{ 78{
78 __raw_get_cpu_var(touch_timestamp) = 0; 79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
85 92
86 /* Cause each CPU to re-update its timestamp rather than complain */ 93 /* Cause each CPU to re-update its timestamp rather than complain */
87 for_each_online_cpu(cpu) 94 for_each_online_cpu(cpu)
88 per_cpu(touch_timestamp, cpu) = 0; 95 per_cpu(softlockup_touch_ts, cpu) = 0;
89} 96}
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 98
@@ -104,28 +111,36 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
104void softlockup_tick(void) 111void softlockup_tick(void)
105{ 112{
106 int this_cpu = smp_processor_id(); 113 int this_cpu = smp_processor_id();
107 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); 114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
108 unsigned long print_timestamp; 115 unsigned long print_ts;
109 struct pt_regs *regs = get_irq_regs(); 116 struct pt_regs *regs = get_irq_regs();
110 unsigned long now; 117 unsigned long now;
111 118
112 /* Is detection switched off? */ 119 /* Is detection switched off? */
113 if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { 120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
114 /* Be sure we don't false trigger if switched back on */ 121 /* Be sure we don't false trigger if switched back on */
115 if (touch_timestamp) 122 if (touch_ts)
116 per_cpu(touch_timestamp, this_cpu) = 0; 123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
117 return; 124 return;
118 } 125 }
119 126
120 if (touch_timestamp == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
124 139
125 print_timestamp = per_cpu(print_timestamp, this_cpu); 140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
126 141
127 /* report at most once a second */ 142 /* report at most once a second */
128 if (print_timestamp == touch_timestamp || did_panic) 143 if (print_ts == touch_ts || did_panic)
129 return; 144 return;
130 145
131 /* do not print during early bootup: */ 146 /* do not print during early bootup: */
@@ -140,18 +155,18 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 156 * threshold timespan.
142 */ 157 */
143 if (now > touch_timestamp + softlockup_thresh/2) 158 if (now > touch_ts + softlockup_thresh/2)
144 wake_up_process(per_cpu(watchdog_task, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 160
146 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
147 if (now <= (touch_timestamp + softlockup_thresh)) 162 if (now <= (touch_ts + softlockup_thresh))
148 return; 163 return;
149 164
150 per_cpu(print_timestamp, this_cpu) = touch_timestamp; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
151 166
152 spin_lock(&print_lock); 167 spin_lock(&print_lock);
153 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
154 this_cpu, now - touch_timestamp, 169 this_cpu, now - touch_ts,
155 current->comm, task_pid_nr(current)); 170 current->comm, task_pid_nr(current));
156 print_modules(); 171 print_modules();
157 print_irqtrace_events(current); 172 print_irqtrace_events(current);
@@ -209,32 +224,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
209 switch (action) { 224 switch (action) {
210 case CPU_UP_PREPARE: 225 case CPU_UP_PREPARE:
211 case CPU_UP_PREPARE_FROZEN: 226 case CPU_UP_PREPARE_FROZEN:
212 BUG_ON(per_cpu(watchdog_task, hotcpu)); 227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
213 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
214 if (IS_ERR(p)) { 229 if (IS_ERR(p)) {
215 printk(KERN_ERR "watchdog for %i failed\n", hotcpu); 230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
216 return NOTIFY_BAD; 231 return NOTIFY_BAD;
217 } 232 }
218 per_cpu(touch_timestamp, hotcpu) = 0; 233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
219 per_cpu(watchdog_task, hotcpu) = p; 234 per_cpu(softlockup_watchdog, hotcpu) = p;
220 kthread_bind(p, hotcpu); 235 kthread_bind(p, hotcpu);
221 break; 236 break;
222 case CPU_ONLINE: 237 case CPU_ONLINE:
223 case CPU_ONLINE_FROZEN: 238 case CPU_ONLINE_FROZEN:
224 wake_up_process(per_cpu(watchdog_task, hotcpu)); 239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
225 break; 240 break;
226#ifdef CONFIG_HOTPLUG_CPU 241#ifdef CONFIG_HOTPLUG_CPU
227 case CPU_UP_CANCELED: 242 case CPU_UP_CANCELED:
228 case CPU_UP_CANCELED_FROZEN: 243 case CPU_UP_CANCELED_FROZEN:
229 if (!per_cpu(watchdog_task, hotcpu)) 244 if (!per_cpu(softlockup_watchdog, hotcpu))
230 break; 245 break;
231 /* Unbind so it can run. Fall thru. */ 246 /* Unbind so it can run. Fall thru. */
232 kthread_bind(per_cpu(watchdog_task, hotcpu), 247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
233 cpumask_any(cpu_online_mask)); 248 cpumask_any(cpu_online_mask));
234 case CPU_DEAD: 249 case CPU_DEAD:
235 case CPU_DEAD_FROZEN: 250 case CPU_DEAD_FROZEN:
236 p = per_cpu(watchdog_task, hotcpu); 251 p = per_cpu(softlockup_watchdog, hotcpu);
237 per_cpu(watchdog_task, hotcpu) = NULL; 252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
238 kthread_stop(p); 253 kthread_stop(p);
239 break; 254 break;
240#endif /* CONFIG_HOTPLUG_CPU */ 255#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 41e042219ff..be6517fb9c1 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -32,6 +32,8 @@
32 * include/linux/spinlock_api_smp.h 32 * include/linux/spinlock_api_smp.h
33 */ 33 */
34#else 34#else
35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l)
35/* 37/*
36 * We build the __lock_function inlines here. They are too large for 38 * We build the __lock_function inlines here. They are too large for
37 * inlining all over the place, but here is only one user per function 39 * inlining all over the place, but here is only one user per function
@@ -42,49 +44,49 @@
42 * towards that other CPU that it should break the lock ASAP. 44 * towards that other CPU that it should break the lock ASAP.
43 */ 45 */
44#define BUILD_LOCK_OPS(op, locktype) \ 46#define BUILD_LOCK_OPS(op, locktype) \
45void __lockfunc __##op##_lock(locktype##_t *lock) \ 47void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
46{ \ 48{ \
47 for (;;) { \ 49 for (;;) { \
48 preempt_disable(); \ 50 preempt_disable(); \
49 if (likely(_raw_##op##_trylock(lock))) \ 51 if (likely(do_raw_##op##_trylock(lock))) \
50 break; \ 52 break; \
51 preempt_enable(); \ 53 preempt_enable(); \
52 \ 54 \
53 if (!(lock)->break_lock) \ 55 if (!(lock)->break_lock) \
54 (lock)->break_lock = 1; \ 56 (lock)->break_lock = 1; \
55 while (!op##_can_lock(lock) && (lock)->break_lock) \ 57 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
56 _raw_##op##_relax(&lock->raw_lock); \ 58 arch_##op##_relax(&lock->raw_lock); \
57 } \ 59 } \
58 (lock)->break_lock = 0; \ 60 (lock)->break_lock = 0; \
59} \ 61} \
60 \ 62 \
61unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ 63unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
62{ \ 64{ \
63 unsigned long flags; \ 65 unsigned long flags; \
64 \ 66 \
65 for (;;) { \ 67 for (;;) { \
66 preempt_disable(); \ 68 preempt_disable(); \
67 local_irq_save(flags); \ 69 local_irq_save(flags); \
68 if (likely(_raw_##op##_trylock(lock))) \ 70 if (likely(do_raw_##op##_trylock(lock))) \
69 break; \ 71 break; \
70 local_irq_restore(flags); \ 72 local_irq_restore(flags); \
71 preempt_enable(); \ 73 preempt_enable(); \
72 \ 74 \
73 if (!(lock)->break_lock) \ 75 if (!(lock)->break_lock) \
74 (lock)->break_lock = 1; \ 76 (lock)->break_lock = 1; \
75 while (!op##_can_lock(lock) && (lock)->break_lock) \ 77 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
76 _raw_##op##_relax(&lock->raw_lock); \ 78 arch_##op##_relax(&lock->raw_lock); \
77 } \ 79 } \
78 (lock)->break_lock = 0; \ 80 (lock)->break_lock = 0; \
79 return flags; \ 81 return flags; \
80} \ 82} \
81 \ 83 \
82void __lockfunc __##op##_lock_irq(locktype##_t *lock) \ 84void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
83{ \ 85{ \
84 _##op##_lock_irqsave(lock); \ 86 _raw_##op##_lock_irqsave(lock); \
85} \ 87} \
86 \ 88 \
87void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ 89void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
88{ \ 90{ \
89 unsigned long flags; \ 91 unsigned long flags; \
90 \ 92 \
@@ -93,7 +95,7 @@ void __lockfunc __##op##_lock_bh(locktype##_t *lock) \
93 /* irq-disabling. We use the generic preemption-aware */ \ 95 /* irq-disabling. We use the generic preemption-aware */ \
94 /* function: */ \ 96 /* function: */ \
95 /**/ \ 97 /**/ \
96 flags = _##op##_lock_irqsave(lock); \ 98 flags = _raw_##op##_lock_irqsave(lock); \
97 local_bh_disable(); \ 99 local_bh_disable(); \
98 local_irq_restore(flags); \ 100 local_irq_restore(flags); \
99} \ 101} \
@@ -107,269 +109,269 @@ void __lockfunc __##op##_lock_bh(locktype##_t *lock) \
107 * __[spin|read|write]_lock_irqsave() 109 * __[spin|read|write]_lock_irqsave()
108 * __[spin|read|write]_lock_bh() 110 * __[spin|read|write]_lock_bh()
109 */ 111 */
110BUILD_LOCK_OPS(spin, spinlock); 112BUILD_LOCK_OPS(spin, raw_spinlock);
111BUILD_LOCK_OPS(read, rwlock); 113BUILD_LOCK_OPS(read, rwlock);
112BUILD_LOCK_OPS(write, rwlock); 114BUILD_LOCK_OPS(write, rwlock);
113 115
114#endif 116#endif
115 117
116#ifdef CONFIG_DEBUG_LOCK_ALLOC 118#ifndef CONFIG_INLINE_SPIN_TRYLOCK
117 119int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
118void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
119{ 120{
120 preempt_disable(); 121 return __raw_spin_trylock(lock);
121 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
122 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
123} 122}
124EXPORT_SYMBOL(_spin_lock_nested); 123EXPORT_SYMBOL(_raw_spin_trylock);
124#endif
125 125
126unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, 126#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
127 int subclass) 127int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
128{ 128{
129 unsigned long flags; 129 return __raw_spin_trylock_bh(lock);
130
131 local_irq_save(flags);
132 preempt_disable();
133 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
134 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
135 _raw_spin_lock_flags, &flags);
136 return flags;
137} 130}
138EXPORT_SYMBOL(_spin_lock_irqsave_nested); 131EXPORT_SYMBOL(_raw_spin_trylock_bh);
132#endif
139 133
140void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, 134#ifndef CONFIG_INLINE_SPIN_LOCK
141 struct lockdep_map *nest_lock) 135void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
142{ 136{
143 preempt_disable(); 137 __raw_spin_lock(lock);
144 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
145 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
146} 138}
147EXPORT_SYMBOL(_spin_lock_nest_lock); 139EXPORT_SYMBOL(_raw_spin_lock);
148
149#endif 140#endif
150 141
151#ifndef CONFIG_INLINE_SPIN_TRYLOCK 142#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
152int __lockfunc _spin_trylock(spinlock_t *lock) 143unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
153{ 144{
154 return __spin_trylock(lock); 145 return __raw_spin_lock_irqsave(lock);
155} 146}
156EXPORT_SYMBOL(_spin_trylock); 147EXPORT_SYMBOL(_raw_spin_lock_irqsave);
157#endif 148#endif
158 149
159#ifndef CONFIG_INLINE_READ_TRYLOCK 150#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
160int __lockfunc _read_trylock(rwlock_t *lock) 151void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
161{ 152{
162 return __read_trylock(lock); 153 __raw_spin_lock_irq(lock);
163} 154}
164EXPORT_SYMBOL(_read_trylock); 155EXPORT_SYMBOL(_raw_spin_lock_irq);
165#endif 156#endif
166 157
167#ifndef CONFIG_INLINE_WRITE_TRYLOCK 158#ifndef CONFIG_INLINE_SPIN_LOCK_BH
168int __lockfunc _write_trylock(rwlock_t *lock) 159void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
169{ 160{
170 return __write_trylock(lock); 161 __raw_spin_lock_bh(lock);
171} 162}
172EXPORT_SYMBOL(_write_trylock); 163EXPORT_SYMBOL(_raw_spin_lock_bh);
173#endif 164#endif
174 165
175#ifndef CONFIG_INLINE_READ_LOCK 166#ifndef CONFIG_INLINE_SPIN_UNLOCK
176void __lockfunc _read_lock(rwlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
177{ 168{
178 __read_lock(lock); 169 __raw_spin_unlock(lock);
179} 170}
180EXPORT_SYMBOL(_read_lock); 171EXPORT_SYMBOL(_raw_spin_unlock);
181#endif 172#endif
182 173
183#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE 174#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
184unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 175void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
185{ 176{
186 return __spin_lock_irqsave(lock); 177 __raw_spin_unlock_irqrestore(lock, flags);
187} 178}
188EXPORT_SYMBOL(_spin_lock_irqsave); 179EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
189#endif 180#endif
190 181
191#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ 182#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
192void __lockfunc _spin_lock_irq(spinlock_t *lock) 183void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
193{ 184{
194 __spin_lock_irq(lock); 185 __raw_spin_unlock_irq(lock);
195} 186}
196EXPORT_SYMBOL(_spin_lock_irq); 187EXPORT_SYMBOL(_raw_spin_unlock_irq);
197#endif 188#endif
198 189
199#ifndef CONFIG_INLINE_SPIN_LOCK_BH 190#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
200void __lockfunc _spin_lock_bh(spinlock_t *lock) 191void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
201{ 192{
202 __spin_lock_bh(lock); 193 __raw_spin_unlock_bh(lock);
203} 194}
204EXPORT_SYMBOL(_spin_lock_bh); 195EXPORT_SYMBOL(_raw_spin_unlock_bh);
205#endif 196#endif
206 197
207#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE 198#ifndef CONFIG_INLINE_READ_TRYLOCK
208unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 199int __lockfunc _raw_read_trylock(rwlock_t *lock)
209{ 200{
210 return __read_lock_irqsave(lock); 201 return __raw_read_trylock(lock);
211} 202}
212EXPORT_SYMBOL(_read_lock_irqsave); 203EXPORT_SYMBOL(_raw_read_trylock);
213#endif 204#endif
214 205
215#ifndef CONFIG_INLINE_READ_LOCK_IRQ 206#ifndef CONFIG_INLINE_READ_LOCK
216void __lockfunc _read_lock_irq(rwlock_t *lock) 207void __lockfunc _raw_read_lock(rwlock_t *lock)
217{ 208{
218 __read_lock_irq(lock); 209 __raw_read_lock(lock);
219} 210}
220EXPORT_SYMBOL(_read_lock_irq); 211EXPORT_SYMBOL(_raw_read_lock);
221#endif 212#endif
222 213
223#ifndef CONFIG_INLINE_READ_LOCK_BH 214#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
224void __lockfunc _read_lock_bh(rwlock_t *lock) 215unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
225{ 216{
226 __read_lock_bh(lock); 217 return __raw_read_lock_irqsave(lock);
227} 218}
228EXPORT_SYMBOL(_read_lock_bh); 219EXPORT_SYMBOL(_raw_read_lock_irqsave);
229#endif 220#endif
230 221
231#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE 222#ifndef CONFIG_INLINE_READ_LOCK_IRQ
232unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 223void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
233{ 224{
234 return __write_lock_irqsave(lock); 225 __raw_read_lock_irq(lock);
235} 226}
236EXPORT_SYMBOL(_write_lock_irqsave); 227EXPORT_SYMBOL(_raw_read_lock_irq);
237#endif 228#endif
238 229
239#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ 230#ifndef CONFIG_INLINE_READ_LOCK_BH
240void __lockfunc _write_lock_irq(rwlock_t *lock) 231void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
241{ 232{
242 __write_lock_irq(lock); 233 __raw_read_lock_bh(lock);
243} 234}
244EXPORT_SYMBOL(_write_lock_irq); 235EXPORT_SYMBOL(_raw_read_lock_bh);
245#endif 236#endif
246 237
247#ifndef CONFIG_INLINE_WRITE_LOCK_BH 238#ifndef CONFIG_INLINE_READ_UNLOCK
248void __lockfunc _write_lock_bh(rwlock_t *lock) 239void __lockfunc _raw_read_unlock(rwlock_t *lock)
249{ 240{
250 __write_lock_bh(lock); 241 __raw_read_unlock(lock);
251} 242}
252EXPORT_SYMBOL(_write_lock_bh); 243EXPORT_SYMBOL(_raw_read_unlock);
253#endif 244#endif
254 245
255#ifndef CONFIG_INLINE_SPIN_LOCK 246#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
256void __lockfunc _spin_lock(spinlock_t *lock) 247void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
257{ 248{
258 __spin_lock(lock); 249 __raw_read_unlock_irqrestore(lock, flags);
259} 250}
260EXPORT_SYMBOL(_spin_lock); 251EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
261#endif 252#endif
262 253
263#ifndef CONFIG_INLINE_WRITE_LOCK 254#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
264void __lockfunc _write_lock(rwlock_t *lock) 255void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
265{ 256{
266 __write_lock(lock); 257 __raw_read_unlock_irq(lock);
267} 258}
268EXPORT_SYMBOL(_write_lock); 259EXPORT_SYMBOL(_raw_read_unlock_irq);
269#endif 260#endif
270 261
271#ifndef CONFIG_INLINE_SPIN_UNLOCK 262#ifndef CONFIG_INLINE_READ_UNLOCK_BH
272void __lockfunc _spin_unlock(spinlock_t *lock) 263void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
273{ 264{
274 __spin_unlock(lock); 265 __raw_read_unlock_bh(lock);
275} 266}
276EXPORT_SYMBOL(_spin_unlock); 267EXPORT_SYMBOL(_raw_read_unlock_bh);
277#endif 268#endif
278 269
279#ifndef CONFIG_INLINE_WRITE_UNLOCK 270#ifndef CONFIG_INLINE_WRITE_TRYLOCK
280void __lockfunc _write_unlock(rwlock_t *lock) 271int __lockfunc _raw_write_trylock(rwlock_t *lock)
281{ 272{
282 __write_unlock(lock); 273 return __raw_write_trylock(lock);
283} 274}
284EXPORT_SYMBOL(_write_unlock); 275EXPORT_SYMBOL(_raw_write_trylock);
285#endif 276#endif
286 277
287#ifndef CONFIG_INLINE_READ_UNLOCK 278#ifndef CONFIG_INLINE_WRITE_LOCK
288void __lockfunc _read_unlock(rwlock_t *lock) 279void __lockfunc _raw_write_lock(rwlock_t *lock)
289{ 280{
290 __read_unlock(lock); 281 __raw_write_lock(lock);
291} 282}
292EXPORT_SYMBOL(_read_unlock); 283EXPORT_SYMBOL(_raw_write_lock);
293#endif 284#endif
294 285
295#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE 286#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
296void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 287unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
297{ 288{
298 __spin_unlock_irqrestore(lock, flags); 289 return __raw_write_lock_irqsave(lock);
299} 290}
300EXPORT_SYMBOL(_spin_unlock_irqrestore); 291EXPORT_SYMBOL(_raw_write_lock_irqsave);
301#endif 292#endif
302 293
303#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ 294#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
304void __lockfunc _spin_unlock_irq(spinlock_t *lock) 295void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
305{ 296{
306 __spin_unlock_irq(lock); 297 __raw_write_lock_irq(lock);
307} 298}
308EXPORT_SYMBOL(_spin_unlock_irq); 299EXPORT_SYMBOL(_raw_write_lock_irq);
309#endif 300#endif
310 301
311#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH 302#ifndef CONFIG_INLINE_WRITE_LOCK_BH
312void __lockfunc _spin_unlock_bh(spinlock_t *lock) 303void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
313{ 304{
314 __spin_unlock_bh(lock); 305 __raw_write_lock_bh(lock);
315} 306}
316EXPORT_SYMBOL(_spin_unlock_bh); 307EXPORT_SYMBOL(_raw_write_lock_bh);
317#endif 308#endif
318 309
319#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE 310#ifndef CONFIG_INLINE_WRITE_UNLOCK
320void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 311void __lockfunc _raw_write_unlock(rwlock_t *lock)
321{ 312{
322 __read_unlock_irqrestore(lock, flags); 313 __raw_write_unlock(lock);
323} 314}
324EXPORT_SYMBOL(_read_unlock_irqrestore); 315EXPORT_SYMBOL(_raw_write_unlock);
325#endif 316#endif
326 317
327#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ 318#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
328void __lockfunc _read_unlock_irq(rwlock_t *lock) 319void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
329{ 320{
330 __read_unlock_irq(lock); 321 __raw_write_unlock_irqrestore(lock, flags);
331} 322}
332EXPORT_SYMBOL(_read_unlock_irq); 323EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
333#endif 324#endif
334 325
335#ifndef CONFIG_INLINE_READ_UNLOCK_BH 326#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
336void __lockfunc _read_unlock_bh(rwlock_t *lock) 327void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
337{ 328{
338 __read_unlock_bh(lock); 329 __raw_write_unlock_irq(lock);
339} 330}
340EXPORT_SYMBOL(_read_unlock_bh); 331EXPORT_SYMBOL(_raw_write_unlock_irq);
341#endif 332#endif
342 333
343#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE 334#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
344void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 335void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
345{ 336{
346 __write_unlock_irqrestore(lock, flags); 337 __raw_write_unlock_bh(lock);
347} 338}
348EXPORT_SYMBOL(_write_unlock_irqrestore); 339EXPORT_SYMBOL(_raw_write_unlock_bh);
349#endif 340#endif
350 341
351#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ 342#ifdef CONFIG_DEBUG_LOCK_ALLOC
352void __lockfunc _write_unlock_irq(rwlock_t *lock) 343
344void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
353{ 345{
354 __write_unlock_irq(lock); 346 preempt_disable();
347 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
348 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
355} 349}
356EXPORT_SYMBOL(_write_unlock_irq); 350EXPORT_SYMBOL(_raw_spin_lock_nested);
357#endif
358 351
359#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH 352unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
360void __lockfunc _write_unlock_bh(rwlock_t *lock) 353 int subclass)
361{ 354{
362 __write_unlock_bh(lock); 355 unsigned long flags;
356
357 local_irq_save(flags);
358 preempt_disable();
359 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
360 LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
361 do_raw_spin_lock_flags, &flags);
362 return flags;
363} 363}
364EXPORT_SYMBOL(_write_unlock_bh); 364EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
365#endif
366 365
367#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH 366void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock,
368int __lockfunc _spin_trylock_bh(spinlock_t *lock) 367 struct lockdep_map *nest_lock)
369{ 368{
370 return __spin_trylock_bh(lock); 369 preempt_disable();
370 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
371 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
371} 372}
372EXPORT_SYMBOL(_spin_trylock_bh); 373EXPORT_SYMBOL(_raw_spin_lock_nest_lock);
374
373#endif 375#endif
374 376
375notrace int in_lock_functions(unsigned long addr) 377notrace int in_lock_functions(unsigned long addr)
diff --git a/kernel/sys.c b/kernel/sys.c
index 9968c5fb55b..26a6b73a6b8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/smp_lock.h>
12#include <linux/notifier.h> 11#include <linux/notifier.h>
13#include <linux/reboot.h> 12#include <linux/reboot.h>
14#include <linux/prctl.h> 13#include <linux/prctl.h>
@@ -163,6 +162,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
163 if (niceval > 19) 162 if (niceval > 19)
164 niceval = 19; 163 niceval = 19;
165 164
165 rcu_read_lock();
166 read_lock(&tasklist_lock); 166 read_lock(&tasklist_lock);
167 switch (which) { 167 switch (which) {
168 case PRIO_PROCESS: 168 case PRIO_PROCESS:
@@ -190,16 +190,17 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
190 !(user = find_user(who))) 190 !(user = find_user(who)))
191 goto out_unlock; /* No processes for this user */ 191 goto out_unlock; /* No processes for this user */
192 192
193 do_each_thread(g, p) 193 do_each_thread(g, p) {
194 if (__task_cred(p)->uid == who) 194 if (__task_cred(p)->uid == who)
195 error = set_one_prio(p, niceval, error); 195 error = set_one_prio(p, niceval, error);
196 while_each_thread(g, p); 196 } while_each_thread(g, p);
197 if (who != cred->uid) 197 if (who != cred->uid)
198 free_uid(user); /* For find_user() */ 198 free_uid(user); /* For find_user() */
199 break; 199 break;
200 } 200 }
201out_unlock: 201out_unlock:
202 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
203 rcu_read_unlock();
203out: 204out:
204 return error; 205 return error;
205} 206}
@@ -253,13 +254,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
253 !(user = find_user(who))) 254 !(user = find_user(who)))
254 goto out_unlock; /* No processes for this user */ 255 goto out_unlock; /* No processes for this user */
255 256
256 do_each_thread(g, p) 257 do_each_thread(g, p) {
257 if (__task_cred(p)->uid == who) { 258 if (__task_cred(p)->uid == who) {
258 niceval = 20 - task_nice(p); 259 niceval = 20 - task_nice(p);
259 if (niceval > retval) 260 if (niceval > retval)
260 retval = niceval; 261 retval = niceval;
261 } 262 }
262 while_each_thread(g, p); 263 } while_each_thread(g, p);
263 if (who != cred->uid) 264 if (who != cred->uid)
264 free_uid(user); /* for find_user() */ 265 free_uid(user); /* for find_user() */
265 break; 266 break;
@@ -349,6 +350,9 @@ void kernel_power_off(void)
349 machine_power_off(); 350 machine_power_off();
350} 351}
351EXPORT_SYMBOL_GPL(kernel_power_off); 352EXPORT_SYMBOL_GPL(kernel_power_off);
353
354static DEFINE_MUTEX(reboot_mutex);
355
352/* 356/*
353 * Reboot system call: for obvious reasons only root may call it, 357 * Reboot system call: for obvious reasons only root may call it,
354 * and even root needs to set up some magic numbers in the registers 358 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +385,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
381 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) 385 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
382 cmd = LINUX_REBOOT_CMD_HALT; 386 cmd = LINUX_REBOOT_CMD_HALT;
383 387
384 lock_kernel(); 388 mutex_lock(&reboot_mutex);
385 switch (cmd) { 389 switch (cmd) {
386 case LINUX_REBOOT_CMD_RESTART: 390 case LINUX_REBOOT_CMD_RESTART:
387 kernel_restart(NULL); 391 kernel_restart(NULL);
@@ -397,20 +401,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 401
398 case LINUX_REBOOT_CMD_HALT: 402 case LINUX_REBOOT_CMD_HALT:
399 kernel_halt(); 403 kernel_halt();
400 unlock_kernel();
401 do_exit(0); 404 do_exit(0);
402 panic("cannot halt"); 405 panic("cannot halt");
403 406
404 case LINUX_REBOOT_CMD_POWER_OFF: 407 case LINUX_REBOOT_CMD_POWER_OFF:
405 kernel_power_off(); 408 kernel_power_off();
406 unlock_kernel();
407 do_exit(0); 409 do_exit(0);
408 break; 410 break;
409 411
410 case LINUX_REBOOT_CMD_RESTART2: 412 case LINUX_REBOOT_CMD_RESTART2:
411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 413 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
412 unlock_kernel(); 414 ret = -EFAULT;
413 return -EFAULT; 415 break;
414 } 416 }
415 buffer[sizeof(buffer) - 1] = '\0'; 417 buffer[sizeof(buffer) - 1] = '\0';
416 418
@@ -433,7 +435,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
433 ret = -EINVAL; 435 ret = -EINVAL;
434 break; 436 break;
435 } 437 }
436 unlock_kernel(); 438 mutex_unlock(&reboot_mutex);
437 return ret; 439 return ret;
438} 440}
439 441
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9327a26765c..8a68b244846 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -244,6 +244,10 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
245static int min_wakeup_granularity_ns; /* 0 usecs */ 245static int min_wakeup_granularity_ns; /* 0 usecs */
246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
247static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
248static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
249static int min_sched_shares_ratelimit = 100000; /* 100 usec */
250static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
247#endif 251#endif
248 252
249static struct ctl_table kern_table[] = { 253static struct ctl_table kern_table[] = {
@@ -260,7 +264,7 @@ static struct ctl_table kern_table[] = {
260 .data = &sysctl_sched_min_granularity, 264 .data = &sysctl_sched_min_granularity,
261 .maxlen = sizeof(unsigned int), 265 .maxlen = sizeof(unsigned int),
262 .mode = 0644, 266 .mode = 0644,
263 .proc_handler = sched_nr_latency_handler, 267 .proc_handler = sched_proc_update_handler,
264 .extra1 = &min_sched_granularity_ns, 268 .extra1 = &min_sched_granularity_ns,
265 .extra2 = &max_sched_granularity_ns, 269 .extra2 = &max_sched_granularity_ns,
266 }, 270 },
@@ -269,7 +273,7 @@ static struct ctl_table kern_table[] = {
269 .data = &sysctl_sched_latency, 273 .data = &sysctl_sched_latency,
270 .maxlen = sizeof(unsigned int), 274 .maxlen = sizeof(unsigned int),
271 .mode = 0644, 275 .mode = 0644,
272 .proc_handler = sched_nr_latency_handler, 276 .proc_handler = sched_proc_update_handler,
273 .extra1 = &min_sched_granularity_ns, 277 .extra1 = &min_sched_granularity_ns,
274 .extra2 = &max_sched_granularity_ns, 278 .extra2 = &max_sched_granularity_ns,
275 }, 279 },
@@ -278,7 +282,7 @@ static struct ctl_table kern_table[] = {
278 .data = &sysctl_sched_wakeup_granularity, 282 .data = &sysctl_sched_wakeup_granularity,
279 .maxlen = sizeof(unsigned int), 283 .maxlen = sizeof(unsigned int),
280 .mode = 0644, 284 .mode = 0644,
281 .proc_handler = proc_dointvec_minmax, 285 .proc_handler = sched_proc_update_handler,
282 .extra1 = &min_wakeup_granularity_ns, 286 .extra1 = &min_wakeup_granularity_ns,
283 .extra2 = &max_wakeup_granularity_ns, 287 .extra2 = &max_wakeup_granularity_ns,
284 }, 288 },
@@ -287,7 +291,18 @@ static struct ctl_table kern_table[] = {
287 .data = &sysctl_sched_shares_ratelimit, 291 .data = &sysctl_sched_shares_ratelimit,
288 .maxlen = sizeof(unsigned int), 292 .maxlen = sizeof(unsigned int),
289 .mode = 0644, 293 .mode = 0644,
290 .proc_handler = proc_dointvec, 294 .proc_handler = sched_proc_update_handler,
295 .extra1 = &min_sched_shares_ratelimit,
296 .extra2 = &max_sched_shares_ratelimit,
297 },
298 {
299 .procname = "sched_tunable_scaling",
300 .data = &sysctl_sched_tunable_scaling,
301 .maxlen = sizeof(enum sched_tunable_scaling),
302 .mode = 0644,
303 .proc_handler = sched_proc_update_handler,
304 .extra1 = &min_sched_tunable_scaling,
305 .extra2 = &max_sched_tunable_scaling,
291 }, 306 },
292 { 307 {
293 .procname = "sched_shares_thresh", 308 .procname = "sched_shares_thresh",
@@ -298,13 +313,6 @@ static struct ctl_table kern_table[] = {
298 .extra1 = &zero, 313 .extra1 = &zero,
299 }, 314 },
300 { 315 {
301 .procname = "sched_features",
302 .data = &sysctl_sched_features,
303 .maxlen = sizeof(unsigned int),
304 .mode = 0644,
305 .proc_handler = proc_dointvec,
306 },
307 {
308 .procname = "sched_migration_cost", 316 .procname = "sched_migration_cost",
309 .data = &sysctl_sched_migration_cost, 317 .data = &sysctl_sched_migration_cost,
310 .maxlen = sizeof(unsigned int), 318 .maxlen = sizeof(unsigned int),
@@ -1043,7 +1051,7 @@ static struct ctl_table vm_table[] = {
1043 .extra2 = &one_hundred, 1051 .extra2 = &one_hundred,
1044 }, 1052 },
1045#ifdef CONFIG_HUGETLB_PAGE 1053#ifdef CONFIG_HUGETLB_PAGE
1046 { 1054 {
1047 .procname = "nr_hugepages", 1055 .procname = "nr_hugepages",
1048 .data = NULL, 1056 .data = NULL,
1049 .maxlen = sizeof(unsigned long), 1057 .maxlen = sizeof(unsigned long),
@@ -1051,7 +1059,18 @@ static struct ctl_table vm_table[] = {
1051 .proc_handler = hugetlb_sysctl_handler, 1059 .proc_handler = hugetlb_sysctl_handler,
1052 .extra1 = (void *)&hugetlb_zero, 1060 .extra1 = (void *)&hugetlb_zero,
1053 .extra2 = (void *)&hugetlb_infinity, 1061 .extra2 = (void *)&hugetlb_infinity,
1054 }, 1062 },
1063#ifdef CONFIG_NUMA
1064 {
1065 .procname = "nr_hugepages_mempolicy",
1066 .data = NULL,
1067 .maxlen = sizeof(unsigned long),
1068 .mode = 0644,
1069 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1070 .extra1 = (void *)&hugetlb_zero,
1071 .extra2 = (void *)&hugetlb_infinity,
1072 },
1073#endif
1055 { 1074 {
1056 .procname = "hugetlb_shm_group", 1075 .procname = "hugetlb_shm_group",
1057 .data = &sysctl_hugetlb_shm_group, 1076 .data = &sysctl_hugetlb_shm_group,
@@ -1112,7 +1131,8 @@ static struct ctl_table vm_table[] = {
1112 .data = &sysctl_max_map_count, 1131 .data = &sysctl_max_map_count,
1113 .maxlen = sizeof(sysctl_max_map_count), 1132 .maxlen = sizeof(sysctl_max_map_count),
1114 .mode = 0644, 1133 .mode = 0644,
1115 .proc_handler = proc_dointvec 1134 .proc_handler = proc_dointvec_minmax,
1135 .extra1 = &zero,
1116 }, 1136 },
1117#else 1137#else
1118 { 1138 {
@@ -1194,6 +1214,7 @@ static struct ctl_table vm_table[] = {
1194 .proc_handler = proc_dointvec_jiffies, 1214 .proc_handler = proc_dointvec_jiffies,
1195 }, 1215 },
1196#endif 1216#endif
1217#ifdef CONFIG_MMU
1197 { 1218 {
1198 .procname = "mmap_min_addr", 1219 .procname = "mmap_min_addr",
1199 .data = &dac_mmap_min_addr, 1220 .data = &dac_mmap_min_addr,
@@ -1201,6 +1222,7 @@ static struct ctl_table vm_table[] = {
1201 .mode = 0644, 1222 .mode = 0644,
1202 .proc_handler = mmap_min_addr_handler, 1223 .proc_handler = mmap_min_addr_handler,
1203 }, 1224 },
1225#endif
1204#ifdef CONFIG_NUMA 1226#ifdef CONFIG_NUMA
1205 { 1227 {
1206 .procname = "numa_zonelist_order", 1228 .procname = "numa_zonelist_order",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b75dbf40f57..8f5d16e0707 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1399,6 +1399,13 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1399{ 1399{
1400 int i; 1400 int i;
1401 1401
1402 /*
1403 * CTL_KERN/KERN_VERSION is used by older glibc and cannot
1404 * ever go away.
1405 */
1406 if (name[0] == CTL_KERN && name[1] == KERN_VERSION)
1407 return;
1408
1402 if (printk_ratelimit()) { 1409 if (printk_ratelimit()) {
1403 printk(KERN_INFO 1410 printk(KERN_INFO
1404 "warning: process `%s' used the deprecated sysctl " 1411 "warning: process `%s' used the deprecated sysctl "
@@ -1410,6 +1417,35 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1410 return; 1417 return;
1411} 1418}
1412 1419
1420#define WARN_ONCE_HASH_BITS 8
1421#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
1422
1423static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
1424
1425#define FNV32_OFFSET 2166136261U
1426#define FNV32_PRIME 0x01000193
1427
1428/*
1429 * Print each legacy sysctl (approximately) only once.
1430 * To avoid making the tables non-const use a external
1431 * hash-table instead.
1432 * Worst case hash collision: 6, but very rarely.
1433 * NOTE! We don't use the SMP-safe bit tests. We simply
1434 * don't care enough.
1435 */
1436static void warn_on_bintable(const int *name, int nlen)
1437{
1438 int i;
1439 u32 hash = FNV32_OFFSET;
1440
1441 for (i = 0; i < nlen; i++)
1442 hash = (hash ^ name[i]) * FNV32_PRIME;
1443 hash %= WARN_ONCE_HASH_SIZE;
1444 if (__test_and_set_bit(hash, warn_once_bitmap))
1445 return;
1446 deprecated_sysctl_warning(name, nlen);
1447}
1448
1413static ssize_t do_sysctl(int __user *args_name, int nlen, 1449static ssize_t do_sysctl(int __user *args_name, int nlen,
1414 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1450 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1415{ 1451{
@@ -1424,7 +1460,7 @@ static ssize_t do_sysctl(int __user *args_name, int nlen,
1424 if (get_user(name[i], args_name + i)) 1460 if (get_user(name[i], args_name + i))
1425 return -EFAULT; 1461 return -EFAULT;
1426 1462
1427 deprecated_sysctl_warning(name, nlen); 1463 warn_on_bintable(name, nlen);
1428 1464
1429 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); 1465 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1430} 1466}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc3..d7395fdfb9f 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h> 21#include <linux/tick.h>
22 22
23#include "tick-internal.h"
24
23/* The registered clock event devices */ 25/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 26static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 27static LIST_HEAD(clockevents_released);
@@ -28,7 +30,7 @@ static LIST_HEAD(clockevents_released);
28static RAW_NOTIFIER_HEAD(clockevents_chain); 30static RAW_NOTIFIER_HEAD(clockevents_chain);
29 31
30/* Protection for the above */ 32/* Protection for the above */
31static DEFINE_SPINLOCK(clockevents_lock); 33static DEFINE_RAW_SPINLOCK(clockevents_lock);
32 34
33/** 35/**
34 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 36 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
37 * 39 *
38 * Math helper, returns latch value converted to nanoseconds (bound checked) 40 * Math helper, returns latch value converted to nanoseconds (bound checked)
39 */ 41 */
40unsigned long clockevent_delta2ns(unsigned long latch, 42u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
41 struct clock_event_device *evt)
42{ 43{
43 u64 clc = ((u64) latch << evt->shift); 44 u64 clc = (u64) latch << evt->shift;
44 45
45 if (unlikely(!evt->mult)) { 46 if (unlikely(!evt->mult)) {
46 evt->mult = 1; 47 evt->mult = 1;
@@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
50 do_div(clc, evt->mult); 51 do_div(clc, evt->mult);
51 if (clc < 1000) 52 if (clc < 1000)
52 clc = 1000; 53 clc = 1000;
53 if (clc > LONG_MAX) 54 if (clc > KTIME_MAX)
54 clc = LONG_MAX; 55 clc = KTIME_MAX;
55 56
56 return (unsigned long) clc; 57 return clc;
57} 58}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns); 59EXPORT_SYMBOL_GPL(clockevent_delta2ns);
59 60
@@ -140,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb)
140 unsigned long flags; 141 unsigned long flags;
141 int ret; 142 int ret;
142 143
143 spin_lock_irqsave(&clockevents_lock, flags); 144 raw_spin_lock_irqsave(&clockevents_lock, flags);
144 ret = raw_notifier_chain_register(&clockevents_chain, nb); 145 ret = raw_notifier_chain_register(&clockevents_chain, nb);
145 spin_unlock_irqrestore(&clockevents_lock, flags); 146 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
146 147
147 return ret; 148 return ret;
148} 149}
@@ -184,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev)
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 185 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 186 BUG_ON(!dev->cpumask);
186 187
187 spin_lock_irqsave(&clockevents_lock, flags); 188 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 189
189 list_add(&dev->list, &clockevent_devices); 190 list_add(&dev->list, &clockevent_devices);
190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 191 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
191 clockevents_notify_released(); 192 clockevents_notify_released();
192 193
193 spin_unlock_irqrestore(&clockevents_lock, flags); 194 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
194} 195}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 196EXPORT_SYMBOL_GPL(clockevents_register_device);
196 197
@@ -237,10 +238,11 @@ void clockevents_exchange_device(struct clock_event_device *old,
237 */ 238 */
238void clockevents_notify(unsigned long reason, void *arg) 239void clockevents_notify(unsigned long reason, void *arg)
239{ 240{
240 struct list_head *node, *tmp; 241 struct clock_event_device *dev, *tmp;
241 unsigned long flags; 242 unsigned long flags;
243 int cpu;
242 244
243 spin_lock_irqsave(&clockevents_lock, flags); 245 raw_spin_lock_irqsave(&clockevents_lock, flags);
244 clockevents_do_notify(reason, arg); 246 clockevents_do_notify(reason, arg);
245 247
246 switch (reason) { 248 switch (reason) {
@@ -249,13 +251,25 @@ void clockevents_notify(unsigned long reason, void *arg)
249 * Unregister the clock event devices which were 251 * Unregister the clock event devices which were
250 * released from the users in the notify chain. 252 * released from the users in the notify chain.
251 */ 253 */
252 list_for_each_safe(node, tmp, &clockevents_released) 254 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
253 list_del(node); 255 list_del(&dev->list);
256 /*
257 * Now check whether the CPU has left unused per cpu devices
258 */
259 cpu = *((int *)arg);
260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
261 if (cpumask_test_cpu(cpu, dev->cpumask) &&
262 cpumask_weight(dev->cpumask) == 1 &&
263 !tick_is_broadcast_device(dev)) {
264 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
265 list_del(&dev->list);
266 }
267 }
254 break; 268 break;
255 default: 269 default:
256 break; 270 break;
257 } 271 }
258 spin_unlock_irqrestore(&clockevents_lock, flags); 272 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
259} 273}
260EXPORT_SYMBOL_GPL(clockevents_notify); 274EXPORT_SYMBOL_GPL(clockevents_notify);
261#endif 275#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 4a310906b3e..13700833c18 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -107,6 +107,59 @@ u64 timecounter_cyc2time(struct timecounter *tc,
107} 107}
108EXPORT_SYMBOL_GPL(timecounter_cyc2time); 108EXPORT_SYMBOL_GPL(timecounter_cyc2time);
109 109
110/**
111 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
112 * @mult: pointer to mult variable
113 * @shift: pointer to shift variable
114 * @from: frequency to convert from
115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds
117 *
118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents.
120 *
121 * @to and @from are frequency values in HZ. For clock sources @to is
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 *
125 * The @minsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is
129 * multiplied with the calculated mult factor. Larger ranges may
130 * reduce the conversion accuracy by chosing smaller mult and shift
131 * factors.
132 */
133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
135{
136 u64 tmp;
137 u32 sft, sftacc= 32;
138
139 /*
140 * Calculate the shift factor which is limiting the conversion
141 * range:
142 */
143 tmp = ((u64)minsec * from) >> 32;
144 while (tmp) {
145 tmp >>=1;
146 sftacc--;
147 }
148
149 /*
150 * Find the conversion shift/mult pair which has the best
151 * accuracy and fits the maxsec conversion range:
152 */
153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft;
155 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0)
157 break;
158 }
159 *mult = tmp;
160 *shift = sft;
161}
162
110/*[Clocksource internal variables]--------- 163/*[Clocksource internal variables]---------
111 * curr_clocksource: 164 * curr_clocksource:
112 * currently selected clocksource. 165 * currently selected clocksource.
@@ -290,7 +343,19 @@ static void clocksource_resume_watchdog(void)
290{ 343{
291 unsigned long flags; 344 unsigned long flags;
292 345
293 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential dead lock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept, that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
294 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
296} 361}
@@ -405,14 +470,55 @@ void clocksource_resume(void)
405 * clocksource_touch_watchdog - Update watchdog 470 * clocksource_touch_watchdog - Update watchdog
406 * 471 *
407 * Update the watchdog after exception contexts such as kgdb so as not 472 * Update the watchdog after exception contexts such as kgdb so as not
408 * to incorrectly trip the watchdog. 473 * to incorrectly trip the watchdog. This might fail when the kernel
409 * 474 * was stopped in code which holds watchdog_lock.
410 */ 475 */
411void clocksource_touch_watchdog(void) 476void clocksource_touch_watchdog(void)
412{ 477{
413 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
414} 479}
415 480
481/**
482 * clocksource_max_deferment - Returns max time the clocksource can be deferred
483 * @cs: Pointer to clocksource
484 *
485 */
486static u64 clocksource_max_deferment(struct clocksource *cs)
487{
488 u64 max_nsecs, max_cycles;
489
490 /*
491 * Calculate the maximum number of cycles that we can pass to the
492 * cyc2ns function without overflowing a 64-bit signed result. The
493 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
494 * is equivalent to the below.
495 * max_cycles < (2^63)/cs->mult
496 * max_cycles < 2^(log2((2^63)/cs->mult))
497 * max_cycles < 2^(log2(2^63) - log2(cs->mult))
498 * max_cycles < 2^(63 - log2(cs->mult))
499 * max_cycles < 1 << (63 - log2(cs->mult))
500 * Please note that we add 1 to the result of the log2 to account for
501 * any rounding errors, ensure the above inequality is satisfied and
502 * no overflow will occur.
503 */
504 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
505
506 /*
507 * The actual maximum number of cycles we can defer the clocksource is
508 * determined by the minimum of max_cycles and cs->mask.
509 */
510 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
511 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
512
513 /*
514 * To ensure that the clocksource does not wrap whilst we are idle,
515 * limit the time the clocksource can be deferred by 12.5%. Please
516 * note a margin of 12.5% is used because this can be computed with
517 * a shift, versus say 10% which would require division.
518 */
519 return max_nsecs - (max_nsecs >> 5);
520}
521
416#ifdef CONFIG_GENERIC_TIME 522#ifdef CONFIG_GENERIC_TIME
417 523
418/** 524/**
@@ -511,6 +617,9 @@ static void clocksource_enqueue(struct clocksource *cs)
511 */ 617 */
512int clocksource_register(struct clocksource *cs) 618int clocksource_register(struct clocksource *cs)
513{ 619{
620 /* calculate max idle time permitted for this clocksource */
621 cs->max_idle_ns = clocksource_max_deferment(cs);
622
514 mutex_lock(&clocksource_mutex); 623 mutex_lock(&clocksource_mutex);
515 clocksource_enqueue(cs); 624 clocksource_enqueue(cs);
516 clocksource_select(); 625 clocksource_select();
@@ -580,7 +689,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
580 * @count: length of buffer 689 * @count: length of buffer
581 * 690 *
582 * Takes input from sysfs interface for manually overriding the default 691 * Takes input from sysfs interface for manually overriding the default
583 * clocksource selction. 692 * clocksource selection.
584 */ 693 */
585static ssize_t sysfs_override_clocksource(struct sys_device *dev, 694static ssize_t sysfs_override_clocksource(struct sys_device *dev,
586 struct sysdev_attribute *attr, 695 struct sysdev_attribute *attr,
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c2ec25087a3..b3bafd5fc66 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 35static int tick_broadcast_force;
36 36
37#ifdef CONFIG_TICK_ONESHOT 37#ifdef CONFIG_TICK_ONESHOT
@@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
96 unsigned long flags; 96 unsigned long flags;
97 int ret = 0; 97 int ret = 0;
98 98
99 spin_lock_irqsave(&tick_broadcast_lock, flags); 99 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
100 100
101 /* 101 /*
102 * Devices might be registered with both periodic and oneshot 102 * Devices might be registered with both periodic and oneshot
@@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
122 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
123 } 123 }
124 } 124 }
125 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 125 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
126 return ret; 126 return ret;
127} 127}
128 128
@@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask)
161 */ 161 */
162static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
163{ 163{
164 spin_lock(&tick_broadcast_lock); 164 raw_spin_lock(&tick_broadcast_lock);
165 165
166 cpumask_and(to_cpumask(tmpmask), 166 cpumask_and(to_cpumask(tmpmask),
167 cpu_online_mask, tick_get_broadcast_mask()); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask)); 168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 raw_spin_unlock(&tick_broadcast_lock);
171} 171}
172 172
173/* 173/*
@@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
212 unsigned long flags; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
216 216
217 cpu = smp_processor_id(); 217 cpu = smp_processor_id();
218 td = &per_cpu(tick_cpu_device, cpu); 218 td = &per_cpu(tick_cpu_device, cpu);
@@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
263 tick_broadcast_setup_oneshot(bc); 263 tick_broadcast_setup_oneshot(bc);
264 } 264 }
265out: 265out:
266 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 266 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
267} 267}
268 268
269/* 269/*
@@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
299 unsigned long flags; 299 unsigned long flags;
300 unsigned int cpu = *cpup; 300 unsigned int cpu = *cpup;
301 301
302 spin_lock_irqsave(&tick_broadcast_lock, flags); 302 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
303 303
304 bc = tick_broadcast_device.evtdev; 304 bc = tick_broadcast_device.evtdev;
305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
@@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
309 clockevents_shutdown(bc); 309 clockevents_shutdown(bc);
310 } 310 }
311 311
312 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 312 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
313} 313}
314 314
315void tick_suspend_broadcast(void) 315void tick_suspend_broadcast(void)
@@ -317,13 +317,13 @@ void tick_suspend_broadcast(void)
317 struct clock_event_device *bc; 317 struct clock_event_device *bc;
318 unsigned long flags; 318 unsigned long flags;
319 319
320 spin_lock_irqsave(&tick_broadcast_lock, flags); 320 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
321 321
322 bc = tick_broadcast_device.evtdev; 322 bc = tick_broadcast_device.evtdev;
323 if (bc) 323 if (bc)
324 clockevents_shutdown(bc); 324 clockevents_shutdown(bc);
325 325
326 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 326 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
327} 327}
328 328
329int tick_resume_broadcast(void) 329int tick_resume_broadcast(void)
@@ -332,7 +332,7 @@ int tick_resume_broadcast(void)
332 unsigned long flags; 332 unsigned long flags;
333 int broadcast = 0; 333 int broadcast = 0;
334 334
335 spin_lock_irqsave(&tick_broadcast_lock, flags); 335 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
336 336
337 bc = tick_broadcast_device.evtdev; 337 bc = tick_broadcast_device.evtdev;
338 338
@@ -351,7 +351,7 @@ int tick_resume_broadcast(void)
351 break; 351 break;
352 } 352 }
353 } 353 }
354 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 354 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
355 355
356 return broadcast; 356 return broadcast;
357} 357}
@@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
405 ktime_t now, next_event; 405 ktime_t now, next_event;
406 int cpu; 406 int cpu;
407 407
408 spin_lock(&tick_broadcast_lock); 408 raw_spin_lock(&tick_broadcast_lock);
409again: 409again:
410 dev->next_event.tv64 = KTIME_MAX; 410 dev->next_event.tv64 = KTIME_MAX;
411 next_event.tv64 = KTIME_MAX; 411 next_event.tv64 = KTIME_MAX;
@@ -443,7 +443,7 @@ again:
443 if (tick_broadcast_set_event(next_event, 0)) 443 if (tick_broadcast_set_event(next_event, 0))
444 goto again; 444 goto again;
445 } 445 }
446 spin_unlock(&tick_broadcast_lock); 446 raw_spin_unlock(&tick_broadcast_lock);
447} 447}
448 448
449/* 449/*
@@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
457 unsigned long flags; 457 unsigned long flags;
458 int cpu; 458 int cpu;
459 459
460 spin_lock_irqsave(&tick_broadcast_lock, flags); 460 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
461 461
462 /* 462 /*
463 * Periodic mode does not care about the enter/exit of power 463 * Periodic mode does not care about the enter/exit of power
@@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
492 } 492 }
493 493
494out: 494out:
495 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 495 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
496} 496}
497 497
498/* 498/*
@@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void)
563 struct clock_event_device *bc; 563 struct clock_event_device *bc;
564 unsigned long flags; 564 unsigned long flags;
565 565
566 spin_lock_irqsave(&tick_broadcast_lock, flags); 566 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
567 567
568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
569 bc = tick_broadcast_device.evtdev; 569 bc = tick_broadcast_device.evtdev;
570 if (bc) 570 if (bc)
571 tick_broadcast_setup_oneshot(bc); 571 tick_broadcast_setup_oneshot(bc);
572 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 572 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
573} 573}
574 574
575 575
@@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
581 unsigned long flags; 581 unsigned long flags;
582 unsigned int cpu = *cpup; 582 unsigned int cpu = *cpup;
583 583
584 spin_lock_irqsave(&tick_broadcast_lock, flags); 584 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
585 585
586 /* 586 /*
587 * Clear the broadcast mask flag for the dead cpu, but do not 587 * Clear the broadcast mask flag for the dead cpu, but do not
@@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
589 */ 589 */
590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
591 591
592 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 592 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
593} 593}
594 594
595/* 595/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 83c4417b6a3..b6b898d2eee 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37static DEFINE_RAW_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
40 * Debugging: see timer_list.c 40 * Debugging: see timer_list.c
@@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
209 int cpu, ret = NOTIFY_OK; 209 int cpu, ret = NOTIFY_OK;
210 unsigned long flags; 210 unsigned long flags;
211 211
212 spin_lock_irqsave(&tick_device_lock, flags); 212 raw_spin_lock_irqsave(&tick_device_lock, flags);
213 213
214 cpu = smp_processor_id(); 214 cpu = smp_processor_id();
215 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 215 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
269 tick_oneshot_notify(); 269 tick_oneshot_notify();
270 270
271 spin_unlock_irqrestore(&tick_device_lock, flags); 271 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
272 return NOTIFY_STOP; 272 return NOTIFY_STOP;
273 273
274out_bc: 274out_bc:
@@ -278,7 +278,7 @@ out_bc:
278 if (tick_check_broadcast_device(newdev)) 278 if (tick_check_broadcast_device(newdev))
279 ret = NOTIFY_STOP; 279 ret = NOTIFY_STOP;
280 280
281 spin_unlock_irqrestore(&tick_device_lock, flags); 281 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
282 282
283 return ret; 283 return ret;
284} 284}
@@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup)
311 struct clock_event_device *dev = td->evtdev; 311 struct clock_event_device *dev = td->evtdev;
312 unsigned long flags; 312 unsigned long flags;
313 313
314 spin_lock_irqsave(&tick_device_lock, flags); 314 raw_spin_lock_irqsave(&tick_device_lock, flags);
315 td->mode = TICKDEV_MODE_PERIODIC; 315 td->mode = TICKDEV_MODE_PERIODIC;
316 if (dev) { 316 if (dev) {
317 /* 317 /*
@@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup)
322 clockevents_exchange_device(dev, NULL); 322 clockevents_exchange_device(dev, NULL);
323 td->evtdev = NULL; 323 td->evtdev = NULL;
324 } 324 }
325 spin_unlock_irqrestore(&tick_device_lock, flags); 325 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
326} 326}
327 327
328static void tick_suspend(void) 328static void tick_suspend(void)
@@ -330,9 +330,9 @@ static void tick_suspend(void)
330 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 330 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
331 unsigned long flags; 331 unsigned long flags;
332 332
333 spin_lock_irqsave(&tick_device_lock, flags); 333 raw_spin_lock_irqsave(&tick_device_lock, flags);
334 clockevents_shutdown(td->evtdev); 334 clockevents_shutdown(td->evtdev);
335 spin_unlock_irqrestore(&tick_device_lock, flags); 335 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
336} 336}
337 337
338static void tick_resume(void) 338static void tick_resume(void)
@@ -341,7 +341,7 @@ static void tick_resume(void)
341 unsigned long flags; 341 unsigned long flags;
342 int broadcast = tick_resume_broadcast(); 342 int broadcast = tick_resume_broadcast();
343 343
344 spin_lock_irqsave(&tick_device_lock, flags); 344 raw_spin_lock_irqsave(&tick_device_lock, flags);
345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
346 346
347 if (!broadcast) { 347 if (!broadcast) {
@@ -350,7 +350,7 @@ static void tick_resume(void)
350 else 350 else
351 tick_resume_oneshot(); 351 tick_resume_oneshot();
352 } 352 }
353 spin_unlock_irqrestore(&tick_device_lock, flags); 353 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
354} 354}
355 355
356/* 356/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b1c05bf75ee..290eefbc1f6 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,7 +6,6 @@
6#define TICK_DO_TIMER_BOOT -2 6#define TICK_DO_TIMER_BOOT -2
7 7
8DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
9extern spinlock_t tick_device_lock;
10extern ktime_t tick_next_period; 9extern ktime_t tick_next_period;
11extern ktime_t tick_period; 10extern ktime_t tick_period;
12extern int tick_do_timer_cpu __read_mostly; 11extern int tick_do_timer_cpu __read_mostly;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89c..0a8a213016f 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51 51
52 printk(KERN_WARNING 52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n", 53 "CE: %s increasing min_delta_ns to %llu nsec\n",
54 dev->name ? dev->name : "?", 54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1); 55 (unsigned long long) dev->min_delta_ns << 1);
56 56
57 i = 0; 57 i = 0;
58 } 58 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 89aed5933ed..f992762d7f5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
134 * value. We do this unconditionally on any cpu, as we don't know whether the 134 * value. We do this unconditionally on any cpu, as we don't know whether the
135 * cpu, which has the update task assigned is in a long sleep. 135 * cpu, which has the update task assigned is in a long sleep.
136 */ 136 */
137static void tick_nohz_update_jiffies(void) 137static void tick_nohz_update_jiffies(ktime_t now)
138{ 138{
139 int cpu = smp_processor_id(); 139 int cpu = smp_processor_id();
140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
141 unsigned long flags; 141 unsigned long flags;
142 ktime_t now;
143
144 if (!ts->tick_stopped)
145 return;
146 142
147 cpumask_clear_cpu(cpu, nohz_cpu_mask); 143 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get();
149 ts->idle_waketime = now; 144 ts->idle_waketime = now;
150 145
151 local_irq_save(flags); 146 local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
155 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
156} 151}
157 152
158static void tick_nohz_stop_idle(int cpu) 153static void tick_nohz_stop_idle(int cpu, ktime_t now)
159{ 154{
160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
161 157
162 if (ts->idle_active) { 158 delta = ktime_sub(now, ts->idle_entrytime);
163 ktime_t now, delta; 159 ts->idle_lastupdate = now;
164 now = ktime_get(); 160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
165 delta = ktime_sub(now, ts->idle_entrytime); 161 ts->idle_active = 0;
166 ts->idle_lastupdate = now;
167 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
168 ts->idle_active = 0;
169 162
170 sched_clock_idle_wakeup_event(0); 163 sched_clock_idle_wakeup_event(0);
171 }
172} 164}
173 165
174static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 166static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
216 struct tick_sched *ts; 208 struct tick_sched *ts;
217 ktime_t last_update, expires, now; 209 ktime_t last_update, expires, now;
218 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 210 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
211 u64 time_delta;
219 int cpu; 212 int cpu;
220 213
221 local_irq_save(flags); 214 local_irq_save(flags);
@@ -263,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle)
263 256
264 if (ratelimit < 10) { 257 if (ratelimit < 10) {
265 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 258 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
266 local_softirq_pending()); 259 (unsigned int) local_softirq_pending());
267 ratelimit++; 260 ratelimit++;
268 } 261 }
269 goto end; 262 goto end;
@@ -275,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle)
275 seq = read_seqbegin(&xtime_lock); 268 seq = read_seqbegin(&xtime_lock);
276 last_update = last_jiffies_update; 269 last_update = last_jiffies_update;
277 last_jiffies = jiffies; 270 last_jiffies = jiffies;
271 time_delta = timekeeping_max_deferment();
278 } while (read_seqretry(&xtime_lock, seq)); 272 } while (read_seqretry(&xtime_lock, seq));
279 273
280 /* Get the next timer wheel timer */ 274 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
281 next_jiffies = get_next_timer_interrupt(last_jiffies); 275 arch_needs_cpu(cpu)) {
282 delta_jiffies = next_jiffies - last_jiffies; 276 next_jiffies = last_jiffies + 1;
283
284 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
285 delta_jiffies = 1; 277 delta_jiffies = 1;
278 } else {
279 /* Get the next timer wheel timer */
280 next_jiffies = get_next_timer_interrupt(last_jiffies);
281 delta_jiffies = next_jiffies - last_jiffies;
282 }
286 /* 283 /*
287 * Do not stop the tick, if we are only one off 284 * Do not stop the tick, if we are only one off
288 * or if the cpu is required for rcu 285 * or if the cpu is required for rcu
@@ -294,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle)
294 if ((long)delta_jiffies >= 1) { 291 if ((long)delta_jiffies >= 1) {
295 292
296 /* 293 /*
297 * calculate the expiry time for the next timer wheel
298 * timer
299 */
300 expires = ktime_add_ns(last_update, tick_period.tv64 *
301 delta_jiffies);
302
303 /*
304 * If this cpu is the one which updates jiffies, then 294 * If this cpu is the one which updates jiffies, then
305 * give up the assignment and let it be taken by the 295 * give up the assignment and let it be taken by the
306 * cpu which runs the tick timer next, which might be 296 * cpu which runs the tick timer next, which might be
307 * this cpu as well. If we don't drop this here the 297 * this cpu as well. If we don't drop this here the
308 * jiffies might be stale and do_timer() never 298 * jiffies might be stale and do_timer() never
309 * invoked. 299 * invoked. Keep track of the fact that it was the one
300 * which had the do_timer() duty last. If this cpu is
301 * the one which had the do_timer() duty last, we
302 * limit the sleep time to the timekeeping
303 * max_deferement value which we retrieved
304 * above. Otherwise we can sleep as long as we want.
310 */ 305 */
311 if (cpu == tick_do_timer_cpu) 306 if (cpu == tick_do_timer_cpu) {
312 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 307 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
308 ts->do_timer_last = 1;
309 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
310 time_delta = KTIME_MAX;
311 ts->do_timer_last = 0;
312 } else if (!ts->do_timer_last) {
313 time_delta = KTIME_MAX;
314 }
315
316 /*
317 * calculate the expiry time for the next timer wheel
318 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
319 * that there is no timer pending or at least extremely
320 * far into the future (12 days for HZ=1000). In this
321 * case we set the expiry to the end of time.
322 */
323 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
324 /*
325 * Calculate the time delta for the next timer event.
326 * If the time delta exceeds the maximum time delta
327 * permitted by the current clocksource then adjust
328 * the time delta accordingly to ensure the
329 * clocksource does not wrap.
330 */
331 time_delta = min_t(u64, time_delta,
332 tick_period.tv64 * delta_jiffies);
333 }
334
335 if (time_delta < KTIME_MAX)
336 expires = ktime_add_ns(last_update, time_delta);
337 else
338 expires.tv64 = KTIME_MAX;
313 339
314 if (delta_jiffies > 1) 340 if (delta_jiffies > 1)
315 cpumask_set_cpu(cpu, nohz_cpu_mask); 341 cpumask_set_cpu(cpu, nohz_cpu_mask);
@@ -342,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle)
342 368
343 ts->idle_sleeps++; 369 ts->idle_sleeps++;
344 370
371 /* Mark expires */
372 ts->idle_expires = expires;
373
345 /* 374 /*
346 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that 375 * If the expiration time == KTIME_MAX, then
347 * there is no timer pending or at least extremly far 376 * in this case we simply stop the tick timer.
348 * into the future (12 days for HZ=1000). In this case
349 * we simply stop the tick timer:
350 */ 377 */
351 if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { 378 if (unlikely(expires.tv64 == KTIME_MAX)) {
352 ts->idle_expires.tv64 = KTIME_MAX;
353 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 379 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
354 hrtimer_cancel(&ts->sched_timer); 380 hrtimer_cancel(&ts->sched_timer);
355 goto out; 381 goto out;
356 } 382 }
357 383
358 /* Mark expiries */
359 ts->idle_expires = expires;
360
361 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 384 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
362 hrtimer_start(&ts->sched_timer, expires, 385 hrtimer_start(&ts->sched_timer, expires,
363 HRTIMER_MODE_ABS_PINNED); 386 HRTIMER_MODE_ABS_PINNED);
@@ -436,7 +459,11 @@ void tick_nohz_restart_sched_tick(void)
436 ktime_t now; 459 ktime_t now;
437 460
438 local_irq_disable(); 461 local_irq_disable();
439 tick_nohz_stop_idle(cpu); 462 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
463 now = ktime_get();
464
465 if (ts->idle_active)
466 tick_nohz_stop_idle(cpu, now);
440 467
441 if (!ts->inidle || !ts->tick_stopped) { 468 if (!ts->inidle || !ts->tick_stopped) {
442 ts->inidle = 0; 469 ts->inidle = 0;
@@ -450,7 +477,6 @@ void tick_nohz_restart_sched_tick(void)
450 477
451 /* Update jiffies first */ 478 /* Update jiffies first */
452 select_nohz_load_balancer(0); 479 select_nohz_load_balancer(0);
453 now = ktime_get();
454 tick_do_update_jiffies64(now); 480 tick_do_update_jiffies64(now);
455 cpumask_clear_cpu(cpu, nohz_cpu_mask); 481 cpumask_clear_cpu(cpu, nohz_cpu_mask);
456 482
@@ -584,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void)
584 * timer and do not touch the other magic bits which need to be done 610 * timer and do not touch the other magic bits which need to be done
585 * when idle is left. 611 * when idle is left.
586 */ 612 */
587static void tick_nohz_kick_tick(int cpu) 613static void tick_nohz_kick_tick(int cpu, ktime_t now)
588{ 614{
589#if 0 615#if 0
590 /* Switch back to 2.6.27 behaviour */ 616 /* Switch back to 2.6.27 behaviour */
591 617
592 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 618 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
593 ktime_t delta, now; 619 ktime_t delta;
594
595 if (!ts->tick_stopped)
596 return;
597 620
598 /* 621 /*
599 * Do not touch the tick device, when the next expiry is either 622 * Do not touch the tick device, when the next expiry is either
600 * already reached or less/equal than the tick period. 623 * already reached or less/equal than the tick period.
601 */ 624 */
602 now = ktime_get();
603 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); 625 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
604 if (delta.tv64 <= tick_period.tv64) 626 if (delta.tv64 <= tick_period.tv64)
605 return; 627 return;
@@ -608,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu)
608#endif 630#endif
609} 631}
610 632
633static inline void tick_check_nohz(int cpu)
634{
635 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
636 ktime_t now;
637
638 if (!ts->idle_active && !ts->tick_stopped)
639 return;
640 now = ktime_get();
641 if (ts->idle_active)
642 tick_nohz_stop_idle(cpu, now);
643 if (ts->tick_stopped) {
644 tick_nohz_update_jiffies(now);
645 tick_nohz_kick_tick(cpu, now);
646 }
647}
648
611#else 649#else
612 650
613static inline void tick_nohz_switch_to_nohz(void) { } 651static inline void tick_nohz_switch_to_nohz(void) { }
652static inline void tick_check_nohz(int cpu) { }
614 653
615#endif /* NO_HZ */ 654#endif /* NO_HZ */
616 655
@@ -620,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
620void tick_check_idle(int cpu) 659void tick_check_idle(int cpu)
621{ 660{
622 tick_check_oneshot_broadcast(cpu); 661 tick_check_oneshot_broadcast(cpu);
623#ifdef CONFIG_NO_HZ 662 tick_check_nohz(cpu);
624 tick_nohz_stop_idle(cpu);
625 tick_nohz_update_jiffies();
626 tick_nohz_kick_tick(cpu);
627#endif
628} 663}
629 664
630/* 665/*
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 96ff643a5a5..12f5c55090b 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -89,7 +89,7 @@ int timecompare_offset(struct timecompare *sync,
89 * source time 89 * source time
90 */ 90 */
91 sample.offset = 91 sample.offset =
92 ktime_to_ns(ktime_add(end, start)) / 2 - 92 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
93 ts; 93 ts;
94 94
95 /* simple insertion sort based on duration */ 95 /* simple insertion sort based on duration */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3a4e2907ea..e2ab064c6d4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -177,7 +177,7 @@ void timekeeping_leap_insert(int leapsecond)
177{ 177{
178 xtime.tv_sec += leapsecond; 178 xtime.tv_sec += leapsecond;
179 wall_to_monotonic.tv_sec -= leapsecond; 179 wall_to_monotonic.tv_sec -= leapsecond;
180 update_vsyscall(&xtime, timekeeper.clock); 180 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
181} 181}
182 182
183#ifdef CONFIG_GENERIC_TIME 183#ifdef CONFIG_GENERIC_TIME
@@ -337,7 +337,7 @@ int do_settimeofday(struct timespec *tv)
337 timekeeper.ntp_error = 0; 337 timekeeper.ntp_error = 0;
338 ntp_clear(); 338 ntp_clear();
339 339
340 update_vsyscall(&xtime, timekeeper.clock); 340 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
341 341
342 write_sequnlock_irqrestore(&xtime_lock, flags); 342 write_sequnlock_irqrestore(&xtime_lock, flags);
343 343
@@ -488,6 +488,17 @@ int timekeeping_valid_for_hres(void)
488} 488}
489 489
490/** 490/**
491 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
492 *
493 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
494 * ensure that the clocksource does not change!
495 */
496u64 timekeeping_max_deferment(void)
497{
498 return timekeeper.clock->max_idle_ns;
499}
500
501/**
491 * read_persistent_clock - Return time from the persistent clock. 502 * read_persistent_clock - Return time from the persistent clock.
492 * 503 *
493 * Weak dummy function for arches that do not yet support it. 504 * Weak dummy function for arches that do not yet support it.
@@ -722,6 +733,51 @@ static void timekeeping_adjust(s64 offset)
722 timekeeper.ntp_error_shift; 733 timekeeper.ntp_error_shift;
723} 734}
724 735
736
737/**
738 * logarithmic_accumulation - shifted accumulation of cycles
739 *
740 * This functions accumulates a shifted interval of cycles into
741 * into a shifted interval nanoseconds. Allows for O(log) accumulation
742 * loop.
743 *
744 * Returns the unconsumed cycles.
745 */
746static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
747{
748 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
749
750 /* If the offset is smaller then a shifted interval, do nothing */
751 if (offset < timekeeper.cycle_interval<<shift)
752 return offset;
753
754 /* Accumulate one shifted interval */
755 offset -= timekeeper.cycle_interval << shift;
756 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
757
758 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
759 while (timekeeper.xtime_nsec >= nsecps) {
760 timekeeper.xtime_nsec -= nsecps;
761 xtime.tv_sec++;
762 second_overflow();
763 }
764
765 /* Accumulate into raw time */
766 raw_time.tv_nsec += timekeeper.raw_interval << shift;;
767 while (raw_time.tv_nsec >= NSEC_PER_SEC) {
768 raw_time.tv_nsec -= NSEC_PER_SEC;
769 raw_time.tv_sec++;
770 }
771
772 /* Accumulate error between NTP and clock interval */
773 timekeeper.ntp_error += tick_length << shift;
774 timekeeper.ntp_error -= timekeeper.xtime_interval <<
775 (timekeeper.ntp_error_shift + shift);
776
777 return offset;
778}
779
780
725/** 781/**
726 * update_wall_time - Uses the current clocksource to increment the wall time 782 * update_wall_time - Uses the current clocksource to increment the wall time
727 * 783 *
@@ -732,6 +788,7 @@ void update_wall_time(void)
732 struct clocksource *clock; 788 struct clocksource *clock;
733 cycle_t offset; 789 cycle_t offset;
734 u64 nsecs; 790 u64 nsecs;
791 int shift = 0, maxshift;
735 792
736 /* Make sure we're fully resumed: */ 793 /* Make sure we're fully resumed: */
737 if (unlikely(timekeeping_suspended)) 794 if (unlikely(timekeeping_suspended))
@@ -745,33 +802,22 @@ void update_wall_time(void)
745#endif 802#endif
746 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 803 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
747 804
748 /* normally this loop will run just once, however in the 805 /*
749 * case of lost or late ticks, it will accumulate correctly. 806 * With NO_HZ we may have to accumulate many cycle_intervals
807 * (think "ticks") worth of time at once. To do this efficiently,
808 * we calculate the largest doubling multiple of cycle_intervals
809 * that is smaller then the offset. We then accumulate that
810 * chunk in one go, and then try to consume the next smaller
811 * doubled multiple.
750 */ 812 */
813 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
814 shift = max(0, shift);
815 /* Bound shift to one less then what overflows tick_length */
816 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
817 shift = min(shift, maxshift);
751 while (offset >= timekeeper.cycle_interval) { 818 while (offset >= timekeeper.cycle_interval) {
752 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 819 offset = logarithmic_accumulation(offset, shift);
753 820 shift--;
754 /* accumulate one interval */
755 offset -= timekeeper.cycle_interval;
756 clock->cycle_last += timekeeper.cycle_interval;
757
758 timekeeper.xtime_nsec += timekeeper.xtime_interval;
759 if (timekeeper.xtime_nsec >= nsecps) {
760 timekeeper.xtime_nsec -= nsecps;
761 xtime.tv_sec++;
762 second_overflow();
763 }
764
765 raw_time.tv_nsec += timekeeper.raw_interval;
766 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
767 raw_time.tv_nsec -= NSEC_PER_SEC;
768 raw_time.tv_sec++;
769 }
770
771 /* accumulate error between NTP and clock interval */
772 timekeeper.ntp_error += tick_length;
773 timekeeper.ntp_error -= timekeeper.xtime_interval <<
774 timekeeper.ntp_error_shift;
775 } 821 }
776 822
777 /* correct the clock when NTP error is too big */ 823 /* correct the clock when NTP error is too big */
@@ -811,7 +857,7 @@ void update_wall_time(void)
811 update_xtime_cache(nsecs); 857 update_xtime_cache(nsecs);
812 858
813 /* check to see if there is a new clocksource to use */ 859 /* check to see if there is a new clocksource to use */
814 update_vsyscall(&xtime, timekeeper.clock); 860 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
815} 861}
816 862
817/** 863/**
@@ -834,6 +880,7 @@ void getboottime(struct timespec *ts)
834 880
835 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
836} 882}
883EXPORT_SYMBOL_GPL(getboottime);
837 884
838/** 885/**
839 * monotonic_to_bootbased - Convert the monotonic time to boot based. 886 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -843,6 +890,7 @@ void monotonic_to_bootbased(struct timespec *ts)
843{ 890{
844 *ts = timespec_add_safe(*ts, total_sleep_time); 891 *ts = timespec_add_safe(*ts, total_sleep_time);
845} 892}
893EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
846 894
847unsigned long get_seconds(void) 895unsigned long get_seconds(void)
848{ 896{
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1b5b7aa2fdf..bdfb8dd1050 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = base->first;
90 /* 90 /*
@@ -100,13 +100,13 @@ next_one:
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = rb_entry(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
105 print_timer(m, timer, &tmp, i, now); 105 print_timer(m, timer, &tmp, i, now);
106 next++; 106 next++;
107 goto next_one; 107 goto next_one;
108 } 108 }
109 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 109 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
110} 110}
111 111
112static void 112static void
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 P_ns(expires_next); 150 P_ns(expires_next);
151 P(hres_active); 151 P(hres_active);
152 P(nr_events); 152 P(nr_events);
153 P(nr_retries);
154 P(nr_hangs);
155 P_ns(max_hang_time);
153#endif 156#endif
154#undef P 157#undef P
155#undef P_ns 158#undef P_ns
@@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
204 return; 207 return;
205 } 208 }
206 SEQ_printf(m, "%s\n", dev->name); 209 SEQ_printf(m, "%s\n", dev->name);
207 SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); 210 SEQ_printf(m, " max_delta_ns: %llu\n",
208 SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); 211 (unsigned long long) dev->max_delta_ns);
209 SEQ_printf(m, " mult: %lu\n", dev->mult); 212 SEQ_printf(m, " min_delta_ns: %llu\n",
210 SEQ_printf(m, " shift: %d\n", dev->shift); 213 (unsigned long long) dev->min_delta_ns);
214 SEQ_printf(m, " mult: %u\n", dev->mult);
215 SEQ_printf(m, " shift: %u\n", dev->shift);
211 SEQ_printf(m, " mode: %d\n", dev->mode); 216 SEQ_printf(m, " mode: %d\n", dev->mode);
212 SEQ_printf(m, " next_event: %Ld nsecs\n", 217 SEQ_printf(m, " next_event: %Ld nsecs\n",
213 (unsigned long long) ktime_to_ns(dev->next_event)); 218 (unsigned long long) ktime_to_ns(dev->next_event));
@@ -232,10 +237,10 @@ static void timer_list_show_tickdevices(struct seq_file *m)
232#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 237#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
233 print_tickdevice(m, tick_get_broadcast_device(), -1); 238 print_tickdevice(m, tick_get_broadcast_device(), -1);
234 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 239 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
235 tick_get_broadcast_mask()->bits[0]); 240 cpumask_bits(tick_get_broadcast_mask())[0]);
236#ifdef CONFIG_TICK_ONESHOT 241#ifdef CONFIG_TICK_ONESHOT
237 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", 242 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
238 tick_get_broadcast_oneshot_mask()->bits[0]); 243 cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
239#endif 244#endif
240 SEQ_printf(m, "\n"); 245 SEQ_printf(m, "\n");
241#endif 246#endif
@@ -252,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
252 u64 now = ktime_to_ns(ktime_get()); 257 u64 now = ktime_to_ns(ktime_get());
253 int cpu; 258 int cpu;
254 259
255 SEQ_printf(m, "Timer List Version: v0.4\n"); 260 SEQ_printf(m, "Timer List Version: v0.5\n");
256 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
257 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
258 263
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index ee5681f8d7e..2f3b585b8d7 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock);
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
88 */ 88 */
89static DEFINE_PER_CPU(spinlock_t, lookup_lock); 89static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
90 90
91/* 91/*
92 * Mutex to serialize state changes with show-stats activities: 92 * Mutex to serialize state changes with show-stats activities:
@@ -238,14 +238,14 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesnt matter which lock we take:
240 */ 240 */
241 spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!timer_stats_active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
249 249
250 input.timer = timer; 250 input.timer = timer;
251 input.start_func = startf; 251 input.start_func = startf;
@@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
253 input.pid = pid; 253 input.pid = pid;
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 raw_spin_lock_irqsave(lock, flags);
257 if (!timer_stats_active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
@@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
264 atomic_inc(&overflow_count); 264 atomic_inc(&overflow_count);
265 265
266 out_unlock: 266 out_unlock:
267 spin_unlock_irqrestore(lock, flags); 267 raw_spin_unlock_irqrestore(lock, flags);
268} 268}
269 269
270static void print_name_offset(struct seq_file *m, unsigned long addr) 270static void print_name_offset(struct seq_file *m, unsigned long addr)
@@ -348,9 +348,11 @@ static void sync_access(void)
348 int cpu; 348 int cpu;
349 349
350 for_each_online_cpu(cpu) { 350 for_each_online_cpu(cpu) {
351 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); 351 raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
352
353 raw_spin_lock_irqsave(lock, flags);
352 /* nothing */ 354 /* nothing */
353 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); 355 raw_spin_unlock_irqrestore(lock, flags);
354 } 356 }
355} 357}
356 358
@@ -408,7 +410,7 @@ void __init init_timer_stats(void)
408 int cpu; 410 int cpu;
409 411
410 for_each_possible_cpu(cpu) 412 for_each_possible_cpu(cpu)
411 spin_lock_init(&per_cpu(lookup_lock, cpu)); 413 raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
412} 414}
413 415
414static int __init init_tstats_procfs(void) 416static int __init init_tstats_procfs(void)
diff --git a/kernel/timer.c b/kernel/timer.c
index 5db5a8d2681..c61a7949387 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -656,8 +656,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
656 656
657 debug_activate(timer, expires); 657 debug_activate(timer, expires);
658 658
659 new_base = __get_cpu_var(tvec_bases);
660
661 cpu = smp_processor_id(); 659 cpu = smp_processor_id();
662 660
663#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 661#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
@@ -1200,6 +1198,7 @@ void update_process_times(int user_tick)
1200 run_local_timers(); 1198 run_local_timers();
1201 rcu_check_callbacks(cpu, user_tick); 1199 rcu_check_callbacks(cpu, user_tick);
1202 printk_tick(); 1200 printk_tick();
1201 perf_event_do_pending();
1203 scheduler_tick(); 1202 scheduler_tick();
1204 run_posix_cpu_timers(p); 1203 run_posix_cpu_timers(p);
1205} 1204}
@@ -1211,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h)
1211{ 1210{
1212 struct tvec_base *base = __get_cpu_var(tvec_bases); 1211 struct tvec_base *base = __get_cpu_var(tvec_bases);
1213 1212
1214 perf_event_do_pending();
1215
1216 hrtimer_run_pending(); 1213 hrtimer_run_pending();
1217 1214
1218 if (time_after_eq(jiffies, base->timer_jiffies)) 1215 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d006554888d..60e2ce0181e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,39 +12,37 @@ config NOP_TRACER
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help 14 help
15 See Documentation/trace/ftrace-implementation.txt 15 See Documentation/trace/ftrace-design.txt
16 16
17config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
18 bool 18 bool
19 help 19 help
20 See Documentation/trace/ftrace-implementation.txt 20 See Documentation/trace/ftrace-design.txt
21 21
22config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
23 bool 23 bool
24 help 24 help
25 See Documentation/trace/ftrace-implementation.txt 25 See Documentation/trace/ftrace-design.txt
26 26
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
36 help 34 help
37 See Documentation/trace/ftrace-implementation.txt 35 See Documentation/trace/ftrace-design.txt
38 36
39config HAVE_DYNAMIC_FTRACE 37config HAVE_DYNAMIC_FTRACE
40 bool 38 bool
41 help 39 help
42 See Documentation/trace/ftrace-implementation.txt 40 See Documentation/trace/ftrace-design.txt
43 41
44config HAVE_FTRACE_MCOUNT_RECORD 42config HAVE_FTRACE_MCOUNT_RECORD
45 bool 43 bool
46 help 44 help
47 See Documentation/trace/ftrace-implementation.txt 45 See Documentation/trace/ftrace-design.txt
48 46
49config HAVE_HW_BRANCH_TRACER 47config HAVE_HW_BRANCH_TRACER
50 bool 48 bool
@@ -52,7 +50,7 @@ config HAVE_HW_BRANCH_TRACER
52config HAVE_SYSCALL_TRACEPOINTS 50config HAVE_SYSCALL_TRACEPOINTS
53 bool 51 bool
54 help 52 help
55 See Documentation/trace/ftrace-implementation.txt 53 See Documentation/trace/ftrace-design.txt
56 54
57config TRACER_MAX_TRACE 55config TRACER_MAX_TRACE
58 bool 56 bool
@@ -83,7 +81,7 @@ config RING_BUFFER_ALLOW_SWAP
83# This allows those options to appear when no other tracer is selected. But the 81# This allows those options to appear when no other tracer is selected. But the
84# options do not appear when something else selects it. We need the two options 82# options do not appear when something else selects it. We need the two options
85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 83# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
86# hidding of the automatic options. 84# hiding of the automatic options.
87 85
88config TRACING 86config TRACING
89 bool 87 bool
@@ -119,7 +117,7 @@ menuconfig FTRACE
119 bool "Tracers" 117 bool "Tracers"
120 default y if DEBUG_KERNEL 118 default y if DEBUG_KERNEL
121 help 119 help
122 Enable the kernel tracing infrastructure. 120 Enable the kernel tracing infrastructure.
123 121
124if FTRACE 122if FTRACE
125 123
@@ -133,7 +131,7 @@ config FUNCTION_TRACER
133 help 131 help
134 Enable the kernel to trace every kernel function. This is done 132 Enable the kernel to trace every kernel function. This is done
135 by using a compiler feature to insert a small, 5-byte No-Operation 133 by using a compiler feature to insert a small, 5-byte No-Operation
136 instruction to the beginning of every kernel function, which NOP 134 instruction at the beginning of every kernel function, which NOP
137 sequence is then dynamically patched into a tracer call when 135 sequence is then dynamically patched into a tracer call when
138 tracing is enabled by the administrator. If it's runtime disabled 136 tracing is enabled by the administrator. If it's runtime disabled
139 (the bootup default), then the overhead of the instructions is very 137 (the bootup default), then the overhead of the instructions is very
@@ -150,7 +148,7 @@ config FUNCTION_GRAPH_TRACER
150 and its entry. 148 and its entry.
151 Its first purpose is to trace the duration of functions and 149 Its first purpose is to trace the duration of functions and
152 draw a call graph for each thread with some information like 150 draw a call graph for each thread with some information like
153 the return value. This is done by setting the current return 151 the return value. This is done by setting the current return
154 address on the current task structure into a stack of calls. 152 address on the current task structure into a stack of calls.
155 153
156 154
@@ -173,7 +171,7 @@ config IRQSOFF_TRACER
173 171
174 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 172 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
175 173
176 (Note that kernel size and overhead increases with this option 174 (Note that kernel size and overhead increase with this option
177 enabled. This option and the preempt-off timing option can be 175 enabled. This option and the preempt-off timing option can be
178 used together or separately.) 176 used together or separately.)
179 177
@@ -186,7 +184,7 @@ config PREEMPT_TRACER
186 select TRACER_MAX_TRACE 184 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP 185 select RING_BUFFER_ALLOW_SWAP
188 help 186 help
189 This option measures the time spent in preemption off critical 187 This option measures the time spent in preemption-off critical
190 sections, with microsecond accuracy. 188 sections, with microsecond accuracy.
191 189
192 The default measurement method is a maximum search, which is 190 The default measurement method is a maximum search, which is
@@ -195,7 +193,7 @@ config PREEMPT_TRACER
195 193
196 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 194 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
197 195
198 (Note that kernel size and overhead increases with this option 196 (Note that kernel size and overhead increase with this option
199 enabled. This option and the irqs-off timing option can be 197 enabled. This option and the irqs-off timing option can be
200 used together or separately.) 198 used together or separately.)
201 199
@@ -222,7 +220,7 @@ config ENABLE_DEFAULT_TRACERS
222 depends on !GENERIC_TRACER 220 depends on !GENERIC_TRACER
223 select TRACING 221 select TRACING
224 help 222 help
225 This tracer hooks to various trace points in the kernel 223 This tracer hooks to various trace points in the kernel,
226 allowing the user to pick and choose which trace point they 224 allowing the user to pick and choose which trace point they
227 want to trace. It also includes the sched_switch tracer plugin. 225 want to trace. It also includes the sched_switch tracer plugin.
228 226
@@ -265,19 +263,19 @@ choice
265 The likely/unlikely profiler only looks at the conditions that 263 The likely/unlikely profiler only looks at the conditions that
266 are annotated with a likely or unlikely macro. 264 are annotated with a likely or unlikely macro.
267 265
268 The "all branch" profiler will profile every if statement in the 266 The "all branch" profiler will profile every if-statement in the
269 kernel. This profiler will also enable the likely/unlikely 267 kernel. This profiler will also enable the likely/unlikely
270 profiler as well. 268 profiler.
271 269
272 Either of the above profilers add a bit of overhead to the system. 270 Either of the above profilers adds a bit of overhead to the system.
273 If unsure choose "No branch profiling". 271 If unsure, choose "No branch profiling".
274 272
275config BRANCH_PROFILE_NONE 273config BRANCH_PROFILE_NONE
276 bool "No branch profiling" 274 bool "No branch profiling"
277 help 275 help
278 No branch profiling. Branch profiling adds a bit of overhead. 276 No branch profiling. Branch profiling adds a bit of overhead.
279 Only enable it if you want to analyse the branching behavior. 277 Only enable it if you want to analyse the branching behavior.
280 Otherwise keep it disabled. 278 Otherwise keep it disabled.
281 279
282config PROFILE_ANNOTATED_BRANCHES 280config PROFILE_ANNOTATED_BRANCHES
283 bool "Trace likely/unlikely profiler" 281 bool "Trace likely/unlikely profiler"
@@ -288,7 +286,7 @@ config PROFILE_ANNOTATED_BRANCHES
288 286
289 /sys/kernel/debug/tracing/profile_annotated_branch 287 /sys/kernel/debug/tracing/profile_annotated_branch
290 288
291 Note: this will add a significant overhead, only turn this 289 Note: this will add a significant overhead; only turn this
292 on if you need to profile the system's use of these macros. 290 on if you need to profile the system's use of these macros.
293 291
294config PROFILE_ALL_BRANCHES 292config PROFILE_ALL_BRANCHES
@@ -305,7 +303,7 @@ config PROFILE_ALL_BRANCHES
305 303
306 This configuration, when enabled, will impose a great overhead 304 This configuration, when enabled, will impose a great overhead
307 on the system. This should only be enabled when the system 305 on the system. This should only be enabled when the system
308 is to be analyzed 306 is to be analyzed in much detail.
309endchoice 307endchoice
310 308
311config TRACING_BRANCHES 309config TRACING_BRANCHES
@@ -335,7 +333,7 @@ config POWER_TRACER
335 depends on X86 333 depends on X86
336 select GENERIC_TRACER 334 select GENERIC_TRACER
337 help 335 help
338 This tracer helps developers to analyze and optimize the kernels 336 This tracer helps developers to analyze and optimize the kernel's
339 power management decisions, specifically the C-state and P-state 337 power management decisions, specifically the C-state and P-state
340 behavior. 338 behavior.
341 339
@@ -391,14 +389,14 @@ config HW_BRANCH_TRACER
391 select GENERIC_TRACER 389 select GENERIC_TRACER
392 help 390 help
393 This tracer records all branches on the system in a circular 391 This tracer records all branches on the system in a circular
394 buffer giving access to the last N branches for each cpu. 392 buffer, giving access to the last N branches for each cpu.
395 393
396config KMEMTRACE 394config KMEMTRACE
397 bool "Trace SLAB allocations" 395 bool "Trace SLAB allocations"
398 select GENERIC_TRACER 396 select GENERIC_TRACER
399 help 397 help
400 kmemtrace provides tracing for slab allocator functions, such as 398 kmemtrace provides tracing for slab allocator functions, such as
401 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 399 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
402 data is then fed to the userspace application in order to analyse 400 data is then fed to the userspace application in order to analyse
403 allocation hotspots, internal fragmentation and so on, making it 401 allocation hotspots, internal fragmentation and so on, making it
404 possible to see how well an allocator performs, as well as debug 402 possible to see how well an allocator performs, as well as debug
@@ -417,15 +415,15 @@ config WORKQUEUE_TRACER
417 bool "Trace workqueues" 415 bool "Trace workqueues"
418 select GENERIC_TRACER 416 select GENERIC_TRACER
419 help 417 help
420 The workqueue tracer provides some statistical informations 418 The workqueue tracer provides some statistical information
421 about each cpu workqueue thread such as the number of the 419 about each cpu workqueue thread such as the number of the
422 works inserted and executed since their creation. It can help 420 works inserted and executed since their creation. It can help
423 to evaluate the amount of work each of them have to perform. 421 to evaluate the amount of work each of them has to perform.
424 For example it can help a developer to decide whether he should 422 For example it can help a developer to decide whether he should
425 choose a per cpu workqueue instead of a singlethreaded one. 423 choose a per-cpu workqueue instead of a singlethreaded one.
426 424
427config BLK_DEV_IO_TRACE 425config BLK_DEV_IO_TRACE
428 bool "Support for tracing block io actions" 426 bool "Support for tracing block IO actions"
429 depends on SYSFS 427 depends on SYSFS
430 depends on BLOCK 428 depends on BLOCK
431 select RELAY 429 select RELAY
@@ -456,15 +454,15 @@ config KPROBE_EVENT
456 select TRACING 454 select TRACING
457 default y 455 default y
458 help 456 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly 457 This allows the user to add tracing events (similar to tracepoints)
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt 458 on the fly via the ftrace interface. See
461 for more details. 459 Documentation/trace/kprobetrace.txt for more details.
462 460
463 Those events can be inserted wherever kprobes can probe, and record 461 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values. 462 various register and memory values.
465 463
466 This option is also required by perf-probe subcommand of perf tools. If 464 This option is also required by perf-probe subcommand of perf tools.
467 you want to use perf tools, this option is strongly recommended. 465 If you want to use perf tools, this option is strongly recommended.
468 466
469config DYNAMIC_FTRACE 467config DYNAMIC_FTRACE
470 bool "enable/disable ftrace tracepoints dynamically" 468 bool "enable/disable ftrace tracepoints dynamically"
@@ -472,32 +470,32 @@ config DYNAMIC_FTRACE
472 depends on HAVE_DYNAMIC_FTRACE 470 depends on HAVE_DYNAMIC_FTRACE
473 default y 471 default y
474 help 472 help
475 This option will modify all the calls to ftrace dynamically 473 This option will modify all the calls to ftrace dynamically
476 (will patch them out of the binary image and replaces them 474 (will patch them out of the binary image and replace them
477 with a No-Op instruction) as they are called. A table is 475 with a No-Op instruction) as they are called. A table is
478 created to dynamically enable them again. 476 created to dynamically enable them again.
479 477
480 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise 478 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
481 has native performance as long as no tracing is active. 479 otherwise has native performance as long as no tracing is active.
482 480
483 The changes to the code are done by a kernel thread that 481 The changes to the code are done by a kernel thread that
484 wakes up once a second and checks to see if any ftrace calls 482 wakes up once a second and checks to see if any ftrace calls
485 were made. If so, it runs stop_machine (stops all CPUS) 483 were made. If so, it runs stop_machine (stops all CPUS)
486 and modifies the code to jump over the call to ftrace. 484 and modifies the code to jump over the call to ftrace.
487 485
488config FUNCTION_PROFILER 486config FUNCTION_PROFILER
489 bool "Kernel function profiler" 487 bool "Kernel function profiler"
490 depends on FUNCTION_TRACER 488 depends on FUNCTION_TRACER
491 default n 489 default n
492 help 490 help
493 This option enables the kernel function profiler. A file is created 491 This option enables the kernel function profiler. A file is created
494 in debugfs called function_profile_enabled which defaults to zero. 492 in debugfs called function_profile_enabled which defaults to zero.
495 When a 1 is echoed into this file profiling begins, and when a 493 When a 1 is echoed into this file profiling begins, and when a
496 zero is entered, profiling stops. A file in the trace_stats 494 zero is entered, profiling stops. A "functions" file is created in
497 directory called functions, that show the list of functions that 495 the trace_stats directory; this file shows the list of functions that
498 have been hit and their counters. 496 have been hit and their counters.
499 497
500 If in doubt, say N 498 If in doubt, say N.
501 499
502config FTRACE_MCOUNT_RECORD 500config FTRACE_MCOUNT_RECORD
503 def_bool y 501 def_bool y
@@ -556,8 +554,8 @@ config RING_BUFFER_BENCHMARK
556 tristate "Ring buffer benchmark stress tester" 554 tristate "Ring buffer benchmark stress tester"
557 depends on RING_BUFFER 555 depends on RING_BUFFER
558 help 556 help
559 This option creates a test to stress the ring buffer and bench mark it. 557 This option creates a test to stress the ring buffer and benchmark it.
560 It creates its own ring buffer such that it will not interfer with 558 It creates its own ring buffer such that it will not interfere with
561 any other users of the ring buffer (such as ftrace). It then creates 559 any other users of the ring buffer (such as ftrace). It then creates
562 a producer and consumer that will run for 10 seconds and sleep for 560 a producer and consumer that will run for 10 seconds and sleep for
563 10 seconds. Each interval it will print out the number of events 561 10 seconds. Each interval it will print out the number of events
@@ -566,7 +564,7 @@ config RING_BUFFER_BENCHMARK
566 It does not disable interrupts or raise its priority, so it may be 564 It does not disable interrupts or raise its priority, so it may be
567 affected by processes that are running. 565 affected by processes that are running.
568 566
569 If unsure, say N 567 If unsure, say N.
570 568
571endif # FTRACE 569endif # FTRACE
572 570
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e51a1bcb7be..1e6640f8045 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1690,7 +1690,7 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1690static int ftrace_match(char *str, char *regex, int len, int type) 1690static int ftrace_match(char *str, char *regex, int len, int type)
1691{ 1691{
1692 int matched = 0; 1692 int matched = 0;
1693 char *ptr; 1693 int slen;
1694 1694
1695 switch (type) { 1695 switch (type) {
1696 case MATCH_FULL: 1696 case MATCH_FULL:
@@ -1706,8 +1706,8 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1706 matched = 1; 1706 matched = 1;
1707 break; 1707 break;
1708 case MATCH_END_ONLY: 1708 case MATCH_END_ONLY:
1709 ptr = strstr(str, regex); 1709 slen = strlen(str);
1710 if (ptr && (ptr[len] == 0)) 1710 if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
1711 matched = 1; 1711 matched = 1;
1712 break; 1712 break;
1713 } 1713 }
@@ -1724,7 +1724,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
1724 return ftrace_match(str, regex, len, type); 1724 return ftrace_match(str, regex, len, type);
1725} 1725}
1726 1726
1727static void ftrace_match_records(char *buff, int len, int enable) 1727static int ftrace_match_records(char *buff, int len, int enable)
1728{ 1728{
1729 unsigned int search_len; 1729 unsigned int search_len;
1730 struct ftrace_page *pg; 1730 struct ftrace_page *pg;
@@ -1733,6 +1733,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1733 char *search; 1733 char *search;
1734 int type; 1734 int type;
1735 int not; 1735 int not;
1736 int found = 0;
1736 1737
1737 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1738 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1738 type = filter_parse_regex(buff, len, &search, &not); 1739 type = filter_parse_regex(buff, len, &search, &not);
@@ -1750,6 +1751,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1750 rec->flags &= ~flag; 1751 rec->flags &= ~flag;
1751 else 1752 else
1752 rec->flags |= flag; 1753 rec->flags |= flag;
1754 found = 1;
1753 } 1755 }
1754 /* 1756 /*
1755 * Only enable filtering if we have a function that 1757 * Only enable filtering if we have a function that
@@ -1759,6 +1761,8 @@ static void ftrace_match_records(char *buff, int len, int enable)
1759 ftrace_filtered = 1; 1761 ftrace_filtered = 1;
1760 } while_for_each_ftrace_rec(); 1762 } while_for_each_ftrace_rec();
1761 mutex_unlock(&ftrace_lock); 1763 mutex_unlock(&ftrace_lock);
1764
1765 return found;
1762} 1766}
1763 1767
1764static int 1768static int
@@ -1780,7 +1784,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
1780 return 1; 1784 return 1;
1781} 1785}
1782 1786
1783static void ftrace_match_module_records(char *buff, char *mod, int enable) 1787static int ftrace_match_module_records(char *buff, char *mod, int enable)
1784{ 1788{
1785 unsigned search_len = 0; 1789 unsigned search_len = 0;
1786 struct ftrace_page *pg; 1790 struct ftrace_page *pg;
@@ -1789,6 +1793,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1789 char *search = buff; 1793 char *search = buff;
1790 unsigned long flag; 1794 unsigned long flag;
1791 int not = 0; 1795 int not = 0;
1796 int found = 0;
1792 1797
1793 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1798 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1794 1799
@@ -1819,12 +1824,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1819 rec->flags &= ~flag; 1824 rec->flags &= ~flag;
1820 else 1825 else
1821 rec->flags |= flag; 1826 rec->flags |= flag;
1827 found = 1;
1822 } 1828 }
1823 if (enable && (rec->flags & FTRACE_FL_FILTER)) 1829 if (enable && (rec->flags & FTRACE_FL_FILTER))
1824 ftrace_filtered = 1; 1830 ftrace_filtered = 1;
1825 1831
1826 } while_for_each_ftrace_rec(); 1832 } while_for_each_ftrace_rec();
1827 mutex_unlock(&ftrace_lock); 1833 mutex_unlock(&ftrace_lock);
1834
1835 return found;
1828} 1836}
1829 1837
1830/* 1838/*
@@ -1853,8 +1861,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1853 if (!strlen(mod)) 1861 if (!strlen(mod))
1854 return -EINVAL; 1862 return -EINVAL;
1855 1863
1856 ftrace_match_module_records(func, mod, enable); 1864 if (ftrace_match_module_records(func, mod, enable))
1857 return 0; 1865 return 0;
1866 return -EINVAL;
1858} 1867}
1859 1868
1860static struct ftrace_func_command ftrace_mod_cmd = { 1869static struct ftrace_func_command ftrace_mod_cmd = {
@@ -2151,8 +2160,9 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2151 func = strsep(&next, ":"); 2160 func = strsep(&next, ":");
2152 2161
2153 if (!next) { 2162 if (!next) {
2154 ftrace_match_records(func, len, enable); 2163 if (ftrace_match_records(func, len, enable))
2155 return 0; 2164 return 0;
2165 return ret;
2156 } 2166 }
2157 2167
2158 /* command found */ 2168 /* command found */
@@ -2198,10 +2208,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2198 !trace_parser_cont(parser)) { 2208 !trace_parser_cont(parser)) {
2199 ret = ftrace_process_regex(parser->buffer, 2209 ret = ftrace_process_regex(parser->buffer,
2200 parser->idx, enable); 2210 parser->idx, enable);
2211 trace_parser_clear(parser);
2201 if (ret) 2212 if (ret)
2202 goto out_unlock; 2213 goto out_unlock;
2203
2204 trace_parser_clear(parser);
2205 } 2214 }
2206 2215
2207 ret = read; 2216 ret = read;
@@ -2543,10 +2552,9 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2543 exists = true; 2552 exists = true;
2544 break; 2553 break;
2545 } 2554 }
2546 if (!exists) { 2555 if (!exists)
2547 array[(*idx)++] = rec->ip; 2556 array[(*idx)++] = rec->ip;
2548 found = 1; 2557 found = 1;
2549 }
2550 } 2558 }
2551 } while_for_each_ftrace_rec(); 2559 } while_for_each_ftrace_rec();
2552 2560
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index e06c6e3d56a..9f4f565b01e 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -14,7 +14,5 @@
14#define CREATE_TRACE_POINTS 14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16 16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 17EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20 18
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a1ca4956ab5..8c1b2d29071 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -423,7 +423,7 @@ struct ring_buffer_per_cpu {
423 int cpu; 423 int cpu;
424 struct ring_buffer *buffer; 424 struct ring_buffer *buffer;
425 spinlock_t reader_lock; /* serialize readers */ 425 spinlock_t reader_lock; /* serialize readers */
426 raw_spinlock_t lock; 426 arch_spinlock_t lock;
427 struct lock_class_key lock_key; 427 struct lock_class_key lock_key;
428 struct list_head *pages; 428 struct list_head *pages;
429 struct buffer_page *head_page; /* read from head */ 429 struct buffer_page *head_page; /* read from head */
@@ -464,6 +464,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 464 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 465 unsigned long head;
466 struct buffer_page *head_page; 466 struct buffer_page *head_page;
467 struct buffer_page *cache_reader_page;
468 unsigned long cache_read;
467 u64 read_stamp; 469 u64 read_stamp;
468}; 470};
469 471
@@ -998,7 +1000,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
998 cpu_buffer->buffer = buffer; 1000 cpu_buffer->buffer = buffer;
999 spin_lock_init(&cpu_buffer->reader_lock); 1001 spin_lock_init(&cpu_buffer->reader_lock);
1000 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1002 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1001 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1003 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1002 1004
1003 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1005 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1004 GFP_KERNEL, cpu_to_node(cpu)); 1006 GFP_KERNEL, cpu_to_node(cpu));
@@ -1193,9 +1195,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1193 struct list_head *p; 1195 struct list_head *p;
1194 unsigned i; 1196 unsigned i;
1195 1197
1196 atomic_inc(&cpu_buffer->record_disabled);
1197 synchronize_sched();
1198
1199 spin_lock_irq(&cpu_buffer->reader_lock); 1198 spin_lock_irq(&cpu_buffer->reader_lock);
1200 rb_head_page_deactivate(cpu_buffer); 1199 rb_head_page_deactivate(cpu_buffer);
1201 1200
@@ -1211,12 +1210,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1211 return; 1210 return;
1212 1211
1213 rb_reset_cpu(cpu_buffer); 1212 rb_reset_cpu(cpu_buffer);
1214 spin_unlock_irq(&cpu_buffer->reader_lock);
1215
1216 rb_check_pages(cpu_buffer); 1213 rb_check_pages(cpu_buffer);
1217 1214
1218 atomic_dec(&cpu_buffer->record_disabled); 1215 spin_unlock_irq(&cpu_buffer->reader_lock);
1219
1220} 1216}
1221 1217
1222static void 1218static void
@@ -1227,9 +1223,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1227 struct list_head *p; 1223 struct list_head *p;
1228 unsigned i; 1224 unsigned i;
1229 1225
1230 atomic_inc(&cpu_buffer->record_disabled);
1231 synchronize_sched();
1232
1233 spin_lock_irq(&cpu_buffer->reader_lock); 1226 spin_lock_irq(&cpu_buffer->reader_lock);
1234 rb_head_page_deactivate(cpu_buffer); 1227 rb_head_page_deactivate(cpu_buffer);
1235 1228
@@ -1242,11 +1235,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1242 list_add_tail(&bpage->list, cpu_buffer->pages); 1235 list_add_tail(&bpage->list, cpu_buffer->pages);
1243 } 1236 }
1244 rb_reset_cpu(cpu_buffer); 1237 rb_reset_cpu(cpu_buffer);
1245 spin_unlock_irq(&cpu_buffer->reader_lock);
1246
1247 rb_check_pages(cpu_buffer); 1238 rb_check_pages(cpu_buffer);
1248 1239
1249 atomic_dec(&cpu_buffer->record_disabled); 1240 spin_unlock_irq(&cpu_buffer->reader_lock);
1250} 1241}
1251 1242
1252/** 1243/**
@@ -1254,11 +1245,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1254 * @buffer: the buffer to resize. 1245 * @buffer: the buffer to resize.
1255 * @size: the new size. 1246 * @size: the new size.
1256 * 1247 *
1257 * The tracer is responsible for making sure that the buffer is
1258 * not being used while changing the size.
1259 * Note: We may be able to change the above requirement by using
1260 * RCU synchronizations.
1261 *
1262 * Minimum size is 2 * BUF_PAGE_SIZE. 1248 * Minimum size is 2 * BUF_PAGE_SIZE.
1263 * 1249 *
1264 * Returns -1 on failure. 1250 * Returns -1 on failure.
@@ -1290,6 +1276,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1290 if (size == buffer_size) 1276 if (size == buffer_size)
1291 return size; 1277 return size;
1292 1278
1279 atomic_inc(&buffer->record_disabled);
1280
1281 /* Make sure all writers are done with this buffer. */
1282 synchronize_sched();
1283
1293 mutex_lock(&buffer->mutex); 1284 mutex_lock(&buffer->mutex);
1294 get_online_cpus(); 1285 get_online_cpus();
1295 1286
@@ -1352,6 +1343,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1352 put_online_cpus(); 1343 put_online_cpus();
1353 mutex_unlock(&buffer->mutex); 1344 mutex_unlock(&buffer->mutex);
1354 1345
1346 atomic_dec(&buffer->record_disabled);
1347
1355 return size; 1348 return size;
1356 1349
1357 free_pages: 1350 free_pages:
@@ -1361,6 +1354,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1361 } 1354 }
1362 put_online_cpus(); 1355 put_online_cpus();
1363 mutex_unlock(&buffer->mutex); 1356 mutex_unlock(&buffer->mutex);
1357 atomic_dec(&buffer->record_disabled);
1364 return -ENOMEM; 1358 return -ENOMEM;
1365 1359
1366 /* 1360 /*
@@ -1370,6 +1364,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1370 out_fail: 1364 out_fail:
1371 put_online_cpus(); 1365 put_online_cpus();
1372 mutex_unlock(&buffer->mutex); 1366 mutex_unlock(&buffer->mutex);
1367 atomic_dec(&buffer->record_disabled);
1373 return -1; 1368 return -1;
1374} 1369}
1375EXPORT_SYMBOL_GPL(ring_buffer_resize); 1370EXPORT_SYMBOL_GPL(ring_buffer_resize);
@@ -2723,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2723 iter->read_stamp = cpu_buffer->read_stamp; 2718 iter->read_stamp = cpu_buffer->read_stamp;
2724 else 2719 else
2725 iter->read_stamp = iter->head_page->page->time_stamp; 2720 iter->read_stamp = iter->head_page->page->time_stamp;
2721 iter->cache_reader_page = cpu_buffer->reader_page;
2722 iter->cache_read = cpu_buffer->read;
2726} 2723}
2727 2724
2728/** 2725/**
@@ -2834,7 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2834 int ret; 2831 int ret;
2835 2832
2836 local_irq_save(flags); 2833 local_irq_save(flags);
2837 __raw_spin_lock(&cpu_buffer->lock); 2834 arch_spin_lock(&cpu_buffer->lock);
2838 2835
2839 again: 2836 again:
2840 /* 2837 /*
@@ -2876,7 +2873,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2876 * Splice the empty reader page into the list around the head. 2873 * Splice the empty reader page into the list around the head.
2877 */ 2874 */
2878 reader = rb_set_head_page(cpu_buffer); 2875 reader = rb_set_head_page(cpu_buffer);
2879 cpu_buffer->reader_page->list.next = reader->list.next; 2876 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
2880 cpu_buffer->reader_page->list.prev = reader->list.prev; 2877 cpu_buffer->reader_page->list.prev = reader->list.prev;
2881 2878
2882 /* 2879 /*
@@ -2913,7 +2910,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2913 * 2910 *
2914 * Now make the new head point back to the reader page. 2911 * Now make the new head point back to the reader page.
2915 */ 2912 */
2916 reader->list.next->prev = &cpu_buffer->reader_page->list; 2913 rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
2917 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2914 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2918 2915
2919 /* Finally update the reader page to the new head */ 2916 /* Finally update the reader page to the new head */
@@ -2923,7 +2920,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2923 goto again; 2920 goto again;
2924 2921
2925 out: 2922 out:
2926 __raw_spin_unlock(&cpu_buffer->lock); 2923 arch_spin_unlock(&cpu_buffer->lock);
2927 local_irq_restore(flags); 2924 local_irq_restore(flags);
2928 2925
2929 return reader; 2926 return reader;
@@ -3067,13 +3064,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3067 struct ring_buffer_event *event; 3064 struct ring_buffer_event *event;
3068 int nr_loops = 0; 3065 int nr_loops = 0;
3069 3066
3070 if (ring_buffer_iter_empty(iter))
3071 return NULL;
3072
3073 cpu_buffer = iter->cpu_buffer; 3067 cpu_buffer = iter->cpu_buffer;
3074 buffer = cpu_buffer->buffer; 3068 buffer = cpu_buffer->buffer;
3075 3069
3070 /*
3071 * Check if someone performed a consuming read to
3072 * the buffer. A consuming read invalidates the iterator
3073 * and we need to reset the iterator in this case.
3074 */
3075 if (unlikely(iter->cache_read != cpu_buffer->read ||
3076 iter->cache_reader_page != cpu_buffer->reader_page))
3077 rb_iter_reset(iter);
3078
3076 again: 3079 again:
3080 if (ring_buffer_iter_empty(iter))
3081 return NULL;
3082
3077 /* 3083 /*
3078 * We repeat when a timestamp is encountered. 3084 * We repeat when a timestamp is encountered.
3079 * We can get multiple timestamps by nested interrupts or also 3085 * We can get multiple timestamps by nested interrupts or also
@@ -3088,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3088 if (rb_per_cpu_empty(cpu_buffer)) 3094 if (rb_per_cpu_empty(cpu_buffer))
3089 return NULL; 3095 return NULL;
3090 3096
3097 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3098 rb_inc_iter(iter);
3099 goto again;
3100 }
3101
3091 event = rb_iter_head_event(iter); 3102 event = rb_iter_head_event(iter);
3092 3103
3093 switch (event->type_len) { 3104 switch (event->type_len) {
@@ -3286,9 +3297,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3286 synchronize_sched(); 3297 synchronize_sched();
3287 3298
3288 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3299 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3289 __raw_spin_lock(&cpu_buffer->lock); 3300 arch_spin_lock(&cpu_buffer->lock);
3290 rb_iter_reset(iter); 3301 rb_iter_reset(iter);
3291 __raw_spin_unlock(&cpu_buffer->lock); 3302 arch_spin_unlock(&cpu_buffer->lock);
3292 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3303 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3293 3304
3294 return iter; 3305 return iter;
@@ -3408,11 +3419,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3408 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3419 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3409 goto out; 3420 goto out;
3410 3421
3411 __raw_spin_lock(&cpu_buffer->lock); 3422 arch_spin_lock(&cpu_buffer->lock);
3412 3423
3413 rb_reset_cpu(cpu_buffer); 3424 rb_reset_cpu(cpu_buffer);
3414 3425
3415 __raw_spin_unlock(&cpu_buffer->lock); 3426 arch_spin_unlock(&cpu_buffer->lock);
3416 3427
3417 out: 3428 out:
3418 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3429 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 874f2893cff..eac6875cb99 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -12,7 +12,7 @@
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <linux/utsrelease.h> 15#include <generated/utsrelease.h>
16#include <linux/stacktrace.h> 16#include <linux/stacktrace.h>
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
@@ -86,17 +86,17 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
86 */ 86 */
87static int tracing_disabled = 1; 87static int tracing_disabled = 1;
88 88
89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(int, ftrace_cpu_disabled);
90 90
91static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
92{ 92{
93 preempt_disable(); 93 preempt_disable();
94 local_inc(&__get_cpu_var(ftrace_cpu_disabled)); 94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled));
95} 95}
96 96
97static inline void ftrace_enable_cpu(void) 97static inline void ftrace_enable_cpu(void)
98{ 98{
99 local_dec(&__get_cpu_var(ftrace_cpu_disabled)); 99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled));
100 preempt_enable(); 100 preempt_enable();
101} 101}
102 102
@@ -203,7 +203,7 @@ cycle_t ftrace_now(int cpu)
203 */ 203 */
204static struct trace_array max_tr; 204static struct trace_array max_tr;
205 205
206static DEFINE_PER_CPU(struct trace_array_cpu, max_data); 206static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
207 207
208/* tracer_enabled is used to toggle activation of a tracer */ 208/* tracer_enabled is used to toggle activation of a tracer */
209static int tracer_enabled = 1; 209static int tracer_enabled = 1;
@@ -313,7 +313,6 @@ static const char *trace_options[] = {
313 "bin", 313 "bin",
314 "block", 314 "block",
315 "stacktrace", 315 "stacktrace",
316 "sched-tree",
317 "trace_printk", 316 "trace_printk",
318 "ftrace_preempt", 317 "ftrace_preempt",
319 "branch", 318 "branch",
@@ -493,15 +492,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
493 * protected by per_cpu spinlocks. But the action of the swap 492 * protected by per_cpu spinlocks. But the action of the swap
494 * needs its own lock. 493 * needs its own lock.
495 * 494 *
496 * This is defined as a raw_spinlock_t in order to help 495 * This is defined as a arch_spinlock_t in order to help
497 * with performance when lockdep debugging is enabled. 496 * with performance when lockdep debugging is enabled.
498 * 497 *
499 * It is also used in other places outside the update_max_tr 498 * It is also used in other places outside the update_max_tr
500 * so it needs to be defined outside of the 499 * so it needs to be defined outside of the
501 * CONFIG_TRACER_MAX_TRACE. 500 * CONFIG_TRACER_MAX_TRACE.
502 */ 501 */
503static raw_spinlock_t ftrace_max_lock = 502static arch_spinlock_t ftrace_max_lock =
504 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 503 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
505 504
506#ifdef CONFIG_TRACER_MAX_TRACE 505#ifdef CONFIG_TRACER_MAX_TRACE
507unsigned long __read_mostly tracing_max_latency; 506unsigned long __read_mostly tracing_max_latency;
@@ -555,13 +554,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
555 return; 554 return;
556 555
557 WARN_ON_ONCE(!irqs_disabled()); 556 WARN_ON_ONCE(!irqs_disabled());
558 __raw_spin_lock(&ftrace_max_lock); 557 arch_spin_lock(&ftrace_max_lock);
559 558
560 tr->buffer = max_tr.buffer; 559 tr->buffer = max_tr.buffer;
561 max_tr.buffer = buf; 560 max_tr.buffer = buf;
562 561
563 __update_max_tr(tr, tsk, cpu); 562 __update_max_tr(tr, tsk, cpu);
564 __raw_spin_unlock(&ftrace_max_lock); 563 arch_spin_unlock(&ftrace_max_lock);
565} 564}
566 565
567/** 566/**
@@ -581,7 +580,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
581 return; 580 return;
582 581
583 WARN_ON_ONCE(!irqs_disabled()); 582 WARN_ON_ONCE(!irqs_disabled());
584 __raw_spin_lock(&ftrace_max_lock); 583 arch_spin_lock(&ftrace_max_lock);
585 584
586 ftrace_disable_cpu(); 585 ftrace_disable_cpu();
587 586
@@ -603,7 +602,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
603 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 602 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
604 603
605 __update_max_tr(tr, tsk, cpu); 604 __update_max_tr(tr, tsk, cpu);
606 __raw_spin_unlock(&ftrace_max_lock); 605 arch_spin_unlock(&ftrace_max_lock);
607} 606}
608#endif /* CONFIG_TRACER_MAX_TRACE */ 607#endif /* CONFIG_TRACER_MAX_TRACE */
609 608
@@ -802,7 +801,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
802static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 801static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
803static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; 802static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
804static int cmdline_idx; 803static int cmdline_idx;
805static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; 804static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
806 805
807/* temporary disable recording */ 806/* temporary disable recording */
808static atomic_t trace_record_cmdline_disabled __read_mostly; 807static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -915,7 +914,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
915 * nor do we want to disable interrupts, 914 * nor do we want to disable interrupts,
916 * so if we miss here, then better luck next time. 915 * so if we miss here, then better luck next time.
917 */ 916 */
918 if (!__raw_spin_trylock(&trace_cmdline_lock)) 917 if (!arch_spin_trylock(&trace_cmdline_lock))
919 return; 918 return;
920 919
921 idx = map_pid_to_cmdline[tsk->pid]; 920 idx = map_pid_to_cmdline[tsk->pid];
@@ -940,7 +939,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
940 939
941 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 940 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
942 941
943 __raw_spin_unlock(&trace_cmdline_lock); 942 arch_spin_unlock(&trace_cmdline_lock);
944} 943}
945 944
946void trace_find_cmdline(int pid, char comm[]) 945void trace_find_cmdline(int pid, char comm[])
@@ -952,20 +951,25 @@ void trace_find_cmdline(int pid, char comm[])
952 return; 951 return;
953 } 952 }
954 953
954 if (WARN_ON_ONCE(pid < 0)) {
955 strcpy(comm, "<XXX>");
956 return;
957 }
958
955 if (pid > PID_MAX_DEFAULT) { 959 if (pid > PID_MAX_DEFAULT) {
956 strcpy(comm, "<...>"); 960 strcpy(comm, "<...>");
957 return; 961 return;
958 } 962 }
959 963
960 preempt_disable(); 964 preempt_disable();
961 __raw_spin_lock(&trace_cmdline_lock); 965 arch_spin_lock(&trace_cmdline_lock);
962 map = map_pid_to_cmdline[pid]; 966 map = map_pid_to_cmdline[pid];
963 if (map != NO_CMDLINE_MAP) 967 if (map != NO_CMDLINE_MAP)
964 strcpy(comm, saved_cmdlines[map]); 968 strcpy(comm, saved_cmdlines[map]);
965 else 969 else
966 strcpy(comm, "<...>"); 970 strcpy(comm, "<...>");
967 971
968 __raw_spin_unlock(&trace_cmdline_lock); 972 arch_spin_unlock(&trace_cmdline_lock);
969 preempt_enable(); 973 preempt_enable();
970} 974}
971 975
@@ -1085,7 +1089,7 @@ trace_function(struct trace_array *tr,
1085 struct ftrace_entry *entry; 1089 struct ftrace_entry *entry;
1086 1090
1087 /* If we are reading the ring buffer, don't trace */ 1091 /* If we are reading the ring buffer, don't trace */
1088 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1092 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
1089 return; 1093 return;
1090 1094
1091 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1095 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1151,6 +1155,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1151 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1155 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1152} 1156}
1153 1157
1158/**
1159 * trace_dump_stack - record a stack back trace in the trace buffer
1160 */
1161void trace_dump_stack(void)
1162{
1163 unsigned long flags;
1164
1165 if (tracing_disabled || tracing_selftest_running)
1166 return;
1167
1168 local_save_flags(flags);
1169
1170 /* skipping 3 traces, seems to get us at the caller of this function */
1171 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1172}
1173
1154void 1174void
1155ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1175ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1156{ 1176{
@@ -1251,8 +1271,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1251 */ 1271 */
1252int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1272int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1253{ 1273{
1254 static raw_spinlock_t trace_buf_lock = 1274 static arch_spinlock_t trace_buf_lock =
1255 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1275 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1256 static u32 trace_buf[TRACE_BUF_SIZE]; 1276 static u32 trace_buf[TRACE_BUF_SIZE];
1257 1277
1258 struct ftrace_event_call *call = &event_bprint; 1278 struct ftrace_event_call *call = &event_bprint;
@@ -1283,7 +1303,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1283 1303
1284 /* Lockdep uses trace_printk for lock tracing */ 1304 /* Lockdep uses trace_printk for lock tracing */
1285 local_irq_save(flags); 1305 local_irq_save(flags);
1286 __raw_spin_lock(&trace_buf_lock); 1306 arch_spin_lock(&trace_buf_lock);
1287 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1307 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1288 1308
1289 if (len > TRACE_BUF_SIZE || len < 0) 1309 if (len > TRACE_BUF_SIZE || len < 0)
@@ -1304,7 +1324,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1304 ring_buffer_unlock_commit(buffer, event); 1324 ring_buffer_unlock_commit(buffer, event);
1305 1325
1306out_unlock: 1326out_unlock:
1307 __raw_spin_unlock(&trace_buf_lock); 1327 arch_spin_unlock(&trace_buf_lock);
1308 local_irq_restore(flags); 1328 local_irq_restore(flags);
1309 1329
1310out: 1330out:
@@ -1334,7 +1354,7 @@ int trace_array_printk(struct trace_array *tr,
1334int trace_array_vprintk(struct trace_array *tr, 1354int trace_array_vprintk(struct trace_array *tr,
1335 unsigned long ip, const char *fmt, va_list args) 1355 unsigned long ip, const char *fmt, va_list args)
1336{ 1356{
1337 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1357 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1338 static char trace_buf[TRACE_BUF_SIZE]; 1358 static char trace_buf[TRACE_BUF_SIZE];
1339 1359
1340 struct ftrace_event_call *call = &event_print; 1360 struct ftrace_event_call *call = &event_print;
@@ -1360,12 +1380,8 @@ int trace_array_vprintk(struct trace_array *tr,
1360 1380
1361 pause_graph_tracing(); 1381 pause_graph_tracing();
1362 raw_local_irq_save(irq_flags); 1382 raw_local_irq_save(irq_flags);
1363 __raw_spin_lock(&trace_buf_lock); 1383 arch_spin_lock(&trace_buf_lock);
1364 if (args == NULL) { 1384 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1365 strncpy(trace_buf, fmt, TRACE_BUF_SIZE);
1366 len = strlen(trace_buf);
1367 } else
1368 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1369 1385
1370 size = sizeof(*entry) + len + 1; 1386 size = sizeof(*entry) + len + 1;
1371 buffer = tr->buffer; 1387 buffer = tr->buffer;
@@ -1382,7 +1398,7 @@ int trace_array_vprintk(struct trace_array *tr,
1382 ring_buffer_unlock_commit(buffer, event); 1398 ring_buffer_unlock_commit(buffer, event);
1383 1399
1384 out_unlock: 1400 out_unlock:
1385 __raw_spin_unlock(&trace_buf_lock); 1401 arch_spin_unlock(&trace_buf_lock);
1386 raw_local_irq_restore(irq_flags); 1402 raw_local_irq_restore(irq_flags);
1387 unpause_graph_tracing(); 1403 unpause_graph_tracing();
1388 out: 1404 out:
@@ -1516,6 +1532,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1516 int i = (int)*pos; 1532 int i = (int)*pos;
1517 void *ent; 1533 void *ent;
1518 1534
1535 WARN_ON_ONCE(iter->leftover);
1536
1519 (*pos)++; 1537 (*pos)++;
1520 1538
1521 /* can't go backwards */ 1539 /* can't go backwards */
@@ -1614,8 +1632,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1614 ; 1632 ;
1615 1633
1616 } else { 1634 } else {
1617 l = *pos - 1; 1635 /*
1618 p = s_next(m, p, &l); 1636 * If we overflowed the seq_file before, then we want
1637 * to just reuse the trace_seq buffer again.
1638 */
1639 if (iter->leftover)
1640 p = iter;
1641 else {
1642 l = *pos - 1;
1643 p = s_next(m, p, &l);
1644 }
1619 } 1645 }
1620 1646
1621 trace_event_read_lock(); 1647 trace_event_read_lock();
@@ -1923,6 +1949,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1923static int s_show(struct seq_file *m, void *v) 1949static int s_show(struct seq_file *m, void *v)
1924{ 1950{
1925 struct trace_iterator *iter = v; 1951 struct trace_iterator *iter = v;
1952 int ret;
1926 1953
1927 if (iter->ent == NULL) { 1954 if (iter->ent == NULL) {
1928 if (iter->tr) { 1955 if (iter->tr) {
@@ -1942,9 +1969,27 @@ static int s_show(struct seq_file *m, void *v)
1942 if (!(trace_flags & TRACE_ITER_VERBOSE)) 1969 if (!(trace_flags & TRACE_ITER_VERBOSE))
1943 print_func_help_header(m); 1970 print_func_help_header(m);
1944 } 1971 }
1972 } else if (iter->leftover) {
1973 /*
1974 * If we filled the seq_file buffer earlier, we
1975 * want to just show it now.
1976 */
1977 ret = trace_print_seq(m, &iter->seq);
1978
1979 /* ret should this time be zero, but you never know */
1980 iter->leftover = ret;
1981
1945 } else { 1982 } else {
1946 print_trace_line(iter); 1983 print_trace_line(iter);
1947 trace_print_seq(m, &iter->seq); 1984 ret = trace_print_seq(m, &iter->seq);
1985 /*
1986 * If we overflow the seq_file buffer, then it will
1987 * ask us for this data again at start up.
1988 * Use that instead.
1989 * ret is 0 if seq_file write succeeded.
1990 * -1 otherwise.
1991 */
1992 iter->leftover = ret;
1948 } 1993 }
1949 1994
1950 return 0; 1995 return 0;
@@ -2254,7 +2299,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2254 mutex_lock(&tracing_cpumask_update_lock); 2299 mutex_lock(&tracing_cpumask_update_lock);
2255 2300
2256 local_irq_disable(); 2301 local_irq_disable();
2257 __raw_spin_lock(&ftrace_max_lock); 2302 arch_spin_lock(&ftrace_max_lock);
2258 for_each_tracing_cpu(cpu) { 2303 for_each_tracing_cpu(cpu) {
2259 /* 2304 /*
2260 * Increase/decrease the disabled counter if we are 2305 * Increase/decrease the disabled counter if we are
@@ -2269,7 +2314,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2269 atomic_dec(&global_trace.data[cpu]->disabled); 2314 atomic_dec(&global_trace.data[cpu]->disabled);
2270 } 2315 }
2271 } 2316 }
2272 __raw_spin_unlock(&ftrace_max_lock); 2317 arch_spin_unlock(&ftrace_max_lock);
2273 local_irq_enable(); 2318 local_irq_enable();
2274 2319
2275 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 2320 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
@@ -2291,67 +2336,49 @@ static const struct file_operations tracing_cpumask_fops = {
2291 .write = tracing_cpumask_write, 2336 .write = tracing_cpumask_write,
2292}; 2337};
2293 2338
2294static ssize_t 2339static int tracing_trace_options_show(struct seq_file *m, void *v)
2295tracing_trace_options_read(struct file *filp, char __user *ubuf,
2296 size_t cnt, loff_t *ppos)
2297{ 2340{
2298 struct tracer_opt *trace_opts; 2341 struct tracer_opt *trace_opts;
2299 u32 tracer_flags; 2342 u32 tracer_flags;
2300 int len = 0;
2301 char *buf;
2302 int r = 0;
2303 int i; 2343 int i;
2304 2344
2305
2306 /* calculate max size */
2307 for (i = 0; trace_options[i]; i++) {
2308 len += strlen(trace_options[i]);
2309 len += 3; /* "no" and newline */
2310 }
2311
2312 mutex_lock(&trace_types_lock); 2345 mutex_lock(&trace_types_lock);
2313 tracer_flags = current_trace->flags->val; 2346 tracer_flags = current_trace->flags->val;
2314 trace_opts = current_trace->flags->opts; 2347 trace_opts = current_trace->flags->opts;
2315 2348
2316 /*
2317 * Increase the size with names of options specific
2318 * of the current tracer.
2319 */
2320 for (i = 0; trace_opts[i].name; i++) {
2321 len += strlen(trace_opts[i].name);
2322 len += 3; /* "no" and newline */
2323 }
2324
2325 /* +1 for \0 */
2326 buf = kmalloc(len + 1, GFP_KERNEL);
2327 if (!buf) {
2328 mutex_unlock(&trace_types_lock);
2329 return -ENOMEM;
2330 }
2331
2332 for (i = 0; trace_options[i]; i++) { 2349 for (i = 0; trace_options[i]; i++) {
2333 if (trace_flags & (1 << i)) 2350 if (trace_flags & (1 << i))
2334 r += sprintf(buf + r, "%s\n", trace_options[i]); 2351 seq_printf(m, "%s\n", trace_options[i]);
2335 else 2352 else
2336 r += sprintf(buf + r, "no%s\n", trace_options[i]); 2353 seq_printf(m, "no%s\n", trace_options[i]);
2337 } 2354 }
2338 2355
2339 for (i = 0; trace_opts[i].name; i++) { 2356 for (i = 0; trace_opts[i].name; i++) {
2340 if (tracer_flags & trace_opts[i].bit) 2357 if (tracer_flags & trace_opts[i].bit)
2341 r += sprintf(buf + r, "%s\n", 2358 seq_printf(m, "%s\n", trace_opts[i].name);
2342 trace_opts[i].name);
2343 else 2359 else
2344 r += sprintf(buf + r, "no%s\n", 2360 seq_printf(m, "no%s\n", trace_opts[i].name);
2345 trace_opts[i].name);
2346 } 2361 }
2347 mutex_unlock(&trace_types_lock); 2362 mutex_unlock(&trace_types_lock);
2348 2363
2349 WARN_ON(r >= len + 1); 2364 return 0;
2365}
2350 2366
2351 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2367static int __set_tracer_option(struct tracer *trace,
2368 struct tracer_flags *tracer_flags,
2369 struct tracer_opt *opts, int neg)
2370{
2371 int ret;
2352 2372
2353 kfree(buf); 2373 ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
2354 return r; 2374 if (ret)
2375 return ret;
2376
2377 if (neg)
2378 tracer_flags->val &= ~opts->bit;
2379 else
2380 tracer_flags->val |= opts->bit;
2381 return 0;
2355} 2382}
2356 2383
2357/* Try to assign a tracer specific option */ 2384/* Try to assign a tracer specific option */
@@ -2359,33 +2386,17 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2359{ 2386{
2360 struct tracer_flags *tracer_flags = trace->flags; 2387 struct tracer_flags *tracer_flags = trace->flags;
2361 struct tracer_opt *opts = NULL; 2388 struct tracer_opt *opts = NULL;
2362 int ret = 0, i = 0; 2389 int i;
2363 int len;
2364 2390
2365 for (i = 0; tracer_flags->opts[i].name; i++) { 2391 for (i = 0; tracer_flags->opts[i].name; i++) {
2366 opts = &tracer_flags->opts[i]; 2392 opts = &tracer_flags->opts[i];
2367 len = strlen(opts->name);
2368 2393
2369 if (strncmp(cmp, opts->name, len) == 0) { 2394 if (strcmp(cmp, opts->name) == 0)
2370 ret = trace->set_flag(tracer_flags->val, 2395 return __set_tracer_option(trace, trace->flags,
2371 opts->bit, !neg); 2396 opts, neg);
2372 break;
2373 }
2374 } 2397 }
2375 /* Not found */
2376 if (!tracer_flags->opts[i].name)
2377 return -EINVAL;
2378
2379 /* Refused to handle */
2380 if (ret)
2381 return ret;
2382
2383 if (neg)
2384 tracer_flags->val &= ~opts->bit;
2385 else
2386 tracer_flags->val |= opts->bit;
2387 2398
2388 return 0; 2399 return -EINVAL;
2389} 2400}
2390 2401
2391static void set_tracer_flags(unsigned int mask, int enabled) 2402static void set_tracer_flags(unsigned int mask, int enabled)
@@ -2405,7 +2416,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2405 size_t cnt, loff_t *ppos) 2416 size_t cnt, loff_t *ppos)
2406{ 2417{
2407 char buf[64]; 2418 char buf[64];
2408 char *cmp = buf; 2419 char *cmp;
2409 int neg = 0; 2420 int neg = 0;
2410 int ret; 2421 int ret;
2411 int i; 2422 int i;
@@ -2417,16 +2428,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2417 return -EFAULT; 2428 return -EFAULT;
2418 2429
2419 buf[cnt] = 0; 2430 buf[cnt] = 0;
2431 cmp = strstrip(buf);
2420 2432
2421 if (strncmp(buf, "no", 2) == 0) { 2433 if (strncmp(cmp, "no", 2) == 0) {
2422 neg = 1; 2434 neg = 1;
2423 cmp += 2; 2435 cmp += 2;
2424 } 2436 }
2425 2437
2426 for (i = 0; trace_options[i]; i++) { 2438 for (i = 0; trace_options[i]; i++) {
2427 int len = strlen(trace_options[i]); 2439 if (strcmp(cmp, trace_options[i]) == 0) {
2428
2429 if (strncmp(cmp, trace_options[i], len) == 0) {
2430 set_tracer_flags(1 << i, !neg); 2440 set_tracer_flags(1 << i, !neg);
2431 break; 2441 break;
2432 } 2442 }
@@ -2446,9 +2456,18 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2446 return cnt; 2456 return cnt;
2447} 2457}
2448 2458
2459static int tracing_trace_options_open(struct inode *inode, struct file *file)
2460{
2461 if (tracing_disabled)
2462 return -ENODEV;
2463 return single_open(file, tracing_trace_options_show, NULL);
2464}
2465
2449static const struct file_operations tracing_iter_fops = { 2466static const struct file_operations tracing_iter_fops = {
2450 .open = tracing_open_generic, 2467 .open = tracing_trace_options_open,
2451 .read = tracing_trace_options_read, 2468 .read = seq_read,
2469 .llseek = seq_lseek,
2470 .release = single_release,
2452 .write = tracing_trace_options_write, 2471 .write = tracing_trace_options_write,
2453}; 2472};
2454 2473
@@ -2898,6 +2917,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2898 else 2917 else
2899 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); 2918 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2900 2919
2920
2921 if (iter->trace->pipe_close)
2922 iter->trace->pipe_close(iter);
2923
2901 mutex_unlock(&trace_types_lock); 2924 mutex_unlock(&trace_types_lock);
2902 2925
2903 free_cpumask_var(iter->started); 2926 free_cpumask_var(iter->started);
@@ -3104,7 +3127,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
3104 __free_page(spd->pages[idx]); 3127 __free_page(spd->pages[idx]);
3105} 3128}
3106 3129
3107static struct pipe_buf_operations tracing_pipe_buf_ops = { 3130static const struct pipe_buf_operations tracing_pipe_buf_ops = {
3108 .can_merge = 0, 3131 .can_merge = 0,
3109 .map = generic_pipe_buf_map, 3132 .map = generic_pipe_buf_map,
3110 .unmap = generic_pipe_buf_unmap, 3133 .unmap = generic_pipe_buf_unmap,
@@ -3320,6 +3343,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3320 return cnt; 3343 return cnt;
3321} 3344}
3322 3345
3346static int mark_printk(const char *fmt, ...)
3347{
3348 int ret;
3349 va_list args;
3350 va_start(args, fmt);
3351 ret = trace_vprintk(0, fmt, args);
3352 va_end(args);
3353 return ret;
3354}
3355
3323static ssize_t 3356static ssize_t
3324tracing_mark_write(struct file *filp, const char __user *ubuf, 3357tracing_mark_write(struct file *filp, const char __user *ubuf,
3325 size_t cnt, loff_t *fpos) 3358 size_t cnt, loff_t *fpos)
@@ -3346,28 +3379,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3346 } else 3379 } else
3347 buf[cnt] = '\0'; 3380 buf[cnt] = '\0';
3348 3381
3349 cnt = trace_vprintk(0, buf, NULL); 3382 cnt = mark_printk("%s", buf);
3350 kfree(buf); 3383 kfree(buf);
3351 *fpos += cnt; 3384 *fpos += cnt;
3352 3385
3353 return cnt; 3386 return cnt;
3354} 3387}
3355 3388
3356static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, 3389static int tracing_clock_show(struct seq_file *m, void *v)
3357 size_t cnt, loff_t *ppos)
3358{ 3390{
3359 char buf[64];
3360 int bufiter = 0;
3361 int i; 3391 int i;
3362 3392
3363 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 3393 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3364 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, 3394 seq_printf(m,
3365 "%s%s%s%s", i ? " " : "", 3395 "%s%s%s%s", i ? " " : "",
3366 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 3396 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3367 i == trace_clock_id ? "]" : ""); 3397 i == trace_clock_id ? "]" : "");
3368 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); 3398 seq_putc(m, '\n');
3369 3399
3370 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); 3400 return 0;
3371} 3401}
3372 3402
3373static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 3403static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
@@ -3409,6 +3439,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3409 return cnt; 3439 return cnt;
3410} 3440}
3411 3441
3442static int tracing_clock_open(struct inode *inode, struct file *file)
3443{
3444 if (tracing_disabled)
3445 return -ENODEV;
3446 return single_open(file, tracing_clock_show, NULL);
3447}
3448
3412static const struct file_operations tracing_max_lat_fops = { 3449static const struct file_operations tracing_max_lat_fops = {
3413 .open = tracing_open_generic, 3450 .open = tracing_open_generic,
3414 .read = tracing_max_lat_read, 3451 .read = tracing_max_lat_read,
@@ -3447,8 +3484,10 @@ static const struct file_operations tracing_mark_fops = {
3447}; 3484};
3448 3485
3449static const struct file_operations trace_clock_fops = { 3486static const struct file_operations trace_clock_fops = {
3450 .open = tracing_open_generic, 3487 .open = tracing_clock_open,
3451 .read = tracing_clock_read, 3488 .read = seq_read,
3489 .llseek = seq_lseek,
3490 .release = single_release,
3452 .write = tracing_clock_write, 3491 .write = tracing_clock_write,
3453}; 3492};
3454 3493
@@ -3578,7 +3617,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
3578} 3617}
3579 3618
3580/* Pipe buffer operations for a buffer. */ 3619/* Pipe buffer operations for a buffer. */
3581static struct pipe_buf_operations buffer_pipe_buf_ops = { 3620static const struct pipe_buf_operations buffer_pipe_buf_ops = {
3582 .can_merge = 0, 3621 .can_merge = 0,
3583 .map = generic_pipe_buf_map, 3622 .map = generic_pipe_buf_map,
3584 .unmap = generic_pipe_buf_unmap, 3623 .unmap = generic_pipe_buf_unmap,
@@ -3909,39 +3948,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3909 if (ret < 0) 3948 if (ret < 0)
3910 return ret; 3949 return ret;
3911 3950
3912 ret = 0; 3951 if (val != 0 && val != 1)
3913 switch (val) { 3952 return -EINVAL;
3914 case 0:
3915 /* do nothing if already cleared */
3916 if (!(topt->flags->val & topt->opt->bit))
3917 break;
3918
3919 mutex_lock(&trace_types_lock);
3920 if (current_trace->set_flag)
3921 ret = current_trace->set_flag(topt->flags->val,
3922 topt->opt->bit, 0);
3923 mutex_unlock(&trace_types_lock);
3924 if (ret)
3925 return ret;
3926 topt->flags->val &= ~topt->opt->bit;
3927 break;
3928 case 1:
3929 /* do nothing if already set */
3930 if (topt->flags->val & topt->opt->bit)
3931 break;
3932 3953
3954 if (!!(topt->flags->val & topt->opt->bit) != val) {
3933 mutex_lock(&trace_types_lock); 3955 mutex_lock(&trace_types_lock);
3934 if (current_trace->set_flag) 3956 ret = __set_tracer_option(current_trace, topt->flags,
3935 ret = current_trace->set_flag(topt->flags->val, 3957 topt->opt, !val);
3936 topt->opt->bit, 1);
3937 mutex_unlock(&trace_types_lock); 3958 mutex_unlock(&trace_types_lock);
3938 if (ret) 3959 if (ret)
3939 return ret; 3960 return ret;
3940 topt->flags->val |= topt->opt->bit;
3941 break;
3942
3943 default:
3944 return -EINVAL;
3945 } 3961 }
3946 3962
3947 *ppos += cnt; 3963 *ppos += cnt;
@@ -4268,8 +4284,8 @@ trace_printk_seq(struct trace_seq *s)
4268 4284
4269static void __ftrace_dump(bool disable_tracing) 4285static void __ftrace_dump(bool disable_tracing)
4270{ 4286{
4271 static raw_spinlock_t ftrace_dump_lock = 4287 static arch_spinlock_t ftrace_dump_lock =
4272 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 4288 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
4273 /* use static because iter can be a bit big for the stack */ 4289 /* use static because iter can be a bit big for the stack */
4274 static struct trace_iterator iter; 4290 static struct trace_iterator iter;
4275 unsigned int old_userobj; 4291 unsigned int old_userobj;
@@ -4279,7 +4295,7 @@ static void __ftrace_dump(bool disable_tracing)
4279 4295
4280 /* only one dump */ 4296 /* only one dump */
4281 local_irq_save(flags); 4297 local_irq_save(flags);
4282 __raw_spin_lock(&ftrace_dump_lock); 4298 arch_spin_lock(&ftrace_dump_lock);
4283 if (dump_ran) 4299 if (dump_ran)
4284 goto out; 4300 goto out;
4285 4301
@@ -4354,7 +4370,7 @@ static void __ftrace_dump(bool disable_tracing)
4354 } 4370 }
4355 4371
4356 out: 4372 out:
4357 __raw_spin_unlock(&ftrace_dump_lock); 4373 arch_spin_unlock(&ftrace_dump_lock);
4358 local_irq_restore(flags); 4374 local_irq_restore(flags);
4359} 4375}
4360 4376
@@ -4415,7 +4431,7 @@ __init static int tracer_alloc_buffers(void)
4415 /* Allocate the first page for all buffers */ 4431 /* Allocate the first page for all buffers */
4416 for_each_tracing_cpu(i) { 4432 for_each_tracing_cpu(i) {
4417 global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4433 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4418 max_tr.data[i] = &per_cpu(max_data, i); 4434 max_tr.data[i] = &per_cpu(max_tr_data, i);
4419 } 4435 }
4420 4436
4421 trace_init_cmdlines(); 4437 trace_init_cmdlines();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1d7f4830a80..4df6a77eb19 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,6 +272,7 @@ struct tracer_flags {
272 * @pipe_open: called when the trace_pipe file is opened 272 * @pipe_open: called when the trace_pipe file is opened
273 * @wait_pipe: override how the user waits for traces on trace_pipe 273 * @wait_pipe: override how the user waits for traces on trace_pipe
274 * @close: called when the trace file is released 274 * @close: called when the trace file is released
275 * @pipe_close: called when the trace_pipe file is released
275 * @read: override the default read callback on trace_pipe 276 * @read: override the default read callback on trace_pipe
276 * @splice_read: override the default splice_read callback on trace_pipe 277 * @splice_read: override the default splice_read callback on trace_pipe
277 * @selftest: selftest to run on boot (see trace_selftest.c) 278 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -290,6 +291,7 @@ struct tracer {
290 void (*pipe_open)(struct trace_iterator *iter); 291 void (*pipe_open)(struct trace_iterator *iter);
291 void (*wait_pipe)(struct trace_iterator *iter); 292 void (*wait_pipe)(struct trace_iterator *iter);
292 void (*close)(struct trace_iterator *iter); 293 void (*close)(struct trace_iterator *iter);
294 void (*pipe_close)(struct trace_iterator *iter);
293 ssize_t (*read)(struct trace_iterator *iter, 295 ssize_t (*read)(struct trace_iterator *iter,
294 struct file *filp, char __user *ubuf, 296 struct file *filp, char __user *ubuf,
295 size_t cnt, loff_t *ppos); 297 size_t cnt, loff_t *ppos);
@@ -441,7 +443,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
441 443
442extern int ring_buffer_expanded; 444extern int ring_buffer_expanded;
443extern bool tracing_selftest_disabled; 445extern bool tracing_selftest_disabled;
444DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); 446DECLARE_PER_CPU(int, ftrace_cpu_disabled);
445 447
446#ifdef CONFIG_FTRACE_STARTUP_TEST 448#ifdef CONFIG_FTRACE_STARTUP_TEST
447extern int trace_selftest_startup_function(struct tracer *trace, 449extern int trace_selftest_startup_function(struct tracer *trace,
@@ -595,18 +597,17 @@ enum trace_iterator_flags {
595 TRACE_ITER_BIN = 0x40, 597 TRACE_ITER_BIN = 0x40,
596 TRACE_ITER_BLOCK = 0x80, 598 TRACE_ITER_BLOCK = 0x80,
597 TRACE_ITER_STACKTRACE = 0x100, 599 TRACE_ITER_STACKTRACE = 0x100,
598 TRACE_ITER_SCHED_TREE = 0x200, 600 TRACE_ITER_PRINTK = 0x200,
599 TRACE_ITER_PRINTK = 0x400, 601 TRACE_ITER_PREEMPTONLY = 0x400,
600 TRACE_ITER_PREEMPTONLY = 0x800, 602 TRACE_ITER_BRANCH = 0x800,
601 TRACE_ITER_BRANCH = 0x1000, 603 TRACE_ITER_ANNOTATE = 0x1000,
602 TRACE_ITER_ANNOTATE = 0x2000, 604 TRACE_ITER_USERSTACKTRACE = 0x2000,
603 TRACE_ITER_USERSTACKTRACE = 0x4000, 605 TRACE_ITER_SYM_USEROBJ = 0x4000,
604 TRACE_ITER_SYM_USEROBJ = 0x8000, 606 TRACE_ITER_PRINTK_MSGONLY = 0x8000,
605 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 607 TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */
606 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 608 TRACE_ITER_LATENCY_FMT = 0x20000,
607 TRACE_ITER_LATENCY_FMT = 0x40000, 609 TRACE_ITER_SLEEP_TIME = 0x40000,
608 TRACE_ITER_SLEEP_TIME = 0x80000, 610 TRACE_ITER_GRAPH_TIME = 0x80000,
609 TRACE_ITER_GRAPH_TIME = 0x100000,
610}; 611};
611 612
612/* 613/*
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 878c03f386b..84a3a7ba072 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -71,10 +71,10 @@ u64 notrace trace_clock(void)
71/* keep prev_time and lock in the same cacheline. */ 71/* keep prev_time and lock in the same cacheline. */
72static struct { 72static struct {
73 u64 prev_time; 73 u64 prev_time;
74 raw_spinlock_t lock; 74 arch_spinlock_t lock;
75} trace_clock_struct ____cacheline_aligned_in_smp = 75} trace_clock_struct ____cacheline_aligned_in_smp =
76 { 76 {
77 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, 77 .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
78 }; 78 };
79 79
80u64 notrace trace_clock_global(void) 80u64 notrace trace_clock_global(void)
@@ -94,7 +94,7 @@ u64 notrace trace_clock_global(void)
94 if (unlikely(in_nmi())) 94 if (unlikely(in_nmi()))
95 goto out; 95 goto out;
96 96
97 __raw_spin_lock(&trace_clock_struct.lock); 97 arch_spin_lock(&trace_clock_struct.lock);
98 98
99 /* 99 /*
100 * TODO: if this happens often then maybe we should reset 100 * TODO: if this happens often then maybe we should reset
@@ -106,7 +106,7 @@ u64 notrace trace_clock_global(void)
106 106
107 trace_clock_struct.prev_time = now; 107 trace_clock_struct.prev_time = now;
108 108
109 __raw_spin_unlock(&trace_clock_struct.lock); 109 arch_spin_unlock(&trace_clock_struct.lock);
110 110
111 out: 111 out:
112 raw_local_irq_restore(flags); 112 raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index d9c60f80aa0..9e25573242c 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -25,7 +25,7 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
25 char *buf; 25 char *buf;
26 int ret = -ENOMEM; 26 int ret = -ENOMEM;
27 27
28 if (atomic_inc_return(&event->profile_count)) 28 if (event->profile_count++ > 0)
29 return 0; 29 return 0;
30 30
31 if (!total_profile_count) { 31 if (!total_profile_count) {
@@ -56,7 +56,7 @@ fail_buf_nmi:
56 perf_trace_buf = NULL; 56 perf_trace_buf = NULL;
57 } 57 }
58fail_buf: 58fail_buf:
59 atomic_dec(&event->profile_count); 59 event->profile_count--;
60 60
61 return ret; 61 return ret;
62} 62}
@@ -83,7 +83,7 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
83{ 83{
84 char *buf, *nmi_buf; 84 char *buf, *nmi_buf;
85 85
86 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (--event->profile_count > 0)
87 return; 87 return;
88 88
89 event->profile_disable(event); 89 event->profile_disable(event);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1d18315dc83..189b09baf4f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -78,7 +78,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
78 if (ret) \ 78 if (ret) \
79 return ret; 79 return ret;
80 80
81int trace_define_common_fields(struct ftrace_event_call *call) 81static int trace_define_common_fields(struct ftrace_event_call *call)
82{ 82{
83 int ret; 83 int ret;
84 struct trace_entry ent; 84 struct trace_entry ent;
@@ -91,7 +91,6 @@ int trace_define_common_fields(struct ftrace_event_call *call)
91 91
92 return ret; 92 return ret;
93} 93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 94
96void trace_destroy_fields(struct ftrace_event_call *call) 95void trace_destroy_fields(struct ftrace_event_call *call)
97{ 96{
@@ -105,9 +104,25 @@ void trace_destroy_fields(struct ftrace_event_call *call)
105 } 104 }
106} 105}
107 106
108static void ftrace_event_enable_disable(struct ftrace_event_call *call, 107int trace_event_raw_init(struct ftrace_event_call *call)
108{
109 int id;
110
111 id = register_ftrace_event(call->event);
112 if (!id)
113 return -ENODEV;
114 call->id = id;
115 INIT_LIST_HEAD(&call->fields);
116
117 return 0;
118}
119EXPORT_SYMBOL_GPL(trace_event_raw_init);
120
121static int ftrace_event_enable_disable(struct ftrace_event_call *call,
109 int enable) 122 int enable)
110{ 123{
124 int ret = 0;
125
111 switch (enable) { 126 switch (enable) {
112 case 0: 127 case 0:
113 if (call->enabled) { 128 if (call->enabled) {
@@ -118,12 +133,20 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
118 break; 133 break;
119 case 1: 134 case 1:
120 if (!call->enabled) { 135 if (!call->enabled) {
121 call->enabled = 1;
122 tracing_start_cmdline_record(); 136 tracing_start_cmdline_record();
123 call->regfunc(call); 137 ret = call->regfunc(call);
138 if (ret) {
139 tracing_stop_cmdline_record();
140 pr_info("event trace: Could not enable event "
141 "%s\n", call->name);
142 break;
143 }
144 call->enabled = 1;
124 } 145 }
125 break; 146 break;
126 } 147 }
148
149 return ret;
127} 150}
128 151
129static void ftrace_clear_events(void) 152static void ftrace_clear_events(void)
@@ -402,7 +425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
402 case 0: 425 case 0:
403 case 1: 426 case 1:
404 mutex_lock(&event_mutex); 427 mutex_lock(&event_mutex);
405 ftrace_event_enable_disable(call, val); 428 ret = ftrace_event_enable_disable(call, val);
406 mutex_unlock(&event_mutex); 429 mutex_unlock(&event_mutex);
407 break; 430 break;
408 431
@@ -412,7 +435,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
412 435
413 *ppos += cnt; 436 *ppos += cnt;
414 437
415 return cnt; 438 return ret ? ret : cnt;
416} 439}
417 440
418static ssize_t 441static ssize_t
@@ -913,7 +936,9 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
913 id); 936 id);
914 937
915 if (call->define_fields) { 938 if (call->define_fields) {
916 ret = call->define_fields(call); 939 ret = trace_define_common_fields(call);
940 if (!ret)
941 ret = call->define_fields(call);
917 if (ret < 0) { 942 if (ret < 0) {
918 pr_warning("Could not initialize trace point" 943 pr_warning("Could not initialize trace point"
919 " events/%s\n", call->name); 944 " events/%s\n", call->name);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 50504cb228d..e42af9aad69 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -211,8 +211,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
211{ 211{
212 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
213 int cmp, match; 213 int cmp, match;
214 int len = strlen(*addr) + 1; /* including tailing '\0' */
214 215
215 cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); 216 cmp = pred->regex.match(*addr, &pred->regex, len);
216 217
217 match = cmp ^ pred->not; 218 match = cmp ^ pred->not;
218 219
@@ -251,7 +252,18 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
251 return 0; 252 return 0;
252} 253}
253 254
254/* Basic regex callbacks */ 255/*
256 * regex_match_foo - Basic regex callbacks
257 *
258 * @str: the string to be searched
259 * @r: the regex structure containing the pattern string
260 * @len: the length of the string to be searched (including '\0')
261 *
262 * Note:
263 * - @str might not be NULL-terminated if it's of type DYN_STRING
264 * or STATIC_STRING
265 */
266
255static int regex_match_full(char *str, struct regex *r, int len) 267static int regex_match_full(char *str, struct regex *r, int len)
256{ 268{
257 if (strncmp(str, r->pattern, len) == 0) 269 if (strncmp(str, r->pattern, len) == 0)
@@ -261,23 +273,24 @@ static int regex_match_full(char *str, struct regex *r, int len)
261 273
262static int regex_match_front(char *str, struct regex *r, int len) 274static int regex_match_front(char *str, struct regex *r, int len)
263{ 275{
264 if (strncmp(str, r->pattern, len) == 0) 276 if (strncmp(str, r->pattern, r->len) == 0)
265 return 1; 277 return 1;
266 return 0; 278 return 0;
267} 279}
268 280
269static int regex_match_middle(char *str, struct regex *r, int len) 281static int regex_match_middle(char *str, struct regex *r, int len)
270{ 282{
271 if (strstr(str, r->pattern)) 283 if (strnstr(str, r->pattern, len))
272 return 1; 284 return 1;
273 return 0; 285 return 0;
274} 286}
275 287
276static int regex_match_end(char *str, struct regex *r, int len) 288static int regex_match_end(char *str, struct regex *r, int len)
277{ 289{
278 char *ptr = strstr(str, r->pattern); 290 int strlen = len - 1;
279 291
280 if (ptr && (ptr[r->len] == 0)) 292 if (strlen >= r->len &&
293 memcmp(str + strlen - r->len, r->pattern, r->len) == 0)
281 return 1; 294 return 1;
282 return 0; 295 return 0;
283} 296}
@@ -781,10 +794,8 @@ static int filter_add_pred(struct filter_parse_state *ps,
781 pred->regex.field_len = field->size; 794 pred->regex.field_len = field->size;
782 } else if (field->filter_type == FILTER_DYN_STRING) 795 } else if (field->filter_type == FILTER_DYN_STRING)
783 fn = filter_pred_strloc; 796 fn = filter_pred_strloc;
784 else { 797 else
785 fn = filter_pred_pchar; 798 fn = filter_pred_pchar;
786 pred->regex.field_len = strlen(pred->regex.pattern);
787 }
788 } else { 799 } else {
789 if (field->is_signed) 800 if (field->is_signed)
790 ret = strict_strtoll(pred->regex.pattern, 0, &val); 801 ret = strict_strtoll(pred->regex.pattern, 0, &val);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index dff8c84ddf1..d4fa5dc1ee4 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -158,7 +158,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
160 offsetof(typeof(field), item), \ 160 offsetof(typeof(field), item), \
161 sizeof(field.item), 0, FILTER_OTHER); \ 161 sizeof(field.item), \
162 is_signed_type(type), FILTER_OTHER); \
162 if (ret) \ 163 if (ret) \
163 return ret; 164 return ret;
164 165
@@ -168,8 +169,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
168 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 169 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
169 offsetof(typeof(field), \ 170 offsetof(typeof(field), \
170 container.item), \ 171 container.item), \
171 sizeof(field.container.item), 0, \ 172 sizeof(field.container.item), \
172 FILTER_OTHER); \ 173 is_signed_type(type), FILTER_OTHER); \
173 if (ret) \ 174 if (ret) \
174 return ret; 175 return ret;
175 176
@@ -184,10 +185,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
184 struct struct_name field; \ 185 struct struct_name field; \
185 int ret; \ 186 int ret; \
186 \ 187 \
187 ret = trace_define_common_fields(event_call); \
188 if (ret) \
189 return ret; \
190 \
191 tstruct; \ 188 tstruct; \
192 \ 189 \
193 return ret; \ 190 return ret; \
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4..b1342c5d37c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,9 +14,20 @@
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h" 15#include "trace_output.h"
16 16
17struct fgraph_data { 17struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore;
21};
22
23struct fgraph_data {
24 struct fgraph_cpu_data *cpu_data;
25
26 /* Place to preserve last processed entry. */
27 struct ftrace_graph_ent_entry ent;
28 struct ftrace_graph_ret_entry ret;
29 int failed;
30 int cpu;
20}; 31};
21 32
22#define TRACE_GRAPH_INDENT 2 33#define TRACE_GRAPH_INDENT 2
@@ -176,7 +187,7 @@ static int __trace_graph_entry(struct trace_array *tr,
176 struct ring_buffer *buffer = tr->buffer; 187 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry; 188 struct ftrace_graph_ent_entry *entry;
178 189
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
180 return 0; 191 return 0;
181 192
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -240,7 +251,7 @@ static void __trace_graph_return(struct trace_array *tr,
240 struct ring_buffer *buffer = tr->buffer; 251 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry; 252 struct ftrace_graph_ret_entry *entry;
242 253
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
244 return; 255 return;
245 256
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
384 if (!data) 395 if (!data)
385 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
386 397
387 last_pid = &(per_cpu_ptr(data, cpu)->last_pid); 398 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
388 399
389 if (*last_pid == pid) 400 if (*last_pid == pid)
390 return TRACE_TYPE_HANDLED; 401 return TRACE_TYPE_HANDLED;
@@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry *
435get_return_for_leaf(struct trace_iterator *iter, 446get_return_for_leaf(struct trace_iterator *iter,
436 struct ftrace_graph_ent_entry *curr) 447 struct ftrace_graph_ent_entry *curr)
437{ 448{
438 struct ring_buffer_iter *ring_iter; 449 struct fgraph_data *data = iter->private;
450 struct ring_buffer_iter *ring_iter = NULL;
439 struct ring_buffer_event *event; 451 struct ring_buffer_event *event;
440 struct ftrace_graph_ret_entry *next; 452 struct ftrace_graph_ret_entry *next;
441 453
442 ring_iter = iter->buffer_iter[iter->cpu]; 454 /*
455 * If the previous output failed to write to the seq buffer,
456 * then we just reuse the data from before.
457 */
458 if (data && data->failed) {
459 curr = &data->ent;
460 next = &data->ret;
461 } else {
443 462
444 /* First peek to compare current entry and the next one */ 463 ring_iter = iter->buffer_iter[iter->cpu];
445 if (ring_iter) 464
446 event = ring_buffer_iter_peek(ring_iter, NULL); 465 /* First peek to compare current entry and the next one */
447 else { 466 if (ring_iter)
448 /* We need to consume the current entry to see the next one */ 467 event = ring_buffer_iter_peek(ring_iter, NULL);
449 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 468 else {
450 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 469 /*
451 NULL); 470 * We need to consume the current entry to see
452 } 471 * the next one.
472 */
473 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
474 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
475 NULL);
476 }
453 477
454 if (!event) 478 if (!event)
455 return NULL; 479 return NULL;
480
481 next = ring_buffer_event_data(event);
456 482
457 next = ring_buffer_event_data(event); 483 if (data) {
484 /*
485 * Save current and next entries for later reference
486 * if the output fails.
487 */
488 data->ent = *curr;
489 data->ret = *next;
490 }
491 }
458 492
459 if (next->ent.type != TRACE_GRAPH_RET) 493 if (next->ent.type != TRACE_GRAPH_RET)
460 return NULL; 494 return NULL;
@@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
640 674
641 if (data) { 675 if (data) {
642 int cpu = iter->cpu; 676 int cpu = iter->cpu;
643 int *depth = &(per_cpu_ptr(data, cpu)->depth); 677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
644 678
645 /* 679 /*
646 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
@@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
688 722
689 if (data) { 723 if (data) {
690 int cpu = iter->cpu; 724 int cpu = iter->cpu;
691 int *depth = &(per_cpu_ptr(data, cpu)->depth); 725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
692 726
693 *depth = call->depth; 727 *depth = call->depth;
694 } 728 }
@@ -782,19 +816,34 @@ static enum print_line_t
782print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 816print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
783 struct trace_iterator *iter) 817 struct trace_iterator *iter)
784{ 818{
785 int cpu = iter->cpu; 819 struct fgraph_data *data = iter->private;
786 struct ftrace_graph_ent *call = &field->graph_ent; 820 struct ftrace_graph_ent *call = &field->graph_ent;
787 struct ftrace_graph_ret_entry *leaf_ret; 821 struct ftrace_graph_ret_entry *leaf_ret;
822 static enum print_line_t ret;
823 int cpu = iter->cpu;
788 824
789 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 825 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
790 return TRACE_TYPE_PARTIAL_LINE; 826 return TRACE_TYPE_PARTIAL_LINE;
791 827
792 leaf_ret = get_return_for_leaf(iter, field); 828 leaf_ret = get_return_for_leaf(iter, field);
793 if (leaf_ret) 829 if (leaf_ret)
794 return print_graph_entry_leaf(iter, field, leaf_ret, s); 830 ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
795 else 831 else
796 return print_graph_entry_nested(iter, field, s, cpu); 832 ret = print_graph_entry_nested(iter, field, s, cpu);
797 833
834 if (data) {
835 /*
836 * If we failed to write our output, then we need to make
837 * note of it. Because we already consumed our entry.
838 */
839 if (s->full) {
840 data->failed = 1;
841 data->cpu = cpu;
842 } else
843 data->failed = 0;
844 }
845
846 return ret;
798} 847}
799 848
800static enum print_line_t 849static enum print_line_t
@@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
810 859
811 if (data) { 860 if (data) {
812 int cpu = iter->cpu; 861 int cpu = iter->cpu;
813 int *depth = &(per_cpu_ptr(data, cpu)->depth); 862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
814 863
815 /* 864 /*
816 * Comments display at + 1 to depth. This is the 865 * Comments display at + 1 to depth. This is the
@@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
873 int i; 922 int i;
874 923
875 if (data) 924 if (data)
876 depth = per_cpu_ptr(data, iter->cpu)->depth; 925 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
877 926
878 if (print_graph_prologue(iter, s, 0, 0)) 927 if (print_graph_prologue(iter, s, 0, 0))
879 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
941enum print_line_t 990enum print_line_t
942print_graph_function(struct trace_iterator *iter) 991print_graph_function(struct trace_iterator *iter)
943{ 992{
993 struct ftrace_graph_ent_entry *field;
994 struct fgraph_data *data = iter->private;
944 struct trace_entry *entry = iter->ent; 995 struct trace_entry *entry = iter->ent;
945 struct trace_seq *s = &iter->seq; 996 struct trace_seq *s = &iter->seq;
997 int cpu = iter->cpu;
998 int ret;
999
1000 if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
1001 per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
1002 return TRACE_TYPE_HANDLED;
1003 }
1004
1005 /*
1006 * If the last output failed, there's a possibility we need
1007 * to print out the missing entry which would never go out.
1008 */
1009 if (data && data->failed) {
1010 field = &data->ent;
1011 iter->cpu = data->cpu;
1012 ret = print_graph_entry(field, s, iter);
1013 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1014 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1015 ret = TRACE_TYPE_NO_CONSUME;
1016 }
1017 iter->cpu = cpu;
1018 return ret;
1019 }
946 1020
947 switch (entry->type) { 1021 switch (entry->type) {
948 case TRACE_GRAPH_ENT: { 1022 case TRACE_GRAPH_ENT: {
@@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter)
952 * sizeof(struct ftrace_graph_ent_entry) is very small, 1026 * sizeof(struct ftrace_graph_ent_entry) is very small,
953 * it can be safely saved at the stack. 1027 * it can be safely saved at the stack.
954 */ 1028 */
955 struct ftrace_graph_ent_entry *field, saved; 1029 struct ftrace_graph_ent_entry saved;
956 trace_assign_type(field, entry); 1030 trace_assign_type(field, entry);
957 saved = *field; 1031 saved = *field;
958 return print_graph_entry(&saved, s, iter); 1032 return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s)
1030static void graph_trace_open(struct trace_iterator *iter) 1104static void graph_trace_open(struct trace_iterator *iter)
1031{ 1105{
1032 /* pid and depth on the last trace processed */ 1106 /* pid and depth on the last trace processed */
1033 struct fgraph_data *data = alloc_percpu(struct fgraph_data); 1107 struct fgraph_data *data;
1034 int cpu; 1108 int cpu;
1035 1109
1110 iter->private = NULL;
1111
1112 data = kzalloc(sizeof(*data), GFP_KERNEL);
1036 if (!data) 1113 if (!data)
1037 pr_warning("function graph tracer: not enough memory\n"); 1114 goto out_err;
1038 else 1115
1039 for_each_possible_cpu(cpu) { 1116 data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
1040 pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); 1117 if (!data->cpu_data)
1041 int *depth = &(per_cpu_ptr(data, cpu)->depth); 1118 goto out_err_free;
1042 *pid = -1; 1119
1043 *depth = 0; 1120 for_each_possible_cpu(cpu) {
1044 } 1121 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1122 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1123 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1124 *pid = -1;
1125 *depth = 0;
1126 *ignore = 0;
1127 }
1045 1128
1046 iter->private = data; 1129 iter->private = data;
1130
1131 return;
1132
1133 out_err_free:
1134 kfree(data);
1135 out_err:
1136 pr_warning("function graph tracer: not enough memory\n");
1047} 1137}
1048 1138
1049static void graph_trace_close(struct trace_iterator *iter) 1139static void graph_trace_close(struct trace_iterator *iter)
1050{ 1140{
1051 free_percpu(iter->private); 1141 struct fgraph_data *data = iter->private;
1142
1143 if (data) {
1144 free_percpu(data->cpu_data);
1145 kfree(data);
1146 }
1052} 1147}
1053 1148
1054static struct tracer graph_trace __read_mostly = { 1149static struct tracer graph_trace __read_mostly = {
1055 .name = "function_graph", 1150 .name = "function_graph",
1056 .open = graph_trace_open, 1151 .open = graph_trace_open,
1152 .pipe_open = graph_trace_open,
1057 .close = graph_trace_close, 1153 .close = graph_trace_close,
1154 .pipe_close = graph_trace_close,
1058 .wait_pipe = poll_wait_pipe, 1155 .wait_pipe = poll_wait_pipe,
1059 .init = graph_trace_init, 1156 .init = graph_trace_init,
1060 .reset = graph_trace_reset, 1157 .reset = graph_trace_reset,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 69543a905cd..7b97000745f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -20,10 +20,10 @@
20 20
21#define BTS_BUFFER_SIZE (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
22 22
23static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25 25
26#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27 27
28static int trace_hw_branches_enabled __read_mostly; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly; 29static int trace_hw_branches_suspended __read_mostly;
@@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly;
32 32
33static void bts_trace_init_cpu(int cpu) 33static void bts_trace_init_cpu(int cpu)
34{ 34{
35 per_cpu(tracer, cpu) = 35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, 36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 NULL, (size_t)-1, BTS_KERNEL); 37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
38 39
39 if (IS_ERR(per_cpu(tracer, cpu))) 40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
40 per_cpu(tracer, cpu) = NULL; 41 per_cpu(hwb_tracer, cpu) = NULL;
41} 42}
42 43
43static int bts_trace_init(struct trace_array *tr) 44static int bts_trace_init(struct trace_array *tr)
@@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr)
51 for_each_online_cpu(cpu) { 52 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu); 53 bts_trace_init_cpu(cpu);
53 54
54 if (likely(per_cpu(tracer, cpu))) 55 if (likely(per_cpu(hwb_tracer, cpu)))
55 trace_hw_branches_enabled = 1; 56 trace_hw_branches_enabled = 1;
56 } 57 }
57 trace_hw_branches_suspended = 0; 58 trace_hw_branches_suspended = 0;
@@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr)
67 68
68 get_online_cpus(); 69 get_online_cpus();
69 for_each_online_cpu(cpu) { 70 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) { 71 if (likely(per_cpu(hwb_tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu)); 72 ds_release_bts(per_cpu(hwb_tracer, cpu));
72 per_cpu(tracer, cpu) = NULL; 73 per_cpu(hwb_tracer, cpu) = NULL;
73 } 74 }
74 } 75 }
75 trace_hw_branches_enabled = 0; 76 trace_hw_branches_enabled = 0;
@@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr)
83 84
84 get_online_cpus(); 85 get_online_cpus();
85 for_each_online_cpu(cpu) 86 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu))) 87 if (likely(per_cpu(hwb_tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu)); 88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
88 trace_hw_branches_suspended = 0; 89 trace_hw_branches_suspended = 0;
89 put_online_cpus(); 90 put_online_cpus();
90} 91}
@@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr)
95 96
96 get_online_cpus(); 97 get_online_cpus();
97 for_each_online_cpu(cpu) 98 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu))) 99 if (likely(per_cpu(hwb_tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu)); 100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
100 trace_hw_branches_suspended = 1; 101 trace_hw_branches_suspended = 1;
101 put_online_cpus(); 102 put_online_cpus();
102} 103}
@@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
114 bts_trace_init_cpu(cpu); 115 bts_trace_init_cpu(cpu);
115 116
116 if (trace_hw_branches_suspended && 117 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu))) 118 likely(per_cpu(hwb_tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu)); 119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
119 } 120 }
120 break; 121 break;
121 122
122 case CPU_DOWN_PREPARE: 123 case CPU_DOWN_PREPARE:
123 /* The notification is sent with interrupts enabled. */ 124 /* The notification is sent with interrupts enabled. */
124 if (likely(per_cpu(tracer, cpu))) { 125 if (likely(per_cpu(hwb_tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu)); 126 ds_release_bts(per_cpu(hwb_tracer, cpu));
126 per_cpu(tracer, cpu) = NULL; 127 per_cpu(hwb_tracer, cpu) = NULL;
127 } 128 }
128 } 129 }
129 130
@@ -258,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
258 259
259 get_online_cpus(); 260 get_online_cpus();
260 for_each_online_cpu(cpu) 261 for_each_online_cpu(cpu)
261 if (likely(per_cpu(tracer, cpu))) 262 if (likely(per_cpu(hwb_tracer, cpu)))
262 ds_suspend_bts(per_cpu(tracer, cpu)); 263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
263 /* 264 /*
264 * We need to collect the trace on the respective cpu since ftrace 265 * We need to collect the trace on the respective cpu since ftrace
265 * implicitly adds the record for the current cpu. 266 * implicitly adds the record for the current cpu.
@@ -268,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
268 on_each_cpu(trace_bts_cpu, iter->tr, 1); 269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
269 270
270 for_each_online_cpu(cpu) 271 for_each_online_cpu(cpu)
271 if (likely(per_cpu(tracer, cpu))) 272 if (likely(per_cpu(hwb_tracer, cpu)))
272 ds_resume_bts(per_cpu(tracer, cpu)); 273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
273 put_online_cpus(); 274 put_online_cpus();
274} 275}
275 276
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 3aa7eaa2114..2974bc7538c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,6 +151,8 @@ check_critical_timing(struct trace_array *tr,
151 goto out_unlock; 151 goto out_unlock;
152 152
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc);
154 156
155 if (data->critical_sequence != max_sequence) 157 if (data->critical_sequence != max_sequence)
156 goto out_unlock; 158 goto out_unlock;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aff5f80b59b..50b1b823980 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -282,6 +282,18 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri, 282static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs); 283 struct pt_regs *regs);
284 284
285/* Check the name is good for event/group */
286static int check_event_name(const char *name)
287{
288 if (!isalpha(*name) && *name != '_')
289 return 0;
290 while (*++name != '\0') {
291 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
292 return 0;
293 }
294 return 1;
295}
296
285/* 297/*
286 * Allocate new trace_probe and initialize it (including kprobes). 298 * Allocate new trace_probe and initialize it (including kprobes).
287 */ 299 */
@@ -293,10 +305,11 @@ static struct trace_probe *alloc_trace_probe(const char *group,
293 int nargs, int is_return) 305 int nargs, int is_return)
294{ 306{
295 struct trace_probe *tp; 307 struct trace_probe *tp;
308 int ret = -ENOMEM;
296 309
297 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); 310 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
298 if (!tp) 311 if (!tp)
299 return ERR_PTR(-ENOMEM); 312 return ERR_PTR(ret);
300 313
301 if (symbol) { 314 if (symbol) {
302 tp->symbol = kstrdup(symbol, GFP_KERNEL); 315 tp->symbol = kstrdup(symbol, GFP_KERNEL);
@@ -312,14 +325,20 @@ static struct trace_probe *alloc_trace_probe(const char *group,
312 else 325 else
313 tp->rp.kp.pre_handler = kprobe_dispatcher; 326 tp->rp.kp.pre_handler = kprobe_dispatcher;
314 327
315 if (!event) 328 if (!event || !check_event_name(event)) {
329 ret = -EINVAL;
316 goto error; 330 goto error;
331 }
332
317 tp->call.name = kstrdup(event, GFP_KERNEL); 333 tp->call.name = kstrdup(event, GFP_KERNEL);
318 if (!tp->call.name) 334 if (!tp->call.name)
319 goto error; 335 goto error;
320 336
321 if (!group) 337 if (!group || !check_event_name(group)) {
338 ret = -EINVAL;
322 goto error; 339 goto error;
340 }
341
323 tp->call.system = kstrdup(group, GFP_KERNEL); 342 tp->call.system = kstrdup(group, GFP_KERNEL);
324 if (!tp->call.system) 343 if (!tp->call.system)
325 goto error; 344 goto error;
@@ -330,7 +349,7 @@ error:
330 kfree(tp->call.name); 349 kfree(tp->call.name);
331 kfree(tp->symbol); 350 kfree(tp->symbol);
332 kfree(tp); 351 kfree(tp);
333 return ERR_PTR(-ENOMEM); 352 return ERR_PTR(ret);
334} 353}
335 354
336static void free_probe_arg(struct probe_arg *arg) 355static void free_probe_arg(struct probe_arg *arg)
@@ -606,23 +625,22 @@ static int create_trace_probe(int argc, char **argv)
606 */ 625 */
607 struct trace_probe *tp; 626 struct trace_probe *tp;
608 int i, ret = 0; 627 int i, ret = 0;
609 int is_return = 0; 628 int is_return = 0, is_delete = 0;
610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 629 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
611 unsigned long offset = 0; 630 unsigned long offset = 0;
612 void *addr = NULL; 631 void *addr = NULL;
613 char buf[MAX_EVENT_NAME_LEN]; 632 char buf[MAX_EVENT_NAME_LEN];
614 633
615 if (argc < 2) { 634 /* argc must be >= 1 */
616 pr_info("Probe point is not specified.\n");
617 return -EINVAL;
618 }
619
620 if (argv[0][0] == 'p') 635 if (argv[0][0] == 'p')
621 is_return = 0; 636 is_return = 0;
622 else if (argv[0][0] == 'r') 637 else if (argv[0][0] == 'r')
623 is_return = 1; 638 is_return = 1;
639 else if (argv[0][0] == '-')
640 is_delete = 1;
624 else { 641 else {
625 pr_info("Probe definition must be started with 'p' or 'r'.\n"); 642 pr_info("Probe definition must be started with 'p', 'r' or"
643 " '-'.\n");
626 return -EINVAL; 644 return -EINVAL;
627 } 645 }
628 646
@@ -642,14 +660,36 @@ static int create_trace_probe(int argc, char **argv)
642 return -EINVAL; 660 return -EINVAL;
643 } 661 }
644 } 662 }
663 if (!group)
664 group = KPROBE_EVENT_SYSTEM;
665
666 if (is_delete) {
667 if (!event) {
668 pr_info("Delete command needs an event name.\n");
669 return -EINVAL;
670 }
671 tp = find_probe_event(event, group);
672 if (!tp) {
673 pr_info("Event %s/%s doesn't exist.\n", group, event);
674 return -ENOENT;
675 }
676 /* delete an event */
677 unregister_trace_probe(tp);
678 free_trace_probe(tp);
679 return 0;
680 }
645 681
682 if (argc < 2) {
683 pr_info("Probe point is not specified.\n");
684 return -EINVAL;
685 }
646 if (isdigit(argv[1][0])) { 686 if (isdigit(argv[1][0])) {
647 if (is_return) { 687 if (is_return) {
648 pr_info("Return probe point must be a symbol.\n"); 688 pr_info("Return probe point must be a symbol.\n");
649 return -EINVAL; 689 return -EINVAL;
650 } 690 }
651 /* an address specified */ 691 /* an address specified */
652 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 692 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
653 if (ret) { 693 if (ret) {
654 pr_info("Failed to parse address.\n"); 694 pr_info("Failed to parse address.\n");
655 return ret; 695 return ret;
@@ -671,15 +711,13 @@ static int create_trace_probe(int argc, char **argv)
671 argc -= 2; argv += 2; 711 argc -= 2; argv += 2;
672 712
673 /* setup a probe */ 713 /* setup a probe */
674 if (!group)
675 group = KPROBE_EVENT_SYSTEM;
676 if (!event) { 714 if (!event) {
677 /* Make a new event name */ 715 /* Make a new event name */
678 if (symbol) 716 if (symbol)
679 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", 717 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
680 is_return ? 'r' : 'p', symbol, offset); 718 is_return ? 'r' : 'p', symbol, offset);
681 else 719 else
682 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", 720 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
683 is_return ? 'r' : 'p', addr); 721 is_return ? 'r' : 'p', addr);
684 event = buf; 722 event = buf;
685 } 723 }
@@ -1113,10 +1151,6 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1113 struct kprobe_trace_entry field; 1151 struct kprobe_trace_entry field;
1114 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1152 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1115 1153
1116 ret = trace_define_common_fields(event_call);
1117 if (!ret)
1118 return ret;
1119
1120 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1154 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1121 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); 1155 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1122 /* Set argument names as fields */ 1156 /* Set argument names as fields */
@@ -1131,10 +1165,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1131 struct kretprobe_trace_entry field; 1165 struct kretprobe_trace_entry field;
1132 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1166 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1133 1167
1134 ret = trace_define_common_fields(event_call);
1135 if (!ret)
1136 return ret;
1137
1138 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1168 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1139 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1169 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1140 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); 1170 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
@@ -1171,10 +1201,11 @@ static int __probe_event_show_format(struct trace_seq *s,
1171#undef SHOW_FIELD 1201#undef SHOW_FIELD
1172#define SHOW_FIELD(type, item, name) \ 1202#define SHOW_FIELD(type, item, name) \
1173 do { \ 1203 do { \
1174 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ 1204 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \
1175 "offset:%u;\tsize:%u;\n", name, \ 1205 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1176 (unsigned int)offsetof(typeof(field), item),\ 1206 (unsigned int)offsetof(typeof(field), item),\
1177 (unsigned int)sizeof(type)); \ 1207 (unsigned int)sizeof(type), \
1208 is_signed_type(type)); \
1178 if (!ret) \ 1209 if (!ret) \
1179 return 0; \ 1210 return 0; \
1180 } while (0) 1211 } while (0)
@@ -1434,7 +1465,6 @@ static int register_probe_event(struct trace_probe *tp)
1434 call->unregfunc = probe_event_disable; 1465 call->unregfunc = probe_event_disable;
1435 1466
1436#ifdef CONFIG_EVENT_PROFILE 1467#ifdef CONFIG_EVENT_PROFILE
1437 atomic_set(&call->profile_count, -1);
1438 call->profile_enable = probe_profile_enable; 1468 call->profile_enable = probe_profile_enable;
1439 call->profile_disable = probe_profile_disable; 1469 call->profile_disable = probe_profile_disable;
1440#endif 1470#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index ddfa0fd43bc..94103cdcf9d 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -26,12 +26,13 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27 27
28#include "trace_output.h" 28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h" 29#include "trace.h"
31 30
32#include <linux/hw_breakpoint.h> 31#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h> 32#include <asm/hw_breakpoint.h>
34 33
34#include <asm/atomic.h>
35
35/* 36/*
36 * For now, let us restrict the no. of symbols traced simultaneously to number 37 * For now, let us restrict the no. of symbols traced simultaneously to number
37 * of available hardware breakpoint registers. 38 * of available hardware breakpoint registers.
@@ -44,7 +45,7 @@ struct trace_ksym {
44 struct perf_event **ksym_hbp; 45 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr; 46 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER 47#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter; 48 atomic64_t counter;
48#endif 49#endif
49 struct hlist_node ksym_hlist; 50 struct hlist_node ksym_hlist;
50}; 51};
@@ -69,9 +70,8 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
69 70
70 rcu_read_lock(); 71 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { 72 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) && 73 if (entry->attr.bp_addr == hbp_hit_addr) {
73 (entry->counter <= MAX_UL_INT)) { 74 atomic64_inc(&entry->counter);
74 entry->counter++;
75 break; 75 break;
76 } 76 }
77 } 77 }
@@ -79,11 +79,12 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
79} 79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */ 80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81 81
82void ksym_hbp_handler(struct perf_event *hbp, void *data) 82void ksym_hbp_handler(struct perf_event *hbp, int nmi,
83 struct perf_sample_data *data,
84 struct pt_regs *regs)
83{ 85{
84 struct ring_buffer_event *event; 86 struct ring_buffer_event *event;
85 struct ksym_trace_entry *entry; 87 struct ksym_trace_entry *entry;
86 struct pt_regs *regs = data;
87 struct ring_buffer *buffer; 88 struct ring_buffer *buffer;
88 int pc; 89 int pc;
89 90
@@ -196,7 +197,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
196 entry->attr.bp_addr = addr; 197 entry->attr.bp_addr = addr;
197 entry->attr.bp_len = HW_BREAKPOINT_LEN_4; 198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
198 199
199 ret = -EAGAIN;
200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, 200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
201 ksym_hbp_handler); 201 ksym_hbp_handler);
202 202
@@ -235,7 +235,8 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
235 mutex_lock(&ksym_tracer_mutex); 235 mutex_lock(&ksym_tracer_mutex);
236 236
237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { 237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
238 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr); 238 ret = trace_seq_printf(s, "%pS:",
239 (void *)(unsigned long)entry->attr.bp_addr);
239 if (entry->attr.bp_type == HW_BREAKPOINT_R) 240 if (entry->attr.bp_type == HW_BREAKPOINT_R)
240 ret = trace_seq_puts(s, "r--\n"); 241 ret = trace_seq_puts(s, "r--\n");
241 else if (entry->attr.bp_type == HW_BREAKPOINT_W) 242 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
@@ -277,21 +278,20 @@ static ssize_t ksym_trace_filter_write(struct file *file,
277{ 278{
278 struct trace_ksym *entry; 279 struct trace_ksym *entry;
279 struct hlist_node *node; 280 struct hlist_node *node;
280 char *input_string, *ksymname = NULL; 281 char *buf, *input_string, *ksymname = NULL;
281 unsigned long ksym_addr = 0; 282 unsigned long ksym_addr = 0;
282 int ret, op, changed = 0; 283 int ret, op, changed = 0;
283 284
284 input_string = kzalloc(count + 1, GFP_KERNEL); 285 buf = kzalloc(count + 1, GFP_KERNEL);
285 if (!input_string) 286 if (!buf)
286 return -ENOMEM; 287 return -ENOMEM;
287 288
288 if (copy_from_user(input_string, buffer, count)) { 289 ret = -EFAULT;
289 kfree(input_string); 290 if (copy_from_user(buf, buffer, count))
290 return -EFAULT; 291 goto out;
291 }
292 input_string[count] = '\0';
293 292
294 strstrip(input_string); 293 buf[count] = '\0';
294 input_string = strstrip(buf);
295 295
296 /* 296 /*
297 * Clear all breakpoints if: 297 * Clear all breakpoints if:
@@ -302,15 +302,13 @@ static ssize_t ksym_trace_filter_write(struct file *file,
302 if (!input_string[0] || !strcmp(input_string, "0") || 302 if (!input_string[0] || !strcmp(input_string, "0") ||
303 !strcmp(input_string, "*:---")) { 303 !strcmp(input_string, "*:---")) {
304 __ksym_trace_reset(); 304 __ksym_trace_reset();
305 kfree(input_string); 305 ret = 0;
306 return count; 306 goto out;
307 } 307 }
308 308
309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); 309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
310 if (ret < 0) { 310 if (ret < 0)
311 kfree(input_string); 311 goto out;
312 return ret;
313 }
314 312
315 mutex_lock(&ksym_tracer_mutex); 313 mutex_lock(&ksym_tracer_mutex);
316 314
@@ -321,7 +319,7 @@ static ssize_t ksym_trace_filter_write(struct file *file,
321 if (entry->attr.bp_type != op) 319 if (entry->attr.bp_type != op)
322 changed = 1; 320 changed = 1;
323 else 321 else
324 goto out; 322 goto out_unlock;
325 break; 323 break;
326 } 324 }
327 } 325 }
@@ -336,28 +334,24 @@ static ssize_t ksym_trace_filter_write(struct file *file,
336 if (IS_ERR(entry->ksym_hbp)) 334 if (IS_ERR(entry->ksym_hbp))
337 ret = PTR_ERR(entry->ksym_hbp); 335 ret = PTR_ERR(entry->ksym_hbp);
338 else 336 else
339 goto out; 337 goto out_unlock;
340 } 338 }
341 /* Error or "symbol:---" case: drop it */ 339 /* Error or "symbol:---" case: drop it */
342 ksym_filter_entry_count--; 340 ksym_filter_entry_count--;
343 hlist_del_rcu(&(entry->ksym_hlist)); 341 hlist_del_rcu(&(entry->ksym_hlist));
344 synchronize_rcu(); 342 synchronize_rcu();
345 kfree(entry); 343 kfree(entry);
346 goto out; 344 goto out_unlock;
347 } else { 345 } else {
348 /* Check for malformed request: (4) */ 346 /* Check for malformed request: (4) */
349 if (op == 0) 347 if (op)
350 goto out; 348 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
351 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
352 } 349 }
353out: 350out_unlock:
354 mutex_unlock(&ksym_tracer_mutex); 351 mutex_unlock(&ksym_tracer_mutex);
355 352out:
356 kfree(input_string); 353 kfree(buf);
357 354 return !ret ? count : ret;
358 if (!ret)
359 ret = count;
360 return ret;
361} 355}
362 356
363static const struct file_operations ksym_tracing_fops = { 357static const struct file_operations ksym_tracing_fops = {
@@ -449,102 +443,77 @@ struct tracer ksym_tracer __read_mostly =
449 .print_line = ksym_trace_output 443 .print_line = ksym_trace_output
450}; 444};
451 445
452__init static int init_ksym_trace(void)
453{
454 struct dentry *d_tracer;
455 struct dentry *entry;
456
457 d_tracer = tracing_init_dentry();
458 ksym_filter_entry_count = 0;
459
460 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
461 NULL, &ksym_tracing_fops);
462 if (!entry)
463 pr_warning("Could not create debugfs "
464 "'ksym_trace_filter' file\n");
465
466 return register_tracer(&ksym_tracer);
467}
468device_initcall(init_ksym_trace);
469
470
471#ifdef CONFIG_PROFILE_KSYM_TRACER 446#ifdef CONFIG_PROFILE_KSYM_TRACER
472static int ksym_tracer_stat_headers(struct seq_file *m) 447static int ksym_profile_show(struct seq_file *m, void *v)
473{ 448{
449 struct hlist_node *node;
450 struct trace_ksym *entry;
451 int access_type = 0;
452 char fn_name[KSYM_NAME_LEN];
453
474 seq_puts(m, " Access Type "); 454 seq_puts(m, " Access Type ");
475 seq_puts(m, " Symbol Counter\n"); 455 seq_puts(m, " Symbol Counter\n");
476 seq_puts(m, " ----------- "); 456 seq_puts(m, " ----------- ");
477 seq_puts(m, " ------ -------\n"); 457 seq_puts(m, " ------ -------\n");
478 return 0;
479}
480 458
481static int ksym_tracer_stat_show(struct seq_file *m, void *v) 459 rcu_read_lock();
482{ 460 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
483 struct hlist_node *stat = v;
484 struct trace_ksym *entry;
485 int access_type = 0;
486 char fn_name[KSYM_NAME_LEN];
487 461
488 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); 462 access_type = entry->attr.bp_type;
489 463
490 access_type = entry->attr.bp_type; 464 switch (access_type) {
465 case HW_BREAKPOINT_R:
466 seq_puts(m, " R ");
467 break;
468 case HW_BREAKPOINT_W:
469 seq_puts(m, " W ");
470 break;
471 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
472 seq_puts(m, " RW ");
473 break;
474 default:
475 seq_puts(m, " NA ");
476 }
491 477
492 switch (access_type) { 478 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
493 case HW_BREAKPOINT_R: 479 seq_printf(m, " %-36s", fn_name);
494 seq_puts(m, " R "); 480 else
495 break; 481 seq_printf(m, " %-36s", "<NA>");
496 case HW_BREAKPOINT_W: 482 seq_printf(m, " %15llu\n",
497 seq_puts(m, " W "); 483 (unsigned long long)atomic64_read(&entry->counter));
498 break;
499 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
500 seq_puts(m, " RW ");
501 break;
502 default:
503 seq_puts(m, " NA ");
504 } 484 }
505 485 rcu_read_unlock();
506 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
507 seq_printf(m, " %-36s", fn_name);
508 else
509 seq_printf(m, " %-36s", "<NA>");
510 seq_printf(m, " %15lu\n", entry->counter);
511 486
512 return 0; 487 return 0;
513} 488}
514 489
515static void *ksym_tracer_stat_start(struct tracer_stat *trace) 490static int ksym_profile_open(struct inode *node, struct file *file)
516{ 491{
517 return ksym_filter_head.first; 492 return single_open(file, ksym_profile_show, NULL);
518} 493}
519 494
520static void * 495static const struct file_operations ksym_profile_fops = {
521ksym_tracer_stat_next(void *v, int idx) 496 .open = ksym_profile_open,
522{ 497 .read = seq_read,
523 struct hlist_node *stat = v; 498 .llseek = seq_lseek,
524 499 .release = single_release,
525 return stat->next;
526}
527
528static struct tracer_stat ksym_tracer_stats = {
529 .name = "ksym_tracer",
530 .stat_start = ksym_tracer_stat_start,
531 .stat_next = ksym_tracer_stat_next,
532 .stat_headers = ksym_tracer_stat_headers,
533 .stat_show = ksym_tracer_stat_show
534}; 500};
501#endif /* CONFIG_PROFILE_KSYM_TRACER */
535 502
536__init static int ksym_tracer_stat_init(void) 503__init static int init_ksym_trace(void)
537{ 504{
538 int ret; 505 struct dentry *d_tracer;
539 506
540 ret = register_stat_tracer(&ksym_tracer_stats); 507 d_tracer = tracing_init_dentry();
541 if (ret) {
542 printk(KERN_WARNING "Warning: could not register "
543 "ksym tracer stats\n");
544 return 1;
545 }
546 508
547 return 0; 509 trace_create_file("ksym_trace_filter", 0644, d_tracer,
510 NULL, &ksym_tracing_fops);
511
512#ifdef CONFIG_PROFILE_KSYM_TRACER
513 trace_create_file("ksym_profile", 0444, d_tracer,
514 NULL, &ksym_profile_fops);
515#endif
516
517 return register_tracer(&ksym_tracer);
548} 518}
549fs_initcall(ksym_tracer_stat_init); 519device_initcall(init_ksym_trace);
550#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b6c12c6a1bc..8e46b3323cd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 23
24static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
25 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s) 26int trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 int ret;
30
31 ret = seq_write(m, s->buffer, len);
29 32
30 seq_write(m, s->buffer, len); 33 /*
34 * Only reset this buffer if we successfully wrote to the
35 * seq_file buffer.
36 */
37 if (!ret)
38 trace_seq_init(s);
31 39
32 trace_seq_init(s); 40 return ret;
33} 41}
34 42
35enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 43enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -85,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
85 va_list ap; 93 va_list ap;
86 int ret; 94 int ret;
87 95
88 if (!len) 96 if (s->full || !len)
89 return 0; 97 return 0;
90 98
91 va_start(ap, fmt); 99 va_start(ap, fmt);
@@ -93,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
93 va_end(ap); 101 va_end(ap);
94 102
95 /* If we can't write it all, don't bother writing anything */ 103 /* If we can't write it all, don't bother writing anything */
96 if (ret >= len) 104 if (ret >= len) {
105 s->full = 1;
97 return 0; 106 return 0;
107 }
98 108
99 s->len += ret; 109 s->len += ret;
100 110
@@ -119,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
119 int len = (PAGE_SIZE - 1) - s->len; 129 int len = (PAGE_SIZE - 1) - s->len;
120 int ret; 130 int ret;
121 131
122 if (!len) 132 if (s->full || !len)
123 return 0; 133 return 0;
124 134
125 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 135 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
126 136
127 /* If we can't write it all, don't bother writing anything */ 137 /* If we can't write it all, don't bother writing anything */
128 if (ret >= len) 138 if (ret >= len) {
139 s->full = 1;
129 return 0; 140 return 0;
141 }
130 142
131 s->len += ret; 143 s->len += ret;
132 144
@@ -139,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
139 int len = (PAGE_SIZE - 1) - s->len; 151 int len = (PAGE_SIZE - 1) - s->len;
140 int ret; 152 int ret;
141 153
142 if (!len) 154 if (s->full || !len)
143 return 0; 155 return 0;
144 156
145 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 157 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
146 158
147 /* If we can't write it all, don't bother writing anything */ 159 /* If we can't write it all, don't bother writing anything */
148 if (ret >= len) 160 if (ret >= len) {
161 s->full = 1;
149 return 0; 162 return 0;
163 }
150 164
151 s->len += ret; 165 s->len += ret;
152 166
@@ -167,8 +181,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
167{ 181{
168 int len = strlen(str); 182 int len = strlen(str);
169 183
170 if (len > ((PAGE_SIZE - 1) - s->len)) 184 if (s->full)
185 return 0;
186
187 if (len > ((PAGE_SIZE - 1) - s->len)) {
188 s->full = 1;
171 return 0; 189 return 0;
190 }
172 191
173 memcpy(s->buffer + s->len, str, len); 192 memcpy(s->buffer + s->len, str, len);
174 s->len += len; 193 s->len += len;
@@ -178,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
178 197
179int trace_seq_putc(struct trace_seq *s, unsigned char c) 198int trace_seq_putc(struct trace_seq *s, unsigned char c)
180{ 199{
181 if (s->len >= (PAGE_SIZE - 1)) 200 if (s->full)
182 return 0; 201 return 0;
183 202
203 if (s->len >= (PAGE_SIZE - 1)) {
204 s->full = 1;
205 return 0;
206 }
207
184 s->buffer[s->len++] = c; 208 s->buffer[s->len++] = c;
185 209
186 return 1; 210 return 1;
@@ -188,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
188 212
189int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
190{ 214{
191 if (len > ((PAGE_SIZE - 1) - s->len)) 215 if (s->full)
192 return 0; 216 return 0;
193 217
218 if (len > ((PAGE_SIZE - 1) - s->len)) {
219 s->full = 1;
220 return 0;
221 }
222
194 memcpy(s->buffer + s->len, mem, len); 223 memcpy(s->buffer + s->len, mem, len);
195 s->len += len; 224 s->len += len;
196 225
@@ -203,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
203 const unsigned char *data = mem; 232 const unsigned char *data = mem;
204 int i, j; 233 int i, j;
205 234
235 if (s->full)
236 return 0;
237
206#ifdef __BIG_ENDIAN 238#ifdef __BIG_ENDIAN
207 for (i = 0, j = 0; i < len; i++) { 239 for (i = 0, j = 0; i < len; i++) {
208#else 240#else
@@ -220,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
220{ 252{
221 void *ret; 253 void *ret;
222 254
223 if (len > ((PAGE_SIZE - 1) - s->len)) 255 if (s->full)
256 return 0;
257
258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1;
224 return NULL; 260 return NULL;
261 }
225 262
226 ret = s->buffer + s->len; 263 ret = s->buffer + s->len;
227 s->len += len; 264 s->len += len;
@@ -233,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
233{ 270{
234 unsigned char *p; 271 unsigned char *p;
235 272
236 if (s->len >= (PAGE_SIZE - 1)) 273 if (s->full)
274 return 0;
275
276 if (s->len >= (PAGE_SIZE - 1)) {
277 s->full = 1;
237 return 0; 278 return 0;
279 }
280
238 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 281 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
239 if (!IS_ERR(p)) { 282 if (!IS_ERR(p)) {
240 p = mangle_path(s->buffer + s->len, p, "\n"); 283 p = mangle_path(s->buffer + s->len, p, "\n");
@@ -247,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
247 return 1; 290 return 1;
248 } 291 }
249 292
293 s->full = 1;
250 return 0; 294 return 0;
251} 295}
252 296
@@ -373,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
373 unsigned long vmstart = 0; 417 unsigned long vmstart = 0;
374 int ret = 1; 418 int ret = 1;
375 419
420 if (s->full)
421 return 0;
422
376 if (mm) { 423 if (mm) {
377 const struct vm_area_struct *vma; 424 const struct vm_area_struct *vma;
378 425
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 26185d72767..0271742abb8 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -28,8 +28,8 @@ static int wakeup_current_cpu;
28static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
29static int wakeup_rt; 29static int wakeup_rt;
30 30
31static raw_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void __wakeup_reset(struct trace_array *tr); 34static void __wakeup_reset(struct trace_array *tr);
35 35
@@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
143 goto out; 143 goto out;
144 144
145 local_irq_save(flags); 145 local_irq_save(flags);
146 __raw_spin_lock(&wakeup_lock); 146 arch_spin_lock(&wakeup_lock);
147 147
148 /* We could race with grabbing wakeup_lock */ 148 /* We could race with grabbing wakeup_lock */
149 if (unlikely(!tracer_enabled || next != wakeup_task)) 149 if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
169 169
170out_unlock: 170out_unlock:
171 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
172 __raw_spin_unlock(&wakeup_lock); 172 arch_spin_unlock(&wakeup_lock);
173 local_irq_restore(flags); 173 local_irq_restore(flags);
174out: 174out:
175 atomic_dec(&wakeup_trace->data[cpu]->disabled); 175 atomic_dec(&wakeup_trace->data[cpu]->disabled);
@@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr)
193 tracing_reset_online_cpus(tr); 193 tracing_reset_online_cpus(tr);
194 194
195 local_irq_save(flags); 195 local_irq_save(flags);
196 __raw_spin_lock(&wakeup_lock); 196 arch_spin_lock(&wakeup_lock);
197 __wakeup_reset(tr); 197 __wakeup_reset(tr);
198 __raw_spin_unlock(&wakeup_lock); 198 arch_spin_unlock(&wakeup_lock);
199 local_irq_restore(flags); 199 local_irq_restore(flags);
200} 200}
201 201
@@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
225 goto out; 225 goto out;
226 226
227 /* interrupts should be off from try_to_wake_up */ 227 /* interrupts should be off from try_to_wake_up */
228 __raw_spin_lock(&wakeup_lock); 228 arch_spin_lock(&wakeup_lock);
229 229
230 /* check for races. */ 230 /* check for races. */
231 if (!tracer_enabled || p->prio >= wakeup_prio) 231 if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 256
257out_locked: 257out_locked:
258 __raw_spin_unlock(&wakeup_lock); 258 arch_spin_unlock(&wakeup_lock);
259out: 259out:
260 atomic_dec(&wakeup_trace->data[cpu]->disabled); 260 atomic_dec(&wakeup_trace->data[cpu]->disabled);
261} 261}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index dc98309e839..280fea470d6 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
67 67
68 /* Don't allow flipping of max traces now */ 68 /* Don't allow flipping of max traces now */
69 local_irq_save(flags); 69 local_irq_save(flags);
70 __raw_spin_lock(&ftrace_max_lock); 70 arch_spin_lock(&ftrace_max_lock);
71 71
72 cnt = ring_buffer_entries(tr->buffer); 72 cnt = ring_buffer_entries(tr->buffer);
73 73
@@ -85,7 +85,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
85 break; 85 break;
86 } 86 }
87 tracing_on(); 87 tracing_on();
88 __raw_spin_unlock(&ftrace_max_lock); 88 arch_spin_unlock(&ftrace_max_lock);
89 local_irq_restore(flags); 89 local_irq_restore(flags);
90 90
91 if (count) 91 if (count)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8504ac71e4e..f4bc9b27de5 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = {
27}; 27};
28 28
29static unsigned long max_stack_size; 29static unsigned long max_stack_size;
30static raw_spinlock_t max_stack_lock = 30static arch_spinlock_t max_stack_lock =
31 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 31 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
32 32
33static int stack_trace_disabled __read_mostly; 33static int stack_trace_disabled __read_mostly;
34static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
@@ -54,7 +54,7 @@ static inline void check_stack(void)
54 return; 54 return;
55 55
56 local_irq_save(flags); 56 local_irq_save(flags);
57 __raw_spin_lock(&max_stack_lock); 57 arch_spin_lock(&max_stack_lock);
58 58
59 /* a race could have already updated it */ 59 /* a race could have already updated it */
60 if (this_size <= max_stack_size) 60 if (this_size <= max_stack_size)
@@ -103,7 +103,7 @@ static inline void check_stack(void)
103 } 103 }
104 104
105 out: 105 out:
106 __raw_spin_unlock(&max_stack_lock); 106 arch_spin_unlock(&max_stack_lock);
107 local_irq_restore(flags); 107 local_irq_restore(flags);
108} 108}
109 109
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
174 __raw_spin_lock(&max_stack_lock); 175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 __raw_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,8 +218,14 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
210 __raw_spin_lock(&max_stack_lock); 224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
213 return SEQ_START_TOKEN; 231 return SEQ_START_TOKEN;
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
220 __raw_spin_unlock(&max_stack_lock); 238 int cpu;
239
240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 57501d90096..75289f372dd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -217,10 +217,6 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
217 int i; 217 int i;
218 int offset = offsetof(typeof(trace), args); 218 int offset = offsetof(typeof(trace), args);
219 219
220 ret = trace_define_common_fields(call);
221 if (ret)
222 return ret;
223
224 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 220 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
225 if (ret) 221 if (ret)
226 return ret; 222 return ret;
@@ -241,10 +237,6 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
241 struct syscall_trace_exit trace; 237 struct syscall_trace_exit trace;
242 int ret; 238 int ret;
243 239
244 ret = trace_define_common_fields(call);
245 if (ret)
246 return ret;
247
248 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 240 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
249 if (ret) 241 if (ret)
250 return ret; 242 return ret;
@@ -333,10 +325,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
333 mutex_lock(&syscall_trace_lock); 325 mutex_lock(&syscall_trace_lock);
334 if (!sys_refcount_enter) 326 if (!sys_refcount_enter)
335 ret = register_trace_sys_enter(ftrace_syscall_enter); 327 ret = register_trace_sys_enter(ftrace_syscall_enter);
336 if (ret) { 328 if (!ret) {
337 pr_info("event trace: Could not activate"
338 "syscall entry trace point");
339 } else {
340 set_bit(num, enabled_enter_syscalls); 329 set_bit(num, enabled_enter_syscalls);
341 sys_refcount_enter++; 330 sys_refcount_enter++;
342 } 331 }
@@ -370,10 +359,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
370 mutex_lock(&syscall_trace_lock); 359 mutex_lock(&syscall_trace_lock);
371 if (!sys_refcount_exit) 360 if (!sys_refcount_exit)
372 ret = register_trace_sys_exit(ftrace_syscall_exit); 361 ret = register_trace_sys_exit(ftrace_syscall_exit);
373 if (ret) { 362 if (!ret) {
374 pr_info("event trace: Could not activate"
375 "syscall exit trace point");
376 } else {
377 set_bit(num, enabled_exit_syscalls); 363 set_bit(num, enabled_exit_syscalls);
378 sys_refcount_exit++; 364 sys_refcount_exit++;
379 } 365 }
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f6693969287..a7974a552ca 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -93,6 +93,7 @@ static const struct stacktrace_ops backtrace_ops = {
93 .warning_symbol = backtrace_warning_symbol, 93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack, 94 .stack = backtrace_stack,
95 .address = backtrace_address, 95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
96}; 97};
97 98
98static int 99static int
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 03e2d6fd9b1..eb27fd3430a 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -6,8 +6,6 @@
6 6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); 7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8 8
9#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
10
11/* 9/*
12 * Request a notification when the current cpu returns to userspace. Must be 10 * Request a notification when the current cpu returns to userspace. Must be
13 * called in atomic context. The notifier will also be called in atomic 11 * called in atomic context. The notifier will also be called in atomic
@@ -16,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
16void user_return_notifier_register(struct user_return_notifier *urn) 14void user_return_notifier_register(struct user_return_notifier *urn)
17{ 15{
18 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); 16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
19 hlist_add_head(&urn->link, &URN_LIST_HEAD); 17 hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
20} 18}
21EXPORT_SYMBOL_GPL(user_return_notifier_register); 19EXPORT_SYMBOL_GPL(user_return_notifier_register);
22 20
@@ -27,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
27void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
28{ 26{
29 hlist_del(&urn->link); 27 hlist_del(&urn->link);
30 if (hlist_empty(&URN_LIST_HEAD)) 28 if (hlist_empty(&__get_cpu_var(return_notifier_list)))
31 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); 29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
32} 30}
33EXPORT_SYMBOL_GPL(user_return_notifier_unregister); 31EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b6ae8..dee48658805 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
68#endif 68#endif
69}; 69};
70 70
71#ifdef CONFIG_DEBUG_OBJECTS_WORK
72
73static struct debug_obj_descr work_debug_descr;
74
75/*
76 * fixup_init is called when:
77 * - an active object is initialized
78 */
79static int work_fixup_init(void *addr, enum debug_obj_state state)
80{
81 struct work_struct *work = addr;
82
83 switch (state) {
84 case ODEBUG_STATE_ACTIVE:
85 cancel_work_sync(work);
86 debug_object_init(work, &work_debug_descr);
87 return 1;
88 default:
89 return 0;
90 }
91}
92
93/*
94 * fixup_activate is called when:
95 * - an active object is activated
96 * - an unknown object is activated (might be a statically initialized object)
97 */
98static int work_fixup_activate(void *addr, enum debug_obj_state state)
99{
100 struct work_struct *work = addr;
101
102 switch (state) {
103
104 case ODEBUG_STATE_NOTAVAILABLE:
105 /*
106 * This is not really a fixup. The work struct was
107 * statically initialized. We just make sure that it
108 * is tracked in the object tracker.
109 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr);
113 return 0;
114 }
115 WARN_ON_ONCE(1);
116 return 0;
117
118 case ODEBUG_STATE_ACTIVE:
119 WARN_ON(1);
120
121 default:
122 return 0;
123 }
124}
125
126/*
127 * fixup_free is called when:
128 * - an active object is freed
129 */
130static int work_fixup_free(void *addr, enum debug_obj_state state)
131{
132 struct work_struct *work = addr;
133
134 switch (state) {
135 case ODEBUG_STATE_ACTIVE:
136 cancel_work_sync(work);
137 debug_object_free(work, &work_debug_descr);
138 return 1;
139 default:
140 return 0;
141 }
142}
143
144static struct debug_obj_descr work_debug_descr = {
145 .name = "work_struct",
146 .fixup_init = work_fixup_init,
147 .fixup_activate = work_fixup_activate,
148 .fixup_free = work_fixup_free,
149};
150
151static inline void debug_work_activate(struct work_struct *work)
152{
153 debug_object_activate(work, &work_debug_descr);
154}
155
156static inline void debug_work_deactivate(struct work_struct *work)
157{
158 debug_object_deactivate(work, &work_debug_descr);
159}
160
161void __init_work(struct work_struct *work, int onstack)
162{
163 if (onstack)
164 debug_object_init_on_stack(work, &work_debug_descr);
165 else
166 debug_object_init(work, &work_debug_descr);
167}
168EXPORT_SYMBOL_GPL(__init_work);
169
170void destroy_work_on_stack(struct work_struct *work)
171{
172 debug_object_free(work, &work_debug_descr);
173}
174EXPORT_SYMBOL_GPL(destroy_work_on_stack);
175
176#else
177static inline void debug_work_activate(struct work_struct *work) { }
178static inline void debug_work_deactivate(struct work_struct *work) { }
179#endif
180
71/* Serializes the accesses to the list of workqueues. */ 181/* Serializes the accesses to the list of workqueues. */
72static DEFINE_SPINLOCK(workqueue_lock); 182static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 183static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
145{ 255{
146 unsigned long flags; 256 unsigned long flags;
147 257
258 debug_work_activate(work);
148 spin_lock_irqsave(&cwq->lock, flags); 259 spin_lock_irqsave(&cwq->lock, flags);
149 insert_work(cwq, work, &cwq->worklist); 260 insert_work(cwq, work, &cwq->worklist);
150 spin_unlock_irqrestore(&cwq->lock, flags); 261 spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
280 struct lockdep_map lockdep_map = work->lockdep_map; 391 struct lockdep_map lockdep_map = work->lockdep_map;
281#endif 392#endif
282 trace_workqueue_execution(cwq->thread, work); 393 trace_workqueue_execution(cwq->thread, work);
394 debug_work_deactivate(work);
283 cwq->current_work = work; 395 cwq->current_work = work;
284 list_del_init(cwq->worklist.next); 396 list_del_init(cwq->worklist.next);
285 spin_unlock_irq(&cwq->lock); 397 spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
350static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 462static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
351 struct wq_barrier *barr, struct list_head *head) 463 struct wq_barrier *barr, struct list_head *head)
352{ 464{
353 INIT_WORK(&barr->work, wq_barrier_func); 465 /*
466 * debugobject calls are safe here even with cwq->lock locked
467 * as we know for sure that this will not trigger any of the
468 * checks and call back into the fixup functions where we
469 * might deadlock.
470 */
471 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
354 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 472 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
355 473
356 init_completion(&barr->done); 474 init_completion(&barr->done);
357 475
476 debug_work_activate(&barr->work);
358 insert_work(cwq, &barr->work, head); 477 insert_work(cwq, &barr->work, head);
359} 478}
360 479
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
372 } 491 }
373 spin_unlock_irq(&cwq->lock); 492 spin_unlock_irq(&cwq->lock);
374 493
375 if (active) 494 if (active) {
376 wait_for_completion(&barr.done); 495 wait_for_completion(&barr.done);
496 destroy_work_on_stack(&barr.work);
497 }
377 498
378 return active; 499 return active;
379} 500}
@@ -451,6 +572,7 @@ out:
451 return 0; 572 return 0;
452 573
453 wait_for_completion(&barr.done); 574 wait_for_completion(&barr.done);
575 destroy_work_on_stack(&barr.work);
454 return 1; 576 return 1;
455} 577}
456EXPORT_SYMBOL_GPL(flush_work); 578EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
485 */ 607 */
486 smp_rmb(); 608 smp_rmb();
487 if (cwq == get_wq_data(work)) { 609 if (cwq == get_wq_data(work)) {
610 debug_work_deactivate(work);
488 list_del_init(&work->entry); 611 list_del_init(&work->entry);
489 ret = 1; 612 ret = 1;
490 } 613 }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
507 } 630 }
508 spin_unlock_irq(&cwq->lock); 631 spin_unlock_irq(&cwq->lock);
509 632
510 if (unlikely(running)) 633 if (unlikely(running)) {
511 wait_for_completion(&barr.done); 634 wait_for_completion(&barr.done);
635 destroy_work_on_stack(&barr.work);
636 }
512} 637}
513 638
514static void wait_on_work(struct work_struct *work) 639static void wait_on_work(struct work_struct *work)