19 files changed, 235 insertions, 103 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ca83b73fba19..0249f4be9b5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
                return -EFAULT;
        buffer[nbytes] = 0;     /* nul-terminate */
-        strstrip(buffer);
        if (cft->write_u64) {
-                u64 val = simple_strtoull(buffer, &end, 0);
+                u64 val = simple_strtoull(strstrip(buffer), &end, 0);
                if (*end)
                        return -EINVAL;
                retval = cft->write_u64(cgrp, cft, val);
        } else {
-                s64 val = simple_strtoll(buffer, &end, 0);
+                s64 val = simple_strtoll(strstrip(buffer), &end, 0);
                if (*end)
                        return -EINVAL;
                retval = cft->write_s64(cgrp, cft, val);
@@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
        }
        buffer[nbytes] = 0;     /* nul-terminate */
-        strstrip(buffer);
+        retval = cft->write_string(cgrp, cft, strstrip(buffer));
-        retval = cft->write_string(cgrp, cft, buffer);
        if (!retval)
                retval = nbytes;
 out:
diff --git a/kernel/exit.c b/kernel/exit.c
index e61891f80123..f7864ac2ecc1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -359,10 +359,8 @@ void __set_special_pids(struct pid *pid)
 {
        struct task_struct *curr = current->group_leader;
-        if (task_session(curr) != pid) {
+        if (task_session(curr) != pid)
                change_pid(curr, PIDTYPE_SID, pid);
-                proc_sid_connector(curr);
-        }
        if (task_pgrp(curr) != pid)
                change_pid(curr, PIDTYPE_PGID, pid);
diff --git a/kernel/futex.c b/kernel/futex.c
index 4949d336d88d..642f3bbaacc7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
 */
 static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
-        return (key1->both.word == key2->both.word
+        return (key1 && key2
+                && key1->both.word == key2->both.word
                && key1->both.ptr == key2->both.ptr
                && key1->both.offset == key2->both.offset);
 }
@@ -1028,7 +1029,6 @@ static inline
 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
                           struct futex_hash_bucket *hb)
 {
-        drop_futex_key_refs(&q->key);
        get_futex_key_refs(key);
        q->key = *key;
@@ -1226,6 +1226,7 @@ retry_private:
                 */
                if (ret == 1) {
                        WARN_ON(pi_state);
+                        drop_count++;
                        task_count++;
                        ret = get_futex_value_locked(&curval2, uaddr2);
                        if (!ret)
@@ -1304,6 +1305,7 @@ retry_private:
                        if (ret == 1) {
                                /* We got the lock. */
                                requeue_pi_wake_futex(this, &key2, hb2);
+                                drop_count++;
                                continue;
                        } else if (ret) {
                                /* -EDEADLK */
@@ -1791,6 +1793,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
                                             current->timer_slack_ns);
        }
+retry:
        /* Prepare to wait on uaddr. */
        ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
        if (ret)
@@ -1808,9 +1811,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
                goto out_put_key;
        /*
-         * We expect signal_pending(current), but another thread may
+         * We expect signal_pending(current), but we might be the
-         * have handled it for us already.
+         * victim of a spurious wakeup as well.
         */
+        if (!signal_pending(current)) {
+                put_futex_key(fshared, &q.key);
+                goto retry;
+        }
        ret = -ERESTARTSYS;
        if (!abs_time)
                goto out_put_key;
@@ -2118,9 +2126,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
                 */
                plist_del(&q->list, &q->list.plist);
+                /* Handle spurious wakeups gracefully */
+                ret = -EAGAIN;
                if (timeout && !timeout->task)
                        ret = -ETIMEDOUT;
-                else
+                else if (signal_pending(current))
                        ret = -ERESTARTNOINTR;
        }
        return ret;
@@ -2198,6 +2208,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
        debug_rt_mutex_init_waiter(&rt_waiter);
        rt_waiter.task = NULL;
+retry:
        key2 = FUTEX_KEY_INIT;
        ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
        if (unlikely(ret != 0))
@@ -2292,6 +2303,9 @@ out_put_keys:
 out_key2:
        put_futex_key(fshared, &key2);
+        /* Spurious wakeup ? */
+        if (ret == -EAGAIN)
+                goto retry;
 out:
        if (to) {
                hrtimer_cancel(&to->timer);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3815ac1d58b2..9af56723c096 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
 #ifdef CONFIG_LOCK_STAT
 static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
+static inline u64 lockstat_clock(void)
+{
+        return cpu_clock(smp_processor_id());
+}
 static int lock_point(unsigned long points[], unsigned long ip)
 {
        int i;
@@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip)
        return i;
 }
-static void lock_time_inc(struct lock_time *lt, s64 time)
+static void lock_time_inc(struct lock_time *lt, u64 time)
 {
        if (time > lt->max)
                lt->max = time;
@@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats)
 static void lock_release_holdtime(struct held_lock *hlock)
 {
        struct lock_class_stats *stats;
-        s64 holdtime;
+        u64 holdtime;
        if (!lock_stat)
                return;
-        holdtime = sched_clock() - hlock->holdtime_stamp;
+        holdtime = lockstat_clock() - hlock->holdtime_stamp;
        stats = get_lock_stats(hlock_class(hlock));
        if (hlock->read)
@@ -2792,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        hlock->references = references;
 #ifdef CONFIG_LOCK_STAT
        hlock->waittime_stamp = 0;
-        hlock->holdtime_stamp = sched_clock();
+        hlock->holdtime_stamp = lockstat_clock();
 #endif
        if (check == 2 && !mark_irqflags(curr, hlock))
@@ -3322,7 +3327,7 @@ found_it:
        if (hlock->instance != lock)
                return;
-        hlock->waittime_stamp = sched_clock();
+        hlock->waittime_stamp = lockstat_clock();
        contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
        contending_point = lock_point(hlock_class(hlock)->contending_point,
@@ -3345,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
        struct held_lock *hlock, *prev_hlock;
        struct lock_class_stats *stats;
        unsigned int depth;
-        u64 now;
+        u64 now, waittime = 0;
-        s64 waittime = 0;
        int i, cpu;
        depth = curr->lockdep_depth;
@@ -3374,7 +3378,7 @@ found_it:
        cpu = smp_processor_id();
        if (hlock->waittime_stamp) {
-                now = sched_clock();
+                now = lockstat_clock();
                waittime = now - hlock->waittime_stamp;
                hlock->holdtime_stamp = now;
        }
diff --git a/kernel/params.c b/kernel/params.c
index 9da58eabdcb2..d656c276508d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -218,15 +218,11 @@ int param_set_charp(const char *val, struct kernel_param *kp)
                return -ENOSPC;
        }
-        if (kp->flags & KPARAM_KMALLOCED)
-                kfree(*(char **)kp->arg);
        /* This is a hack.  We can't need to strdup in early boot, and we
         * don't need to; this mangled commandline is preserved. */
        if (slab_is_available()) {
-                kp->flags |= KPARAM_KMALLOCED;
                *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
-                if (!kp->arg)
+                if (!*(char **)kp->arg)
                        return -ENOMEM;
        } else
                *(const char **)kp->arg = val;
@@ -304,6 +300,7 @@ static int param_array(const char *name,
                       unsigned int min, unsigned int max,
                       void *elem, int elemsize,
                       int (*set)(const char *, struct kernel_param *kp),
+                       u16 flags,
                       unsigned int *num)
 {
        int ret;
@@ -313,6 +310,7 @@ static int param_array(const char *name,
        /* Get the name right for errors. */
        kp.name = name;
        kp.arg = elem;
+        kp.flags = flags;
        /* No equals sign? */
        if (!val) {
@@ -358,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp)
        unsigned int temp_num;
        return param_array(kp->name, val, 1, arr->max, arr->elem,
-                           arr->elemsize, arr->set, arr->num ?: &temp_num);
+                           arr->elemsize, arr->set, kp->flags,
+                           arr->num ?: &temp_num);
 }
 int param_array_get(char *buffer, struct kernel_param *kp)
@@ -605,11 +604,7 @@ void module_param_sysfs_remove(struct module *mod)
 void destroy_params(const struct kernel_param *params, unsigned num)
 {
-        unsigned int i;
+        /* FIXME: This should free kmalloced charp parameters.  It doesn't. */
-        for (i = 0; i < num; i++)
-                if (params[i].flags & KPARAM_KMALLOCED)
-                        kfree(*(char **)params[i].arg);
 }
 static void __init kernel_add_sysfs_param(const char *name,
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 17d8bb1acf9c..25596e450ac7 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -19,7 +19,7 @@
 * The time it takes is system-specific though, so when we test this
 * during system bootup we allow a LOT of time.
 */
-#define TEST_SUSPEND_SECONDS    5
+#define TEST_SUSPEND_SECONDS    10
 static unsigned long suspend_test_start_time;
@@ -49,7 +49,8 @@ void suspend_test_finish(const char *label)
         * has some performance issues.  The stack dump of a WARN_ON
         * is more likely to get the right attention than a printk...
         */
-        WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
+        WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
+             "Component: %s, time: %u\n", label, msec);
 }
 /*
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 705f02ac7433..0536125b0497 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -913,7 +913,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
                        spin_unlock(&rnp->lock); /* irqs remain disabled. */
                        break;
                }
-                rcu_preempt_offline_tasks(rsp, rnp, rdp);
+                /*
+                 * If there was a task blocking the current grace period,
+                 * and if all CPUs have checked in, we need to propagate
+                 * the quiescent state up the rcu_node hierarchy.  But that
+                 * is inconvenient at the moment due to deadlock issues if
+                 * this should end the current grace period.  So set the
+                 * offlined CPU's bit in ->qsmask in order to force the
+                 * next force_quiescent_state() invocation to clean up this
+                 * mess in a deadlock-free manner.
+                 */
+                if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
+                        rnp->qsmask |= mask;
                mask = rnp->grpmask;
                spin_unlock(&rnp->lock);        /* irqs remain disabled. */
                rnp = rnp->parent;
@@ -958,7 +971,7 @@ static void rcu_offline_cpu(int cpu)
 * Invoke any RCU callbacks that have made it to the end of their grace
 * period.  Thottle as specified by rdp->blimit.
 */
-static void rcu_do_batch(struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 {
        unsigned long flags;
        struct rcu_head *next, *list, **tail;
@@ -1011,6 +1024,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
        if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
                rdp->blimit = blimit;
+        /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
+        if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+                rdp->qlen_last_fqs_check = 0;
+                rdp->n_force_qs_snap = rsp->n_force_qs;
+        } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
+                rdp->qlen_last_fqs_check = rdp->qlen;
        local_irq_restore(flags);
        /* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -1224,7 +1244,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
        }
        /* If there are callbacks ready, invoke them. */
-        rcu_do_batch(rdp);
+        rcu_do_batch(rsp, rdp);
 }
 /*
@@ -1288,10 +1308,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
                rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
        }
-        /* Force the grace period if too many callbacks or too long waiting. */
+        /*
-        if (unlikely(++rdp->qlen > qhimark)) {
+         * Force the grace period if too many callbacks or too long waiting.
+         * Enforce hysteresis, and don't invoke force_quiescent_state()
+         * if some other CPU has recently done so.  Also, don't bother
+         * invoking force_quiescent_state() if the newly enqueued callback
+         * is the only one waiting for a grace period to complete.
+         */
+        if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
                rdp->blimit = LONG_MAX;
-                force_quiescent_state(rsp, 0);
+                if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+                    *rdp->nxttail[RCU_DONE_TAIL] != head)
+                        force_quiescent_state(rsp, 0);
+                rdp->n_force_qs_snap = rsp->n_force_qs;
+                rdp->qlen_last_fqs_check = rdp->qlen;
        } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
                force_quiescent_state(rsp, 1);
        local_irq_restore(flags);
@@ -1523,6 +1553,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
        rdp->beenonline = 1;     /* We have now been online. */
        rdp->preemptable = preemptable;
        rdp->passed_quiesc_completed = lastcomp - 1;
+        rdp->qlen_last_fqs_check = 0;
+        rdp->n_force_qs_snap = rsp->n_force_qs;
        rdp->blimit = blimit;
        spin_unlock(&rnp->lock);                /* irqs remain disabled. */
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index b40ac5706040..1823c6e20609 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -167,6 +167,10 @@ struct rcu_data {
        struct rcu_head *nxtlist;
        struct rcu_head **nxttail[RCU_NEXT_SIZE];
        long            qlen;           /* # of queued callbacks */
+        long            qlen_last_fqs_check;
+                                        /* qlen at last check for QS forcing */
+        unsigned long   n_force_qs_snap;
+                                        /* did other CPU force QS recently? */
        long            blimit;         /* Upper limit on a processed batch */
 #ifdef CONFIG_NO_HZ
@@ -302,9 +306,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-                                      struct rcu_node *rnp,
+                                     struct rcu_node *rnp,
-                                      struct rcu_data *rdp);
+                                     struct rcu_data *rdp);
 static void rcu_preempt_offline_cpu(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_preempt_check_callbacks(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c0cb783aa16a..ef2a58c2b9d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 * parent is to remove the need for rcu_read_unlock_special() to
 * make more than two attempts to acquire the target rcu_node's lock.
 *
+ * Returns 1 if there was previously a task blocking the current grace
+ * period on the specified rcu_node structure.
+ *
 * The caller must hold rnp->lock with irqs disabled.
 */
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-                                      struct rcu_node *rnp,
+                                     struct rcu_node *rnp,
-                                      struct rcu_data *rdp)
+                                     struct rcu_data *rdp)
 {
        int i;
        struct list_head *lp;
        struct list_head *lp_root;
+        int retval = rcu_preempted_readers(rnp);
        struct rcu_node *rnp_root = rcu_get_root(rsp);
        struct task_struct *tp;
        if (rnp == rnp_root) {
                WARN_ONCE(1, "Last CPU thought to be offlined?");
-                return;  /* Shouldn't happen: at least one CPU online. */
+                return 0;  /* Shouldn't happen: at least one CPU online. */
        }
        WARN_ON_ONCE(rnp != rdp->mynode &&
                     (!list_empty(&rnp->blocked_tasks[0]) ||
@@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
                        spin_unlock(&rnp_root->lock); /* irqs remain disabled */
                }
        }
+        return retval;
 }
 /*
@@ -393,6 +399,17 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 EXPORT_SYMBOL_GPL(call_rcu);
 /*
+ * Wait for an rcu-preempt grace period.  We are supposed to expedite the
+ * grace period, but this is the crude slow compatibility hack, so just
+ * invoke synchronize_rcu().
+ */
+void synchronize_rcu_expedited(void)
+{
+        synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+/*
 * Check to see if there is any immediate preemptable-RCU-related work
 * to be done.
 */
@@ -521,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 /*
 * Because preemptable RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections.
+ * tasks that were blocked within RCU read-side critical sections, and
+ * such non-existent tasks cannot possibly have been blocking the current
+ * grace period.
 */
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-                                      struct rcu_node *rnp,
+                                     struct rcu_node *rnp,
-                                      struct rcu_data *rdp)
+                                     struct rcu_data *rdp)
 {
+        return 0;
 }
 /*
@@ -565,6 +585,16 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 EXPORT_SYMBOL_GPL(call_rcu);
 /*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptable RCU does not exist, map to rcu-sched.
+ */
+void synchronize_rcu_expedited(void)
+{
+        synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+/*
 * Because preemptable RCU does not exist, it never has any work to do.
 */
 static int rcu_preempt_pending(int cpu)
diff --git a/kernel/sched.c b/kernel/sched.c
index 76c0e9691fc0..a455dca884a6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -676,6 +676,7 @@ inline void update_rq_clock(struct rq *rq)
 /**
 * runqueue_is_locked
+ * @cpu: the processor in question.
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
@@ -1563,11 +1564,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 #ifdef CONFIG_FAIR_GROUP_SCHED
-struct update_shares_data {
+static __read_mostly unsigned long *update_shares_data;
-        unsigned long rq_weight[NR_CPUS];
-};
-static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1577,12 +1574,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 static void update_group_shares_cpu(struct task_group *tg, int cpu,
                                    unsigned long sd_shares,
                                    unsigned long sd_rq_weight,
-                                    struct update_shares_data *usd)
+                                    unsigned long *usd_rq_weight)
 {
        unsigned long shares, rq_weight;
        int boost = 0;
-        rq_weight = usd->rq_weight[cpu];
+        rq_weight = usd_rq_weight[cpu];
        if (!rq_weight) {
                boost = 1;
                rq_weight = NICE_0_LOAD;
@@ -1617,7 +1614,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
 static int tg_shares_up(struct task_group *tg, void *data)
 {
        unsigned long weight, rq_weight = 0, shares = 0;
-        struct update_shares_data *usd;
+        unsigned long *usd_rq_weight;
        struct sched_domain *sd = data;
        unsigned long flags;
        int i;
@@ -1626,11 +1623,11 @@ static int tg_shares_up(struct task_group *tg, void *data)
                return 0;
        local_irq_save(flags);
-        usd = &__get_cpu_var(update_shares_data);
+        usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
        for_each_cpu(i, sched_domain_span(sd)) {
                weight = tg->cfs_rq[i]->load.weight;
-                usd->rq_weight[i] = weight;
+                usd_rq_weight[i] = weight;
                /*
                 * If there are currently no tasks on the cpu pretend there
@@ -1651,7 +1648,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
                shares = tg->shares;
        for_each_cpu(i, sched_domain_span(sd))
-                update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+                update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
        local_irq_restore(flags);
@@ -2311,7 +2308,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 {
        int cpu, orig_cpu, this_cpu, success = 0;
        unsigned long flags;
-        struct rq *rq;
+        struct rq *rq, *orig_rq;
        if (!sched_feat(SYNC_WAKEUPS))
                wake_flags &= ~WF_SYNC;
@@ -2319,7 +2316,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        this_cpu = get_cpu();
        smp_wmb();
-        rq = task_rq_lock(p, &flags);
+        rq = orig_rq = task_rq_lock(p, &flags);
        update_rq_clock(rq);
        if (!(p->state & state))
                goto out;
@@ -2350,6 +2347,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
                set_task_cpu(p, cpu);
        rq = task_rq_lock(p, &flags);
+        if (rq != orig_rq)
+                update_rq_clock(rq);
        WARN_ON(p->state != TASK_WAKING);
        cpu = task_cpu(p);
@@ -3656,6 +3657,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 /**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: The sched_domain whose statistics are to be updated.
 * @group: sched_group whose statistics are to be updated.
 * @this_cpu: Cpu for which load balance is currently performed.
 * @idle: Idle status of this_cpu
@@ -6718,9 +6720,6 @@ EXPORT_SYMBOL(yield);
 /*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
- *
- * But don't do that if it is a deliberate, throttling IO wait (this task
- * has set its backing_dev_info: the queue against which it should throttle)
 */
 void __sched io_schedule(void)
 {
@@ -9404,6 +9403,10 @@ void __init sched_init(void)
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_GROUP_SCHED */
+#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
+        update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
+                                            __alignof__(unsigned long));
+#endif
        for_each_possible_cpu(i) {
                struct rq *rq;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b47eeda..c32c3e643daa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -861,12 +861,21 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *se = __pick_next_entity(cfs_rq);
+        struct sched_entity *buddy;
-        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
+        if (cfs_rq->next) {
-                return cfs_rq->next;
+                buddy = cfs_rq->next;
+                cfs_rq->next = NULL;
+                if (wakeup_preempt_entity(buddy, se) < 1)
+                        return buddy;
+        }
-        if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
+        if (cfs_rq->last) {
-                return cfs_rq->last;
+                buddy = cfs_rq->last;
+                cfs_rq->last = NULL;
+                if (wakeup_preempt_entity(buddy, se) < 1)
+                        return buddy;
+        }
        return se;
 }
@@ -1654,16 +1663,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
        do {
                se = pick_next_entity(cfs_rq);
-                /*
-                 * If se was a buddy, clear it so that it will have to earn
-                 * the favour again.
-                 *
-                 * If se was not a buddy, clear the buddies because neither
-                 * was elegible to run, let them earn it again.
-                 *
-                 * IOW. unconditionally clear buddies.
-                 */
-                __clear_buddies(cfs_rq, NULL);
                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
diff --git a/kernel/sys.c b/kernel/sys.c
index 255475d163e0..ce17760d9c51 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1110,6 +1110,8 @@ SYSCALL_DEFINE0(setsid)
        err = session;
 out:
        write_unlock_irq(&tasklist_lock);
+        if (err > 0)
+                proc_sid_connector(group_leader);
        return err;
 }
@@ -1546,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                        if (arg4 | arg5)
                                return -EINVAL;
                        switch (arg2) {
-                        case 0:
+                        case PR_MCE_KILL_CLEAR:
                                if (arg3 != 0)
                                        return -EINVAL;
                                current->flags &= ~PF_MCE_PROCESS;
                                break;
-                        case 1:
+                        case PR_MCE_KILL_SET:
                                current->flags |= PF_MCE_PROCESS;
-                                if (arg3 != 0)
+                                if (arg3 == PR_MCE_KILL_EARLY)
                                        current->flags |= PF_MCE_EARLY;
-                                else
+                                else if (arg3 == PR_MCE_KILL_LATE)
                                        current->flags &= ~PF_MCE_EARLY;
+                                else if (arg3 == PR_MCE_KILL_DEFAULT)
+                                        current->flags &=
+                                                ~(PF_MCE_EARLY|PF_MCE_PROCESS);
+                                else
+                                        return -EINVAL;
                                break;
                        default:
                                return -EINVAL;
                        }
                        error = 0;
                        break;
+                case PR_MCE_KILL_GET:
+                        if (arg2 | arg3 | arg4 | arg5)
+                                return -EINVAL;
+                        if (current->flags & PF_MCE_PROCESS)
+                                error = (current->flags & PF_MCE_EARLY) ?
+                                        PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+                        else
+                                error = PR_MCE_KILL_DEFAULT;
+                        break;
                default:
                        error = -EINVAL;
                        break;
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b38423ca711a..b6e7aaea4604 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
                        if (!table->ctl_name && table->strategy)
                                set_fail(&fail, table, "Strategy without ctl_name");
 #endif
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
                        if (table->procname && !table->proc_handler)
                                set_fail(&fail, table, "No proc_handler");
 #endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 37ba67e33265..9c451a1930b6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -740,7 +740,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 out:
        mutex_unlock(&ftrace_profile_lock);
-        filp->f_pos += cnt;
+        *ppos += cnt;
        return cnt;
 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d4ff01970547..3ffa502fb243 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -483,7 +483,7 @@ struct ring_buffer_iter {
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
-static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+static inline u64 rb_time_stamp(struct ring_buffer *buffer)
 {
        /* shift to debug/test normalization and TIME_EXTENTS */
        return buffer->clock() << DEBUG_SHIFT;
@@ -494,7 +494,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
        u64 time;
        preempt_disable_notrace();
-        time = rb_time_stamp(buffer, cpu);
+        time = rb_time_stamp(buffer);
        preempt_enable_no_resched_notrace();
        return time;
@@ -599,7 +599,7 @@ static struct list_head *rb_list_head(struct list_head *list)
 }
 /*
- * rb_is_head_page - test if the give page is the head page
+ * rb_is_head_page - test if the given page is the head page
 *
 * Because the reader may move the head_page pointer, we can
 * not trust what the head page is (it may be pointing to
@@ -1868,7 +1868,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
                 * Nested commits always have zero deltas, so
                 * just reread the time stamp
                 */
-                *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
+                *ts = rb_time_stamp(buffer);
                next_page->page->time_stamp = *ts;
        }
@@ -2111,7 +2111,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
                goto out_fail;
-        ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+        ts = rb_time_stamp(cpu_buffer->buffer);
        /*
         * Only the first commit can update the timestamp.
@@ -2681,7 +2681,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 EXPORT_SYMBOL_GPL(ring_buffer_entries);
 /**
- * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * ring_buffer_overruns - get the number of overruns in buffer
 * @buffer: The ring buffer
 *
 * Returns the total number of overruns in the ring buffer
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45068269ebb1..b20d3ec75de9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1393,7 +1393,7 @@ int trace_array_vprintk(struct trace_array *tr,
 int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 {
-        return trace_array_printk(&global_trace, ip, fmt, args);
+        return trace_array_vprintk(&global_trace, ip, fmt, args);
 }
 EXPORT_SYMBOL_GPL(trace_vprintk);
@@ -2440,7 +2440,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                        return ret;
        }
-        filp->f_pos += cnt;
+        *ppos += cnt;
        return cnt;
 }
@@ -2582,7 +2582,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
        }
        mutex_unlock(&trace_types_lock);
-        filp->f_pos += cnt;
+        *ppos += cnt;
        return cnt;
 }
@@ -2764,7 +2764,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
        if (err)
                return err;
-        filp->f_pos += ret;
+        *ppos += ret;
        return ret;
 }
@@ -3299,7 +3299,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
                }
        }
-        filp->f_pos += cnt;
+        *ppos += cnt;
        /* If check pages failed, return ENOMEM */
        if (tracing_disabled)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 23245785927f..98a6cc5c64ed 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -933,8 +933,9 @@ static void postfix_clear(struct filter_parse_state *ps)
        while (!list_empty(&ps->postfix)) {
                elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
-                kfree(elt->operand);
                list_del(&elt->list);
+                kfree(elt->operand);
+                kfree(elt);
        }
 }
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed17565826b0..b6c12c6a1bcd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
 * @s: trace sequence descriptor
 * @fmt: printf format string
 *
+ * Returns 0 if the formatted trace does not fit in the buffer's
+ * remaining free space, 1 otherwise.
+ *
 * The tracer may use either sequence operations or its own
 * copy to user routines. To simplify formating of a trace
 * trace_seq_printf is used to store strings into a special
@@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
        s->len += ret;
-        return len;
+        return 1;
 }
 EXPORT_SYMBOL_GPL(trace_seq_printf);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index addfe2df93b1..12328147132c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork,
 EXPORT_SYMBOL(schedule_delayed_work);
 /**
+ * flush_delayed_work - block until a delayed_work's callback has terminated
+ * @dwork: the delayed work which is to be flushed
+ *
+ * Any timeout is cancelled, and any pending work is run immediately.
+ *
+ * When del_timer_sync() on @dwork's timer returns nonzero, the work item
+ * is queued right away with __queue_work() on the keventd_wq per-cpu
+ * queue selected by get_cpu(), and put_cpu() follows once it is queued.
+ * flush_work() is then called on the work item in every case.
+ *
+ * NOTE(review): the requeue always targets keventd_wq; confirm that
+ * callers only pass work that was scheduled on the global workqueue.
+ */
+void flush_delayed_work(struct delayed_work *dwork)
+{
+        if (del_timer_sync(&dwork->timer)) {
+                struct cpu_workqueue_struct *cwq;
+                cwq = wq_per_cpu(keventd_wq, get_cpu());
+                __queue_work(cwq, &dwork->work);
+                put_cpu();
+        }
+        flush_work(&dwork->work);
+}
+EXPORT_SYMBOL(flush_delayed_work);
+/**
 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
 * @cpu: cpu to use
 * @dwork: job to be done
@@ -667,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
        int cpu;
+        int orig = -1;
        struct work_struct *works;
        works = alloc_percpu(struct work_struct);
        if (!works)
                return -ENOMEM;
+        /*
+         * When running in keventd, don't schedule a work item on itself.
+         * The function can be called directly because the work queue is
+         * already bound; this is also faster.
+         * TODO: make this a generic parameter for other workqueues?
+         */
+        if (current_is_keventd()) {
+                orig = raw_smp_processor_id();
+                INIT_WORK(per_cpu_ptr(works, orig), func);
+                func(per_cpu_ptr(works, orig));
+        }
        get_online_cpus();
        for_each_online_cpu(cpu) {
                struct work_struct *work = per_cpu_ptr(works, cpu);
+                if (cpu == orig)
+                        continue;
                INIT_WORK(work, func);
                schedule_work_on(cpu, work);
        }
-        for_each_online_cpu(cpu)
+        for_each_online_cpu(cpu) {
-                flush_work(per_cpu_ptr(works, cpu));
+                if (cpu != orig)
+                        flush_work(per_cpu_ptr(works, cpu));
+        }
        put_online_cpus();
        free_percpu(works);
        return 0;