Diffstat (limited to 'kernel')
-rw-r--r--  kernel/exit.c              17
-rw-r--r--  kernel/fork.c               1
-rw-r--r--  kernel/futex.c             31
-rw-r--r--  kernel/irq/proc.c           2
-rw-r--r--  kernel/latencytop.c        17
-rw-r--r--  kernel/perf_event.c        30
-rw-r--r--  kernel/pm_qos_params.c      4
-rw-r--r--  kernel/power/hibernate.c   22
-rw-r--r--  kernel/power/suspend.c      5
-rw-r--r--  kernel/power/user.c         4
-rw-r--r--  kernel/printk.c             4
-rw-r--r--  kernel/ptrace.c             2
-rw-r--r--  kernel/sched.c            181
-rw-r--r--  kernel/smp.c               30
-rw-r--r--  kernel/sys.c                3
-rw-r--r--  kernel/timer.c              8
-rw-r--r--  kernel/trace/trace.c       10
-rw-r--r--  kernel/user.c               1
-rw-r--r--  kernel/watchdog.c          26
19 files changed, 314 insertions, 84 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index b9d3bc6c21ec..ad7e51488291 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -97,6 +97,14 @@ static void __exit_signal(struct task_struct *tsk) | |||
97 | sig->tty = NULL; | 97 | sig->tty = NULL; |
98 | } else { | 98 | } else { |
99 | /* | 99 | /* |
100 | * This can only happen if the caller is de_thread(). | ||
101 | * FIXME: this is a temporary hack; we should teach | ||
102 | * posix-cpu-timers to handle this case correctly. | ||
103 | */ | ||
104 | if (unlikely(has_group_leader_pid(tsk))) | ||
105 | posix_cpu_timers_exit_group(tsk); | ||
106 | |||
107 | /* | ||
100 | * If there is any task waiting for the group exit | 108 | * If there is any task waiting for the group exit |
101 | * then notify it: | 109 | * then notify it: |
102 | */ | 110 | */ |
@@ -905,6 +913,15 @@ NORET_TYPE void do_exit(long code) | |||
905 | if (unlikely(!tsk->pid)) | 913 | if (unlikely(!tsk->pid)) |
906 | panic("Attempted to kill the idle task!"); | 914 | panic("Attempted to kill the idle task!"); |
907 | 915 | ||
916 | /* | ||
917 | * If do_exit is called because this process oopsed, it's possible | ||
918 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before | ||
919 | * continuing. Amongst other possible reasons, this is to prevent | ||
920 | * mm_release()->clear_child_tid() from writing to a user-controlled | ||
921 | * kernel address. | ||
922 | */ | ||
923 | set_fs(USER_DS); | ||
924 | |||
908 | tracehook_report_exit(&code); | 925 | tracehook_report_exit(&code); |
909 | 926 | ||
910 | validate_creds_for_do_exit(tsk); | 927 | validate_creds_for_do_exit(tsk); |
diff --git a/kernel/fork.c b/kernel/fork.c
index ab7f29d906c7..d01a7514125b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -279,6 +279,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
279 | 279 | ||
280 | setup_thread_stack(tsk, orig); | 280 | setup_thread_stack(tsk, orig); |
281 | clear_user_return_notifier(tsk); | 281 | clear_user_return_notifier(tsk); |
282 | clear_tsk_need_resched(tsk); | ||
282 | stackend = end_of_stack(tsk); | 283 | stackend = end_of_stack(tsk); |
283 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | 284 | *stackend = STACK_END_MAGIC; /* for overflow detection */ |
284 | 285 | ||
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..e328f574c97c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1363,7 +1363,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | |||
1363 | { | 1363 | { |
1364 | struct futex_hash_bucket *hb; | 1364 | struct futex_hash_bucket *hb; |
1365 | 1365 | ||
1366 | get_futex_key_refs(&q->key); | ||
1367 | hb = hash_futex(&q->key); | 1366 | hb = hash_futex(&q->key); |
1368 | q->lock_ptr = &hb->lock; | 1367 | q->lock_ptr = &hb->lock; |
1369 | 1368 | ||
@@ -1375,7 +1374,6 @@ static inline void | |||
1375 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | 1374 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
1376 | { | 1375 | { |
1377 | spin_unlock(&hb->lock); | 1376 | spin_unlock(&hb->lock); |
1378 | drop_futex_key_refs(&q->key); | ||
1379 | } | 1377 | } |
1380 | 1378 | ||
1381 | /** | 1379 | /** |
@@ -1480,8 +1478,6 @@ static void unqueue_me_pi(struct futex_q *q) | |||
1480 | q->pi_state = NULL; | 1478 | q->pi_state = NULL; |
1481 | 1479 | ||
1482 | spin_unlock(q->lock_ptr); | 1480 | spin_unlock(q->lock_ptr); |
1483 | |||
1484 | drop_futex_key_refs(&q->key); | ||
1485 | } | 1481 | } |
1486 | 1482 | ||
1487 | /* | 1483 | /* |
@@ -1812,7 +1808,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, | |||
1812 | } | 1808 | } |
1813 | 1809 | ||
1814 | retry: | 1810 | retry: |
1815 | /* Prepare to wait on uaddr. */ | 1811 | /* |
1812 | * Prepare to wait on uaddr. On success, holds hb lock and increments | ||
1813 | * q.key refs. | ||
1814 | */ | ||
1816 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 1815 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
1817 | if (ret) | 1816 | if (ret) |
1818 | goto out; | 1817 | goto out; |
@@ -1822,24 +1821,23 @@ retry: | |||
1822 | 1821 | ||
1823 | /* If we were woken (and unqueued), we succeeded, whatever. */ | 1822 | /* If we were woken (and unqueued), we succeeded, whatever. */ |
1824 | ret = 0; | 1823 | ret = 0; |
1824 | /* unqueue_me() drops q.key ref */ | ||
1825 | if (!unqueue_me(&q)) | 1825 | if (!unqueue_me(&q)) |
1826 | goto out_put_key; | 1826 | goto out; |
1827 | ret = -ETIMEDOUT; | 1827 | ret = -ETIMEDOUT; |
1828 | if (to && !to->task) | 1828 | if (to && !to->task) |
1829 | goto out_put_key; | 1829 | goto out; |
1830 | 1830 | ||
1831 | /* | 1831 | /* |
1832 | * We expect signal_pending(current), but we might be the | 1832 | * We expect signal_pending(current), but we might be the |
1833 | * victim of a spurious wakeup as well. | 1833 | * victim of a spurious wakeup as well. |
1834 | */ | 1834 | */ |
1835 | if (!signal_pending(current)) { | 1835 | if (!signal_pending(current)) |
1836 | put_futex_key(fshared, &q.key); | ||
1837 | goto retry; | 1836 | goto retry; |
1838 | } | ||
1839 | 1837 | ||
1840 | ret = -ERESTARTSYS; | 1838 | ret = -ERESTARTSYS; |
1841 | if (!abs_time) | 1839 | if (!abs_time) |
1842 | goto out_put_key; | 1840 | goto out; |
1843 | 1841 | ||
1844 | restart = ¤t_thread_info()->restart_block; | 1842 | restart = ¤t_thread_info()->restart_block; |
1845 | restart->fn = futex_wait_restart; | 1843 | restart->fn = futex_wait_restart; |
@@ -1856,8 +1854,6 @@ retry: | |||
1856 | 1854 | ||
1857 | ret = -ERESTART_RESTARTBLOCK; | 1855 | ret = -ERESTART_RESTARTBLOCK; |
1858 | 1856 | ||
1859 | out_put_key: | ||
1860 | put_futex_key(fshared, &q.key); | ||
1861 | out: | 1857 | out: |
1862 | if (to) { | 1858 | if (to) { |
1863 | hrtimer_cancel(&to->timer); | 1859 | hrtimer_cancel(&to->timer); |
@@ -2236,7 +2232,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2236 | q.rt_waiter = &rt_waiter; | 2232 | q.rt_waiter = &rt_waiter; |
2237 | q.requeue_pi_key = &key2; | 2233 | q.requeue_pi_key = &key2; |
2238 | 2234 | ||
2239 | /* Prepare to wait on uaddr. */ | 2235 | /* |
2236 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref | ||
2237 | * count. | ||
2238 | */ | ||
2240 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 2239 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
2241 | if (ret) | 2240 | if (ret) |
2242 | goto out_key2; | 2241 | goto out_key2; |
@@ -2254,7 +2253,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2254 | * In order for us to be here, we know our q.key == key2, and since | 2253 | * In order for us to be here, we know our q.key == key2, and since |
2255 | * we took the hb->lock above, we also know that futex_requeue() has | 2254 | * we took the hb->lock above, we also know that futex_requeue() has |
2256 | * completed and we no longer have to concern ourselves with a wakeup | 2255 | * completed and we no longer have to concern ourselves with a wakeup |
2257 | * race with the atomic proxy lock acquition by the requeue code. | 2256 | * race with the atomic proxy lock acquisition by the requeue code. The |
2257 | * futex_requeue dropped our key1 reference and incremented our key2 | ||
2258 | * reference count. | ||
2258 | */ | 2259 | */ |
2259 | 2260 | ||
2260 | /* Check if the requeue code acquired the second futex for us. */ | 2261 | /* Check if the requeue code acquired the second futex for us. */ |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd2..345e0b75fe1e 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) | |||
214 | 214 | ||
215 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) | 215 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) |
216 | { | 216 | { |
217 | return single_open(file, irq_spurious_proc_show, NULL); | 217 | return single_open(file, irq_spurious_proc_show, PDE(inode)->data); |
218 | } | 218 | } |
219 | 219 | ||
220 | static const struct file_operations irq_spurious_proc_fops = { | 220 | static const struct file_operations irq_spurious_proc_fops = { |
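For context on the irq/proc.c change: single_open() stores its third argument in seq_file->private, which is what the show routine reads back, and PDE(inode)->data is whatever pointer was registered with proc_create_data(), so passing NULL left the show routine without its per-IRQ data. Below is a minimal sketch of the same pattern with an invented /proc entry and cookie (names such as demo_show and "single_open_demo" are illustrative, not the kernel's irq code), written against the proc/seq_file API of that era.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static char demo_cookie[] = "hello";

static int demo_show(struct seq_file *m, void *v)
{
	/* the pointer handed to single_open() comes back as m->private */
	seq_printf(m, "cookie=%s\n", (char *)m->private);
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	/* PDE(inode)->data is whatever proc_create_data() was given */
	return single_open(file, demo_show, PDE(inode)->data);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init demo_init(void)
{
	proc_create_data("single_open_demo", 0444, NULL, &demo_fops, demo_cookie);
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("single_open_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");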
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..17110a4a4fc2 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
194 | 194 | ||
195 | account_global_scheduler_latency(tsk, &lat); | 195 | account_global_scheduler_latency(tsk, &lat); |
196 | 196 | ||
197 | /* | 197 | for (i = 0; i < tsk->latency_record_count; i++) { |
198 | * short term hack; if we're > 32 we stop; future we recycle: | ||
199 | */ | ||
200 | tsk->latency_record_count++; | ||
201 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
202 | goto out_unlock; | ||
203 | |||
204 | for (i = 0; i < LT_SAVECOUNT; i++) { | ||
205 | struct latency_record *mylat; | 198 | struct latency_record *mylat; |
206 | int same = 1; | 199 | int same = 1; |
207 | 200 | ||
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
227 | } | 220 | } |
228 | } | 221 | } |
229 | 222 | ||
223 | /* | ||
224 | * short term hack; if we're > 32 we stop; future we recycle: | ||
225 | */ | ||
226 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
227 | goto out_unlock; | ||
228 | |||
230 | /* Allocated a new one: */ | 229 | /* Allocated a new one: */ |
231 | i = tsk->latency_record_count; | 230 | i = tsk->latency_record_count++; |
232 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); | 231 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); |
233 | 232 | ||
234 | out_unlock: | 233 | out_unlock: |
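The latencytop hunks above reorder the bookkeeping: the scan now walks only the records that actually exist, and the count is bumped only when a genuinely new slot is consumed (the old code incremented the count up front and then iterated over all LT_SAVECOUNT slots, used or not). A small userspace sketch of the corrected find-or-insert logic follows; the struct fields and the "key" stand-in for the saved backtrace are invented for illustration.

#include <stdio.h>

#define LT_SAVECOUNT 32

struct latency_record {
	unsigned long key;	/* stands in for the saved backtrace */
	unsigned int count;
	unsigned long time;
};

static struct latency_record records[LT_SAVECOUNT];
static int record_count;

static void account_latency(unsigned long key, unsigned long usecs)
{
	int i;

	/* merge into an existing record if the backtrace matches */
	for (i = 0; i < record_count; i++) {
		if (records[i].key == key) {
			records[i].count++;
			records[i].time += usecs;
			return;
		}
	}

	/* short term hack, as in the kernel: stop once the table is full */
	if (record_count >= LT_SAVECOUNT)
		return;

	/* allocate a new slot; the post-increment mirrors the fixed kernel code */
	i = record_count++;
	records[i].key = key;
	records[i].count = 1;
	records[i].time = usecs;
}

int main(void)
{
	account_latency(0xdead, 100);
	account_latency(0xdead, 50);
	account_latency(0xbeef, 10);
	printf("%d records, first seen %u times\n", record_count, records[0].count);
	return 0;
}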
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b98bed3d8182..65b09a836cc3 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1620,8 +1620,12 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
1620 | { | 1620 | { |
1621 | raw_spin_lock(&ctx->lock); | 1621 | raw_spin_lock(&ctx->lock); |
1622 | 1622 | ||
1623 | /* Rotate the first entry last of non-pinned groups */ | 1623 | /* |
1624 | list_rotate_left(&ctx->flexible_groups); | 1624 | * Rotate the first entry last of non-pinned groups. Rotation might be |
1625 | * disabled by the inheritance code. | ||
1626 | */ | ||
1627 | if (!ctx->rotate_disable) | ||
1628 | list_rotate_left(&ctx->flexible_groups); | ||
1625 | 1629 | ||
1626 | raw_spin_unlock(&ctx->lock); | 1630 | raw_spin_unlock(&ctx->lock); |
1627 | } | 1631 | } |
@@ -1773,7 +1777,13 @@ static u64 perf_event_read(struct perf_event *event) | |||
1773 | unsigned long flags; | 1777 | unsigned long flags; |
1774 | 1778 | ||
1775 | raw_spin_lock_irqsave(&ctx->lock, flags); | 1779 | raw_spin_lock_irqsave(&ctx->lock, flags); |
1776 | update_context_time(ctx); | 1780 | /* |
1781 | * may read while context is not active | ||
1782 | * (e.g., thread is blocked), in that case | ||
1783 | * we cannot update context time | ||
1784 | */ | ||
1785 | if (ctx->is_active) | ||
1786 | update_context_time(ctx); | ||
1777 | update_event_times(event); | 1787 | update_event_times(event); |
1778 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 1788 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1779 | } | 1789 | } |
@@ -5616,6 +5626,7 @@ int perf_event_init_task(struct task_struct *child) | |||
5616 | struct perf_event *event; | 5626 | struct perf_event *event; |
5617 | struct task_struct *parent = current; | 5627 | struct task_struct *parent = current; |
5618 | int inherited_all = 1; | 5628 | int inherited_all = 1; |
5629 | unsigned long flags; | ||
5619 | int ret = 0; | 5630 | int ret = 0; |
5620 | 5631 | ||
5621 | child->perf_event_ctxp = NULL; | 5632 | child->perf_event_ctxp = NULL; |
@@ -5656,6 +5667,15 @@ int perf_event_init_task(struct task_struct *child) | |||
5656 | break; | 5667 | break; |
5657 | } | 5668 | } |
5658 | 5669 | ||
5670 | /* | ||
5671 | * We can't hold ctx->lock when iterating the ->flexible_group list due | ||
5672 | * to allocations, but we need to prevent rotation because | ||
5673 | * rotate_ctx() will change the list from interrupt context. | ||
5674 | */ | ||
5675 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | ||
5676 | parent_ctx->rotate_disable = 1; | ||
5677 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
5678 | |||
5659 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 5679 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
5660 | ret = inherit_task_group(event, parent, parent_ctx, child, | 5680 | ret = inherit_task_group(event, parent, parent_ctx, child, |
5661 | &inherited_all); | 5681 | &inherited_all); |
@@ -5663,6 +5683,10 @@ int perf_event_init_task(struct task_struct *child) | |||
5663 | break; | 5683 | break; |
5664 | } | 5684 | } |
5665 | 5685 | ||
5686 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | ||
5687 | parent_ctx->rotate_disable = 0; | ||
5688 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
5689 | |||
5666 | child_ctx = child->perf_event_ctxp; | 5690 | child_ctx = child->perf_event_ctxp; |
5667 | 5691 | ||
5668 | if (child_ctx && inherited_all) { | 5692 | if (child_ctx && inherited_all) { |
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 645e541a45f6..0da2837416eb 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -120,10 +120,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) | |||
120 | 120 | ||
121 | switch (o->type) { | 121 | switch (o->type) { |
122 | case PM_QOS_MIN: | 122 | case PM_QOS_MIN: |
123 | return plist_last(&o->requests)->prio; | 123 | return plist_first(&o->requests)->prio; |
124 | 124 | ||
125 | case PM_QOS_MAX: | 125 | case PM_QOS_MAX: |
126 | return plist_first(&o->requests)->prio; | 126 | return plist_last(&o->requests)->prio; |
127 | 127 | ||
128 | default: | 128 | default: |
129 | /* runtime check for not using enum */ | 129 | /* runtime check for not using enum */ |
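The pm_qos_params.c change swaps plist_first()/plist_last() back to the correct ends of the list: a plist keeps its nodes sorted by ascending prio, so the head carries the smallest value and the tail the largest, which means PM_QOS_MIN must report the first entry and PM_QOS_MAX the last. A trivial illustration of that invariant, with a plain sorted array standing in for the plist:

#include <stdio.h>

int main(void)
{
	int prio_sorted[] = { 10, 20, 30, 40 };	/* ascending, like a plist */
	int n = sizeof(prio_sorted) / sizeof(prio_sorted[0]);

	printf("PM_QOS_MIN -> first entry: %d\n", prio_sorted[0]);	/* 10 */
	printf("PM_QOS_MAX -> last entry:  %d\n", prio_sorted[n - 1]);	/* 40 */
	return 0;
}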
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8dc31e02ae12..7a931a90e4a2 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -326,7 +326,6 @@ static int create_image(int platform_mode) | |||
326 | int hibernation_snapshot(int platform_mode) | 326 | int hibernation_snapshot(int platform_mode) |
327 | { | 327 | { |
328 | int error; | 328 | int error; |
329 | gfp_t saved_mask; | ||
330 | 329 | ||
331 | error = platform_begin(platform_mode); | 330 | error = platform_begin(platform_mode); |
332 | if (error) | 331 | if (error) |
@@ -338,7 +337,7 @@ int hibernation_snapshot(int platform_mode) | |||
338 | goto Close; | 337 | goto Close; |
339 | 338 | ||
340 | suspend_console(); | 339 | suspend_console(); |
341 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 340 | pm_restrict_gfp_mask(); |
342 | error = dpm_suspend_start(PMSG_FREEZE); | 341 | error = dpm_suspend_start(PMSG_FREEZE); |
343 | if (error) | 342 | if (error) |
344 | goto Recover_platform; | 343 | goto Recover_platform; |
@@ -347,7 +346,10 @@ int hibernation_snapshot(int platform_mode) | |||
347 | goto Recover_platform; | 346 | goto Recover_platform; |
348 | 347 | ||
349 | error = create_image(platform_mode); | 348 | error = create_image(platform_mode); |
350 | /* Control returns here after successful restore */ | 349 | /* |
350 | * Control returns here (1) after the image has been created or the | ||
351 | * image creation has failed and (2) after a successful restore. | ||
352 | */ | ||
351 | 353 | ||
352 | Resume_devices: | 354 | Resume_devices: |
353 | /* We may need to release the preallocated image pages here. */ | 355 | /* We may need to release the preallocated image pages here. */ |
@@ -356,7 +358,10 @@ int hibernation_snapshot(int platform_mode) | |||
356 | 358 | ||
357 | dpm_resume_end(in_suspend ? | 359 | dpm_resume_end(in_suspend ? |
358 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 360 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); |
359 | set_gfp_allowed_mask(saved_mask); | 361 | |
362 | if (error || !in_suspend) | ||
363 | pm_restore_gfp_mask(); | ||
364 | |||
360 | resume_console(); | 365 | resume_console(); |
361 | Close: | 366 | Close: |
362 | platform_end(platform_mode); | 367 | platform_end(platform_mode); |
@@ -451,17 +456,16 @@ static int resume_target_kernel(bool platform_mode) | |||
451 | int hibernation_restore(int platform_mode) | 456 | int hibernation_restore(int platform_mode) |
452 | { | 457 | { |
453 | int error; | 458 | int error; |
454 | gfp_t saved_mask; | ||
455 | 459 | ||
456 | pm_prepare_console(); | 460 | pm_prepare_console(); |
457 | suspend_console(); | 461 | suspend_console(); |
458 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 462 | pm_restrict_gfp_mask(); |
459 | error = dpm_suspend_start(PMSG_QUIESCE); | 463 | error = dpm_suspend_start(PMSG_QUIESCE); |
460 | if (!error) { | 464 | if (!error) { |
461 | error = resume_target_kernel(platform_mode); | 465 | error = resume_target_kernel(platform_mode); |
462 | dpm_resume_end(PMSG_RECOVER); | 466 | dpm_resume_end(PMSG_RECOVER); |
463 | } | 467 | } |
464 | set_gfp_allowed_mask(saved_mask); | 468 | pm_restore_gfp_mask(); |
465 | resume_console(); | 469 | resume_console(); |
466 | pm_restore_console(); | 470 | pm_restore_console(); |
467 | return error; | 471 | return error; |
@@ -475,7 +479,6 @@ int hibernation_restore(int platform_mode) | |||
475 | int hibernation_platform_enter(void) | 479 | int hibernation_platform_enter(void) |
476 | { | 480 | { |
477 | int error; | 481 | int error; |
478 | gfp_t saved_mask; | ||
479 | 482 | ||
480 | if (!hibernation_ops) | 483 | if (!hibernation_ops) |
481 | return -ENOSYS; | 484 | return -ENOSYS; |
@@ -491,7 +494,6 @@ int hibernation_platform_enter(void) | |||
491 | 494 | ||
492 | entering_platform_hibernation = true; | 495 | entering_platform_hibernation = true; |
493 | suspend_console(); | 496 | suspend_console(); |
494 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
495 | error = dpm_suspend_start(PMSG_HIBERNATE); | 497 | error = dpm_suspend_start(PMSG_HIBERNATE); |
496 | if (error) { | 498 | if (error) { |
497 | if (hibernation_ops->recover) | 499 | if (hibernation_ops->recover) |
@@ -535,7 +537,6 @@ int hibernation_platform_enter(void) | |||
535 | Resume_devices: | 537 | Resume_devices: |
536 | entering_platform_hibernation = false; | 538 | entering_platform_hibernation = false; |
537 | dpm_resume_end(PMSG_RESTORE); | 539 | dpm_resume_end(PMSG_RESTORE); |
538 | set_gfp_allowed_mask(saved_mask); | ||
539 | resume_console(); | 540 | resume_console(); |
540 | 541 | ||
541 | Close: | 542 | Close: |
@@ -643,6 +644,7 @@ int hibernate(void) | |||
643 | swsusp_free(); | 644 | swsusp_free(); |
644 | if (!error) | 645 | if (!error) |
645 | power_down(); | 646 | power_down(); |
647 | pm_restore_gfp_mask(); | ||
646 | } else { | 648 | } else { |
647 | pr_debug("PM: Image restored successfully.\n"); | 649 | pr_debug("PM: Image restored successfully.\n"); |
648 | } | 650 | } |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..ecf770509d0d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -197,7 +197,6 @@ static int suspend_enter(suspend_state_t state) | |||
197 | int suspend_devices_and_enter(suspend_state_t state) | 197 | int suspend_devices_and_enter(suspend_state_t state) |
198 | { | 198 | { |
199 | int error; | 199 | int error; |
200 | gfp_t saved_mask; | ||
201 | 200 | ||
202 | if (!suspend_ops) | 201 | if (!suspend_ops) |
203 | return -ENOSYS; | 202 | return -ENOSYS; |
@@ -208,7 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
208 | goto Close; | 207 | goto Close; |
209 | } | 208 | } |
210 | suspend_console(); | 209 | suspend_console(); |
211 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 210 | pm_restrict_gfp_mask(); |
212 | suspend_test_start(); | 211 | suspend_test_start(); |
213 | error = dpm_suspend_start(PMSG_SUSPEND); | 212 | error = dpm_suspend_start(PMSG_SUSPEND); |
214 | if (error) { | 213 | if (error) { |
@@ -225,7 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
225 | suspend_test_start(); | 224 | suspend_test_start(); |
226 | dpm_resume_end(PMSG_RESUME); | 225 | dpm_resume_end(PMSG_RESUME); |
227 | suspend_test_finish("resume devices"); | 226 | suspend_test_finish("resume devices"); |
228 | set_gfp_allowed_mask(saved_mask); | 227 | pm_restore_gfp_mask(); |
229 | resume_console(); | 228 | resume_console(); |
230 | Close: | 229 | Close: |
231 | if (suspend_ops->end) | 230 | if (suspend_ops->end) |
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..c36c3b9e8a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
137 | free_all_swap_pages(data->swap); | 137 | free_all_swap_pages(data->swap); |
138 | if (data->frozen) | 138 | if (data->frozen) |
139 | thaw_processes(); | 139 | thaw_processes(); |
140 | pm_notifier_call_chain(data->mode == O_WRONLY ? | 140 | pm_notifier_call_chain(data->mode == O_RDONLY ? |
141 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 141 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
142 | atomic_inc(&snapshot_device_available); | 142 | atomic_inc(&snapshot_device_available); |
143 | 143 | ||
@@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
263 | case SNAPSHOT_UNFREEZE: | 263 | case SNAPSHOT_UNFREEZE: |
264 | if (!data->frozen || data->ready) | 264 | if (!data->frozen || data->ready) |
265 | break; | 265 | break; |
266 | pm_restore_gfp_mask(); | ||
266 | thaw_processes(); | 267 | thaw_processes(); |
267 | usermodehelper_enable(); | 268 | usermodehelper_enable(); |
268 | data->frozen = 0; | 269 | data->frozen = 0; |
@@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
275 | error = -EPERM; | 276 | error = -EPERM; |
276 | break; | 277 | break; |
277 | } | 278 | } |
279 | pm_restore_gfp_mask(); | ||
278 | error = hibernation_snapshot(data->platform_support); | 280 | error = hibernation_snapshot(data->platform_support); |
279 | if (!error) | 281 | if (!error) |
280 | error = put_user(in_suspend, (int __user *)arg); | 282 | error = put_user(in_suspend, (int __user *)arg); |
diff --git a/kernel/printk.c b/kernel/printk.c
index 9dc8ea140426..2dc36b49d2d2 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1072,13 +1072,15 @@ void printk_tick(void) | |||
1072 | 1072 | ||
1073 | int printk_needs_cpu(int cpu) | 1073 | int printk_needs_cpu(int cpu) |
1074 | { | 1074 | { |
1075 | if (unlikely(cpu_is_offline(cpu))) | ||
1076 | printk_tick(); | ||
1075 | return per_cpu(printk_pending, cpu); | 1077 | return per_cpu(printk_pending, cpu); |
1076 | } | 1078 | } |
1077 | 1079 | ||
1078 | void wake_up_klogd(void) | 1080 | void wake_up_klogd(void) |
1079 | { | 1081 | { |
1080 | if (!trace_override && waitqueue_active(&log_wait)) | 1082 | if (!trace_override && waitqueue_active(&log_wait)) |
1081 | __raw_get_cpu_var(printk_pending) = 1; | 1083 | this_cpu_write(printk_pending, 1); |
1082 | } | 1084 | } |
1083 | 1085 | ||
1084 | /** | 1086 | /** |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798ef4a2..bf768d739e7d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
313 | child->exit_code = data; | 313 | child->exit_code = data; |
314 | dead = __ptrace_detach(current, child); | 314 | dead = __ptrace_detach(current, child); |
315 | if (!child->exit_state) | 315 | if (!child->exit_state) |
316 | wake_up_process(child); | 316 | wake_up_state(child, TASK_TRACED | TASK_STOPPED); |
317 | } | 317 | } |
318 | write_unlock_irq(&tasklist_lock); | 318 | write_unlock_irq(&tasklist_lock); |
319 | 319 | ||
diff --git a/kernel/sched.c b/kernel/sched.c
index 093df593e45d..b3f2b4187859 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -583,7 +583,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
583 | * cases. LITMUS^RT amplifies the effects of this problem. Hence, we | 583 | * cases. LITMUS^RT amplifies the effects of this problem. Hence, we |
584 | * turn it off to avoid stalling clocks. */ | 584 | * turn it off to avoid stalling clocks. */ |
585 | /* | 585 | /* |
586 | if (test_tsk_need_resched(p)) | 586 | if (rq->curr->se.on_rq && test_tsk_need_resched(p)) |
587 | rq->skip_clock_update = 1; | 587 | rq->skip_clock_update = 1; |
588 | */ | 588 | */ |
589 | } | 589 | } |
@@ -741,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
741 | size_t cnt, loff_t *ppos) | 741 | size_t cnt, loff_t *ppos) |
742 | { | 742 | { |
743 | char buf[64]; | 743 | char buf[64]; |
744 | char *cmp = buf; | 744 | char *cmp; |
745 | int neg = 0; | 745 | int neg = 0; |
746 | int i; | 746 | int i; |
747 | 747 | ||
@@ -752,6 +752,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
752 | return -EFAULT; | 752 | return -EFAULT; |
753 | 753 | ||
754 | buf[cnt] = 0; | 754 | buf[cnt] = 0; |
755 | cmp = strstrip(buf); | ||
755 | 756 | ||
756 | if (strncmp(buf, "NO_", 3) == 0) { | 757 | if (strncmp(buf, "NO_", 3) == 0) { |
757 | neg = 1; | 758 | neg = 1; |
@@ -759,9 +760,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
759 | } | 760 | } |
760 | 761 | ||
761 | for (i = 0; sched_feat_names[i]; i++) { | 762 | for (i = 0; sched_feat_names[i]; i++) { |
762 | int len = strlen(sched_feat_names[i]); | 763 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
763 | |||
764 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
765 | if (neg) | 764 | if (neg) |
766 | sysctl_sched_features &= ~(1UL << i); | 765 | sysctl_sched_features &= ~(1UL << i); |
767 | else | 766 | else |
@@ -1877,12 +1876,6 @@ static void dec_nr_running(struct rq *rq) | |||
1877 | 1876 | ||
1878 | static void set_load_weight(struct task_struct *p) | 1877 | static void set_load_weight(struct task_struct *p) |
1879 | { | 1878 | { |
1880 | if (task_has_rt_policy(p)) { | ||
1881 | p->se.load.weight = 0; | ||
1882 | p->se.load.inv_weight = WMULT_CONST; | ||
1883 | return; | ||
1884 | } | ||
1885 | |||
1886 | /* | 1879 | /* |
1887 | * SCHED_IDLE tasks get minimal weight: | 1880 | * SCHED_IDLE tasks get minimal weight: |
1888 | */ | 1881 | */ |
@@ -3011,6 +3004,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
3011 | return delta; | 3004 | return delta; |
3012 | } | 3005 | } |
3013 | 3006 | ||
3007 | static unsigned long | ||
3008 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3009 | { | ||
3010 | load *= exp; | ||
3011 | load += active * (FIXED_1 - exp); | ||
3012 | load += 1UL << (FSHIFT - 1); | ||
3013 | return load >> FSHIFT; | ||
3014 | } | ||
3015 | |||
3014 | #ifdef CONFIG_NO_HZ | 3016 | #ifdef CONFIG_NO_HZ |
3015 | /* | 3017 | /* |
3016 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3018 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
@@ -3040,6 +3042,128 @@ static long calc_load_fold_idle(void) | |||
3040 | 3042 | ||
3041 | return delta; | 3043 | return delta; |
3042 | } | 3044 | } |
3045 | |||
3046 | /** | ||
3047 | * fixed_power_int - compute: x^n, in O(log n) time | ||
3048 | * | ||
3049 | * @x: base of the power | ||
3050 | * @frac_bits: fractional bits of @x | ||
3051 | * @n: power to raise @x to. | ||
3052 | * | ||
3053 | * By exploiting the relation between the definition of the natural power | ||
3054 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
3055 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
3056 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
3057 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
3058 | * of course trivially computable in O(log_2 n), the length of our binary | ||
3059 | * vector. | ||
3060 | */ | ||
3061 | static unsigned long | ||
3062 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
3063 | { | ||
3064 | unsigned long result = 1UL << frac_bits; | ||
3065 | |||
3066 | if (n) for (;;) { | ||
3067 | if (n & 1) { | ||
3068 | result *= x; | ||
3069 | result += 1UL << (frac_bits - 1); | ||
3070 | result >>= frac_bits; | ||
3071 | } | ||
3072 | n >>= 1; | ||
3073 | if (!n) | ||
3074 | break; | ||
3075 | x *= x; | ||
3076 | x += 1UL << (frac_bits - 1); | ||
3077 | x >>= frac_bits; | ||
3078 | } | ||
3079 | |||
3080 | return result; | ||
3081 | } | ||
3082 | |||
3083 | /* | ||
3084 | * a1 = a0 * e + a * (1 - e) | ||
3085 | * | ||
3086 | * a2 = a1 * e + a * (1 - e) | ||
3087 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
3088 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
3089 | * | ||
3090 | * a3 = a2 * e + a * (1 - e) | ||
3091 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
3092 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
3093 | * | ||
3094 | * ... | ||
3095 | * | ||
3096 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
3097 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
3098 | * = a0 * e^n + a * (1 - e^n) | ||
3099 | * | ||
3100 | * [1] application of the geometric series: | ||
3101 | * | ||
3102 | * n 1 - x^(n+1) | ||
3103 | * S_n := \Sum x^i = ------------- | ||
3104 | * i=0 1 - x | ||
3105 | */ | ||
3106 | static unsigned long | ||
3107 | calc_load_n(unsigned long load, unsigned long exp, | ||
3108 | unsigned long active, unsigned int n) | ||
3109 | { | ||
3110 | |||
3111 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
3112 | } | ||
3113 | |||
3114 | /* | ||
3115 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
3116 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
3117 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
3118 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
3119 | * | ||
3120 | * Once we've updated the global active value, we need to apply the exponential | ||
3121 | * weights adjusted to the number of cycles missed. | ||
3122 | */ | ||
3123 | static void calc_global_nohz(unsigned long ticks) | ||
3124 | { | ||
3125 | long delta, active, n; | ||
3126 | |||
3127 | if (time_before(jiffies, calc_load_update)) | ||
3128 | return; | ||
3129 | |||
3130 | /* | ||
3131 | * If we crossed a calc_load_update boundary, make sure to fold | ||
3132 | * any pending idle changes, the respective CPUs might have | ||
3133 | * missed the tick driven calc_load_account_active() update | ||
3134 | * due to NO_HZ. | ||
3135 | */ | ||
3136 | delta = calc_load_fold_idle(); | ||
3137 | if (delta) | ||
3138 | atomic_long_add(delta, &calc_load_tasks); | ||
3139 | |||
3140 | /* | ||
3141 | * If we were idle for multiple load cycles, apply them. | ||
3142 | */ | ||
3143 | if (ticks >= LOAD_FREQ) { | ||
3144 | n = ticks / LOAD_FREQ; | ||
3145 | |||
3146 | active = atomic_long_read(&calc_load_tasks); | ||
3147 | active = active > 0 ? active * FIXED_1 : 0; | ||
3148 | |||
3149 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
3150 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
3151 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
3152 | |||
3153 | calc_load_update += n * LOAD_FREQ; | ||
3154 | } | ||
3155 | |||
3156 | /* | ||
3157 | * Its possible the remainder of the above division also crosses | ||
3158 | * a LOAD_FREQ period, the regular check in calc_global_load() | ||
3159 | * which comes after this will take care of that. | ||
3160 | * | ||
3161 | * Consider us being 11 ticks before a cycle completion, and us | ||
3162 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
3163 | * age us 4 cycles, and the test in calc_global_load() will | ||
3164 | * pick up the final one. | ||
3165 | */ | ||
3166 | } | ||
3043 | #else | 3167 | #else |
3044 | static void calc_load_account_idle(struct rq *this_rq) | 3168 | static void calc_load_account_idle(struct rq *this_rq) |
3045 | { | 3169 | { |
@@ -3049,6 +3173,10 @@ static inline long calc_load_fold_idle(void) | |||
3049 | { | 3173 | { |
3050 | return 0; | 3174 | return 0; |
3051 | } | 3175 | } |
3176 | |||
3177 | static void calc_global_nohz(unsigned long ticks) | ||
3178 | { | ||
3179 | } | ||
3052 | #endif | 3180 | #endif |
3053 | 3181 | ||
3054 | /** | 3182 | /** |
@@ -3066,24 +3194,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
3066 | loads[2] = (avenrun[2] + offset) << shift; | 3194 | loads[2] = (avenrun[2] + offset) << shift; |
3067 | } | 3195 | } |
3068 | 3196 | ||
3069 | static unsigned long | ||
3070 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3071 | { | ||
3072 | load *= exp; | ||
3073 | load += active * (FIXED_1 - exp); | ||
3074 | return load >> FSHIFT; | ||
3075 | } | ||
3076 | |||
3077 | /* | 3197 | /* |
3078 | * calc_load - update the avenrun load estimates 10 ticks after the | 3198 | * calc_load - update the avenrun load estimates 10 ticks after the |
3079 | * CPUs have updated calc_load_tasks. | 3199 | * CPUs have updated calc_load_tasks. |
3080 | */ | 3200 | */ |
3081 | void calc_global_load(void) | 3201 | void calc_global_load(unsigned long ticks) |
3082 | { | 3202 | { |
3083 | unsigned long upd = calc_load_update + 10; | ||
3084 | long active; | 3203 | long active; |
3085 | 3204 | ||
3086 | if (time_before(jiffies, upd)) | 3205 | calc_global_nohz(ticks); |
3206 | |||
3207 | if (time_before(jiffies, calc_load_update + 10)) | ||
3087 | return; | 3208 | return; |
3088 | 3209 | ||
3089 | active = atomic_long_read(&calc_load_tasks); | 3210 | active = atomic_long_read(&calc_load_tasks); |
@@ -3745,7 +3866,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
3745 | { | 3866 | { |
3746 | if (prev->se.on_rq) | 3867 | if (prev->se.on_rq) |
3747 | update_rq_clock(rq); | 3868 | update_rq_clock(rq); |
3748 | rq->skip_clock_update = 0; | ||
3749 | prev->sched_class->put_prev_task(rq, prev); | 3869 | prev->sched_class->put_prev_task(rq, prev); |
3750 | } | 3870 | } |
3751 | 3871 | ||
@@ -3821,7 +3941,6 @@ need_resched_nonpreemptible: | |||
3821 | hrtick_clear(rq); | 3941 | hrtick_clear(rq); |
3822 | 3942 | ||
3823 | raw_spin_lock_irq(&rq->lock); | 3943 | raw_spin_lock_irq(&rq->lock); |
3824 | clear_tsk_need_resched(prev); | ||
3825 | 3944 | ||
3826 | switch_count = &prev->nivcsw; | 3945 | switch_count = &prev->nivcsw; |
3827 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3946 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
@@ -3853,6 +3972,8 @@ need_resched_nonpreemptible: | |||
3853 | 3972 | ||
3854 | put_prev_task(rq, prev); | 3973 | put_prev_task(rq, prev); |
3855 | next = pick_next_task(rq); | 3974 | next = pick_next_task(rq); |
3975 | clear_tsk_need_resched(prev); | ||
3976 | rq->skip_clock_update = 0; | ||
3856 | 3977 | ||
3857 | if (likely(prev != next)) { | 3978 | if (likely(prev != next)) { |
3858 | sched_info_switch(prev, next); | 3979 | sched_info_switch(prev, next); |
@@ -5439,7 +5560,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5439 | idle->se.exec_start = sched_clock(); | 5560 | idle->se.exec_start = sched_clock(); |
5440 | 5561 | ||
5441 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 5562 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); |
5563 | /* | ||
5564 | * We're having a chicken and egg problem, even though we are | ||
5565 | * holding rq->lock, the cpu isn't yet set to this cpu so the | ||
5566 | * lockdep check in task_group() will fail. | ||
5567 | * | ||
5568 | * Similar case to sched_fork(). / Alternatively we could | ||
5569 | * use task_rq_lock() here and obtain the other rq->lock. | ||
5570 | * | ||
5571 | * Silence PROVE_RCU | ||
5572 | */ | ||
5573 | rcu_read_lock(); | ||
5442 | __set_task_cpu(idle, cpu); | 5574 | __set_task_cpu(idle, cpu); |
5575 | rcu_read_unlock(); | ||
5443 | 5576 | ||
5444 | rq->curr = rq->idle = idle; | 5577 | rq->curr = rq->idle = idle; |
5445 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5578 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
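The new calc_global_nohz()/calc_load_n() path collapses n missed LOAD_FREQ periods into a single exponentially weighted update using O(log n) fixed-point exponentiation, and do_timer() now passes the tick count through calc_global_load(ticks) (see the kernel/timer.c hunk below). The standalone userspace sketch that follows exercises that arithmetic: the three helpers mirror the bodies added above, while the constants (FSHIFT = 11, FIXED_1, EXP_1 = 1884 for the 1-minute average) and the main() driver are quoted from memory or invented for illustration.

#include <stdio.h>

#define FSHIFT   11			/* assumed: bits of fractional precision */
#define FIXED_1  (1UL << FSHIFT)	/* 1.0 in fixed point */
#define EXP_1    1884			/* assumed: 1/exp(5s/1min) in fixed point */

/* x^n in O(log n) by binary exponentiation, rounding each fixed-point multiply */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	while (n) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

/* one update: load = load * exp + active * (1 - exp), with rounding */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

/* n updates folded into one: a_n = a_0 * e^n + a * (1 - e^n) */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp, unsigned long active,
	    unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
	unsigned long avg = 2 * FIXED_1;	/* a 1-minute load average of 2.00 */
	unsigned long active = 0;		/* nothing runnable while ticks were missed */
	unsigned int n;

	for (n = 1; n <= 4; n++) {
		unsigned long v = calc_load_n(avg, EXP_1, active, n);

		printf("%u missed periods: %lu.%02lu\n",
		       n, v >> FSHIFT, (v & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}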
diff --git a/kernel/smp.c b/kernel/smp.c
index ed6aacfcb7ef..1ba1ba4b42f8 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,6 +194,24 @@ void generic_smp_call_function_interrupt(void) | |||
194 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { | 194 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { |
195 | int refs; | 195 | int refs; |
196 | 196 | ||
197 | /* | ||
198 | * Since we walk the list without any locks, we might | ||
199 | * see an entry that was completed, removed from the | ||
200 | * list and is in the process of being reused. | ||
201 | * | ||
202 | * We must check that the cpu is in the cpumask before | ||
203 | * checking the refs, and both must be set before | ||
204 | * executing the callback on this cpu. | ||
205 | */ | ||
206 | |||
207 | if (!cpumask_test_cpu(cpu, data->cpumask)) | ||
208 | continue; | ||
209 | |||
210 | smp_rmb(); | ||
211 | |||
212 | if (atomic_read(&data->refs) == 0) | ||
213 | continue; | ||
214 | |||
197 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) | 215 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) |
198 | continue; | 216 | continue; |
199 | 217 | ||
@@ -202,6 +220,8 @@ void generic_smp_call_function_interrupt(void) | |||
202 | refs = atomic_dec_return(&data->refs); | 220 | refs = atomic_dec_return(&data->refs); |
203 | WARN_ON(refs < 0); | 221 | WARN_ON(refs < 0); |
204 | if (!refs) { | 222 | if (!refs) { |
223 | WARN_ON(!cpumask_empty(data->cpumask)); | ||
224 | |||
205 | raw_spin_lock(&call_function.lock); | 225 | raw_spin_lock(&call_function.lock); |
206 | list_del_rcu(&data->csd.list); | 226 | list_del_rcu(&data->csd.list); |
207 | raw_spin_unlock(&call_function.lock); | 227 | raw_spin_unlock(&call_function.lock); |
@@ -453,11 +473,21 @@ void smp_call_function_many(const struct cpumask *mask, | |||
453 | 473 | ||
454 | data = &__get_cpu_var(cfd_data); | 474 | data = &__get_cpu_var(cfd_data); |
455 | csd_lock(&data->csd); | 475 | csd_lock(&data->csd); |
476 | BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); | ||
456 | 477 | ||
457 | data->csd.func = func; | 478 | data->csd.func = func; |
458 | data->csd.info = info; | 479 | data->csd.info = info; |
459 | cpumask_and(data->cpumask, mask, cpu_online_mask); | 480 | cpumask_and(data->cpumask, mask, cpu_online_mask); |
460 | cpumask_clear_cpu(this_cpu, data->cpumask); | 481 | cpumask_clear_cpu(this_cpu, data->cpumask); |
482 | |||
483 | /* | ||
484 | * To ensure the interrupt handler gets a complete view | ||
485 | * we order the cpumask and refs writes and order the read | ||
486 | * of them in the interrupt handler. In addition we may | ||
487 | * only clear our own cpu bit from the mask. | ||
488 | */ | ||
489 | smp_wmb(); | ||
490 | |||
461 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); | 491 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); |
462 | 492 | ||
463 | raw_spin_lock_irqsave(&call_function.lock, flags); | 493 | raw_spin_lock_irqsave(&call_function.lock, flags); |
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..66136ca33a7b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1377,7 +1377,8 @@ static int check_prlimit_permission(struct task_struct *task) | |||
1377 | const struct cred *cred = current_cred(), *tcred; | 1377 | const struct cred *cred = current_cred(), *tcred; |
1378 | 1378 | ||
1379 | tcred = __task_cred(task); | 1379 | tcred = __task_cred(task); |
1380 | if ((cred->uid != tcred->euid || | 1380 | if (current != task && |
1381 | (cred->uid != tcred->euid || | ||
1381 | cred->uid != tcred->suid || | 1382 | cred->uid != tcred->suid || |
1382 | cred->uid != tcred->uid || | 1383 | cred->uid != tcred->uid || |
1383 | cred->gid != tcred->egid || | 1384 | cred->gid != tcred->egid || |
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade7..102ad370dddb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1252,6 +1252,12 @@ unsigned long get_next_timer_interrupt(unsigned long now) | |||
1252 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1252 | struct tvec_base *base = __get_cpu_var(tvec_bases); |
1253 | unsigned long expires; | 1253 | unsigned long expires; |
1254 | 1254 | ||
1255 | /* | ||
1256 | * Pretend that there is no timer pending if the cpu is offline. | ||
1257 | * Possible pending timers will be migrated later to an active cpu. | ||
1258 | */ | ||
1259 | if (cpu_is_offline(smp_processor_id())) | ||
1260 | return now + NEXT_TIMER_MAX_DELTA; | ||
1255 | spin_lock(&base->lock); | 1261 | spin_lock(&base->lock); |
1256 | if (time_before_eq(base->next_timer, base->timer_jiffies)) | 1262 | if (time_before_eq(base->next_timer, base->timer_jiffies)) |
1257 | base->next_timer = __next_timer_interrupt(base); | 1263 | base->next_timer = __next_timer_interrupt(base); |
@@ -1316,7 +1322,7 @@ void do_timer(unsigned long ticks) | |||
1316 | { | 1322 | { |
1317 | jiffies_64 += ticks; | 1323 | jiffies_64 += ticks; |
1318 | update_wall_time(); | 1324 | update_wall_time(); |
1319 | calc_global_load(); | 1325 | calc_global_load(ticks); |
1320 | } | 1326 | } |
1321 | 1327 | ||
1322 | #ifdef __ARCH_WANT_SYS_ALARM | 1328 | #ifdef __ARCH_WANT_SYS_ALARM |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ec59f541156..7702f5aecd07 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2320,11 +2320,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, | |||
2320 | return count; | 2320 | return count; |
2321 | } | 2321 | } |
2322 | 2322 | ||
2323 | static loff_t tracing_seek(struct file *file, loff_t offset, int origin) | ||
2324 | { | ||
2325 | if (file->f_mode & FMODE_READ) | ||
2326 | return seq_lseek(file, offset, origin); | ||
2327 | else | ||
2328 | return 0; | ||
2329 | } | ||
2330 | |||
2323 | static const struct file_operations tracing_fops = { | 2331 | static const struct file_operations tracing_fops = { |
2324 | .open = tracing_open, | 2332 | .open = tracing_open, |
2325 | .read = seq_read, | 2333 | .read = seq_read, |
2326 | .write = tracing_write_stub, | 2334 | .write = tracing_write_stub, |
2327 | .llseek = seq_lseek, | 2335 | .llseek = tracing_seek, |
2328 | .release = tracing_release, | 2336 | .release = tracing_release, |
2329 | }; | 2337 | }; |
2330 | 2338 | ||
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..8ce395f74d47 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -157,6 +157,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
157 | spin_lock_irq(&uidhash_lock); | 157 | spin_lock_irq(&uidhash_lock); |
158 | up = uid_hash_find(uid, hashent); | 158 | up = uid_hash_find(uid, hashent); |
159 | if (up) { | 159 | if (up) { |
160 | put_user_ns(ns); | ||
160 | key_put(new->uid_keyring); | 161 | key_put(new->uid_keyring); |
161 | key_put(new->session_keyring); | 162 | key_put(new->session_keyring); |
162 | kmem_cache_free(uid_cachep, new); | 163 | kmem_cache_free(uid_cachep, new); |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9c3c52ecc1..e359b2e7e7d5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -377,7 +377,8 @@ static int watchdog_nmi_enable(int cpu) | |||
377 | goto out_save; | 377 | goto out_save; |
378 | } | 378 | } |
379 | 379 | ||
380 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); | 380 | printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", |
381 | cpu, PTR_ERR(event)); | ||
381 | return -1; | 382 | return -1; |
382 | 383 | ||
383 | /* success path */ | 384 | /* success path */ |
@@ -440,9 +441,6 @@ static int watchdog_enable(int cpu) | |||
440 | wake_up_process(p); | 441 | wake_up_process(p); |
441 | } | 442 | } |
442 | 443 | ||
443 | /* if any cpu succeeds, watchdog is considered enabled for the system */ | ||
444 | watchdog_enabled = 1; | ||
445 | |||
446 | return 0; | 444 | return 0; |
447 | } | 445 | } |
448 | 446 | ||
@@ -470,12 +468,16 @@ static void watchdog_disable(int cpu) | |||
470 | static void watchdog_enable_all_cpus(void) | 468 | static void watchdog_enable_all_cpus(void) |
471 | { | 469 | { |
472 | int cpu; | 470 | int cpu; |
473 | int result = 0; | 471 | |
472 | watchdog_enabled = 0; | ||
474 | 473 | ||
475 | for_each_online_cpu(cpu) | 474 | for_each_online_cpu(cpu) |
476 | result += watchdog_enable(cpu); | 475 | if (!watchdog_enable(cpu)) |
476 | /* if any cpu succeeds, watchdog is considered | ||
477 | enabled for the system */ | ||
478 | watchdog_enabled = 1; | ||
477 | 479 | ||
478 | if (result) | 480 | if (!watchdog_enabled) |
479 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); | 481 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); |
480 | 482 | ||
481 | } | 483 | } |
@@ -503,10 +505,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write, | |||
503 | { | 505 | { |
504 | proc_dointvec(table, write, buffer, length, ppos); | 506 | proc_dointvec(table, write, buffer, length, ppos); |
505 | 507 | ||
506 | if (watchdog_enabled) | 508 | if (write) { |
507 | watchdog_enable_all_cpus(); | 509 | if (watchdog_enabled) |
508 | else | 510 | watchdog_enable_all_cpus(); |
509 | watchdog_disable_all_cpus(); | 511 | else |
512 | watchdog_disable_all_cpus(); | ||
513 | } | ||
510 | return 0; | 514 | return 0; |
511 | } | 515 | } |
512 | 516 | ||