Diffstat (limited to 'kernel')
-rw-r--r--  kernel/fork.c                                             |   8
-rw-r--r--  kernel/futex.c                                            |  33
-rw-r--r--  kernel/sched/Makefile                                     |   2
-rw-r--r--  kernel/sched/auto_group.c                                 |   6
-rw-r--r--  kernel/sched/auto_group.h                                 |   2
-rw-r--r--  kernel/sched/core.c                                       |  92
-rw-r--r--  kernel/sched/cputime.c                                    |   2
-rw-r--r--  kernel/sched/deadline.c                                   |   2
-rw-r--r--  kernel/sched/fair.c                                       | 276
-rw-r--r--  kernel/sched/loadavg.c (renamed from kernel/sched/proc.c) | 236
-rw-r--r--  kernel/sched/rt.c                                         |   2
-rw-r--r--  kernel/sched/sched.h                                      |  10
-rw-r--r--  kernel/sched/stats.h                                      |  15
-rw-r--r--  kernel/sched/wait.c                                       |   4
-rw-r--r--  kernel/signal.c                                           |   6
-rw-r--r--  kernel/time/posix-cpu-timers.c                            |  87
16 files changed, 424 insertions(+), 359 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..0bb88b555550 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,10 +1091,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
1091{ 1091{
1092 unsigned long cpu_limit; 1092 unsigned long cpu_limit;
1093 1093
1094 /* Thread group counters. */ 1094 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1095 thread_group_cputime_init(sig);
1096
1097 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1098 if (cpu_limit != RLIM_INFINITY) { 1095 if (cpu_limit != RLIM_INFINITY) {
1099 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); 1096 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
1100 sig->cputimer.running = 1; 1097 sig->cputimer.running = 1;
@@ -1396,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1396 p->hardirq_context = 0; 1393 p->hardirq_context = 0;
1397 p->softirq_context = 0; 1394 p->softirq_context = 0;
1398#endif 1395#endif
1396
1397 p->pagefault_disabled = 0;
1398
1399#ifdef CONFIG_LOCKDEP 1399#ifdef CONFIG_LOCKDEP
1400 p->lockdep_depth = 0; /* no locks held yet */ 1400 p->lockdep_depth = 0; /* no locks held yet */
1401 p->curr_chain_key = 0; 1401 p->curr_chain_key = 0;
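Two things happen in the fork.c hunks above: posix_cpu_timers_init_group() drops the thread_group_cputime_init() call (now that the cputimer's spinlock is replaced by atomics in the stats.h and posix-cpu-timers.c hunks below, there is no per-group lock left for it to initialise), and copy_process() starts zeroing the new p->pagefault_disabled counter, part of moving pagefault_disable() off the preempt count in this same merge window. The ACCESS_ONCE() -> READ_ONCE() switch is the other recurring theme of the whole diff (futex.c, auto_group, core.c, cputime.c, deadline.c, fair.c, rt.c, sched.h, wait.c, posix-cpu-timers.c). Both primitives force a single, untorn load and forbid the compiler from refetching; READ_ONCE() is preferred because ACCESS_ONCE() misbehaves on non-scalar types with some compilers. Conceptually it is just a volatile cast; the sketch below is a simplification with made-up names, not the kernel's actual implementation (which routes through __read_once_size() to also handle small aggregates):

/* Simplified idea only -- MY_READ_ONCE/MY_WRITE_ONCE are illustrative names. */
#define MY_READ_ONCE(x)		(*(const volatile typeof(x) *)&(x))
#define MY_WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))

With that, cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur) above is guaranteed to be one load of a value that setrlimit() may be changing concurrently.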
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..f9984c363e9a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
1090 1090
1091/* 1091/*
1092 * The hash bucket lock must be held when this is called. 1092 * The hash bucket lock must be held when this is called.
1093 * Afterwards, the futex_q must not be accessed. 1093 * Afterwards, the futex_q must not be accessed. Callers
1094 * must ensure to later call wake_up_q() for the actual
1095 * wakeups to occur.
1094 */ 1096 */
1095static void wake_futex(struct futex_q *q) 1097static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1096{ 1098{
1097 struct task_struct *p = q->task; 1099 struct task_struct *p = q->task;
1098 1100
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
1100 return; 1102 return;
1101 1103
1102 /* 1104 /*
1103 * We set q->lock_ptr = NULL _before_ we wake up the task. If 1105 * Queue the task for later wakeup for after we've released
1104 * a non-futex wake up happens on another CPU then the task 1106 * the hb->lock. wake_q_add() grabs reference to p.
1105 * might exit and p would dereference a non-existing task
1106 * struct. Prevent this by holding a reference on p across the
1107 * wake up.
1108 */ 1107 */
1109 get_task_struct(p); 1108 wake_q_add(wake_q, p);
1110
1111 __unqueue_futex(q); 1109 __unqueue_futex(q);
1112 /* 1110 /*
1113 * The waiting task can free the futex_q as soon as 1111 * The waiting task can free the futex_q as soon as
@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q)
1117 */ 1115 */
1118 smp_wmb(); 1116 smp_wmb();
1119 q->lock_ptr = NULL; 1117 q->lock_ptr = NULL;
1120
1121 wake_up_state(p, TASK_NORMAL);
1122 put_task_struct(p);
1123} 1118}
1124 1119
1125static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 1120static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1217 struct futex_q *this, *next; 1212 struct futex_q *this, *next;
1218 union futex_key key = FUTEX_KEY_INIT; 1213 union futex_key key = FUTEX_KEY_INIT;
1219 int ret; 1214 int ret;
1215 WAKE_Q(wake_q);
1220 1216
1221 if (!bitset) 1217 if (!bitset)
1222 return -EINVAL; 1218 return -EINVAL;
@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1244 if (!(this->bitset & bitset)) 1240 if (!(this->bitset & bitset))
1245 continue; 1241 continue;
1246 1242
1247 wake_futex(this); 1243 mark_wake_futex(&wake_q, this);
1248 if (++ret >= nr_wake) 1244 if (++ret >= nr_wake)
1249 break; 1245 break;
1250 } 1246 }
1251 } 1247 }
1252 1248
1253 spin_unlock(&hb->lock); 1249 spin_unlock(&hb->lock);
1250 wake_up_q(&wake_q);
1254out_put_key: 1251out_put_key:
1255 put_futex_key(&key); 1252 put_futex_key(&key);
1256out: 1253out:
@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1269 struct futex_hash_bucket *hb1, *hb2; 1266 struct futex_hash_bucket *hb1, *hb2;
1270 struct futex_q *this, *next; 1267 struct futex_q *this, *next;
1271 int ret, op_ret; 1268 int ret, op_ret;
1269 WAKE_Q(wake_q);
1272 1270
1273retry: 1271retry:
1274 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1272 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1318,7 @@ retry_private:
1320 ret = -EINVAL; 1318 ret = -EINVAL;
1321 goto out_unlock; 1319 goto out_unlock;
1322 } 1320 }
1323 wake_futex(this); 1321 mark_wake_futex(&wake_q, this);
1324 if (++ret >= nr_wake) 1322 if (++ret >= nr_wake)
1325 break; 1323 break;
1326 } 1324 }
@@ -1334,7 +1332,7 @@ retry_private:
1334 ret = -EINVAL; 1332 ret = -EINVAL;
1335 goto out_unlock; 1333 goto out_unlock;
1336 } 1334 }
1337 wake_futex(this); 1335 mark_wake_futex(&wake_q, this);
1338 if (++op_ret >= nr_wake2) 1336 if (++op_ret >= nr_wake2)
1339 break; 1337 break;
1340 } 1338 }
@@ -1344,6 +1342,7 @@ retry_private:
1344 1342
1345out_unlock: 1343out_unlock:
1346 double_unlock_hb(hb1, hb2); 1344 double_unlock_hb(hb1, hb2);
1345 wake_up_q(&wake_q);
1347out_put_keys: 1346out_put_keys:
1348 put_futex_key(&key2); 1347 put_futex_key(&key2);
1349out_put_key1: 1348out_put_key1:
@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1503 struct futex_pi_state *pi_state = NULL; 1502 struct futex_pi_state *pi_state = NULL;
1504 struct futex_hash_bucket *hb1, *hb2; 1503 struct futex_hash_bucket *hb1, *hb2;
1505 struct futex_q *this, *next; 1504 struct futex_q *this, *next;
1505 WAKE_Q(wake_q);
1506 1506
1507 if (requeue_pi) { 1507 if (requeue_pi) {
1508 /* 1508 /*
@@ -1679,7 +1679,7 @@ retry_private:
1679 * woken by futex_unlock_pi(). 1679 * woken by futex_unlock_pi().
1680 */ 1680 */
1681 if (++task_count <= nr_wake && !requeue_pi) { 1681 if (++task_count <= nr_wake && !requeue_pi) {
1682 wake_futex(this); 1682 mark_wake_futex(&wake_q, this);
1683 continue; 1683 continue;
1684 } 1684 }
1685 1685
@@ -1719,6 +1719,7 @@ retry_private:
1719out_unlock: 1719out_unlock:
1720 free_pi_state(pi_state); 1720 free_pi_state(pi_state);
1721 double_unlock_hb(hb1, hb2); 1721 double_unlock_hb(hb1, hb2);
1722 wake_up_q(&wake_q);
1722 hb_waiters_dec(hb2); 1723 hb_waiters_dec(hb2);
1723 1724
1724 /* 1725 /*
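In futex.c, wake_futex() becomes mark_wake_futex(): instead of calling wake_up_state() with the hash-bucket lock held, each waiter is only queued on an on-stack wake_q (wake_q_add() takes the task reference that the old get_task_struct()/put_task_struct() pair provided), and the real wakeups happen in wake_up_q() after hb->lock (or both locks, for the wake_op and requeue paths) has been dropped. That shortens the lock hold time and avoids waking a task only to have it immediately contend on the still-held bucket lock. A minimal sketch of the pattern follows; wake_two() and @lock are made up for illustration, while WAKE_Q(), wake_q_add() and wake_up_q() are the real primitives implemented in the core.c hunk below:

#include <linux/sched.h>
#include <linux/spinlock.h>

/* Illustration only: wake two known tasks after dropping @lock. */
static void wake_two(spinlock_t *lock,
		     struct task_struct *t1, struct task_struct *t2)
{
	WAKE_Q(wake_q);			/* on-stack, context-local queue */

	spin_lock(lock);
	/* ... decide whom to wake while still holding the lock ... */
	wake_q_add(&wake_q, t1);	/* grabs a reference on t1 */
	wake_q_add(&wake_q, t2);	/* already-queued tasks are skipped */
	spin_unlock(lock);

	wake_up_q(&wake_q);		/* wake_up_process() + put, lock dropped */
}

Because wake_q_add() claims a task with cmpxchg() on task->wake_q.next, queueing the same task twice (even from two different wakers) is harmless: the second add is a no-op and the task still gets exactly one wakeup.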
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be87024875..67687973ce80 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o 14obj-y += core.o loadavg.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16obj-y += wait.o completion.o idle.o 16obj-y += wait.o completion.o idle.o
17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd669d..750ed601ddf7 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include "sched.h" 1#include "sched.h"
4 2
5#include <linux/proc_fs.h> 3#include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
141 139
142 p->signal->autogroup = autogroup_kref_get(ag); 140 p->signal->autogroup = autogroup_kref_get(ag);
143 141
144 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) 142 if (!READ_ONCE(sysctl_sched_autogroup_enabled))
145 goto out; 143 goto out;
146 144
147 for_each_thread(p, t) 145 for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
249 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 247 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
250} 248}
251#endif /* CONFIG_SCHED_DEBUG */ 249#endif /* CONFIG_SCHED_DEBUG */
252
253#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
29static inline struct task_group * 29static inline struct task_group *
30autogroup_task_group(struct task_struct *p, struct task_group *tg) 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{ 31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 32 int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
33 33
34 if (enabled && task_wants_autogroup(p, tg)) 34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg; 35 return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 123673291ffb..20b858f2db22 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
511static bool set_nr_if_polling(struct task_struct *p) 511static bool set_nr_if_polling(struct task_struct *p)
512{ 512{
513 struct thread_info *ti = task_thread_info(p); 513 struct thread_info *ti = task_thread_info(p);
514 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); 514 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
515 515
516 for (;;) { 516 for (;;) {
517 if (!(val & _TIF_POLLING_NRFLAG)) 517 if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p)
541#endif 541#endif
542#endif 542#endif
543 543
544void wake_q_add(struct wake_q_head *head, struct task_struct *task)
545{
546 struct wake_q_node *node = &task->wake_q;
547
548 /*
549 * Atomically grab the task, if ->wake_q is !nil already it means
550 * its already queued (either by us or someone else) and will get the
551 * wakeup due to that.
552 *
553 * This cmpxchg() implies a full barrier, which pairs with the write
554 * barrier implied by the wakeup in wake_up_list().
555 */
556 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
557 return;
558
559 get_task_struct(task);
560
561 /*
562 * The head is context local, there can be no concurrency.
563 */
564 *head->lastp = node;
565 head->lastp = &node->next;
566}
567
568void wake_up_q(struct wake_q_head *head)
569{
570 struct wake_q_node *node = head->first;
571
572 while (node != WAKE_Q_TAIL) {
573 struct task_struct *task;
574
575 task = container_of(node, struct task_struct, wake_q);
576 BUG_ON(!task);
577 /* task can safely be re-inserted now */
578 node = node->next;
579 task->wake_q.next = NULL;
580
581 /*
582 * wake_up_process() implies a wmb() to pair with the queueing
583 * in wake_q_add() so as not to miss wakeups.
584 */
585 wake_up_process(task);
586 put_task_struct(task);
587 }
588}
589
544/* 590/*
545 * resched_curr - mark rq's current task 'to be rescheduled now'. 591 * resched_curr - mark rq's current task 'to be rescheduled now'.
546 * 592 *
@@ -2397,9 +2443,9 @@ unsigned long nr_iowait_cpu(int cpu)
2397 2443
2398void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) 2444void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2399{ 2445{
2400 struct rq *this = this_rq(); 2446 struct rq *rq = this_rq();
2401 *nr_waiters = atomic_read(&this->nr_iowait); 2447 *nr_waiters = atomic_read(&rq->nr_iowait);
2402 *load = this->cpu_load[0]; 2448 *load = rq->load.weight;
2403} 2449}
2404 2450
2405#ifdef CONFIG_SMP 2451#ifdef CONFIG_SMP
@@ -2497,6 +2543,7 @@ void scheduler_tick(void)
2497 update_rq_clock(rq); 2543 update_rq_clock(rq);
2498 curr->sched_class->task_tick(rq, curr, 0); 2544 curr->sched_class->task_tick(rq, curr, 0);
2499 update_cpu_load_active(rq); 2545 update_cpu_load_active(rq);
2546 calc_global_load_tick(rq);
2500 raw_spin_unlock(&rq->lock); 2547 raw_spin_unlock(&rq->lock);
2501 2548
2502 perf_event_task_tick(); 2549 perf_event_task_tick();
@@ -2525,7 +2572,7 @@ void scheduler_tick(void)
2525u64 scheduler_tick_max_deferment(void) 2572u64 scheduler_tick_max_deferment(void)
2526{ 2573{
2527 struct rq *rq = this_rq(); 2574 struct rq *rq = this_rq();
2528 unsigned long next, now = ACCESS_ONCE(jiffies); 2575 unsigned long next, now = READ_ONCE(jiffies);
2529 2576
2530 next = rq->last_sched_tick + HZ; 2577 next = rq->last_sched_tick + HZ;
2531 2578
@@ -2726,9 +2773,7 @@ again:
2726 * - return from syscall or exception to user-space 2773 * - return from syscall or exception to user-space
2727 * - return from interrupt-handler to user-space 2774 * - return from interrupt-handler to user-space
2728 * 2775 *
2729 * WARNING: all callers must re-check need_resched() afterward and reschedule 2776 * WARNING: must be called with preemption disabled!
2730 * accordingly in case an event triggered the need for rescheduling (such as
2731 * an interrupt waking up a task) while preemption was disabled in __schedule().
2732 */ 2777 */
2733static void __sched __schedule(void) 2778static void __sched __schedule(void)
2734{ 2779{
@@ -2737,7 +2782,6 @@ static void __sched __schedule(void)
2737 struct rq *rq; 2782 struct rq *rq;
2738 int cpu; 2783 int cpu;
2739 2784
2740 preempt_disable();
2741 cpu = smp_processor_id(); 2785 cpu = smp_processor_id();
2742 rq = cpu_rq(cpu); 2786 rq = cpu_rq(cpu);
2743 rcu_note_context_switch(); 2787 rcu_note_context_switch();
@@ -2801,8 +2845,6 @@ static void __sched __schedule(void)
2801 raw_spin_unlock_irq(&rq->lock); 2845 raw_spin_unlock_irq(&rq->lock);
2802 2846
2803 post_schedule(rq); 2847 post_schedule(rq);
2804
2805 sched_preempt_enable_no_resched();
2806} 2848}
2807 2849
2808static inline void sched_submit_work(struct task_struct *tsk) 2850static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +2865,9 @@ asmlinkage __visible void __sched schedule(void)
2823 2865
2824 sched_submit_work(tsk); 2866 sched_submit_work(tsk);
2825 do { 2867 do {
2868 preempt_disable();
2826 __schedule(); 2869 __schedule();
2870 sched_preempt_enable_no_resched();
2827 } while (need_resched()); 2871 } while (need_resched());
2828} 2872}
2829EXPORT_SYMBOL(schedule); 2873EXPORT_SYMBOL(schedule);
@@ -2862,15 +2906,14 @@ void __sched schedule_preempt_disabled(void)
2862static void __sched notrace preempt_schedule_common(void) 2906static void __sched notrace preempt_schedule_common(void)
2863{ 2907{
2864 do { 2908 do {
2865 __preempt_count_add(PREEMPT_ACTIVE); 2909 preempt_active_enter();
2866 __schedule(); 2910 __schedule();
2867 __preempt_count_sub(PREEMPT_ACTIVE); 2911 preempt_active_exit();
2868 2912
2869 /* 2913 /*
2870 * Check again in case we missed a preemption opportunity 2914 * Check again in case we missed a preemption opportunity
2871 * between schedule and now. 2915 * between schedule and now.
2872 */ 2916 */
2873 barrier();
2874 } while (need_resched()); 2917 } while (need_resched());
2875} 2918}
2876 2919
@@ -2917,7 +2960,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2917 return; 2960 return;
2918 2961
2919 do { 2962 do {
2920 __preempt_count_add(PREEMPT_ACTIVE); 2963 preempt_active_enter();
2921 /* 2964 /*
2922 * Needs preempt disabled in case user_exit() is traced 2965 * Needs preempt disabled in case user_exit() is traced
2923 * and the tracer calls preempt_enable_notrace() causing 2966 * and the tracer calls preempt_enable_notrace() causing
@@ -2927,8 +2970,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2927 __schedule(); 2970 __schedule();
2928 exception_exit(prev_ctx); 2971 exception_exit(prev_ctx);
2929 2972
2930 __preempt_count_sub(PREEMPT_ACTIVE); 2973 preempt_active_exit();
2931 barrier();
2932 } while (need_resched()); 2974 } while (need_resched());
2933} 2975}
2934EXPORT_SYMBOL_GPL(preempt_schedule_context); 2976EXPORT_SYMBOL_GPL(preempt_schedule_context);
@@ -2952,17 +2994,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
2952 prev_state = exception_enter(); 2994 prev_state = exception_enter();
2953 2995
2954 do { 2996 do {
2955 __preempt_count_add(PREEMPT_ACTIVE); 2997 preempt_active_enter();
2956 local_irq_enable(); 2998 local_irq_enable();
2957 __schedule(); 2999 __schedule();
2958 local_irq_disable(); 3000 local_irq_disable();
2959 __preempt_count_sub(PREEMPT_ACTIVE); 3001 preempt_active_exit();
2960
2961 /*
2962 * Check again in case we missed a preemption opportunity
2963 * between schedule and now.
2964 */
2965 barrier();
2966 } while (need_resched()); 3002 } while (need_resched());
2967 3003
2968 exception_exit(prev_state); 3004 exception_exit(prev_state);
@@ -5314,7 +5350,7 @@ static struct notifier_block migration_notifier = {
5314 .priority = CPU_PRI_MIGRATION, 5350 .priority = CPU_PRI_MIGRATION,
5315}; 5351};
5316 5352
5317static void __cpuinit set_cpu_rq_start_time(void) 5353static void set_cpu_rq_start_time(void)
5318{ 5354{
5319 int cpu = smp_processor_id(); 5355 int cpu = smp_processor_id();
5320 struct rq *rq = cpu_rq(cpu); 5356 struct rq *rq = cpu_rq(cpu);
@@ -7734,11 +7770,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
7734 return rt_runtime_us; 7770 return rt_runtime_us;
7735} 7771}
7736 7772
7737static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7773static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
7738{ 7774{
7739 u64 rt_runtime, rt_period; 7775 u64 rt_runtime, rt_period;
7740 7776
7741 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7777 rt_period = rt_period_us * NSEC_PER_USEC;
7742 rt_runtime = tg->rt_bandwidth.rt_runtime; 7778 rt_runtime = tg->rt_bandwidth.rt_runtime;
7743 7779
7744 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7780 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
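The core.c hunk adds the wake_q implementation used by futex.c above: wake_q_add() links task->wake_q onto a singly linked, on-stack list, claiming the task via cmpxchg() so concurrent wakers cannot double-queue it, and wake_up_q() later walks the list, calling wake_up_process() and dropping the reference for each entry. The companion declarations live in include/linux/sched.h, which is outside this kernel/-only diff; approximately (reproduced here for context, not guaranteed verbatim):

struct wake_q_node {
	struct wake_q_node *next;	/* embedded in task_struct as ->wake_q */
};

struct wake_q_head {
	struct wake_q_node *first;
	struct wake_q_node **lastp;
};

#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)

#define WAKE_Q(name)	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }

extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
extern void wake_up_q(struct wake_q_head *head);

The rest of the core.c changes are the PREEMPT_ACTIVE bookkeeping moving into preempt_active_enter()/preempt_active_exit() (defined in include/linux/preempt.h, also outside this diff), __schedule() now requiring its callers to have preemption disabled, scheduler_tick() calling the new calc_global_load_tick(), and get_iowait_load() reporting rq->load.weight instead of cpu_load[0].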
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..f5a64ffad176 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
567{ 567{
568 cputime_t old; 568 cputime_t old;
569 569
570 while (new > (old = ACCESS_ONCE(*counter))) 570 while (new > (old = READ_ONCE(*counter)))
571 cmpxchg_cputime(counter, old, new); 571 cmpxchg_cputime(counter, old, new);
572} 572}
573 573
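cputime_advance() above is a lock-free "monotonic maximum": it only ever moves *counter forward, and it retries when the cmpxchg() observes that somebody else advanced the counter first. The same idiom shows up below as __update_gt_cputime() in posix-cpu-timers.c. Spelled out as a generic helper to make the retry logic explicit (advance_max() is a made-up name; the atomic64_t flavour matches the posix-cpu-timers variant rather than the cputime_t one here):

#include <linux/atomic.h>

/* Advance *counter to @new unless it is already >= @new. */
static void advance_max(atomic64_t *counter, u64 new)
{
	u64 old = atomic64_read(counter);

	while (old < new) {
		u64 seen = atomic64_cmpxchg(counter, old, new);

		if (seen == old)	/* we installed @new */
			break;
		old = seen;		/* lost the race; re-check against winner */
	}
}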
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145088fd..890ce951c717 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
995 rq = cpu_rq(cpu); 995 rq = cpu_rq(cpu);
996 996
997 rcu_read_lock(); 997 rcu_read_lock();
998 curr = ACCESS_ONCE(rq->curr); /* unlocked access */ 998 curr = READ_ONCE(rq->curr); /* unlocked access */
999 999
1000 /* 1000 /*
1001 * If we are dealing with a -deadline task, we must 1001 * If we are dealing with a -deadline task, we must
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa4105e48..0d4632f7799b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
141 * 141 *
142 * This idea comes from the SD scheduler of Con Kolivas: 142 * This idea comes from the SD scheduler of Con Kolivas:
143 */ 143 */
144static int get_update_sysctl_factor(void) 144static unsigned int get_update_sysctl_factor(void)
145{ 145{
146 unsigned int cpus = min_t(int, num_online_cpus(), 8); 146 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
147 unsigned int factor; 147 unsigned int factor;
148 148
149 switch (sysctl_sched_tunable_scaling) { 149 switch (sysctl_sched_tunable_scaling) {
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
576 loff_t *ppos) 576 loff_t *ppos)
577{ 577{
578 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 578 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
579 int factor = get_update_sysctl_factor(); 579 unsigned int factor = get_update_sysctl_factor();
580 580
581 if (ret || !write) 581 if (ret || !write)
582 return ret; 582 return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
834 834
835static unsigned int task_scan_min(struct task_struct *p) 835static unsigned int task_scan_min(struct task_struct *p)
836{ 836{
837 unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); 837 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
838 unsigned int scan, floor; 838 unsigned int scan, floor;
839 unsigned int windows = 1; 839 unsigned int windows = 1;
840 840
@@ -1794,7 +1794,12 @@ static void task_numa_placement(struct task_struct *p)
1794 u64 runtime, period; 1794 u64 runtime, period;
1795 spinlock_t *group_lock = NULL; 1795 spinlock_t *group_lock = NULL;
1796 1796
1797 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1797 /*
1798 * The p->mm->numa_scan_seq field gets updated without
1799 * exclusive access. Use READ_ONCE() here to ensure
1800 * that the field is read in a single access:
1801 */
1802 seq = READ_ONCE(p->mm->numa_scan_seq);
1798 if (p->numa_scan_seq == seq) 1803 if (p->numa_scan_seq == seq)
1799 return; 1804 return;
1800 p->numa_scan_seq = seq; 1805 p->numa_scan_seq = seq;
@@ -1938,7 +1943,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1938 } 1943 }
1939 1944
1940 rcu_read_lock(); 1945 rcu_read_lock();
1941 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); 1946 tsk = READ_ONCE(cpu_rq(cpu)->curr);
1942 1947
1943 if (!cpupid_match_pid(tsk, cpupid)) 1948 if (!cpupid_match_pid(tsk, cpupid))
1944 goto no_join; 1949 goto no_join;
@@ -2107,7 +2112,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2107 2112
2108static void reset_ptenuma_scan(struct task_struct *p) 2113static void reset_ptenuma_scan(struct task_struct *p)
2109{ 2114{
2110 ACCESS_ONCE(p->mm->numa_scan_seq)++; 2115 /*
2116 * We only did a read acquisition of the mmap sem, so
2117 * p->mm->numa_scan_seq is written to without exclusive access
2118 * and the update is not guaranteed to be atomic. That's not
2119 * much of an issue though, since this is just used for
2120 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2121 * expensive, to avoid any form of compiler optimizations:
2122 */
2123 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2111 p->mm->numa_scan_offset = 0; 2124 p->mm->numa_scan_offset = 0;
2112} 2125}
2113 2126
@@ -4323,6 +4336,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4323} 4336}
4324 4337
4325#ifdef CONFIG_SMP 4338#ifdef CONFIG_SMP
4339
4340/*
4341 * per rq 'load' arrray crap; XXX kill this.
4342 */
4343
4344/*
4345 * The exact cpuload at various idx values, calculated at every tick would be
4346 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
4347 *
4348 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
4349 * on nth tick when cpu may be busy, then we have:
4350 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4351 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
4352 *
4353 * decay_load_missed() below does efficient calculation of
4354 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4355 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
4356 *
4357 * The calculation is approximated on a 128 point scale.
4358 * degrade_zero_ticks is the number of ticks after which load at any
4359 * particular idx is approximated to be zero.
4360 * degrade_factor is a precomputed table, a row for each load idx.
4361 * Each column corresponds to degradation factor for a power of two ticks,
4362 * based on 128 point scale.
4363 * Example:
4364 * row 2, col 3 (=12) says that the degradation at load idx 2 after
4365 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
4366 *
4367 * With this power of 2 load factors, we can degrade the load n times
4368 * by looking at 1 bits in n and doing as many mult/shift instead of
4369 * n mult/shifts needed by the exact degradation.
4370 */
4371#define DEGRADE_SHIFT 7
4372static const unsigned char
4373 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4374static const unsigned char
4375 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4376 {0, 0, 0, 0, 0, 0, 0, 0},
4377 {64, 32, 8, 0, 0, 0, 0, 0},
4378 {96, 72, 40, 12, 1, 0, 0},
4379 {112, 98, 75, 43, 15, 1, 0},
4380 {120, 112, 98, 76, 45, 16, 2} };
4381
4382/*
4383 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4384 * would be when CPU is idle and so we just decay the old load without
4385 * adding any new load.
4386 */
4387static unsigned long
4388decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4389{
4390 int j = 0;
4391
4392 if (!missed_updates)
4393 return load;
4394
4395 if (missed_updates >= degrade_zero_ticks[idx])
4396 return 0;
4397
4398 if (idx == 1)
4399 return load >> missed_updates;
4400
4401 while (missed_updates) {
4402 if (missed_updates % 2)
4403 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4404
4405 missed_updates >>= 1;
4406 j++;
4407 }
4408 return load;
4409}
4410
4411/*
4412 * Update rq->cpu_load[] statistics. This function is usually called every
4413 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
4414 * every tick. We fix it up based on jiffies.
4415 */
4416static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4417 unsigned long pending_updates)
4418{
4419 int i, scale;
4420
4421 this_rq->nr_load_updates++;
4422
4423 /* Update our load: */
4424 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4425 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4426 unsigned long old_load, new_load;
4427
4428 /* scale is effectively 1 << i now, and >> i divides by scale */
4429
4430 old_load = this_rq->cpu_load[i];
4431 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4432 new_load = this_load;
4433 /*
4434 * Round up the averaging division if load is increasing. This
4435 * prevents us from getting stuck on 9 if the load is 10, for
4436 * example.
4437 */
4438 if (new_load > old_load)
4439 new_load += scale - 1;
4440
4441 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4442 }
4443
4444 sched_avg_update(this_rq);
4445}
4446
4447#ifdef CONFIG_NO_HZ_COMMON
4448/*
4449 * There is no sane way to deal with nohz on smp when using jiffies because the
4450 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4451 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4452 *
4453 * Therefore we cannot use the delta approach from the regular tick since that
4454 * would seriously skew the load calculation. However we'll make do for those
4455 * updates happening while idle (nohz_idle_balance) or coming out of idle
4456 * (tick_nohz_idle_exit).
4457 *
4458 * This means we might still be one tick off for nohz periods.
4459 */
4460
4461/*
4462 * Called from nohz_idle_balance() to update the load ratings before doing the
4463 * idle balance.
4464 */
4465static void update_idle_cpu_load(struct rq *this_rq)
4466{
4467 unsigned long curr_jiffies = READ_ONCE(jiffies);
4468 unsigned long load = this_rq->cfs.runnable_load_avg;
4469 unsigned long pending_updates;
4470
4471 /*
4472 * bail if there's load or we're actually up-to-date.
4473 */
4474 if (load || curr_jiffies == this_rq->last_load_update_tick)
4475 return;
4476
4477 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4478 this_rq->last_load_update_tick = curr_jiffies;
4479
4480 __update_cpu_load(this_rq, load, pending_updates);
4481}
4482
4483/*
4484 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
4485 */
4486void update_cpu_load_nohz(void)
4487{
4488 struct rq *this_rq = this_rq();
4489 unsigned long curr_jiffies = READ_ONCE(jiffies);
4490 unsigned long pending_updates;
4491
4492 if (curr_jiffies == this_rq->last_load_update_tick)
4493 return;
4494
4495 raw_spin_lock(&this_rq->lock);
4496 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4497 if (pending_updates) {
4498 this_rq->last_load_update_tick = curr_jiffies;
4499 /*
4500 * We were idle, this means load 0, the current load might be
4501 * !0 due to remote wakeups and the sort.
4502 */
4503 __update_cpu_load(this_rq, 0, pending_updates);
4504 }
4505 raw_spin_unlock(&this_rq->lock);
4506}
4507#endif /* CONFIG_NO_HZ */
4508
4509/*
4510 * Called from scheduler_tick()
4511 */
4512void update_cpu_load_active(struct rq *this_rq)
4513{
4514 unsigned long load = this_rq->cfs.runnable_load_avg;
4515 /*
4516 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4517 */
4518 this_rq->last_load_update_tick = jiffies;
4519 __update_cpu_load(this_rq, load, 1);
4520}
4521
4326/* Used instead of source_load when we know the type == 0 */ 4522/* Used instead of source_load when we know the type == 0 */
4327static unsigned long weighted_cpuload(const int cpu) 4523static unsigned long weighted_cpuload(const int cpu)
4328{ 4524{
@@ -4375,7 +4571,7 @@ static unsigned long capacity_orig_of(int cpu)
4375static unsigned long cpu_avg_load_per_task(int cpu) 4571static unsigned long cpu_avg_load_per_task(int cpu)
4376{ 4572{
4377 struct rq *rq = cpu_rq(cpu); 4573 struct rq *rq = cpu_rq(cpu);
4378 unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); 4574 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4379 unsigned long load_avg = rq->cfs.runnable_load_avg; 4575 unsigned long load_avg = rq->cfs.runnable_load_avg;
4380 4576
4381 if (nr_running) 4577 if (nr_running)
@@ -5467,10 +5663,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
5467} 5663}
5468 5664
5469#ifdef CONFIG_NUMA_BALANCING 5665#ifdef CONFIG_NUMA_BALANCING
5470/* Returns true if the destination node has incurred more faults */ 5666/*
5667 * Returns true if the destination node is the preferred node.
5668 * Needs to match fbq_classify_rq(): if there is a runnable task
5669 * that is not on its preferred node, we should identify it.
5670 */
5471static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5671static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5472{ 5672{
5473 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5673 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5674 unsigned long src_faults, dst_faults;
5474 int src_nid, dst_nid; 5675 int src_nid, dst_nid;
5475 5676
5476 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5677 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5484,29 +5685,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5484 if (src_nid == dst_nid) 5685 if (src_nid == dst_nid)
5485 return false; 5686 return false;
5486 5687
5487 if (numa_group) {
5488 /* Task is already in the group's interleave set. */
5489 if (node_isset(src_nid, numa_group->active_nodes))
5490 return false;
5491
5492 /* Task is moving into the group's interleave set. */
5493 if (node_isset(dst_nid, numa_group->active_nodes))
5494 return true;
5495
5496 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5497 }
5498
5499 /* Encourage migration to the preferred node. */ 5688 /* Encourage migration to the preferred node. */
5500 if (dst_nid == p->numa_preferred_nid) 5689 if (dst_nid == p->numa_preferred_nid)
5501 return true; 5690 return true;
5502 5691
5503 return task_faults(p, dst_nid) > task_faults(p, src_nid); 5692 /* Migrating away from the preferred node is bad. */
5693 if (src_nid == p->numa_preferred_nid)
5694 return false;
5695
5696 if (numa_group) {
5697 src_faults = group_faults(p, src_nid);
5698 dst_faults = group_faults(p, dst_nid);
5699 } else {
5700 src_faults = task_faults(p, src_nid);
5701 dst_faults = task_faults(p, dst_nid);
5702 }
5703
5704 return dst_faults > src_faults;
5504} 5705}
5505 5706
5506 5707
5507static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5708static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5508{ 5709{
5509 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5710 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5711 unsigned long src_faults, dst_faults;
5510 int src_nid, dst_nid; 5712 int src_nid, dst_nid;
5511 5713
5512 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5714 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5521,23 +5723,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5521 if (src_nid == dst_nid) 5723 if (src_nid == dst_nid)
5522 return false; 5724 return false;
5523 5725
5524 if (numa_group) { 5726 /* Migrating away from the preferred node is bad. */
5525 /* Task is moving within/into the group's interleave set. */ 5727 if (src_nid == p->numa_preferred_nid)
5526 if (node_isset(dst_nid, numa_group->active_nodes)) 5728 return true;
5527 return false;
5528 5729
5529 /* Task is moving out of the group's interleave set. */ 5730 /* Encourage migration to the preferred node. */
5530 if (node_isset(src_nid, numa_group->active_nodes)) 5731 if (dst_nid == p->numa_preferred_nid)
5531 return true; 5732 return false;
5532 5733
5533 return group_faults(p, dst_nid) < group_faults(p, src_nid); 5734 if (numa_group) {
5735 src_faults = group_faults(p, src_nid);
5736 dst_faults = group_faults(p, dst_nid);
5737 } else {
5738 src_faults = task_faults(p, src_nid);
5739 dst_faults = task_faults(p, dst_nid);
5534 } 5740 }
5535 5741
5536 /* Migrating away from the preferred node is always bad. */ 5742 return dst_faults < src_faults;
5537 if (src_nid == p->numa_preferred_nid)
5538 return true;
5539
5540 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5541} 5743}
5542 5744
5543#else 5745#else
@@ -6037,8 +6239,8 @@ static unsigned long scale_rt_capacity(int cpu)
6037 * Since we're reading these variables without serialization make sure 6239 * Since we're reading these variables without serialization make sure
6038 * we read them once before doing sanity checks on them. 6240 * we read them once before doing sanity checks on them.
6039 */ 6241 */
6040 age_stamp = ACCESS_ONCE(rq->age_stamp); 6242 age_stamp = READ_ONCE(rq->age_stamp);
6041 avg = ACCESS_ONCE(rq->rt_avg); 6243 avg = READ_ONCE(rq->rt_avg);
6042 delta = __rq_clock_broken(rq) - age_stamp; 6244 delta = __rq_clock_broken(rq) - age_stamp;
6043 6245
6044 if (unlikely(delta < 0)) 6246 if (unlikely(delta < 0))
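Besides the READ_ONCE() conversions and the int -> unsigned int cleanups around get_update_sysctl_factor(), two substantive things happen in fair.c: the NUMA migrate_improves_locality()/migrate_degrades_locality() pair is rewritten so both helpers apply the same symmetric rules (preferred node first, then a plain src/dst fault comparison, using group faults when the task has a numa_group), and the rq->cpu_load[] decay machinery moves here from kernel/sched/proc.c (the large block removed from loadavg.c below). decay_load_missed() folds n missed ticks of load = load * (2^idx - 1) / 2^idx by walking the set bits of n against the precomputed 128-point degrade_factor table, so the cost is O(log n) multiplies instead of n. A slow reference loop, for illustration only (decay_load_exact() is a made-up name and not part of the patch):

/* Apply load *= (2^idx - 1) / 2^idx once per missed tick. */
static unsigned long decay_load_exact(unsigned long load,
				      unsigned long missed_updates, int idx)
{
	while (missed_updates--)
		load -= load >> idx;
	return load;
}

As a sanity check of the table comment above: for idx = 2 and 8 missed ticks, (3/4)^8 = 6561/65536 ~= 0.100, which is ~12.8 on the 128-point scale, matching the degrade_factor entry of 12 at "row 2, col 3".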
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 8ecd552fe4f2..ef7159012cf3 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
1/* 1/*
2 * kernel/sched/proc.c 2 * kernel/sched/loadavg.c
3 * 3 *
4 * Kernel load calculations, forked from sched/core.c 4 * This file contains the magic bits required to compute the global loadavg
5 * figure. Its a silly number but people think its important. We go through
6 * great pains to make it work on big machines and tickless kernels.
5 */ 7 */
6 8
7#include <linux/export.h> 9#include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
81 long nr_active, delta = 0; 83 long nr_active, delta = 0;
82 84
83 nr_active = this_rq->nr_running; 85 nr_active = this_rq->nr_running;
84 nr_active += (long) this_rq->nr_uninterruptible; 86 nr_active += (long)this_rq->nr_uninterruptible;
85 87
86 if (nr_active != this_rq->calc_load_active) { 88 if (nr_active != this_rq->calc_load_active) {
87 delta = nr_active - this_rq->calc_load_active; 89 delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
186 delta = calc_load_fold_active(this_rq); 188 delta = calc_load_fold_active(this_rq);
187 if (delta) { 189 if (delta) {
188 int idx = calc_load_write_idx(); 190 int idx = calc_load_write_idx();
191
189 atomic_long_add(delta, &calc_load_idle[idx]); 192 atomic_long_add(delta, &calc_load_idle[idx]);
190 } 193 }
191} 194}
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
241{ 244{
242 unsigned long result = 1UL << frac_bits; 245 unsigned long result = 1UL << frac_bits;
243 246
244 if (n) for (;;) { 247 if (n) {
245 if (n & 1) { 248 for (;;) {
246 result *= x; 249 if (n & 1) {
247 result += 1UL << (frac_bits - 1); 250 result *= x;
248 result >>= frac_bits; 251 result += 1UL << (frac_bits - 1);
252 result >>= frac_bits;
253 }
254 n >>= 1;
255 if (!n)
256 break;
257 x *= x;
258 x += 1UL << (frac_bits - 1);
259 x >>= frac_bits;
249 } 260 }
250 n >>= 1;
251 if (!n)
252 break;
253 x *= x;
254 x += 1UL << (frac_bits - 1);
255 x >>= frac_bits;
256 } 261 }
257 262
258 return result; 263 return result;
@@ -285,7 +290,6 @@ static unsigned long
285calc_load_n(unsigned long load, unsigned long exp, 290calc_load_n(unsigned long load, unsigned long exp,
286 unsigned long active, unsigned int n) 291 unsigned long active, unsigned int n)
287{ 292{
288
289 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 293 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
290} 294}
291 295
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
339/* 343/*
340 * calc_load - update the avenrun load estimates 10 ticks after the 344 * calc_load - update the avenrun load estimates 10 ticks after the
341 * CPUs have updated calc_load_tasks. 345 * CPUs have updated calc_load_tasks.
346 *
347 * Called from the global timer code.
342 */ 348 */
343void calc_global_load(unsigned long ticks) 349void calc_global_load(unsigned long ticks)
344{ 350{
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
370} 376}
371 377
372/* 378/*
373 * Called from update_cpu_load() to periodically update this CPU's 379 * Called from scheduler_tick() to periodically update this CPU's
374 * active count. 380 * active count.
375 */ 381 */
376static void calc_load_account_active(struct rq *this_rq) 382void calc_global_load_tick(struct rq *this_rq)
377{ 383{
378 long delta; 384 long delta;
379 385
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
386 392
387 this_rq->calc_load_update += LOAD_FREQ; 393 this_rq->calc_load_update += LOAD_FREQ;
388} 394}
389
390/*
391 * End of global load-average stuff
392 */
393
394/*
395 * The exact cpuload at various idx values, calculated at every tick would be
396 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
397 *
398 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
399 * on nth tick when cpu may be busy, then we have:
400 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
401 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
402 *
403 * decay_load_missed() below does efficient calculation of
404 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
405 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
406 *
407 * The calculation is approximated on a 128 point scale.
408 * degrade_zero_ticks is the number of ticks after which load at any
409 * particular idx is approximated to be zero.
410 * degrade_factor is a precomputed table, a row for each load idx.
411 * Each column corresponds to degradation factor for a power of two ticks,
412 * based on 128 point scale.
413 * Example:
414 * row 2, col 3 (=12) says that the degradation at load idx 2 after
415 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
416 *
417 * With this power of 2 load factors, we can degrade the load n times
418 * by looking at 1 bits in n and doing as many mult/shift instead of
419 * n mult/shifts needed by the exact degradation.
420 */
421#define DEGRADE_SHIFT 7
422static const unsigned char
423 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
424static const unsigned char
425 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
426 {0, 0, 0, 0, 0, 0, 0, 0},
427 {64, 32, 8, 0, 0, 0, 0, 0},
428 {96, 72, 40, 12, 1, 0, 0},
429 {112, 98, 75, 43, 15, 1, 0},
430 {120, 112, 98, 76, 45, 16, 2} };
431
432/*
433 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
434 * would be when CPU is idle and so we just decay the old load without
435 * adding any new load.
436 */
437static unsigned long
438decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
439{
440 int j = 0;
441
442 if (!missed_updates)
443 return load;
444
445 if (missed_updates >= degrade_zero_ticks[idx])
446 return 0;
447
448 if (idx == 1)
449 return load >> missed_updates;
450
451 while (missed_updates) {
452 if (missed_updates % 2)
453 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
454
455 missed_updates >>= 1;
456 j++;
457 }
458 return load;
459}
460
461/*
462 * Update rq->cpu_load[] statistics. This function is usually called every
463 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
464 * every tick. We fix it up based on jiffies.
465 */
466static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
467 unsigned long pending_updates)
468{
469 int i, scale;
470
471 this_rq->nr_load_updates++;
472
473 /* Update our load: */
474 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
475 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
476 unsigned long old_load, new_load;
477
478 /* scale is effectively 1 << i now, and >> i divides by scale */
479
480 old_load = this_rq->cpu_load[i];
481 old_load = decay_load_missed(old_load, pending_updates - 1, i);
482 new_load = this_load;
483 /*
484 * Round up the averaging division if load is increasing. This
485 * prevents us from getting stuck on 9 if the load is 10, for
486 * example.
487 */
488 if (new_load > old_load)
489 new_load += scale - 1;
490
491 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
492 }
493
494 sched_avg_update(this_rq);
495}
496
497#ifdef CONFIG_SMP
498static inline unsigned long get_rq_runnable_load(struct rq *rq)
499{
500 return rq->cfs.runnable_load_avg;
501}
502#else
503static inline unsigned long get_rq_runnable_load(struct rq *rq)
504{
505 return rq->load.weight;
506}
507#endif
508
509#ifdef CONFIG_NO_HZ_COMMON
510/*
511 * There is no sane way to deal with nohz on smp when using jiffies because the
512 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
513 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
514 *
515 * Therefore we cannot use the delta approach from the regular tick since that
516 * would seriously skew the load calculation. However we'll make do for those
517 * updates happening while idle (nohz_idle_balance) or coming out of idle
518 * (tick_nohz_idle_exit).
519 *
520 * This means we might still be one tick off for nohz periods.
521 */
522
523/*
524 * Called from nohz_idle_balance() to update the load ratings before doing the
525 * idle balance.
526 */
527void update_idle_cpu_load(struct rq *this_rq)
528{
529 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
530 unsigned long load = get_rq_runnable_load(this_rq);
531 unsigned long pending_updates;
532
533 /*
534 * bail if there's load or we're actually up-to-date.
535 */
536 if (load || curr_jiffies == this_rq->last_load_update_tick)
537 return;
538
539 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
540 this_rq->last_load_update_tick = curr_jiffies;
541
542 __update_cpu_load(this_rq, load, pending_updates);
543}
544
545/*
546 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
547 */
548void update_cpu_load_nohz(void)
549{
550 struct rq *this_rq = this_rq();
551 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
552 unsigned long pending_updates;
553
554 if (curr_jiffies == this_rq->last_load_update_tick)
555 return;
556
557 raw_spin_lock(&this_rq->lock);
558 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
559 if (pending_updates) {
560 this_rq->last_load_update_tick = curr_jiffies;
561 /*
562 * We were idle, this means load 0, the current load might be
563 * !0 due to remote wakeups and the sort.
564 */
565 __update_cpu_load(this_rq, 0, pending_updates);
566 }
567 raw_spin_unlock(&this_rq->lock);
568}
569#endif /* CONFIG_NO_HZ */
570
571/*
572 * Called from scheduler_tick()
573 */
574void update_cpu_load_active(struct rq *this_rq)
575{
576 unsigned long load = get_rq_runnable_load(this_rq);
577 /*
578 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
579 */
580 this_rq->last_load_update_tick = jiffies;
581 __update_cpu_load(this_rq, load, 1);
582
583 calc_load_account_active(this_rq);
584}
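The proc.c -> loadavg.c rename leaves this file holding only the global load-average (avenrun) code: the cpu_load[] block at the bottom is the part that moved to fair.c, and calc_load_account_active(), previously chained off update_cpu_load_active(), is renamed calc_global_load_tick() and called directly from scheduler_tick() (see the core.c and sched.h hunks). For context, the avenrun figures are an exponential moving average kept in 11-bit fixed point (FIXED_1 = 1 << 11 = 2048, with EXP_1/EXP_5/EXP_15 ~= 1884/2014/2037, i.e. 2048 * e^(-5s/1min), e^(-5s/5min), e^(-5s/15min)). The core update, calc_load(), lives earlier in this file and is untouched by the patch; roughly (reproduced for context, not guaranteed verbatim):

static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	unsigned long newload;

	newload = load * exp + active * (FIXED_1 - exp);
	if (active >= load)
		newload += FIXED_1 - 1;	/* round up while load is rising */

	return newload / FIXED_1;
}

calc_load_n() folds n missed LOAD_FREQ (5 s) periods at once by raising exp to the n-th power with fixed_power_int(), the exponentiation-by-squaring loop that the patch only re-indents.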
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3874..560d2fa623c3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1323 rq = cpu_rq(cpu); 1323 rq = cpu_rq(cpu);
1324 1324
1325 rcu_read_lock(); 1325 rcu_read_lock();
1326 curr = ACCESS_ONCE(rq->curr); /* unlocked access */ 1326 curr = READ_ONCE(rq->curr); /* unlocked access */
1327 1327
1328 /* 1328 /*
1329 * If the current task on @p's runqueue is an RT task, then 1329 * If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e129993958..d85455539d5c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
26extern unsigned long calc_load_update; 26extern unsigned long calc_load_update;
27extern atomic_long_t calc_load_tasks; 27extern atomic_long_t calc_load_tasks;
28 28
29extern void calc_global_load_tick(struct rq *this_rq);
29extern long calc_load_fold_active(struct rq *this_rq); 30extern long calc_load_fold_active(struct rq *this_rq);
31
32#ifdef CONFIG_SMP
30extern void update_cpu_load_active(struct rq *this_rq); 33extern void update_cpu_load_active(struct rq *this_rq);
34#else
35static inline void update_cpu_load_active(struct rq *this_rq) { }
36#endif
31 37
32/* 38/*
33 * Helpers for converting nanosecond timing to jiffy resolution 39 * Helpers for converting nanosecond timing to jiffy resolution
@@ -707,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
707 713
708static inline u64 __rq_clock_broken(struct rq *rq) 714static inline u64 __rq_clock_broken(struct rq *rq)
709{ 715{
710 return ACCESS_ONCE(rq->clock); 716 return READ_ONCE(rq->clock);
711} 717}
712 718
713static inline u64 rq_clock(struct rq *rq) 719static inline u64 rq_clock(struct rq *rq)
@@ -1298,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1298 1304
1299unsigned long to_ratio(u64 period, u64 runtime); 1305unsigned long to_ratio(u64 period, u64 runtime);
1300 1306
1301extern void update_idle_cpu_load(struct rq *this_rq);
1302
1303extern void init_task_runnable_average(struct task_struct *p); 1307extern void init_task_runnable_average(struct task_struct *p);
1304 1308
1305static inline void add_nr_running(struct rq *rq, unsigned count) 1309static inline void add_nr_running(struct rq *rq, unsigned count)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339656..077ebbd5e10f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
174{ 174{
175 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 175 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
176 176
177 if (!cputimer->running) 177 /* Check if cputimer isn't running. This is accessed without locking. */
178 if (!READ_ONCE(cputimer->running))
178 return false; 179 return false;
179 180
180 /* 181 /*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
215 if (!cputimer_running(tsk)) 216 if (!cputimer_running(tsk))
216 return; 217 return;
217 218
218 raw_spin_lock(&cputimer->lock); 219 atomic64_add(cputime, &cputimer->cputime_atomic.utime);
219 cputimer->cputime.utime += cputime;
220 raw_spin_unlock(&cputimer->lock);
221} 220}
222 221
223/** 222/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
238 if (!cputimer_running(tsk)) 237 if (!cputimer_running(tsk))
239 return; 238 return;
240 239
241 raw_spin_lock(&cputimer->lock); 240 atomic64_add(cputime, &cputimer->cputime_atomic.stime);
242 cputimer->cputime.stime += cputime;
243 raw_spin_unlock(&cputimer->lock);
244} 241}
245 242
246/** 243/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
261 if (!cputimer_running(tsk)) 258 if (!cputimer_running(tsk))
262 return; 259 return;
263 260
264 raw_spin_lock(&cputimer->lock); 261 atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
265 cputimer->cputime.sum_exec_runtime += ns;
266 raw_spin_unlock(&cputimer->lock);
267} 262}
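With stats.h, the per-tick accounting hot path no longer takes cputimer->lock: utime, stime and sum_exec_runtime become atomic64_t fields grouped in a struct task_cputime_atomic embedded in struct thread_group_cputimer, so account_group_user_time() and friends shrink to a single atomic64_add(), and cputimer_running() reads the unlocked running flag with READ_ONCE(). The type itself is declared in include/linux/sched.h, outside this kernel/-only diff; its approximate shape (reproduced for context, not guaranteed verbatim):

struct task_cputime_atomic {
	atomic64_t utime;
	atomic64_t stime;
	atomic64_t sum_exec_runtime;
};

struct thread_group_cputimer {
	struct task_cputime_atomic cputime_atomic;
	int running;
};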
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f36..2ccec988d6b7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
601 601
602__sched int bit_wait_timeout(struct wait_bit_key *word) 602__sched int bit_wait_timeout(struct wait_bit_key *word)
603{ 603{
604 unsigned long now = ACCESS_ONCE(jiffies); 604 unsigned long now = READ_ONCE(jiffies);
605 if (signal_pending_state(current->state, current)) 605 if (signal_pending_state(current->state, current))
606 return 1; 606 return 1;
607 if (time_after_eq(now, word->timeout)) 607 if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
613 613
614__sched int bit_wait_io_timeout(struct wait_bit_key *word) 614__sched int bit_wait_io_timeout(struct wait_bit_key *word)
615{ 615{
616 unsigned long now = ACCESS_ONCE(jiffies); 616 unsigned long now = READ_ONCE(jiffies);
617 if (signal_pending_state(current->state, current)) 617 if (signal_pending_state(current->state, current))
618 return 1; 618 return 1;
619 if (time_after_eq(now, word->timeout)) 619 if (time_after_eq(now, word->timeout))
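In wait.c only the jiffies snapshot changes; the comparison stays time_after_eq(), which is wrap-safe. Stripped of its typecheck() wrappers, that macro is essentially the signed-difference trick below (from include/linux/jiffies.h, shown here for context with an illustrative name), which gives the right answer across a jiffies wrap as long as the two stamps are less than LONG_MAX ticks apart:

/* Core of time_after_eq(a, b): true if a is at or after b. */
#define my_time_after_eq(a, b)	((long)((a) - (b)) >= 0)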
diff --git a/kernel/signal.c b/kernel/signal.c
index d51c5ddd855c..f19833b5db3c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
245 * RETURNS: 245 * RETURNS:
246 * %true if @mask is set, %false if made noop because @task was dying. 246 * %true if @mask is set, %false if made noop because @task was dying.
247 */ 247 */
248bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) 248bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
249{ 249{
250 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | 250 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
251 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); 251 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
297 * CONTEXT: 297 * CONTEXT:
298 * Must be called with @task->sighand->siglock held. 298 * Must be called with @task->sighand->siglock held.
299 */ 299 */
300void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) 300void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
301{ 301{
302 BUG_ON(mask & ~JOBCTL_PENDING_MASK); 302 BUG_ON(mask & ~JOBCTL_PENDING_MASK);
303 303
@@ -2000,7 +2000,7 @@ static bool do_signal_stop(int signr)
2000 struct signal_struct *sig = current->signal; 2000 struct signal_struct *sig = current->signal;
2001 2001
2002 if (!(current->jobctl & JOBCTL_STOP_PENDING)) { 2002 if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
2003 unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; 2003 unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
2004 struct task_struct *t; 2004 struct task_struct *t;
2005 2005
2006 /* signr will be recorded in task->jobctl for retries */ 2006 /* signr will be recorded in task->jobctl for retries */
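In signal.c, task_set_jobctl_pending(), task_clear_jobctl_pending() and do_signal_stop() switch their mask type from unsigned int to unsigned long to match task_struct::jobctl, which is an unsigned long; the JOBCTL_* constants are widened to 1UL shifts in include/linux/sched.h in the same series (outside this kernel/-only diff). Approximately, and only as an illustration of the header side:

#define JOBCTL_STOP_PENDING_BIT	17				/* unchanged */
#define JOBCTL_STOP_PENDING	(1UL << JOBCTL_STOP_PENDING_BIT)	/* was 1 << ... */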
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da74abf0..892e3dae0aac 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
196 return 0; 196 return 0;
197} 197}
198 198
199static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 199/*
200 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
201 * to avoid race conditions with concurrent updates to cputime.
202 */
203static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
200{ 204{
201 if (b->utime > a->utime) 205 u64 curr_cputime;
202 a->utime = b->utime; 206retry:
207 curr_cputime = atomic64_read(cputime);
208 if (sum_cputime > curr_cputime) {
209 if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
210 goto retry;
211 }
212}
203 213
204 if (b->stime > a->stime) 214static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
205 a->stime = b->stime; 215{
216 __update_gt_cputime(&cputime_atomic->utime, sum->utime);
217 __update_gt_cputime(&cputime_atomic->stime, sum->stime);
218 __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
219}
206 220
207 if (b->sum_exec_runtime > a->sum_exec_runtime) 221/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
208 a->sum_exec_runtime = b->sum_exec_runtime; 222static inline void sample_cputime_atomic(struct task_cputime *times,
223 struct task_cputime_atomic *atomic_times)
224{
225 times->utime = atomic64_read(&atomic_times->utime);
226 times->stime = atomic64_read(&atomic_times->stime);
227 times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
209} 228}
210 229
211void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) 230void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
212{ 231{
213 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 232 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
214 struct task_cputime sum; 233 struct task_cputime sum;
215 unsigned long flags;
216 234
217 if (!cputimer->running) { 235 /* Check if cputimer isn't running. This is accessed without locking. */
236 if (!READ_ONCE(cputimer->running)) {
218 /* 237 /*
219 * The POSIX timer interface allows for absolute time expiry 238 * The POSIX timer interface allows for absolute time expiry
220 * values through the TIMER_ABSTIME flag, therefore we have 239 * values through the TIMER_ABSTIME flag, therefore we have
221 * to synchronize the timer to the clock every time we start 240 * to synchronize the timer to the clock every time we start it.
222 * it.
223 */ 241 */
224 thread_group_cputime(tsk, &sum); 242 thread_group_cputime(tsk, &sum);
225 raw_spin_lock_irqsave(&cputimer->lock, flags); 243 update_gt_cputime(&cputimer->cputime_atomic, &sum);
226 cputimer->running = 1; 244
227 update_gt_cputime(&cputimer->cputime, &sum); 245 /*
228 } else 246 * We're setting cputimer->running without a lock. Ensure
229 raw_spin_lock_irqsave(&cputimer->lock, flags); 247 * this only gets written to in one operation. We set
230 *times = cputimer->cputime; 248 * running after update_gt_cputime() as a small optimization,
231 raw_spin_unlock_irqrestore(&cputimer->lock, flags); 249 * but barriers are not required because update_gt_cputime()
250 * can handle concurrent updates.
251 */
252 WRITE_ONCE(cputimer->running, 1);
253 }
254 sample_cputime_atomic(times, &cputimer->cputime_atomic);
232} 255}
233 256
234/* 257/*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
582 if (!task_cputime_zero(&tsk->cputime_expires)) 605 if (!task_cputime_zero(&tsk->cputime_expires))
583 return false; 606 return false;
584 607
585 if (tsk->signal->cputimer.running) 608 /* Check if cputimer is running. This is accessed without locking. */
609 if (READ_ONCE(tsk->signal->cputimer.running))
586 return false; 610 return false;
587 611
588 return true; 612 return true;
@@ -852,10 +876,10 @@ static void check_thread_timers(struct task_struct *tsk,
852 /* 876 /*
853 * Check for the special case thread timers. 877 * Check for the special case thread timers.
854 */ 878 */
855 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); 879 soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
856 if (soft != RLIM_INFINITY) { 880 if (soft != RLIM_INFINITY) {
857 unsigned long hard = 881 unsigned long hard =
858 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); 882 READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
859 883
860 if (hard != RLIM_INFINITY && 884 if (hard != RLIM_INFINITY &&
861 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 885 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
882 } 906 }
883} 907}
884 908
885static void stop_process_timers(struct signal_struct *sig) 909static inline void stop_process_timers(struct signal_struct *sig)
886{ 910{
887 struct thread_group_cputimer *cputimer = &sig->cputimer; 911 struct thread_group_cputimer *cputimer = &sig->cputimer;
888 unsigned long flags;
889 912
890 raw_spin_lock_irqsave(&cputimer->lock, flags); 913 /* Turn off cputimer->running. This is done without locking. */
891 cputimer->running = 0; 914 WRITE_ONCE(cputimer->running, 0);
892 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
893} 915}
894 916
895static u32 onecputick; 917static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
958 SIGPROF); 980 SIGPROF);
959 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 981 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
960 SIGVTALRM); 982 SIGVTALRM);
961 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); 983 soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
962 if (soft != RLIM_INFINITY) { 984 if (soft != RLIM_INFINITY) {
963 unsigned long psecs = cputime_to_secs(ptime); 985 unsigned long psecs = cputime_to_secs(ptime);
964 unsigned long hard = 986 unsigned long hard =
965 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); 987 READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
966 cputime_t x; 988 cputime_t x;
967 if (psecs >= hard) { 989 if (psecs >= hard) {
968 /* 990 /*
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1111 } 1133 }
1112 1134
1113 sig = tsk->signal; 1135 sig = tsk->signal;
1114 if (sig->cputimer.running) { 1136 /* Check if cputimer is running. This is accessed without locking. */
1137 if (READ_ONCE(sig->cputimer.running)) {
1115 struct task_cputime group_sample; 1138 struct task_cputime group_sample;
1116 1139
1117 raw_spin_lock(&sig->cputimer.lock); 1140 sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
1118 group_sample = sig->cputimer.cputime;
1119 raw_spin_unlock(&sig->cputimer.lock);
1120 1141
1121 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1142 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1122 return 1; 1143 return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1157 * If there are any active process wide timers (POSIX 1.b, itimers, 1178 * If there are any active process wide timers (POSIX 1.b, itimers,
1158 * RLIMIT_CPU) cputimer must be running. 1179 * RLIMIT_CPU) cputimer must be running.
1159 */ 1180 */
1160 if (tsk->signal->cputimer.running) 1181 if (READ_ONCE(tsk->signal->cputimer.running))
1161 check_process_timers(tsk, &firing); 1182 check_process_timers(tsk, &firing);
1162 1183
1163 /* 1184 /*