Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--	kernel/sched/fair.c	276
1 file changed, 239 insertions, 37 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa4105e48..0d4632f7799b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
  *
  * This idea comes from the SD scheduler of Con Kolivas:
  */
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
 {
-	unsigned int cpus = min_t(int, num_online_cpus(), 8);
+	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 	unsigned int factor;
 
 	switch (sysctl_sched_tunable_scaling) {
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 		loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	int factor = get_update_sysctl_factor();
+	unsigned int factor = get_update_sysctl_factor();
 
 	if (ret || !write)
 		return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
 
 static unsigned int task_scan_min(struct task_struct *p)
 {
-	unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
 	unsigned int scan, floor;
 	unsigned int windows = 1;
 
@@ -1794,7 +1794,12 @@ static void task_numa_placement(struct task_struct *p)
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
-	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+	/*
+	 * The p->mm->numa_scan_seq field gets updated without
+	 * exclusive access. Use READ_ONCE() here to ensure
+	 * that the field is read in a single access:
+	 */
+	seq = READ_ONCE(p->mm->numa_scan_seq);
 	if (p->numa_scan_seq == seq)
 		return;
 	p->numa_scan_seq = seq;
@@ -1938,7 +1943,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 	}
 
 	rcu_read_lock();
-	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+	tsk = READ_ONCE(cpu_rq(cpu)->curr);
 
 	if (!cpupid_match_pid(tsk, cpupid))
 		goto no_join;
@@ -2107,7 +2112,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 static void reset_ptenuma_scan(struct task_struct *p)
 {
-	ACCESS_ONCE(p->mm->numa_scan_seq)++;
+	/*
+	 * We only did a read acquisition of the mmap sem, so
+	 * p->mm->numa_scan_seq is written to without exclusive access
+	 * and the update is not guaranteed to be atomic. That's not
+	 * much of an issue though, since this is just used for
+	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+	 * expensive, to avoid any form of compiler optimizations:
+	 */
+	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
 	p->mm->numa_scan_offset = 0;
 }
 
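The two comments added above describe the same pattern: p->mm->numa_scan_seq is updated without exclusive access, so the increment is allowed to race, but each individual load and store goes through READ_ONCE()/WRITE_ONCE() so the compiler cannot tear, refetch or elide it. Below is a minimal userspace sketch of that pattern; the simplified volatile-cast macros only stand in for the kernel's real READ_ONCE/WRITE_ONCE, and the plain int stands in for the mm field.

/* Userspace illustration only -- not kernel code. */
#include <stdio.h>

#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

static int numa_scan_seq;	/* stands in for p->mm->numa_scan_seq */

static void reset_scan(void)
{
	/*
	 * One single-access read and one single-access write; the
	 * read-modify-write as a whole may still race, which is
	 * tolerable for a statistics counter.
	 */
	WRITE_ONCE(numa_scan_seq, READ_ONCE(numa_scan_seq) + 1);
}

int main(void)
{
	reset_scan();
	reset_scan();
	printf("seq = %d\n", READ_ONCE(numa_scan_seq));
	return 0;
}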
@@ -4323,6 +4336,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' array crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on the nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With these power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT		7
+static const unsigned char
+		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+					{0, 0, 0, 0, 0, 0, 0, 0},
+					{64, 32, 8, 0, 0, 0, 0, 0},
+					{96, 72, 40, 12, 1, 0, 0},
+					{112, 98, 75, 43, 15, 1, 0},
+					{120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+	int j = 0;
+
+	if (!missed_updates)
+		return load;
+
+	if (missed_updates >= degrade_zero_ticks[idx])
+		return 0;
+
+	if (idx == 1)
+		return load >> missed_updates;
+
+	while (missed_updates) {
+		if (missed_updates % 2)
+			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+		missed_updates >>= 1;
+		j++;
+	}
+	return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+			      unsigned long pending_updates)
+{
+	int i, scale;
+
+	this_rq->nr_load_updates++;
+
+	/* Update our load: */
+	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+		unsigned long old_load, new_load;
+
+		/* scale is effectively 1 << i now, and >> i divides by scale */
+
+		old_load = this_rq->cpu_load[i];
+		old_load = decay_load_missed(old_load, pending_updates - 1, i);
+		new_load = this_load;
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale - 1;
+
+		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+	}
+
+	sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	unsigned long pending_updates;
+
+	/*
+	 * bail if there's load or we're actually up-to-date.
+	 */
+	if (load || curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+
+	__update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+	struct rq *this_rq = this_rq();
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
+	unsigned long pending_updates;
+
+	if (curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	raw_spin_lock(&this_rq->lock);
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	if (pending_updates) {
+		this_rq->last_load_update_tick = curr_jiffies;
+		/*
+		 * We were idle, this means load 0, the current load might be
+		 * !0 due to remote wakeups and the sort.
+		 */
+		__update_cpu_load(this_rq, 0, pending_updates);
+	}
+	raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	/*
+	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+	 */
+	this_rq->last_load_update_tick = jiffies;
+	__update_cpu_load(this_rq, load, 1);
+}
+
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
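The comment block in the hunk above explains the trick in decay_load_missed(): rather than multiplying by (2^idx - 1)/2^idx once per missed tick, it multiplies by a precomputed 128-point factor for 1, 2, 4, ... ticks, one factor per set bit of the miss count. Below is a stand-alone sketch of that decomposition for idx == 2 (per-tick factor 3/4), compared against the naive per-tick loop. The table row is copied from the patch; the function names and the 1024 starting load are illustrative only. The two results track closely but are not always bit-identical, which is exactly the "approximated on a 128 point scale" caveat in the comment.

/* Userspace illustration only -- not kernel code. */
#include <stdio.h>

#define DEGRADE_SHIFT	7	/* factors live on a 128 point scale */

/* idx == 2 row from degrade_factor: 128*(3/4)^1, ^2, ^4, ^8, ^16, ... */
static const unsigned char degrade_idx2[DEGRADE_SHIFT + 1] =
					{96, 72, 40, 12, 1, 0, 0, 0};

static unsigned long decay_fast(unsigned long load, unsigned long missed)
{
	int j = 0;

	/* One multiply/shift per set bit of 'missed'. */
	while (missed) {
		if (missed & 1)
			load = (load * degrade_idx2[j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

static unsigned long decay_slow(unsigned long load, unsigned long missed)
{
	/* The naive loop: one 3/4 (i.e. 96/128) step per missed tick. */
	while (missed--)
		load = (load * degrade_idx2[0]) >> DEGRADE_SHIFT;
	return load;
}

int main(void)
{
	unsigned long missed;

	for (missed = 0; missed <= 10; missed++)
		printf("missed=%2lu  fast=%4lu  slow=%4lu\n", missed,
		       decay_fast(1024, missed), decay_slow(1024, missed));
	return 0;
}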
@@ -4375,7 +4571,7 @@ static unsigned long capacity_orig_of(int cpu)
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
 	unsigned long load_avg = rq->cfs.runnable_load_avg;
 
 	if (nr_running)
@@ -5467,10 +5663,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
+/*
+ * Returns true if the destination node is the preferred node.
+ * Needs to match fbq_classify_rq(): if there is a runnable task
+ * that is not on its preferred node, we should identify it.
+ */
 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
+	unsigned long src_faults, dst_faults;
 	int src_nid, dst_nid;
 
 	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5484,29 +5685,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == dst_nid)
 		return false;
 
-	if (numa_group) {
-		/* Task is already in the group's interleave set. */
-		if (node_isset(src_nid, numa_group->active_nodes))
-			return false;
-
-		/* Task is moving into the group's interleave set. */
-		if (node_isset(dst_nid, numa_group->active_nodes))
-			return true;
-
-		return group_faults(p, dst_nid) > group_faults(p, src_nid);
-	}
-
 	/* Encourage migration to the preferred node. */
 	if (dst_nid == p->numa_preferred_nid)
 		return true;
 
-	return task_faults(p, dst_nid) > task_faults(p, src_nid);
+	/* Migrating away from the preferred node is bad. */
+	if (src_nid == p->numa_preferred_nid)
+		return false;
+
+	if (numa_group) {
+		src_faults = group_faults(p, src_nid);
+		dst_faults = group_faults(p, dst_nid);
+	} else {
+		src_faults = task_faults(p, src_nid);
+		dst_faults = task_faults(p, dst_nid);
+	}
+
+	return dst_faults > src_faults;
 }
 
 
 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
+	unsigned long src_faults, dst_faults;
 	int src_nid, dst_nid;
 
 	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5521,23 +5723,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == dst_nid)
 		return false;
 
-	if (numa_group) {
-		/* Task is moving within/into the group's interleave set. */
-		if (node_isset(dst_nid, numa_group->active_nodes))
-			return false;
-
-		/* Task is moving out of the group's interleave set. */
-		if (node_isset(src_nid, numa_group->active_nodes))
-			return true;
-
-		return group_faults(p, dst_nid) < group_faults(p, src_nid);
+	/* Migrating away from the preferred node is bad. */
+	if (src_nid == p->numa_preferred_nid)
+		return true;
+
+	/* Encourage migration to the preferred node. */
+	if (dst_nid == p->numa_preferred_nid)
+		return false;
+
+	if (numa_group) {
+		src_faults = group_faults(p, src_nid);
+		dst_faults = group_faults(p, dst_nid);
+	} else {
+		src_faults = task_faults(p, src_nid);
+		dst_faults = task_faults(p, dst_nid);
 	}
 
-	/* Migrating away from the preferred node is always bad. */
-	if (src_nid == p->numa_preferred_nid)
-		return true;
-
-	return task_faults(p, dst_nid) < task_faults(p, src_nid);
+	return dst_faults < src_faults;
 }
 
 #else
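After the two hunks above, migrate_improves_locality() and migrate_degrades_locality() are mirror images: short-circuit on the preferred node, then compare the same fault counts (group-wide when the task has a numa_group, per-task otherwise), with "improves" meaning strictly more faults on the destination and "degrades" meaning strictly fewer. The sketch below condenses that shared structure into one hypothetical helper; the helper name, the struct, and the fixed four-node fault array are inventions for illustration, and the sched_feat()/SD_NUMA guards are left out.

/* Illustrative userspace sketch only -- not part of the patch. */
#include <stdbool.h>
#include <stdio.h>

struct task_ctx {
	int preferred_nid;
	/* per-node fault counts, group or task scope depending on caller */
	unsigned long faults[4];
};

/* > 0: destination looks better, < 0: worse, 0: no preference */
static int locality_delta(const struct task_ctx *t, int src_nid, int dst_nid)
{
	if (src_nid == dst_nid)
		return 0;
	if (dst_nid == t->preferred_nid)
		return 1;
	if (src_nid == t->preferred_nid)
		return -1;
	if (t->faults[dst_nid] > t->faults[src_nid])
		return 1;
	if (t->faults[dst_nid] < t->faults[src_nid])
		return -1;
	return 0;
}

static bool migrate_improves(const struct task_ctx *t, int src, int dst)
{
	return locality_delta(t, src, dst) > 0;
}

static bool migrate_degrades(const struct task_ctx *t, int src, int dst)
{
	return locality_delta(t, src, dst) < 0;
}

int main(void)
{
	struct task_ctx t = { .preferred_nid = 1, .faults = {10, 40, 5, 5} };

	printf("0->1 improves: %d\n", migrate_improves(&t, 0, 1)); /* 1 */
	printf("1->2 degrades: %d\n", migrate_degrades(&t, 1, 2)); /* 1 */
	printf("2->3 improves: %d\n", migrate_improves(&t, 2, 3)); /* 0 */
	return 0;
}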
@@ -6037,8 +6239,8 @@ static unsigned long scale_rt_capacity(int cpu)
 	 * Since we're reading these variables without serialization make sure
 	 * we read them once before doing sanity checks on them.
 	 */
-	age_stamp = ACCESS_ONCE(rq->age_stamp);
-	avg = ACCESS_ONCE(rq->rt_avg);
+	age_stamp = READ_ONCE(rq->age_stamp);
+	avg = READ_ONCE(rq->rt_avg);
 	delta = __rq_clock_broken(rq) - age_stamp;
 
 	if (unlikely(delta < 0))