Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--	kernel/sched/fair.c	276
1 file changed, 239 insertions, 37 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa4105e48..0d4632f7799b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
  *
  * This idea comes from the SD scheduler of Con Kolivas:
  */
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
 {
-	unsigned int cpus = min_t(int, num_online_cpus(), 8);
+	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 	unsigned int factor;
 
 	switch (sysctl_sched_tunable_scaling) {
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 		loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	int factor = get_update_sysctl_factor();
+	unsigned int factor = get_update_sysctl_factor();
 
 	if (ret || !write)
 		return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
 
 static unsigned int task_scan_min(struct task_struct *p)
 {
-	unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
 	unsigned int scan, floor;
 	unsigned int windows = 1;
 
@@ -1794,7 +1794,12 @@ static void task_numa_placement(struct task_struct *p)
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
-	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+	/*
+	 * The p->mm->numa_scan_seq field gets updated without
+	 * exclusive access. Use READ_ONCE() here to ensure
+	 * that the field is read in a single access:
+	 */
+	seq = READ_ONCE(p->mm->numa_scan_seq);
 	if (p->numa_scan_seq == seq)
 		return;
 	p->numa_scan_seq = seq;
@@ -1938,7 +1943,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 	}
 
 	rcu_read_lock();
-	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+	tsk = READ_ONCE(cpu_rq(cpu)->curr);
 
 	if (!cpupid_match_pid(tsk, cpupid))
 		goto no_join;
@@ -2107,7 +2112,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 static void reset_ptenuma_scan(struct task_struct *p)
 {
-	ACCESS_ONCE(p->mm->numa_scan_seq)++;
+	/*
+	 * We only did a read acquisition of the mmap sem, so
+	 * p->mm->numa_scan_seq is written to without exclusive access
+	 * and the update is not guaranteed to be atomic. That's not
+	 * much of an issue though, since this is just used for
+	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+	 * expensive, to avoid any form of compiler optimizations:
+	 */
+	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
 	p->mm->numa_scan_offset = 0;
 }
 
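The two comments added above describe the same pattern: p->mm->numa_scan_seq is updated without exclusive access, so the increment is allowed to race, but each individual load and store goes through READ_ONCE()/WRITE_ONCE() so the compiler cannot tear, refetch or elide it. Below is a minimal userspace sketch of that pattern; the simplified volatile-cast macros only stand in for the kernel's real READ_ONCE/WRITE_ONCE, and the plain int stands in for the mm field.

/* Userspace illustration only -- not kernel code. */
#include <stdio.h>

#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

static int numa_scan_seq;	/* stands in for p->mm->numa_scan_seq */

static void reset_scan(void)
{
	/*
	 * One single-access read and one single-access write; the
	 * read-modify-write as a whole may still race, which is
	 * tolerable for a statistics counter.
	 */
	WRITE_ONCE(numa_scan_seq, READ_ONCE(numa_scan_seq) + 1);
}

int main(void)
{
	reset_scan();
	reset_scan();
	printf("seq = %d\n", READ_ONCE(numa_scan_seq));
	return 0;
}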
@@ -4323,6 +4336,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' array crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on the nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With these power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT		7
+static const unsigned char
+		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+					{0, 0, 0, 0, 0, 0, 0, 0},
+					{64, 32, 8, 0, 0, 0, 0, 0},
+					{96, 72, 40, 12, 1, 0, 0},
+					{112, 98, 75, 43, 15, 1, 0},
+					{120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+	int j = 0;
+
+	if (!missed_updates)
+		return load;
+
+	if (missed_updates >= degrade_zero_ticks[idx])
+		return 0;
+
+	if (idx == 1)
+		return load >> missed_updates;
+
+	while (missed_updates) {
+		if (missed_updates % 2)
+			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+		missed_updates >>= 1;
+		j++;
+	}
+	return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+			      unsigned long pending_updates)
+{
+	int i, scale;
+
+	this_rq->nr_load_updates++;
+
+	/* Update our load: */
+	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+		unsigned long old_load, new_load;
+
+		/* scale is effectively 1 << i now, and >> i divides by scale */
+
+		old_load = this_rq->cpu_load[i];
+		old_load = decay_load_missed(old_load, pending_updates - 1, i);
+		new_load = this_load;
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale - 1;
+
+		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+	}
+
+	sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	unsigned long pending_updates;
+
+	/*
+	 * bail if there's load or we're actually up-to-date.
+	 */
+	if (load || curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+
+	__update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+	struct rq *this_rq = this_rq();
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
+	unsigned long pending_updates;
+
+	if (curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	raw_spin_lock(&this_rq->lock);
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	if (pending_updates) {
+		this_rq->last_load_update_tick = curr_jiffies;
+		/*
+		 * We were idle, this means load 0, the current load might be
+		 * !0 due to remote wakeups and the sort.
+		 */
+		__update_cpu_load(this_rq, 0, pending_updates);
+	}
+	raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	/*
+	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+	 */
+	this_rq->last_load_update_tick = jiffies;
+	__update_cpu_load(this_rq, load, 1);
+}
+
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
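The comment block in the hunk above explains the trick in decay_load_missed(): rather than multiplying by (2^idx - 1)/2^idx once per missed tick, it multiplies by a precomputed 128-point factor for 1, 2, 4, ... ticks, one factor per set bit of the miss count. Below is a stand-alone sketch of that decomposition for idx == 2 (per-tick factor 3/4), compared against the naive per-tick loop. The table row is copied from the patch; the function names and the 1024 starting load are illustrative only. The two results track closely but are not always bit-identical, which is exactly the "approximated on a 128 point scale" caveat in the comment.

/* Userspace illustration only -- not kernel code. */
#include <stdio.h>

#define DEGRADE_SHIFT	7	/* factors live on a 128 point scale */

/* idx == 2 row from degrade_factor: 128*(3/4)^1, ^2, ^4, ^8, ^16, ... */
static const unsigned char degrade_idx2[DEGRADE_SHIFT + 1] =
					{96, 72, 40, 12, 1, 0, 0, 0};

static unsigned long decay_fast(unsigned long load, unsigned long missed)
{
	int j = 0;

	/* One multiply/shift per set bit of 'missed'. */
	while (missed) {
		if (missed & 1)
			load = (load * degrade_idx2[j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

static unsigned long decay_slow(unsigned long load, unsigned long missed)
{
	/* The naive loop: one 3/4 (i.e. 96/128) step per missed tick. */
	while (missed--)
		load = (load * degrade_idx2[0]) >> DEGRADE_SHIFT;
	return load;
}

int main(void)
{
	unsigned long missed;

	for (missed = 0; missed <= 10; missed++)
		printf("missed=%2lu  fast=%4lu  slow=%4lu\n", missed,
		       decay_fast(1024, missed), decay_slow(1024, missed));
	return 0;
}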
@@ -4375,7 +4571,7 @@ static unsigned long capacity_orig_of(int cpu)
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
 	unsigned long load_avg = rq->cfs.runnable_load_avg;
 
 	if (nr_running)
@@ -5467,10 +5663,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
+/*
+ * Returns true if the destination node is the preferred node.
+ * Needs to match fbq_classify_rq(): if there is a runnable task
+ * that is not on its preferred node, we should identify it.
+ */
 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
+	unsigned long src_faults, dst_faults;
 	int src_nid, dst_nid;
 
 	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5484,29 +5685,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == dst_nid)
 		return false;
 
-	if (numa_group) {
-		/* Task is already in the group's interleave set. */
-		if (node_isset(src_nid, numa_group->active_nodes))
-			return false;
-
-		/* Task is moving into the group's interleave set. */
-		if (node_isset(dst_nid, numa_group->active_nodes))
-			return true;
-
-		return group_faults(p, dst_nid) > group_faults(p, src_nid);
-	}
-
 	/* Encourage migration to the preferred node. */
 	if (dst_nid == p->numa_preferred_nid)
 		return true;
 
-	return task_faults(p, dst_nid) > task_faults(p, src_nid);
+	/* Migrating away from the preferred node is bad. */
+	if (src_nid == p->numa_preferred_nid)
+		return false;
+
+	if (numa_group) {
+		src_faults = group_faults(p, src_nid);
+		dst_faults = group_faults(p, dst_nid);
+	} else {
+		src_faults = task_faults(p, src_nid);
+		dst_faults = task_faults(p, dst_nid);
+	}
+
+	return dst_faults > src_faults;
 }
 
 
 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
+	unsigned long src_faults, dst_faults;
 	int src_nid, dst_nid;
 
 	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5521,23 +5723,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == dst_nid)
 		return false;
 
-	if (numa_group) {
-		/* Task is moving within/into the group's interleave set. */
-		if (node_isset(dst_nid, numa_group->active_nodes))
-			return false;
-
-		/* Task is moving out of the group's interleave set. */
-		if (node_isset(src_nid, numa_group->active_nodes))
-			return true;
-
-		return group_faults(p, dst_nid) < group_faults(p, src_nid);
+	/* Migrating away from the preferred node is bad. */
+	if (src_nid == p->numa_preferred_nid)
+		return true;
+
+	/* Encourage migration to the preferred node. */
+	if (dst_nid == p->numa_preferred_nid)
+		return false;
+
+	if (numa_group) {
+		src_faults = group_faults(p, src_nid);
+		dst_faults = group_faults(p, dst_nid);
+	} else {
+		src_faults = task_faults(p, src_nid);
+		dst_faults = task_faults(p, dst_nid);
 	}
 
-	/* Migrating away from the preferred node is always bad. */
-	if (src_nid == p->numa_preferred_nid)
-		return true;
-
-	return task_faults(p, dst_nid) < task_faults(p, src_nid);
+	return dst_faults < src_faults;
 }
 
 #else
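After the two hunks above, migrate_improves_locality() and migrate_degrades_locality() are mirror images: short-circuit on the preferred node, then compare the same fault counts (group-wide when the task has a numa_group, per-task otherwise), with "improves" meaning strictly more faults on the destination and "degrades" meaning strictly fewer. The sketch below condenses that shared structure into one hypothetical helper; the helper name, the struct, and the fixed four-node fault array are inventions for illustration, and the sched_feat()/SD_NUMA guards are left out.

/* Illustrative userspace sketch only -- not part of the patch. */
#include <stdbool.h>
#include <stdio.h>

struct task_ctx {
	int preferred_nid;
	/* per-node fault counts, group or task scope depending on caller */
	unsigned long faults[4];
};

/* > 0: destination looks better, < 0: worse, 0: no preference */
static int locality_delta(const struct task_ctx *t, int src_nid, int dst_nid)
{
	if (src_nid == dst_nid)
		return 0;
	if (dst_nid == t->preferred_nid)
		return 1;
	if (src_nid == t->preferred_nid)
		return -1;
	if (t->faults[dst_nid] > t->faults[src_nid])
		return 1;
	if (t->faults[dst_nid] < t->faults[src_nid])
		return -1;
	return 0;
}

static bool migrate_improves(const struct task_ctx *t, int src, int dst)
{
	return locality_delta(t, src, dst) > 0;
}

static bool migrate_degrades(const struct task_ctx *t, int src, int dst)
{
	return locality_delta(t, src, dst) < 0;
}

int main(void)
{
	struct task_ctx t = { .preferred_nid = 1, .faults = {10, 40, 5, 5} };

	printf("0->1 improves: %d\n", migrate_improves(&t, 0, 1)); /* 1 */
	printf("1->2 degrades: %d\n", migrate_degrades(&t, 1, 2)); /* 1 */
	printf("2->3 improves: %d\n", migrate_improves(&t, 2, 3)); /* 0 */
	return 0;
}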
@@ -6037,8 +6239,8 @@ static unsigned long scale_rt_capacity(int cpu)
 	 * Since we're reading these variables without serialization make sure
 	 * we read them once before doing sanity checks on them.
 	 */
-	age_stamp = ACCESS_ONCE(rq->age_stamp);
-	avg = ACCESS_ONCE(rq->rt_avg);
+	age_stamp = READ_ONCE(rq->age_stamp);
+	avg = READ_ONCE(rq->rt_avg);
 	delta = __rq_clock_broken(rq) - age_stamp;
 
 	if (unlikely(delta < 0))