Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--   kernel/sched/fair.c   276
1 file changed, 239 insertions, 37 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa4105e48..0d4632f7799b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
  *
  * This idea comes from the SD scheduler of Con Kolivas:
  */
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
 {
-        unsigned int cpus = min_t(int, num_online_cpus(), 8);
+        unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
         unsigned int factor;
 
         switch (sysctl_sched_tunable_scaling) {
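The hunk above only changes the signedness of the function and of the min_t() comparison; the factor it returns scales the scheduler latency tunables with the CPU count, capped at 8 CPUs as visible in the hunk. As a rough, illustrative userspace sketch of the default logarithmic scaling mode (an assumption about the function body, which this hunk does not show), the factor behaves like 1 + log2(min(nr_online_cpus, 8)):

#include <stdio.h>

/* Illustrative stand-in for the kernel's ilog2() on small positive values. */
static unsigned int ilog2_u(unsigned int v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

/* Sketch of the default (logarithmic) tunable scaling, capped at 8 CPUs. */
static unsigned int sysctl_factor(unsigned int online_cpus)
{
        unsigned int cpus = online_cpus < 8 ? online_cpus : 8;

        return 1 + ilog2_u(cpus);
}

int main(void)
{
        unsigned int n;

        for (n = 1; n <= 16; n++)
                printf("%2u CPUs -> factor %u\n", n, sysctl_factor(n));
        return 0;
}

With this shape the factor saturates at 4 once eight or more CPUs are online, which keeps the scaled tunables from growing without bound on large machines.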
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
                 loff_t *ppos)
 {
         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-        int factor = get_update_sysctl_factor();
+        unsigned int factor = get_update_sysctl_factor();
 
         if (ret || !write)
                 return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
 
 static unsigned int task_scan_min(struct task_struct *p)
 {
-        unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+        unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
         unsigned int scan, floor;
         unsigned int windows = 1;
 
@@ -1794,7 +1794,12 @@ static void task_numa_placement(struct task_struct *p)
         u64 runtime, period;
         spinlock_t *group_lock = NULL;
 
-        seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+        /*
+         * The p->mm->numa_scan_seq field gets updated without
+         * exclusive access. Use READ_ONCE() here to ensure
+         * that the field is read in a single access:
+         */
+        seq = READ_ONCE(p->mm->numa_scan_seq);
         if (p->numa_scan_seq == seq)
                 return;
         p->numa_scan_seq = seq;
@@ -1938,7 +1943,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
         }
 
         rcu_read_lock();
-        tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+        tsk = READ_ONCE(cpu_rq(cpu)->curr);
 
         if (!cpupid_match_pid(tsk, cpupid))
                 goto no_join;
@@ -2107,7 +2112,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 static void reset_ptenuma_scan(struct task_struct *p)
 {
-        ACCESS_ONCE(p->mm->numa_scan_seq)++;
+        /*
+         * We only did a read acquisition of the mmap sem, so
+         * p->mm->numa_scan_seq is written to without exclusive access
+         * and the update is not guaranteed to be atomic. That's not
+         * much of an issue though, since this is just used for
+         * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+         * expensive, to avoid any form of compiler optimizations:
+         */
+        WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
         p->mm->numa_scan_offset = 0;
 }
 
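The hunks above replace ACCESS_ONCE() with READ_ONCE()/WRITE_ONCE() for numa_scan_seq, which is read and bumped while holding only the read side of mmap_sem. A minimal userspace sketch of the pattern, using volatile casts as a stand-in for the kernel macros (an assumption for illustration; the real macros live in <linux/compiler.h> and do more work):

#include <stdio.h>

/*
 * Userspace stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(): the
 * volatile access forces a single, untorn load/store and stops the
 * compiler from caching, merging or re-reading the value.
 */
#define READ_ONCE(x)            (*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)      (*(volatile __typeof__(x) *)&(x) = (val))

static int numa_scan_seq;       /* shared, updated without exclusive locking */

/*
 * Mirrors the reset_ptenuma_scan() change: read once, add one, write once.
 * The increment itself is still not atomic; that is tolerated here because
 * the counter is only used for statistical sampling.
 */
static void bump_scan_seq(void)
{
        WRITE_ONCE(numa_scan_seq, READ_ONCE(numa_scan_seq) + 1);
}

int main(void)
{
        int i;

        for (i = 0; i < 5; i++)
                bump_scan_seq();
        printf("numa_scan_seq = %d\n", READ_ONCE(numa_scan_seq));
        return 0;
}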
@@ -4323,6 +4336,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' array crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on the nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With these power-of-2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT           7
+static const unsigned char
+                degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+                degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+                                        {0, 0, 0, 0, 0, 0, 0, 0},
+                                        {64, 32, 8, 0, 0, 0, 0, 0},
+                                        {96, 72, 40, 12, 1, 0, 0},
+                                        {112, 98, 75, 43, 15, 1, 0},
+                                        {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when the CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+        int j = 0;
+
+        if (!missed_updates)
+                return load;
+
+        if (missed_updates >= degrade_zero_ticks[idx])
+                return 0;
+
+        if (idx == 1)
+                return load >> missed_updates;
+
+        while (missed_updates) {
+                if (missed_updates % 2)
+                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+                missed_updates >>= 1;
+                j++;
+        }
+        return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+                              unsigned long pending_updates)
+{
+        int i, scale;
+
+        this_rq->nr_load_updates++;
+
+        /* Update our load: */
+        this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+                unsigned long old_load, new_load;
+
+                /* scale is effectively 1 << i now, and >> i divides by scale */
+
+                old_load = this_rq->cpu_load[i];
+                old_load = decay_load_missed(old_load, pending_updates - 1, i);
+                new_load = this_load;
+                /*
+                 * Round up the averaging division if load is increasing. This
+                 * prevents us from getting stuck on 9 if the load is 10, for
+                 * example.
+                 */
+                if (new_load > old_load)
+                        new_load += scale - 1;
+
+                this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+        }
+
+        sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+        unsigned long curr_jiffies = READ_ONCE(jiffies);
+        unsigned long load = this_rq->cfs.runnable_load_avg;
+        unsigned long pending_updates;
+
+        /*
+         * bail if there's load or we're actually up-to-date.
+         */
+        if (load || curr_jiffies == this_rq->last_load_update_tick)
+                return;
+
+        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+        this_rq->last_load_update_tick = curr_jiffies;
+
+        __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+        struct rq *this_rq = this_rq();
+        unsigned long curr_jiffies = READ_ONCE(jiffies);
+        unsigned long pending_updates;
+
+        if (curr_jiffies == this_rq->last_load_update_tick)
+                return;
+
+        raw_spin_lock(&this_rq->lock);
+        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+        if (pending_updates) {
+                this_rq->last_load_update_tick = curr_jiffies;
+                /*
+                 * We were idle, this means load 0, the current load might be
+                 * !0 due to remote wakeups and the like.
+                 */
+                __update_cpu_load(this_rq, 0, pending_updates);
+        }
+        raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+        unsigned long load = this_rq->cfs.runnable_load_avg;
+        /*
+         * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+         */
+        this_rq->last_load_update_tick = jiffies;
+        __update_cpu_load(this_rq, load, 1);
+}
+
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
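The block added above is the per-rq cpu_load[] machinery being moved into fair.c. A small userspace sketch of the same arithmetic follows, reusing the degrade_factor table, the bit-by-bit decay and the round-up trick from __update_cpu_load(); the helper names, tick counts and load values are made up for illustration:

#include <stdio.h>

#define CPU_LOAD_IDX_MAX        5
#define DEGRADE_SHIFT           7

static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
        {0, 8, 32, 64, 128};
static const unsigned char degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
        {0, 0, 0, 0, 0, 0, 0, 0},
        {64, 32, 8, 0, 0, 0, 0, 0},
        {96, 72, 40, 12, 1, 0, 0},
        {112, 98, 75, 43, 15, 1, 0},
        {120, 112, 98, 76, 45, 16, 2} };

/* Same bit-by-bit decay as the kernel's decay_load_missed(). */
static unsigned long decay_load_missed(unsigned long load,
                                       unsigned long missed, int idx)
{
        int j = 0;

        if (!missed)
                return load;
        if (missed >= degrade_zero_ticks[idx])
                return 0;
        if (idx == 1)
                return load >> missed;
        while (missed) {
                if (missed % 2)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                missed >>= 1;
                j++;
        }
        return load;
}

/*
 * One cpu_load[] slot updated like __update_cpu_load(): decay the old value
 * over the missed ticks, then average it with the new sample, rounding up
 * when the load is rising so the result cannot get stuck one below the target.
 */
static unsigned long update_slot(unsigned long old_load, unsigned long new_load,
                                 unsigned long pending, int idx)
{
        unsigned long scale = 1UL << idx;

        old_load = decay_load_missed(old_load, pending - 1, idx);
        if (new_load > old_load)
                new_load += scale - 1;
        return (old_load * (scale - 1) + new_load) >> idx;
}

int main(void)
{
        unsigned long load = 9;
        int tick;

        /* Rising load: without the round-up this would stay at 9 forever. */
        for (tick = 0; tick < 3; tick++) {
                load = update_slot(load, 10, 1, 1);
                printf("idx 1, tick %d: cpu_load = %lu\n", tick, load);
        }

        /* Idle CPU waking after 8 missed ticks at idx 2: 12/128 of 1024. */
        printf("decay_load_missed(1024, 8, 2) = %lu\n",
               decay_load_missed(1024, 8, 2));
        return 0;
}

The bit decomposition is the point of the table: degrading over n missed ticks costs at most one multiply and shift per set bit of n, instead of n multiplies for the exact computation.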
@@ -4375,7 +4571,7 @@ static unsigned long capacity_orig_of(int cpu)
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
         struct rq *rq = cpu_rq(cpu);
-        unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+        unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
         unsigned long load_avg = rq->cfs.runnable_load_avg;
 
         if (nr_running)
@@ -5467,10 +5663,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
+/*
+ * Returns true if the destination node is the preferred node.
+ * Needs to match fbq_classify_rq(): if there is a runnable task
+ * that is not on its preferred node, we should identify it.
+ */
 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 {
         struct numa_group *numa_group = rcu_dereference(p->numa_group);
+        unsigned long src_faults, dst_faults;
         int src_nid, dst_nid;
 
         if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5484,29 +5685,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
         if (src_nid == dst_nid)
                 return false;
 
-        if (numa_group) {
-                /* Task is already in the group's interleave set. */
-                if (node_isset(src_nid, numa_group->active_nodes))
-                        return false;
-
-                /* Task is moving into the group's interleave set. */
-                if (node_isset(dst_nid, numa_group->active_nodes))
-                        return true;
-
-                return group_faults(p, dst_nid) > group_faults(p, src_nid);
-        }
-
         /* Encourage migration to the preferred node. */
         if (dst_nid == p->numa_preferred_nid)
                 return true;
 
-        return task_faults(p, dst_nid) > task_faults(p, src_nid);
+        /* Migrating away from the preferred node is bad. */
+        if (src_nid == p->numa_preferred_nid)
+                return false;
+
+        if (numa_group) {
+                src_faults = group_faults(p, src_nid);
+                dst_faults = group_faults(p, dst_nid);
+        } else {
+                src_faults = task_faults(p, src_nid);
+                dst_faults = task_faults(p, dst_nid);
+        }
+
+        return dst_faults > src_faults;
 }
 
 
 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
         struct numa_group *numa_group = rcu_dereference(p->numa_group);
+        unsigned long src_faults, dst_faults;
         int src_nid, dst_nid;
 
         if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5521,23 +5723,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
         if (src_nid == dst_nid)
                 return false;
 
-        if (numa_group) {
-                /* Task is moving within/into the group's interleave set. */
-                if (node_isset(dst_nid, numa_group->active_nodes))
-                        return false;
+        /* Migrating away from the preferred node is bad. */
+        if (src_nid == p->numa_preferred_nid)
+                return true;
 
-                /* Task is moving out of the group's interleave set. */
-                if (node_isset(src_nid, numa_group->active_nodes))
-                        return true;
+        /* Encourage migration to the preferred node. */
+        if (dst_nid == p->numa_preferred_nid)
+                return false;
 
-                return group_faults(p, dst_nid) < group_faults(p, src_nid);
+        if (numa_group) {
+                src_faults = group_faults(p, src_nid);
+                dst_faults = group_faults(p, dst_nid);
+        } else {
+                src_faults = task_faults(p, src_nid);
+                dst_faults = task_faults(p, dst_nid);
         }
 
-        /* Migrating away from the preferred node is always bad. */
-        if (src_nid == p->numa_preferred_nid)
-                return true;
-
-        return task_faults(p, dst_nid) < task_faults(p, src_nid);
+        return dst_faults < src_faults;
 }
 
 #else
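The two rewritten predicates above now share one shape: never move away from the preferred node, always move toward it, and otherwise compare per-node fault counts (group faults when the task belongs to a NUMA group, task faults otherwise). A compact userspace sketch of that decision, with hypothetical fault numbers standing in for task_faults()/group_faults() and the sched_feat()/SD_NUMA early exits omitted:

#include <stdbool.h>
#include <stdio.h>

/*
 * Everything the two predicates actually compare, gathered in one spot.
 * In the kernel these values come from task_struct and the numa_group.
 */
struct numa_snapshot {
        int src_nid;
        int dst_nid;
        int preferred_nid;
        unsigned long src_faults;       /* group_faults() or task_faults() */
        unsigned long dst_faults;
};

/* Mirrors the rewritten migrate_improves_locality(). */
static bool improves_locality(const struct numa_snapshot *s)
{
        if (s->src_nid == s->dst_nid)
                return false;
        if (s->dst_nid == s->preferred_nid)     /* toward preferred: good */
                return true;
        if (s->src_nid == s->preferred_nid)     /* away from preferred: bad */
                return false;
        return s->dst_faults > s->src_faults;
}

/*
 * Mirrors the rewritten migrate_degrades_locality(): the exact mirror of
 * the checks above, so the two predicates can no longer disagree.
 */
static bool degrades_locality(const struct numa_snapshot *s)
{
        if (s->src_nid == s->dst_nid)
                return false;
        if (s->src_nid == s->preferred_nid)
                return true;
        if (s->dst_nid == s->preferred_nid)
                return false;
        return s->dst_faults < s->src_faults;
}

int main(void)
{
        /* Hypothetical task: prefers node 1, currently running on node 0. */
        struct numa_snapshot s = {
                .src_nid = 0, .dst_nid = 1, .preferred_nid = 1,
                .src_faults = 120, .dst_faults = 340,
        };

        printf("improves=%d degrades=%d\n",
               improves_locality(&s), degrades_locality(&s));
        return 0;
}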
@@ -6037,8 +6239,8 @@ static unsigned long scale_rt_capacity(int cpu)
          * Since we're reading these variables without serialization make sure
          * we read them once before doing sanity checks on them.
          */
-        age_stamp = ACCESS_ONCE(rq->age_stamp);
-        avg = ACCESS_ONCE(rq->rt_avg);
+        age_stamp = READ_ONCE(rq->age_stamp);
+        avg = READ_ONCE(rq->rt_avg);
         delta = __rq_clock_broken(rq) - age_stamp;
 
         if (unlikely(delta < 0))