author     Linus Torvalds <torvalds@linux-foundation.org>   2018-06-04 20:45:38 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2018-06-04 20:45:38 -0400
commit     f7f4e7fc6c517708738d1d1984b170e9475a130f (patch)
tree       9744eba2f74f1f19818d8a4ab8b8d65f865ddec8 /kernel/sched
parent     d9b446e294f21a9616d36a786087466da64afe0a (diff)
parent     2539fc82aa9b07d968cf9ba1ffeec3e0416ac721 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- power-aware scheduling improvements (Patrick Bellasi)
- NUMA balancing improvements (Mel Gorman)
- vCPU scheduling fixes (Rohit Jain)
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Update util_est before updating schedutil
sched/cpufreq: Modify aggregate utilization to always include blocked FAIR utilization
sched/deadline/Documentation: Add overrun signal and GRUB-PA documentation
sched/core: Distinguish between idle_cpu() calls based on desired effect, introduce available_idle_cpu()
sched/wait: Include <linux/wait.h> in <linux/swait.h>
sched/numa: Stagger NUMA balancing scan periods for new threads
sched/core: Don't schedule threads on pre-empted vCPUs
sched/fair: Avoid calling sync_entity_load_avg() unnecessarily
sched/fair: Rearrange select_task_rq_fair() to optimize it
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c               |  39
-rw-r--r--  kernel/sched/cpufreq_schedutil.c  |  17
-rw-r--r--  kernel/sched/fair.c               | 117
-rw-r--r--  kernel/sched/sched.h              |   6
4 files changed, 111 insertions(+), 68 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e27034bd954e..e9866f86f304 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2194,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
 
-#ifdef CONFIG_NUMA_BALANCING
-        if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
-                p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-                p->mm->numa_scan_seq = 0;
-        }
-
-        if (clone_flags & CLONE_VM)
-                p->numa_preferred_nid = current->numa_preferred_nid;
-        else
-                p->numa_preferred_nid = -1;
-
-        p->node_stamp = 0ULL;
-        p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-        p->numa_scan_period = sysctl_numa_balancing_scan_delay;
-        p->numa_work.next = &p->numa_work;
-        p->numa_faults = NULL;
-        p->last_task_numa_placement = 0;
-        p->last_sum_exec_runtime = 0;
-
-        p->numa_group = NULL;
-#endif /* CONFIG_NUMA_BALANCING */
+        init_numa_balancing(clone_flags, p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4050,6 +4030,23 @@ int idle_cpu(int cpu)
 }
 
 /**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+        if (!idle_cpu(cpu))
+                return 0;
+
+        if (vcpu_is_preempted(cpu))
+                return 0;
+
+        return 1;
+}
+
+/**
  * idle_task - return the idle task for a given CPU.
  * @cpu: the processor in question.
  *
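The new available_idle_cpu() helper treats a CPU as idle only when its vCPU has not been preempted by the host, so wakeup placement skips CPUs that merely look idle to the guest. Below is a stand-alone, user-space C sketch of that check; stub_idle_cpu() and stub_vcpu_is_preempted() are invented stubs standing in for the kernel's idle_cpu() and vcpu_is_preempted(), not real APIs.

/* Toy model of the available_idle_cpu() check above; the two
 * predicates are hypothetical stubs, not kernel functions. */
#include <stdbool.h>
#include <stdio.h>

static bool stub_idle_cpu(int cpu)          { return cpu != 1; } /* pretend CPU 1 is busy */
static bool stub_vcpu_is_preempted(int cpu) { return cpu == 2; } /* pretend CPU 2's vCPU lost its host CPU */

/* Idle *and* actually running on the host -> usable for enqueuing work. */
static int toy_available_idle_cpu(int cpu)
{
        if (!stub_idle_cpu(cpu))
                return 0;
        if (stub_vcpu_is_preempted(cpu))
                return 0;
        return 1;
}

int main(void)
{
        for (int cpu = 0; cpu < 4; cpu++)
                printf("cpu%d: idle=%d preempted=%d -> available_idle=%d\n",
                       cpu, stub_idle_cpu(cpu), stub_vcpu_is_preempted(cpu),
                       toy_available_idle_cpu(cpu));
        return 0;
}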
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e13df951aca7..28592b62b1d5 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -183,22 +183,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
 static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
         struct rq *rq = cpu_rq(sg_cpu->cpu);
-        unsigned long util;
 
-        if (rq->rt.rt_nr_running) {
-                util = sg_cpu->max;
-        } else {
-                util = sg_cpu->util_dl;
-                if (rq->cfs.h_nr_running)
-                        util += sg_cpu->util_cfs;
-        }
+        if (rq->rt.rt_nr_running)
+                return sg_cpu->max;
 
         /*
+         * Utilization required by DEADLINE must always be granted while, for
+         * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
+         * gracefully reduce the frequency when no tasks show up for longer
+         * periods of time.
+         *
          * Ideally we would like to set util_dl as min/guaranteed freq and
          * util_cfs + util_dl as requested freq. However, cpufreq is not yet
          * ready for such an interface. So, we only do the latter for now.
          */
-        return min(util, sg_cpu->max);
+        return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
 }
 
 static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
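With this change the aggregated utilization always includes blocked FAIR utilization: any runnable RT task pins the request to the CPU's maximum, otherwise the DEADLINE and CFS contributions are summed and clamped to capacity. The stand-alone sketch below only models that arithmetic; the struct and the numbers are invented, not the scheduler's internal signals.

/* Toy model of the new sugov_aggregate_util() arithmetic.
 * All fields and values are invented for illustration. */
#include <stdio.h>

struct toy_sg_cpu {
        unsigned long max;       /* CPU capacity proxy */
        unsigned long util_dl;   /* DEADLINE utilization */
        unsigned long util_cfs;  /* CFS utilization, including blocked */
        int rt_nr_running;       /* runnable RT tasks on this CPU */
};

static unsigned long toy_aggregate_util(const struct toy_sg_cpu *sg)
{
        if (sg->rt_nr_running)
                return sg->max;                       /* RT present: request max */

        unsigned long sum = sg->util_dl + sg->util_cfs;
        return sum < sg->max ? sum : sg->max;         /* clamp to capacity */
}

int main(void)
{
        struct toy_sg_cpu no_rt   = { .max = 1024, .util_dl = 100, .util_cfs = 300 };
        struct toy_sg_cpu with_rt = { .max = 1024, .util_dl = 100, .util_cfs = 300, .rt_nr_running = 1 };

        printf("no RT  : %lu\n", toy_aggregate_util(&no_rt));    /* 400 */
        printf("with RT: %lu\n", toy_aggregate_util(&with_rt));  /* 1024 */
        return 0;
}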
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79f574dba096..e497c05aab7f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
         return max(smin, smax);
 }
 
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+        int mm_users = 0;
+        struct mm_struct *mm = p->mm;
+
+        if (mm) {
+                mm_users = atomic_read(&mm->mm_users);
+                if (mm_users == 1) {
+                        mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+                        mm->numa_scan_seq = 0;
+                }
+        }
+        p->node_stamp = 0;
+        p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+        p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+        p->numa_work.next = &p->numa_work;
+        p->numa_faults = NULL;
+        p->numa_group = NULL;
+        p->last_task_numa_placement = 0;
+        p->last_sum_exec_runtime = 0;
+
+        /* New address space, reset the preferred nid */
+        if (!(clone_flags & CLONE_VM)) {
+                p->numa_preferred_nid = -1;
+                return;
+        }
+
+        /*
+         * New thread, keep existing numa_preferred_nid which should be copied
+         * already by arch_dup_task_struct but stagger when scans start.
+         */
+        if (mm) {
+                unsigned int delay;
+
+                delay = min_t(unsigned int, task_scan_max(current),
+                        current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+                delay += 2 * TICK_NSEC;
+                p->node_stamp = delay;
+        }
+}
+
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
         rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -5345,6 +5386,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         struct sched_entity *se = &p->se;
 
         /*
+         * The code below (indirectly) updates schedutil which looks at
+         * the cfs_rq utilization to select a frequency.
+         * Let's add the task's estimated utilization to the cfs_rq's
+         * estimated utilization, before we update schedutil.
+         */
+        util_est_enqueue(&rq->cfs, p);
+
+        /*
          * If in_iowait is set, the code below may not trigger any cpufreq
          * utilization updates, so do it here explicitly with the IOWAIT flag
          * passed.
@@ -5385,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 add_nr_running(rq, 1);
 
-        util_est_enqueue(&rq->cfs, p);
         hrtick_update(rq);
 }
 
@@ -5858,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
          * a cpufreq perspective, it's better to have higher utilisation
          * on one CPU.
          */
-        if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
-                return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+        if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+                return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
         if (sync && cpu_rq(this_cpu)->nr_running == 1)
                 return this_cpu;
@@ -6102,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 
         /* Traverse only the allowed CPUs */
         for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
-                if (idle_cpu(i)) {
+                if (available_idle_cpu(i)) {
                         struct rq *rq = cpu_rq(i);
                         struct cpuidle_state *idle = idle_get_state(rq);
                         if (idle && idle->exit_latency < min_exit_latency) {
@@ -6144,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
         if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
                 return prev_cpu;
 
+        /*
+         * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
+         * last_update_time.
+         */
+        if (!(sd_flag & SD_BALANCE_FORK))
+                sync_entity_load_avg(&p->se);
+
         while (sd) {
                 struct sched_group *group;
                 struct sched_domain *tmp;
@@ -6224,7 +6279,7 @@ void __update_idle_core(struct rq *rq)
                 if (cpu == core)
                         continue;
 
-                if (!idle_cpu(cpu))
+                if (!available_idle_cpu(cpu))
                         goto unlock;
         }
 
@@ -6256,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
         for_each_cpu(cpu, cpu_smt_mask(core)) {
                 cpumask_clear_cpu(cpu, cpus);
-                if (!idle_cpu(cpu))
+                if (!available_idle_cpu(cpu))
                         idle = false;
         }
 
@@ -6285,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
         for_each_cpu(cpu, cpu_smt_mask(target)) {
                 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
                         continue;
-                if (idle_cpu(cpu))
+                if (available_idle_cpu(cpu))
                         return cpu;
         }
 
@@ -6348,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
                         return -1;
                 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
                         continue;
-                if (idle_cpu(cpu))
+                if (available_idle_cpu(cpu))
                         break;
         }
 
@@ -6368,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         struct sched_domain *sd;
         int i, recent_used_cpu;
 
-        if (idle_cpu(target))
+        if (available_idle_cpu(target))
                 return target;
 
         /*
         * If the previous CPU is cache affine and idle, don't be stupid:
         */
-        if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+        if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
                 return prev;
 
         /* Check a recently used CPU as a potential idle candidate: */
@@ -6382,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         if (recent_used_cpu != prev &&
             recent_used_cpu != target &&
             cpus_share_cache(recent_used_cpu, target) &&
-            idle_cpu(recent_used_cpu) &&
+            available_idle_cpu(recent_used_cpu) &&
             cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
                 /*
                  * Replace recent_used_cpu with prev as it is a potential
@@ -6558,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
-        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+        struct sched_domain *tmp, *sd = NULL;
         int cpu = smp_processor_id();
         int new_cpu = prev_cpu;
         int want_affine = 0;
@@ -6581,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                  */
                 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
                     cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
-                        affine_sd = tmp;
+                        if (cpu != prev_cpu)
+                                new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+                        sd = NULL; /* Prefer wake_affine over balance flags */
                         break;
                 }
 
@@ -6591,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                         break;
         }
 
-        if (affine_sd) {
-                sd = NULL; /* Prefer wake_affine over balance flags */
-                if (cpu == prev_cpu)
-                        goto pick_cpu;
-
-                new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
-        }
-
-        if (sd && !(sd_flag & SD_BALANCE_FORK)) {
-                /*
-                 * We're going to need the task's util for capacity_spare_wake
-                 * in find_idlest_group. Sync it up to prev_cpu's
-                 * last_update_time.
-                 */
-                sync_entity_load_avg(&p->se);
-        }
+        if (unlikely(sd)) {
+                /* Slow path */
+                new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+        } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+                /* Fast path */
 
-        if (!sd) {
-pick_cpu:
-                if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
-                        new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+                new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-                        if (want_affine)
-                                current->recent_used_cpu = cpu;
-                }
-        } else {
-                new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+                if (want_affine)
+                        current->recent_used_cpu = cpu;
         }
         rcu_read_unlock();
 
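The init_numa_balancing() hunk above staggers when new threads of an existing address space begin NUMA scanning: the first scan is deferred by roughly numa_scan_period times the number of mm users, capped at task_scan_max(), plus two ticks. The stand-alone sketch below models only that delay computation; the constants are invented stand-ins for the kernel's tunables.

/* Toy model of the scan-delay staggering in init_numa_balancing().
 * Constants are invented stand-ins, not the kernel's real values. */
#include <stdio.h>

#define NSEC_PER_MSEC      1000000ULL
#define TOY_TICK_NSEC      4000000ULL                  /* ~4 ms tick, for illustration */
#define TOY_SCAN_PERIOD_MS 1000ULL                     /* stands in for current->numa_scan_period */
#define TOY_SCAN_MAX_NS    (60000ULL * NSEC_PER_MSEC)  /* stands in for task_scan_max() */

static unsigned long long stagger_delay_ns(unsigned int mm_users)
{
        unsigned long long delay = TOY_SCAN_PERIOD_MS * mm_users * NSEC_PER_MSEC;

        if (delay > TOY_SCAN_MAX_NS)   /* the min_t() clamp in the kernel hunk */
                delay = TOY_SCAN_MAX_NS;
        return delay + 2 * TOY_TICK_NSEC;
}

int main(void)
{
        for (unsigned int users = 1; users <= 4; users++)
                printf("threads=%u -> first scan deferred by %llu ns\n",
                       users, stagger_delay_ns(users));
        return 0;
}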
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cb467c221b15..6601baf2361c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1069,6 +1069,12 @@ enum numa_faults_stats {
 extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
+extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+#else
+static inline void
+init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_SMP