author     Linus Torvalds <torvalds@linux-foundation.org>  2018-06-04 20:45:38 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-06-04 20:45:38 -0400
commit     f7f4e7fc6c517708738d1d1984b170e9475a130f (patch)
tree       9744eba2f74f1f19818d8a4ab8b8d65f865ddec8 /kernel/sched
parent     d9b446e294f21a9616d36a786087466da64afe0a (diff)
parent     2539fc82aa9b07d968cf9ba1ffeec3e0416ac721 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - power-aware scheduling improvements (Patrick Bellasi)
 - NUMA balancing improvements (Mel Gorman)
 - vCPU scheduling fixes (Rohit Jain)

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Update util_est before updating schedutil
  sched/cpufreq: Modify aggregate utilization to always include blocked FAIR utilization
  sched/deadline/Documentation: Add overrun signal and GRUB-PA documentation
  sched/core: Distinguish between idle_cpu() calls based on desired effect, introduce available_idle_cpu()
  sched/wait: Include <linux/wait.h> in <linux/swait.h>
  sched/numa: Stagger NUMA balancing scan periods for new threads
  sched/core: Don't schedule threads on pre-empted vCPUs
  sched/fair: Avoid calling sync_entity_load_avg() unnecessarily
  sched/fair: Rearrange select_task_rq_fair() to optimize it
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c                39
-rw-r--r--  kernel/sched/cpufreq_schedutil.c   17
-rw-r--r--  kernel/sched/fair.c               117
-rw-r--r--  kernel/sched/sched.h                6
4 files changed, 111 insertions, 68 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e27034bd954e..e9866f86f304 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2194,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
 
-#ifdef CONFIG_NUMA_BALANCING
-	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
-		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-		p->mm->numa_scan_seq = 0;
-	}
-
-	if (clone_flags & CLONE_VM)
-		p->numa_preferred_nid = current->numa_preferred_nid;
-	else
-		p->numa_preferred_nid = -1;
-
-	p->node_stamp = 0ULL;
-	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
-	p->numa_work.next = &p->numa_work;
-	p->numa_faults = NULL;
-	p->last_task_numa_placement = 0;
-	p->last_sum_exec_runtime = 0;
-
-	p->numa_group = NULL;
-#endif /* CONFIG_NUMA_BALANCING */
+	init_numa_balancing(clone_flags, p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4050,6 +4030,23 @@ int idle_cpu(int cpu)
 }
 
 /**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+	if (!idle_cpu(cpu))
+		return 0;
+
+	if (vcpu_is_preempted(cpu))
+		return 0;
+
+	return 1;
+}
+
+/**
  * idle_task - return the idle task for a given CPU.
  * @cpu: the processor in question.
  *
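The new available_idle_cpu() helper above tightens what "idle" means for task placement: a CPU only counts if it is idle and its vCPU has not been preempted by the host. A minimal user-space sketch of that predicate follows; stub_idle_cpu() and stub_vcpu_is_preempted() are made-up stand-ins, not the kernel implementations.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in stubs, not the kernel's idle_cpu()/vcpu_is_preempted(). */
static bool stub_idle_cpu(int cpu)          { return cpu != 0; } /* pretend CPU0 is busy */
static bool stub_vcpu_is_preempted(int cpu) { return cpu == 2; } /* pretend vCPU2 lost its host CPU */

/* Same shape as the kernel helper: usable only if idle AND not preempted. */
static int stub_available_idle_cpu(int cpu)
{
	if (!stub_idle_cpu(cpu))
		return 0;

	if (stub_vcpu_is_preempted(cpu))
		return 0;

	return 1;
}

int main(void)
{
	for (int cpu = 0; cpu < 4; cpu++)
		printf("cpu%d: idle=%d available=%d\n", cpu,
		       stub_idle_cpu(cpu), stub_available_idle_cpu(cpu));
	return 0;
}

With these stubs, CPU2 reports idle=1 but available=0, which is exactly the case the fair.c call sites below now want to skip.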
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e13df951aca7..28592b62b1d5 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -183,22 +183,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
 static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
-	unsigned long util;
 
-	if (rq->rt.rt_nr_running) {
-		util = sg_cpu->max;
-	} else {
-		util = sg_cpu->util_dl;
-		if (rq->cfs.h_nr_running)
-			util += sg_cpu->util_cfs;
-	}
+	if (rq->rt.rt_nr_running)
+		return sg_cpu->max;
 
 	/*
+	 * Utilization required by DEADLINE must always be granted while, for
+	 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
+	 * gracefully reduce the frequency when no tasks show up for longer
+	 * periods of time.
+	 *
 	 * Ideally we would like to set util_dl as min/guaranteed freq and
 	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet
 	 * ready for such an interface. So, we only do the latter for now.
 	 */
-	return min(util, sg_cpu->max);
+	return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
 }
 
 static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
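The schedutil change above always feeds blocked FAIR utilization into the aggregate, so an idle CPU's frequency request decays gracefully instead of collapsing to util_dl alone. A self-contained sketch of the resulting rule, using a made-up toy_sg_cpu struct and example numbers rather than the kernel's sugov_cpu:

#include <stdio.h>

/* Illustrative stand-in for the fields sugov_aggregate_util() reads. */
struct toy_sg_cpu {
	unsigned long max;            /* CPU capacity */
	unsigned long util_dl;        /* DEADLINE utilization */
	unsigned long util_cfs;       /* FAIR utilization, including blocked */
	unsigned int  rt_nr_running;  /* runnable RT tasks on this CPU */
};

static unsigned long toy_aggregate_util(const struct toy_sg_cpu *sg)
{
	unsigned long util;

	/* RT runnable: request maximum capacity, as in the hunk above. */
	if (sg->rt_nr_running)
		return sg->max;

	/* Otherwise DL + CFS (blocked included), clamped to capacity. */
	util = sg->util_dl + sg->util_cfs;
	return util < sg->max ? util : sg->max;
}

int main(void)
{
	struct toy_sg_cpu sg = { .max = 1024, .util_dl = 100, .util_cfs = 300 };

	printf("aggregate = %lu\n", toy_aggregate_util(&sg)); /* 400 */
	sg.util_cfs = 1000;
	printf("aggregate = %lu\n", toy_aggregate_util(&sg)); /* clamped to 1024 */
	return 0;
}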
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79f574dba096..e497c05aab7f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
 	return max(smin, smax);
 }
 
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+	int mm_users = 0;
+	struct mm_struct *mm = p->mm;
+
+	if (mm) {
+		mm_users = atomic_read(&mm->mm_users);
+		if (mm_users == 1) {
+			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+			mm->numa_scan_seq = 0;
+		}
+	}
+	p->node_stamp = 0;
+	p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+	p->numa_work.next = &p->numa_work;
+	p->numa_faults = NULL;
+	p->numa_group = NULL;
+	p->last_task_numa_placement = 0;
+	p->last_sum_exec_runtime = 0;
+
+	/* New address space, reset the preferred nid */
+	if (!(clone_flags & CLONE_VM)) {
+		p->numa_preferred_nid = -1;
+		return;
+	}
+
+	/*
+	 * New thread, keep existing numa_preferred_nid which should be copied
+	 * already by arch_dup_task_struct but stagger when scans start.
+	 */
+	if (mm) {
+		unsigned int delay;
+
+		delay = min_t(unsigned int, task_scan_max(current),
+			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+		delay += 2 * TICK_NSEC;
+		p->node_stamp = delay;
+	}
+}
+
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -5345,6 +5386,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_entity *se = &p->se;
 
 	/*
+	 * The code below (indirectly) updates schedutil which looks at
+	 * the cfs_rq utilization to select a frequency.
+	 * Let's add the task's estimated utilization to the cfs_rq's
+	 * estimated utilization, before we update schedutil.
+	 */
+	util_est_enqueue(&rq->cfs, p);
+
+	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
 	 * passed.
@@ -5385,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		add_nr_running(rq, 1);
 
-	util_est_enqueue(&rq->cfs, p);
 	hrtick_update(rq);
 }
 
@@ -5858,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 	 * a cpufreq perspective, it's better to have higher utilisation
 	 * on one CPU.
 	 */
-	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
-		return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
 		return this_cpu;
@@ -6102,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 
 	/* Traverse only the allowed CPUs */
 	for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
-		if (idle_cpu(i)) {
+		if (available_idle_cpu(i)) {
 			struct rq *rq = cpu_rq(i);
 			struct cpuidle_state *idle = idle_get_state(rq);
 			if (idle && idle->exit_latency < min_exit_latency) {
@@ -6144,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
 		return prev_cpu;
 
+	/*
+	 * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
+	 * last_update_time.
+	 */
+	if (!(sd_flag & SD_BALANCE_FORK))
+		sync_entity_load_avg(&p->se);
+
 	while (sd) {
 		struct sched_group *group;
 		struct sched_domain *tmp;
@@ -6224,7 +6279,7 @@ void __update_idle_core(struct rq *rq)
 		if (cpu == core)
 			continue;
 
-		if (!idle_cpu(cpu))
+		if (!available_idle_cpu(cpu))
 			goto unlock;
 	}
 
@@ -6256,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
 		for_each_cpu(cpu, cpu_smt_mask(core)) {
 			cpumask_clear_cpu(cpu, cpus);
-			if (!idle_cpu(cpu))
+			if (!available_idle_cpu(cpu))
 				idle = false;
 		}
 
@@ -6285,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
+		if (available_idle_cpu(cpu))
 			return cpu;
 	}
 
@@ -6348,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 			return -1;
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
+		if (available_idle_cpu(cpu))
 			break;
 	}
 
@@ -6368,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	struct sched_domain *sd;
 	int i, recent_used_cpu;
 
-	if (idle_cpu(target))
+	if (available_idle_cpu(target))
 		return target;
 
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
-	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+	if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
 		return prev;
 
 	/* Check a recently used CPU as a potential idle candidate: */
@@ -6382,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
-	    idle_cpu(recent_used_cpu) &&
+	    available_idle_cpu(recent_used_cpu) &&
 	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
 		/*
 		 * Replace recent_used_cpu with prev as it is a potential
@@ -6558,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
-	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
@@ -6581,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		 */
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
-			affine_sd = tmp;
+			if (cpu != prev_cpu)
+				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+			sd = NULL; /* Prefer wake_affine over balance flags */
 			break;
 		}
 
@@ -6591,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			break;
 	}
 
-	if (affine_sd) {
-		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu == prev_cpu)
-			goto pick_cpu;
-
-		new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
-	}
-
-	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
-		/*
-		 * We're going to need the task's util for capacity_spare_wake
-		 * in find_idlest_group. Sync it up to prev_cpu's
-		 * last_update_time.
-		 */
-		sync_entity_load_avg(&p->se);
-	}
-
-	if (!sd) {
-pick_cpu:
-		if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
-			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-
-			if (want_affine)
-				current->recent_used_cpu = cpu;
-		}
-	} else {
-		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
-	}
+	if (unlikely(sd)) {
+		/* Slow path */
+		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+	} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+		/* Fast path */
+
+		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+
+		if (want_affine)
+			current->recent_used_cpu = cpu;
+	}
 	rcu_read_unlock();
 
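One piece of the fair.c changes worth a closer look is the scan staggering added in init_numa_balancing(): a new thread's first NUMA scan is deferred by roughly numa_scan_period times the number of threads already sharing the mm, capped at the maximum scan period, plus two ticks. A rough user-space illustration of that arithmetic follows; the tunable values and the point at which the cap is applied are simplifications, not the kernel's exact min_t() expression.

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL
#define TOY_TICK_NSEC 1000000ULL /* assume 1 ms ticks (HZ=1000) for the example */

/*
 * Stagger a new thread's first scan: grow the delay with the number of
 * threads sharing the mm, cap it at the maximum scan period, then add
 * two ticks, mirroring the intent of init_numa_balancing() above.
 */
static unsigned long long toy_scan_delay_ns(unsigned int scan_period_ms,
					    unsigned int scan_max_ms,
					    unsigned int mm_users)
{
	unsigned long long delay_ms = (unsigned long long)scan_period_ms * mm_users;

	if (delay_ms > scan_max_ms)
		delay_ms = scan_max_ms;

	return delay_ms * NSEC_PER_MSEC + 2 * TOY_TICK_NSEC;
}

int main(void)
{
	/* e.g. 1000 ms initial scan delay, 60000 ms max, doubling thread counts */
	for (unsigned int users = 1; users <= 128; users *= 2)
		printf("%3u threads -> node_stamp %llu ns\n",
		       users, toy_scan_delay_ns(1000, 60000, users));
	return 0;
}

The effect is that threads of a heavily threaded process no longer all start scanning at the same time, spreading the page-table scanning cost out over time.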
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cb467c221b15..6601baf2361c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1069,6 +1069,12 @@ enum numa_faults_stats {
 extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
+extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+#else
+static inline void
+init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_SMP
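The sched.h hunk uses a common kernel header pattern: declare init_numa_balancing() when CONFIG_NUMA_BALANCING is set, and provide an empty static inline stub otherwise, so __sched_fork() can call it unconditionally. A generic sketch of the same pattern, with a hypothetical CONFIG_FOO switch and foo_init() helper that are illustrations only:

#include <stdio.h>

/* Hypothetical feature switch; normally set by the build system (e.g. -DCONFIG_FOO). */
#ifdef CONFIG_FOO
void foo_init(int arg); /* real implementation lives in the feature's .c file */
#else
static inline void foo_init(int arg) { (void)arg; } /* stub compiles away */
#endif

int main(void)
{
	foo_init(42); /* no #ifdef needed at the call site */
	printf("foo_init() called unconditionally\n");
	return 0;
}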