author     Linus Torvalds <torvalds@linux-foundation.org>  2018-11-18 13:58:20 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-11-18 13:58:20 -0500
commit     03582f338e39ed8f8e8451ef1ef04f060d785a87 (patch)
tree       0594f22ee0f09197a060aecc9f2d76a34c02d921
parent     b53e27f618b58d50db72375eb8e1b6ddcef7cdb5 (diff)
parent     c469933e772132aad040bd6a2adc8edf9ad6f825 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fix from Ingo Molnar:
"Fix an exec() related scalability/performance regression, which was
caused by incorrectly calculating load and migrating tasks on exec()
when they shouldn't be"
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Fix cpu_util_wake() for 'execl' type workloads
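In the regression being fixed, an exec()ing task calls select_task_rq_fair() via sched_exec() while it is still the current task on its CPU. The old cpu_util_wake() discounted the task's util_avg from that CPU but kept its util_est contribution, so the CPU the task was already running on looked busy and the task was migrated for no benefit. The sketch below models that arithmetic in plain userspace C; the struct, helper names, and numbers are illustrative stand-ins for the kernel's PELT signals, not kernel APIs:

```c
#include <stdio.h>

/* Illustrative stand-ins for the per-CPU PELT signals the patch touches. */
struct cpu_signals {
	unsigned int util_avg;          /* models cfs_rq->avg.util_avg          */
	unsigned int util_est_enqueued; /* models cfs_rq->avg.util_est.enqueued */
};

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }
static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }

/* Old cpu_util_wake() behaviour: discount only p's util_avg; p's
 * util_est contribution still wins the max() below. */
static unsigned int util_wake_old(struct cpu_signals c, unsigned int task_util)
{
	unsigned int util = c.util_avg - min_u(c.util_avg, task_util);

	return max_u(util, c.util_est_enqueued);
}

/* New cpu_util_without() behaviour: additionally discount p's estimated
 * utilization while p is still enqueued or is the current task. */
static unsigned int util_without_new(struct cpu_signals c,
				     unsigned int task_util,
				     unsigned int task_util_est,
				     int p_queued_or_current)
{
	unsigned int util = c.util_avg - min_u(c.util_avg, task_util);
	unsigned int estimated = c.util_est_enqueued;

	if (p_queued_or_current)
		estimated -= min_u(estimated, task_util_est);

	return max_u(util, estimated);
}

int main(void)
{
	/* An execl'ing task (util ~400) is the only task on its CPU. */
	struct cpu_signals c = { .util_avg = 400, .util_est_enqueued = 400 };

	printf("old: %u\n", util_wake_old(c, 400));            /* 400: CPU looks busy   */
	printf("new: %u\n", util_without_new(c, 400, 400, 1)); /* 0: CPU is really free */
	return 0;
}
```

With the old arithmetic the task's own estimate inflates its CPU's apparent utilization to 400, so find_idlest_group() prefers some other CPU; with the fix the CPU correctly reports zero spare-capacity pressure and the task stays put.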
 kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 48 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3648d0300fdf..ac855b2f4774 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return target;
 }
 
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-	return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+	return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
@@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-			spare_cap = capacity_spare_wake(i, p);
+			spare_cap = capacity_spare_without(i, p);
 
 			if (spare_cap > max_spare_cap)
 				max_spare_cap = spare_cap;
@@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 		return prev_cpu;
 
 	/*
-	 * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
-	 * last_update_time.
+	 * We need task's util for capacity_spare_without, sync it up to
+	 * prev_cpu's last_update_time.
 	 */
 	if (!(sd_flag & SD_BALANCE_FORK))
 		sync_entity_load_avg(&p->se);
@@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu)
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq;
 	unsigned int util;
@@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	cfs_rq = &cpu_rq(cpu)->cfs;
 	util = READ_ONCE(cfs_rq->avg.util_avg);
 
-	/* Discount task's blocked util from CPU's util */
+	/* Discount task's util from CPU's util */
 	util -= min_t(unsigned int, util, task_util(p));
 
 	/*
@@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	 * a) if *p is the only task sleeping on this CPU, then:
 	 *      cpu_util (== task_util) > util_est (== 0)
 	 *    and thus we return:
-	 *      cpu_util_wake = (cpu_util - task_util) = 0
+	 *      cpu_util_without = (cpu_util - task_util) = 0
 	 *
 	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
 	 *    IDLE, then:
 	 *      cpu_util >= task_util
 	 *      cpu_util > util_est (== 0)
 	 *    and thus we discount *p's blocked utilization to return:
-	 *      cpu_util_wake = (cpu_util - task_util) >= 0
+	 *      cpu_util_without = (cpu_util - task_util) >= 0
 	 *
 	 * c) if other tasks are RUNNABLE on that CPU and
 	 *      util_est > cpu_util
@@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	 * covered by the following code when estimated utilization is
 	 * enabled.
 	 */
-	if (sched_feat(UTIL_EST))
-		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+	if (sched_feat(UTIL_EST)) {
+		unsigned int estimated =
+			READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+		/*
+		 * Despite the following checks we still have a small window
+		 * for a possible race, when an execl's select_task_rq_fair()
+		 * races with LB's detach_task():
+		 *
+		 *   detach_task()
+		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
+		 *     ---------------------------------- A
+		 *     deactivate_task() \
+		 *       dequeue_task()   + RaceTime
+		 *         util_est_dequeue() /
+		 *     ---------------------------------- B
+		 *
+		 * The additional check on "current == p" is required to
+		 * properly fix the execl regression and helps further
+		 * reduce the chances of the above race.
+		 */
+		if (unlikely(task_on_rq_queued(p) || current == p)) {
+			estimated -= min_t(unsigned int, estimated,
+					   (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+		}
+		util = max(util, estimated);
+	}
 
 	/*
 	 * Utilization (estimated) can exceed the CPU capacity, thus let's
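To make the a/b/c cases documented in cpu_util_without() concrete, here are worked numbers pushed through the util_without_new() model from the earlier sketch (swap this main() in for the previous one to run it; the values are illustrative, chosen only to satisfy each case's stated inequalities):

```c
#include <assert.h>

int main(void)
{
	/* a) *p (util 300) is the only task and it is sleeping: its blocked
	 *    util is discounted and util_est is already 0, so we return 0. */
	struct cpu_signals a = { .util_avg = 300, .util_est_enqueued = 0 };
	assert(util_without_new(a, 300, 300, 0) == 0);

	/* b) other tasks are sleeping on the CPU as well: only *p's blocked
	 *    contribution is removed, leaving the others' 200. */
	struct cpu_signals b = { .util_avg = 500, .util_est_enqueued = 0 };
	assert(util_without_new(b, 300, 300, 0) == 200);

	/* c) other tasks are runnable and util_est > util_avg: *p is neither
	 *    enqueued nor current, so the estimate is kept and wins, 450. */
	struct cpu_signals c = { .util_avg = 350, .util_est_enqueued = 450 };
	assert(util_without_new(c, 300, 300, 0) == 450);

	return 0;
}
```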
