aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-03-17 16:19:07 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-03-17 16:19:07 -0400
commitcd21debe5318842a0bbd38c0327cfde2a3b90d65 (patch)
tree5d596c9510eec117ff580ff0aa33f6de028d002b /kernel
parentb5f13082b19dc09378660226011ebfb033358ea6 (diff)
parent2317d5f1c34913bac5971d93d69fb6c31bb74670 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Thomas Gleixner: "From the scheduler department: - a bunch of sched deadline related fixes which deal with various buglets and corner cases. - two fixes for the loadavg spikes which are caused by the delayed NOHZ accounting" * 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/deadline: Use deadline instead of period when calculating overflow sched/deadline: Throttle a constrained deadline task activated after the deadline sched/deadline: Make sure the replenishment timer fires in the next period sched/loadavg: Use {READ,WRITE}_ONCE() for sample window sched/loadavg: Avoid loadavg spikes caused by delayed NO_HZ accounting sched/deadline: Add missing update_rq_clock() in dl_task_timer()
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/deadline.c63
-rw-r--r--kernel/sched/loadavg.c20
2 files changed, 69 insertions, 14 deletions
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 99b2c33a9fbc..a2ce59015642 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -445,13 +445,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
445 * 445 *
446 * This function returns true if: 446 * This function returns true if:
447 * 447 *
448 * runtime / (deadline - t) > dl_runtime / dl_period , 448 * runtime / (deadline - t) > dl_runtime / dl_deadline ,
449 * 449 *
450 * IOW we can't recycle current parameters. 450 * IOW we can't recycle current parameters.
451 * 451 *
452 * Notice that the bandwidth check is done against the period. For 452 * Notice that the bandwidth check is done against the deadline. For
453 * task with deadline equal to period this is the same of using 453 * task with deadline equal to period this is the same of using
454 * dl_deadline instead of dl_period in the equation above. 454 * dl_period instead of dl_deadline in the equation above.
455 */ 455 */
456static bool dl_entity_overflow(struct sched_dl_entity *dl_se, 456static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
457 struct sched_dl_entity *pi_se, u64 t) 457 struct sched_dl_entity *pi_se, u64 t)
@@ -476,7 +476,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
476 * of anything below microseconds resolution is actually fiction 476 * of anything below microseconds resolution is actually fiction
477 * (but still we want to give the user that illusion >;). 477 * (but still we want to give the user that illusion >;).
478 */ 478 */
479 left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); 479 left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
480 right = ((dl_se->deadline - t) >> DL_SCALE) * 480 right = ((dl_se->deadline - t) >> DL_SCALE) *
481 (pi_se->dl_runtime >> DL_SCALE); 481 (pi_se->dl_runtime >> DL_SCALE);
482 482
@@ -505,10 +505,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
505 } 505 }
506} 506}
507 507
508static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
509{
510 return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period;
511}
512
508/* 513/*
509 * If the entity depleted all its runtime, and if we want it to sleep 514 * If the entity depleted all its runtime, and if we want it to sleep
510 * while waiting for some new execution time to become available, we 515 * while waiting for some new execution time to become available, we
511 * set the bandwidth enforcement timer to the replenishment instant 516 * set the bandwidth replenishment timer to the replenishment instant
512 * and try to activate it. 517 * and try to activate it.
513 * 518 *
514 * Notice that it is important for the caller to know if the timer 519 * Notice that it is important for the caller to know if the timer
@@ -530,7 +535,7 @@ static int start_dl_timer(struct task_struct *p)
530 * that it is actually coming from rq->clock and not from 535 * that it is actually coming from rq->clock and not from
531 * hrtimer's time base reading. 536 * hrtimer's time base reading.
532 */ 537 */
533 act = ns_to_ktime(dl_se->deadline); 538 act = ns_to_ktime(dl_next_period(dl_se));
534 now = hrtimer_cb_get_time(timer); 539 now = hrtimer_cb_get_time(timer);
535 delta = ktime_to_ns(now) - rq_clock(rq); 540 delta = ktime_to_ns(now) - rq_clock(rq);
536 act = ktime_add_ns(act, delta); 541 act = ktime_add_ns(act, delta);
@@ -638,6 +643,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
638 lockdep_unpin_lock(&rq->lock, rf.cookie); 643 lockdep_unpin_lock(&rq->lock, rf.cookie);
639 rq = dl_task_offline_migration(rq, p); 644 rq = dl_task_offline_migration(rq, p);
640 rf.cookie = lockdep_pin_lock(&rq->lock); 645 rf.cookie = lockdep_pin_lock(&rq->lock);
646 update_rq_clock(rq);
641 647
642 /* 648 /*
643 * Now that the task has been migrated to the new RQ and we 649 * Now that the task has been migrated to the new RQ and we
@@ -689,6 +695,37 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
689 timer->function = dl_task_timer; 695 timer->function = dl_task_timer;
690} 696}
691 697
698/*
699 * During the activation, CBS checks if it can reuse the current task's
700 * runtime and period. If the deadline of the task is in the past, CBS
701 * cannot use the runtime, and so it replenishes the task. This rule
702 * works fine for implicit deadline tasks (deadline == period), and the
703 * CBS was designed for implicit deadline tasks. However, a task with
704 * constrained deadline (deadline < period) might be awakened after the
705 * deadline, but before the next period. In this case, replenishing the
706 * task would allow it to run for runtime / deadline. As in this case
707 * deadline < period, CBS enables a task to run for more than the
708 * runtime / period. In a very loaded system, this can cause a domino
709 * effect, making other tasks miss their deadlines.
710 *
711 * To avoid this problem, in the activation of a constrained deadline
712 * task after the deadline but before the next period, throttle the
713 * task and set the replenishing timer to the beginning of the next period,
714 * unless it is boosted.
715 */
716static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
717{
718 struct task_struct *p = dl_task_of(dl_se);
719 struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
720
721 if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
722 dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
723 if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
724 return;
725 dl_se->dl_throttled = 1;
726 }
727}
728
692static 729static
693int dl_runtime_exceeded(struct sched_dl_entity *dl_se) 730int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
694{ 731{
@@ -922,6 +959,11 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
922 __dequeue_dl_entity(dl_se); 959 __dequeue_dl_entity(dl_se);
923} 960}
924 961
962static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
963{
964 return dl_se->dl_deadline < dl_se->dl_period;
965}
966
925static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) 967static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
926{ 968{
927 struct task_struct *pi_task = rt_mutex_get_top_task(p); 969 struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -948,6 +990,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
948 } 990 }
949 991
950 /* 992 /*
993 * Check if a constrained deadline task was activated
994 * after the deadline but before the next period.
995 * If that is the case, the task will be throttled and
996 * the replenishment timer will be set to the next period.
997 */
998 if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
999 dl_check_constrained_dl(&p->dl);
1000
1001 /*
951 * If p is throttled, we do nothing. In fact, if it exhausted 1002 * If p is throttled, we do nothing. In fact, if it exhausted
952 * its budget it needs a replenishment and, since it now is on 1003 * its budget it needs a replenishment and, since it now is on
953 * its rq, the bandwidth timer callback (which clearly has not 1004 * its rq, the bandwidth timer callback (which clearly has not
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 7296b7308eca..f15fb2bdbc0d 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -169,7 +169,7 @@ static inline int calc_load_write_idx(void)
169 * If the folding window started, make sure we start writing in the 169 * If the folding window started, make sure we start writing in the
170 * next idle-delta. 170 * next idle-delta.
171 */ 171 */
172 if (!time_before(jiffies, calc_load_update)) 172 if (!time_before(jiffies, READ_ONCE(calc_load_update)))
173 idx++; 173 idx++;
174 174
175 return idx & 1; 175 return idx & 1;
@@ -202,8 +202,9 @@ void calc_load_exit_idle(void)
202 struct rq *this_rq = this_rq(); 202 struct rq *this_rq = this_rq();
203 203
204 /* 204 /*
205 * If we're still before the sample window, we're done. 205 * If we're still before the pending sample window, we're done.
206 */ 206 */
207 this_rq->calc_load_update = READ_ONCE(calc_load_update);
207 if (time_before(jiffies, this_rq->calc_load_update)) 208 if (time_before(jiffies, this_rq->calc_load_update))
208 return; 209 return;
209 210
@@ -212,7 +213,6 @@ void calc_load_exit_idle(void)
212 * accounted through the nohz accounting, so skip the entire deal and 213 * accounted through the nohz accounting, so skip the entire deal and
213 * sync up for the next window. 214 * sync up for the next window.
214 */ 215 */
215 this_rq->calc_load_update = calc_load_update;
216 if (time_before(jiffies, this_rq->calc_load_update + 10)) 216 if (time_before(jiffies, this_rq->calc_load_update + 10))
217 this_rq->calc_load_update += LOAD_FREQ; 217 this_rq->calc_load_update += LOAD_FREQ;
218} 218}
@@ -308,13 +308,15 @@ calc_load_n(unsigned long load, unsigned long exp,
308 */ 308 */
309static void calc_global_nohz(void) 309static void calc_global_nohz(void)
310{ 310{
311 unsigned long sample_window;
311 long delta, active, n; 312 long delta, active, n;
312 313
313 if (!time_before(jiffies, calc_load_update + 10)) { 314 sample_window = READ_ONCE(calc_load_update);
315 if (!time_before(jiffies, sample_window + 10)) {
314 /* 316 /*
315 * Catch-up, fold however many we are behind still 317 * Catch-up, fold however many we are behind still
316 */ 318 */
317 delta = jiffies - calc_load_update - 10; 319 delta = jiffies - sample_window - 10;
318 n = 1 + (delta / LOAD_FREQ); 320 n = 1 + (delta / LOAD_FREQ);
319 321
320 active = atomic_long_read(&calc_load_tasks); 322 active = atomic_long_read(&calc_load_tasks);
@@ -324,7 +326,7 @@ static void calc_global_nohz(void)
324 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 326 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
325 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 327 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
326 328
327 calc_load_update += n * LOAD_FREQ; 329 WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
328 } 330 }
329 331
330 /* 332 /*
@@ -352,9 +354,11 @@ static inline void calc_global_nohz(void) { }
352 */ 354 */
353void calc_global_load(unsigned long ticks) 355void calc_global_load(unsigned long ticks)
354{ 356{
357 unsigned long sample_window;
355 long active, delta; 358 long active, delta;
356 359
357 if (time_before(jiffies, calc_load_update + 10)) 360 sample_window = READ_ONCE(calc_load_update);
361 if (time_before(jiffies, sample_window + 10))
358 return; 362 return;
359 363
360 /* 364 /*
@@ -371,7 +375,7 @@ void calc_global_load(unsigned long ticks)
371 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 375 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
372 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 376 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
373 377
374 calc_load_update += LOAD_FREQ; 378 WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
375 379
376 /* 380 /*
377 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 381 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.