aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-03-17 16:19:07 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-03-17 16:19:07 -0400
commitcd21debe5318842a0bbd38c0327cfde2a3b90d65 (patch)
tree5d596c9510eec117ff580ff0aa33f6de028d002b /kernel
parentb5f13082b19dc09378660226011ebfb033358ea6 (diff)
parent2317d5f1c34913bac5971d93d69fb6c31bb74670 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Thomas Gleixner: "From the scheduler department: - a bunch of sched deadline related fixes which deal with various buglets and corner cases. - two fixes for the loadavg spikes which are caused by the delayed NOHZ accounting" * 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/deadline: Use deadline instead of period when calculating overflow sched/deadline: Throttle a constrained deadline task activated after the deadline sched/deadline: Make sure the replenishment timer fires in the next period sched/loadavg: Use {READ,WRITE}_ONCE() for sample window sched/loadavg: Avoid loadavg spikes caused by delayed NO_HZ accounting sched/deadline: Add missing update_rq_clock() in dl_task_timer()
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/deadline.c63
-rw-r--r--kernel/sched/loadavg.c20
2 files changed, 69 insertions, 14 deletions
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 99b2c33a9fbc..a2ce59015642 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -445,13 +445,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
445 * 445 *
446 * This function returns true if: 446 * This function returns true if:
447 * 447 *
448 * runtime / (deadline - t) > dl_runtime / dl_period , 448 * runtime / (deadline - t) > dl_runtime / dl_deadline ,
449 * 449 *
450 * IOW we can't recycle current parameters. 450 * IOW we can't recycle current parameters.
451 * 451 *
452 * Notice that the bandwidth check is done against the period. For 452 * Notice that the bandwidth check is done against the deadline. For
453 * task with deadline equal to period this is the same of using 453 * task with deadline equal to period this is the same of using
454 * dl_deadline instead of dl_period in the equation above. 454 * dl_period instead of dl_deadline in the equation above.
455 */ 455 */
456static bool dl_entity_overflow(struct sched_dl_entity *dl_se, 456static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
457 struct sched_dl_entity *pi_se, u64 t) 457 struct sched_dl_entity *pi_se, u64 t)
@@ -476,7 +476,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
476 * of anything below microseconds resolution is actually fiction 476 * of anything below microseconds resolution is actually fiction
477 * (but still we want to give the user that illusion >;). 477 * (but still we want to give the user that illusion >;).
478 */ 478 */
479 left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); 479 left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
480 right = ((dl_se->deadline - t) >> DL_SCALE) * 480 right = ((dl_se->deadline - t) >> DL_SCALE) *
481 (pi_se->dl_runtime >> DL_SCALE); 481 (pi_se->dl_runtime >> DL_SCALE);
482 482
@@ -505,10 +505,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
505 } 505 }
506} 506}
507 507
508static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
509{
510 return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period;
511}
512
508/* 513/*
509 * If the entity depleted all its runtime, and if we want it to sleep 514 * If the entity depleted all its runtime, and if we want it to sleep
510 * while waiting for some new execution time to become available, we 515 * while waiting for some new execution time to become available, we
511 * set the bandwidth enforcement timer to the replenishment instant 516 * set the bandwidth replenishment timer to the replenishment instant
512 * and try to activate it. 517 * and try to activate it.
513 * 518 *
514 * Notice that it is important for the caller to know if the timer 519 * Notice that it is important for the caller to know if the timer
@@ -530,7 +535,7 @@ static int start_dl_timer(struct task_struct *p)
530 * that it is actually coming from rq->clock and not from 535 * that it is actually coming from rq->clock and not from
531 * hrtimer's time base reading. 536 * hrtimer's time base reading.
532 */ 537 */
533 act = ns_to_ktime(dl_se->deadline); 538 act = ns_to_ktime(dl_next_period(dl_se));
534 now = hrtimer_cb_get_time(timer); 539 now = hrtimer_cb_get_time(timer);
535 delta = ktime_to_ns(now) - rq_clock(rq); 540 delta = ktime_to_ns(now) - rq_clock(rq);
536 act = ktime_add_ns(act, delta); 541 act = ktime_add_ns(act, delta);
@@ -638,6 +643,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
638 lockdep_unpin_lock(&rq->lock, rf.cookie); 643 lockdep_unpin_lock(&rq->lock, rf.cookie);
639 rq = dl_task_offline_migration(rq, p); 644 rq = dl_task_offline_migration(rq, p);
640 rf.cookie = lockdep_pin_lock(&rq->lock); 645 rf.cookie = lockdep_pin_lock(&rq->lock);
646 update_rq_clock(rq);
641 647
642 /* 648 /*
643 * Now that the task has been migrated to the new RQ and we 649 * Now that the task has been migrated to the new RQ and we
@@ -689,6 +695,37 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
689 timer->function = dl_task_timer; 695 timer->function = dl_task_timer;
690} 696}
691 697
698/*
699 * During the activation, CBS checks if it can reuse the current task's
700 * runtime and period. If the deadline of the task is in the past, CBS
701 * cannot use the runtime, and so it replenishes the task. This rule
702 * works fine for implicit deadline tasks (deadline == period), and the
703 * CBS was designed for implicit deadline tasks. However, a task with
704 * constrained deadline (deadline < period) might be awakened after the
705 * deadline, but before the next period. In this case, replenishing the
706 * task would allow it to run for runtime / deadline. As in this case
707 * deadline < period, CBS enables a task to run for more than the
708 * runtime / period. In a very loaded system, this can cause a domino
709 * effect, making other tasks miss their deadlines.
710 *
711 * To avoid this problem, in the activation of a constrained deadline
712 * task after the deadline but before the next period, throttle the
713 * task and set the replenishing timer to the beginning of the next period,
714 * unless it is boosted.
715 */
716static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
717{
718 struct task_struct *p = dl_task_of(dl_se);
719 struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
720
721 if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
722 dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
723 if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
724 return;
725 dl_se->dl_throttled = 1;
726 }
727}
728
692static 729static
693int dl_runtime_exceeded(struct sched_dl_entity *dl_se) 730int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
694{ 731{
@@ -922,6 +959,11 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
922 __dequeue_dl_entity(dl_se); 959 __dequeue_dl_entity(dl_se);
923} 960}
924 961
962static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
963{
964 return dl_se->dl_deadline < dl_se->dl_period;
965}
966
925static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) 967static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
926{ 968{
927 struct task_struct *pi_task = rt_mutex_get_top_task(p); 969 struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -948,6 +990,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
948 } 990 }
949 991
950 /* 992 /*
993 * Check if a constrained deadline task was activated
994 * after the deadline but before the next period.
995 * If that is the case, the task will be throttled and
996 * the replenishment timer will be set to the next period.
997 */
998 if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
999 dl_check_constrained_dl(&p->dl);
1000
1001 /*
951 * If p is throttled, we do nothing. In fact, if it exhausted 1002 * If p is throttled, we do nothing. In fact, if it exhausted
952 * its budget it needs a replenishment and, since it now is on 1003 * its budget it needs a replenishment and, since it now is on
953 * its rq, the bandwidth timer callback (which clearly has not 1004 * its rq, the bandwidth timer callback (which clearly has not
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 7296b7308eca..f15fb2bdbc0d 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -169,7 +169,7 @@ static inline int calc_load_write_idx(void)
169 * If the folding window started, make sure we start writing in the 169 * If the folding window started, make sure we start writing in the
170 * next idle-delta. 170 * next idle-delta.
171 */ 171 */
172 if (!time_before(jiffies, calc_load_update)) 172 if (!time_before(jiffies, READ_ONCE(calc_load_update)))
173 idx++; 173 idx++;
174 174
175 return idx & 1; 175 return idx & 1;
@@ -202,8 +202,9 @@ void calc_load_exit_idle(void)
202 struct rq *this_rq = this_rq(); 202 struct rq *this_rq = this_rq();
203 203
204 /* 204 /*
205 * If we're still before the sample window, we're done. 205 * If we're still before the pending sample window, we're done.
206 */ 206 */
207 this_rq->calc_load_update = READ_ONCE(calc_load_update);
207 if (time_before(jiffies, this_rq->calc_load_update)) 208 if (time_before(jiffies, this_rq->calc_load_update))
208 return; 209 return;
209 210
@@ -212,7 +213,6 @@ void calc_load_exit_idle(void)
212 * accounted through the nohz accounting, so skip the entire deal and 213 * accounted through the nohz accounting, so skip the entire deal and
213 * sync up for the next window. 214 * sync up for the next window.
214 */ 215 */
215 this_rq->calc_load_update = calc_load_update;
216 if (time_before(jiffies, this_rq->calc_load_update + 10)) 216 if (time_before(jiffies, this_rq->calc_load_update + 10))
217 this_rq->calc_load_update += LOAD_FREQ; 217 this_rq->calc_load_update += LOAD_FREQ;
218} 218}
@@ -308,13 +308,15 @@ calc_load_n(unsigned long load, unsigned long exp,
308 */ 308 */
309static void calc_global_nohz(void) 309static void calc_global_nohz(void)
310{ 310{
311 unsigned long sample_window;
311 long delta, active, n; 312 long delta, active, n;
312 313
313 if (!time_before(jiffies, calc_load_update + 10)) { 314 sample_window = READ_ONCE(calc_load_update);
315 if (!time_before(jiffies, sample_window + 10)) {
314 /* 316 /*
315 * Catch-up, fold however many we are behind still 317 * Catch-up, fold however many we are behind still
316 */ 318 */
317 delta = jiffies - calc_load_update - 10; 319 delta = jiffies - sample_window - 10;
318 n = 1 + (delta / LOAD_FREQ); 320 n = 1 + (delta / LOAD_FREQ);
319 321
320 active = atomic_long_read(&calc_load_tasks); 322 active = atomic_long_read(&calc_load_tasks);
@@ -324,7 +326,7 @@ static void calc_global_nohz(void)
324 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 326 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
325 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 327 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
326 328
327 calc_load_update += n * LOAD_FREQ; 329 WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
328 } 330 }
329 331
330 /* 332 /*
@@ -352,9 +354,11 @@ static inline void calc_global_nohz(void) { }
352 */ 354 */
353void calc_global_load(unsigned long ticks) 355void calc_global_load(unsigned long ticks)
354{ 356{
357 unsigned long sample_window;
355 long active, delta; 358 long active, delta;
356 359
357 if (time_before(jiffies, calc_load_update + 10)) 360 sample_window = READ_ONCE(calc_load_update);
361 if (time_before(jiffies, sample_window + 10))
358 return; 362 return;
359 363
360 /* 364 /*
@@ -371,7 +375,7 @@ void calc_global_load(unsigned long ticks)
371 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 375 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
372 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 376 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
373 377
374 calc_load_update += LOAD_FREQ; 378 WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
375 379
376 /* 380 /*
377 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 381 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.