1 files changed, 223 insertions, 91 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 18fd17172eb..5ad4440f0fc 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+static const struct sched_class fair_sched_class;
 /**************************************************************
 * CFS operations on generic schedulable entities:
 */
@@ -141,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
        return se->parent;
 }
+/*
+ * Depth of @se in the entity hierarchy: one level is counted for each
+ * entity that for_each_sched_entity() visits when started at @se.
+ */
+static inline int depth_se(struct sched_entity *se)
+{
+        int levels = 0;
+        for_each_sched_entity(se) {
+                ++levels;
+        }
+        return levels;
+}
+/*
+ * Move *se and *pse up the hierarchy until both are in the same group.
+ *
+ * A preemption test is only meaningful between sibling entities, i.e.
+ * entities queued on the same cfs_rq under a common parent. Both pointers
+ * are updated in place to the ancestors for which is_same_group() holds.
+ */
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+        int se_depth = depth_se(*se);
+        int pse_depth = depth_se(*pse);
+        /* Lift whichever side is deeper until both are at the same depth. */
+        for (; se_depth > pse_depth; se_depth--)
+                *se = parent_entity(*se);
+        for (; pse_depth > se_depth; pse_depth--)
+                *pse = parent_entity(*pse);
+        /* From equal depth, climb in lockstep until the groups match. */
+        while (!is_same_group(*se, *pse)) {
+                *se = parent_entity(*se);
+                *pse = parent_entity(*pse);
+        }
+}
 #else   /* CONFIG_FAIR_GROUP_SCHED */
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -191,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
        return NULL;
 }
+/*
+ * !CONFIG_FAIR_GROUP_SCHED variant: intentionally empty, so *se and *pse
+ * are left exactly as the caller passed them.
+ */
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
 #endif  /* CONFIG_FAIR_GROUP_SCHED */
@@ -221,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
        return se->vruntime - cfs_rq->min_vruntime;
 }
+/*
+ * Advance cfs_rq->min_vruntime.
+ *
+ * The candidate value is the smaller vruntime of the running entity
+ * (cfs_rq->curr) and the leftmost queued entity (cfs_rq->rb_leftmost);
+ * when only one of the two exists, its vruntime is used directly. The
+ * stored min_vruntime is only ever moved forward: the candidate is
+ * combined with the old value through max_vruntime().
+ */
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+        u64 vruntime = cfs_rq->min_vruntime;
+        if (cfs_rq->curr)
+                vruntime = cfs_rq->curr->vruntime;
+        if (cfs_rq->rb_leftmost) {
+                struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+                                                   struct sched_entity,
+                                                   run_node);
+                /*
+                 * Test for a missing curr explicitly. Comparing vruntime
+                 * against min_vruntime instead would also match a running
+                 * entity whose vruntime happens to equal min_vruntime, and
+                 * would then adopt the leftmost vruntime even when it is
+                 * larger, pushing min_vruntime past curr.
+                 */
+                if (!cfs_rq->curr)
+                        vruntime = se->vruntime;
+                else
+                        vruntime = min_vruntime(vruntime, se->vruntime);
+        }
+        cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
 /*
 * Enqueue an entity into the rb-tree:
 */
@@ -254,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         * Maintain a cache of leftmost tree entries (it is frequently
         * used):
         */
-        if (leftmost) {
+        if (leftmost)
                cfs_rq->rb_leftmost = &se->run_node;
-                /*
-                 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-                 * value tracking the leftmost vruntime in the tree.
-                 */
-                cfs_rq->min_vruntime =
-                        max_vruntime(cfs_rq->min_vruntime, se->vruntime);
-        }
        rb_link_node(&se->run_node, parent, link);
        rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,37 +336,25 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        if (cfs_rq->rb_leftmost == &se->run_node) {
                struct rb_node *next_node;
-                struct sched_entity *next;
                next_node = rb_next(&se->run_node);
                cfs_rq->rb_leftmost = next_node;
-                if (next_node) {
-                        next = rb_entry(next_node,
-                                        struct sched_entity, run_node);
-                        cfs_rq->min_vruntime =
-                                max_vruntime(cfs_rq->min_vruntime,
-                                             next->vruntime);
-                }
        }
-        if (cfs_rq->next == se)
-                cfs_rq->next = NULL;
        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
-static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
-{
-        return cfs_rq->rb_leftmost;
-}
+/*
+ * Return the entity behind the cached leftmost rb-node of @cfs_rq, or
+ * NULL when no leftmost node is cached.
+ */
 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 {
-        return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
+        struct rb_node *left = cfs_rq->rb_leftmost;
+        /* Nothing cached: tell the caller with NULL. */
+        if (!left)
+                return NULL;
+        /* Map the rb-node back to the sched_entity that embeds it. */
+        return rb_entry(left, struct sched_entity, run_node);
 }
-static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
        struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -334,7 +386,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 /*
- * delta *= w / rw
+ * delta *= P[w / rw]
 */
 static inline unsigned long
 calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +400,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se)
 }
 /*
- * delta *= rw / w
+ * delta /= w
+ *
+ * An entity whose load weight equals NICE_0_LOAD keeps @delta unchanged;
+ * any other weight is scaled through
+ * calc_delta_mine(delta, NICE_0_LOAD, &se->load).
  */
 static inline unsigned long
 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 {
-        for_each_sched_entity(se) {
+        if (unlikely(se->load.weight != NICE_0_LOAD))
-                delta = calc_delta_mine(delta,
+                delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
-                                cfs_rq_of(se)->load.weight, &se->load);
-        }
         return delta;
 }
@@ -386,26 +436,26 @@ static u64 __sched_period(unsigned long nr_running)
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
- * s = p*w/rw
+ * s = p*P[w/rw]
 */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+        unsigned long nr_running = cfs_rq->nr_running;
+        /* Count @se itself when it is not on the runqueue yet. */
+        if (unlikely(!se->on_rq))
+                nr_running++;
+        /* Period for that many entities, scaled by calc_delta_weight(). */
+        return calc_delta_weight(__sched_period(nr_running), se);
 }
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s*rw/w = p
+ * vs = s/w
+ *
+ * i.e. the sched_slice() result for @se passed through calc_delta_fair().
  */
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        unsigned long nr_running = cfs_rq->nr_running;
+        return calc_delta_fair(sched_slice(cfs_rq, se), se);
-        if (!se->on_rq)
-                nr_running++;
-        return __sched_period(nr_running);
 }
 /*
@@ -424,6 +474,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
        schedstat_add(cfs_rq, exec_clock, delta_exec);
        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
        curr->vruntime += delta_exec_weighted;
+        update_min_vruntime(cfs_rq);
 }
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -441,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
         * overflow on 32 bits):
         */
        delta_exec = (unsigned long)(now - curr->exec_start);
+        if (!delta_exec)
+                return;
        __update_curr(cfs_rq, curr, delta_exec);
        curr->exec_start = now;
@@ -449,6 +502,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
                struct task_struct *curtask = task_of(curr);
                cpuacct_charge(curtask, delta_exec);
+                account_group_exec_runtime(curtask, delta_exec);
        }
 }
@@ -612,13 +666,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
-        u64 vruntime;
+        u64 vruntime = cfs_rq->min_vruntime;
-        if (first_fair(cfs_rq)) {
-                vruntime = min_vruntime(cfs_rq->min_vruntime,
-                                __pick_next_entity(cfs_rq)->vruntime);
-        } else
-                vruntime = cfs_rq->min_vruntime;
        /*
         * The 'current' period is already promised to the current tasks,
@@ -627,7 +675,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
         * stays open at the end.
         */
        if (initial && sched_feat(START_DEBIT))
-                vruntime += sched_vslice_add(cfs_rq, se);
+                vruntime += sched_vslice(cfs_rq, se);
        if (!initial) {
                /* sleeps upto a single latency don't count. */
@@ -670,6 +718,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
                __enqueue_entity(cfs_rq, se);
 }
+/*
+ * Drop @se as a buddy of @cfs_rq: whichever of the ->next and ->last
+ * pointers currently refer to @se are reset to NULL.
+ */
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+        if (se == cfs_rq->next)
+                cfs_rq->next = NULL;
+        if (se == cfs_rq->last)
+                cfs_rq->last = NULL;
+}
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -692,9 +749,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 #endif
        }
+        clear_buddies(cfs_rq, se);
        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
        account_entity_dequeue(cfs_rq, se);
+        update_min_vruntime(cfs_rq);
 }
 /*
@@ -741,29 +801,18 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
        se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
-static struct sched_entity *
+static int
-pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-{
-        struct rq *rq = rq_of(cfs_rq);
-        u64 pair_slice = rq->clock - cfs_rq->pair_start;
-        if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
-                cfs_rq->pair_start = rq->clock;
-                return se;
-        }
-        return cfs_rq->next;
-}
+/*
+ * Choose the entity to run next on @cfs_rq.
+ *
+ * Starts from the __pick_next_entity() result. If the ->next buddy is set
+ * and wakeup_preempt_entity(buddy, se) is below 1, the buddy is returned
+ * instead; the same test is then applied to the ->last buddy. Otherwise
+ * the __pick_next_entity() result is returned.
+ *
+ * NOTE(review): se is handed to wakeup_preempt_entity() unchecked although
+ * __pick_next_entity() can return NULL - presumably callers only get here
+ * with a non-empty timeline; confirm.
+ */
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-        struct sched_entity *se = NULL;
+        struct sched_entity *se = __pick_next_entity(cfs_rq);
-        if (first_fair(cfs_rq)) {
+        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
-                se = __pick_next_entity(cfs_rq);
+                return cfs_rq->next;
-                se = pick_next(cfs_rq, se);
-                set_next_entity(cfs_rq, se);
+        if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
-        }
+                return cfs_rq->last;
         return se;
 }
@@ -848,11 +897,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
                hrtick_start(rq, delta);
        }
 }
+/*
+ * Refresh the hrtick after an enqueue or dequeue.
+ *
+ * Only acts when the task currently running on @rq belongs to the fair
+ * class and the nr_running of its cfs_rq is below sched_nr_latency; in
+ * every other case the function returns without touching the timer.
+ */
+static void hrtick_update(struct rq *rq)
+{
+        struct task_struct *curr = rq->curr;
+        if (curr->sched_class == &fair_sched_class &&
+            cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+                hrtick_start_fair(rq, curr);
+}
 #else /* !CONFIG_SCHED_HRTICK */
+/* Empty stub used when CONFIG_SCHED_HRTICK is not set. */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
 }
+/* Empty stub used when CONFIG_SCHED_HRTICK is not set. */
+static inline void hrtick_update(struct rq *rq)
+{
+}
 #endif
 /*
@@ -873,7 +942,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
                wakeup = 1;
        }
-        hrtick_start_fair(rq, rq->curr);
+        hrtick_update(rq);
 }
 /*
@@ -895,7 +964,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
                sleep = 1;
        }
-        hrtick_start_fair(rq, rq->curr);
+        hrtick_update(rq);
 }
 /*
@@ -915,6 +984,8 @@ static void yield_task_fair(struct rq *rq)
        if (unlikely(cfs_rq->nr_running == 1))
                return;
+        clear_buddies(cfs_rq, se);
        if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
                update_rq_clock(rq);
                /*
@@ -1001,8 +1072,6 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 #ifdef CONFIG_SMP
-static const struct sched_class fair_sched_class;
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
 * effective_load() calculates the load change as seen from the root_task_group
@@ -1103,10 +1172,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
        if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
                return 0;
-        if (!sync && sched_feat(SYNC_WAKEUPS) &&
+        if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-            curr->se.avg_overlap < sysctl_sched_migration_cost &&
+                        p->se.avg_overlap > sysctl_sched_migration_cost))
-            p->se.avg_overlap < sysctl_sched_migration_cost)
+                sync = 0;
-                sync = 1;
        /*
         * If sync wakeup then subtract the (maximum possible)
@@ -1225,33 +1293,87 @@ static unsigned long wakeup_gran(struct sched_entity *se)
         * More easily preempt - nice tasks, while not making it harder for
         * + nice tasks.
         */
-        if (sched_feat(ASYM_GRAN))
+        if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
-                gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
+                gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
        return gran;
 }
 /*
+ * Should 'se' preempt 'curr'.
+ *
+ *             |s1
+ *        |s2
+ *   |s3
+ *         g
+ *      |<--->|c
+ *
+ *  w(c, s1) = -1
+ *  w(c, s2) =  0
+ *  w(c, s3) =  1
+ *
+ * Returns -1 when curr->vruntime - se->vruntime is not positive, 1 when
+ * that difference exceeds wakeup_gran(curr), and 0 in between.
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+        s64 vdiff = curr->vruntime - se->vruntime;
+        s64 gran;
+        /* 'se' is not behind 'curr' in vruntime: no case for preemption. */
+        if (vdiff <= 0)
+                return -1;
+        gran = wakeup_gran(curr);
+        return (vdiff > gran) ? 1 : 0;
+}
+/*
+ * Record @se, and every further entity for_each_sched_entity() visits
+ * from it, as the ->last buddy of the cfs_rq it belongs to.
+ */
+static void set_last_buddy(struct sched_entity *se)
+{
+        for_each_sched_entity(se) {
+                cfs_rq_of(se)->last = se;
+        }
+}
+/*
+ * Record @se, and every further entity for_each_sched_entity() visits
+ * from it, as the ->next buddy of the cfs_rq it belongs to.
+ */
+static void set_next_buddy(struct sched_entity *se)
+{
+        for_each_sched_entity(se) {
+                cfs_rq_of(se)->next = se;
+        }
+}
+/*
 * Preempt the current task with a newly woken task if needed:
 */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
        struct task_struct *curr = rq->curr;
-        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
        struct sched_entity *se = &curr->se, *pse = &p->se;
-        s64 delta_exec;
+        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+        update_curr(cfs_rq);
        if (unlikely(rt_prio(p->prio))) {
-                update_rq_clock(rq);
-                update_curr(cfs_rq);
                resched_task(curr);
                return;
        }
+        if (unlikely(p->sched_class != &fair_sched_class))
+                return;
        if (unlikely(se == pse))
                return;
-        cfs_rq_of(pse)->next = pse;
+        /*
+         * Only set the backward buddy when the current task is still on the
+         * rq. This can happen when a wakeup gets interleaved with schedule on
+         * the ->pre_schedule() or idle_balance() point, either of which can
+         * drop the rq lock.
+         *
+         * Also, during early boot the idle thread is in the fair class, for
+         * obvious reasons it's a bad idea to schedule back to the idle thread.
+         */
+        if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+                set_last_buddy(se);
+        set_next_buddy(pse);
        /*
         * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1277,9 +1399,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
                return;
        }
-        delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+        find_matching_se(&se, &pse);
-        if (delta_exec > wakeup_gran(pse))
-                resched_task(curr);
+        while (se) {
+                BUG_ON(!pse);
+                if (wakeup_preempt_entity(se, pse) == 1) {
+                        resched_task(curr);
+                        break;
+                }
+                se = parent_entity(se);
+                pse = parent_entity(pse);
+        }
 }
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1293,6 +1425,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
        do {
                se = pick_next_entity(cfs_rq);
+                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
@@ -1575,9 +1708,6 @@ static const struct sched_class fair_sched_class = {
        .enqueue_task           = enqueue_task_fair,
        .dequeue_task           = dequeue_task_fair,
        .yield_task             = yield_task_fair,
-#ifdef CONFIG_SMP
-        .select_task_rq         = select_task_rq_fair,
-#endif /* CONFIG_SMP */
        .check_preempt_curr     = check_preempt_wakeup,
@@ -1585,6 +1715,8 @@ static const struct sched_class fair_sched_class = {
        .put_prev_task          = put_prev_task_fair,
 #ifdef CONFIG_SMP
+        .select_task_rq         = select_task_rq_fair,
        .load_balance           = load_balance_fair,
        .move_one_task          = move_one_task_fair,
 #endif

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 18fd17172eb..5ad4440f0fc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c
@@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
73		73
74	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;	74	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75		75
		76	static const struct sched_class fair_sched_class;
		77
76	/**************************************************************	78	/**************************************************************
77	* CFS operations on generic schedulable entities:	79	* CFS operations on generic schedulable entities:
78	*/	80	*/
@@ -141,6 +143,49 @@ static inline struct sched_entity parent_entity(struct sched_entity se)
141	return se->parent;	143	return se->parent;
142	}	144	}
143		145
		146	/* return depth at which a sched entity is present in the hierarchy */
		147	static inline int depth_se(struct sched_entity *se)
		148	{
		149	int depth = 0;
		150
		151	for_each_sched_entity(se)
		152	depth++;
		153
		154	return depth;
		155	}
		156
		157	static void
		158	find_matching_se(struct sched_entity **se, struct sched_entity **pse)
		159	{
		160	int se_depth, pse_depth;
		161
		162	/*
		163	* preemption test can be made between sibling entities who are in the
		164	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
		165	* both tasks until we find their ancestors who are siblings of common
		166	* parent.
		167	*/
		168
		169	/* First walk up until both entities are at same depth */
		170	se_depth = depth_se(*se);
		171	pse_depth = depth_se(*pse);
		172
		173	while (se_depth > pse_depth) {
		174	se_depth--;
		175	*se = parent_entity(*se);
		176	}
		177
		178	while (pse_depth > se_depth) {
		179	pse_depth--;
		180	*pse = parent_entity(*pse);
		181	}
		182
		183	while (!is_same_group(*se, *pse)) {
		184	*se = parent_entity(*se);
		185	*pse = parent_entity(*pse);
		186	}
		187	}
		188
144	#else /* CONFIG_FAIR_GROUP_SCHED */	189	#else /* CONFIG_FAIR_GROUP_SCHED */
145		190
146	static inline struct rq rq_of(struct cfs_rq cfs_rq)	191	static inline struct rq rq_of(struct cfs_rq cfs_rq)
@@ -191,6 +236,11 @@ static inline struct sched_entity parent_entity(struct sched_entity se)
191	return NULL;	236	return NULL;
192	}	237	}
193		238
		239	static inline void
		240	find_matching_se(struct sched_entity **se, struct sched_entity **pse)
		241	{
		242	}
		243
194	#endif /* CONFIG_FAIR_GROUP_SCHED */	244	#endif /* CONFIG_FAIR_GROUP_SCHED */
195		245
196		246
@@ -221,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq cfs_rq, struct sched_entity se)
221	return se->vruntime - cfs_rq->min_vruntime;	271	return se->vruntime - cfs_rq->min_vruntime;
222	}	272	}
223		273
		274	static void update_min_vruntime(struct cfs_rq *cfs_rq)
		275	{
		276	u64 vruntime = cfs_rq->min_vruntime;
		277
		278	if (cfs_rq->curr)
		279	vruntime = cfs_rq->curr->vruntime;
		280
		281	if (cfs_rq->rb_leftmost) {
		282	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
		283	struct sched_entity,
		284	run_node);
		285
		286	if (vruntime == cfs_rq->min_vruntime)
		287	vruntime = se->vruntime;
		288	else
		289	vruntime = min_vruntime(vruntime, se->vruntime);
		290	}
		291
		292	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
		293	}
		294
224	/*	295	/*
225	* Enqueue an entity into the rb-tree:	296	* Enqueue an entity into the rb-tree:
226	*/	297	*/
@@ -254,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
254	* Maintain a cache of leftmost tree entries (it is frequently	325	* Maintain a cache of leftmost tree entries (it is frequently
255	* used):	326	* used):
256	*/	327	*/
257	if (leftmost) {	328	if (leftmost)
258	cfs_rq->rb_leftmost = &se->run_node;	329	cfs_rq->rb_leftmost = &se->run_node;
259	/*
260	* maintain cfs_rq->min_vruntime to be a monotonic increasing
261	* value tracking the leftmost vruntime in the tree.
262	*/
263	cfs_rq->min_vruntime =
264	max_vruntime(cfs_rq->min_vruntime, se->vruntime);
265	}
266		330
267	rb_link_node(&se->run_node, parent, link);	331	rb_link_node(&se->run_node, parent, link);
268	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);	332	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,37 +336,25 @@ static void __dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
272	{	336	{
273	if (cfs_rq->rb_leftmost == &se->run_node) {	337	if (cfs_rq->rb_leftmost == &se->run_node) {
274	struct rb_node *next_node;	338	struct rb_node *next_node;
275	struct sched_entity *next;
276		339
277	next_node = rb_next(&se->run_node);	340	next_node = rb_next(&se->run_node);
278	cfs_rq->rb_leftmost = next_node;	341	cfs_rq->rb_leftmost = next_node;
279
280	if (next_node) {
281	next = rb_entry(next_node,
282	struct sched_entity, run_node);
283	cfs_rq->min_vruntime =
284	max_vruntime(cfs_rq->min_vruntime,
285	next->vruntime);
286	}
287	}	342	}
288		343
289	if (cfs_rq->next == se)
290	cfs_rq->next = NULL;
291
292	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);	344	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
293	}	345	}
294		346
295	static inline struct rb_node first_fair(struct cfs_rq cfs_rq)
296	{
297	return cfs_rq->rb_leftmost;
298	}
299
300	static struct sched_entity __pick_next_entity(struct cfs_rq cfs_rq)	347	static struct sched_entity __pick_next_entity(struct cfs_rq cfs_rq)
301	{	348	{
302	return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);	349	struct rb_node *left = cfs_rq->rb_leftmost;
		350
		351	if (!left)
		352	return NULL;
		353
		354	return rb_entry(left, struct sched_entity, run_node);
303	}	355	}
304		356
305	static inline struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)	357	static struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
306	{	358	{
307	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);	359	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
308		360
@@ -334,7 +386,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
334	#endif	386	#endif
335		387
336	/*	388	/*
337	* delta *= w / rw	389	* delta *= P[w / rw]
338	*/	390	*/
339	static inline unsigned long	391	static inline unsigned long
340	calc_delta_weight(unsigned long delta, struct sched_entity *se)	392	calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +400,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se)
348	}	400	}
349		401
350	/*	402	/*
351	* delta *= rw / w	403	* delta /= w
352	*/	404	*/
353	static inline unsigned long	405	static inline unsigned long
354	calc_delta_fair(unsigned long delta, struct sched_entity *se)	406	calc_delta_fair(unsigned long delta, struct sched_entity *se)
355	{	407	{
356	for_each_sched_entity(se) {	408	if (unlikely(se->load.weight != NICE_0_LOAD))
357	delta = calc_delta_mine(delta,	409	delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
358	cfs_rq_of(se)->load.weight, &se->load);
359	}
360		410
361	return delta;	411	return delta;
362	}	412	}
@@ -386,26 +436,26 @@ static u64 __sched_period(unsigned long nr_running)
386	* We calculate the wall-time slice from the period by taking a part	436	* We calculate the wall-time slice from the period by taking a part
387	* proportional to the weight.	437	* proportional to the weight.
388	*	438	*
389	* s = p*w/rw	439	* s = p*P[w/rw]
390	*/	440	*/
391	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)	441	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
392	{	442	{
393	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);	443	unsigned long nr_running = cfs_rq->nr_running;
		444
		445	if (unlikely(!se->on_rq))
		446	nr_running++;
		447
		448	return calc_delta_weight(__sched_period(nr_running), se);
394	}	449	}
395		450
396	/*	451	/*
397	* We calculate the vruntime slice of a to be inserted task	452	* We calculate the vruntime slice of a to be inserted task
398	*	453	*
399	* vs = s*rw/w = p	454	* vs = s/w
400	*/	455	*/
401	static u64 sched_vslice_add(struct cfs_rq cfs_rq, struct sched_entity se)	456	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
402	{	457	{
403	unsigned long nr_running = cfs_rq->nr_running;	458	return calc_delta_fair(sched_slice(cfs_rq, se), se);
404
405	if (!se->on_rq)
406	nr_running++;
407
408	return __sched_period(nr_running);
409	}	459	}
410		460
411	/*	461	/*
@@ -424,6 +474,7 @@ __update_curr(struct cfs_rq cfs_rq, struct sched_entity curr,
424	schedstat_add(cfs_rq, exec_clock, delta_exec);	474	schedstat_add(cfs_rq, exec_clock, delta_exec);
425	delta_exec_weighted = calc_delta_fair(delta_exec, curr);	475	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
426	curr->vruntime += delta_exec_weighted;	476	curr->vruntime += delta_exec_weighted;
		477	update_min_vruntime(cfs_rq);
427	}	478	}
428		479
429	static void update_curr(struct cfs_rq *cfs_rq)	480	static void update_curr(struct cfs_rq *cfs_rq)
@@ -441,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
441	* overflow on 32 bits):	492	* overflow on 32 bits):
442	*/	493	*/
443	delta_exec = (unsigned long)(now - curr->exec_start);	494	delta_exec = (unsigned long)(now - curr->exec_start);
		495	if (!delta_exec)
		496	return;
444		497
445	__update_curr(cfs_rq, curr, delta_exec);	498	__update_curr(cfs_rq, curr, delta_exec);
446	curr->exec_start = now;	499	curr->exec_start = now;
@@ -449,6 +502,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
449	struct task_struct *curtask = task_of(curr);	502	struct task_struct *curtask = task_of(curr);
450		503
451	cpuacct_charge(curtask, delta_exec);	504	cpuacct_charge(curtask, delta_exec);
		505	account_group_exec_runtime(curtask, delta_exec);
452	}	506	}
453	}	507	}
454		508
@@ -612,13 +666,7 @@ static void check_spread(struct cfs_rq cfs_rq, struct sched_entity se)
612	static void	666	static void
613	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)	667	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
614	{	668	{
615	u64 vruntime;	669	u64 vruntime = cfs_rq->min_vruntime;
616
617	if (first_fair(cfs_rq)) {
618	vruntime = min_vruntime(cfs_rq->min_vruntime,
619	__pick_next_entity(cfs_rq)->vruntime);
620	} else
621	vruntime = cfs_rq->min_vruntime;
622		670
623	/*	671	/*
624	* The 'current' period is already promised to the current tasks,	672	* The 'current' period is already promised to the current tasks,
@@ -627,7 +675,7 @@ place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
627	* stays open at the end.	675	* stays open at the end.
628	*/	676	*/
629	if (initial && sched_feat(START_DEBIT))	677	if (initial && sched_feat(START_DEBIT))
630	vruntime += sched_vslice_add(cfs_rq, se);	678	vruntime += sched_vslice(cfs_rq, se);
631		679
632	if (!initial) {	680	if (!initial) {
633	/* sleeps upto a single latency don't count. */	681	/* sleeps upto a single latency don't count. */
@@ -670,6 +718,15 @@ enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int wakeup)
670	__enqueue_entity(cfs_rq, se);	718	__enqueue_entity(cfs_rq, se);
671	}	719	}
672		720
		721	static void clear_buddies(struct cfs_rq cfs_rq, struct sched_entity se)
		722	{
		723	if (cfs_rq->last == se)
		724	cfs_rq->last = NULL;
		725
		726	if (cfs_rq->next == se)
		727	cfs_rq->next = NULL;
		728	}
		729
673	static void	730	static void
674	dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int sleep)	731	dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int sleep)
675	{	732	{
@@ -692,9 +749,12 @@ dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int sleep)
692	#endif	749	#endif
693	}	750	}
694		751
		752	clear_buddies(cfs_rq, se);
		753
695	if (se != cfs_rq->curr)	754	if (se != cfs_rq->curr)
696	__dequeue_entity(cfs_rq, se);	755	__dequeue_entity(cfs_rq, se);
697	account_entity_dequeue(cfs_rq, se);	756	account_entity_dequeue(cfs_rq, se);
		757	update_min_vruntime(cfs_rq);
698	}	758	}
699		759
700	/*	760	/*
@@ -741,29 +801,18 @@ set_next_entity(struct cfs_rq cfs_rq, struct sched_entity se)
741	se->prev_sum_exec_runtime = se->sum_exec_runtime;	801	se->prev_sum_exec_runtime = se->sum_exec_runtime;
742	}	802	}
743		803
744	static struct sched_entity *	804	static int
745	pick_next(struct cfs_rq cfs_rq, struct sched_entity se)	805	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se);
746	{
747	struct rq *rq = rq_of(cfs_rq);
748	u64 pair_slice = rq->clock - cfs_rq->pair_start;
749
750	if (!cfs_rq->next \|\| pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
751	cfs_rq->pair_start = rq->clock;
752	return se;
753	}
754
755	return cfs_rq->next;
756	}
757		806
758	static struct sched_entity pick_next_entity(struct cfs_rq cfs_rq)	807	static struct sched_entity pick_next_entity(struct cfs_rq cfs_rq)
759	{	808	{
760	struct sched_entity *se = NULL;	809	struct sched_entity *se = __pick_next_entity(cfs_rq);
761		810
762	if (first_fair(cfs_rq)) {	811	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
763	se = __pick_next_entity(cfs_rq);	812	return cfs_rq->next;
764	se = pick_next(cfs_rq, se);	813
765	set_next_entity(cfs_rq, se);	814	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
766	}	815	return cfs_rq->last;
767		816
768	return se;	817	return se;
769	}	818	}
@@ -848,11 +897,31 @@ static void hrtick_start_fair(struct rq rq, struct task_struct p)
848	hrtick_start(rq, delta);	897	hrtick_start(rq, delta);
849	}	898	}
850	}	899	}
		900
		901	/*
		902	* called from enqueue/dequeue and updates the hrtick when the
		903	* current task is from our class and nr_running is low enough
		904	* to matter.
		905	*/
		906	static void hrtick_update(struct rq *rq)
		907	{
		908	struct task_struct *curr = rq->curr;
		909
		910	if (curr->sched_class != &fair_sched_class)
		911	return;
		912
		913	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
		914	hrtick_start_fair(rq, curr);
		915	}
851	#else /* !CONFIG_SCHED_HRTICK */	916	#else /* !CONFIG_SCHED_HRTICK */
852	static inline void	917	static inline void
853	hrtick_start_fair(struct rq rq, struct task_struct p)	918	hrtick_start_fair(struct rq rq, struct task_struct p)
854	{	919	{
855	}	920	}
		921
		922	static inline void hrtick_update(struct rq *rq)
		923	{
		924	}
856	#endif	925	#endif
857		926
858	/*	927	/*
@@ -873,7 +942,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
873	wakeup = 1;	942	wakeup = 1;
874	}	943	}
875		944
876	hrtick_start_fair(rq, rq->curr);	945	hrtick_update(rq);
877	}	946	}
878		947
879	/*	948	/*
@@ -895,7 +964,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
895	sleep = 1;	964	sleep = 1;
896	}	965	}
897		966
898	hrtick_start_fair(rq, rq->curr);	967	hrtick_update(rq);
899	}	968	}
900		969
901	/*	970	/*
@@ -915,6 +984,8 @@ static void yield_task_fair(struct rq *rq)
915	if (unlikely(cfs_rq->nr_running == 1))	984	if (unlikely(cfs_rq->nr_running == 1))
916	return;	985	return;
917		986
		987	clear_buddies(cfs_rq, se);
		988
918	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {	989	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
919	update_rq_clock(rq);	990	update_rq_clock(rq);
920	/*	991	/*
@@ -1001,8 +1072,6 @@ static inline int wake_idle(int cpu, struct task_struct *p)
1001		1072
1002	#ifdef CONFIG_SMP	1073	#ifdef CONFIG_SMP
1003		1074
1004	static const struct sched_class fair_sched_class;
1005
1006	#ifdef CONFIG_FAIR_GROUP_SCHED	1075	#ifdef CONFIG_FAIR_GROUP_SCHED
1007	/*	1076	/*
1008	* effective_load() calculates the load change as seen from the root_task_group	1077	* effective_load() calculates the load change as seen from the root_task_group
@@ -1103,10 +1172,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1103	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))	1172	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1104	return 0;	1173	return 0;
1105		1174
1106	if (!sync && sched_feat(SYNC_WAKEUPS) &&	1175	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1107	curr->se.avg_overlap < sysctl_sched_migration_cost &&	1176	p->se.avg_overlap > sysctl_sched_migration_cost))
1108	p->se.avg_overlap < sysctl_sched_migration_cost)	1177	sync = 0;
1109	sync = 1;
1110		1178
1111	/*	1179	/*
1112	* If sync wakeup then subtract the (maximum possible)	1180	* If sync wakeup then subtract the (maximum possible)
@@ -1225,33 +1293,87 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1225	* More easily preempt - nice tasks, while not making it harder for	1293	* More easily preempt - nice tasks, while not making it harder for
1226	* + nice tasks.	1294	* + nice tasks.
1227	*/	1295	*/
1228	if (sched_feat(ASYM_GRAN))	1296	if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
1229	gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);	1297	gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1230		1298
1231	return gran;	1299	return gran;
1232	}	1300	}
1233		1301
1234	/*	1302	/*
		1303	* Should 'se' preempt 'curr'.
		1304	*
		1305	*             |s1
		1306	*        |s2
		1307	*   |s3
		1308	*         g
		1309	*      |<--->|c
		1310	*
		1311	* w(c, s1) = -1
		1312	* w(c, s2) = 0
		1313	* w(c, s3) = 1
		1314	*
		1315	*/
		1316	static int
		1317	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
		1318	{
		1319	s64 gran, vdiff = curr->vruntime - se->vruntime;
		1320
		1321	if (vdiff <= 0)
		1322	return -1;
		1323
		1324	gran = wakeup_gran(curr);
		1325	if (vdiff > gran)
		1326	return 1;
		1327
		1328	return 0;
		1329	}
		1330
		1331	static void set_last_buddy(struct sched_entity *se)
		1332	{
		1333	for_each_sched_entity(se)
		1334	cfs_rq_of(se)->last = se;
		1335	}
		1336
		1337	static void set_next_buddy(struct sched_entity *se)
		1338	{
		1339	for_each_sched_entity(se)
		1340	cfs_rq_of(se)->next = se;
		1341	}
		1342
		1343	/*
1235	* Preempt the current task with a newly woken task if needed:	1344	* Preempt the current task with a newly woken task if needed:
1236	*/	1345	*/
1237	static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)	1346	static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1238	{	1347	{
1239	struct task_struct *curr = rq->curr;	1348	struct task_struct *curr = rq->curr;
1240	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1241	struct sched_entity *se = &curr->se, *pse = &p->se;	1349	struct sched_entity *se = &curr->se, *pse = &p->se;
1242	s64 delta_exec;	1350	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
		1351
		1352	update_curr(cfs_rq);
1243		1353
1244	if (unlikely(rt_prio(p->prio))) {	1354	if (unlikely(rt_prio(p->prio))) {
1245	update_rq_clock(rq);
1246	update_curr(cfs_rq);
1247	resched_task(curr);	1355	resched_task(curr);
1248	return;	1356	return;
1249	}	1357	}
1250		1358
		1359	if (unlikely(p->sched_class != &fair_sched_class))
		1360	return;
		1361
1251	if (unlikely(se == pse))	1362	if (unlikely(se == pse))
1252	return;	1363	return;
1253		1364
1254	cfs_rq_of(pse)->next = pse;	1365	/*
		1366	* Only set the backward buddy when the current task is still on the
		1367	* rq. This can happen when a wakeup gets interleaved with schedule on
		1368	* the ->pre_schedule() or idle_balance() point, either of which can
		1369	* drop the rq lock.
		1370	*
		1371	* Also, during early boot the idle thread is in the fair class, for
		1372	* obvious reasons its a bad idea to schedule back to the idle thread.
		1373	*/
		1374	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
		1375	set_last_buddy(se);
		1376	set_next_buddy(pse);
1255		1377
1256	/*	1378	/*
1257	* We can come here with TIF_NEED_RESCHED already set from new task	1379	* We can come here with TIF_NEED_RESCHED already set from new task
@@ -1277,9 +1399,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1277	return;	1399	return;
1278	}	1400	}
1279		1401
1280	delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;	1402	find_matching_se(&se, &pse);
1281	if (delta_exec > wakeup_gran(pse))	1403
1282	resched_task(curr);	1404	while (se) {
		1405	BUG_ON(!pse);
		1406
		1407	if (wakeup_preempt_entity(se, pse) == 1) {
		1408	resched_task(curr);
		1409	break;
		1410	}
		1411
		1412	se = parent_entity(se);
		1413	pse = parent_entity(pse);
		1414	}
1283	}	1415	}
1284		1416
1285	static struct task_struct *pick_next_task_fair(struct rq *rq)	1417	static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1293,6 +1425,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1293		1425
1294	do {	1426	do {
1295	se = pick_next_entity(cfs_rq);	1427	se = pick_next_entity(cfs_rq);
		1428	set_next_entity(cfs_rq, se);
1296	cfs_rq = group_cfs_rq(se);	1429	cfs_rq = group_cfs_rq(se);
1297	} while (cfs_rq);	1430	} while (cfs_rq);
1298		1431
@@ -1575,9 +1708,6 @@ static const struct sched_class fair_sched_class = {
1575	.enqueue_task = enqueue_task_fair,	1708	.enqueue_task = enqueue_task_fair,
1576	.dequeue_task = dequeue_task_fair,	1709	.dequeue_task = dequeue_task_fair,
1577	.yield_task = yield_task_fair,	1710	.yield_task = yield_task_fair,
1578	#ifdef CONFIG_SMP
1579	.select_task_rq = select_task_rq_fair,
1580	#endif /* CONFIG_SMP */
1581		1711
1582	.check_preempt_curr = check_preempt_wakeup,	1712	.check_preempt_curr = check_preempt_wakeup,
1583		1713
@@ -1585,6 +1715,8 @@ static const struct sched_class fair_sched_class = {
1585	.put_prev_task = put_prev_task_fair,	1715	.put_prev_task = put_prev_task_fair,
1586		1716
1587	#ifdef CONFIG_SMP	1717	#ifdef CONFIG_SMP
		1718	.select_task_rq = select_task_rq_fair,
		1719
1588	.load_balance = load_balance_fair,	1720	.load_balance = load_balance_fair,
1589	.move_one_task = move_one_task_fair,	1721	.move_one_task = move_one_task_fair,
1590	#endif	1722	#endif