Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c  302
1 file changed, 213 insertions, 89 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 18fd17172eb6..51aa3e102acb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
+static const struct sched_class fair_sched_class;
+
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
@@ -141,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+	int depth = 0;
+
+	for_each_sched_entity(se)
+		depth++;
+
+	return depth;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+	int se_depth, pse_depth;
+
+	/*
+	 * preemption test can be made between sibling entities who are in the
+	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+	 * both tasks until we find their ancestors who are siblings of common
+	 * parent.
+	 */
+
+	/* First walk up until both entities are at same depth */
+	se_depth = depth_se(*se);
+	pse_depth = depth_se(*pse);
+
+	while (se_depth > pse_depth) {
+		se_depth--;
+		*se = parent_entity(*se);
+	}
+
+	while (pse_depth > se_depth) {
+		pse_depth--;
+		*pse = parent_entity(*pse);
+	}
+
+	while (!is_same_group(*se, *pse)) {
+		*se = parent_entity(*se);
+		*pse = parent_entity(*pse);
+	}
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -191,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return NULL;
 }
 
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 
@@ -221,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return se->vruntime - cfs_rq->min_vruntime;
 }
 
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+	u64 vruntime = cfs_rq->min_vruntime;
+
+	if (cfs_rq->curr)
+		vruntime = cfs_rq->curr->vruntime;
+
+	if (cfs_rq->rb_leftmost) {
+		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+						   struct sched_entity,
+						   run_node);
+
+		if (vruntime == cfs_rq->min_vruntime)
+			vruntime = se->vruntime;
+		else
+			vruntime = min_vruntime(vruntime, se->vruntime);
+	}
+
+	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
 /*
  * Enqueue an entity into the rb-tree:
  */
@@ -254,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * Maintain a cache of leftmost tree entries (it is frequently
 	 * used):
 	 */
-	if (leftmost) {
+	if (leftmost)
 		cfs_rq->rb_leftmost = &se->run_node;
-		/*
-		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-		 * value tracking the leftmost vruntime in the tree.
-		 */
-		cfs_rq->min_vruntime =
-			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
-	}
 
 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,37 +336,25 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (cfs_rq->rb_leftmost == &se->run_node) {
 		struct rb_node *next_node;
-		struct sched_entity *next;
 
 		next_node = rb_next(&se->run_node);
 		cfs_rq->rb_leftmost = next_node;
-
-		if (next_node) {
-			next = rb_entry(next_node,
-					struct sched_entity, run_node);
-			cfs_rq->min_vruntime =
-				max_vruntime(cfs_rq->min_vruntime,
-					     next->vruntime);
-		}
 	}
 
-	if (cfs_rq->next == se)
-		cfs_rq->next = NULL;
-
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
-static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
-{
-	return cfs_rq->rb_leftmost;
-}
-
 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 {
-	return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
+	struct rb_node *left = cfs_rq->rb_leftmost;
+
+	if (!left)
+		return NULL;
+
+	return rb_entry(left, struct sched_entity, run_node);
 }
 
-static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 
@@ -334,7 +386,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 
 /*
- * delta *= w / rw
+ * delta *= P[w / rw]
  */
 static inline unsigned long
 calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +400,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se)
 }
 
 /*
- * delta *= rw / w
+ * delta /= w
  */
 static inline unsigned long
 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 {
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, &se->load);
-	}
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -386,26 +436,26 @@ static u64 __sched_period(unsigned long nr_running)
  * We calculate the wall-time slice from the period by taking a part
  * proportional to the weight.
  *
- * s = p*w/rw
+ * s = p*P[w/rw]
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+	unsigned long nr_running = cfs_rq->nr_running;
+
+	if (unlikely(!se->on_rq))
+		nr_running++;
+
+	return calc_delta_weight(__sched_period(nr_running), se);
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s*rw/w = p
+ * vs = s/w
  */
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	unsigned long nr_running = cfs_rq->nr_running;
-
-	if (!se->on_rq)
-		nr_running++;
-
-	return __sched_period(nr_running);
+	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
 /*
@@ -424,6 +474,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
+	update_min_vruntime(cfs_rq);
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -449,6 +500,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		struct task_struct *curtask = task_of(curr);
 
 		cpuacct_charge(curtask, delta_exec);
+		account_group_exec_runtime(curtask, delta_exec);
 	}
 }
 
@@ -612,13 +664,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
-	u64 vruntime;
-
-	if (first_fair(cfs_rq)) {
-		vruntime = min_vruntime(cfs_rq->min_vruntime,
-				__pick_next_entity(cfs_rq)->vruntime);
-	} else
-		vruntime = cfs_rq->min_vruntime;
+	u64 vruntime = cfs_rq->min_vruntime;
 
 	/*
 	 * The 'current' period is already promised to the current tasks,
@@ -627,7 +673,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	 * stays open at the end.
 	 */
 	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice_add(cfs_rq, se);
+		vruntime += sched_vslice(cfs_rq, se);
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
@@ -692,9 +738,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 #endif
 	}
 
+	if (cfs_rq->last == se)
+		cfs_rq->last = NULL;
+
+	if (cfs_rq->next == se)
+		cfs_rq->next = NULL;
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
+	update_min_vruntime(cfs_rq);
 }
 
 /*
@@ -741,29 +794,18 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
-static struct sched_entity *
-pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	struct rq *rq = rq_of(cfs_rq);
-	u64 pair_slice = rq->clock - cfs_rq->pair_start;
-
-	if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
-		cfs_rq->pair_start = rq->clock;
-		return se;
-	}
-
-	return cfs_rq->next;
-}
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = NULL;
+	struct sched_entity *se = __pick_next_entity(cfs_rq);
 
-	if (first_fair(cfs_rq)) {
-		se = __pick_next_entity(cfs_rq);
-		se = pick_next(cfs_rq, se);
-		set_next_entity(cfs_rq, se);
-	}
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
+		return cfs_rq->next;
+
+	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
+		return cfs_rq->last;
 
 	return se;
 }
@@ -848,11 +890,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 		hrtick_start(rq, delta);
 	}
 }
+
+/*
+ * called from enqueue/dequeue and updates the hrtick when the
+ * current task is from our class and nr_running is low enough
+ * to matter.
+ */
+static void hrtick_update(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+
+	if (curr->sched_class != &fair_sched_class)
+		return;
+
+	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+		hrtick_start_fair(rq, curr);
+}
 #else /* !CONFIG_SCHED_HRTICK */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
 }
+
+static inline void hrtick_update(struct rq *rq)
+{
+}
 #endif
 
 /*
@@ -873,7 +935,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 		wakeup = 1;
 	}
 
-	hrtick_start_fair(rq, rq->curr);
+	hrtick_update(rq);
 }
 
 /*
@@ -895,7 +957,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 		sleep = 1;
 	}
 
-	hrtick_start_fair(rq, rq->curr);
+	hrtick_update(rq);
 }
 
 /*
@@ -1001,8 +1063,6 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_SMP
 
-static const struct sched_class fair_sched_class;
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * effective_load() calculates the load change as seen from the root_task_group
@@ -1103,10 +1163,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
 		return 0;
 
-	if (!sync && sched_feat(SYNC_WAKEUPS) &&
-	    curr->se.avg_overlap < sysctl_sched_migration_cost &&
-	    p->se.avg_overlap < sysctl_sched_migration_cost)
-		sync = 1;
+	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+			p->se.avg_overlap > sysctl_sched_migration_cost))
+		sync = 0;
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
@@ -1225,33 +1284,88 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	 * More easily preempt - nice tasks, while not making it harder for
 	 * + nice tasks.
 	 */
-	if (sched_feat(ASYM_GRAN))
-		gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
+	if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
+		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
 
 	return gran;
 }
 
 /*
+ * Should 'se' preempt 'curr'.
+ *
+ *             |s1
+ *        |s2
+ *   |s3
+ *         g
+ *      |<--->|c
+ *
+ *  w(c, s1) = -1
+ *  w(c, s2) =  0
+ *  w(c, s3) =  1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+	s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+	if (vdiff <= 0)
+		return -1;
+
+	gran = wakeup_gran(curr);
+	if (vdiff > gran)
+		return 1;
+
+	return 0;
+}
+
+static void set_last_buddy(struct sched_entity *se)
+{
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->last = se;
+}
+
+static void set_next_buddy(struct sched_entity *se)
+{
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->next = se;
+}
+
+/*
  * Preempt the current task with a newly woken task if needed:
  */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
 	struct task_struct *curr = rq->curr;
-	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	s64 delta_exec;
 
 	if (unlikely(rt_prio(p->prio))) {
+		struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+
 		update_rq_clock(rq);
 		update_curr(cfs_rq);
 		resched_task(curr);
 		return;
 	}
 
+	if (unlikely(p->sched_class != &fair_sched_class))
+		return;
+
 	if (unlikely(se == pse))
 		return;
 
-	cfs_rq_of(pse)->next = pse;
+	/*
+	 * Only set the backward buddy when the current task is still on the
+	 * rq. This can happen when a wakeup gets interleaved with schedule on
+	 * the ->pre_schedule() or idle_balance() point, either of which can
+	 * drop the rq lock.
+	 *
+	 * Also, during early boot the idle thread is in the fair class, for
+	 * obvious reasons its a bad idea to schedule back to the idle thread.
+	 */
+	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+		set_last_buddy(se);
+	set_next_buddy(pse);
 
 	/*
 	 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1277,9 +1391,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 		return;
 	}
 
-	delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
-	if (delta_exec > wakeup_gran(pse))
-		resched_task(curr);
+	find_matching_se(&se, &pse);
+
+	while (se) {
+		BUG_ON(!pse);
+
+		if (wakeup_preempt_entity(se, pse) == 1) {
+			resched_task(curr);
+			break;
+		}
+
+		se = parent_entity(se);
+		pse = parent_entity(pse);
+	}
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1293,6 +1417,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 
 	do {
 		se = pick_next_entity(cfs_rq);
+		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
@@ -1575,9 +1700,6 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_fair,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_wakeup,
 
@@ -1585,6 +1707,8 @@ static const struct sched_class fair_sched_class = {
 	.put_prev_task		= put_prev_task_fair,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_fair,
+
 	.load_balance		= load_balance_fair,
 	.move_one_task		= move_one_task_fair,
 #endif