author     Linus Torvalds <torvalds@linux-foundation.org>  2008-10-28 12:46:20 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-10-28 12:46:20 -0400
commit     8ca6215502462f564d7bcae2d8dcc825aa95d743 (patch)
tree       1534f8ad77640ab6f6d9471679b6e4c2d11e739c
parent     f8245e91a5121acc435e509aa56cd04d445a74c7 (diff)
parent     4078e359c4688541a0093fde0dff35dc7190c4f5 (diff)
Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: fix documentation reference for sched_min_granularity_ns
  sched: virtual time buddy preemption
  sched: re-instate vruntime based wakeup preemption
  sched: weaken sync hint
  sched: more accurate min_vruntime accounting
  sched: fix a find_busiest_group buglet
  sched: add CONFIG_SMP consistency
-rw-r--r--  Documentation/scheduler/sched-design-CFS.txt |   2
-rw-r--r--  include/linux/sched.h                        |  12
-rw-r--r--  kernel/sched.c                               |   3
-rw-r--r--  kernel/sched_fair.c                          | 169
-rw-r--r--  kernel/sched_idletask.c                      |   5
-rw-r--r--  kernel/sched_rt.c                            |   5
6 files changed, 139 insertions(+), 57 deletions(-)
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 9d8eb553884c..eb471c7a905e 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -92,7 +92,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
 way the previous scheduler had, and has no heuristics whatsoever. There is
 only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
 
-   /proc/sys/kernel/sched_granularity_ns
+   /proc/sys/kernel/sched_min_granularity_ns
 
 which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
 "server" (i.e., good batching) workloads. It defaults to a setting suitable
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8478f334d732..b483f39a7112 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -936,7 +936,6 @@ struct sched_class {
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
 	void (*yield_task) (struct rq *rq);
-	int (*select_task_rq)(struct task_struct *p, int sync);
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
 
@@ -944,6 +943,8 @@ struct sched_class {
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
+	int (*select_task_rq)(struct task_struct *p, int sync);
+
 	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
 			struct rq *busiest, unsigned long max_load_move,
 			struct sched_domain *sd, enum cpu_idle_type idle,
@@ -955,16 +956,17 @@ struct sched_class {
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
 	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
-#endif
 
-	void (*set_curr_task) (struct rq *rq);
-	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
-	void (*task_new) (struct rq *rq, struct task_struct *p);
 	void (*set_cpus_allowed)(struct task_struct *p,
 				 const cpumask_t *newmask);
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
+#endif
+
+	void (*set_curr_task) (struct rq *rq);
+	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+	void (*task_new) (struct rq *rq, struct task_struct *p);
 
 	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
 			       int running);
diff --git a/kernel/sched.c b/kernel/sched.c
index 6625c3c4b10d..e8819bc6f462 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -386,7 +386,6 @@ struct cfs_rq {
 
 	u64 exec_clock;
 	u64 min_vruntime;
-	u64 pair_start;
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
@@ -3344,7 +3343,7 @@ small_imbalance:
 	} else
 		this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (max_load - this_load + 2*busiest_load_per_task >=
+	if (max_load - this_load + busiest_load_per_task >=
 				busiest_load_per_task * imbn) {
 		*imbalance = busiest_load_per_task;
 		return busiest;
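
A toy calculation, outside the kernel, of why the doubled term was a buglet: with the extra busiest_load_per_task the old test could claim that pulling a single task closes the gap even when the gap is smaller than the task itself. All load values below are invented:

#include <stdio.h>

int main(void)
{
        unsigned long max_load = 1024, this_load = 896;  /* gap of 128 */
        unsigned long busiest_load_per_task = 512, imbn = 2;

        int old_test = max_load - this_load + 2 * busiest_load_per_task >=
                       busiest_load_per_task * imbn;
        int new_test = max_load - this_load + busiest_load_per_task >=
                       busiest_load_per_task * imbn;

        /* prints "old test: 1, new test: 0" for these numbers */
        printf("old test: %d, new test: %d\n", old_test, new_test);
        return 0;
}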
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9573c33688b8..ce514afd78ff 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -143,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+	int depth = 0;
+
+	for_each_sched_entity(se)
+		depth++;
+
+	return depth;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+	int se_depth, pse_depth;
+
+	/*
+	 * preemption test can be made between sibling entities who are in the
+	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+	 * both tasks until we find their ancestors who are siblings of common
+	 * parent.
+	 */
+
+	/* First walk up until both entities are at same depth */
+	se_depth = depth_se(*se);
+	pse_depth = depth_se(*pse);
+
+	while (se_depth > pse_depth) {
+		se_depth--;
+		*se = parent_entity(*se);
+	}
+
+	while (pse_depth > se_depth) {
+		pse_depth--;
+		*pse = parent_entity(*pse);
+	}
+
+	while (!is_same_group(*se, *pse)) {
+		*se = parent_entity(*se);
+		*pse = parent_entity(*pse);
+	}
+}
+
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
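
For illustration only (not part of the patch): a standalone sketch of the depth-matching walk above, with a toy parent-pointer hierarchy standing in for real sched entities. The entity names, the two-level tree, and the parent comparison used in place of is_same_group() are all invented:

#include <stdio.h>
#include <stddef.h>

struct ent { const char *name; struct ent *parent; };

static int depth(struct ent *e)
{
        int d = 0;
        for (; e; e = e->parent)
                d++;
        return d;
}

static void find_matching(struct ent **se, struct ent **pse)
{
        int sd = depth(*se), pd = depth(*pse);

        /* first equalize depths, then climb together until siblings */
        while (sd > pd) { sd--; *se = (*se)->parent; }
        while (pd > sd) { pd--; *pse = (*pse)->parent; }
        while ((*se)->parent != (*pse)->parent) {
                *se = (*se)->parent;
                *pse = (*pse)->parent;
        }
}

int main(void)
{
        struct ent root = { "root", NULL };
        struct ent grp  = { "grp", &root };   /* a task group */
        struct ent a    = { "a", &grp };      /* task inside the group */
        struct ent b    = { "b", &root };     /* task at top level */
        struct ent *se = &a, *pse = &b;

        find_matching(&se, &pse);
        printf("compare %s vs %s\n", se->name, pse->name); /* grp vs b */
        return 0;
}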
@@ -193,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return NULL;
 }
 
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 
@@ -223,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return se->vruntime - cfs_rq->min_vruntime;
 }
 
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+	u64 vruntime = cfs_rq->min_vruntime;
+
+	if (cfs_rq->curr)
+		vruntime = cfs_rq->curr->vruntime;
+
+	if (cfs_rq->rb_leftmost) {
+		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+						   struct sched_entity,
+						   run_node);
+
+		if (vruntime == cfs_rq->min_vruntime)
+			vruntime = se->vruntime;
+		else
+			vruntime = min_vruntime(vruntime, se->vruntime);
+	}
+
+	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
 /*
  * Enqueue an entity into the rb-tree:
  */
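
update_min_vruntime() only ever moves min_vruntime forward (the final max_vruntime() guarantees monotonicity), and the min_vruntime()/max_vruntime() helpers it leans on compare through a signed delta so the ordering survives u64 wraparound. A self-contained restatement of that comparison trick, with invented clock values:

#include <stdio.h>
#include <stdint.h>

static uint64_t max_vruntime(uint64_t min_vr, uint64_t vr)
{
        int64_t delta = (int64_t)(vr - min_vr);
        return delta > 0 ? vr : min_vr; /* pick the later virtual time */
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 5;    /* clock about to wrap */
        uint64_t wrapped = 10;                  /* clock after wrapping */

        /* a plain '>' would pick near_wrap; the signed delta picks 10 */
        printf("%llu\n", (unsigned long long)max_vruntime(near_wrap, wrapped));
        return 0;
}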
@@ -256,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * Maintain a cache of leftmost tree entries (it is frequently
 	 * used):
 	 */
-	if (leftmost) {
+	if (leftmost)
 		cfs_rq->rb_leftmost = &se->run_node;
-		/*
-		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-		 * value tracking the leftmost vruntime in the tree.
-		 */
-		cfs_rq->min_vruntime =
-			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
-	}
 
 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -274,18 +336,9 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (cfs_rq->rb_leftmost == &se->run_node) {
 		struct rb_node *next_node;
-		struct sched_entity *next;
 
 		next_node = rb_next(&se->run_node);
 		cfs_rq->rb_leftmost = next_node;
-
-		if (next_node) {
-			next = rb_entry(next_node,
-					struct sched_entity, run_node);
-			cfs_rq->min_vruntime =
-				max_vruntime(cfs_rq->min_vruntime,
-						next->vruntime);
-		}
 	}
 
 	if (cfs_rq->next == se)
@@ -424,6 +477,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
+	update_min_vruntime(cfs_rq);
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -613,13 +667,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
-	u64 vruntime;
-
-	if (first_fair(cfs_rq)) {
-		vruntime = min_vruntime(cfs_rq->min_vruntime,
-				__pick_next_entity(cfs_rq)->vruntime);
-	} else
-		vruntime = cfs_rq->min_vruntime;
+	u64 vruntime = cfs_rq->min_vruntime;
 
 	/*
 	 * The 'current' period is already promised to the current tasks,
@@ -696,6 +744,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
+	update_min_vruntime(cfs_rq);
 }
 
 /*
@@ -742,16 +791,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
 static struct sched_entity *
 pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	struct rq *rq = rq_of(cfs_rq);
-	u64 pair_slice = rq->clock - cfs_rq->pair_start;
-
-	if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) {
-		cfs_rq->pair_start = rq->clock;
+	if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1)
 		return se;
-	}
 
 	return cfs_rq->next;
 }
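
The buddy rule above, restated outside the kernel: the cached "next" buddy keeps the CPU only while the leftmost entity would not win a virtual-time preemption test against it, replacing the old wall-clock pair_slice window. A minimal, self-contained sketch; the vruntime and granularity numbers are invented:

#include <stdio.h>
#include <stdint.h>

struct ent { const char *name; int64_t vruntime; };

/* simplified wakeup_preempt_entity(): 1 means "se preempts curr" */
static int preempts(const struct ent *curr, const struct ent *se,
                    int64_t gran)
{
        int64_t vdiff = curr->vruntime - se->vruntime;
        return vdiff <= 0 ? -1 : (vdiff > gran ? 1 : 0);
}

static const struct ent *pick(const struct ent *next,
                              const struct ent *leftmost, int64_t gran)
{
        if (!next || preempts(next, leftmost, gran) == 1)
                return leftmost;
        return next;            /* buddy is still within its lead */
}

int main(void)
{
        struct ent left = { "leftmost", 100 };
        struct ent buddy = { "buddy", 102 };    /* 2 ahead, gran is 4 */

        printf("picked: %s\n", pick(&buddy, &left, 4)->name); /* buddy */
        buddy.vruntime = 110;                   /* now 10 ahead */
        printf("picked: %s\n", pick(&buddy, &left, 4)->name); /* leftmost */
        return 0;
}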
@@ -1122,10 +1169,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
 		return 0;
 
-	if (!sync && sched_feat(SYNC_WAKEUPS) &&
-	    curr->se.avg_overlap < sysctl_sched_migration_cost &&
-	    p->se.avg_overlap < sysctl_sched_migration_cost)
-		sync = 1;
+	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+			p->se.avg_overlap > sysctl_sched_migration_cost))
+		sync = 0;
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
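
The weakened hint reads more clearly in isolation: wake_affine() no longer promotes wakeups to sync on its own; it only demotes a caller-supplied hint when either task's average overlap exceeds the migration cost. A toy run with made-up values:

#include <stdio.h>

int main(void)
{
        unsigned long migration_cost = 500000;  /* 0.5 ms, made up */
        unsigned long curr_overlap = 800000, p_overlap = 100000;
        int sync = 1;                           /* caller passed a hint */

        if (sync && (curr_overlap > migration_cost ||
                     p_overlap > migration_cost))
                sync = 0;                       /* overlap too long: demote */

        printf("sync after weakening: %d\n", sync);     /* 0 */
        return 0;
}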
@@ -1244,13 +1290,42 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	 * More easily preempt - nice tasks, while not making it harder for
 	 * + nice tasks.
 	 */
-	if (sched_feat(ASYM_GRAN))
-		gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
+	if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
+		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
 
 	return gran;
 }
 
 /*
+ * Should 'se' preempt 'curr'.
+ *
+ *             |s1
+ *        |s2
+ *   |s3
+ *         g
+ *      |<--->|c
+ *
+ *  w(c, s1) = -1
+ *  w(c, s2) =  0
+ *  w(c, s3) =  1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+	s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+	if (vdiff <= 0)
+		return -1;
+
+	gran = wakeup_gran(curr);
+	if (vdiff > gran)
+		return 1;
+
+	return 0;
+}
+
+/*
  * Preempt the current task with a newly woken task if needed:
  */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
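
The w(c, sN) table in the comment above, evaluated with concrete numbers; gran stands in for wakeup_gran() and the vruntime values are arbitrary:

#include <stdio.h>
#include <stdint.h>

static int wakeup_preempt(int64_t curr_vr, int64_t se_vr, int64_t gran)
{
        int64_t vdiff = curr_vr - se_vr;

        if (vdiff <= 0)
                return -1;      /* s1: ahead of c, no preemption */
        if (vdiff > gran)
                return 1;       /* s3: far behind c, preempt */
        return 0;               /* s2: inside the granularity band */
}

int main(void)
{
        int64_t c = 100, gran = 10;

        printf("w(c, s1) = %d\n", wakeup_preempt(c, 120, gran)); /* -1 */
        printf("w(c, s2) = %d\n", wakeup_preempt(c, 95, gran));  /*  0 */
        printf("w(c, s3) = %d\n", wakeup_preempt(c, 80, gran));  /*  1 */
        return 0;
}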
@@ -1258,7 +1333,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	s64 delta_exec;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -1296,9 +1370,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 		return;
 	}
 
-	delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
-	if (delta_exec > wakeup_gran(pse))
-		resched_task(curr);
+	find_matching_se(&se, &pse);
+
+	while (se) {
+		BUG_ON(!pse);
+
+		if (wakeup_preempt_entity(se, pse) == 1) {
+			resched_task(curr);
+			break;
+		}
+
+		se = parent_entity(se);
+		pse = parent_entity(pse);
+	}
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1594,9 +1678,6 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_fair,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_wakeup,
 
@@ -1604,6 +1685,8 @@ static const struct sched_class fair_sched_class = {
 	.put_prev_task		= put_prev_task_fair,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_fair,
+
 	.load_balance		= load_balance_fair,
 	.move_one_task		= move_one_task_fair,
 #endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index dec4ccabe2f5..8a21a2e28c13 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -105,9 +105,6 @@ static const struct sched_class idle_sched_class = {
 
 	/* dequeue is not valid, we print a debug message there: */
 	.dequeue_task		= dequeue_task_idle,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_idle,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_curr_idle,
 
@@ -115,6 +112,8 @@ static const struct sched_class idle_sched_class = {
 	.put_prev_task		= put_prev_task_idle,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_idle,
+
 	.load_balance		= load_balance_idle,
 	.move_one_task		= move_one_task_idle,
 #endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b446dc87494f..d9ba9d5f99d6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1504,9 +1504,6 @@ static const struct sched_class rt_sched_class = {
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_rt,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_curr_rt,
 
@@ -1514,6 +1511,8 @@ static const struct sched_class rt_sched_class = {
 	.put_prev_task		= put_prev_task_rt,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_rt,
+
 	.load_balance		= load_balance_rt,
 	.move_one_task		= move_one_task_rt,
 	.set_cpus_allowed	= set_cpus_allowed_rt,
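
The CONFIG_SMP consistency change repeated across the three classes above has one structural effect: select_task_rq now lives inside the existing #ifdef CONFIG_SMP block, so a uniprocessor build carries no dead callback pointer per class. A compile-time sketch (the macro flip models SMP vs UP; the struct and its fields are illustrative, not the kernel's):

#include <stdio.h>

#define MY_CONFIG_SMP 1                 /* set to 0 to model a UP build */

struct task;                            /* opaque stand-in */

struct sched_class_sketch {
        void (*enqueue_task)(struct task *p);
        void (*check_preempt_curr)(struct task *p, int sync);
#if MY_CONFIG_SMP
        int (*select_task_rq)(struct task *p, int sync); /* SMP only */
#endif
};

int main(void)
{
        /* one fewer function pointer per class when MY_CONFIG_SMP is 0 */
        printf("ops table size: %zu bytes\n",
               sizeof(struct sched_class_sketch));
        return 0;
}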