author		Paul Turner <pjt@google.com>	2012-10-04 07:18:31 -0400
committer	Ingo Molnar <mingo@kernel.org>	2012-10-24 04:27:28 -0400
commit		82958366cfea1a50e7e90907b2d55ae29ed69974 (patch)
tree		ffd8ed3800e17ccc3f051e1ce6c8b25dde7bf906 /kernel/sched
parent		f1b17280efbd21873d1db8631117bdbccbcb39a2 (diff)
sched: Replace update_shares weight distribution with per-entity computation
Now that the machinery is in place to compute contributed load in a
bottom-up fashion, replace the shares distribution code within
update_shares() accordingly.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120823141507.061208672@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
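[Editor's note] For orientation, below is a minimal standalone sketch (not kernel code) of the per-entity shares arithmetic that calc_tg_weight()/calc_cfs_shares() rely on after this patch: the group-wide load average, minus this cfs_rq's last published contribution, plus its live load.weight, forms the denominator used to split tg->shares. Struct layouts, names with the _sketch suffix, and the numbers in main() are illustrative assumptions; only the arithmetic mirrors the functions shown in the diff below.

/*
 * Illustrative sketch of the per-entity shares computation; compile with
 * any C compiler. Types and values are assumptions, not kernel definitions.
 */
#include <stdio.h>
#include <stdint.h>

struct task_group_sketch {
	long    shares;		/* group's configured weight (tg->shares) */
	int64_t load_avg;	/* sum of all cpus' tg_load_contrib */
};

struct cfs_rq_sketch {
	unsigned long load_weight;	/* cfs_rq->load.weight on this cpu */
	int64_t tg_load_contrib;	/* this cpu's last published contribution */
};

/* Mirrors calc_tg_weight(): swap this cpu's stale contribution for its live weight. */
static long calc_tg_weight_sketch(const struct task_group_sketch *tg,
				  const struct cfs_rq_sketch *cfs_rq)
{
	long tg_weight;

	tg_weight  = tg->load_avg;
	tg_weight -= cfs_rq->tg_load_contrib;
	tg_weight += cfs_rq->load_weight;

	return tg_weight;
}

/* Mirrors calc_cfs_shares(): this cpu's slice of the group's shares. */
static long calc_cfs_shares_sketch(const struct task_group_sketch *tg,
				   const struct cfs_rq_sketch *cfs_rq)
{
	long tg_weight = calc_tg_weight_sketch(tg, cfs_rq);
	long shares = tg->shares * cfs_rq->load_weight;

	if (tg_weight)
		shares /= tg_weight;

	return shares;
}

int main(void)
{
	struct task_group_sketch tg = { .shares = 1024, .load_avg = 3072 };
	struct cfs_rq_sketch this_cpu = { .load_weight = 2048, .tg_load_contrib = 1024 };

	/* 2048 of an effective 4096 total => this cpu gets half of tg->shares (512). */
	printf("shares = %ld\n", calc_cfs_shares_sketch(&tg, &this_cpu));
	return 0;
}

The in-kernel calc_cfs_shares() additionally clamps the result to the [MIN_SHARES, tg->shares] range; the sketch omits that for brevity.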
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/debug.c	  8
-rw-r--r--	kernel/sched/fair.c	157
-rw-r--r--	kernel/sched/sched.h	 36
3 files changed, 36 insertions, 165 deletions
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 71b0ea325e93..2cd3c1b4e582 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -218,14 +218,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_SMP
-	SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
-			SPLIT_NS(cfs_rq->load_avg));
-	SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
-			SPLIT_NS(cfs_rq->load_period));
-	SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
-			cfs_rq->load_contribution);
-	SEQ_printf(m, " .%-30s: %d\n", "load_tg",
-			atomic_read(&cfs_rq->tg->load_weight));
 	SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
 			cfs_rq->runnable_load_avg);
 	SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 873c9f5c5796..57fae95eed99 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -658,9 +658,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -680,10 +677,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->vruntime += delta_exec_weighted;
 	update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-	cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -806,72 +799,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 # ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
-					    int global_update)
-{
-	struct task_group *tg = cfs_rq->tg;
-	long load_avg;
-
-	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
-	load_avg -= cfs_rq->load_contribution;
-
-	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
-		atomic_add(load_avg, &tg->load_weight);
-		cfs_rq->load_contribution += load_avg;
-	}
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-	u64 period = sysctl_sched_shares_window;
-	u64 now, delta;
-	unsigned long load = cfs_rq->load.weight;
-
-	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
-		return;
-
-	now = rq_of(cfs_rq)->clock_task;
-	delta = now - cfs_rq->load_stamp;
-
-	/* truncate load history at 4 idle periods */
-	if (cfs_rq->load_stamp > cfs_rq->load_last &&
-	    now - cfs_rq->load_last > 4 * period) {
-		cfs_rq->load_period = 0;
-		cfs_rq->load_avg = 0;
-		delta = period - 1;
-	}
-
-	cfs_rq->load_stamp = now;
-	cfs_rq->load_unacc_exec_time = 0;
-	cfs_rq->load_period += delta;
-	if (load) {
-		cfs_rq->load_last = now;
-		cfs_rq->load_avg += delta * load;
-	}
-
-	/* consider updating load contribution on each fold or truncate */
-	if (global_update || cfs_rq->load_period > period
-	    || !cfs_rq->load_period)
-		update_cfs_rq_load_contribution(cfs_rq, global_update);
-
-	while (cfs_rq->load_period > period) {
-		/*
-		 * Inline assembly required to prevent the compiler
-		 * optimising this loop into a divmod call.
-		 * See __iter_div_u64_rem() for another example of this.
-		 */
-		asm("" : "+rm" (cfs_rq->load_period));
-		cfs_rq->load_period /= 2;
-		cfs_rq->load_avg /= 2;
-	}
-
-	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
-		list_del_leaf_cfs_rq(cfs_rq);
-}
-
 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 {
 	long tg_weight;
@@ -881,8 +809,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 	 * to gain a more accurate current total weight. See
 	 * update_cfs_rq_load_contribution().
 	 */
-	tg_weight = atomic_read(&tg->load_weight);
-	tg_weight -= cfs_rq->load_contribution;
+	tg_weight = atomic64_read(&tg->load_avg);
+	tg_weight -= cfs_rq->tg_load_contrib;
 	tg_weight += cfs_rq->load.weight;
 
 	return tg_weight;
@@ -906,27 +834,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 
 	return shares;
 }
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
-		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq);
-	}
-}
 # else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	return tg->shares;
 }
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
 # endif /* CONFIG_SMP */
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
@@ -944,6 +856,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	account_entity_enqueue(cfs_rq, se);
 }
 
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
 static void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg;
@@ -963,17 +877,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 }
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
@@ -1490,7 +1396,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	update_cfs_load(cfs_rq, 0);
 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
@@ -1587,7 +1492,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
-	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 
 	/*
@@ -1756,11 +1660,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	update_entity_load_avg(curr, 1);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
 
-	/*
-	 * Update share accounting for long-running entities.
-	 */
-	update_entity_shares_tick(cfs_rq);
-
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
@@ -2005,18 +1904,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 	cfs_rq->throttle_count--;
 #ifdef CONFIG_SMP
 	if (!cfs_rq->throttle_count) {
-		u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
-		/* leaving throttled state, advance shares averaging windows */
-		cfs_rq->load_stamp += delta;
-		cfs_rq->load_last += delta;
-
 		/* adjust cfs_rq_clock_task() */
 		cfs_rq->throttled_clock_task_time += rq->clock_task -
 					cfs_rq->throttled_clock_task;
-
-		/* update entity weight now that we are on_rq again */
-		update_cfs_shares(cfs_rq);
 	}
 #endif
 
@@ -2028,11 +1918,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-	/* group is entering throttled state, record last load */
-	if (!cfs_rq->throttle_count) {
-		update_cfs_load(cfs_rq, 0);
+	/* group is entering throttled state, stop time */
+	if (!cfs_rq->throttle_count)
 		cfs_rq->throttled_clock_task = rq->clock_task;
-	}
 	cfs_rq->throttle_count++;
 
 	return 0;
@@ -2630,7 +2518,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
@@ -2692,7 +2579,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
@@ -3755,27 +3641,36 @@ next:
  */
 static int update_shares_cpu(struct task_group *tg, int cpu)
 {
+	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
 	unsigned long flags;
 	struct rq *rq;
 
-	if (!tg->se[cpu])
-		return 0;
-
 	rq = cpu_rq(cpu);
+	se = tg->se[cpu];
 	cfs_rq = tg->cfs_rq[cpu];
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
 
 	update_rq_clock(rq);
-	update_cfs_load(cfs_rq, 1);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
 
-	/*
-	 * We need to update shares after updating tg->load_weight in
-	 * order to adjust the weight of groups with long running tasks.
-	 */
-	update_cfs_shares(cfs_rq);
+	if (se) {
+		update_entity_load_avg(se, 1);
+		/*
+		 * We pivot on our runnable average having decayed to zero for
+		 * list removal. This generally implies that all our children
+		 * have also been removed (modulo rounding error or bandwidth
+		 * control); however, such cases are rare and we can fix these
+		 * at enqueue.
+		 *
+		 * TODO: fix up out-of-order children on enqueue.
+		 */
+		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
+			list_del_leaf_cfs_rq(cfs_rq);
+	} else {
+		update_rq_runnable_avg(rq, rq->nr_running);
+	}
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -5702,10 +5597,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 
 	cfs_rq->tg = tg;
 	cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
-	/* allow initial update_cfs_load() to truncate */
-	cfs_rq->load_stamp = 1;
-#endif
 	init_cfs_rq_runtime(cfs_rq);
 
 	tg->cfs_rq[cpu] = cfs_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d13bce7a44ef..0a75a430ca77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -234,11 +234,21 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
-#endif
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+	/*
+	 * h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
 
@@ -254,28 +264,6 @@ struct cfs_rq {
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
-#ifdef CONFIG_SMP
-	/*
-	 * h_load = weight * f(tg)
-	 *
-	 * Where f(tg) is the recursive weight fraction assigned to
-	 * this group.
-	 */
-	unsigned long h_load;
-
-	/*
-	 * Maintaining per-cpu shares distribution for group scheduling
-	 *
-	 * load_stamp is the last time we updated the load average
-	 * load_last is the last time we updated the load average and saw load
-	 * load_unacc_exec_time is currently unaccounted execution time
-	 */
-	u64 load_avg;
-	u64 load_period;
-	u64 load_stamp, load_last, load_unacc_exec_time;
-
-	unsigned long load_contribution;
-#endif /* CONFIG_SMP */
 #ifdef CONFIG_CFS_BANDWIDTH
 	int runtime_enabled;
 	u64 runtime_expires;