Diffstat (limited to 'kernel/sched_fair.c')
 -rw-r--r--  kernel/sched_fair.c | 305
 1 file changed, 248 insertions(+), 57 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd768667..c88671718bc 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
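
[Illustrative aside, not part of the patch] The new tunable is expressed in nanoseconds, so 10000000UL is the 10 msec mentioned in the comment. As a rough picture of what "exponential sliding window" means here, the stand-alone user-space sketch below accumulates weight * time and halves both the covered period and the accumulated sum whenever a full window has elapsed, so older windows contribute with geometrically decaying weight. The struct, function names and sample numbers are made up for the example; only the fold-and-halve and avg/(period+1) ideas mirror the update_cfs_load()/update_cfs_rq_load_contribution() code added later in this patch.

/* toy_window.c - sketch of the exponential sliding window (illustration only) */
#include <stdio.h>
#include <stdint.h>

#define WINDOW_NS 10000000ULL		/* mirrors the 10 msec default */

struct toy_avg {
	uint64_t period;		/* time covered by 'avg', in ns */
	uint64_t avg;			/* sum of weight * delta over 'period' */
};

/* Account 'delta' ns during which the queue carried 'load' weight. */
static void toy_account(struct toy_avg *a, uint64_t delta, unsigned long load)
{
	a->period += delta;
	a->avg += delta * load;

	/*
	 * Fold completed windows: halving both keeps avg/period stable
	 * while weighting older history by 1/2, 1/4, 1/8, ...
	 */
	while (a->period > WINDOW_NS) {
		a->period /= 2;
		a->avg /= 2;
	}
}

int main(void)
{
	struct toy_avg a = { 0, 0 };
	int i;

	/* 5 ms at weight 1024, then 20 ms idle (weight 0). */
	toy_account(&a, 5000000ULL, 1024);
	for (i = 0; i < 4; i++)
		toy_account(&a, 5000000ULL, 0);

	printf("effective load ~ %llu\n",
	       (unsigned long long)(a.avg / (a.period + 1)));
	return 0;
}
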
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 	return cfs_rq->tg->cfs_rq[this_cpu];
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_rq->on_list) {
+		/*
+		 * Ensure we either appear before our parent (if already
+		 * enqueued) or force our parent to appear after us when it is
+		 * enqueued. The fact that we always enqueue bottom-up
+		 * reduces this to two cases.
+		 */
+		if (cfs_rq->tg->parent &&
+		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+		} else {
+			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+		}
+
+		cfs_rq->on_list = 1;
+	}
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->on_list) {
+		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+		cfs_rq->on_list = 0;
+	}
+}
+
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 	return &cpu_rq(this_cpu)->cfs;
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 	WRT_SYSCTL(sched_min_granularity);
 	WRT_SYSCTL(sched_latency);
 	WRT_SYSCTL(sched_wakeup_granularity);
-	WRT_SYSCTL(sched_shares_ratelimit);
 #undef WRT_SYSCTL
 
 	return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -514,6 +561,14 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->vruntime += delta_exec_weighted;
 	update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->load_unacc_exec_time += delta_exec;
+	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +688,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_add(&se->group_node, &cfs_rq->tasks);
 	}
 	cfs_rq->nr_running++;
-	se->on_rq = 1;
 }
 
 static void
@@ -647,9 +701,124 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_del_init(&se->group_node);
 	}
 	cfs_rq->nr_running--;
-	se->on_rq = 0;
 }
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+					    int global_update)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long load_avg;
+
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
+
+	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+		atomic_add(load_avg, &tg->load_weight);
+		cfs_rq->load_contribution += load_avg;
+	}
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+	u64 period = sysctl_sched_shares_window;
+	u64 now, delta;
+	unsigned long load = cfs_rq->load.weight;
+
+	if (!cfs_rq)
+		return;
+
+	now = rq_of(cfs_rq)->clock;
+	delta = now - cfs_rq->load_stamp;
+
+	/* truncate load history at 4 idle periods */
+	if (cfs_rq->load_stamp > cfs_rq->load_last &&
+	    now - cfs_rq->load_last > 4 * period) {
+		cfs_rq->load_period = 0;
+		cfs_rq->load_avg = 0;
+	}
+
+	cfs_rq->load_stamp = now;
+	cfs_rq->load_unacc_exec_time = 0;
+	cfs_rq->load_period += delta;
+	if (load) {
+		cfs_rq->load_last = now;
+		cfs_rq->load_avg += delta * load;
+	}
+
+	/* consider updating load contribution on each fold or truncate */
+	if (global_update || cfs_rq->load_period > period
+	    || !cfs_rq->load_period)
+		update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+	while (cfs_rq->load_period > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (cfs_rq->load_period));
+		cfs_rq->load_period /= 2;
+		cfs_rq->load_avg /= 2;
+	}
+
+	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+		list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq)
+		account_entity_dequeue(cfs_rq, se);
+
+	update_load_set(&se->load, weight);
+
+	if (se->on_rq)
+		account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	long load_weight, load, shares;
+
+	if (!cfs_rq)
+		return;
+
+	tg = cfs_rq->tg;
+	se = tg->se[cpu_of(rq_of(cfs_rq))];
+	if (!se)
+		return;
+
+	load = cfs_rq->load.weight + weight_delta;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
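
[Illustrative aside, not part of the patch] A worked example of the arithmetic in update_cfs_shares() above may help: the per-cpu group entity's weight is tg->shares scaled by this cpu's portion of the group load, where the group load is approximated by replacing this cpu's stale load_contribution with its current queue weight, and the result is clamped to [MIN_SHARES, tg->shares]. The user-space sketch below mirrors that calculation only; TOY_MIN_SHARES and the sample weights are assumptions for illustration, not values taken from this patch.

/* toy_shares.c - sketch of the update_cfs_shares() arithmetic (illustration only) */
#include <stdio.h>

#define TOY_MIN_SHARES 2	/* assumed lower clamp, not taken from the patch */

static long toy_shares(long tg_shares, long global_weight,
		       long local_contribution, long local_load)
{
	/* Replace this cpu's stale contribution with its current load. */
	long load_weight = global_weight - local_contribution + local_load;
	long shares = tg_shares * local_load;

	if (load_weight)
		shares /= load_weight;

	if (shares < TOY_MIN_SHARES)
		shares = TOY_MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

int main(void)
{
	/* A 1024-share group whose load is split 3:1 across two cpus. */
	printf("cpu0: %ld\n", toy_shares(1024, 4096, 3072, 3072)); /* -> 768 */
	printf("cpu1: %ld\n", toy_shares(1024, 4096, 1024, 1024)); /* -> 256 */
	return 0;
}
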
@@ -771,6 +940,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_cfs_load(cfs_rq, 0);
+	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
 
 	if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +953,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
+	se->on_rq = 1;
+
+	if (cfs_rq->nr_running == 1)
+		list_add_leaf_cfs_rq(cfs_rq);
 }
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1000,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
+	se->on_rq = 0;
+	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq, 0);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1055,6 +1233,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		flags = ENQUEUE_WAKEUP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1071,12 +1256,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight)
 			break;
 		flags |= DEQUEUE_SLEEP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1143,51 +1336,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
  */
-static long effective_load(struct task_group *tg, int cpu,
-		long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
 	if (!tg->parent)
 		return wl;
 
-	/*
-	 * By not taking the decrease of shares on the other cpu into
-	 * account our error leans towards reducing the affine wakeups.
-	 */
-	if (!wl && sched_feat(ASYM_EFF_LOAD))
-		return wl;
-
 	for_each_sched_entity(se) {
 		long S, rw, s, a, b;
-		long more_w;
-
-		/*
-		 * Instead of using this increment, also add the difference
-		 * between when the shares were last updated and now.
-		 */
-		more_w = se->my_q->load.weight - se->my_q->rq_weight;
-		wl += more_w;
-		wg += more_w;
 
 		S = se->my_q->tg->shares;
-		s = se->my_q->shares;
-		rw = se->my_q->rq_weight;
+		s = se->load.weight;
+		rw = se->my_q->load.weight;
 
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
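
[Illustrative aside, not part of the patch] The retained header comment ("Assuming the shares were perfectly aligned one can calculate the shift in shares") can be made concrete with hypothetical numbers. The hunk above only shows effective_load() setting up a = S*(rw + wl) and b = S*rw + s*wg; the remainder of the function (not shown in this hunk) uses them to estimate the shift computed directly below. All names and values in the sketch are invented for illustration.

/* toy_effective_load.c - aligned-shares shift example (illustration only) */
#include <stdio.h>

int main(void)
{
	long S = 1024;			/* tg->shares */
	long W = 4096;			/* total group runqueue weight (assumed) */
	long rw = 2048;			/* this cpu's runqueue weight (assumed) */
	long wl = 1024, wg = 1024;	/* weight added locally and group-wide */

	/* Perfectly aligned share of this cpu, before and after the wakeup. */
	long before = S * rw / W;
	long after  = S * (rw + wl) / (W + wg);

	printf("aligned share: %ld -> %ld (delta %ld)\n",
	       before, after, after - before);	/* 512 -> 614 (delta 102) */
	return 0;
}
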
@@ -1508,23 +1670,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		sd = tmp;
 	}
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (sched_feat(LB_SHARES_UPDATE)) {
-		/*
-		 * Pick the largest domain to update shares over
-		 */
-		tmp = sd;
-		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-			tmp = affine_sd;
-
-		if (tmp) {
-			raw_spin_unlock(&rq->lock);
-			update_shares(tmp);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-#endif
-
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
 			return select_idle_sibling(p, cpu);
@@ -1909,6 +2054,48 @@ out:
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (!tg->se[cpu])
+		return 0;
+
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq, 1);
+
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
+	 */
+	update_cfs_shares(cfs_rq, 0);
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return 0;
+}
+
+static void update_shares(int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(cpu);
+
+	rcu_read_lock();
+	for_each_leaf_cfs_rq(rq, cfs_rq)
+		update_shares_cpu(cfs_rq->tg, cpu);
+	rcu_read_unlock();
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
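
[Illustrative aside, not part of the patch] The comment on update_shares_cpu() ("update tg->load_weight by folding this cpu's load_avg") refers to the delta publishing done by update_cfs_rq_load_contribution() earlier in this patch: the global tg->load_weight stays the sum of per-cpu contributions, and each cpu only adds the difference between its current average and what it last contributed. The toy model below shows that folding in isolation; the struct, the sample figures and the forced update are assumptions, and only the avg/(period+1) and contribution/8 expressions mirror the patch.

/* toy_fold.c - sketch of folding per-cpu averages into a group sum (illustration only) */
#include <stdio.h>
#include <stdlib.h>

struct toy_cpu {
	long long load_avg;		/* accumulated weight * time */
	long long load_period;		/* time covered by load_avg */
	long contribution;		/* what we last added to the global sum */
};

static void fold(struct toy_cpu *c, long *global, int force)
{
	long avg = (long)(c->load_avg / (c->load_period + 1));
	long delta = avg - c->contribution;

	/* Skip small deltas unless a global update is forced. */
	if (force || labs(delta) > c->contribution / 8) {
		*global += delta;
		c->contribution += delta;
	}
}

int main(void)
{
	long tg_load_weight = 0;
	struct toy_cpu c0 = { 5000000LL * 1024, 5000000, 0 };
	struct toy_cpu c1 = { 2000000LL * 1024, 8000000, 0 };

	fold(&c0, &tg_load_weight, 1);
	fold(&c1, &tg_load_weight, 1);
	printf("tg load_weight = %ld\n", tg_load_weight);
	return 0;
}
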
@@ -1956,6 +2143,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return max_load_move - rem_load_move;
 }
 #else
+static inline void update_shares(int cpu)
+{
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -3032,7 +3223,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3174,8 +3364,6 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (ld_moved)
-		update_shares(sd);
 	return ld_moved;
 }
 
@@ -3199,6 +3387,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	 */
 	raw_spin_unlock(&this_rq->lock);
 
+	update_shares(this_cpu);
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
@@ -3569,6 +3758,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;
 
+	update_shares(cpu);
+
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;