1 files changed, 44 insertions, 129 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index b0d5f1b24a39..e2f1a3024a99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -253,6 +253,8 @@ struct task_group {
        /* runqueue "owned" by this group on each cpu */
        struct cfs_rq **cfs_rq;
        unsigned long shares;
+        atomic_t load_weight;
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -359,15 +361,11 @@ struct cfs_rq {
         */
        unsigned long h_load;
-        /*
+        u64 load_avg;
-         * this cpu's part of tg->shares
+        u64 load_period;
-         */
+        u64 load_stamp;
-        unsigned long shares;
-        /*
+        unsigned long load_contribution;
-         * load.weight at the time we set shares
-         */
-        unsigned long rq_weight;
 #endif
 #endif
 };
@@ -807,20 +805,6 @@ late_initcall(sched_init_debug);
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 /*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
@@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
        lw->inv_weight = 0;
 }
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+        lw->weight = w;
+        lw->inv_weight = 0;
+}
 /*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static __read_mostly unsigned long __percpu *update_shares_data;
+static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-                                    unsigned long sd_shares,
-                                    unsigned long sd_rq_weight,
-                                    unsigned long *usd_rq_weight)
-{
-        unsigned long shares, rq_weight;
-        int boost = 0;
-        rq_weight = usd_rq_weight[cpu];
-        if (!rq_weight) {
-                boost = 1;
-                rq_weight = NICE_0_LOAD;
-        }
-        /*
-         *             \Sum_j shares_j * rq_weight_i
-         * shares_i =  -----------------------------
-         *                  \Sum_j rq_weight_j
-         */
-        shares = (sd_shares * rq_weight) / sd_rq_weight;
-        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-        if (abs(shares - tg->se[cpu]->load.weight) >
-                        sysctl_sched_shares_thresh) {
-                struct rq *rq = cpu_rq(cpu);
-                unsigned long flags;
-                raw_spin_lock_irqsave(&rq->lock, flags);
-                tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-                tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-                __set_se_shares(tg->se[cpu], shares);
-                raw_spin_unlock_irqrestore(&rq->lock, flags);
-        }
-}
 /*
- * Re-compute the task group their per cpu shares over the given domain.
+ * update tg->load_weight by folding this cpu's load_avg
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
 */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-        unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
+        long load_avg;
-        unsigned long *usd_rq_weight;
+        struct cfs_rq *cfs_rq;
-        struct sched_domain *sd = data;
        unsigned long flags;
-        int i;
+        int cpu = (long)data;
+        struct rq *rq;
-        if (!tg->se[0])
+        if (!tg->se[cpu])
                return 0;
-        local_irq_save(flags);
+        rq = cpu_rq(cpu);
-        usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
+        cfs_rq = tg->cfs_rq[cpu];
-        for_each_cpu(i, sched_domain_span(sd)) {
-                weight = tg->cfs_rq[i]->load.weight;
-                usd_rq_weight[i] = weight;
-                rq_weight += weight;
-                /*
-                 * If there are currently no tasks on the cpu pretend there
-                 * is one of average load so that when a new task gets to
-                 * run here it will not get delayed by group starvation.
-                 */
-                if (!weight)
-                        weight = NICE_0_LOAD;
-                sum_weight += weight;
+        raw_spin_lock_irqsave(&rq->lock, flags);
-                shares += tg->cfs_rq[i]->shares;
-        }
-        if (!rq_weight)
+        update_rq_clock(rq);
-                rq_weight = sum_weight;
+        update_cfs_load(cfs_rq);
-        if ((!shares && rq_weight) || shares > tg->shares)
+        load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
-                shares = tg->shares;
+        load_avg -= cfs_rq->load_contribution;
-        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+        atomic_add(load_avg, &tg->load_weight);
-                shares = tg->shares;
+        cfs_rq->load_contribution += load_avg;
-        for_each_cpu(i, sched_domain_span(sd))
+        /*
-                update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
+         * We need to update shares after updating tg->load_weight in
+         * order to adjust the weight of groups with long running tasks.
+         */
+        update_cfs_shares(cfs_rq);
-        local_irq_restore(flags);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
        return 0;
 }
@@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data)
                load = cpu_rq(cpu)->load.weight;
        } else {
                load = tg->parent->cfs_rq[cpu]->h_load;
-                load *= tg->cfs_rq[cpu]->shares;
+                load *= tg->se[cpu]->load.weight;
                load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
        }
@@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data)
        return 0;
 }
-static void update_shares(struct sched_domain *sd)
+static void update_shares(long cpu)
 {
-        s64 elapsed;
-        u64 now;
        if (root_task_group_empty())
                return;
-        now = local_clock();
+        /*
-        elapsed = now - sd->last_update;
+         * XXX: replace with an on-demand list
+         */
-        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
+        walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
-                sd->last_update = now;
-                walk_tg_tree(tg_nop, tg_shares_up, sd);
-        }
 }
 static void update_h_load(long cpu)
@@ -1699,7 +1631,7 @@ static void update_h_load(long cpu)
 #else
-static inline void update_shares(struct sched_domain *sd)
+static inline void update_shares(int cpu)
 {
 }
@@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 #endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
-        cfs_rq->shares = shares;
-#endif
-}
-#endif
 static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
@@ -5551,7 +5474,6 @@ static void update_sysctl(void)
        SET_SYSCTL(sched_min_granularity);
        SET_SYSCTL(sched_latency);
        SET_SYSCTL(sched_wakeup_granularity);
-        SET_SYSCTL(sched_shares_ratelimit);
 #undef SET_SYSCTL
 }
@@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                se->cfs_rq = parent->my_q;
        se->my_q = cfs_rq;
-        se->load.weight = tg->shares;
+        update_load_set(&se->load, tg->shares);
-        se->load.inv_weight = 0;
        se->parent = parent;
 }
 #endif
@@ -7881,10 +7802,6 @@ void __init sched_init(void)
 #endif /* CONFIG_CGROUP_SCHED */
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-        update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-                                            __alignof__(unsigned long));
-#endif
        for_each_possible_cpu(i) {
                struct rq *rq;
@@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
        if (on_rq)
                dequeue_entity(cfs_rq, se, 0);
-        se->load.weight = shares;
+        update_load_set(&se->load, shares);
-        se->load.inv_weight = 0;
        if (on_rq)
                enqueue_entity(cfs_rq, se, 0);
@@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
                /*
                 * force a rebalance
                 */
-                cfs_rq_set_shares(tg->cfs_rq[i], 0);
                set_se_shares(tg->se[i], shares);
        }

diff --git a/kernel/sched.c b/kernel/sched.c index b0d5f1b24a39..e2f1a3024a99 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -253,6 +253,8 @@ struct task_group {
253	/* runqueue "owned" by this group on each cpu */	253	/* runqueue "owned" by this group on each cpu */
254	struct cfs_rq **cfs_rq;	254	struct cfs_rq **cfs_rq;
255	unsigned long shares;	255	unsigned long shares;
		256
		257	atomic_t load_weight;
256	#endif	258	#endif
257		259
258	#ifdef CONFIG_RT_GROUP_SCHED	260	#ifdef CONFIG_RT_GROUP_SCHED
@@ -359,15 +361,11 @@ struct cfs_rq {
359	*/	361	*/
360	unsigned long h_load;	362	unsigned long h_load;
361		363
362	/*	364	u64 load_avg;
363	* this cpu's part of tg->shares	365	u64 load_period;
364	*/	366	u64 load_stamp;
365	unsigned long shares;
366		367
367	/*	368	unsigned long load_contribution;
368	* load.weight at the time we set shares
369	*/
370	unsigned long rq_weight;
371	#endif	369	#endif
372	#endif	370	#endif
373	};	371	};
@@ -807,20 +805,6 @@ late_initcall(sched_init_debug);
807	const_debug unsigned int sysctl_sched_nr_migrate = 32;	805	const_debug unsigned int sysctl_sched_nr_migrate = 32;
808		806
809	/*	807	/*
810	* ratelimit for updating the group shares.
811	* default: 0.25ms
812	*/
813	unsigned int sysctl_sched_shares_ratelimit = 250000;
814	unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815
816	/*
817	* Inject some fuzzyness into changing the per-cpu group shares
818	* this avoids remote rq-locks at the expense of fairness.
819	* default: 4
820	*/
821	unsigned int sysctl_sched_shares_thresh = 4;
822
823	/*
824	* period over which we average the RT time consumption, measured	808	* period over which we average the RT time consumption, measured
825	* in ms.	809	* in ms.
826	*	810	*
@@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1369	lw->inv_weight = 0;	1353	lw->inv_weight = 0;
1370	}	1354	}
1371		1355
		1356	static inline void update_load_set(struct load_weight *lw, unsigned long w)
		1357	{
		1358	lw->weight = w;
		1359	lw->inv_weight = 0;
		1360	}
		1361
1372	/*	1362	/*
1373	* To aid in avoiding the subversion of "niceness" due to uneven distribution	1363	* To aid in avoiding the subversion of "niceness" due to uneven distribution
1374	* of tasks with abnormal "nice" values across CPUs the contribution that	1364	* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1557		1547
1558	#ifdef CONFIG_FAIR_GROUP_SCHED	1548	#ifdef CONFIG_FAIR_GROUP_SCHED
1559		1549
1560	static __read_mostly unsigned long __percpu *update_shares_data;	1550	static void update_cfs_load(struct cfs_rq *cfs_rq);
1561		1551	static void update_cfs_shares(struct cfs_rq *cfs_rq);
1562	static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1563
1564	/*
1565	* Calculate and set the cpu's group shares.
1566	*/
1567	static void update_group_shares_cpu(struct task_group *tg, int cpu,
1568	unsigned long sd_shares,
1569	unsigned long sd_rq_weight,
1570	unsigned long *usd_rq_weight)
1571	{
1572	unsigned long shares, rq_weight;
1573	int boost = 0;
1574
1575	rq_weight = usd_rq_weight[cpu];
1576	if (!rq_weight) {
1577	boost = 1;
1578	rq_weight = NICE_0_LOAD;
1579	}
1580
1581	/*
1582	* \Sum_j shares_j * rq_weight_i
1583	* shares_i = -----------------------------
1584	* \Sum_j rq_weight_j
1585	*/
1586	shares = (sd_shares * rq_weight) / sd_rq_weight;
1587	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1588
1589	if (abs(shares - tg->se[cpu]->load.weight) >
1590	sysctl_sched_shares_thresh) {
1591	struct rq *rq = cpu_rq(cpu);
1592	unsigned long flags;
1593
1594	raw_spin_lock_irqsave(&rq->lock, flags);
1595	tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1596	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1597	__set_se_shares(tg->se[cpu], shares);
1598	raw_spin_unlock_irqrestore(&rq->lock, flags);
1599	}
1600	}
1601		1552
1602	/*	1553	/*
1603	* Re-compute the task group their per cpu shares over the given domain.	1554	* update tg->load_weight by folding this cpu's load_avg
1604	* This needs to be done in a bottom-up fashion because the rq weight of a
1605	* parent group depends on the shares of its child groups.
1606	*/	1555	*/
1607	static int tg_shares_up(struct task_group *tg, void *data)	1556	static int tg_shares_up(struct task_group *tg, void *data)
1608	{	1557	{
1609	unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;	1558	long load_avg;
1610	unsigned long *usd_rq_weight;	1559	struct cfs_rq *cfs_rq;
1611	struct sched_domain *sd = data;
1612	unsigned long flags;	1560	unsigned long flags;
1613	int i;	1561	int cpu = (long)data;
		1562	struct rq *rq;
1614		1563
1615	if (!tg->se[0])	1564	if (!tg->se[cpu])
1616	return 0;	1565	return 0;
1617		1566
1618	local_irq_save(flags);	1567	rq = cpu_rq(cpu);
1619	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());	1568	cfs_rq = tg->cfs_rq[cpu];
1620
1621	for_each_cpu(i, sched_domain_span(sd)) {
1622	weight = tg->cfs_rq[i]->load.weight;
1623	usd_rq_weight[i] = weight;
1624
1625	rq_weight += weight;
1626	/*
1627	* If there are currently no tasks on the cpu pretend there
1628	* is one of average load so that when a new task gets to
1629	* run here it will not get delayed by group starvation.
1630	*/
1631	if (!weight)
1632	weight = NICE_0_LOAD;
1633		1569
1634	sum_weight += weight;	1570	raw_spin_lock_irqsave(&rq->lock, flags);
1635	shares += tg->cfs_rq[i]->shares;
1636	}
1637		1571
1638	if (!rq_weight)	1572	update_rq_clock(rq);
1639	rq_weight = sum_weight;	1573	update_cfs_load(cfs_rq);
1640		1574
1641	if ((!shares && rq_weight) || shares > tg->shares)	1575	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
1642	shares = tg->shares;	1576	load_avg -= cfs_rq->load_contribution;
1643		1577
1644	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))	1578	atomic_add(load_avg, &tg->load_weight);
1645	shares = tg->shares;	1579	cfs_rq->load_contribution += load_avg;
1646		1580
1647	for_each_cpu(i, sched_domain_span(sd))	1581	/*
1648	update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);	1582	* We need to update shares after updating tg->load_weight in
		1583	* order to adjust the weight of groups with long running tasks.
		1584	*/
		1585	update_cfs_shares(cfs_rq);
1649		1586
1650	local_irq_restore(flags);	1587	raw_spin_unlock_irqrestore(&rq->lock, flags);
1651		1588
1652	return 0;	1589	return 0;
1653	}	1590	}
@@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1666	load = cpu_rq(cpu)->load.weight;	1603	load = cpu_rq(cpu)->load.weight;
1667	} else {	1604	} else {
1668	load = tg->parent->cfs_rq[cpu]->h_load;	1605	load = tg->parent->cfs_rq[cpu]->h_load;
1669	load *= tg->cfs_rq[cpu]->shares;	1606	load *= tg->se[cpu]->load.weight;
1670	load /= tg->parent->cfs_rq[cpu]->load.weight + 1;	1607	load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1671	}	1608	}
1672		1609
@@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data)
1675	return 0;	1612	return 0;
1676	}	1613	}
1677		1614
1678	static void update_shares(struct sched_domain *sd)	1615	static void update_shares(long cpu)
1679	{	1616	{
1680	s64 elapsed;
1681	u64 now;
1682
1683	if (root_task_group_empty())	1617	if (root_task_group_empty())
1684	return;	1618	return;
1685		1619
1686	now = local_clock();	1620	/*
1687	elapsed = now - sd->last_update;	1621	* XXX: replace with an on-demand list
		1622	*/
1688		1623
1689	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {	1624	walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
1690	sd->last_update = now;
1691	walk_tg_tree(tg_nop, tg_shares_up, sd);
1692	}
1693	}	1625	}
1694		1626
1695	static void update_h_load(long cpu)	1627	static void update_h_load(long cpu)
@@ -1699,7 +1631,7 @@ static void update_h_load(long cpu)
1699		1631
1700	#else	1632	#else
1701		1633
1702	static inline void update_shares(struct sched_domain *sd)	1634	static inline void update_shares(int cpu)
1703	{	1635	{
1704	}	1636	}
1705		1637
@@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1824		1756
1825	#endif	1757	#endif
1826		1758
1827	#ifdef CONFIG_FAIR_GROUP_SCHED
1828	static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1829	{
1830	#ifdef CONFIG_SMP
1831	cfs_rq->shares = shares;
1832	#endif
1833	}
1834	#endif
1835
1836	static void calc_load_account_idle(struct rq *this_rq);	1759	static void calc_load_account_idle(struct rq *this_rq);
1837	static void update_sysctl(void);	1760	static void update_sysctl(void);
1838	static int get_update_sysctl_factor(void);	1761	static int get_update_sysctl_factor(void);
@@ -5551,7 +5474,6 @@ static void update_sysctl(void)
5551	SET_SYSCTL(sched_min_granularity);	5474	SET_SYSCTL(sched_min_granularity);
5552	SET_SYSCTL(sched_latency);	5475	SET_SYSCTL(sched_latency);
5553	SET_SYSCTL(sched_wakeup_granularity);	5476	SET_SYSCTL(sched_wakeup_granularity);
5554	SET_SYSCTL(sched_shares_ratelimit);
5555	#undef SET_SYSCTL	5477	#undef SET_SYSCTL
5556	}	5478	}
5557		5479
@@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7787	se->cfs_rq = parent->my_q;	7709	se->cfs_rq = parent->my_q;
7788		7710
7789	se->my_q = cfs_rq;	7711	se->my_q = cfs_rq;
7790	se->load.weight = tg->shares;	7712	update_load_set(&se->load, tg->shares);
7791	se->load.inv_weight = 0;
7792	se->parent = parent;	7713	se->parent = parent;
7793	}	7714	}
7794	#endif	7715	#endif
@@ -7881,10 +7802,6 @@ void __init sched_init(void)
7881		7802
7882	#endif /* CONFIG_CGROUP_SCHED */	7803	#endif /* CONFIG_CGROUP_SCHED */
7883		7804
7884	#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7885	update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7886	__alignof__(unsigned long));
7887	#endif
7888	for_each_possible_cpu(i) {	7805	for_each_possible_cpu(i) {
7889	struct rq *rq;	7806	struct rq *rq;
7890		7807
@@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8452	if (on_rq)	8369	if (on_rq)
8453	dequeue_entity(cfs_rq, se, 0);	8370	dequeue_entity(cfs_rq, se, 0);
8454		8371
8455	se->load.weight = shares;	8372	update_load_set(&se->load, shares);
8456	se->load.inv_weight = 0;
8457		8373
8458	if (on_rq)	8374	if (on_rq)
8459	enqueue_entity(cfs_rq, se, 0);	8375	enqueue_entity(cfs_rq, se, 0);
@@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8510	/*	8426	/*
8511	* force a rebalance	8427	* force a rebalance
8512	*/	8428	*/
8513	cfs_rq_set_shares(tg->cfs_rq[i], 0);
8514	set_se_shares(tg->se[i], shares);	8429	set_se_shares(tg->se[i], shares);
8515	}	8430	}
8516		8431