author     Linus Torvalds <torvalds@linux-foundation.org>  2011-12-05 19:50:24 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-12-05 19:50:24 -0500
commit     7125faceabe43067293d0c9e2ef7154ecea51721 (patch)
tree       9de559e9280843d02d9a70976fbad23428161189 /kernel
parent     35337c834124d2893b7fe4ba683c7639e6c37e0c (diff)
parent     4cecf6d401a01d054afc1e5f605bcbfe553cb9b9 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched, x86: Avoid unnecessary overflow in sched_clock
  sched: Fix buglet in return_cfs_rq_runtime()
  sched: Avoid SMT siblings in select_idle_sibling() if possible
  sched: Set the command name of the idle tasks in SMP kernels
  sched, rt: Provide means of disabling cross-cpu bandwidth sharing
  sched: Document wait_for_completion_*() return values
  sched_fair: Fix a typo in the comment describing update_sd_lb_stats
  sched: Add a comment to effective_load() since it's a pain
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched.c            17
-rw-r--r--  kernel/sched_fair.c      159
-rw-r--r--  kernel/sched_features.h    1
-rw-r--r--  kernel/sched_rt.c          3
4 files changed, 146 insertions(+), 34 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e9344a71be3..d6b149ccf925 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
 #include <linux/slab.h>
+#include <linux/init_task.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);
  * This waits for either a completion of a specific task to be signaled or for a
  * specified timeout to expire. The timeout is in jiffies. It is not
  * interruptible.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
  */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
  *
  * This waits for completion of a specific task to be signaled. It is
  * interruptible.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
  */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
@@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
  *
  * This waits for either a completion of a specific task to be signaled or for a
  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
  */
 long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
  *
  * This waits to be signaled for completion of a specific task. It can be
  * interrupted by a kill signal.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
  */
 int __sched wait_for_completion_killable(struct completion *x)
 {
@@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
  * This waits for either a completion of a specific task to be
  * signaled or for a specified timeout to expire. It can be
  * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
  */
 long __sched
 wait_for_completion_killable_timeout(struct completion *x,
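The kernel-doc additions above only spell out the existing return-value conventions; no behaviour changes. As a reader aid, here is a minimal caller sketch showing how the documented values are typically handled -- the my_dev structure, its done field and the 100 ms timeout are invented for illustration and are not part of the patch:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

struct my_dev {
        struct completion done;         /* completed from this device's IRQ handler */
};

static int my_wait_for_hw(struct my_dev *dev)
{
        unsigned long left;

        left = wait_for_completion_timeout(&dev->done, msecs_to_jiffies(100));
        if (!left)
                return -ETIMEDOUT;      /* 0: the timeout expired */

        return 0;                       /* > 0: completed, 'left' jiffies remained */
}

The _interruptible and _killable variants follow the same pattern, with a negative return (-ERESTARTSYS) usually propagated so the pending signal can be acted upon.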
@@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         */
        idle->sched_class = &idle_sched_class;
        ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+       sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
 }
 
 /*
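With INIT_TASK_COMM being "swapper", this names each idle task after its CPU, so on SMP kernels ps/top report "swapper/0", "swapper/1", ... instead of an indistinguishable "swapper" per CPU. A trivial userspace illustration of the resulting name format (not kernel code; the CPU number is arbitrary):

#include <stdio.h>

int main(void)
{
        char comm[16];  /* same size as the kernel's TASK_COMM_LEN */
        int cpu = 3;

        /* mirrors sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu) */
        snprintf(comm, sizeof(comm), "%s/%d", "swapper", cpu);
        printf("%s\n", comm);   /* prints "swapper/3" */
        return 0;
}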
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c9e67923b7c..a78ed2736ba7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
        list_del_leaf_cfs_rq(cfs_rq);
 }
 
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+       long tg_weight;
+
+       /*
+        * Use this CPU's actual weight instead of the last load_contribution
+        * to gain a more accurate current total weight. See
+        * update_cfs_rq_load_contribution().
+        */
+       tg_weight = atomic_read(&tg->load_weight);
+       tg_weight -= cfs_rq->load_contribution;
+       tg_weight += cfs_rq->load.weight;
+
+       return tg_weight;
+}
+
 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
-       long load_weight, load, shares;
+       long tg_weight, load, shares;
 
+       tg_weight = calc_tg_weight(tg, cfs_rq);
        load = cfs_rq->load.weight;
 
-       load_weight = atomic_read(&tg->load_weight);
-       load_weight += load;
-       load_weight -= cfs_rq->load_contribution;
-
        shares = (tg->shares * load);
-       if (load_weight)
-               shares /= load_weight;
+       if (tg_weight)
+               shares /= tg_weight;
 
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
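The new calc_tg_weight() helper swaps this cfs_rq's possibly stale load_contribution out of the group-wide total and adds its current load.weight back in; calc_cfs_shares() then gives this runqueue a proportional slice of tg->shares. A userspace sketch of that arithmetic with invented numbers (the real code additionally clamps the result, as the MIN_SHARES check above shows):

#include <stdio.h>

int main(void)
{
        long tg_shares = 1024;          /* tg->shares */
        long tg_load_weight = 3072;     /* atomic_read(&tg->load_weight), slightly stale */
        long load_contribution = 1024;  /* this cfs_rq's last published contribution */
        long load = 2048;               /* this cfs_rq's current load.weight */

        /* calc_tg_weight(): replace the stale contribution with the current weight */
        long tg_weight = tg_load_weight - load_contribution + load;

        /* calc_cfs_shares(): this runqueue's proportional slice of tg->shares */
        long shares = tg_shares * load;
        if (tg_weight)
                shares /= tg_weight;

        printf("tg_weight=%ld shares=%ld\n", tg_weight, shares);       /* 4096, 512 */
        return 0;
}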
@@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
-       if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+       if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
                return;
 
        __return_cfs_rq_runtime(cfs_rq);
@@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ *   s_i = rw_i / \Sum rw_j                                            (1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ *   rw_i = {   2,   4,   1,   0 }
+ *   s_i  = { 2/7, 4/7, 1/7,   0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wl we can compute the new shares distribution (s'_i) using:
+ *
+ *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                           (2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ *   rw'_i = {   3,   4,   1,   0 }
+ *   s'_i  = { 3/8, 4/8, 1/8,   0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ *   dw_i = S * (s'_i - s_i)                                           (3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
  */
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        struct sched_entity *se = tg->se[cpu];
 
-       if (!tg->parent)
+       if (!tg->parent)        /* the trivial, non-cgroup case */
                return wl;
 
        for_each_sched_entity(se) {
-               long lw, w;
+               long w, W;
 
                tg = se->my_q->tg;
-               w = se->my_q->load.weight;
 
-               /* use this cpu's instantaneous contribution */
-               lw = atomic_read(&tg->load_weight);
-               lw -= se->my_q->load_contribution;
-               lw += w + wg;
+               /*
+                * W = @wg + \Sum rw_j
+                */
+               W = wg + calc_tg_weight(tg, se->my_q);
 
-               wl += w;
+               /*
+                * w = rw_i + @wl
+                */
+               w = se->my_q->load.weight + wl;
 
-               if (lw > 0 && wl < lw)
-                       wl = (wl * tg->shares) / lw;
+               /*
+                * wl = S * s'_i; see (2)
+                */
+               if (W > 0 && w < W)
+                       wl = (w * tg->shares) / W;
                else
                        wl = tg->shares;
 
-               /* zero point is MIN_SHARES */
+               /*
+                * Per the above, wl is the new se->load.weight value; since
+                * those are clipped to [MIN_SHARES, ...) do so now. See
+                * calc_cfs_shares().
+                */
                if (wl < MIN_SHARES)
                        wl = MIN_SHARES;
+
+               /*
+                * wl = dw_i = S * (s'_i - s_i); see (3)
+                */
                wl -= se->load.weight;
+
+               /*
+                * Recursively apply this logic to all parent groups to compute
+                * the final effective load change on the root group. Since
+                * only the @tg group gets extra weight, all parent groups can
+                * only redistribute existing shares. @wl is the shift in shares
+                * resulting from this level per the above.
+                */
                wg = 0;
        }
 
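The figures in the new comment are easy to check numerically: waking one more unit-weight task on CPU 0 moves its share from 2/7 to 3/8 (+5/56 of the group weight) and CPU 1's from 4/7 to 4/8 (-4/56). A small userspace check of equations (1)-(3), assuming unit task weight and reporting dw_i as a fraction of the parent-visible group weight S:

#include <stdio.h>

int main(void)
{
        double rw[4] = { 2, 4, 1, 0 }; /* per-CPU runqueue weights (rw_i) */
        double wl = 1, wg = 1;         /* one task of weight 1 wakes on CPU 0 */
        double S = 1;                  /* report dw_i relative to the group weight */
        double sum = rw[0] + rw[1] + rw[2] + rw[3];

        for (int i = 0; i < 4; i++) {
                double s_old = rw[i] / sum;                              /* (1) */
                double s_new = (rw[i] + (i == 0 ? wl : 0)) / (sum + wg); /* (2) */
                printf("cpu%d: dw = %+.6f\n", i, S * (s_new - s_old));   /* (3) */
        }
        /* prints +0.089286 (5/56) for cpu0 and -0.071429 (-4/56) for cpu1 */
        return 0;
}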
@@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
        int cpu = smp_processor_id();
        int prev_cpu = task_cpu(p);
        struct sched_domain *sd;
-       int i;
+       struct sched_group *sg;
+       int i, smt = 0;
 
        /*
         * If the task is going to be woken-up on this cpu and if it is
@@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
         * Otherwise, iterate the domains and find an elegible idle cpu.
         */
        rcu_read_lock();
+again:
        for_each_domain(target, sd) {
-               if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
-                       break;
+               if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
+                       continue;
 
-               for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
-                       if (idle_cpu(i)) {
-                               target = i;
-                               break;
+               if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
+                       if (!smt) {
+                               smt = 1;
+                               goto again;
                        }
+                       break;
                }
 
-               /*
-                * Lets stop looking for an idle sibling when we reached
-                * the domain that spans the current cpu and prev_cpu.
-                */
-               if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
-                   cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
-                       break;
+               sg = sd->groups;
+               do {
+                       if (!cpumask_intersects(sched_group_cpus(sg),
+                                       tsk_cpus_allowed(p)))
+                               goto next;
+
+                       for_each_cpu(i, sched_group_cpus(sg)) {
+                               if (!idle_cpu(i))
+                                       goto next;
+                       }
+
+                       target = cpumask_first_and(sched_group_cpus(sg),
+                                       tsk_cpus_allowed(p));
+                       goto done;
+next:
+                       sg = sg->next;
+               } while (sg != sd->groups);
        }
+done:
        rcu_read_unlock();
 
        return target;
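The rewritten scan prefers a fully idle core over a merely idle SMT sibling: the first pass skips SD_SHARE_CPUPOWER (SMT) domains and accepts only a sched_group in which every CPU is idle; only if no such group exists is smt set and the walk repeated with sibling threads allowed. A userspace sketch of that preference over a toy two-core, two-thread topology (the arrays and the pick_idle_cpu() helper are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

#define NR_CORES   2
#define NR_THREADS 2    /* SMT siblings per core */

/* idle[core][thread]: toy topology, invented for illustration only */
static bool idle[NR_CORES][NR_THREADS] = {
        { false, true },        /* core 0: one idle thread, but its sibling is busy */
        { true,  true },        /* core 1: fully idle */
};

/* Prefer a CPU on a fully idle core; fall back to any idle sibling thread. */
static int pick_idle_cpu(void)
{
        for (int c = 0; c < NR_CORES; c++) {            /* pass 1: whole idle cores */
                bool all_idle = true;
                for (int t = 0; t < NR_THREADS; t++)
                        all_idle = all_idle && idle[c][t];
                if (all_idle)
                        return c * NR_THREADS;
        }
        for (int c = 0; c < NR_CORES; c++)              /* pass 2: any idle thread */
                for (int t = 0; t < NR_THREADS; t++)
                        if (idle[c][t])
                                return c * NR_THREADS + t;
        return -1;
}

int main(void)
{
        printf("picked cpu %d\n", pick_idle_cpu());     /* prints 2, not 1 */
        return 0;
}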
@@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
 }
 
 /**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index efa0a7b75dde..84802245abd2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)
 SCHED_FEAT(TTWU_QUEUE, 1)
 
 SCHED_FEAT(FORCE_SD_OVERLAP, 0)
+SCHED_FEAT(RT_RUNTIME_SHARE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 056cbd2e2a27..583a1368afe6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
 {
        int more = 0;
 
+       if (!sched_feat(RT_RUNTIME_SHARE))
+               return more;
+
        if (rt_rq->rt_time > rt_rq->rt_runtime) {
                raw_spin_unlock(&rt_rq->rt_runtime_lock);
                more = do_balance_runtime(rt_rq);
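RT_RUNTIME_SHARE defaults to 1, so borrowing unused rt_runtime from other CPUs keeps working as before; with the feature cleared, balance_runtime() bails out immediately and every CPU is confined to its own rt_runtime share. On kernels built with CONFIG_SCHED_DEBUG the feature should be switchable at run time by writing "NO_RT_RUNTIME_SHARE" (or "RT_RUNTIME_SHARE" to re-enable it) to /sys/kernel/debug/sched_features.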