author     Linus Torvalds <torvalds@linux-foundation.org>   2011-12-05 19:50:24 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2011-12-05 19:50:24 -0500
commit     7125faceabe43067293d0c9e2ef7154ecea51721 (patch)
tree       9de559e9280843d02d9a70976fbad23428161189 /kernel
parent     35337c834124d2893b7fe4ba683c7639e6c37e0c (diff)
parent     4cecf6d401a01d054afc1e5f605bcbfe553cb9b9 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched, x86: Avoid unnecessary overflow in sched_clock
sched: Fix buglet in return_cfs_rq_runtime()
sched: Avoid SMT siblings in select_idle_sibling() if possible
sched: Set the command name of the idle tasks in SMP kernels
sched, rt: Provide means of disabling cross-cpu bandwidth sharing
sched: Document wait_for_completion_*() return values
sched_fair: Fix a typo in the comment describing update_sd_lb_stats
sched: Add a comment to effective_load() since it's a pain
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched.c          |  17
-rw-r--r--  kernel/sched_fair.c     | 159
-rw-r--r--  kernel/sched_features.h |   1
-rw-r--r--  kernel/sched_rt.c       |   3
4 files changed, 146 insertions, 34 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e9344a71be3..d6b149ccf925 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
 #include <linux/slab.h>
+#include <linux/init_task.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);
  * This waits for either a completion of a specific task to be signaled or for a
  * specified timeout to expire. The timeout is in jiffies. It is not
  * interruptible.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
  */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
  *
  * This waits for completion of a specific task to be signaled. It is
  * interruptible.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
  */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
@@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
  *
  * This waits for either a completion of a specific task to be signaled or for a
  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
  */
 long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
  *
  * This waits to be signaled for completion of a specific task. It can be
  * interrupted by a kill signal.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
  */
 int __sched wait_for_completion_killable(struct completion *x)
 {
@@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
  * This waits for either a completion of a specific task to be
  * signaled or for a specified timeout to expire. It can be
  * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
  */
 long __sched
 wait_for_completion_killable_timeout(struct completion *x,
@@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 */
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
 }
 
 /*
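
The kernel-doc hunks above pin down the return conventions for the completion API. As a quick illustration (not part of this patch), a caller could consume wait_for_completion_timeout() as in the sketch below; the driver-side names (my_dev_wait_for_irq, irq_done) are hypothetical.

```c
/* Minimal sketch of the documented return values; names are invented. */
#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static int my_dev_wait_for_irq(struct completion *irq_done)
{
	unsigned long left;

	left = wait_for_completion_timeout(irq_done, msecs_to_jiffies(100));
	if (!left)
		return -ETIMEDOUT;	/* 0: the timeout expired */

	/* positive: completed with 'left' jiffies (at least 1) to spare */
	return 0;
}
```

The interruptible and killable variants follow the same pattern, except that a negative -ERESTARTSYS return must also be handled.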
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c9e67923b7c..a78ed2736ba7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 		list_del_leaf_cfs_rq(cfs_rq);
 }
 
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+	long tg_weight;
+
+	/*
+	 * Use this CPU's actual weight instead of the last load_contribution
+	 * to gain a more accurate current total weight. See
+	 * update_cfs_rq_load_contribution().
+	 */
+	tg_weight = atomic_read(&tg->load_weight);
+	tg_weight -= cfs_rq->load_contribution;
+	tg_weight += cfs_rq->load.weight;
+
+	return tg_weight;
+}
+
 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
-	long load_weight, load, shares;
+	long tg_weight, load, shares;
 
+	tg_weight = calc_tg_weight(tg, cfs_rq);
 	load = cfs_rq->load.weight;
 
-	load_weight = atomic_read(&tg->load_weight);
-	load_weight += load;
-	load_weight -= cfs_rq->load_contribution;
-
 	shares = (tg->shares * load);
-	if (load_weight)
-		shares /= load_weight;
+	if (tg_weight)
+		shares /= tg_weight;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
@@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
-	if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
 		return;
 
 	__return_cfs_rq_runtime(cfs_rq);
@@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ *   s_i = rw_i / \Sum rw_j                                            (1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ *   rw_i = {   2,   4,   1,   0 }
+ *   s_i  = { 2/7, 4/7, 1/7,   0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wl we can compute the new shares distribution (s'_i) using:
+ *
+ *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                           (2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ *   rw'_i = {   3,   4,   1,   0 }
+ *   s'_i  = { 3/8, 4/8, 1/8,   0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ *   dw_i = S * (s'_i - s_i)                                           (3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
  */
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
-	if (!tg->parent)
+	if (!tg->parent)	/* the trivial, non-cgroup case */
 		return wl;
 
 	for_each_sched_entity(se) {
-		long lw, w;
+		long w, W;
 
 		tg = se->my_q->tg;
-		w = se->my_q->load.weight;
 
-		/* use this cpu's instantaneous contribution */
-		lw = atomic_read(&tg->load_weight);
-		lw -= se->my_q->load_contribution;
-		lw += w + wg;
+		/*
+		 * W = @wg + \Sum rw_j
+		 */
+		W = wg + calc_tg_weight(tg, se->my_q);
 
-		wl += w;
+		/*
+		 * w = rw_i + @wl
+		 */
+		w = se->my_q->load.weight + wl;
 
-		if (lw > 0 && wl < lw)
-			wl = (wl * tg->shares) / lw;
+		/*
+		 * wl = S * s'_i; see (2)
+		 */
+		if (W > 0 && w < W)
+			wl = (w * tg->shares) / W;
 		else
 			wl = tg->shares;
 
-		/* zero point is MIN_SHARES */
+		/*
+		 * Per the above, wl is the new se->load.weight value; since
+		 * those are clipped to [MIN_SHARES, ...) do so now. See
+		 * calc_cfs_shares().
+		 */
 		if (wl < MIN_SHARES)
 			wl = MIN_SHARES;
+
+		/*
+		 * wl = dw_i = S * (s'_i - s_i); see (3)
+		 */
 		wl -= se->load.weight;
+
+		/*
+		 * Recursively apply this logic to all parent groups to compute
+		 * the final effective load change on the root group. Since
+		 * only the @tg group gets extra weight, all parent groups can
+		 * only redistribute existing shares. @wl is the shift in shares
+		 * resulting from this level per the above.
+		 */
 		wg = 0;
 	}
 
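
The arithmetic in the new effective_load() comment is easy to sanity-check outside the kernel. The short standalone C program below (not part of the patch) reproduces equations (1)-(3) for the rw_i = {2, 4, 1, 0} example and prints the per-CPU share-shift factors (5/56 on CPU 0, -4/56 on CPU 1); multiplying each by the group weight S gives dw_i.

```c
/* Standalone arithmetic check of equations (1)-(3) above; not kernel code. */
#include <stdio.h>

int main(void)
{
	double rw[4]  = { 2, 4, 1, 0 };	/* per-cpu runqueue weights */
	double rwp[4] = { 3, 4, 1, 0 };	/* after waking one task on CPU 0 */
	double sum  = 2 + 4 + 1 + 0;	/* \Sum rw_j = 7 */
	double sump = 3 + 4 + 1 + 0;	/* @wg + \Sum rw_j = 8 */
	int i;

	for (i = 0; i < 4; i++) {
		double s  = rw[i] / sum;	/* s_i,  eq. (1) */
		double sp = rwp[i] / sump;	/* s'_i, eq. (2) */

		/* dw_i = S * (s'_i - s_i), eq. (3); print the (s'_i - s_i) factor */
		printf("cpu%d: s=%.4f s'=%.4f dw/S=%+.4f\n", i, s, sp, sp - s);
	}
	return 0;	/* cpu0: +5/56 ~ +0.0893, cpu1: -4/56 ~ -0.0714 */
}
```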
@@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
-	int i;
+	struct sched_group *sg;
+	int i, smt = 0;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
 	rcu_read_lock();
+again:
 	for_each_domain(target, sd) {
-		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
-			break;
+		if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
+			continue;
 
-		for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
-			if (idle_cpu(i)) {
-				target = i;
-				break;
+		if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
+			if (!smt) {
+				smt = 1;
+				goto again;
 			}
+			break;
 		}
 
-		/*
-		 * Lets stop looking for an idle sibling when we reached
-		 * the domain that spans the current cpu and prev_cpu.
-		 */
-		if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
-			break;
+		sg = sd->groups;
+		do {
+			if (!cpumask_intersects(sched_group_cpus(sg),
+						tsk_cpus_allowed(p)))
+				goto next;
+
+			for_each_cpu(i, sched_group_cpus(sg)) {
+				if (!idle_cpu(i))
+					goto next;
+			}
+
+			target = cpumask_first_and(sched_group_cpus(sg),
+					tsk_cpus_allowed(p));
+			goto done;
+next:
+			sg = sg->next;
+		} while (sg != sd->groups);
 	}
+done:
 	rcu_read_unlock();
 
 	return target;
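
As a rough, non-kernel illustration of the selection policy above (prefer a group whose CPUs are all idle, so the task does not land next to a busy SMT sibling), the standalone sketch below models groups as small CPU arrays. The data layout and names are invented for illustration; the kernel walks struct sched_group / cpumask state instead.

```c
/* Userspace sketch of "pick a fully idle group" selection; invented data. */
#include <stdbool.h>
#include <stdio.h>

static bool cpu_idle[8]    = { false, true, true, true, false, false, true, true };
static bool cpu_allowed[8] = { true,  true, true, true, true,  true,  true, true };

/* groups of two SMT siblings each, analogous to sched_group spans */
static const int groups[4][2] = { {0, 1}, {2, 3}, {4, 5}, {6, 7} };

static int select_idle_group_cpu(int fallback)
{
	for (int g = 0; g < 4; g++) {
		bool all_idle = true;
		int first_allowed = -1;

		for (int j = 0; j < 2; j++) {
			int cpu = groups[g][j];

			if (!cpu_idle[cpu])
				all_idle = false;
			if (cpu_allowed[cpu] && first_allowed < 0)
				first_allowed = cpu;
		}
		/* whole group idle: no busy SMT sibling next to the target */
		if (all_idle && first_allowed >= 0)
			return first_allowed;
	}
	return fallback;
}

int main(void)
{
	printf("picked cpu %d\n", select_idle_group_cpu(0));	/* group {2,3} -> 2 */
	return 0;
}
```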
@@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
 }
 
 /**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index efa0a7b75dde..84802245abd2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)
 SCHED_FEAT(TTWU_QUEUE, 1)
 
 SCHED_FEAT(FORCE_SD_OVERLAP, 0)
+SCHED_FEAT(RT_RUNTIME_SHARE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 056cbd2e2a27..583a1368afe6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
 {
 	int more = 0;
 
+	if (!sched_feat(RT_RUNTIME_SHARE))
+		return more;
+
 	if (rt_rq->rt_time > rt_rq->rt_runtime) {
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 		more = do_balance_runtime(rt_rq);