about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- arch/x86/include/asm/timer.h 23
-rw-r--r-- include/linux/init_task.h 4
-rw-r--r-- kernel/sched.c 17
-rw-r--r-- kernel/sched_fair.c 159
-rw-r--r-- kernel/sched_features.h 1
-rw-r--r-- kernel/sched_rt.c 3
6 files changed, 171 insertions, 36 deletions
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index fa7b9176b76c..431793e5d484 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -32,6 +32,22 @@ extern int no_timer_check;
32 * (mathieu.desnoyers@polymtl.ca) 32 * (mathieu.desnoyers@polymtl.ca)
33 * 33 *
34 * -johnstul@us.ibm.com "math is hard, lets go shopping!" 34 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
35 *
36 * In:
37 *
38 * ns = cycles * cyc2ns_scale / SC
39 *
40 * Although we may still have enough bits to store the value of ns,
41 * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
42 * leading to an incorrect result.
43 *
44 * To avoid this, we can decompose 'cycles' into quotient and remainder
45 * of division by SC. Then,
46 *
47 * ns = (quot * SC + rem) * cyc2ns_scale / SC
48 * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
49 *
50 * - sqazi@google.com
35 */ 51 */
36 52
37DECLARE_PER_CPU(unsigned long, cyc2ns); 53DECLARE_PER_CPU(unsigned long, cyc2ns);
@@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
41 57
42static inline unsigned long long __cycles_2_ns(unsigned long long cyc) 58static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
43{ 59{
60 unsigned long long quot;
61 unsigned long long rem;
44 int cpu = smp_processor_id(); 62 int cpu = smp_processor_id();
45 unsigned long long ns = per_cpu(cyc2ns_offset, cpu); 63 unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
46 ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR; 64 quot = (cyc >> CYC2NS_SCALE_FACTOR);
65 rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
66 ns += quot * per_cpu(cyc2ns, cpu) +
67 ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
47 return ns; 68 return ns;
48} 69}
49 70
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 94b1e356c02a..32574eef9394 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -126,6 +126,8 @@ extern struct cred init_cred;
126# define INIT_PERF_EVENTS(tsk) 126# define INIT_PERF_EVENTS(tsk)
127#endif 127#endif
128 128
129#define INIT_TASK_COMM "swapper"
130
129/* 131/*
130 * INIT_TASK is used to set up the first task table, touch at 132 * INIT_TASK is used to set up the first task table, touch at
131 * your own risk!. Base=0, limit=0x1fffff (=2MB) 133 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -162,7 +164,7 @@ extern struct cred init_cred;
162 .group_leader = &tsk, \ 164 .group_leader = &tsk, \
163 RCU_INIT_POINTER(.real_cred, &init_cred), \ 165 RCU_INIT_POINTER(.real_cred, &init_cred), \
164 RCU_INIT_POINTER(.cred, &init_cred), \ 166 RCU_INIT_POINTER(.cred, &init_cred), \
165 .comm = "swapper", \ 167 .comm = INIT_TASK_COMM, \
166 .thread = INIT_THREAD, \ 168 .thread = INIT_THREAD, \
167 .fs = &init_fs, \ 169 .fs = &init_fs, \
168 .files = &init_files, \ 170 .files = &init_files, \
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e9344a71be3..d6b149ccf925 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
71#include <linux/ctype.h> 71#include <linux/ctype.h>
72#include <linux/ftrace.h> 72#include <linux/ftrace.h>
73#include <linux/slab.h> 73#include <linux/slab.h>
74#include <linux/init_task.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);
4810 * This waits for either a completion of a specific task to be signaled or for a 4811 * This waits for either a completion of a specific task to be signaled or for a
4811 * specified timeout to expire. The timeout is in jiffies. It is not 4812 * specified timeout to expire. The timeout is in jiffies. It is not
4812 * interruptible. 4813 * interruptible.
4814 *
4815 * The return value is 0 if timed out, and positive (at least 1, or number of
4816 * jiffies left till timeout) if completed.
4813 */ 4817 */
4814unsigned long __sched 4818unsigned long __sched
4815wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4819wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
4824 * 4828 *
4825 * This waits for completion of a specific task to be signaled. It is 4829 * This waits for completion of a specific task to be signaled. It is
4826 * interruptible. 4830 * interruptible.
4831 *
4832 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4827 */ 4833 */
4828int __sched wait_for_completion_interruptible(struct completion *x) 4834int __sched wait_for_completion_interruptible(struct completion *x)
4829{ 4835{
@@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4841 * 4847 *
4842 * This waits for either a completion of a specific task to be signaled or for a 4848 * This waits for either a completion of a specific task to be signaled or for a
4843 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4849 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4850 *
4851 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
4852 * positive (at least 1, or number of jiffies left till timeout) if completed.
4844 */ 4853 */
4845long __sched 4854long __sched
4846wait_for_completion_interruptible_timeout(struct completion *x, 4855wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4856 * 4865 *
4857 * This waits to be signaled for completion of a specific task. It can be 4866 * This waits to be signaled for completion of a specific task. It can be
4858 * interrupted by a kill signal. 4867 * interrupted by a kill signal.
4868 *
4869 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4859 */ 4870 */
4860int __sched wait_for_completion_killable(struct completion *x) 4871int __sched wait_for_completion_killable(struct completion *x)
4861{ 4872{
@@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4874 * This waits for either a completion of a specific task to be 4885 * This waits for either a completion of a specific task to be
4875 * signaled or for a specified timeout to expire. It can be 4886 * signaled or for a specified timeout to expire. It can be
4876 * interrupted by a kill signal. The timeout is in jiffies. 4887 * interrupted by a kill signal. The timeout is in jiffies.
4888 *
4889 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
4890 * positive (at least 1, or number of jiffies left till timeout) if completed.
4877 */ 4891 */
4878long __sched 4892long __sched
4879wait_for_completion_killable_timeout(struct completion *x, 4893wait_for_completion_killable_timeout(struct completion *x,
@@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6099 */ 6113 */
6100 idle->sched_class = &idle_sched_class; 6114 idle->sched_class = &idle_sched_class;
6101 ftrace_graph_init_idle_task(idle, cpu); 6115 ftrace_graph_init_idle_task(idle, cpu);
6116#if defined(CONFIG_SMP)
6117 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
6118#endif
6102} 6119}
6103 6120
6104/* 6121/*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c9e67923b7c..a78ed2736ba7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
772 list_del_leaf_cfs_rq(cfs_rq); 772 list_del_leaf_cfs_rq(cfs_rq);
773} 773}
774 774
775static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
776{
777 long tg_weight;
778
779 /*
780 * Use this CPU's actual weight instead of the last load_contribution
781 * to gain a more accurate current total weight. See
782 * update_cfs_rq_load_contribution().
783 */
784 tg_weight = atomic_read(&tg->load_weight);
785 tg_weight -= cfs_rq->load_contribution;
786 tg_weight += cfs_rq->load.weight;
787
788 return tg_weight;
789}
790
775static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 791static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
776{ 792{
777 long load_weight, load, shares; 793 long tg_weight, load, shares;
778 794
795 tg_weight = calc_tg_weight(tg, cfs_rq);
779 load = cfs_rq->load.weight; 796 load = cfs_rq->load.weight;
780 797
781 load_weight = atomic_read(&tg->load_weight);
782 load_weight += load;
783 load_weight -= cfs_rq->load_contribution;
784
785 shares = (tg->shares * load); 798 shares = (tg->shares * load);
786 if (load_weight) 799 if (tg_weight)
787 shares /= load_weight; 800 shares /= tg_weight;
788 801
789 if (shares < MIN_SHARES) 802 if (shares < MIN_SHARES)
790 shares = MIN_SHARES; 803 shares = MIN_SHARES;
@@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1743 1756
1744static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1757static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1745{ 1758{
1746 if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) 1759 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1747 return; 1760 return;
1748 1761
1749 __return_cfs_rq_runtime(cfs_rq); 1762 __return_cfs_rq_runtime(cfs_rq);
@@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)
2036 * Adding load to a group doesn't make a group heavier, but can cause movement 2049 * Adding load to a group doesn't make a group heavier, but can cause movement
2037 * of group shares between cpus. Assuming the shares were perfectly aligned one 2050 * of group shares between cpus. Assuming the shares were perfectly aligned one
2038 * can calculate the shift in shares. 2051 * can calculate the shift in shares.
2052 *
2053 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2054 * on this @cpu and results in a total addition (subtraction) of @wg to the
2055 * total group weight.
2056 *
2057 * Given a runqueue weight distribution (rw_i) we can compute a shares
2058 * distribution (s_i) using:
2059 *
2060 * s_i = rw_i / \Sum rw_j (1)
2061 *
2062 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2063 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2064 * shares distribution (s_i):
2065 *
2066 * rw_i = { 2, 4, 1, 0 }
2067 * s_i = { 2/7, 4/7, 1/7, 0 }
2068 *
2069 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
2070 * task used to run on and the CPU the waker is running on), we need to
2071 * compute the effect of waking a task on either CPU and, in case of a sync
2072 * wakeup, compute the effect of the current task going to sleep.
2073 *
2074 * So for a change of @wl to the local @cpu with an overall group weight change
2075 * of @wg we can compute the new shares distribution (s'_i) using:
2076 *
2077 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
2078 *
2079 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
2080 * differences in waking a task to CPU 0. The additional task changes the
2081 * weight and shares distributions like:
2082 *
2083 * rw'_i = { 3, 4, 1, 0 }
2084 * s'_i = { 3/8, 4/8, 1/8, 0 }
2085 *
2086 * We can then compute the difference in effective weight by using:
2087 *
2088 * dw_i = S * (s'_i - s_i) (3)
2089 *
2090 * Where 'S' is the group weight as seen by its parent.
2091 *
2092 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
2093 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
2094 * 4/7) times the weight of the group.
2039 */ 2095 */
2040static long effective_load(struct task_group *tg, int cpu, long wl, long wg) 2096static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
2041{ 2097{
2042 struct sched_entity *se = tg->se[cpu]; 2098 struct sched_entity *se = tg->se[cpu];
2043 2099
2044 if (!tg->parent) 2100 if (!tg->parent) /* the trivial, non-cgroup case */
2045 return wl; 2101 return wl;
2046 2102
2047 for_each_sched_entity(se) { 2103 for_each_sched_entity(se) {
2048 long lw, w; 2104 long w, W;
2049 2105
2050 tg = se->my_q->tg; 2106 tg = se->my_q->tg;
2051 w = se->my_q->load.weight;
2052 2107
2053 /* use this cpu's instantaneous contribution */ 2108 /*
2054 lw = atomic_read(&tg->load_weight); 2109 * W = @wg + \Sum rw_j
2055 lw -= se->my_q->load_contribution; 2110 */
2056 lw += w + wg; 2111 W = wg + calc_tg_weight(tg, se->my_q);
2057 2112
2058 wl += w; 2113 /*
2114 * w = rw_i + @wl
2115 */
2116 w = se->my_q->load.weight + wl;
2059 2117
2060 if (lw > 0 && wl < lw) 2118 /*
2061 wl = (wl * tg->shares) / lw; 2119 * wl = S * s'_i; see (2)
2120 */
2121 if (W > 0 && w < W)
2122 wl = (w * tg->shares) / W;
2062 else 2123 else
2063 wl = tg->shares; 2124 wl = tg->shares;
2064 2125
2065 /* zero point is MIN_SHARES */ 2126 /*
2127 * Per the above, wl is the new se->load.weight value; since
2128 * those are clipped to [MIN_SHARES, ...) do so now. See
2129 * calc_cfs_shares().
2130 */
2066 if (wl < MIN_SHARES) 2131 if (wl < MIN_SHARES)
2067 wl = MIN_SHARES; 2132 wl = MIN_SHARES;
2133
2134 /*
2135 * wl = dw_i = S * (s'_i - s_i); see (3)
2136 */
2068 wl -= se->load.weight; 2137 wl -= se->load.weight;
2138
2139 /*
2140 * Recursively apply this logic to all parent groups to compute
2141 * the final effective load change on the root group. Since
2142 * only the @tg group gets extra weight, all parent groups can
2143 * only redistribute existing shares. @wl is the shift in shares
2144 * resulting from this level per the above.
2145 */
2069 wg = 0; 2146 wg = 0;
2070 } 2147 }
2071 2148
@@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
2249 int cpu = smp_processor_id(); 2326 int cpu = smp_processor_id();
2250 int prev_cpu = task_cpu(p); 2327 int prev_cpu = task_cpu(p);
2251 struct sched_domain *sd; 2328 struct sched_domain *sd;
2252 int i; 2329 struct sched_group *sg;
2330 int i, smt = 0;
2253 2331
2254 /* 2332 /*
2255 * If the task is going to be woken-up on this cpu and if it is 2333 * If the task is going to be woken-up on this cpu and if it is
@@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
2269 * Otherwise, iterate the domains and find an eligible idle cpu. 2347 * Otherwise, iterate the domains and find an eligible idle cpu.
2270 */ 2348 */
2271 rcu_read_lock(); 2349 rcu_read_lock();
2350again:
2272 for_each_domain(target, sd) { 2351 for_each_domain(target, sd) {
2273 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 2352 if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
2274 break; 2353 continue;
2275 2354
2276 for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { 2355 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
2277 if (idle_cpu(i)) { 2356 if (!smt) {
2278 target = i; 2357 smt = 1;
2279 break; 2358 goto again;
2280 } 2359 }
2360 break;
2281 } 2361 }
2282 2362
2283 /* 2363 sg = sd->groups;
2284 * Lets stop looking for an idle sibling when we reached 2364 do {
2285 * the domain that spans the current cpu and prev_cpu. 2365 if (!cpumask_intersects(sched_group_cpus(sg),
2286 */ 2366 tsk_cpus_allowed(p)))
2287 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && 2367 goto next;
2288 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 2368
2289 break; 2369 for_each_cpu(i, sched_group_cpus(sg)) {
2370 if (!idle_cpu(i))
2371 goto next;
2372 }
2373
2374 target = cpumask_first_and(sched_group_cpus(sg),
2375 tsk_cpus_allowed(p));
2376 goto done;
2377next:
2378 sg = sg->next;
2379 } while (sg != sd->groups);
2290 } 2380 }
2381done:
2291 rcu_read_unlock(); 2382 rcu_read_unlock();
2292 2383
2293 return target; 2384 return target;
@@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3511} 3602}
3512 3603
3513/** 3604/**
3514 * update_sd_lb_stats - Update sched_group's statistics for load balancing. 3605 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3515 * @sd: sched_domain whose statistics are to be updated. 3606 * @sd: sched_domain whose statistics are to be updated.
3516 * @this_cpu: Cpu for which load balance is currently performed. 3607 * @this_cpu: Cpu for which load balance is currently performed.
3517 * @idle: Idle status of this_cpu 3608 * @idle: Idle status of this_cpu
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index efa0a7b75dde..84802245abd2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)
67SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, 1)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, 0)
70SCHED_FEAT(RT_RUNTIME_SHARE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 056cbd2e2a27..583a1368afe6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
560{ 560{
561 int more = 0; 561 int more = 0;
562 562
563 if (!sched_feat(RT_RUNTIME_SHARE))
564 return more;
565
563 if (rt_rq->rt_time > rt_rq->rt_runtime) { 566 if (rt_rq->rt_time > rt_rq->rt_runtime) {
564 raw_spin_unlock(&rt_rq->rt_runtime_lock); 567 raw_spin_unlock(&rt_rq->rt_runtime_lock);
565 more = do_balance_runtime(rt_rq); 568 more = do_balance_runtime(rt_rq);