Diffstat (limited to 'kernel/sched')
 -rw-r--r--  kernel/sched/Makefile     |   2
 -rw-r--r--  kernel/sched/auto_group.c |   3
 -rw-r--r--  kernel/sched/core.c       | 637
 -rw-r--r--  kernel/sched/cputime.c    |   5
 -rw-r--r--  kernel/sched/debug.c      |  37
 -rw-r--r--  kernel/sched/fair.c       | 175
 -rw-r--r--  kernel/sched/proc.c       | 591
 -rw-r--r--  kernel/sched/rt.c         | 132
 -rw-r--r--  kernel/sched/sched.h      |  71
 -rw-r--r--  kernel/sched/stats.h      |  47
 -rw-r--r--  kernel/sched/stop_task.c  |   8
 11 files changed, 851 insertions, 857 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index deaf90e4a1de..54adcf35f495 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
12 | endif | 12 | endif |
13 | 13 | ||
14 | obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o |
15 | obj-$(CONFIG_SMP) += cpupri.o | 15 | obj-$(CONFIG_SMP) += cpupri.o |
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 64de5f8b0c9e..4a073539c58e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void)
77 | if (IS_ERR(tg)) | 77 | if (IS_ERR(tg)) |
78 | goto out_free; | 78 | goto out_free; |
79 | 79 | ||
80 | sched_online_group(tg, &root_task_group); | ||
81 | |||
82 | kref_init(&ag->kref); | 80 | kref_init(&ag->kref); |
83 | init_rwsem(&ag->lock); | 81 | init_rwsem(&ag->lock); |
84 | ag->id = atomic_inc_return(&autogroup_seq_nr); | 82 | ag->id = atomic_inc_return(&autogroup_seq_nr); |
@@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void) | |||
98 | #endif | 96 | #endif |
99 | tg->autogroup = ag; | 97 | tg->autogroup = ag; |
100 | 98 | ||
99 | sched_online_group(tg, &root_task_group); | ||
101 | return ag; | 100 | return ag; |
102 | 101 | ||
103 | out_free: | 102 | out_free: |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8b335016c52..9b1f2e533b95 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -679,7 +679,7 @@ void sched_avg_update(struct rq *rq)
679 | { | 679 | { |
680 | s64 period = sched_avg_period(); | 680 | s64 period = sched_avg_period(); |
681 | 681 | ||
682 | while ((s64)(rq->clock - rq->age_stamp) > period) { | 682 | while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { |
683 | /* | 683 | /* |
684 | * Inline assembly required to prevent the compiler | 684 | * Inline assembly required to prevent the compiler |
685 | * optimising this loop into a divmod call. | 685 | * optimising this loop into a divmod call. |
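Throughout this patch, direct reads of rq->clock and rq->clock_task are replaced by rq_clock()/rq_clock_task() accessors (defined in kernel/sched/sched.h, which is not shown in this hunk). A minimal userspace sketch of that pattern, with struct rq reduced to the two fields involved — the accessor bodies are an assumption inferred from the call sites, not the kernel's definitions:

#include <stdint.h>

struct rq {
        uint64_t clock;        /* scheduler clock for this runqueue, in ns */
        uint64_t clock_task;   /* task clock; may exclude irq time */
};

static inline uint64_t rq_clock(struct rq *rq)      { return rq->clock; }
static inline uint64_t rq_clock_task(struct rq *rq) { return rq->clock_task; }

int main(void)
{
        struct rq rq = { .clock = 1000, .clock_task = 990 };
        /* callers such as sched_avg_update() now go through the accessor */
        return (rq_clock(&rq) - rq_clock_task(&rq)) == 10 ? 0 : 1;
}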
@@ -1340,7 +1340,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | |||
1340 | p->sched_class->task_woken(rq, p); | 1340 | p->sched_class->task_woken(rq, p); |
1341 | 1341 | ||
1342 | if (rq->idle_stamp) { | 1342 | if (rq->idle_stamp) { |
1343 | u64 delta = rq->clock - rq->idle_stamp; | 1343 | u64 delta = rq_clock(rq) - rq->idle_stamp; |
1344 | u64 max = 2*sysctl_sched_migration_cost; | 1344 | u64 max = 2*sysctl_sched_migration_cost; |
1345 | 1345 | ||
1346 | if (delta > max) | 1346 | if (delta > max) |
@@ -1377,6 +1377,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
1377 | 1377 | ||
1378 | rq = __task_rq_lock(p); | 1378 | rq = __task_rq_lock(p); |
1379 | if (p->on_rq) { | 1379 | if (p->on_rq) { |
1380 | /* check_preempt_curr() may use rq clock */ | ||
1381 | update_rq_clock(rq); | ||
1380 | ttwu_do_wakeup(rq, p, wake_flags); | 1382 | ttwu_do_wakeup(rq, p, wake_flags); |
1381 | ret = 1; | 1383 | ret = 1; |
1382 | } | 1384 | } |
@@ -1609,15 +1611,6 @@ static void __sched_fork(struct task_struct *p) | |||
1609 | p->se.vruntime = 0; | 1611 | p->se.vruntime = 0; |
1610 | INIT_LIST_HEAD(&p->se.group_node); | 1612 | INIT_LIST_HEAD(&p->se.group_node); |
1611 | 1613 | ||
1612 | /* | ||
1613 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
1614 | * removed when useful for applications beyond shares distribution (e.g. | ||
1615 | * load-balance). | ||
1616 | */ | ||
1617 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
1618 | p->se.avg.runnable_avg_period = 0; | ||
1619 | p->se.avg.runnable_avg_sum = 0; | ||
1620 | #endif | ||
1621 | #ifdef CONFIG_SCHEDSTATS | 1614 | #ifdef CONFIG_SCHEDSTATS |
1622 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1615 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
1623 | #endif | 1616 | #endif |
@@ -1761,6 +1754,8 @@ void wake_up_new_task(struct task_struct *p) | |||
1761 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); | 1754 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
1762 | #endif | 1755 | #endif |
1763 | 1756 | ||
1757 | /* Initialize new task's runnable average */ | ||
1758 | init_task_runnable_average(p); | ||
1764 | rq = __task_rq_lock(p); | 1759 | rq = __task_rq_lock(p); |
1765 | activate_task(rq, p, 0); | 1760 | activate_task(rq, p, 0); |
1766 | p->on_rq = 1; | 1761 | p->on_rq = 1; |
@@ -2069,575 +2064,6 @@ unsigned long nr_iowait_cpu(int cpu) | |||
2069 | return atomic_read(&this->nr_iowait); | 2064 | return atomic_read(&this->nr_iowait); |
2070 | } | 2065 | } |
2071 | 2066 | ||
2072 | unsigned long this_cpu_load(void) | ||
2073 | { | ||
2074 | struct rq *this = this_rq(); | ||
2075 | return this->cpu_load[0]; | ||
2076 | } | ||
2077 | |||
2078 | |||
2079 | /* | ||
2080 | * Global load-average calculations | ||
2081 | * | ||
2082 | * We take a distributed and async approach to calculating the global load-avg | ||
2083 | * in order to minimize overhead. | ||
2084 | * | ||
2085 | * The global load average is an exponentially decaying average of nr_running + | ||
2086 | * nr_uninterruptible. | ||
2087 | * | ||
2088 | * Once every LOAD_FREQ: | ||
2089 | * | ||
2090 | * nr_active = 0; | ||
2091 | * for_each_possible_cpu(cpu) | ||
2092 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
2093 | * | ||
2094 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
2095 | * | ||
2096 | * Due to a number of reasons the above turns in the mess below: | ||
2097 | * | ||
2098 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
2099 | * serious number of cpus, therefore we need to take a distributed approach | ||
2100 | * to calculating nr_active. | ||
2101 | * | ||
2102 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
2103 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
2104 | * | ||
2105 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
2106 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
2107 | * to obtain the same result. See calc_load_fold_active(). | ||
2108 | * | ||
2109 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
2110 | * across the machine, we assume 10 ticks is sufficient time for every | ||
2111 | * cpu to have completed this task. | ||
2112 | * | ||
2113 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
2114 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
2115 | * | ||
2116 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
2117 | * this would add another cross-cpu cacheline miss and atomic operation | ||
2118 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
2119 | * when it went into uninterruptible state and decrement on whatever cpu | ||
2120 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
2121 | * all cpus yields the correct result. | ||
2122 | * | ||
2123 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | ||
2124 | */ | ||
2125 | |||
2126 | /* Variables and functions for calc_load */ | ||
2127 | static atomic_long_t calc_load_tasks; | ||
2128 | static unsigned long calc_load_update; | ||
2129 | unsigned long avenrun[3]; | ||
2130 | EXPORT_SYMBOL(avenrun); /* should be removed */ | ||
2131 | |||
2132 | /** | ||
2133 | * get_avenrun - get the load average array | ||
2134 | * @loads: pointer to dest load array | ||
2135 | * @offset: offset to add | ||
2136 | * @shift: shift count to shift the result left | ||
2137 | * | ||
2138 | * These values are estimates at best, so no need for locking. | ||
2139 | */ | ||
2140 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2141 | { | ||
2142 | loads[0] = (avenrun[0] + offset) << shift; | ||
2143 | loads[1] = (avenrun[1] + offset) << shift; | ||
2144 | loads[2] = (avenrun[2] + offset) << shift; | ||
2145 | } | ||
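For reference, avenrun[] is the array behind /proc/loadavg; converting its 11-bit fixed-point values to the familiar decimal form looks roughly like the following (macro names as in fs/proc/loadavg.c and include/linux/sched.h; illustrative userspace code, not kernel code):

#include <stdio.h>

#define FSHIFT   11                   /* bits of fractional precision */
#define FIXED_1  (1UL << FSHIFT)      /* 1.0 in fixed point == 2048 */
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
        unsigned long avg = 3 * FIXED_1 / 2;      /* pretend avenrun[0] is 1.5 */
        unsigned long v   = avg + FIXED_1 / 200;  /* offset /proc/loadavg passes */

        printf("%lu.%02lu\n", LOAD_INT(v), LOAD_FRAC(v));   /* prints 1.50 */
        return 0;
}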
2146 | |||
2147 | static long calc_load_fold_active(struct rq *this_rq) | ||
2148 | { | ||
2149 | long nr_active, delta = 0; | ||
2150 | |||
2151 | nr_active = this_rq->nr_running; | ||
2152 | nr_active += (long) this_rq->nr_uninterruptible; | ||
2153 | |||
2154 | if (nr_active != this_rq->calc_load_active) { | ||
2155 | delta = nr_active - this_rq->calc_load_active; | ||
2156 | this_rq->calc_load_active = nr_active; | ||
2157 | } | ||
2158 | |||
2159 | return delta; | ||
2160 | } | ||
2161 | |||
2162 | /* | ||
2163 | * a1 = a0 * e + a * (1 - e) | ||
2164 | */ | ||
2165 | static unsigned long | ||
2166 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
2167 | { | ||
2168 | load *= exp; | ||
2169 | load += active * (FIXED_1 - exp); | ||
2170 | load += 1UL << (FSHIFT - 1); | ||
2171 | return load >> FSHIFT; | ||
2172 | } | ||
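The removed calc_load() is an 11-bit fixed-point exponential moving average. As a quick sanity check of the arithmetic, the same step in a standalone program — the EXP_* constants are the standard decay factors from include/linux/sched.h; this is an illustrative sketch, not kernel code:

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884    /* 1/exp(5sec/1min)  in fixed point */
#define EXP_5   2014    /* 1/exp(5sec/5min)  */
#define EXP_15  2037    /* 1/exp(5sec/15min) */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);    /* round to nearest */
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun0 = 0;               /* start from an idle system */
        unsigned long active = 2 * FIXED_1;       /* two runnable tasks this window */

        avenrun0 = calc_load(avenrun0, EXP_1, active);
        printf("%lu/2048 ~= %.2f\n", avenrun0, (double)avenrun0 / FIXED_1);
        return 0;                                 /* one 5s sample: ~0.16 */
}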
2173 | |||
2174 | #ifdef CONFIG_NO_HZ_COMMON | ||
2175 | /* | ||
2176 | * Handle NO_HZ for the global load-average. | ||
2177 | * | ||
2178 | * Since the above described distributed algorithm to compute the global | ||
2179 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
2180 | * NO_HZ. | ||
2181 | * | ||
2182 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
2183 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
2184 | * when we read the global state. | ||
2185 | * | ||
2186 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
2187 | * | ||
2188 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
2189 | * contribution, causing under-accounting. | ||
2190 | * | ||
2191 | * We avoid this by keeping two idle-delta counters and flipping them | ||
2192 | * when the window starts, thus separating old and new NO_HZ load. | ||
2193 | * | ||
2194 | * The only trick is the slight shift in index flip for read vs write. | ||
2195 | * | ||
2196 | * 0s 5s 10s 15s | ||
2197 | * +10 +10 +10 +10 | ||
2198 | * |-|-----------|-|-----------|-|-----------|-| | ||
2199 | * r:0 0 1 1 0 0 1 1 0 | ||
2200 | * w:0 1 1 0 0 1 1 0 0 | ||
2201 | * | ||
2202 | * This ensures we'll fold the old idle contribution in this window while | ||
2203 | * accumulating the new one. | ||
2204 | * | ||
2205 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
2206 | * contribution, since we effectively move our sample point to a known | ||
2207 | * busy state. | ||
2208 | * | ||
2209 | * This is solved by pushing the window forward, and thus skipping the | ||
2210 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
2211 | * was in effect at the time the window opened). This also solves the issue | ||
2212 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
2213 | * LOAD_FREQ intervals. | ||
2214 | * | ||
2215 | * When making the ILB scale, we should try to pull this in as well. | ||
2216 | */ | ||
2217 | static atomic_long_t calc_load_idle[2]; | ||
2218 | static int calc_load_idx; | ||
2219 | |||
2220 | static inline int calc_load_write_idx(void) | ||
2221 | { | ||
2222 | int idx = calc_load_idx; | ||
2223 | |||
2224 | /* | ||
2225 | * See calc_global_nohz(), if we observe the new index, we also | ||
2226 | * need to observe the new update time. | ||
2227 | */ | ||
2228 | smp_rmb(); | ||
2229 | |||
2230 | /* | ||
2231 | * If the folding window started, make sure we start writing in the | ||
2232 | * next idle-delta. | ||
2233 | */ | ||
2234 | if (!time_before(jiffies, calc_load_update)) | ||
2235 | idx++; | ||
2236 | |||
2237 | return idx & 1; | ||
2238 | } | ||
2239 | |||
2240 | static inline int calc_load_read_idx(void) | ||
2241 | { | ||
2242 | return calc_load_idx & 1; | ||
2243 | } | ||
2244 | |||
2245 | void calc_load_enter_idle(void) | ||
2246 | { | ||
2247 | struct rq *this_rq = this_rq(); | ||
2248 | long delta; | ||
2249 | |||
2250 | /* | ||
2251 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
2252 | * into the pending idle delta. | ||
2253 | */ | ||
2254 | delta = calc_load_fold_active(this_rq); | ||
2255 | if (delta) { | ||
2256 | int idx = calc_load_write_idx(); | ||
2257 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
2258 | } | ||
2259 | } | ||
2260 | |||
2261 | void calc_load_exit_idle(void) | ||
2262 | { | ||
2263 | struct rq *this_rq = this_rq(); | ||
2264 | |||
2265 | /* | ||
2266 | * If we're still before the sample window, we're done. | ||
2267 | */ | ||
2268 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
2269 | return; | ||
2270 | |||
2271 | /* | ||
2272 | * We woke inside or after the sample window, this means we're already | ||
2273 | * accounted through the nohz accounting, so skip the entire deal and | ||
2274 | * sync up for the next window. | ||
2275 | */ | ||
2276 | this_rq->calc_load_update = calc_load_update; | ||
2277 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
2278 | this_rq->calc_load_update += LOAD_FREQ; | ||
2279 | } | ||
2280 | |||
2281 | static long calc_load_fold_idle(void) | ||
2282 | { | ||
2283 | int idx = calc_load_read_idx(); | ||
2284 | long delta = 0; | ||
2285 | |||
2286 | if (atomic_long_read(&calc_load_idle[idx])) | ||
2287 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
2288 | |||
2289 | return delta; | ||
2290 | } | ||
2291 | |||
2292 | /** | ||
2293 | * fixed_power_int - compute: x^n, in O(log n) time | ||
2294 | * | ||
2295 | * @x: base of the power | ||
2296 | * @frac_bits: fractional bits of @x | ||
2297 | * @n: power to raise @x to. | ||
2298 | * | ||
2299 | * By exploiting the relation between the definition of the natural power | ||
2300 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
2301 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
2302 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
2303 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
2304 | * of course trivially computable in O(log_2 n), the length of our binary | ||
2305 | * vector. | ||
2306 | */ | ||
2307 | static unsigned long | ||
2308 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
2309 | { | ||
2310 | unsigned long result = 1UL << frac_bits; | ||
2311 | |||
2312 | if (n) for (;;) { | ||
2313 | if (n & 1) { | ||
2314 | result *= x; | ||
2315 | result += 1UL << (frac_bits - 1); | ||
2316 | result >>= frac_bits; | ||
2317 | } | ||
2318 | n >>= 1; | ||
2319 | if (!n) | ||
2320 | break; | ||
2321 | x *= x; | ||
2322 | x += 1UL << (frac_bits - 1); | ||
2323 | x >>= frac_bits; | ||
2324 | } | ||
2325 | |||
2326 | return result; | ||
2327 | } | ||
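Dropping the same routine into a userspace file and comparing against libm's pow() makes the O(log n) catch-up concrete; EXP_1 and FIXED_1 are the usual values from include/linux/sched.h, and the program is only a sketch (compile with -lm):

#include <math.h>
#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884

static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        /* square-and-multiply: one multiply per set bit of n */
        if (n) for (;;) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

int main(void)
{
        unsigned int n = 12;    /* e.g. one minute of missed 5s windows */
        unsigned long fp = fixed_power_int(EXP_1, FSHIFT, n);
        double exact = pow((double)EXP_1 / FIXED_1, n) * FIXED_1;

        printf("EXP_1^%u: fixed-point %lu, exact %.1f\n", n, fp, exact);
        return 0;
}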
2328 | |||
2329 | /* | ||
2330 | * a1 = a0 * e + a * (1 - e) | ||
2331 | * | ||
2332 | * a2 = a1 * e + a * (1 - e) | ||
2333 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
2334 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
2335 | * | ||
2336 | * a3 = a2 * e + a * (1 - e) | ||
2337 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
2338 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
2339 | * | ||
2340 | * ... | ||
2341 | * | ||
2342 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
2343 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
2344 | * = a0 * e^n + a * (1 - e^n) | ||
2345 | * | ||
2346 | * [1] application of the geometric series: | ||
2347 | * | ||
2348 | * n 1 - x^(n+1) | ||
2349 | * S_n := \Sum x^i = ------------- | ||
2350 | * i=0 1 - x | ||
2351 | */ | ||
2352 | static unsigned long | ||
2353 | calc_load_n(unsigned long load, unsigned long exp, | ||
2354 | unsigned long active, unsigned int n) | ||
2355 | { | ||
2356 | |||
2357 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
2358 | } | ||
2359 | |||
2360 | /* | ||
2361 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
2362 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
2363 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
2364 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
2365 | * | ||
2366 | * Once we've updated the global active value, we need to apply the exponential | ||
2367 | * weights adjusted to the number of cycles missed. | ||
2368 | */ | ||
2369 | static void calc_global_nohz(void) | ||
2370 | { | ||
2371 | long delta, active, n; | ||
2372 | |||
2373 | if (!time_before(jiffies, calc_load_update + 10)) { | ||
2374 | /* | ||
2375 | * Catch-up, fold however many we are behind still | ||
2376 | */ | ||
2377 | delta = jiffies - calc_load_update - 10; | ||
2378 | n = 1 + (delta / LOAD_FREQ); | ||
2379 | |||
2380 | active = atomic_long_read(&calc_load_tasks); | ||
2381 | active = active > 0 ? active * FIXED_1 : 0; | ||
2382 | |||
2383 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
2384 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
2385 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2386 | |||
2387 | calc_load_update += n * LOAD_FREQ; | ||
2388 | } | ||
2389 | |||
2390 | /* | ||
2391 | * Flip the idle index... | ||
2392 | * | ||
2393 | * Make sure we first write the new time then flip the index, so that | ||
2394 | * calc_load_write_idx() will see the new time when it reads the new | ||
2395 | * index, this avoids a double flip messing things up. | ||
2396 | */ | ||
2397 | smp_wmb(); | ||
2398 | calc_load_idx++; | ||
2399 | } | ||
2400 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
2401 | |||
2402 | static inline long calc_load_fold_idle(void) { return 0; } | ||
2403 | static inline void calc_global_nohz(void) { } | ||
2404 | |||
2405 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
2406 | |||
2407 | /* | ||
2408 | * calc_load - update the avenrun load estimates 10 ticks after the | ||
2409 | * CPUs have updated calc_load_tasks. | ||
2410 | */ | ||
2411 | void calc_global_load(unsigned long ticks) | ||
2412 | { | ||
2413 | long active, delta; | ||
2414 | |||
2415 | if (time_before(jiffies, calc_load_update + 10)) | ||
2416 | return; | ||
2417 | |||
2418 | /* | ||
2419 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
2420 | */ | ||
2421 | delta = calc_load_fold_idle(); | ||
2422 | if (delta) | ||
2423 | atomic_long_add(delta, &calc_load_tasks); | ||
2424 | |||
2425 | active = atomic_long_read(&calc_load_tasks); | ||
2426 | active = active > 0 ? active * FIXED_1 : 0; | ||
2427 | |||
2428 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); | ||
2429 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); | ||
2430 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | ||
2431 | |||
2432 | calc_load_update += LOAD_FREQ; | ||
2433 | |||
2434 | /* | ||
2435 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. | ||
2436 | */ | ||
2437 | calc_global_nohz(); | ||
2438 | } | ||
2439 | |||
2440 | /* | ||
2441 | * Called from update_cpu_load() to periodically update this CPU's | ||
2442 | * active count. | ||
2443 | */ | ||
2444 | static void calc_load_account_active(struct rq *this_rq) | ||
2445 | { | ||
2446 | long delta; | ||
2447 | |||
2448 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
2449 | return; | ||
2450 | |||
2451 | delta = calc_load_fold_active(this_rq); | ||
2452 | if (delta) | ||
2453 | atomic_long_add(delta, &calc_load_tasks); | ||
2454 | |||
2455 | this_rq->calc_load_update += LOAD_FREQ; | ||
2456 | } | ||
2457 | |||
2458 | /* | ||
2459 | * End of global load-average stuff | ||
2460 | */ | ||
2461 | |||
2462 | /* | ||
2463 | * The exact cpuload at various idx values, calculated at every tick would be | ||
2464 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
2465 | * | ||
2466 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
2467 | * on nth tick when cpu may be busy, then we have: | ||
2468 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
2469 | * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
2470 | * | ||
2471 | * decay_load_missed() below does efficient calculation of | ||
2472 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
2473 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
2474 | * | ||
2475 | * The calculation is approximated on a 128 point scale. | ||
2476 | * degrade_zero_ticks is the number of ticks after which load at any | ||
2477 | * particular idx is approximated to be zero. | ||
2478 | * degrade_factor is a precomputed table, a row for each load idx. | ||
2479 | * Each column corresponds to degradation factor for a power of two ticks, | ||
2480 | * based on 128 point scale. | ||
2481 | * Example: | ||
2482 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
2483 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
2484 | * | ||
2485 | * With this power of 2 load factors, we can degrade the load n times | ||
2486 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
2487 | * n mult/shifts needed by the exact degradation. | ||
2488 | */ | ||
2489 | #define DEGRADE_SHIFT 7 | ||
2490 | static const unsigned char | ||
2491 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
2492 | static const unsigned char | ||
2493 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
2494 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
2495 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
2496 | {96, 72, 40, 12, 1, 0, 0}, | ||
2497 | {112, 98, 75, 43, 15, 1, 0}, | ||
2498 | {120, 112, 98, 76, 45, 16, 2} }; | ||
2499 | |||
2500 | /* | ||
2501 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
2502 | * would be when CPU is idle and so we just decay the old load without | ||
2503 | * adding any new load. | ||
2504 | */ | ||
2505 | static unsigned long | ||
2506 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
2507 | { | ||
2508 | int j = 0; | ||
2509 | |||
2510 | if (!missed_updates) | ||
2511 | return load; | ||
2512 | |||
2513 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
2514 | return 0; | ||
2515 | |||
2516 | if (idx == 1) | ||
2517 | return load >> missed_updates; | ||
2518 | |||
2519 | while (missed_updates) { | ||
2520 | if (missed_updates % 2) | ||
2521 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
2522 | |||
2523 | missed_updates >>= 1; | ||
2524 | j++; | ||
2525 | } | ||
2526 | return load; | ||
2527 | } | ||
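A standalone replay of decay_load_missed() with the table above, assuming a load of 1000 at idx 2 and 8 missed ticks; since 8 has a single set bit, only the "8 ticks" column (12/128) is applied:

#include <stdio.h>

#define DEGRADE_SHIFT     7
#define CPU_LOAD_IDX_MAX  5

static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
static const unsigned char degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
        {0, 0, 0, 0, 0, 0, 0, 0},
        {64, 32, 8, 0, 0, 0, 0, 0},
        {96, 72, 40, 12, 1, 0, 0},
        {112, 98, 75, 43, 15, 1, 0},
        {120, 112, 98, 76, 45, 16, 2} };

static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
        int j = 0;

        if (!missed_updates)
                return load;
        if (missed_updates >= degrade_zero_ticks[idx])
                return 0;
        if (idx == 1)
                return load >> missed_updates;

        while (missed_updates) {
                if (missed_updates % 2)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                missed_updates >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        /* 1000 * 12 >> 7 == 93, close to the exact (3/4)^8 * 1000 ~= 100 */
        printf("%lu\n", decay_load_missed(1000, 8, 2));
        return 0;
}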
2528 | |||
2529 | /* | ||
2530 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
2531 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | ||
2532 | * every tick. We fix it up based on jiffies. | ||
2533 | */ | ||
2534 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | ||
2535 | unsigned long pending_updates) | ||
2536 | { | ||
2537 | int i, scale; | ||
2538 | |||
2539 | this_rq->nr_load_updates++; | ||
2540 | |||
2541 | /* Update our load: */ | ||
2542 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
2543 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
2544 | unsigned long old_load, new_load; | ||
2545 | |||
2546 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
2547 | |||
2548 | old_load = this_rq->cpu_load[i]; | ||
2549 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
2550 | new_load = this_load; | ||
2551 | /* | ||
2552 | * Round up the averaging division if load is increasing. This | ||
2553 | * prevents us from getting stuck on 9 if the load is 10, for | ||
2554 | * example. | ||
2555 | */ | ||
2556 | if (new_load > old_load) | ||
2557 | new_load += scale - 1; | ||
2558 | |||
2559 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
2560 | } | ||
2561 | |||
2562 | sched_avg_update(this_rq); | ||
2563 | } | ||
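The "scale - 1" bump in the loop above is what lets cpu_load[i] converge upward; plain integer division would otherwise get stuck one below the target, exactly as the comment says. A two-line check, illustrative only:

#include <stdio.h>

int main(void)
{
        unsigned long old_load = 9, this_load = 10;
        int i = 1, scale = 2;                    /* cpu_load[1]: halves each tick */

        unsigned long no_round = (old_load * (scale - 1) + this_load) >> i;
        unsigned long rounded  = (old_load * (scale - 1) + this_load + scale - 1) >> i;

        printf("without round-up: %lu, with: %lu\n", no_round, rounded);  /* 9 vs 10 */
        return 0;
}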
2564 | |||
2565 | #ifdef CONFIG_NO_HZ_COMMON | ||
2566 | /* | ||
2567 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
2568 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
2569 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
2570 | * | ||
2571 | * Therefore we cannot use the delta approach from the regular tick since that | ||
2572 | * would seriously skew the load calculation. However we'll make do for those | ||
2573 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
2574 | * (tick_nohz_idle_exit). | ||
2575 | * | ||
2576 | * This means we might still be one tick off for nohz periods. | ||
2577 | */ | ||
2578 | |||
2579 | /* | ||
2580 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
2581 | * idle balance. | ||
2582 | */ | ||
2583 | void update_idle_cpu_load(struct rq *this_rq) | ||
2584 | { | ||
2585 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2586 | unsigned long load = this_rq->load.weight; | ||
2587 | unsigned long pending_updates; | ||
2588 | |||
2589 | /* | ||
2590 | * bail if there's load or we're actually up-to-date. | ||
2591 | */ | ||
2592 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
2593 | return; | ||
2594 | |||
2595 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2596 | this_rq->last_load_update_tick = curr_jiffies; | ||
2597 | |||
2598 | __update_cpu_load(this_rq, load, pending_updates); | ||
2599 | } | ||
2600 | |||
2601 | /* | ||
2602 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
2603 | */ | ||
2604 | void update_cpu_load_nohz(void) | ||
2605 | { | ||
2606 | struct rq *this_rq = this_rq(); | ||
2607 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2608 | unsigned long pending_updates; | ||
2609 | |||
2610 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2611 | return; | ||
2612 | |||
2613 | raw_spin_lock(&this_rq->lock); | ||
2614 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2615 | if (pending_updates) { | ||
2616 | this_rq->last_load_update_tick = curr_jiffies; | ||
2617 | /* | ||
2618 | * We were idle, this means load 0, the current load might be | ||
2619 | * !0 due to remote wakeups and the sort. | ||
2620 | */ | ||
2621 | __update_cpu_load(this_rq, 0, pending_updates); | ||
2622 | } | ||
2623 | raw_spin_unlock(&this_rq->lock); | ||
2624 | } | ||
2625 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
2626 | |||
2627 | /* | ||
2628 | * Called from scheduler_tick() | ||
2629 | */ | ||
2630 | static void update_cpu_load_active(struct rq *this_rq) | ||
2631 | { | ||
2632 | /* | ||
2633 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
2634 | */ | ||
2635 | this_rq->last_load_update_tick = jiffies; | ||
2636 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
2637 | |||
2638 | calc_load_account_active(this_rq); | ||
2639 | } | ||
2640 | |||
2641 | #ifdef CONFIG_SMP | 2067 | #ifdef CONFIG_SMP |
2642 | 2068 | ||
2643 | /* | 2069 | /* |
@@ -2686,7 +2112,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
2686 | 2112 | ||
2687 | if (task_current(rq, p)) { | 2113 | if (task_current(rq, p)) { |
2688 | update_rq_clock(rq); | 2114 | update_rq_clock(rq); |
2689 | ns = rq->clock_task - p->se.exec_start; | 2115 | ns = rq_clock_task(rq) - p->se.exec_start; |
2690 | if ((s64)ns < 0) | 2116 | if ((s64)ns < 0) |
2691 | ns = 0; | 2117 | ns = 0; |
2692 | } | 2118 | } |
@@ -2739,8 +2165,8 @@ void scheduler_tick(void) | |||
2739 | 2165 | ||
2740 | raw_spin_lock(&rq->lock); | 2166 | raw_spin_lock(&rq->lock); |
2741 | update_rq_clock(rq); | 2167 | update_rq_clock(rq); |
2742 | update_cpu_load_active(rq); | ||
2743 | curr->sched_class->task_tick(rq, curr, 0); | 2168 | curr->sched_class->task_tick(rq, curr, 0); |
2169 | update_cpu_load_active(rq); | ||
2744 | raw_spin_unlock(&rq->lock); | 2170 | raw_spin_unlock(&rq->lock); |
2745 | 2171 | ||
2746 | perf_event_task_tick(); | 2172 | perf_event_task_tick(); |
@@ -4960,6 +4386,13 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
4960 | */ | 4386 | */ |
4961 | rq->stop = NULL; | 4387 | rq->stop = NULL; |
4962 | 4388 | ||
4389 | /* | ||
4390 | * put_prev_task() and pick_next_task() sched | ||
4391 | * class method both need to have an up-to-date | ||
4392 | * value of rq->clock[_task] | ||
4393 | */ | ||
4394 | update_rq_clock(rq); | ||
4395 | |||
4963 | for ( ; ; ) { | 4396 | for ( ; ; ) { |
4964 | /* | 4397 | /* |
4965 | * There's this thread running, bail when that's the only | 4398 | * There's this thread running, bail when that's the only |
@@ -5093,7 +4526,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
5093 | return table; | 4526 | return table; |
5094 | } | 4527 | } |
5095 | 4528 | ||
5096 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | 4529 | static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) |
5097 | { | 4530 | { |
5098 | struct ctl_table *entry, *table; | 4531 | struct ctl_table *entry, *table; |
5099 | struct sched_domain *sd; | 4532 | struct sched_domain *sd; |
@@ -5907,7 +5340,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
5907 | get_group(cpu, sdd, &sd->groups); | 5340 | get_group(cpu, sdd, &sd->groups); |
5908 | atomic_inc(&sd->groups->ref); | 5341 | atomic_inc(&sd->groups->ref); |
5909 | 5342 | ||
5910 | if (cpu != cpumask_first(sched_domain_span(sd))) | 5343 | if (cpu != cpumask_first(span)) |
5911 | return 0; | 5344 | return 0; |
5912 | 5345 | ||
5913 | lockdep_assert_held(&sched_domains_mutex); | 5346 | lockdep_assert_held(&sched_domains_mutex); |
@@ -5917,12 +5350,12 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
5917 | 5350 | ||
5918 | for_each_cpu(i, span) { | 5351 | for_each_cpu(i, span) { |
5919 | struct sched_group *sg; | 5352 | struct sched_group *sg; |
5920 | int group = get_group(i, sdd, &sg); | 5353 | int group, j; |
5921 | int j; | ||
5922 | 5354 | ||
5923 | if (cpumask_test_cpu(i, covered)) | 5355 | if (cpumask_test_cpu(i, covered)) |
5924 | continue; | 5356 | continue; |
5925 | 5357 | ||
5358 | group = get_group(i, sdd, &sg); | ||
5926 | cpumask_clear(sched_group_cpus(sg)); | 5359 | cpumask_clear(sched_group_cpus(sg)); |
5927 | sg->sgp->power = 0; | 5360 | sg->sgp->power = 0; |
5928 | cpumask_setall(sched_group_mask(sg)); | 5361 | cpumask_setall(sched_group_mask(sg)); |
@@ -5960,7 +5393,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
5960 | { | 5393 | { |
5961 | struct sched_group *sg = sd->groups; | 5394 | struct sched_group *sg = sd->groups; |
5962 | 5395 | ||
5963 | WARN_ON(!sd || !sg); | 5396 | WARN_ON(!sg); |
5964 | 5397 | ||
5965 | do { | 5398 | do { |
5966 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | 5399 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); |
@@ -6125,6 +5558,9 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6125 | 5558 | ||
6126 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 5559 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
6127 | 5560 | ||
5561 | #define for_each_sd_topology(tl) \ | ||
5562 | for (tl = sched_domain_topology; tl->init; tl++) | ||
5563 | |||
6128 | #ifdef CONFIG_NUMA | 5564 | #ifdef CONFIG_NUMA |
6129 | 5565 | ||
6130 | static int sched_domains_numa_levels; | 5566 | static int sched_domains_numa_levels; |
@@ -6422,7 +5858,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6422 | struct sched_domain_topology_level *tl; | 5858 | struct sched_domain_topology_level *tl; |
6423 | int j; | 5859 | int j; |
6424 | 5860 | ||
6425 | for (tl = sched_domain_topology; tl->init; tl++) { | 5861 | for_each_sd_topology(tl) { |
6426 | struct sd_data *sdd = &tl->data; | 5862 | struct sd_data *sdd = &tl->data; |
6427 | 5863 | ||
6428 | sdd->sd = alloc_percpu(struct sched_domain *); | 5864 | sdd->sd = alloc_percpu(struct sched_domain *); |
@@ -6475,7 +5911,7 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6475 | struct sched_domain_topology_level *tl; | 5911 | struct sched_domain_topology_level *tl; |
6476 | int j; | 5912 | int j; |
6477 | 5913 | ||
6478 | for (tl = sched_domain_topology; tl->init; tl++) { | 5914 | for_each_sd_topology(tl) { |
6479 | struct sd_data *sdd = &tl->data; | 5915 | struct sd_data *sdd = &tl->data; |
6480 | 5916 | ||
6481 | for_each_cpu(j, cpu_map) { | 5917 | for_each_cpu(j, cpu_map) { |
@@ -6503,9 +5939,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6503 | } | 5939 | } |
6504 | 5940 | ||
6505 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | 5941 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, |
6506 | struct s_data *d, const struct cpumask *cpu_map, | 5942 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
6507 | struct sched_domain_attr *attr, struct sched_domain *child, | 5943 | struct sched_domain *child, int cpu) |
6508 | int cpu) | ||
6509 | { | 5944 | { |
6510 | struct sched_domain *sd = tl->init(tl, cpu); | 5945 | struct sched_domain *sd = tl->init(tl, cpu); |
6511 | if (!sd) | 5946 | if (!sd) |
@@ -6516,8 +5951,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6516 | sd->level = child->level + 1; | 5951 | sd->level = child->level + 1; |
6517 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | 5952 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
6518 | child->parent = sd; | 5953 | child->parent = sd; |
5954 | sd->child = child; | ||
6519 | } | 5955 | } |
6520 | sd->child = child; | ||
6521 | set_domain_attribute(sd, attr); | 5956 | set_domain_attribute(sd, attr); |
6522 | 5957 | ||
6523 | return sd; | 5958 | return sd; |
@@ -6530,7 +5965,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6530 | static int build_sched_domains(const struct cpumask *cpu_map, | 5965 | static int build_sched_domains(const struct cpumask *cpu_map, |
6531 | struct sched_domain_attr *attr) | 5966 | struct sched_domain_attr *attr) |
6532 | { | 5967 | { |
6533 | enum s_alloc alloc_state = sa_none; | 5968 | enum s_alloc alloc_state; |
6534 | struct sched_domain *sd; | 5969 | struct sched_domain *sd; |
6535 | struct s_data d; | 5970 | struct s_data d; |
6536 | int i, ret = -ENOMEM; | 5971 | int i, ret = -ENOMEM; |
@@ -6544,18 +5979,15 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
6544 | struct sched_domain_topology_level *tl; | 5979 | struct sched_domain_topology_level *tl; |
6545 | 5980 | ||
6546 | sd = NULL; | 5981 | sd = NULL; |
6547 | for (tl = sched_domain_topology; tl->init; tl++) { | 5982 | for_each_sd_topology(tl) { |
6548 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); | 5983 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); |
5984 | if (tl == sched_domain_topology) | ||
5985 | *per_cpu_ptr(d.sd, i) = sd; | ||
6549 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | 5986 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) |
6550 | sd->flags |= SD_OVERLAP; | 5987 | sd->flags |= SD_OVERLAP; |
6551 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | 5988 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) |
6552 | break; | 5989 | break; |
6553 | } | 5990 | } |
6554 | |||
6555 | while (sd->child) | ||
6556 | sd = sd->child; | ||
6557 | |||
6558 | *per_cpu_ptr(d.sd, i) = sd; | ||
6559 | } | 5991 | } |
6560 | 5992 | ||
6561 | /* Build the groups for the domains */ | 5993 | /* Build the groups for the domains */ |
@@ -6867,9 +6299,6 @@ void __init sched_init_smp(void) | |||
6867 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); | 6299 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
6868 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); | 6300 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); |
6869 | 6301 | ||
6870 | /* RT runtime code needs to handle some hotplug events */ | ||
6871 | hotcpu_notifier(update_runtime, 0); | ||
6872 | |||
6873 | init_hrtick(); | 6302 | init_hrtick(); |
6874 | 6303 | ||
6875 | /* Move init over to a non-isolated CPU */ | 6304 | /* Move init over to a non-isolated CPU */ |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b5ccba22603b..a7959e05a9d5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
515 | 515 | ||
516 | for (;;) { | 516 | for (;;) { |
517 | /* Make sure "rtime" is the bigger of stime/rtime */ | 517 | /* Make sure "rtime" is the bigger of stime/rtime */ |
518 | if (stime > rtime) { | 518 | if (stime > rtime) |
519 | u64 tmp = rtime; rtime = stime; stime = tmp; | 519 | swap(rtime, stime); |
520 | } | ||
521 | 520 | ||
522 | /* Make sure 'total' fits in 32 bits */ | 521 | /* Make sure 'total' fits in 32 bits */ |
523 | if (total >> 32) | 522 | if (total >> 32) |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a673520..e076bddd4c66 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
209 | cfs_rq->nr_spread_over); | 209 | cfs_rq->nr_spread_over); |
210 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); | 210 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); |
211 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 211 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
212 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
213 | #ifdef CONFIG_SMP | 212 | #ifdef CONFIG_SMP |
214 | SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", | 213 | SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", |
215 | cfs_rq->runnable_load_avg); | 214 | cfs_rq->runnable_load_avg); |
216 | SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", | 215 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", |
217 | cfs_rq->blocked_load_avg); | 216 | cfs_rq->blocked_load_avg); |
218 | SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", | 217 | #ifdef CONFIG_FAIR_GROUP_SCHED |
219 | (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); | 218 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", |
220 | SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", | ||
221 | cfs_rq->tg_load_contrib); | 219 | cfs_rq->tg_load_contrib); |
222 | SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", | 220 | SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", |
223 | cfs_rq->tg_runnable_contrib); | 221 | cfs_rq->tg_runnable_contrib); |
222 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", | ||
223 | atomic_long_read(&cfs_rq->tg->load_avg)); | ||
224 | SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", | 224 | SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", |
225 | atomic_read(&cfs_rq->tg->runnable_avg)); | 225 | atomic_read(&cfs_rq->tg->runnable_avg)); |
226 | #endif | 226 | #endif |
227 | #endif | ||
227 | 228 | ||
229 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
228 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 230 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
229 | #endif | 231 | #endif |
230 | } | 232 | } |
@@ -493,15 +495,16 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
493 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, | 495 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, |
494 | get_nr_threads(p)); | 496 | get_nr_threads(p)); |
495 | SEQ_printf(m, | 497 | SEQ_printf(m, |
496 | "---------------------------------------------------------\n"); | 498 | "---------------------------------------------------------" |
499 | "----------\n"); | ||
497 | #define __P(F) \ | 500 | #define __P(F) \ |
498 | SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) | 501 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) |
499 | #define P(F) \ | 502 | #define P(F) \ |
500 | SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) | 503 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) |
501 | #define __PN(F) \ | 504 | #define __PN(F) \ |
502 | SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | 505 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) |
503 | #define PN(F) \ | 506 | #define PN(F) \ |
504 | SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | 507 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) |
505 | 508 | ||
506 | PN(se.exec_start); | 509 | PN(se.exec_start); |
507 | PN(se.vruntime); | 510 | PN(se.vruntime); |
@@ -560,12 +563,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
560 | } | 563 | } |
561 | #endif | 564 | #endif |
562 | __P(nr_switches); | 565 | __P(nr_switches); |
563 | SEQ_printf(m, "%-35s:%21Ld\n", | 566 | SEQ_printf(m, "%-45s:%21Ld\n", |
564 | "nr_voluntary_switches", (long long)p->nvcsw); | 567 | "nr_voluntary_switches", (long long)p->nvcsw); |
565 | SEQ_printf(m, "%-35s:%21Ld\n", | 568 | SEQ_printf(m, "%-45s:%21Ld\n", |
566 | "nr_involuntary_switches", (long long)p->nivcsw); | 569 | "nr_involuntary_switches", (long long)p->nivcsw); |
567 | 570 | ||
568 | P(se.load.weight); | 571 | P(se.load.weight); |
572 | #ifdef CONFIG_SMP | ||
573 | P(se.avg.runnable_avg_sum); | ||
574 | P(se.avg.runnable_avg_period); | ||
575 | P(se.avg.load_avg_contrib); | ||
576 | P(se.avg.decay_count); | ||
577 | #endif | ||
569 | P(policy); | 578 | P(policy); |
570 | P(prio); | 579 | P(prio); |
571 | #undef PN | 580 | #undef PN |
@@ -579,7 +588,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
579 | 588 | ||
580 | t0 = cpu_clock(this_cpu); | 589 | t0 = cpu_clock(this_cpu); |
581 | t1 = cpu_clock(this_cpu); | 590 | t1 = cpu_clock(this_cpu); |
582 | SEQ_printf(m, "%-35s:%21Ld\n", | 591 | SEQ_printf(m, "%-45s:%21Ld\n", |
583 | "clock-delta", (long long)(t1-t0)); | 592 | "clock-delta", (long long)(t1-t0)); |
584 | } | 593 | } |
585 | } | 594 | } |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614465c8..f77f9c527449 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
113 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 113 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
114 | #endif | 114 | #endif |
115 | 115 | ||
116 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
117 | { | ||
118 | lw->weight += inc; | ||
119 | lw->inv_weight = 0; | ||
120 | } | ||
121 | |||
122 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
123 | { | ||
124 | lw->weight -= dec; | ||
125 | lw->inv_weight = 0; | ||
126 | } | ||
127 | |||
128 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
129 | { | ||
130 | lw->weight = w; | ||
131 | lw->inv_weight = 0; | ||
132 | } | ||
133 | |||
116 | /* | 134 | /* |
117 | * Increase the granularity value when there are more CPUs, | 135 | * Increase the granularity value when there are more CPUs, |
118 | * because with more CPUs the 'effective latency' as visible | 136 | * because with more CPUs the 'effective latency' as visible |
@@ -662,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
662 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 680 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
663 | } | 681 | } |
664 | 682 | ||
683 | #ifdef CONFIG_SMP | ||
684 | static inline void __update_task_entity_contrib(struct sched_entity *se); | ||
685 | |||
686 | /* Give new task start runnable values to heavy its load in infant time */ | ||
687 | void init_task_runnable_average(struct task_struct *p) | ||
688 | { | ||
689 | u32 slice; | ||
690 | |||
691 | p->se.avg.decay_count = 0; | ||
692 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | ||
693 | p->se.avg.runnable_avg_sum = slice; | ||
694 | p->se.avg.runnable_avg_period = slice; | ||
695 | __update_task_entity_contrib(&p->se); | ||
696 | } | ||
697 | #else | ||
698 | void init_task_runnable_average(struct task_struct *p) | ||
699 | { | ||
700 | } | ||
701 | #endif | ||
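The new init_task_runnable_average() seeds a freshly forked task with runnable_avg_sum == runnable_avg_period, so the child starts out looking 100% runnable for one scheduling slice and contributes its full weight immediately instead of zero. The ">> 10" converts the slice from nanoseconds into the ~1us units the averages are kept in — that unit is an assumption inferred from the shift. Roughly:

#include <stdio.h>

int main(void)
{
        unsigned long long slice_ns = 6000000ULL;            /* assume a 6ms slice */
        unsigned int seed = (unsigned int)(slice_ns >> 10);   /* ~5859 "1us" units */

        /* sum == period means the brand-new task is treated as fully runnable,
         * so its first load_avg_contrib equals its weight rather than 0 */
        printf("runnable_avg_sum = runnable_avg_period = %u\n", seed);
        return 0;
}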
702 | |||
665 | /* | 703 | /* |
666 | * Update the current task's runtime statistics. Skip current tasks that | 704 | * Update the current task's runtime statistics. Skip current tasks that |
667 | * are not in our scheduling class. | 705 | * are not in our scheduling class. |
@@ -686,7 +724,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
686 | static void update_curr(struct cfs_rq *cfs_rq) | 724 | static void update_curr(struct cfs_rq *cfs_rq) |
687 | { | 725 | { |
688 | struct sched_entity *curr = cfs_rq->curr; | 726 | struct sched_entity *curr = cfs_rq->curr; |
689 | u64 now = rq_of(cfs_rq)->clock_task; | 727 | u64 now = rq_clock_task(rq_of(cfs_rq)); |
690 | unsigned long delta_exec; | 728 | unsigned long delta_exec; |
691 | 729 | ||
692 | if (unlikely(!curr)) | 730 | if (unlikely(!curr)) |
@@ -718,7 +756,7 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
718 | static inline void | 756 | static inline void |
719 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 757 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
720 | { | 758 | { |
721 | schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); | 759 | schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); |
722 | } | 760 | } |
723 | 761 | ||
724 | /* | 762 | /* |
@@ -738,14 +776,14 @@ static void | |||
738 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 776 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
739 | { | 777 | { |
740 | schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, | 778 | schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, |
741 | rq_of(cfs_rq)->clock - se->statistics.wait_start)); | 779 | rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); |
742 | schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); | 780 | schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); |
743 | schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + | 781 | schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + |
744 | rq_of(cfs_rq)->clock - se->statistics.wait_start); | 782 | rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); |
745 | #ifdef CONFIG_SCHEDSTATS | 783 | #ifdef CONFIG_SCHEDSTATS |
746 | if (entity_is_task(se)) { | 784 | if (entity_is_task(se)) { |
747 | trace_sched_stat_wait(task_of(se), | 785 | trace_sched_stat_wait(task_of(se), |
748 | rq_of(cfs_rq)->clock - se->statistics.wait_start); | 786 | rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); |
749 | } | 787 | } |
750 | #endif | 788 | #endif |
751 | schedstat_set(se->statistics.wait_start, 0); | 789 | schedstat_set(se->statistics.wait_start, 0); |
@@ -771,7 +809,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
771 | /* | 809 | /* |
772 | * We are starting a new run period: | 810 | * We are starting a new run period: |
773 | */ | 811 | */ |
774 | se->exec_start = rq_of(cfs_rq)->clock_task; | 812 | se->exec_start = rq_clock_task(rq_of(cfs_rq)); |
775 | } | 813 | } |
776 | 814 | ||
777 | /************************************************** | 815 | /************************************************** |
@@ -1037,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | |||
1037 | * to gain a more accurate current total weight. See | 1075 | * to gain a more accurate current total weight. See |
1038 | * update_cfs_rq_load_contribution(). | 1076 | * update_cfs_rq_load_contribution(). |
1039 | */ | 1077 | */ |
1040 | tg_weight = atomic64_read(&tg->load_avg); | 1078 | tg_weight = atomic_long_read(&tg->load_avg); |
1041 | tg_weight -= cfs_rq->tg_load_contrib; | 1079 | tg_weight -= cfs_rq->tg_load_contrib; |
1042 | tg_weight += cfs_rq->load.weight; | 1080 | tg_weight += cfs_rq->load.weight; |
1043 | 1081 | ||
@@ -1110,8 +1148,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
1110 | } | 1148 | } |
1111 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 1149 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
1112 | 1150 | ||
1113 | /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ | 1151 | #ifdef CONFIG_SMP |
1114 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
1115 | /* | 1152 | /* |
1116 | * We choose a half-life close to 1 scheduling period. | 1153 | * We choose a half-life close to 1 scheduling period. |
1117 | * Note: The tables below are dependent on this value. | 1154 | * Note: The tables below are dependent on this value. |
@@ -1319,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | |||
1319 | int force_update) | 1356 | int force_update) |
1320 | { | 1357 | { |
1321 | struct task_group *tg = cfs_rq->tg; | 1358 | struct task_group *tg = cfs_rq->tg; |
1322 | s64 tg_contrib; | 1359 | long tg_contrib; |
1323 | 1360 | ||
1324 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | 1361 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; |
1325 | tg_contrib -= cfs_rq->tg_load_contrib; | 1362 | tg_contrib -= cfs_rq->tg_load_contrib; |
1326 | 1363 | ||
1327 | if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | 1364 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { |
1328 | atomic64_add(tg_contrib, &tg->load_avg); | 1365 | atomic_long_add(tg_contrib, &tg->load_avg); |
1329 | cfs_rq->tg_load_contrib += tg_contrib; | 1366 | cfs_rq->tg_load_contrib += tg_contrib; |
1330 | } | 1367 | } |
1331 | } | 1368 | } |
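In the hunk above, a cfs_rq only folds its delta into the shared tg->load_avg counter once the drift exceeds 1/8 of what it last published, batching writes to a cacheline touched by every cpu in the group. A toy model of that filter, with made-up numbers:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        long tg_load_contrib = 800;           /* what this cfs_rq last folded in */
        long current_load    = 920;           /* runnable + blocked load now */
        long tg_contrib      = current_load - tg_load_contrib;

        if (labs(tg_contrib) > tg_load_contrib / 8) {
                /* kernel would do atomic_long_add(tg_contrib, &tg->load_avg) */
                tg_load_contrib += tg_contrib;
                printf("published delta %ld, contrib now %ld\n",
                       tg_contrib, tg_load_contrib);
        } else {
                printf("delta %ld below the 1/8 threshold, skipped\n", tg_contrib);
        }
        return 0;
}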
@@ -1360,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
1360 | u64 contrib; | 1397 | u64 contrib; |
1361 | 1398 | ||
1362 | contrib = cfs_rq->tg_load_contrib * tg->shares; | 1399 | contrib = cfs_rq->tg_load_contrib * tg->shares; |
1363 | se->avg.load_avg_contrib = div64_u64(contrib, | 1400 | se->avg.load_avg_contrib = div_u64(contrib, |
1364 | atomic64_read(&tg->load_avg) + 1); | 1401 | atomic_long_read(&tg->load_avg) + 1); |
1365 | 1402 | ||
1366 | /* | 1403 | /* |
1367 | * For group entities we need to compute a correction term in the case | 1404 | * For group entities we need to compute a correction term in the case |
@@ -1480,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) | |||
1480 | if (!decays && !force_update) | 1517 | if (!decays && !force_update) |
1481 | return; | 1518 | return; |
1482 | 1519 | ||
1483 | if (atomic64_read(&cfs_rq->removed_load)) { | 1520 | if (atomic_long_read(&cfs_rq->removed_load)) { |
1484 | u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); | 1521 | unsigned long removed_load; |
1522 | removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); | ||
1485 | subtract_blocked_load_contrib(cfs_rq, removed_load); | 1523 | subtract_blocked_load_contrib(cfs_rq, removed_load); |
1486 | } | 1524 | } |
1487 | 1525 | ||
@@ -1497,7 +1535,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) | |||
1497 | 1535 | ||
1498 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 1536 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) |
1499 | { | 1537 | { |
1500 | __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); | 1538 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); |
1501 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 1539 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); |
1502 | } | 1540 | } |
1503 | 1541 | ||
@@ -1510,9 +1548,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
1510 | * We track migrations using entity decay_count <= 0, on a wake-up | 1548 | * We track migrations using entity decay_count <= 0, on a wake-up |
1511 | * migration we use a negative decay count to track the remote decays | 1549 | * migration we use a negative decay count to track the remote decays |
1512 | * accumulated while sleeping. | 1550 | * accumulated while sleeping. |
1551 | * | ||
1552 | * Newly forked tasks are enqueued with se->avg.decay_count == 0, they | ||
1553 | * are seen by enqueue_entity_load_avg() as a migration with an already | ||
1554 | * constructed load_avg_contrib. | ||
1513 | */ | 1555 | */ |
1514 | if (unlikely(se->avg.decay_count <= 0)) { | 1556 | if (unlikely(se->avg.decay_count <= 0)) { |
1515 | se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; | 1557 | se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); |
1516 | if (se->avg.decay_count) { | 1558 | if (se->avg.decay_count) { |
1517 | /* | 1559 | /* |
1518 | * In a wake-up migration we have to approximate the | 1560 | * In a wake-up migration we have to approximate the |
@@ -1530,7 +1572,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
1530 | } | 1572 | } |
1531 | wakeup = 0; | 1573 | wakeup = 0; |
1532 | } else { | 1574 | } else { |
1533 | __synchronize_entity_decay(se); | 1575 | /* |
1576 | * Task re-woke on same cpu (or else migrate_task_rq_fair() | ||
1577 | * would have made count negative); we must be careful to avoid | ||
1578 | * double-accounting blocked time after synchronizing decays. | ||
1579 | */ | ||
1580 | se->avg.last_runnable_update += __synchronize_entity_decay(se) | ||
1581 | << 20; | ||
1534 | } | 1582 | } |
1535 | 1583 | ||
1536 | /* migrated tasks did not contribute to our blocked load */ | 1584 | /* migrated tasks did not contribute to our blocked load */ |
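__synchronize_entity_decay() returns how many whole decay periods were applied while the task slept; advancing last_runnable_update by that count shifted left by 20 credits the same span back in nanoseconds, so the already-decayed interval is not charged a second time on the next update. Treating one decay period as 2^20 ns (~1.05 ms) is an assumption consistent with the shift, not something shown in this hunk. For scale:

#include <stdio.h>

int main(void)
{
        unsigned long long decays = 3;             /* periods already decayed */
        unsigned long long ns = decays << 20;      /* assumed 2^20 ns per period */

        printf("%llu decay periods ~= %llu ns (%.2f ms)\n",
               decays, ns, ns / 1e6);
        return 0;
}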
@@ -1607,7 +1655,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1607 | tsk = task_of(se); | 1655 | tsk = task_of(se); |
1608 | 1656 | ||
1609 | if (se->statistics.sleep_start) { | 1657 | if (se->statistics.sleep_start) { |
1610 | u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; | 1658 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; |
1611 | 1659 | ||
1612 | if ((s64)delta < 0) | 1660 | if ((s64)delta < 0) |
1613 | delta = 0; | 1661 | delta = 0; |
@@ -1624,7 +1672,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1624 | } | 1672 | } |
1625 | } | 1673 | } |
1626 | if (se->statistics.block_start) { | 1674 | if (se->statistics.block_start) { |
1627 | u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; | 1675 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; |
1628 | 1676 | ||
1629 | if ((s64)delta < 0) | 1677 | if ((s64)delta < 0) |
1630 | delta = 0; | 1678 | delta = 0; |
@@ -1712,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1712 | { | 1760 | { |
1713 | /* | 1761 | /* |
1714 | * Update the normalized vruntime before updating min_vruntime | 1762 | * Update the normalized vruntime before updating min_vruntime |
1715 | * through callig update_curr(). | 1763 | * through calling update_curr(). |
1716 | */ | 1764 | */ |
1717 | if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) | 1765 | if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) |
1718 | se->vruntime += cfs_rq->min_vruntime; | 1766 | se->vruntime += cfs_rq->min_vruntime; |
@@ -1805,9 +1853,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1805 | struct task_struct *tsk = task_of(se); | 1853 | struct task_struct *tsk = task_of(se); |
1806 | 1854 | ||
1807 | if (tsk->state & TASK_INTERRUPTIBLE) | 1855 | if (tsk->state & TASK_INTERRUPTIBLE) |
1808 | se->statistics.sleep_start = rq_of(cfs_rq)->clock; | 1856 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); |
1809 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 1857 | if (tsk->state & TASK_UNINTERRUPTIBLE) |
1810 | se->statistics.block_start = rq_of(cfs_rq)->clock; | 1858 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); |
1811 | } | 1859 | } |
1812 | #endif | 1860 | #endif |
1813 | } | 1861 | } |
@@ -2082,7 +2130,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | |||
2082 | if (unlikely(cfs_rq->throttle_count)) | 2130 | if (unlikely(cfs_rq->throttle_count)) |
2083 | return cfs_rq->throttled_clock_task; | 2131 | return cfs_rq->throttled_clock_task; |
2084 | 2132 | ||
2085 | return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; | 2133 | return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; |
2086 | } | 2134 | } |
2087 | 2135 | ||
2088 | /* returns 0 on failure to allocate runtime */ | 2136 | /* returns 0 on failure to allocate runtime */ |
@@ -2138,10 +2186,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
2138 | static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 2186 | static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
2139 | { | 2187 | { |
2140 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | 2188 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); |
2141 | struct rq *rq = rq_of(cfs_rq); | ||
2142 | 2189 | ||
2143 | /* if the deadline is ahead of our clock, nothing to do */ | 2190 | /* if the deadline is ahead of our clock, nothing to do */ |
2144 | if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) | 2191 | if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) |
2145 | return; | 2192 | return; |
2146 | 2193 | ||
2147 | if (cfs_rq->runtime_remaining < 0) | 2194 | if (cfs_rq->runtime_remaining < 0) |
@@ -2230,7 +2277,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) | |||
2230 | #ifdef CONFIG_SMP | 2277 | #ifdef CONFIG_SMP |
2231 | if (!cfs_rq->throttle_count) { | 2278 | if (!cfs_rq->throttle_count) { |
2232 | /* adjust cfs_rq_clock_task() */ | 2279 | /* adjust cfs_rq_clock_task() */ |
2233 | cfs_rq->throttled_clock_task_time += rq->clock_task - | 2280 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - |
2234 | cfs_rq->throttled_clock_task; | 2281 | cfs_rq->throttled_clock_task; |
2235 | } | 2282 | } |
2236 | #endif | 2283 | #endif |
@@ -2245,7 +2292,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) | |||
2245 | 2292 | ||
2246 | /* group is entering throttled state, stop time */ | 2293 | /* group is entering throttled state, stop time */ |
2247 | if (!cfs_rq->throttle_count) | 2294 | if (!cfs_rq->throttle_count) |
2248 | cfs_rq->throttled_clock_task = rq->clock_task; | 2295 | cfs_rq->throttled_clock_task = rq_clock_task(rq); |
2249 | cfs_rq->throttle_count++; | 2296 | cfs_rq->throttle_count++; |
2250 | 2297 | ||
2251 | return 0; | 2298 | return 0; |
@@ -2284,7 +2331,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
2284 | rq->nr_running -= task_delta; | 2331 | rq->nr_running -= task_delta; |
2285 | 2332 | ||
2286 | cfs_rq->throttled = 1; | 2333 | cfs_rq->throttled = 1; |
2287 | cfs_rq->throttled_clock = rq->clock; | 2334 | cfs_rq->throttled_clock = rq_clock(rq); |
2288 | raw_spin_lock(&cfs_b->lock); | 2335 | raw_spin_lock(&cfs_b->lock); |
2289 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 2336 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
2290 | raw_spin_unlock(&cfs_b->lock); | 2337 | raw_spin_unlock(&cfs_b->lock); |
@@ -2298,15 +2345,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
2298 | int enqueue = 1; | 2345 | int enqueue = 1; |
2299 | long task_delta; | 2346 | long task_delta; |
2300 | 2347 | ||
2301 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | 2348 | se = cfs_rq->tg->se[cpu_of(rq)]; |
2302 | 2349 | ||
2303 | cfs_rq->throttled = 0; | 2350 | cfs_rq->throttled = 0; |
2351 | |||
2352 | update_rq_clock(rq); | ||
2353 | |||
2304 | raw_spin_lock(&cfs_b->lock); | 2354 | raw_spin_lock(&cfs_b->lock); |
2305 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; | 2355 | cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; |
2306 | list_del_rcu(&cfs_rq->throttled_list); | 2356 | list_del_rcu(&cfs_rq->throttled_list); |
2307 | raw_spin_unlock(&cfs_b->lock); | 2357 | raw_spin_unlock(&cfs_b->lock); |
2308 | 2358 | ||
2309 | update_rq_clock(rq); | ||
2310 | /* update hierarchical throttle state */ | 2359 | /* update hierarchical throttle state */ |
2311 | walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); | 2360 | walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); |
2312 | 2361 | ||
@@ -2599,10 +2648,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
2599 | throttle_cfs_rq(cfs_rq); | 2648 | throttle_cfs_rq(cfs_rq); |
2600 | } | 2649 | } |
2601 | 2650 | ||
2602 | static inline u64 default_cfs_period(void); | ||
2603 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
2604 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
2605 | |||
2606 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | 2651 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) |
2607 | { | 2652 | { |
2608 | struct cfs_bandwidth *cfs_b = | 2653 | struct cfs_bandwidth *cfs_b = |
@@ -2706,7 +2751,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
2706 | #else /* CONFIG_CFS_BANDWIDTH */ | 2751 | #else /* CONFIG_CFS_BANDWIDTH */ |
2707 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | 2752 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) |
2708 | { | 2753 | { |
2709 | return rq_of(cfs_rq)->clock_task; | 2754 | return rq_clock_task(rq_of(cfs_rq)); |
2710 | } | 2755 | } |
2711 | 2756 | ||
2712 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 2757 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, |
@@ -2919,7 +2964,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2919 | /* Used instead of source_load when we know the type == 0 */ | 2964 | /* Used instead of source_load when we know the type == 0 */ |
2920 | static unsigned long weighted_cpuload(const int cpu) | 2965 | static unsigned long weighted_cpuload(const int cpu) |
2921 | { | 2966 | { |
2922 | return cpu_rq(cpu)->load.weight; | 2967 | return cpu_rq(cpu)->cfs.runnable_load_avg; |
2923 | } | 2968 | } |
2924 | 2969 | ||
2925 | /* | 2970 | /* |
@@ -2964,9 +3009,10 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
2964 | { | 3009 | { |
2965 | struct rq *rq = cpu_rq(cpu); | 3010 | struct rq *rq = cpu_rq(cpu); |
2966 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 3011 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); |
3012 | unsigned long load_avg = rq->cfs.runnable_load_avg; | ||
2967 | 3013 | ||
2968 | if (nr_running) | 3014 | if (nr_running) |
2969 | return rq->load.weight / nr_running; | 3015 | return load_avg / nr_running; |
2970 | 3016 | ||
2971 | return 0; | 3017 | return 0; |
2972 | } | 3018 | } |
@@ -3416,12 +3462,6 @@ unlock: | |||
3416 | } | 3462 | } |
3417 | 3463 | ||
3418 | /* | 3464 | /* |
3419 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
3420 | * removed when useful for applications beyond shares distribution (e.g. | ||
3421 | * load-balance). | ||
3422 | */ | ||
3423 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
3424 | /* | ||
3425 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | 3465 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and |
3426 | * cfs_rq_of(p) references at time of call are still valid and identify the | 3466 | * cfs_rq_of(p) references at time of call are still valid and identify the |
3427 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no | 3467 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no |
@@ -3441,10 +3481,10 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) | |||
3441 | */ | 3481 | */ |
3442 | if (se->avg.decay_count) { | 3482 | if (se->avg.decay_count) { |
3443 | se->avg.decay_count = -__synchronize_entity_decay(se); | 3483 | se->avg.decay_count = -__synchronize_entity_decay(se); |
3444 | atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); | 3484 | atomic_long_add(se->avg.load_avg_contrib, |
3485 | &cfs_rq->removed_load); | ||
3445 | } | 3486 | } |
3446 | } | 3487 | } |
3447 | #endif | ||
3448 | #endif /* CONFIG_SMP */ | 3488 | #endif /* CONFIG_SMP */ |
3449 | 3489 | ||
3450 | static unsigned long | 3490 | static unsigned long |
@@ -3946,7 +3986,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3946 | * 2) too many balance attempts have failed. | 3986 | * 2) too many balance attempts have failed. |
3947 | */ | 3987 | */ |
3948 | 3988 | ||
3949 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); | 3989 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); |
3950 | if (!tsk_cache_hot || | 3990 | if (!tsk_cache_hot || |
3951 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 3991 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
3952 | 3992 | ||
@@ -4141,11 +4181,11 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
4141 | long cpu = (long)data; | 4181 | long cpu = (long)data; |
4142 | 4182 | ||
4143 | if (!tg->parent) { | 4183 | if (!tg->parent) { |
4144 | load = cpu_rq(cpu)->load.weight; | 4184 | load = cpu_rq(cpu)->avg.load_avg_contrib; |
4145 | } else { | 4185 | } else { |
4146 | load = tg->parent->cfs_rq[cpu]->h_load; | 4186 | load = tg->parent->cfs_rq[cpu]->h_load; |
4147 | load *= tg->se[cpu]->load.weight; | 4187 | load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, |
4148 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 4188 | tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); |
4149 | } | 4189 | } |
4150 | 4190 | ||
4151 | tg->cfs_rq[cpu]->h_load = load; | 4191 | tg->cfs_rq[cpu]->h_load = load; |
@@ -4171,12 +4211,9 @@ static void update_h_load(long cpu) | |||
4171 | static unsigned long task_h_load(struct task_struct *p) | 4211 | static unsigned long task_h_load(struct task_struct *p) |
4172 | { | 4212 | { |
4173 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 4213 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
4174 | unsigned long load; | ||
4175 | |||
4176 | load = p->se.load.weight; | ||
4177 | load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); | ||
4178 | 4214 | ||
4179 | return load; | 4215 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, |
4216 | cfs_rq->runnable_load_avg + 1); | ||
4180 | } | 4217 | } |
4181 | #else | 4218 | #else |
4182 | static inline void update_blocked_averages(int cpu) | 4219 | static inline void update_blocked_averages(int cpu) |
@@ -4189,7 +4226,7 @@ static inline void update_h_load(long cpu) | |||
4189 | 4226 | ||
4190 | static unsigned long task_h_load(struct task_struct *p) | 4227 | static unsigned long task_h_load(struct task_struct *p) |
4191 | { | 4228 | { |
4192 | return p->se.load.weight; | 4229 | return p->se.avg.load_avg_contrib; |
4193 | } | 4230 | } |
4194 | #endif | 4231 | #endif |
4195 | 4232 | ||
@@ -4302,7 +4339,7 @@ static unsigned long scale_rt_power(int cpu) | |||
4302 | age_stamp = ACCESS_ONCE(rq->age_stamp); | 4339 | age_stamp = ACCESS_ONCE(rq->age_stamp); |
4303 | avg = ACCESS_ONCE(rq->rt_avg); | 4340 | avg = ACCESS_ONCE(rq->rt_avg); |
4304 | 4341 | ||
4305 | total = sched_avg_period() + (rq->clock - age_stamp); | 4342 | total = sched_avg_period() + (rq_clock(rq) - age_stamp); |
4306 | 4343 | ||
4307 | if (unlikely(total < avg)) { | 4344 | if (unlikely(total < avg)) { |
4308 | /* Ensures that power won't end up being negative */ | 4345 | /* Ensures that power won't end up being negative */ |
@@ -5241,7 +5278,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5241 | int pulled_task = 0; | 5278 | int pulled_task = 0; |
5242 | unsigned long next_balance = jiffies + HZ; | 5279 | unsigned long next_balance = jiffies + HZ; |
5243 | 5280 | ||
5244 | this_rq->idle_stamp = this_rq->clock; | 5281 | this_rq->idle_stamp = rq_clock(this_rq); |
5245 | 5282 | ||
5246 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 5283 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
5247 | return; | 5284 | return; |
@@ -5418,10 +5455,9 @@ static inline void nohz_balance_exit_idle(int cpu) | |||
5418 | static inline void set_cpu_sd_state_busy(void) | 5455 | static inline void set_cpu_sd_state_busy(void) |
5419 | { | 5456 | { |
5420 | struct sched_domain *sd; | 5457 | struct sched_domain *sd; |
5421 | int cpu = smp_processor_id(); | ||
5422 | 5458 | ||
5423 | rcu_read_lock(); | 5459 | rcu_read_lock(); |
5424 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); | 5460 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); |
5425 | 5461 | ||
5426 | if (!sd || !sd->nohz_idle) | 5462 | if (!sd || !sd->nohz_idle) |
5427 | goto unlock; | 5463 | goto unlock; |
@@ -5436,10 +5472,9 @@ unlock: | |||
5436 | void set_cpu_sd_state_idle(void) | 5472 | void set_cpu_sd_state_idle(void) |
5437 | { | 5473 | { |
5438 | struct sched_domain *sd; | 5474 | struct sched_domain *sd; |
5439 | int cpu = smp_processor_id(); | ||
5440 | 5475 | ||
5441 | rcu_read_lock(); | 5476 | rcu_read_lock(); |
5442 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); | 5477 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); |
5443 | 5478 | ||
5444 | if (!sd || sd->nohz_idle) | 5479 | if (!sd || sd->nohz_idle) |
5445 | goto unlock; | 5480 | goto unlock; |
@@ -5848,7 +5883,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
5848 | se->vruntime -= cfs_rq->min_vruntime; | 5883 | se->vruntime -= cfs_rq->min_vruntime; |
5849 | } | 5884 | } |
5850 | 5885 | ||
5851 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | 5886 | #ifdef CONFIG_SMP |
5852 | /* | 5887 | /* |
5853 | * Remove our load from contribution when we leave sched_fair | 5888 | * Remove our load from contribution when we leave sched_fair |
5854 | * and ensure we don't carry in an old decay_count if we | 5889 | * and ensure we don't carry in an old decay_count if we |
@@ -5907,9 +5942,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
5907 | #ifndef CONFIG_64BIT | 5942 | #ifndef CONFIG_64BIT |
5908 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 5943 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
5909 | #endif | 5944 | #endif |
5910 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | 5945 | #ifdef CONFIG_SMP |
5911 | atomic64_set(&cfs_rq->decay_counter, 1); | 5946 | atomic64_set(&cfs_rq->decay_counter, 1); |
5912 | atomic64_set(&cfs_rq->removed_load, 0); | 5947 | atomic_long_set(&cfs_rq->removed_load, 0); |
5913 | #endif | 5948 | #endif |
5914 | } | 5949 | } |
5915 | 5950 | ||
@@ -6091,6 +6126,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
6091 | se = tg->se[i]; | 6126 | se = tg->se[i]; |
6092 | /* Propagate contribution to hierarchy */ | 6127 | /* Propagate contribution to hierarchy */ |
6093 | raw_spin_lock_irqsave(&rq->lock, flags); | 6128 | raw_spin_lock_irqsave(&rq->lock, flags); |
6129 | |||
6130 | /* Possible calls to update_curr() need rq clock */ | ||
6131 | update_rq_clock(rq); | ||
6094 | for_each_sched_entity(se) | 6132 | for_each_sched_entity(se) |
6095 | update_cfs_shares(group_cfs_rq(se)); | 6133 | update_cfs_shares(group_cfs_rq(se)); |
6096 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6134 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
@@ -6146,9 +6184,8 @@ const struct sched_class fair_sched_class = { | |||
6146 | 6184 | ||
6147 | #ifdef CONFIG_SMP | 6185 | #ifdef CONFIG_SMP |
6148 | .select_task_rq = select_task_rq_fair, | 6186 | .select_task_rq = select_task_rq_fair, |
6149 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
6150 | .migrate_task_rq = migrate_task_rq_fair, | 6187 | .migrate_task_rq = migrate_task_rq_fair, |
6151 | #endif | 6188 | |
6152 | .rq_online = rq_online_fair, | 6189 | .rq_online = rq_online_fair, |
6153 | .rq_offline = rq_offline_fair, | 6190 | .rq_offline = rq_offline_fair, |
6154 | 6191 | ||
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c new file mode 100644 index 000000000000..16f5a30f9c88 --- /dev/null +++ b/kernel/sched/proc.c | |||
@@ -0,0 +1,591 @@ | |||
1 | /* | ||
2 | * kernel/sched/proc.c | ||
3 | * | ||
4 | * Kernel load calculations, forked from sched/core.c | ||
5 | */ | ||
6 | |||
7 | #include <linux/export.h> | ||
8 | |||
9 | #include "sched.h" | ||
10 | |||
11 | unsigned long this_cpu_load(void) | ||
12 | { | ||
13 | struct rq *this = this_rq(); | ||
14 | return this->cpu_load[0]; | ||
15 | } | ||
16 | |||
17 | |||
18 | /* | ||
19 | * Global load-average calculations | ||
20 | * | ||
21 | * We take a distributed and async approach to calculating the global load-avg | ||
22 | * in order to minimize overhead. | ||
23 | * | ||
24 | * The global load average is an exponentially decaying average of nr_running + | ||
25 | * nr_uninterruptible. | ||
26 | * | ||
27 | * Once every LOAD_FREQ: | ||
28 | * | ||
29 | * nr_active = 0; | ||
30 | * for_each_possible_cpu(cpu) | ||
31 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
32 | * | ||
33 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
34 | * | ||
35 | * Due to a number of reasons the above turns into the mess below: | ||
36 | * | ||
37 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
38 | * serious number of cpus, therefore we need to take a distributed approach | ||
39 | * to calculating nr_active. | ||
40 | * | ||
41 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
42 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
43 | * | ||
44 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
45 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
46 | * to obtain the same result. See calc_load_fold_active(). | ||
47 | * | ||
48 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
49 | * across the machine, we assume 10 ticks is sufficient time for every | ||
50 | * cpu to have completed this task. | ||
51 | * | ||
52 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
53 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
54 | * | ||
55 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
56 | * this would add another cross-cpu cacheline miss and atomic operation | ||
57 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
58 | * when it went into uninterruptible state and decrement on whatever cpu | ||
59 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
60 | * all cpus yields the correct result. | ||
61 | * | ||
62 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | ||
63 | */ | ||
64 | |||
65 | /* Variables and functions for calc_load */ | ||
66 | atomic_long_t calc_load_tasks; | ||
67 | unsigned long calc_load_update; | ||
68 | unsigned long avenrun[3]; | ||
69 | EXPORT_SYMBOL(avenrun); /* should be removed */ | ||
70 | |||
71 | /** | ||
72 | * get_avenrun - get the load average array | ||
73 | * @loads: pointer to dest load array | ||
74 | * @offset: offset to add | ||
75 | * @shift: shift count to shift the result left | ||
76 | * | ||
77 | * These values are estimates at best, so no need for locking. | ||
78 | */ | ||
79 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
80 | { | ||
81 | loads[0] = (avenrun[0] + offset) << shift; | ||
82 | loads[1] = (avenrun[1] + offset) << shift; | ||
83 | loads[2] = (avenrun[2] + offset) << shift; | ||
84 | } | ||
85 | |||
86 | long calc_load_fold_active(struct rq *this_rq) | ||
87 | { | ||
88 | long nr_active, delta = 0; | ||
89 | |||
90 | nr_active = this_rq->nr_running; | ||
91 | nr_active += (long) this_rq->nr_uninterruptible; | ||
92 | |||
93 | if (nr_active != this_rq->calc_load_active) { | ||
94 | delta = nr_active - this_rq->calc_load_active; | ||
95 | this_rq->calc_load_active = nr_active; | ||
96 | } | ||
97 | |||
98 | return delta; | ||
99 | } | ||
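To make the folding identity described in the comment block above concrete, here is a minimal userspace sketch (not kernel code; all names are invented for illustration): each CPU reports only the change in its nr_running + nr_uninterruptible since its last sample, yet the global accumulator always matches the direct per-cpu sum.

#include <stdio.h>

#define NR_CPUS 4

static long toy_calc_load_active[NR_CPUS];      /* per-rq remembered sample */
static long toy_calc_load_tasks;                /* global accumulator       */

static long toy_fold_active(int cpu, long nr_active)
{
        long delta = nr_active - toy_calc_load_active[cpu];

        toy_calc_load_active[cpu] = nr_active;
        return delta;
}

int main(void)
{
        long samples[2][NR_CPUS] = { {1, 0, 3, 2}, {0, 4, 3, 1} };
        int tick, cpu;

        for (tick = 0; tick < 2; tick++) {
                long direct = 0;

                for (cpu = 0; cpu < NR_CPUS; cpu++) {
                        toy_calc_load_tasks += toy_fold_active(cpu, samples[tick][cpu]);
                        direct += samples[tick][cpu];
                }
                /* the folded accumulator always equals the direct sum */
                printf("tick %d: folded %ld, direct %ld\n",
                       tick, toy_calc_load_tasks, direct);
        }
        return 0;
}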
100 | |||
101 | /* | ||
102 | * a1 = a0 * e + a * (1 - e) | ||
103 | */ | ||
104 | static unsigned long | ||
105 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
106 | { | ||
107 | load *= exp; | ||
108 | load += active * (FIXED_1 - exp); | ||
109 | load += 1UL << (FSHIFT - 1); | ||
110 | return load >> FSHIFT; | ||
111 | } | ||
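For a concrete feel of the fixed-point scale used by calc_load() (FSHIFT = 11, so FIXED_1 = 2048), the sketch below shows how a raw avenrun value becomes the familiar two-decimal figure; the LOAD_INT/LOAD_FRAC macros are assumed to mirror the ones used by fs/proc/loadavg.c and are reproduced here purely for illustration.

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
        unsigned long avn = 2 * FIXED_1 + FIXED_1 / 4;  /* 2.25 in fixed point */

        printf("%lu.%02lu\n", LOAD_INT(avn), LOAD_FRAC(avn));  /* prints 2.25 */
        return 0;
}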
112 | |||
113 | #ifdef CONFIG_NO_HZ_COMMON | ||
114 | /* | ||
115 | * Handle NO_HZ for the global load-average. | ||
116 | * | ||
117 | * Since the above described distributed algorithm to compute the global | ||
118 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
119 | * NO_HZ. | ||
120 | * | ||
121 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
122 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
123 | * when we read the global state. | ||
124 | * | ||
125 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
126 | * | ||
127 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
128 | * contribution, causing under-accounting. | ||
129 | * | ||
130 | * We avoid this by keeping two idle-delta counters and flipping them | ||
131 | * when the window starts, thus separating old and new NO_HZ load. | ||
132 | * | ||
133 | * The only trick is the slight shift in index flip for read vs write. | ||
134 | * | ||
135 | * 0s 5s 10s 15s | ||
136 | * +10 +10 +10 +10 | ||
137 | * |-|-----------|-|-----------|-|-----------|-| | ||
138 | * r:0 0 1 1 0 0 1 1 0 | ||
139 | * w:0 1 1 0 0 1 1 0 0 | ||
140 | * | ||
141 | * This ensures we'll fold the old idle contribution in this window while | ||
142 | * accumulating the new one. | ||
143 | * | ||
144 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
145 | * contribution, since we effectively move our sample point to a known | ||
146 | * busy state. | ||
147 | * | ||
148 | * This is solved by pushing the window forward, and thus skipping the | ||
149 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
150 | * was in effect at the time the window opened). This also solves the issue | ||
151 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
152 | * LOAD_FREQ intervals. | ||
153 | * | ||
154 | * When making the ILB scale, we should try to pull this in as well. | ||
155 | */ | ||
156 | static atomic_long_t calc_load_idle[2]; | ||
157 | static int calc_load_idx; | ||
158 | |||
159 | static inline int calc_load_write_idx(void) | ||
160 | { | ||
161 | int idx = calc_load_idx; | ||
162 | |||
163 | /* | ||
164 | * See calc_global_nohz(), if we observe the new index, we also | ||
165 | * need to observe the new update time. | ||
166 | */ | ||
167 | smp_rmb(); | ||
168 | |||
169 | /* | ||
170 | * If the folding window started, make sure we start writing in the | ||
171 | * next idle-delta. | ||
172 | */ | ||
173 | if (!time_before(jiffies, calc_load_update)) | ||
174 | idx++; | ||
175 | |||
176 | return idx & 1; | ||
177 | } | ||
178 | |||
179 | static inline int calc_load_read_idx(void) | ||
180 | { | ||
181 | return calc_load_idx & 1; | ||
182 | } | ||
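A toy userspace model of the index shift pictured in the r:/w: diagram above (all names and the tiny stand-in LOAD_FREQ are made up for the sketch): once a window opens, writers target the next slot while the reader still drains the previous one, until the global fold flips the index.

#include <stdio.h>

static int toy_idx;                     /* stands in for calc_load_idx    */
static unsigned long toy_update = 100;  /* stands in for calc_load_update */

static int toy_write_idx(unsigned long now)
{
        /* writers move to the next slot once the window has opened */
        return (toy_idx + (now >= toy_update)) & 1;
}

static int toy_read_idx(void)
{
        return toy_idx & 1;
}

int main(void)
{
        printf("before window: w=%d r=%d\n", toy_write_idx(95), toy_read_idx());
        printf("window open:   w=%d r=%d\n", toy_write_idx(105), toy_read_idx());

        /* ~10 ticks later the global fold flips the index and pushes the
         * window forward, after which readers and writers agree again */
        toy_idx++;
        toy_update += 500;              /* pretend LOAD_FREQ */
        printf("after fold:    w=%d r=%d\n", toy_write_idx(106), toy_read_idx());
        return 0;
}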
183 | |||
184 | void calc_load_enter_idle(void) | ||
185 | { | ||
186 | struct rq *this_rq = this_rq(); | ||
187 | long delta; | ||
188 | |||
189 | /* | ||
190 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
191 | * into the pending idle delta. | ||
192 | */ | ||
193 | delta = calc_load_fold_active(this_rq); | ||
194 | if (delta) { | ||
195 | int idx = calc_load_write_idx(); | ||
196 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
197 | } | ||
198 | } | ||
199 | |||
200 | void calc_load_exit_idle(void) | ||
201 | { | ||
202 | struct rq *this_rq = this_rq(); | ||
203 | |||
204 | /* | ||
205 | * If we're still before the sample window, we're done. | ||
206 | */ | ||
207 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
208 | return; | ||
209 | |||
210 | /* | ||
211 | * We woke inside or after the sample window, this means we're already | ||
212 | * accounted through the nohz accounting, so skip the entire deal and | ||
213 | * sync up for the next window. | ||
214 | */ | ||
215 | this_rq->calc_load_update = calc_load_update; | ||
216 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
217 | this_rq->calc_load_update += LOAD_FREQ; | ||
218 | } | ||
219 | |||
220 | static long calc_load_fold_idle(void) | ||
221 | { | ||
222 | int idx = calc_load_read_idx(); | ||
223 | long delta = 0; | ||
224 | |||
225 | if (atomic_long_read(&calc_load_idle[idx])) | ||
226 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
227 | |||
228 | return delta; | ||
229 | } | ||
230 | |||
231 | /** | ||
232 | * fixed_power_int - compute: x^n, in O(log n) time | ||
233 | * | ||
234 | * @x: base of the power | ||
235 | * @frac_bits: fractional bits of @x | ||
236 | * @n: power to raise @x to. | ||
237 | * | ||
238 | * By exploiting the relation between the definition of the natural power | ||
239 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
240 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
241 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
242 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
243 | * of course trivially computable in O(log_2 n), the length of our binary | ||
244 | * vector. | ||
245 | */ | ||
246 | static unsigned long | ||
247 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
248 | { | ||
249 | unsigned long result = 1UL << frac_bits; | ||
250 | |||
251 | if (n) for (;;) { | ||
252 | if (n & 1) { | ||
253 | result *= x; | ||
254 | result += 1UL << (frac_bits - 1); | ||
255 | result >>= frac_bits; | ||
256 | } | ||
257 | n >>= 1; | ||
258 | if (!n) | ||
259 | break; | ||
260 | x *= x; | ||
261 | x += 1UL << (frac_bits - 1); | ||
262 | x >>= frac_bits; | ||
263 | } | ||
264 | |||
265 | return result; | ||
266 | } | ||
267 | |||
268 | /* | ||
269 | * a1 = a0 * e + a * (1 - e) | ||
270 | * | ||
271 | * a2 = a1 * e + a * (1 - e) | ||
272 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
273 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
274 | * | ||
275 | * a3 = a2 * e + a * (1 - e) | ||
276 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
277 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
278 | * | ||
279 | * ... | ||
280 | * | ||
281 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
282 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
283 | * = a0 * e^n + a * (1 - e^n) | ||
284 | * | ||
285 | * [1] application of the geometric series: | ||
286 | * | ||
287 | * n 1 - x^(n+1) | ||
288 | * S_n := \Sum x^i = ------------- | ||
289 | * i=0 1 - x | ||
290 | */ | ||
291 | static unsigned long | ||
292 | calc_load_n(unsigned long load, unsigned long exp, | ||
293 | unsigned long active, unsigned int n) | ||
294 | { | ||
295 | |||
296 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
297 | } | ||
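As a sanity check of the closed form, the standalone sketch below (userspace only; FSHIFT, FIXED_1 and EXP_1 are assumed to mirror include/linux/sched.h of this era) applies calc_load() once per missed window and compares that with a single calc_load_n() call; the two agree up to fixed-point rounding.

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)
#define EXP_1    1884            /* 1/exp(5sec/1min), as in <linux/sched.h> */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);
        return load >> FSHIFT;
}

static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
                                     unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        if (n) for (;;) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

static unsigned long calc_load_n(unsigned long load, unsigned long exp,
                                 unsigned long active, unsigned int n)
{
        return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
        unsigned long iter = 3 * FIXED_1;   /* 1-minute average of 3.00 ...    */
        unsigned long active = 0;           /* ... on a machine that went idle */
        unsigned int i, n = 7;              /* seven missed LOAD_FREQ windows  */
        unsigned long once = calc_load_n(3 * FIXED_1, EXP_1, active, n);

        for (i = 0; i < n; i++)
                iter = calc_load(iter, EXP_1, active);

        /* the closed form and the per-window iteration agree up to rounding */
        printf("iterated: %lu  closed form: %lu\n", iter, once);
        return 0;
}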
298 | |||
299 | /* | ||
300 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
301 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
302 | * calc_load_idle[] via calc_load_enter_idle(), all we need to do is fold | ||
303 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
304 | * | ||
305 | * Once we've updated the global active value, we need to apply the exponential | ||
306 | * weights adjusted to the number of cycles missed. | ||
307 | */ | ||
308 | static void calc_global_nohz(void) | ||
309 | { | ||
310 | long delta, active, n; | ||
311 | |||
312 | if (!time_before(jiffies, calc_load_update + 10)) { | ||
313 | /* | ||
314 | * Catch-up, fold however many we are behind still | ||
315 | */ | ||
316 | delta = jiffies - calc_load_update - 10; | ||
317 | n = 1 + (delta / LOAD_FREQ); | ||
318 | |||
319 | active = atomic_long_read(&calc_load_tasks); | ||
320 | active = active > 0 ? active * FIXED_1 : 0; | ||
321 | |||
322 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
323 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
324 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
325 | |||
326 | calc_load_update += n * LOAD_FREQ; | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * Flip the idle index... | ||
331 | * | ||
332 | * Make sure we first write the new time then flip the index, so that | ||
333 | * calc_load_write_idx() will see the new time when it reads the new | ||
334 | * index, this avoids a double flip messing things up. | ||
335 | */ | ||
336 | smp_wmb(); | ||
337 | calc_load_idx++; | ||
338 | } | ||
339 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
340 | |||
341 | static inline long calc_load_fold_idle(void) { return 0; } | ||
342 | static inline void calc_global_nohz(void) { } | ||
343 | |||
344 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
345 | |||
346 | /* | ||
347 | * calc_global_load - update the avenrun load estimates 10 ticks after the | ||
348 | * CPUs have updated calc_load_tasks. | ||
349 | */ | ||
350 | void calc_global_load(unsigned long ticks) | ||
351 | { | ||
352 | long active, delta; | ||
353 | |||
354 | if (time_before(jiffies, calc_load_update + 10)) | ||
355 | return; | ||
356 | |||
357 | /* | ||
358 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
359 | */ | ||
360 | delta = calc_load_fold_idle(); | ||
361 | if (delta) | ||
362 | atomic_long_add(delta, &calc_load_tasks); | ||
363 | |||
364 | active = atomic_long_read(&calc_load_tasks); | ||
365 | active = active > 0 ? active * FIXED_1 : 0; | ||
366 | |||
367 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); | ||
368 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); | ||
369 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | ||
370 | |||
371 | calc_load_update += LOAD_FREQ; | ||
372 | |||
373 | /* | ||
374 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. | ||
375 | */ | ||
376 | calc_global_nohz(); | ||
377 | } | ||
378 | |||
379 | /* | ||
380 | * Called from update_cpu_load() to periodically update this CPU's | ||
381 | * active count. | ||
382 | */ | ||
383 | static void calc_load_account_active(struct rq *this_rq) | ||
384 | { | ||
385 | long delta; | ||
386 | |||
387 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
388 | return; | ||
389 | |||
390 | delta = calc_load_fold_active(this_rq); | ||
391 | if (delta) | ||
392 | atomic_long_add(delta, &calc_load_tasks); | ||
393 | |||
394 | this_rq->calc_load_update += LOAD_FREQ; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * End of global load-average stuff | ||
399 | */ | ||
400 | |||
401 | /* | ||
402 | * The exact cpuload at various idx values, calculated at every tick would be | ||
403 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
404 | * | ||
405 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
406 | * on nth tick when cpu may be busy, then we have: | ||
407 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
408 | * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
409 | * | ||
410 | * decay_load_missed() below does efficient calculation of | ||
411 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
412 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
413 | * | ||
414 | * The calculation is approximated on a 128 point scale. | ||
415 | * degrade_zero_ticks is the number of ticks after which load at any | ||
416 | * particular idx is approximated to be zero. | ||
417 | * degrade_factor is a precomputed table, a row for each load idx. | ||
418 | * Each column corresponds to degradation factor for a power of two ticks, | ||
419 | * based on 128 point scale. | ||
420 | * Example: | ||
421 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
422 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
423 | * | ||
424 | * With this power of 2 load factors, we can degrade the load n times | ||
425 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
426 | * n mult/shifts needed by the exact degradation. | ||
427 | */ | ||
428 | #define DEGRADE_SHIFT 7 | ||
429 | static const unsigned char | ||
430 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
431 | static const unsigned char | ||
432 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
433 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
434 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
435 | {96, 72, 40, 12, 1, 0, 0}, | ||
436 | {112, 98, 75, 43, 15, 1, 0}, | ||
437 | {120, 112, 98, 76, 45, 16, 2} }; | ||
438 | |||
439 | /* | ||
440 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
441 | * would be when CPU is idle and so we just decay the old load without | ||
442 | * adding any new load. | ||
443 | */ | ||
444 | static unsigned long | ||
445 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
446 | { | ||
447 | int j = 0; | ||
448 | |||
449 | if (!missed_updates) | ||
450 | return load; | ||
451 | |||
452 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
453 | return 0; | ||
454 | |||
455 | if (idx == 1) | ||
456 | return load >> missed_updates; | ||
457 | |||
458 | while (missed_updates) { | ||
459 | if (missed_updates % 2) | ||
460 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
461 | |||
462 | missed_updates >>= 1; | ||
463 | j++; | ||
464 | } | ||
465 | return load; | ||
466 | } | ||
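To see the approximation the "row 2, col 3" comment above describes, this userspace sketch copies the table and decay_load_missed() verbatim and compares the table-driven decay with the exact ((2^idx - 1)/2^idx)^missed factor for that example (idx 2, 8 ticks): 12/128 of 1024 gives 96, against an exact value of about 102.5. Link with -lm.

#include <math.h>
#include <stdio.h>

#define DEGRADE_SHIFT    7
#define CPU_LOAD_IDX_MAX 5

static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
        {0, 8, 32, 64, 128};
static const unsigned char
        degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
        {0, 0, 0, 0, 0, 0, 0, 0},
        {64, 32, 8, 0, 0, 0, 0, 0},
        {96, 72, 40, 12, 1, 0, 0},
        {112, 98, 75, 43, 15, 1, 0},
        {120, 112, 98, 76, 45, 16, 2} };

static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
        int j = 0;

        if (!missed_updates)
                return load;
        if (missed_updates >= degrade_zero_ticks[idx])
                return 0;
        if (idx == 1)
                return load >> missed_updates;

        while (missed_updates) {
                if (missed_updates % 2)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                missed_updates >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        unsigned long load = 1024;
        int idx = 2, missed = 8;

        /* table: one multiply + shift; exact: ((2^idx - 1)/2^idx)^missed */
        unsigned long approx = decay_load_missed(load, missed, idx);
        double exact = load * pow(3.0 / 4.0, missed);

        printf("approx %lu vs exact %.1f\n", approx, exact);  /* 96 vs ~102.5 */
        return 0;
}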
467 | |||
468 | /* | ||
469 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
470 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | ||
471 | * every tick. We fix it up based on jiffies. | ||
472 | */ | ||
473 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | ||
474 | unsigned long pending_updates) | ||
475 | { | ||
476 | int i, scale; | ||
477 | |||
478 | this_rq->nr_load_updates++; | ||
479 | |||
480 | /* Update our load: */ | ||
481 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
482 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
483 | unsigned long old_load, new_load; | ||
484 | |||
485 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
486 | |||
487 | old_load = this_rq->cpu_load[i]; | ||
488 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
489 | new_load = this_load; | ||
490 | /* | ||
491 | * Round up the averaging division if load is increasing. This | ||
492 | * prevents us from getting stuck on 9 if the load is 10, for | ||
493 | * example. | ||
494 | */ | ||
495 | if (new_load > old_load) | ||
496 | new_load += scale - 1; | ||
497 | |||
498 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
499 | } | ||
500 | |||
501 | sched_avg_update(this_rq); | ||
502 | } | ||
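The round-up comment above is easy to see with idx 1 (scale 2): without adding scale - 1 an increasing load gets pinned one below its target, which is exactly the 9-vs-10 case mentioned. A minimal standalone sketch:

#include <stdio.h>

int main(void)
{
        unsigned long old_load = 9, this_load = 10, i = 1, scale = 2;
        unsigned long plain, rounded;

        /* without the round-up the idx-1 average never reaches 10 ...     */
        plain   = (old_load * (scale - 1) + this_load) >> i;                /* 9  */
        /* ... with it, an increasing load is rounded up and can converge  */
        rounded = (old_load * (scale - 1) + this_load + scale - 1) >> i;    /* 10 */

        printf("plain %lu, rounded %lu\n", plain, rounded);
        return 0;
}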
503 | |||
504 | #ifdef CONFIG_SMP | ||
505 | static inline unsigned long get_rq_runnable_load(struct rq *rq) | ||
506 | { | ||
507 | return rq->cfs.runnable_load_avg; | ||
508 | } | ||
509 | #else | ||
510 | static inline unsigned long get_rq_runnable_load(struct rq *rq) | ||
511 | { | ||
512 | return rq->load.weight; | ||
513 | } | ||
514 | #endif | ||
515 | |||
516 | #ifdef CONFIG_NO_HZ_COMMON | ||
517 | /* | ||
518 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
519 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
520 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
521 | * | ||
522 | * Therefore we cannot use the delta approach from the regular tick since that | ||
523 | * would seriously skew the load calculation. However we'll make do for those | ||
524 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
525 | * (tick_nohz_idle_exit). | ||
526 | * | ||
527 | * This means we might still be one tick off for nohz periods. | ||
528 | */ | ||
529 | |||
530 | /* | ||
531 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
532 | * idle balance. | ||
533 | */ | ||
534 | void update_idle_cpu_load(struct rq *this_rq) | ||
535 | { | ||
536 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
537 | unsigned long load = get_rq_runnable_load(this_rq); | ||
538 | unsigned long pending_updates; | ||
539 | |||
540 | /* | ||
541 | * bail if there's load or we're actually up-to-date. | ||
542 | */ | ||
543 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
544 | return; | ||
545 | |||
546 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
547 | this_rq->last_load_update_tick = curr_jiffies; | ||
548 | |||
549 | __update_cpu_load(this_rq, load, pending_updates); | ||
550 | } | ||
551 | |||
552 | /* | ||
553 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
554 | */ | ||
555 | void update_cpu_load_nohz(void) | ||
556 | { | ||
557 | struct rq *this_rq = this_rq(); | ||
558 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
559 | unsigned long pending_updates; | ||
560 | |||
561 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
562 | return; | ||
563 | |||
564 | raw_spin_lock(&this_rq->lock); | ||
565 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
566 | if (pending_updates) { | ||
567 | this_rq->last_load_update_tick = curr_jiffies; | ||
568 | /* | ||
569 | * We were idle, this means load 0, the current load might be | ||
570 | * !0 due to remote wakeups and the sort. | ||
571 | */ | ||
572 | __update_cpu_load(this_rq, 0, pending_updates); | ||
573 | } | ||
574 | raw_spin_unlock(&this_rq->lock); | ||
575 | } | ||
576 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
577 | |||
578 | /* | ||
579 | * Called from scheduler_tick() | ||
580 | */ | ||
581 | void update_cpu_load_active(struct rq *this_rq) | ||
582 | { | ||
583 | unsigned long load = get_rq_runnable_load(this_rq); | ||
584 | /* | ||
585 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
586 | */ | ||
587 | this_rq->last_load_update_tick = jiffies; | ||
588 | __update_cpu_load(this_rq, load, 1); | ||
589 | |||
590 | calc_load_account_active(this_rq); | ||
591 | } | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 127a2c4cf4ab..01970c8e64df 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg) | |||
399 | (iter = next_task_group(iter)) && \ | 399 | (iter = next_task_group(iter)) && \ |
400 | (rt_rq = iter->rt_rq[cpu_of(rq)]);) | 400 | (rt_rq = iter->rt_rq[cpu_of(rq)]);) |
401 | 401 | ||
402 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
403 | { | ||
404 | list_add_rcu(&rt_rq->leaf_rt_rq_list, | ||
405 | &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); | ||
406 | } | ||
407 | |||
408 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
409 | { | ||
410 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
411 | } | ||
412 | |||
413 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
414 | list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | ||
415 | |||
416 | #define for_each_sched_rt_entity(rt_se) \ | 402 | #define for_each_sched_rt_entity(rt_se) \ |
417 | for (; rt_se; rt_se = rt_se->parent) | 403 | for (; rt_se; rt_se = rt_se->parent) |
418 | 404 | ||
@@ -472,7 +458,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) | |||
472 | #ifdef CONFIG_SMP | 458 | #ifdef CONFIG_SMP |
473 | static inline const struct cpumask *sched_rt_period_mask(void) | 459 | static inline const struct cpumask *sched_rt_period_mask(void) |
474 | { | 460 | { |
475 | return cpu_rq(smp_processor_id())->rd->span; | 461 | return this_rq()->rd->span; |
476 | } | 462 | } |
477 | #else | 463 | #else |
478 | static inline const struct cpumask *sched_rt_period_mask(void) | 464 | static inline const struct cpumask *sched_rt_period_mask(void) |
@@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t; | |||
509 | #define for_each_rt_rq(rt_rq, iter, rq) \ | 495 | #define for_each_rt_rq(rt_rq, iter, rq) \ |
510 | for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | 496 | for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) |
511 | 497 | ||
512 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
513 | { | ||
514 | } | ||
515 | |||
516 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
517 | { | ||
518 | } | ||
519 | |||
520 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
521 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
522 | |||
523 | #define for_each_sched_rt_entity(rt_se) \ | 498 | #define for_each_sched_rt_entity(rt_se) \ |
524 | for (; rt_se; rt_se = NULL) | 499 | for (; rt_se; rt_se = NULL) |
525 | 500 | ||
@@ -699,15 +674,6 @@ balanced: | |||
699 | } | 674 | } |
700 | } | 675 | } |
701 | 676 | ||
702 | static void disable_runtime(struct rq *rq) | ||
703 | { | ||
704 | unsigned long flags; | ||
705 | |||
706 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
707 | __disable_runtime(rq); | ||
708 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
709 | } | ||
710 | |||
711 | static void __enable_runtime(struct rq *rq) | 677 | static void __enable_runtime(struct rq *rq) |
712 | { | 678 | { |
713 | rt_rq_iter_t iter; | 679 | rt_rq_iter_t iter; |
@@ -732,37 +698,6 @@ static void __enable_runtime(struct rq *rq) | |||
732 | } | 698 | } |
733 | } | 699 | } |
734 | 700 | ||
735 | static void enable_runtime(struct rq *rq) | ||
736 | { | ||
737 | unsigned long flags; | ||
738 | |||
739 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
740 | __enable_runtime(rq); | ||
741 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
742 | } | ||
743 | |||
744 | int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
745 | { | ||
746 | int cpu = (int)(long)hcpu; | ||
747 | |||
748 | switch (action) { | ||
749 | case CPU_DOWN_PREPARE: | ||
750 | case CPU_DOWN_PREPARE_FROZEN: | ||
751 | disable_runtime(cpu_rq(cpu)); | ||
752 | return NOTIFY_OK; | ||
753 | |||
754 | case CPU_DOWN_FAILED: | ||
755 | case CPU_DOWN_FAILED_FROZEN: | ||
756 | case CPU_ONLINE: | ||
757 | case CPU_ONLINE_FROZEN: | ||
758 | enable_runtime(cpu_rq(cpu)); | ||
759 | return NOTIFY_OK; | ||
760 | |||
761 | default: | ||
762 | return NOTIFY_DONE; | ||
763 | } | ||
764 | } | ||
765 | |||
766 | static int balance_runtime(struct rt_rq *rt_rq) | 701 | static int balance_runtime(struct rt_rq *rt_rq) |
767 | { | 702 | { |
768 | int more = 0; | 703 | int more = 0; |
@@ -926,7 +861,7 @@ static void update_curr_rt(struct rq *rq) | |||
926 | if (curr->sched_class != &rt_sched_class) | 861 | if (curr->sched_class != &rt_sched_class) |
927 | return; | 862 | return; |
928 | 863 | ||
929 | delta_exec = rq->clock_task - curr->se.exec_start; | 864 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; |
930 | if (unlikely((s64)delta_exec <= 0)) | 865 | if (unlikely((s64)delta_exec <= 0)) |
931 | return; | 866 | return; |
932 | 867 | ||
@@ -936,7 +871,7 @@ static void update_curr_rt(struct rq *rq) | |||
936 | curr->se.sum_exec_runtime += delta_exec; | 871 | curr->se.sum_exec_runtime += delta_exec; |
937 | account_group_exec_runtime(curr, delta_exec); | 872 | account_group_exec_runtime(curr, delta_exec); |
938 | 873 | ||
939 | curr->se.exec_start = rq->clock_task; | 874 | curr->se.exec_start = rq_clock_task(rq); |
940 | cpuacct_charge(curr, delta_exec); | 875 | cpuacct_charge(curr, delta_exec); |
941 | 876 | ||
942 | sched_rt_avg_update(rq, delta_exec); | 877 | sched_rt_avg_update(rq, delta_exec); |
@@ -1106,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | |||
1106 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 1041 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
1107 | return; | 1042 | return; |
1108 | 1043 | ||
1109 | if (!rt_rq->rt_nr_running) | ||
1110 | list_add_leaf_rt_rq(rt_rq); | ||
1111 | |||
1112 | if (head) | 1044 | if (head) |
1113 | list_add(&rt_se->run_list, queue); | 1045 | list_add(&rt_se->run_list, queue); |
1114 | else | 1046 | else |
@@ -1128,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
1128 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | 1060 | __clear_bit(rt_se_prio(rt_se), array->bitmap); |
1129 | 1061 | ||
1130 | dec_rt_tasks(rt_se, rt_rq); | 1062 | dec_rt_tasks(rt_se, rt_rq); |
1131 | if (!rt_rq->rt_nr_running) | ||
1132 | list_del_leaf_rt_rq(rt_rq); | ||
1133 | } | 1063 | } |
1134 | 1064 | ||
1135 | /* | 1065 | /* |
@@ -1385,7 +1315,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
1385 | } while (rt_rq); | 1315 | } while (rt_rq); |
1386 | 1316 | ||
1387 | p = rt_task_of(rt_se); | 1317 | p = rt_task_of(rt_se); |
1388 | p->se.exec_start = rq->clock_task; | 1318 | p->se.exec_start = rq_clock_task(rq); |
1389 | 1319 | ||
1390 | return p; | 1320 | return p; |
1391 | } | 1321 | } |
@@ -1434,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1434 | return 0; | 1364 | return 0; |
1435 | } | 1365 | } |
1436 | 1366 | ||
1437 | /* Return the second highest RT task, NULL otherwise */ | 1367 | /* |
1438 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | 1368 | * Return the highest pushable rq's task, which is suitable to be executed |
1369 | * on the cpu, NULL otherwise | ||
1370 | */ | ||
1371 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) | ||
1439 | { | 1372 | { |
1440 | struct task_struct *next = NULL; | 1373 | struct plist_head *head = &rq->rt.pushable_tasks; |
1441 | struct sched_rt_entity *rt_se; | 1374 | struct task_struct *p; |
1442 | struct rt_prio_array *array; | ||
1443 | struct rt_rq *rt_rq; | ||
1444 | int idx; | ||
1445 | |||
1446 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
1447 | array = &rt_rq->active; | ||
1448 | idx = sched_find_first_bit(array->bitmap); | ||
1449 | next_idx: | ||
1450 | if (idx >= MAX_RT_PRIO) | ||
1451 | continue; | ||
1452 | if (next && next->prio <= idx) | ||
1453 | continue; | ||
1454 | list_for_each_entry(rt_se, array->queue + idx, run_list) { | ||
1455 | struct task_struct *p; | ||
1456 | 1375 | ||
1457 | if (!rt_entity_is_task(rt_se)) | 1376 | if (!has_pushable_tasks(rq)) |
1458 | continue; | 1377 | return NULL; |
1459 | 1378 | ||
1460 | p = rt_task_of(rt_se); | 1379 | plist_for_each_entry(p, head, pushable_tasks) { |
1461 | if (pick_rt_task(rq, p, cpu)) { | 1380 | if (pick_rt_task(rq, p, cpu)) |
1462 | next = p; | 1381 | return p; |
1463 | break; | ||
1464 | } | ||
1465 | } | ||
1466 | if (!next) { | ||
1467 | idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | ||
1468 | goto next_idx; | ||
1469 | } | ||
1470 | } | 1382 | } |
1471 | 1383 | ||
1472 | return next; | 1384 | return NULL; |
1473 | } | 1385 | } |
1474 | 1386 | ||
1475 | static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); | 1387 | static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); |
@@ -1743,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq) | |||
1743 | double_lock_balance(this_rq, src_rq); | 1655 | double_lock_balance(this_rq, src_rq); |
1744 | 1656 | ||
1745 | /* | 1657 | /* |
1746 | * Are there still pullable RT tasks? | 1658 | * We can pull only a task, which is pushable |
1659 | * on its rq, and no others. | ||
1747 | */ | 1660 | */ |
1748 | if (src_rq->rt.rt_nr_running <= 1) | 1661 | p = pick_highest_pushable_task(src_rq, this_cpu); |
1749 | goto skip; | ||
1750 | |||
1751 | p = pick_next_highest_task_rt(src_rq, this_cpu); | ||
1752 | 1662 | ||
1753 | /* | 1663 | /* |
1754 | * Do we have an RT task that preempts | 1664 | * Do we have an RT task that preempts |
@@ -2037,7 +1947,7 @@ static void set_curr_task_rt(struct rq *rq) | |||
2037 | { | 1947 | { |
2038 | struct task_struct *p = rq->curr; | 1948 | struct task_struct *p = rq->curr; |
2039 | 1949 | ||
2040 | p->se.exec_start = rq->clock_task; | 1950 | p->se.exec_start = rq_clock_task(rq); |
2041 | 1951 | ||
2042 | /* The running task is never eligible for pushing */ | 1952 | /* The running task is never eligible for pushing */ |
2043 | dequeue_pushable_task(rq, p); | 1953 | dequeue_pushable_task(rq, p); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce39224d6155..ef0a7b2439dd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -10,8 +10,16 @@ | |||
10 | #include "cpupri.h" | 10 | #include "cpupri.h" |
11 | #include "cpuacct.h" | 11 | #include "cpuacct.h" |
12 | 12 | ||
13 | struct rq; | ||
14 | |||
13 | extern __read_mostly int scheduler_running; | 15 | extern __read_mostly int scheduler_running; |
14 | 16 | ||
17 | extern unsigned long calc_load_update; | ||
18 | extern atomic_long_t calc_load_tasks; | ||
19 | |||
20 | extern long calc_load_fold_active(struct rq *this_rq); | ||
21 | extern void update_cpu_load_active(struct rq *this_rq); | ||
22 | |||
15 | /* | 23 | /* |
16 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 24 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
17 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 25 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
@@ -140,10 +148,11 @@ struct task_group { | |||
140 | struct cfs_rq **cfs_rq; | 148 | struct cfs_rq **cfs_rq; |
141 | unsigned long shares; | 149 | unsigned long shares; |
142 | 150 | ||
143 | atomic_t load_weight; | 151 | #ifdef CONFIG_SMP |
144 | atomic64_t load_avg; | 152 | atomic_long_t load_avg; |
145 | atomic_t runnable_avg; | 153 | atomic_t runnable_avg; |
146 | #endif | 154 | #endif |
155 | #endif | ||
147 | 156 | ||
148 | #ifdef CONFIG_RT_GROUP_SCHED | 157 | #ifdef CONFIG_RT_GROUP_SCHED |
149 | struct sched_rt_entity **rt_se; | 158 | struct sched_rt_entity **rt_se; |
@@ -261,26 +270,21 @@ struct cfs_rq { | |||
261 | #endif | 270 | #endif |
262 | 271 | ||
263 | #ifdef CONFIG_SMP | 272 | #ifdef CONFIG_SMP |
264 | /* | ||
265 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
266 | * removed when useful for applications beyond shares distribution (e.g. | ||
267 | * load-balance). | ||
268 | */ | ||
269 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
270 | /* | 273 | /* |
271 | * CFS Load tracking | 274 | * CFS Load tracking |
272 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 275 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
273 | * This allows for the description of both thread and group usage (in | 276 | * This allows for the description of both thread and group usage (in |
274 | * the FAIR_GROUP_SCHED case). | 277 | * the FAIR_GROUP_SCHED case). |
275 | */ | 278 | */ |
276 | u64 runnable_load_avg, blocked_load_avg; | 279 | unsigned long runnable_load_avg, blocked_load_avg; |
277 | atomic64_t decay_counter, removed_load; | 280 | atomic64_t decay_counter; |
278 | u64 last_decay; | 281 | u64 last_decay; |
279 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 282 | atomic_long_t removed_load; |
280 | /* These always depend on CONFIG_FAIR_GROUP_SCHED */ | 283 | |
281 | #ifdef CONFIG_FAIR_GROUP_SCHED | 284 | #ifdef CONFIG_FAIR_GROUP_SCHED |
285 | /* Required to track per-cpu representation of a task_group */ | ||
282 | u32 tg_runnable_contrib; | 286 | u32 tg_runnable_contrib; |
283 | u64 tg_load_contrib; | 287 | unsigned long tg_load_contrib; |
284 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 288 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
285 | 289 | ||
286 | /* | 290 | /* |
@@ -353,7 +357,6 @@ struct rt_rq { | |||
353 | unsigned long rt_nr_boosted; | 357 | unsigned long rt_nr_boosted; |
354 | 358 | ||
355 | struct rq *rq; | 359 | struct rq *rq; |
356 | struct list_head leaf_rt_rq_list; | ||
357 | struct task_group *tg; | 360 | struct task_group *tg; |
358 | #endif | 361 | #endif |
359 | }; | 362 | }; |
@@ -540,6 +543,16 @@ DECLARE_PER_CPU(struct rq, runqueues); | |||
540 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 543 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
541 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | 544 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) |
542 | 545 | ||
546 | static inline u64 rq_clock(struct rq *rq) | ||
547 | { | ||
548 | return rq->clock; | ||
549 | } | ||
550 | |||
551 | static inline u64 rq_clock_task(struct rq *rq) | ||
552 | { | ||
553 | return rq->clock_task; | ||
554 | } | ||
555 | |||
543 | #ifdef CONFIG_SMP | 556 | #ifdef CONFIG_SMP |
544 | 557 | ||
545 | #define rcu_dereference_check_sched_domain(p) \ | 558 | #define rcu_dereference_check_sched_domain(p) \ |
@@ -884,24 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
884 | #define WF_FORK 0x02 /* child wakeup after fork */ | 897 | #define WF_FORK 0x02 /* child wakeup after fork */ |
885 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ | 898 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ |
886 | 899 | ||
887 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
888 | { | ||
889 | lw->weight += inc; | ||
890 | lw->inv_weight = 0; | ||
891 | } | ||
892 | |||
893 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
894 | { | ||
895 | lw->weight -= dec; | ||
896 | lw->inv_weight = 0; | ||
897 | } | ||
898 | |||
899 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
900 | { | ||
901 | lw->weight = w; | ||
902 | lw->inv_weight = 0; | ||
903 | } | ||
904 | |||
905 | /* | 900 | /* |
906 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 901 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
907 | * of tasks with abnormal "nice" values across CPUs the contribution that | 902 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -1028,17 +1023,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu); | |||
1028 | extern void trigger_load_balance(struct rq *rq, int cpu); | 1023 | extern void trigger_load_balance(struct rq *rq, int cpu); |
1029 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1024 | extern void idle_balance(int this_cpu, struct rq *this_rq); |
1030 | 1025 | ||
1031 | /* | ||
1032 | * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg | ||
1033 | * becomes useful in lb | ||
1034 | */ | ||
1035 | #if defined(CONFIG_FAIR_GROUP_SCHED) | ||
1036 | extern void idle_enter_fair(struct rq *this_rq); | 1026 | extern void idle_enter_fair(struct rq *this_rq); |
1037 | extern void idle_exit_fair(struct rq *this_rq); | 1027 | extern void idle_exit_fair(struct rq *this_rq); |
1038 | #else | ||
1039 | static inline void idle_enter_fair(struct rq *this_rq) {} | ||
1040 | static inline void idle_exit_fair(struct rq *this_rq) {} | ||
1041 | #endif | ||
1042 | 1028 | ||
1043 | #else /* CONFIG_SMP */ | 1029 | #else /* CONFIG_SMP */ |
1044 | 1030 | ||
@@ -1051,7 +1037,6 @@ static inline void idle_balance(int cpu, struct rq *rq) | |||
1051 | extern void sysrq_sched_debug_show(void); | 1037 | extern void sysrq_sched_debug_show(void); |
1052 | extern void sched_init_granularity(void); | 1038 | extern void sched_init_granularity(void); |
1053 | extern void update_max_interval(void); | 1039 | extern void update_max_interval(void); |
1054 | extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); | ||
1055 | extern void init_sched_rt_class(void); | 1040 | extern void init_sched_rt_class(void); |
1056 | extern void init_sched_fair_class(void); | 1041 | extern void init_sched_fair_class(void); |
1057 | 1042 | ||
@@ -1063,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime | |||
1063 | 1048 | ||
1064 | extern void update_idle_cpu_load(struct rq *this_rq); | 1049 | extern void update_idle_cpu_load(struct rq *this_rq); |
1065 | 1050 | ||
1051 | extern void init_task_runnable_average(struct task_struct *p); | ||
1052 | |||
1066 | #ifdef CONFIG_PARAVIRT | 1053 | #ifdef CONFIG_PARAVIRT |
1067 | static inline u64 steal_ticks(u64 steal) | 1054 | static inline u64 steal_ticks(u64 steal) |
1068 | { | 1055 | { |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 2ef90a51ec5e..5aef494fc8b4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
61 | */ | 61 | */ |
62 | static inline void sched_info_dequeued(struct task_struct *t) | 62 | static inline void sched_info_dequeued(struct task_struct *t) |
63 | { | 63 | { |
64 | unsigned long long now = task_rq(t)->clock, delta = 0; | 64 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; |
65 | 65 | ||
66 | if (unlikely(sched_info_on())) | 66 | if (unlikely(sched_info_on())) |
67 | if (t->sched_info.last_queued) | 67 | if (t->sched_info.last_queued) |
@@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
79 | */ | 79 | */ |
80 | static void sched_info_arrive(struct task_struct *t) | 80 | static void sched_info_arrive(struct task_struct *t) |
81 | { | 81 | { |
82 | unsigned long long now = task_rq(t)->clock, delta = 0; | 82 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; |
83 | 83 | ||
84 | if (t->sched_info.last_queued) | 84 | if (t->sched_info.last_queued) |
85 | delta = now - t->sched_info.last_queued; | 85 | delta = now - t->sched_info.last_queued; |
@@ -100,7 +100,7 @@ static inline void sched_info_queued(struct task_struct *t) | |||
100 | { | 100 | { |
101 | if (unlikely(sched_info_on())) | 101 | if (unlikely(sched_info_on())) |
102 | if (!t->sched_info.last_queued) | 102 | if (!t->sched_info.last_queued) |
103 | t->sched_info.last_queued = task_rq(t)->clock; | 103 | t->sched_info.last_queued = rq_clock(task_rq(t)); |
104 | } | 104 | } |
105 | 105 | ||
106 | /* | 106 | /* |
@@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t) | |||
112 | */ | 112 | */ |
113 | static inline void sched_info_depart(struct task_struct *t) | 113 | static inline void sched_info_depart(struct task_struct *t) |
114 | { | 114 | { |
115 | unsigned long long delta = task_rq(t)->clock - | 115 | unsigned long long delta = rq_clock(task_rq(t)) - |
116 | t->sched_info.last_arrival; | 116 | t->sched_info.last_arrival; |
117 | 117 | ||
118 | rq_sched_info_depart(task_rq(t), delta); | 118 | rq_sched_info_depart(task_rq(t), delta); |
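The four sched_info_* helpers converted above implement a simple lifecycle: last_queued is stamped when a task becomes runnable, sched_info_arrive() turns that into accumulated run delay and stamps last_arrival, and sched_info_depart() measures the CPU time just consumed. A condensed, self-contained sketch of that flow (field names follow struct sched_info; the sched_info_on() checks and per-runqueue counters are omitted):

#include <stdio.h>

struct sched_info_sketch {
	unsigned long long last_queued;	 /* when the task hit a runqueue */
	unsigned long long last_arrival; /* when it last got the CPU */
	unsigned long long run_delay;	 /* total time spent waiting to run */
};

/* sched_info_queued(): remember when the task became runnable */
static void queued(struct sched_info_sketch *si, unsigned long long now)
{
	if (!si->last_queued)
		si->last_queued = now;
}

/* sched_info_arrive(): waiting ends, fold the delay into run_delay */
static void arrive(struct sched_info_sketch *si, unsigned long long now)
{
	if (si->last_queued)
		si->run_delay += now - si->last_queued;
	si->last_queued = 0;
	si->last_arrival = now;
}

/* sched_info_depart(): now - last_arrival is the CPU time just used */
static unsigned long long depart(struct sched_info_sketch *si,
				 unsigned long long now)
{
	return now - si->last_arrival;
}

int main(void)
{
	struct sched_info_sketch si = { 0 };

	queued(&si, 100);		/* woken up at t=100 */
	arrive(&si, 250);		/* got the CPU: waited 150 */
	printf("run_delay=%llu cpu_time=%llu\n",
	       si.run_delay, depart(&si, 900));	/* 150 and 650 */
	return 0;
}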
@@ -162,6 +162,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
162 | */ | 162 | */ |
163 | 163 | ||
164 | /** | 164 | /** |
165 | * cputimer_running - return true if cputimer is running | ||
166 | * | ||
167 | * @tsk: Pointer to target task. | ||
168 | */ | ||
169 | static inline bool cputimer_running(struct task_struct *tsk) | ||
170 | |||
171 | { | ||
172 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | ||
173 | |||
174 | if (!cputimer->running) | ||
175 | return false; | ||
176 | |||
177 | /* | ||
178 | * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime | ||
179 | * in __exit_signal(), we won't account any further cputime consumed | ||
180 | * by that task to the signal struct, even though the task can still be | ||
181 | * ticking after __exit_signal(). | ||
182 | * | ||
183 | * In order to keep a consistent behaviour between thread group cputime | ||
184 | * and thread group cputimer accounting, let's also ignore the cputime | ||
185 | * elapsing after __exit_signal() in any running thread group timer. | ||
186 | * | ||
187 | * This makes sure that POSIX CPU clocks and timers are synchronized, so | ||
188 | * that a POSIX CPU timer won't expire while the corresponding POSIX CPU | ||
189 | * clock delta is behind the expiring timer value. | ||
190 | */ | ||
191 | if (unlikely(!tsk->sighand)) | ||
192 | return false; | ||
193 | |||
194 | return true; | ||
195 | } | ||
196 | |||
197 | /** | ||
165 | * account_group_user_time - Maintain utime for a thread group. | 198 | * account_group_user_time - Maintain utime for a thread group. |
166 | * | 199 | * |
167 | * @tsk: Pointer to task structure. | 200 | * @tsk: Pointer to task structure. |
@@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
176 | { | 209 | { |
177 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | 210 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
178 | 211 | ||
179 | if (!cputimer->running) | 212 | if (!cputimer_running(tsk)) |
180 | return; | 213 | return; |
181 | 214 | ||
182 | raw_spin_lock(&cputimer->lock); | 215 | raw_spin_lock(&cputimer->lock); |
@@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
199 | { | 232 | { |
200 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | 233 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
201 | 234 | ||
202 | if (!cputimer->running) | 235 | if (!cputimer_running(tsk)) |
203 | return; | 236 | return; |
204 | 237 | ||
205 | raw_spin_lock(&cputimer->lock); | 238 | raw_spin_lock(&cputimer->lock); |
@@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, | |||
222 | { | 255 | { |
223 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | 256 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
224 | 257 | ||
225 | if (!cputimer->running) | 258 | if (!cputimer_running(tsk)) |
226 | return; | 259 | return; |
227 | 260 | ||
228 | raw_spin_lock(&cputimer->lock); | 261 | raw_spin_lock(&cputimer->lock); |
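All three account_group_*() helpers above now share one shape: an unlocked early-out through cputimer_running(), then a locked fold of the sample into the group totals. Below is a userspace sketch of that shape; group_exited stands in for the !tsk->sighand test, a pthread mutex stands in for cputimer->lock, and the names are illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>

struct group_cputimer_sketch {
	int running;			/* a group-wide timer is armed */
	int group_exited;		/* __exit_signal() already ran */
	pthread_mutex_t lock;
	unsigned long long sum_exec_runtime;
};

static int cputimer_running_sketch(const struct group_cputimer_sketch *ct)
{
	/* Skip both when no timer is armed and when the accounting side has
	 * already been flushed at exit, so clocks and timers stay in step. */
	return ct->running && !ct->group_exited;
}

static void account_group_exec_sketch(struct group_cputimer_sketch *ct,
				      unsigned long long ns)
{
	if (!cputimer_running_sketch(ct))
		return;

	pthread_mutex_lock(&ct->lock);
	ct->sum_exec_runtime += ns;
	pthread_mutex_unlock(&ct->lock);
}

int main(void)
{
	struct group_cputimer_sketch ct = {
		.running = 1,
		.lock = PTHREAD_MUTEX_INITIALIZER,
	};

	account_group_exec_sketch(&ct, 4500);
	ct.group_exited = 1;			/* simulate __exit_signal() */
	account_group_exec_sketch(&ct, 9999);	/* dropped by the guard */

	printf("group runtime = %llu ns\n", ct.sum_exec_runtime);	/* 4500 */
	return 0;
}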
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index da5eb5bed84a..e08fbeeb54b9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
28 | struct task_struct *stop = rq->stop; | 28 | struct task_struct *stop = rq->stop; |
29 | 29 | ||
30 | if (stop && stop->on_rq) { | 30 | if (stop && stop->on_rq) { |
31 | stop->se.exec_start = rq->clock_task; | 31 | stop->se.exec_start = rq_clock_task(rq); |
32 | return stop; | 32 | return stop; |
33 | } | 33 | } |
34 | 34 | ||
@@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | |||
57 | struct task_struct *curr = rq->curr; | 57 | struct task_struct *curr = rq->curr; |
58 | u64 delta_exec; | 58 | u64 delta_exec; |
59 | 59 | ||
60 | delta_exec = rq->clock_task - curr->se.exec_start; | 60 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; |
61 | if (unlikely((s64)delta_exec < 0)) | 61 | if (unlikely((s64)delta_exec < 0)) |
62 | delta_exec = 0; | 62 | delta_exec = 0; |
63 | 63 | ||
@@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | |||
67 | curr->se.sum_exec_runtime += delta_exec; | 67 | curr->se.sum_exec_runtime += delta_exec; |
68 | account_group_exec_runtime(curr, delta_exec); | 68 | account_group_exec_runtime(curr, delta_exec); |
69 | 69 | ||
70 | curr->se.exec_start = rq->clock_task; | 70 | curr->se.exec_start = rq_clock_task(rq); |
71 | cpuacct_charge(curr, delta_exec); | 71 | cpuacct_charge(curr, delta_exec); |
72 | } | 72 | } |
73 | 73 | ||
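put_prev_task_stop() above follows the standard exec-runtime accounting recipe: sample rq_clock_task(), subtract the stamped exec_start, clamp a negative delta to zero (the s64 cast catches clock warps), add the result to sum_exec_runtime, and restamp exec_start so the next window starts now. A toy walk-through with made-up clock values:

/* Hypothetical numbers only, showing the clamp-and-accumulate pattern. */
#include <stdio.h>

int main(void)
{
	unsigned long long sum_exec_runtime = 0;
	unsigned long long exec_start = 1000000;   /* stamped when the task was picked */
	unsigned long long clock_task = 1004500;   /* what rq_clock_task() returns now */

	long long delta_exec = (long long)(clock_task - exec_start);
	if (delta_exec < 0)			   /* guard against the clock going backwards */
		delta_exec = 0;

	sum_exec_runtime += delta_exec;		   /* 4500 ns charged to the stop task */
	exec_start = clock_task;		   /* next accounting window starts now */

	printf("charged %lld ns, total %llu ns\n", delta_exec, sum_exec_runtime);
	return 0;
}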
@@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq) | |||
79 | { | 79 | { |
80 | struct task_struct *stop = rq->stop; | 80 | struct task_struct *stop = rq->stop; |
81 | 81 | ||
82 | stop->se.exec_start = rq->clock_task; | 82 | stop->se.exec_start = rq_clock_task(rq); |
83 | } | 83 | } |
84 | 84 | ||
85 | static void switched_to_stop(struct rq *rq, struct task_struct *p) | 85 | static void switched_to_stop(struct rq *rq, struct task_struct *p) |