Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile        2
-rw-r--r--  kernel/sched/auto_group.c    3
-rw-r--r--  kernel/sched/core.c        637
-rw-r--r--  kernel/sched/cputime.c       5
-rw-r--r--  kernel/sched/debug.c        37
-rw-r--r--  kernel/sched/fair.c        175
-rw-r--r--  kernel/sched/proc.c        591
-rw-r--r--  kernel/sched/rt.c          132
-rw-r--r--  kernel/sched/sched.h        71
-rw-r--r--  kernel/sched/stats.h        47
-rw-r--r--  kernel/sched/stop_task.c     8
11 files changed, 851 insertions, 857 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index deaf90e4a1de..54adcf35f495 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
 obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 64de5f8b0c9e..4a073539c58e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void)
 	if (IS_ERR(tg))
 		goto out_free;
 
-	sched_online_group(tg, &root_task_group);
-
 	kref_init(&ag->kref);
 	init_rwsem(&ag->lock);
 	ag->id = atomic_inc_return(&autogroup_seq_nr);
@@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void)
 #endif
 	tg->autogroup = ag;
 
+	sched_online_group(tg, &root_task_group);
 	return ag;
 
 out_free:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8b335016c52..9b1f2e533b95 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -679,7 +679,7 @@ void sched_avg_update(struct rq *rq)
 {
 	s64 period = sched_avg_period();
 
-	while ((s64)(rq->clock - rq->age_stamp) > period) {
+	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
 		/*
 		 * Inline assembly required to prevent the compiler
 		 * optimising this loop into a divmod call.
@@ -1340,7 +1340,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 		p->sched_class->task_woken(rq, p);
 
 	if (rq->idle_stamp) {
-		u64 delta = rq->clock - rq->idle_stamp;
+		u64 delta = rq_clock(rq) - rq->idle_stamp;
 		u64 max = 2*sysctl_sched_migration_cost;
 
 		if (delta > max)
@@ -1377,6 +1377,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 
 	rq = __task_rq_lock(p);
 	if (p->on_rq) {
+		/* check_preempt_curr() may use rq clock */
+		update_rq_clock(rq);
 		ttwu_do_wakeup(rq, p, wake_flags);
 		ret = 1;
 	}
@@ -1609,15 +1611,6 @@ static void __sched_fork(struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
-	p->se.avg.runnable_avg_period = 0;
-	p->se.avg.runnable_avg_sum = 0;
-#endif
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
@@ -1761,6 +1754,8 @@ void wake_up_new_task(struct task_struct *p)
 	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
 #endif
 
+	/* Initialize new task's runnable average */
+	init_task_runnable_average(p);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
 	p->on_rq = 1;
@@ -2069,575 +2064,6 @@ unsigned long nr_iowait_cpu(int cpu)
 	return atomic_read(&this->nr_iowait);
 }
 
-unsigned long this_cpu_load(void)
-{
-	struct rq *this = this_rq();
-	return this->cpu_load[0];
-}
-
-
-/*
- * Global load-average calculations
- *
- * We take a distributed and async approach to calculating the global load-avg
- * in order to minimize overhead.
- *
- * The global load average is an exponentially decaying average of nr_running +
- * nr_uninterruptible.
- *
- * Once every LOAD_FREQ:
- *
- *   nr_active = 0;
- *   for_each_possible_cpu(cpu)
- *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
- *
- *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
- *
- * Due to a number of reasons the above turns in the mess below:
- *
- *  - for_each_possible_cpu() is prohibitively expensive on machines with
- *    serious number of cpus, therefore we need to take a distributed approach
- *    to calculating nr_active.
- *
- *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
- *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
- *
- *    So assuming nr_active := 0 when we start out -- true per definition, we
- *    can simply take per-cpu deltas and fold those into a global accumulate
- *    to obtain the same result. See calc_load_fold_active().
- *
- *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
- *    across the machine, we assume 10 ticks is sufficient time for every
- *    cpu to have completed this task.
- *
- *    This places an upper-bound on the IRQ-off latency of the machine. Then
- *    again, being late doesn't loose the delta, just wrecks the sample.
- *
- *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- *    this would add another cross-cpu cacheline miss and atomic operation
- *    to the wakeup path. Instead we increment on whatever cpu the task ran
- *    when it went into uninterruptible state and decrement on whatever cpu
- *    did the wakeup. This means that only the sum of nr_uninterruptible over
- *    all cpus yields the correct result.
- *
- *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
- */
-
-/* Variables and functions for calc_load */
-static atomic_long_t calc_load_tasks;
-static unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun); /* should be removed */
-
-/**
- * get_avenrun - get the load average array
- * @loads:	pointer to dest load array
- * @offset:	offset to add
- * @shift:	shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
-	loads[0] = (avenrun[0] + offset) << shift;
-	loads[1] = (avenrun[1] + offset) << shift;
-	loads[2] = (avenrun[2] + offset) << shift;
-}
-
-static long calc_load_fold_active(struct rq *this_rq)
-{
-	long nr_active, delta = 0;
-
-	nr_active = this_rq->nr_running;
-	nr_active += (long) this_rq->nr_uninterruptible;
-
-	if (nr_active != this_rq->calc_load_active) {
-		delta = nr_active - this_rq->calc_load_active;
-		this_rq->calc_load_active = nr_active;
-	}
-
-	return delta;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- */
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-	load *= exp;
-	load += active * (FIXED_1 - exp);
-	load += 1UL << (FSHIFT - 1);
-	return load >> FSHIFT;
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * Handle NO_HZ for the global load-average.
- *
- * Since the above described distributed algorithm to compute the global
- * load-average relies on per-cpu sampling from the tick, it is affected by
- * NO_HZ.
- *
- * The basic idea is to fold the nr_active delta into a global idle-delta upon
- * entering NO_HZ state such that we can include this as an 'extra' cpu delta
- * when we read the global state.
- *
- * Obviously reality has to ruin such a delightfully simple scheme:
- *
- *  - When we go NO_HZ idle during the window, we can negate our sample
- *    contribution, causing under-accounting.
- *
- *    We avoid this by keeping two idle-delta counters and flipping them
- *    when the window starts, thus separating old and new NO_HZ load.
- *
- *    The only trick is the slight shift in index flip for read vs write.
- *
- *        0s            5s            10s           15s
- *          +10           +10           +10           +10
- *        |-|-----------|-|-----------|-|-----------|-|
- *    r:0 0 1           1 0           0 1           1 0
- *    w:0 1 1           0 0           1 1           0 0
- *
- *    This ensures we'll fold the old idle contribution in this window while
- *    accumlating the new one.
- *
- *  - When we wake up from NO_HZ idle during the window, we push up our
- *    contribution, since we effectively move our sample point to a known
- *    busy state.
- *
- *    This is solved by pushing the window forward, and thus skipping the
- *    sample, for this cpu (effectively using the idle-delta for this cpu which
- *    was in effect at the time the window opened). This also solves the issue
- *    of having to deal with a cpu having been in NOHZ idle for multiple
- *    LOAD_FREQ intervals.
- *
- * When making the ILB scale, we should try to pull this in as well.
- */
-static atomic_long_t calc_load_idle[2];
-static int calc_load_idx;
-
-static inline int calc_load_write_idx(void)
-{
-	int idx = calc_load_idx;
-
-	/*
-	 * See calc_global_nohz(), if we observe the new index, we also
-	 * need to observe the new update time.
-	 */
-	smp_rmb();
-
-	/*
-	 * If the folding window started, make sure we start writing in the
-	 * next idle-delta.
-	 */
-	if (!time_before(jiffies, calc_load_update))
-		idx++;
-
-	return idx & 1;
-}
-
-static inline int calc_load_read_idx(void)
-{
-	return calc_load_idx & 1;
-}
-
-void calc_load_enter_idle(void)
-{
-	struct rq *this_rq = this_rq();
-	long delta;
-
-	/*
-	 * We're going into NOHZ mode, if there's any pending delta, fold it
-	 * into the pending idle delta.
-	 */
-	delta = calc_load_fold_active(this_rq);
-	if (delta) {
-		int idx = calc_load_write_idx();
-		atomic_long_add(delta, &calc_load_idle[idx]);
-	}
-}
-
-void calc_load_exit_idle(void)
-{
-	struct rq *this_rq = this_rq();
-
-	/*
-	 * If we're still before the sample window, we're done.
-	 */
-	if (time_before(jiffies, this_rq->calc_load_update))
-		return;
-
-	/*
-	 * We woke inside or after the sample window, this means we're already
-	 * accounted through the nohz accounting, so skip the entire deal and
-	 * sync up for the next window.
-	 */
-	this_rq->calc_load_update = calc_load_update;
-	if (time_before(jiffies, this_rq->calc_load_update + 10))
-		this_rq->calc_load_update += LOAD_FREQ;
-}
-
-static long calc_load_fold_idle(void)
-{
-	int idx = calc_load_read_idx();
-	long delta = 0;
-
-	if (atomic_long_read(&calc_load_idle[idx]))
-		delta = atomic_long_xchg(&calc_load_idle[idx], 0);
-
-	return delta;
-}
-
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x:         base of the power
- * @frac_bits: fractional bits of @x
- * @n:         power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
-	unsigned long result = 1UL << frac_bits;
-
-	if (n) for (;;) {
-		if (n & 1) {
-			result *= x;
-			result += 1UL << (frac_bits - 1);
-			result >>= frac_bits;
-		}
-		n >>= 1;
-		if (!n)
-			break;
-		x *= x;
-		x += 1UL << (frac_bits - 1);
-		x >>= frac_bits;
-	}
-
-	return result;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- *    = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- *  ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- *    = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- *              n         1 - x^(n+1)
- *     S_n := \Sum x^i = -------------
- *             i=0          1 - x
- */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
-	    unsigned long active, unsigned int n)
-{
-
-	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
-
-/*
- * NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
- *
- * Once we've updated the global active value, we need to apply the exponential
- * weights adjusted to the number of cycles missed.
- */
-static void calc_global_nohz(void)
-{
-	long delta, active, n;
-
-	if (!time_before(jiffies, calc_load_update + 10)) {
-		/*
-		 * Catch-up, fold however many we are behind still
-		 */
-		delta = jiffies - calc_load_update - 10;
-		n = 1 + (delta / LOAD_FREQ);
-
-		active = atomic_long_read(&calc_load_tasks);
-		active = active > 0 ? active * FIXED_1 : 0;
-
-		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-
-		calc_load_update += n * LOAD_FREQ;
-	}
-
-	/*
-	 * Flip the idle index...
-	 *
-	 * Make sure we first write the new time then flip the index, so that
-	 * calc_load_write_idx() will see the new time when it reads the new
-	 * index, this avoids a double flip messing things up.
-	 */
-	smp_wmb();
-	calc_load_idx++;
-}
-#else /* !CONFIG_NO_HZ_COMMON */
-
-static inline long calc_load_fold_idle(void) { return 0; }
-static inline void calc_global_nohz(void) { }
-
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * calc_load - update the avenrun load estimates 10 ticks after the
- * CPUs have updated calc_load_tasks.
- */
-void calc_global_load(unsigned long ticks)
-{
-	long active, delta;
-
-	if (time_before(jiffies, calc_load_update + 10))
-		return;
-
-	/*
-	 * Fold the 'old' idle-delta to include all NO_HZ cpus.
-	 */
-	delta = calc_load_fold_idle();
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-
-	active = atomic_long_read(&calc_load_tasks);
-	active = active > 0 ? active * FIXED_1 : 0;
-
-	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
-	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
-	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
-	calc_load_update += LOAD_FREQ;
-
-	/*
-	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
-	 */
-	calc_global_nohz();
-}
-
-/*
- * Called from update_cpu_load() to periodically update this CPU's
- * active count.
- */
-static void calc_load_account_active(struct rq *this_rq)
-{
-	long delta;
-
-	if (time_before(jiffies, this_rq->calc_load_update))
-		return;
-
-	delta = calc_load_fold_active(this_rq);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-
-	this_rq->calc_load_update += LOAD_FREQ;
-}
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT		7
-static const unsigned char
-		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
-		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-					{0, 0, 0, 0, 0, 0, 0, 0},
-					{64, 32, 8, 0, 0, 0, 0, 0},
-					{96, 72, 40, 12, 1, 0, 0},
-					{112, 98, 75, 43, 15, 1, 0},
-					{120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
-	int j = 0;
-
-	if (!missed_updates)
-		return load;
-
-	if (missed_updates >= degrade_zero_ticks[idx])
-		return 0;
-
-	if (idx == 1)
-		return load >> missed_updates;
-
-	while (missed_updates) {
-		if (missed_updates % 2)
-			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
-		missed_updates >>= 1;
-		j++;
-	}
-	return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
-			      unsigned long pending_updates)
-{
-	int i, scale;
-
-	this_rq->nr_load_updates++;
-
-	/* Update our load: */
-	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
-	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
-		unsigned long old_load, new_load;
-
-		/* scale is effectively 1 << i now, and >> i divides by scale */
-
-		old_load = this_rq->cpu_load[i];
-		old_load = decay_load_missed(old_load, pending_updates - 1, i);
-		new_load = this_load;
-		/*
-		 * Round up the averaging division if load is increasing. This
-		 * prevents us from getting stuck on 9 if the load is 10, for
-		 * example.
-		 */
-		if (new_load > old_load)
-			new_load += scale - 1;
-
-		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
-	}
-
-	sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long load = this_rq->load.weight;
-	unsigned long pending_updates;
-
-	/*
-	 * bail if there's load or we're actually up-to-date.
-	 */
-	if (load || curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	this_rq->last_load_update_tick = curr_jiffies;
-
-	__update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
-	struct rq *this_rq = this_rq();
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long pending_updates;
-
-	if (curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	raw_spin_lock(&this_rq->lock);
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	if (pending_updates) {
-		this_rq->last_load_update_tick = curr_jiffies;
-		/*
-		 * We were idle, this means load 0, the current load might be
-		 * !0 due to remote wakeups and the sort.
-		 */
-		__update_cpu_load(this_rq, 0, pending_updates);
-	}
-	raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * Called from scheduler_tick()
- */
-static void update_cpu_load_active(struct rq *this_rq)
-{
-	/*
-	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
-	 */
-	this_rq->last_load_update_tick = jiffies;
-	__update_cpu_load(this_rq, this_rq->load.weight, 1);
-
-	calc_load_account_active(this_rq);
-}
-
 #ifdef CONFIG_SMP
 
 /*
@@ -2686,7 +2112,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 
 	if (task_current(rq, p)) {
 		update_rq_clock(rq);
-		ns = rq->clock_task - p->se.exec_start;
+		ns = rq_clock_task(rq) - p->se.exec_start;
 		if ((s64)ns < 0)
 			ns = 0;
 	}
@@ -2739,8 +2165,8 @@ void scheduler_tick(void)
 
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
-	update_cpu_load_active(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+	update_cpu_load_active(rq);
 	raw_spin_unlock(&rq->lock);
 
 	perf_event_task_tick();
@@ -4960,6 +4386,13 @@ static void migrate_tasks(unsigned int dead_cpu)
 	 */
 	rq->stop = NULL;
 
+	/*
+	 * put_prev_task() and pick_next_task() sched
+	 * class method both need to have an up-to-date
+	 * value of rq->clock[_task]
+	 */
+	update_rq_clock(rq);
+
 	for ( ; ; ) {
 		/*
 		 * There's this thread running, bail when that's the only
@@ -5093,7 +4526,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 	return table;
 }
 
-static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 {
 	struct ctl_table *entry, *table;
 	struct sched_domain *sd;
@@ -5907,7 +5340,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 	get_group(cpu, sdd, &sd->groups);
 	atomic_inc(&sd->groups->ref);
 
-	if (cpu != cpumask_first(sched_domain_span(sd)))
+	if (cpu != cpumask_first(span))
 		return 0;
 
 	lockdep_assert_held(&sched_domains_mutex);
@@ -5917,12 +5350,12 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
 	for_each_cpu(i, span) {
 		struct sched_group *sg;
-		int group = get_group(i, sdd, &sg);
-		int j;
+		int group, j;
 
 		if (cpumask_test_cpu(i, covered))
 			continue;
 
+		group = get_group(i, sdd, &sg);
 		cpumask_clear(sched_group_cpus(sg));
 		sg->sgp->power = 0;
 		cpumask_setall(sched_group_mask(sg));
@@ -5960,7 +5393,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
 	struct sched_group *sg = sd->groups;
 
-	WARN_ON(!sd || !sg);
+	WARN_ON(!sg);
 
 	do {
 		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
@@ -6125,6 +5558,9 @@ static struct sched_domain_topology_level default_topology[] = {
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#define for_each_sd_topology(tl)			\
+	for (tl = sched_domain_topology; tl->init; tl++)
+
 #ifdef CONFIG_NUMA
 
 static int sched_domains_numa_levels;
@@ -6422,7 +5858,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 	struct sched_domain_topology_level *tl;
 	int j;
 
-	for (tl = sched_domain_topology; tl->init; tl++) {
+	for_each_sd_topology(tl) {
 		struct sd_data *sdd = &tl->data;
 
 		sdd->sd = alloc_percpu(struct sched_domain *);
@@ -6475,7 +5911,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	struct sched_domain_topology_level *tl;
 	int j;
 
-	for (tl = sched_domain_topology; tl->init; tl++) {
+	for_each_sd_topology(tl) {
 		struct sd_data *sdd = &tl->data;
 
 		for_each_cpu(j, cpu_map) {
@@ -6503,9 +5939,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
 }
 
 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
-		struct s_data *d, const struct cpumask *cpu_map,
-		struct sched_domain_attr *attr, struct sched_domain *child,
-		int cpu)
+		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+		struct sched_domain *child, int cpu)
 {
 	struct sched_domain *sd = tl->init(tl, cpu);
 	if (!sd)
@@ -6516,8 +5951,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		sd->level = child->level + 1;
 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
 		child->parent = sd;
+		sd->child = child;
 	}
-	sd->child = child;
 	set_domain_attribute(sd, attr);
 
 	return sd;
@@ -6530,7 +5965,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 static int build_sched_domains(const struct cpumask *cpu_map,
 			       struct sched_domain_attr *attr)
 {
-	enum s_alloc alloc_state = sa_none;
+	enum s_alloc alloc_state;
 	struct sched_domain *sd;
 	struct s_data d;
 	int i, ret = -ENOMEM;
@@ -6544,18 +5979,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 		struct sched_domain_topology_level *tl;
 
 		sd = NULL;
-		for (tl = sched_domain_topology; tl->init; tl++) {
-			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+		for_each_sd_topology(tl) {
+			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+			if (tl == sched_domain_topology)
+				*per_cpu_ptr(d.sd, i) = sd;
 			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
 				sd->flags |= SD_OVERLAP;
 			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
 				break;
 		}
-
-		while (sd->child)
-			sd = sd->child;
-
-		*per_cpu_ptr(d.sd, i) = sd;
 	}
 
 	/* Build the groups for the domains */
@@ -6867,9 +6299,6 @@ void __init sched_init_smp(void)
 	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
 	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 
-	/* RT runtime code needs to handle some hotplug events */
-	hotcpu_notifier(update_runtime, 0);
-
 	init_hrtick();
 
 	/* Move init over to a non-isolated CPU */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b5ccba22603b..a7959e05a9d5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
 
 	for (;;) {
 		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime) {
-			u64 tmp = rtime; rtime = stime; stime = tmp;
-		}
+		if (stime > rtime)
+			swap(rtime, stime);
 
 		/* Make sure 'total' fits in 32 bits */
 		if (total >> 32)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a673520..e076bddd4c66 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->nr_spread_over);
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_SMP
-	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
+	SEQ_printf(m, "  .%-30s: %ld\n", "runnable_load_avg",
 			cfs_rq->runnable_load_avg);
-	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
+	SEQ_printf(m, "  .%-30s: %ld\n", "blocked_load_avg",
 			cfs_rq->blocked_load_avg);
-	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_avg",
-			(unsigned long long)atomic64_read(&cfs_rq->tg->load_avg));
-	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_contrib",
 			cfs_rq->tg_load_contrib);
 	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib",
 			cfs_rq->tg_runnable_contrib);
+	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg",
+			atomic_long_read(&cfs_rq->tg->load_avg));
 	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
 			atomic_read(&cfs_rq->tg->runnable_avg));
 #endif
+#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
@@ -493,15 +495,16 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
 		get_nr_threads(p));
 	SEQ_printf(m,
-		"---------------------------------------------------------\n");
+		"---------------------------------------------------------"
+		"----------\n");
 #define __P(F) \
-	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
+	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
 #define P(F) \
-	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
+	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
 #define __PN(F) \
-	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
 #define PN(F) \
-	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
 
 	PN(se.exec_start);
 	PN(se.vruntime);
@@ -560,12 +563,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	}
 #endif
 	__P(nr_switches);
-	SEQ_printf(m, "%-35s:%21Ld\n",
+	SEQ_printf(m, "%-45s:%21Ld\n",
 		   "nr_voluntary_switches", (long long)p->nvcsw);
-	SEQ_printf(m, "%-35s:%21Ld\n",
+	SEQ_printf(m, "%-45s:%21Ld\n",
 		   "nr_involuntary_switches", (long long)p->nivcsw);
 
 	P(se.load.weight);
+#ifdef CONFIG_SMP
+	P(se.avg.runnable_avg_sum);
+	P(se.avg.runnable_avg_period);
+	P(se.avg.load_avg_contrib);
+	P(se.avg.decay_count);
+#endif
 	P(policy);
 	P(prio);
 #undef PN
@@ -579,7 +588,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
 		t0 = cpu_clock(this_cpu);
 		t1 = cpu_clock(this_cpu);
-		SEQ_printf(m, "%-35s:%21Ld\n",
+		SEQ_printf(m, "%-45s:%21Ld\n",
 			   "clock-delta", (long long)(t1-t0));
 	}
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614465c8..f77f9c527449 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
 
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+	lw->weight += inc;
+	lw->inv_weight = 0;
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+	lw->weight -= dec;
+	lw->inv_weight = 0;
+}
+
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+	lw->weight = w;
+	lw->inv_weight = 0;
+}
+
 /*
  * Increase the granularity value when there are more CPUs,
  * because with more CPUs the 'effective latency' as visible
@@ -662,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
+#ifdef CONFIG_SMP
+static inline void __update_task_entity_contrib(struct sched_entity *se);
+
+/* Give new task start runnable values to heavy its load in infant time */
+void init_task_runnable_average(struct task_struct *p)
+{
+	u32 slice;
+
+	p->se.avg.decay_count = 0;
+	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
+	p->se.avg.runnable_avg_sum = slice;
+	p->se.avg.runnable_avg_period = slice;
+	__update_task_entity_contrib(&p->se);
+}
+#else
+void init_task_runnable_average(struct task_struct *p)
+{
+}
+#endif
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -686,7 +724,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
-	u64 now = rq_of(cfs_rq)->clock_task;
+	u64 now = rq_clock_task(rq_of(cfs_rq));
 	unsigned long delta_exec;
 
 	if (unlikely(!curr))
@@ -718,7 +756,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
+	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
 }
 
 /*
@@ -738,14 +776,14 @@ static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
-			rq_of(cfs_rq)->clock - se->statistics.wait_start));
+			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
-			rq_of(cfs_rq)->clock - se->statistics.wait_start);
+			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 #ifdef CONFIG_SCHEDSTATS
 	if (entity_is_task(se)) {
 		trace_sched_stat_wait(task_of(se),
-			rq_of(cfs_rq)->clock - se->statistics.wait_start);
+			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 	}
 #endif
 	schedstat_set(se->statistics.wait_start, 0);
@@ -771,7 +809,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	/*
 	 * We are starting a new run period:
 	 */
-	se->exec_start = rq_of(cfs_rq)->clock_task;
+	se->exec_start = rq_clock_task(rq_of(cfs_rq));
 }
 
 /**************************************************
@@ -1037,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 	 * to gain a more accurate current total weight. See
 	 * update_cfs_rq_load_contribution().
 	 */
-	tg_weight = atomic64_read(&tg->load_avg);
+	tg_weight = atomic_long_read(&tg->load_avg);
 	tg_weight -= cfs_rq->tg_load_contrib;
 	tg_weight += cfs_rq->load.weight;
 
@@ -1110,8 +1148,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 /*
  * We choose a half-life close to 1 scheduling period.
  * Note: The tables below are dependent on this value.
@@ -1319,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 						 int force_update)
 {
 	struct task_group *tg = cfs_rq->tg;
-	s64 tg_contrib;
+	long tg_contrib;
 
 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
 	tg_contrib -= cfs_rq->tg_load_contrib;
 
-	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
-		atomic64_add(tg_contrib, &tg->load_avg);
+	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+		atomic_long_add(tg_contrib, &tg->load_avg);
 		cfs_rq->tg_load_contrib += tg_contrib;
 	}
 }
@@ -1360,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
 	u64 contrib;
 
 	contrib = cfs_rq->tg_load_contrib * tg->shares;
-	se->avg.load_avg_contrib = div64_u64(contrib,
-						atomic64_read(&tg->load_avg) + 1);
+	se->avg.load_avg_contrib = div_u64(contrib,
+					   atomic_long_read(&tg->load_avg) + 1);
 
 	/*
 	 * For group entities we need to compute a correction term in the case
@@ -1480,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 	if (!decays && !force_update)
 		return;
 
-	if (atomic64_read(&cfs_rq->removed_load)) {
-		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+	if (atomic_long_read(&cfs_rq->removed_load)) {
+		unsigned long removed_load;
+		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
 		subtract_blocked_load_contrib(cfs_rq, removed_load);
 	}
 
@@ -1497,7 +1535,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
-	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 
@@ -1510,9 +1548,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 	 * We track migrations using entity decay_count <= 0, on a wake-up
 	 * migration we use a negative decay count to track the remote decays
 	 * accumulated while sleeping.
+	 *
+	 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
+	 * are seen by enqueue_entity_load_avg() as a migration with an already
+	 * constructed load_avg_contrib.
 	 */
 	if (unlikely(se->avg.decay_count <= 0)) {
-		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
 		if (se->avg.decay_count) {
 			/*
 			 * In a wake-up migration we have to approximate the
@@ -1530,7 +1572,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 		}
 		wakeup = 0;
 	} else {
-		__synchronize_entity_decay(se);
+		/*
+		 * Task re-woke on same cpu (or else migrate_task_rq_fair()
+		 * would have made count negative); we must be careful to avoid
+		 * double-accounting blocked time after synchronizing decays.
+		 */
+		se->avg.last_runnable_update += __synchronize_entity_decay(se)
+							<< 20;
 	}
 
 	/* migrated tasks did not contribute to our blocked load */
@@ -1607,7 +1655,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	tsk = task_of(se);
 
 	if (se->statistics.sleep_start) {
-		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
+		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
 
 		if ((s64)delta < 0)
 			delta = 0;
@@ -1624,7 +1672,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		}
 	}
 	if (se->statistics.block_start) {
-		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
+		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
 
 		if ((s64)delta < 0)
 			delta = 0;
@@ -1712,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	/*
 	 * Update the normalized vruntime before updating min_vruntime
-	 * through callig update_curr().
+	 * through calling update_curr().
 	 */
 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
 		se->vruntime += cfs_rq->min_vruntime;
@@ -1805,9 +1853,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		struct task_struct *tsk = task_of(se);
 
 		if (tsk->state & TASK_INTERRUPTIBLE)
-			se->statistics.sleep_start = rq_of(cfs_rq)->clock;
+			se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
 		if (tsk->state & TASK_UNINTERRUPTIBLE)
-			se->statistics.block_start = rq_of(cfs_rq)->clock;
+			se->statistics.block_start = rq_clock(rq_of(cfs_rq));
 		}
 #endif
 	}
@@ -2082,7 +2130,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	if (unlikely(cfs_rq->throttle_count))
 		return cfs_rq->throttled_clock_task;
 
-	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
 }
 
 /* returns 0 on failure to allocate runtime */
@@ -2138,10 +2186,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct rq *rq = rq_of(cfs_rq);
 
 	/* if the deadline is ahead of our clock, nothing to do */
-	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
 		return;
 
 	if (cfs_rq->runtime_remaining < 0)
@@ -2230,7 +2277,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 #ifdef CONFIG_SMP
 	if (!cfs_rq->throttle_count) {
 		/* adjust cfs_rq_clock_task() */
-		cfs_rq->throttled_clock_task_time += rq->clock_task -
+		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
 					     cfs_rq->throttled_clock_task;
 	}
 #endif
@@ -2245,7 +2292,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 
 	/* group is entering throttled state, stop time */
 	if (!cfs_rq->throttle_count)
-		cfs_rq->throttled_clock_task = rq->clock_task;
+		cfs_rq->throttled_clock_task = rq_clock_task(rq);
 	cfs_rq->throttle_count++;
 
 	return 0;
@@ -2284,7 +2331,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	rq->nr_running -= task_delta;
 
 	cfs_rq->throttled = 1;
-	cfs_rq->throttled_clock = rq->clock;
+	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 	raw_spin_unlock(&cfs_b->lock);
@@ -2298,15 +2345,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	int enqueue = 1;
 	long task_delta;
 
-	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+	se = cfs_rq->tg->se[cpu_of(rq)];
 
 	cfs_rq->throttled = 0;
+
+	update_rq_clock(rq);
+
 	raw_spin_lock(&cfs_b->lock);
-	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
+	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
 	list_del_rcu(&cfs_rq->throttled_list);
 	raw_spin_unlock(&cfs_b->lock);
 
-	update_rq_clock(rq);
 	/* update hierarchical throttle state */
 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
 
@@ -2599,10 +2648,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 		throttle_cfs_rq(cfs_rq);
 }
 
-static inline u64 default_cfs_period(void);
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
-static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
-
 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
@@ -2706,7 +2751,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 #else /* CONFIG_CFS_BANDWIDTH */
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 {
-	return rq_of(cfs_rq)->clock_task;
+	return rq_clock_task(rq_of(cfs_rq));
 }
 
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
@@ -2919,7 +2964,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
-	return cpu_rq(cpu)->load.weight;
+	return cpu_rq(cpu)->cfs.runnable_load_avg;
 }
 
 /*
@@ -2964,9 +3009,10 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+	unsigned long load_avg = rq->cfs.runnable_load_avg;
 
 	if (nr_running)
-		return rq->load.weight / nr_running;
+		return load_avg / nr_running;
 
 	return 0;
 }
@@ -3416,12 +3462,6 @@ unlock:
 }
 
 /*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
  * previous cpu. However, the caller only guarantees p->pi_lock is held; no
@@ -3441,10 +3481,10 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 	 */
 	if (se->avg.decay_count) {
 		se->avg.decay_count = -__synchronize_entity_decay(se);
-		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+		atomic_long_add(se->avg.load_avg_contrib,
+				&cfs_rq->removed_load);
 	}
 }
-#endif
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -3946,7 +3986,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 2) too many balance attempts have failed.
 	 */
 
-	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
 	if (!tsk_cache_hot ||
 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 
@@ -4141,11 +4181,11 @@ static int tg_load_down(struct task_group *tg, void *data)
4141 long cpu = (long)data; 4181 long cpu = (long)data;
4142 4182
4143 if (!tg->parent) { 4183 if (!tg->parent) {
4144 load = cpu_rq(cpu)->load.weight; 4184 load = cpu_rq(cpu)->avg.load_avg_contrib;
4145 } else { 4185 } else {
4146 load = tg->parent->cfs_rq[cpu]->h_load; 4186 load = tg->parent->cfs_rq[cpu]->h_load;
4147 load *= tg->se[cpu]->load.weight; 4187 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4148 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 4188 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4149 } 4189 }
4150 4190
4151 tg->cfs_rq[cpu]->h_load = load; 4191 tg->cfs_rq[cpu]->h_load = load;
@@ -4171,12 +4211,9 @@ static void update_h_load(long cpu)
4171static unsigned long task_h_load(struct task_struct *p) 4211static unsigned long task_h_load(struct task_struct *p)
4172{ 4212{
4173 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4213 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4174 unsigned long load;
4175
4176 load = p->se.load.weight;
4177 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
4178 4214
4179 return load; 4215 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4216 cfs_rq->runnable_load_avg + 1);
4180} 4217}
4181#else 4218#else
4182static inline void update_blocked_averages(int cpu) 4219static inline void update_blocked_averages(int cpu)
@@ -4189,7 +4226,7 @@ static inline void update_h_load(long cpu)
4189 4226
4190static unsigned long task_h_load(struct task_struct *p) 4227static unsigned long task_h_load(struct task_struct *p)
4191{ 4228{
4192 return p->se.load.weight; 4229 return p->se.avg.load_avg_contrib;
4193} 4230}
4194#endif 4231#endif
4195 4232
@@ -4302,7 +4339,7 @@ static unsigned long scale_rt_power(int cpu)
4302 age_stamp = ACCESS_ONCE(rq->age_stamp); 4339 age_stamp = ACCESS_ONCE(rq->age_stamp);
4303 avg = ACCESS_ONCE(rq->rt_avg); 4340 avg = ACCESS_ONCE(rq->rt_avg);
4304 4341
4305 total = sched_avg_period() + (rq->clock - age_stamp); 4342 total = sched_avg_period() + (rq_clock(rq) - age_stamp);
4306 4343
4307 if (unlikely(total < avg)) { 4344 if (unlikely(total < avg)) {
4308 /* Ensures that power won't end up being negative */ 4345 /* Ensures that power won't end up being negative */
@@ -5241,7 +5278,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5241 int pulled_task = 0; 5278 int pulled_task = 0;
5242 unsigned long next_balance = jiffies + HZ; 5279 unsigned long next_balance = jiffies + HZ;
5243 5280
5244 this_rq->idle_stamp = this_rq->clock; 5281 this_rq->idle_stamp = rq_clock(this_rq);
5245 5282
5246 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5283 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5247 return; 5284 return;
@@ -5418,10 +5455,9 @@ static inline void nohz_balance_exit_idle(int cpu)
5418static inline void set_cpu_sd_state_busy(void) 5455static inline void set_cpu_sd_state_busy(void)
5419{ 5456{
5420 struct sched_domain *sd; 5457 struct sched_domain *sd;
5421 int cpu = smp_processor_id();
5422 5458
5423 rcu_read_lock(); 5459 rcu_read_lock();
5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5460 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5425 5461
5426 if (!sd || !sd->nohz_idle) 5462 if (!sd || !sd->nohz_idle)
5427 goto unlock; 5463 goto unlock;
@@ -5436,10 +5472,9 @@ unlock:
5436void set_cpu_sd_state_idle(void) 5472void set_cpu_sd_state_idle(void)
5437{ 5473{
5438 struct sched_domain *sd; 5474 struct sched_domain *sd;
5439 int cpu = smp_processor_id();
5440 5475
5441 rcu_read_lock(); 5476 rcu_read_lock();
5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5477 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5443 5478
5444 if (!sd || sd->nohz_idle) 5479 if (!sd || sd->nohz_idle)
5445 goto unlock; 5480 goto unlock;
@@ -5848,7 +5883,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5848 se->vruntime -= cfs_rq->min_vruntime; 5883 se->vruntime -= cfs_rq->min_vruntime;
5849 } 5884 }
5850 5885
5851#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 5886#ifdef CONFIG_SMP
5852 /* 5887 /*
5853 * Remove our load from contribution when we leave sched_fair 5888 * Remove our load from contribution when we leave sched_fair
5854 * and ensure we don't carry in an old decay_count if we 5889 * and ensure we don't carry in an old decay_count if we
@@ -5907,9 +5942,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
5907#ifndef CONFIG_64BIT 5942#ifndef CONFIG_64BIT
5908 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5943 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5909#endif 5944#endif
5910#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 5945#ifdef CONFIG_SMP
5911 atomic64_set(&cfs_rq->decay_counter, 1); 5946 atomic64_set(&cfs_rq->decay_counter, 1);
5912 atomic64_set(&cfs_rq->removed_load, 0); 5947 atomic_long_set(&cfs_rq->removed_load, 0);
5913#endif 5948#endif
5914} 5949}
5915 5950
@@ -6091,6 +6126,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6091 se = tg->se[i]; 6126 se = tg->se[i];
6092 /* Propagate contribution to hierarchy */ 6127 /* Propagate contribution to hierarchy */
6093 raw_spin_lock_irqsave(&rq->lock, flags); 6128 raw_spin_lock_irqsave(&rq->lock, flags);
6129
6130 /* Possible calls to update_curr() need rq clock */
6131 update_rq_clock(rq);
6094 for_each_sched_entity(se) 6132 for_each_sched_entity(se)
6095 update_cfs_shares(group_cfs_rq(se)); 6133 update_cfs_shares(group_cfs_rq(se));
6096 raw_spin_unlock_irqrestore(&rq->lock, flags); 6134 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6146,9 +6184,8 @@ const struct sched_class fair_sched_class = {
6146 6184
6147#ifdef CONFIG_SMP 6185#ifdef CONFIG_SMP
6148 .select_task_rq = select_task_rq_fair, 6186 .select_task_rq = select_task_rq_fair,
6149#ifdef CONFIG_FAIR_GROUP_SCHED
6150 .migrate_task_rq = migrate_task_rq_fair, 6187 .migrate_task_rq = migrate_task_rq_fair,
6151#endif 6188
6152 .rq_online = rq_online_fair, 6189 .rq_online = rq_online_fair,
6153 .rq_offline = rq_offline_fair, 6190 .rq_offline = rq_offline_fair,
6154 6191
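
The fair.c hunks above switch the hierarchical load propagation (tg_load_down() and
task_h_load()) from instantaneous load.weight ratios to the per-entity load-tracking
averages. A minimal userspace sketch of the new arithmetic follows; the helper name
and the numbers are hypothetical, only the formula is taken from the hunks above.

#include <stdio.h>

/*
 * Each level scales the parent's h_load by its own load_avg_contrib
 * relative to the parent cfs_rq's runnable_load_avg; the "+ 1" avoids a
 * division by zero, mirroring div64_ul(load * contrib, runnable + 1).
 */
static unsigned long child_h_load(unsigned long parent_h_load,
				  unsigned long se_load_avg_contrib,
				  unsigned long parent_runnable_load_avg)
{
	return (unsigned long)((unsigned long long)parent_h_load *
			       se_load_avg_contrib /
			       (parent_runnable_load_avg + 1));
}

int main(void)
{
	unsigned long root = 2048;	/* stands in for cpu_rq(cpu)->avg.load_avg_contrib */
	unsigned long grp, task;

	grp  = child_h_load(root, 512, 1536);	/* group se inside the root cfs_rq */
	task = child_h_load(grp, 256, 768);	/* task inside the group's cfs_rq */
	printf("group h_load=%lu, task h_load=%lu\n", grp, task);
	return 0;
}
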
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
new file mode 100644
index 000000000000..16f5a30f9c88
--- /dev/null
+++ b/kernel/sched/proc.c
@@ -0,0 +1,591 @@
1/*
2 * kernel/sched/proc.c
3 *
4 * Kernel load calculations, forked from sched/core.c
5 */
6
7#include <linux/export.h>
8
9#include "sched.h"
10
11unsigned long this_cpu_load(void)
12{
13 struct rq *this = this_rq();
14 return this->cpu_load[0];
15}
16
17
18/*
19 * Global load-average calculations
20 *
21 * We take a distributed and async approach to calculating the global load-avg
22 * in order to minimize overhead.
23 *
24 * The global load average is an exponentially decaying average of nr_running +
25 * nr_uninterruptible.
26 *
27 * Once every LOAD_FREQ:
28 *
29 * nr_active = 0;
30 * for_each_possible_cpu(cpu)
31 * nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
32 *
33 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
34 *
35 * Due to a number of reasons the above turns into the mess below:
36 *
37 * - for_each_possible_cpu() is prohibitively expensive on machines with
38 * a serious number of cpus, therefore we need to take a distributed approach
39 * to calculating nr_active.
40 *
41 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
42 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
43 *
44 * So assuming nr_active := 0 when we start out -- true per definition, we
45 * can simply take per-cpu deltas and fold those into a global accumulate
46 * to obtain the same result. See calc_load_fold_active().
47 *
48 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
49 * across the machine, we assume 10 ticks is sufficient time for every
50 * cpu to have completed this task.
51 *
52 * This places an upper-bound on the IRQ-off latency of the machine. Then
53 * again, being late doesn't lose the delta, it just wrecks the sample.
54 *
55 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
56 * this would add another cross-cpu cacheline miss and atomic operation
57 * to the wakeup path. Instead we increment on whatever cpu the task ran
58 * when it went into uninterruptible state and decrement on whatever cpu
59 * did the wakeup. This means that only the sum of nr_uninterruptible over
60 * all cpus yields the correct result.
61 *
62 * This covers the NO_HZ=n code; for extra headaches, see the comment below.
63 */
64
65/* Variables and functions for calc_load */
66atomic_long_t calc_load_tasks;
67unsigned long calc_load_update;
68unsigned long avenrun[3];
69EXPORT_SYMBOL(avenrun); /* should be removed */
70
71/**
72 * get_avenrun - get the load average array
73 * @loads: pointer to dest load array
74 * @offset: offset to add
75 * @shift: shift count to shift the result left
76 *
77 * These values are estimates at best, so no need for locking.
78 */
79void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
80{
81 loads[0] = (avenrun[0] + offset) << shift;
82 loads[1] = (avenrun[1] + offset) << shift;
83 loads[2] = (avenrun[2] + offset) << shift;
84}
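
/*
 * Editor's illustration (not part of this patch): avenrun[] holds fixed-point
 * values with FSHIFT fractional bits, so a consumer such as fs/proc/loadavg.c
 * turns them into the familiar "0.42"-style numbers roughly as sketched below;
 * the FIXED_1/200 offset rounds the result to two decimal places.
 */
static inline void print_avenrun_sketch(unsigned long avn)
{
	unsigned long v = avn + FIXED_1/200;	/* round to 1/100 */

	printk("%lu.%02lu\n", v >> FSHIFT, ((v & (FIXED_1 - 1)) * 100) >> FSHIFT);
}
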
85
86long calc_load_fold_active(struct rq *this_rq)
87{
88 long nr_active, delta = 0;
89
90 nr_active = this_rq->nr_running;
91 nr_active += (long) this_rq->nr_uninterruptible;
92
93 if (nr_active != this_rq->calc_load_active) {
94 delta = nr_active - this_rq->calc_load_active;
95 this_rq->calc_load_active = nr_active;
96 }
97
98 return delta;
99}
100
101/*
102 * a1 = a0 * e + a * (1 - e)
103 */
104static unsigned long
105calc_load(unsigned long load, unsigned long exp, unsigned long active)
106{
107 load *= exp;
108 load += active * (FIXED_1 - exp);
109 load += 1UL << (FSHIFT - 1);
110 return load >> FSHIFT;
111}
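
/*
 * Worked example (editor's addition, not part of this patch): with
 * FSHIFT = 11, FIXED_1 = 2048 and EXP_1 = 1884, starting from avenrun[0] = 0
 * with two runnable tasks (active = 2 * FIXED_1 = 4096), one LOAD_FREQ step
 * gives
 *
 *	load = (0 * 1884 + 4096 * (2048 - 1884) + 1024) >> 11 = 328
 *
 * i.e. about 0.16 in /proc/loadavg units -- the expected slow exponential
 * ramp towards 2.00, since each sample only closes (1 - e) of the gap.
 */
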
112
113#ifdef CONFIG_NO_HZ_COMMON
114/*
115 * Handle NO_HZ for the global load-average.
116 *
117 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by
119 * NO_HZ.
120 *
121 * The basic idea is to fold the nr_active delta into a global idle-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
123 * when we read the global state.
124 *
125 * Obviously reality has to ruin such a delightfully simple scheme:
126 *
127 * - When we go NO_HZ idle during the window, we can negate our sample
128 * contribution, causing under-accounting.
129 *
130 * We avoid this by keeping two idle-delta counters and flipping them
131 * when the window starts, thus separating old and new NO_HZ load.
132 *
133 * The only trick is the slight shift in index flip for read vs write.
134 *
135 * 0s 5s 10s 15s
136 * +10 +10 +10 +10
137 * |-|-----------|-|-----------|-|-----------|-|
138 * r:0 0 1 1 0 0 1 1 0
139 * w:0 1 1 0 0 1 1 0 0
140 *
141 * This ensures we'll fold the old idle contribution in this window while
142 * accumulating the new one.
143 *
144 * - When we wake up from NO_HZ idle during the window, we push up our
145 * contribution, since we effectively move our sample point to a known
146 * busy state.
147 *
148 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the idle-delta for this cpu which
150 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NOHZ idle for multiple
152 * LOAD_FREQ intervals.
153 *
154 * When making the ILB scale, we should try to pull this in as well.
155 */
156static atomic_long_t calc_load_idle[2];
157static int calc_load_idx;
158
159static inline int calc_load_write_idx(void)
160{
161 int idx = calc_load_idx;
162
163 /*
164 * See calc_global_nohz(); if we observe the new index, we also
165 * need to observe the new update time.
166 */
167 smp_rmb();
168
169 /*
170 * If the folding window started, make sure we start writing in the
171 * next idle-delta.
172 */
173 if (!time_before(jiffies, calc_load_update))
174 idx++;
175
176 return idx & 1;
177}
178
179static inline int calc_load_read_idx(void)
180{
181 return calc_load_idx & 1;
182}
183
184void calc_load_enter_idle(void)
185{
186 struct rq *this_rq = this_rq();
187 long delta;
188
189 /*
190 * We're going into NOHZ mode; if there's any pending delta, fold it
191 * into the pending idle delta.
192 */
193 delta = calc_load_fold_active(this_rq);
194 if (delta) {
195 int idx = calc_load_write_idx();
196 atomic_long_add(delta, &calc_load_idle[idx]);
197 }
198}
199
200void calc_load_exit_idle(void)
201{
202 struct rq *this_rq = this_rq();
203
204 /*
205 * If we're still before the sample window, we're done.
206 */
207 if (time_before(jiffies, this_rq->calc_load_update))
208 return;
209
210 /*
211 * We woke up inside or after the sample window, which means we're already
212 * accounted through the nohz accounting, so skip the entire deal and
213 * sync up for the next window.
214 */
215 this_rq->calc_load_update = calc_load_update;
216 if (time_before(jiffies, this_rq->calc_load_update + 10))
217 this_rq->calc_load_update += LOAD_FREQ;
218}
219
220static long calc_load_fold_idle(void)
221{
222 int idx = calc_load_read_idx();
223 long delta = 0;
224
225 if (atomic_long_read(&calc_load_idle[idx]))
226 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
227
228 return delta;
229}
230
231/**
232 * fixed_power_int - compute: x^n, in O(log n) time
233 *
234 * @x: base of the power
235 * @frac_bits: fractional bits of @x
236 * @n: power to raise @x to.
237 *
238 * By exploiting the relation between the definition of the natural power
239 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
240 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
241 * (where: n_i \elem {0, 1}, the binary vector representing n),
242 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
243 * of course trivially computable in O(log_2 n), the length of our binary
244 * vector.
245 */
246static unsigned long
247fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
248{
249 unsigned long result = 1UL << frac_bits;
250
251 if (n) for (;;) {
252 if (n & 1) {
253 result *= x;
254 result += 1UL << (frac_bits - 1);
255 result >>= frac_bits;
256 }
257 n >>= 1;
258 if (!n)
259 break;
260 x *= x;
261 x += 1UL << (frac_bits - 1);
262 x >>= frac_bits;
263 }
264
265 return result;
266}
267
268/*
269 * a1 = a0 * e + a * (1 - e)
270 *
271 * a2 = a1 * e + a * (1 - e)
272 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
273 * = a0 * e^2 + a * (1 - e) * (1 + e)
274 *
275 * a3 = a2 * e + a * (1 - e)
276 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
277 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
278 *
279 * ...
280 *
281 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
282 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
283 * = a0 * e^n + a * (1 - e^n)
284 *
285 * [1] application of the geometric series:
286 *
287 * n 1 - x^(n+1)
288 * S_n := \Sum x^i = -------------
289 * i=0 1 - x
290 */
291static unsigned long
292calc_load_n(unsigned long load, unsigned long exp,
293 unsigned long active, unsigned int n)
294{
295
296 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
297}
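
/*
 * Worked example (editor's addition, not part of this patch): if the
 * catch-up path in calc_global_nohz() below has to account for n = 2 missed
 * LOAD_FREQ windows, calc_load_n() raises the decay factor to the n-th power
 * in fixed point:
 *
 *	fixed_power_int(EXP_1, FSHIFT, 2) = ((1884 * 1884 + 1024) >> 11) = 1733
 *
 * (about 0.846 * FIXED_1), and then performs a single calc_load() step with
 * it, instead of iterating the per-window update twice.
 */
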
298
299/*
300 * NO_HZ can leave us missing all per-cpu ticks calling
301 * calc_load_account_active(), but since an idle CPU folds its delta into
302 * calc_load_idle[] via calc_load_enter_idle(), all we need to do is fold
303 * in the pending idle delta if our idle period crossed a load cycle boundary.
304 *
305 * Once we've updated the global active value, we need to apply the exponential
306 * weights adjusted to the number of cycles missed.
307 */
308static void calc_global_nohz(void)
309{
310 long delta, active, n;
311
312 if (!time_before(jiffies, calc_load_update + 10)) {
313 /*
314 * Catch-up, fold however many we are behind still
315 */
316 delta = jiffies - calc_load_update - 10;
317 n = 1 + (delta / LOAD_FREQ);
318
319 active = atomic_long_read(&calc_load_tasks);
320 active = active > 0 ? active * FIXED_1 : 0;
321
322 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
323 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
324 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
325
326 calc_load_update += n * LOAD_FREQ;
327 }
328
329 /*
330 * Flip the idle index...
331 *
332 * Make sure we first write the new time then flip the index, so that
333 * calc_load_write_idx() will see the new time when it reads the new
334 * index; this avoids a double flip messing things up.
335 */
336 smp_wmb();
337 calc_load_idx++;
338}
339#else /* !CONFIG_NO_HZ_COMMON */
340
341static inline long calc_load_fold_idle(void) { return 0; }
342static inline void calc_global_nohz(void) { }
343
344#endif /* CONFIG_NO_HZ_COMMON */
345
346/*
347 * calc_global_load - update the avenrun load estimates 10 ticks after the
348 * CPUs have updated calc_load_tasks.
349 */
350void calc_global_load(unsigned long ticks)
351{
352 long active, delta;
353
354 if (time_before(jiffies, calc_load_update + 10))
355 return;
356
357 /*
358 * Fold the 'old' idle-delta to include all NO_HZ cpus.
359 */
360 delta = calc_load_fold_idle();
361 if (delta)
362 atomic_long_add(delta, &calc_load_tasks);
363
364 active = atomic_long_read(&calc_load_tasks);
365 active = active > 0 ? active * FIXED_1 : 0;
366
367 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
368 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
369 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
370
371 calc_load_update += LOAD_FREQ;
372
373 /*
374 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
375 */
376 calc_global_nohz();
377}
378
379/*
380 * Called from update_cpu_load_active() to periodically update this CPU's
381 * active count.
382 */
383static void calc_load_account_active(struct rq *this_rq)
384{
385 long delta;
386
387 if (time_before(jiffies, this_rq->calc_load_update))
388 return;
389
390 delta = calc_load_fold_active(this_rq);
391 if (delta)
392 atomic_long_add(delta, &calc_load_tasks);
393
394 this_rq->calc_load_update += LOAD_FREQ;
395}
396
397/*
398 * End of global load-average stuff
399 */
400
401/*
402 * The exact cpuload at various idx values, calculated at every tick would be
403 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
404 *
405 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
406 * on nth tick when cpu may be busy, then we have:
407 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
408 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
409 *
410 * decay_load_missed() below does efficient calculation of
411 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
412 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
413 *
414 * The calculation is approximated on a 128 point scale.
415 * degrade_zero_ticks is the number of ticks after which load at any
416 * particular idx is approximated to be zero.
417 * degrade_factor is a precomputed table, a row for each load idx.
418 * Each column corresponds to degradation factor for a power of two ticks,
419 * based on 128 point scale.
420 * Example:
421 * row 2, col 3 (=12) says that the degradation at load idx 2 after
422 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
423 *
424 * With these power-of-2 load factors, we can degrade the load n times
425 * by looking at the 1 bits in n and doing one mult/shift per set bit,
426 * instead of the n mult/shifts needed by the exact degradation.
427 */
428#define DEGRADE_SHIFT 7
429static const unsigned char
430 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
431static const unsigned char
432 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
433 {0, 0, 0, 0, 0, 0, 0, 0},
434 {64, 32, 8, 0, 0, 0, 0, 0},
435 {96, 72, 40, 12, 1, 0, 0},
436 {112, 98, 75, 43, 15, 1, 0},
437 {120, 112, 98, 76, 45, 16, 2} };
438
439/*
440 * Update cpu_load for any missed ticks due to tickless idle. The backlog
441 * only builds up while the CPU is idle, so we just decay the old load
442 * without adding any new load.
443 */
444static unsigned long
445decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
446{
447 int j = 0;
448
449 if (!missed_updates)
450 return load;
451
452 if (missed_updates >= degrade_zero_ticks[idx])
453 return 0;
454
455 if (idx == 1)
456 return load >> missed_updates;
457
458 while (missed_updates) {
459 if (missed_updates % 2)
460 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
461
462 missed_updates >>= 1;
463 j++;
464 }
465 return load;
466}
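
/*
 * Worked example (editor's addition, not part of this patch): a cpu_load[2]
 * value of 1024 on a cpu that has been idle for 8 ticks decays via the bit
 * at position 3 of missed_updates, i.e. degrade_factor[2][3] = 12:
 *
 *	decay_load_missed(1024, 8, 2) = (1024 * 12) >> 7 = 96
 *
 * a single multiply/shift instead of applying the per-tick 3/4 factor
 * eight times.
 */
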
467
468/*
469 * Update rq->cpu_load[] statistics. This function is usually called every
470 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
471 * every tick. We fix it up based on jiffies.
472 */
473static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
474 unsigned long pending_updates)
475{
476 int i, scale;
477
478 this_rq->nr_load_updates++;
479
480 /* Update our load: */
481 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
482 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
483 unsigned long old_load, new_load;
484
485 /* scale is effectively 1 << i now, and >> i divides by scale */
486
487 old_load = this_rq->cpu_load[i];
488 old_load = decay_load_missed(old_load, pending_updates - 1, i);
489 new_load = this_load;
490 /*
491 * Round up the averaging division if load is increasing. This
492 * prevents us from getting stuck on 9 if the load is 10, for
493 * example.
494 */
495 if (new_load > old_load)
496 new_load += scale - 1;
497
498 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
499 }
500
501 sched_avg_update(this_rq);
502}
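
/*
 * Worked example (editor's addition, not part of this patch) for the
 * round-up above: with i = 2 (scale = 4), an old cpu_load[2] of 9 and a
 * current load of 10 would, without the "+ scale - 1" adjustment, stay
 * stuck at
 *
 *	(9 * 3 + 10) >> 2 = 9
 *
 * whereas rounding the rising side up gives (9 * 3 + 13) >> 2 = 10, so the
 * decayed average can actually reach the new, higher load.
 */
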
503
504#ifdef CONFIG_SMP
505static inline unsigned long get_rq_runnable_load(struct rq *rq)
506{
507 return rq->cfs.runnable_load_avg;
508}
509#else
510static inline unsigned long get_rq_runnable_load(struct rq *rq)
511{
512 return rq->load.weight;
513}
514#endif
515
516#ifdef CONFIG_NO_HZ_COMMON
517/*
518 * There is no sane way to deal with nohz on smp when using jiffies because the
519 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
520 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
521 *
522 * Therefore we cannot use the delta approach from the regular tick since that
523 * would seriously skew the load calculation. However we'll make do for those
524 * updates happening while idle (nohz_idle_balance) or coming out of idle
525 * (tick_nohz_idle_exit).
526 *
527 * This means we might still be one tick off for nohz periods.
528 */
529
530/*
531 * Called from nohz_idle_balance() to update the load ratings before doing the
532 * idle balance.
533 */
534void update_idle_cpu_load(struct rq *this_rq)
535{
536 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
537 unsigned long load = get_rq_runnable_load(this_rq);
538 unsigned long pending_updates;
539
540 /*
541 * bail if there's load or we're actually up-to-date.
542 */
543 if (load || curr_jiffies == this_rq->last_load_update_tick)
544 return;
545
546 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
547 this_rq->last_load_update_tick = curr_jiffies;
548
549 __update_cpu_load(this_rq, load, pending_updates);
550}
551
552/*
553 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
554 */
555void update_cpu_load_nohz(void)
556{
557 struct rq *this_rq = this_rq();
558 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
559 unsigned long pending_updates;
560
561 if (curr_jiffies == this_rq->last_load_update_tick)
562 return;
563
564 raw_spin_lock(&this_rq->lock);
565 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
566 if (pending_updates) {
567 this_rq->last_load_update_tick = curr_jiffies;
568 /*
569 * We were idle, which means load 0; the current load might be
570 * !0 due to remote wakeups and the like.
571 */
572 __update_cpu_load(this_rq, 0, pending_updates);
573 }
574 raw_spin_unlock(&this_rq->lock);
575}
576#endif /* CONFIG_NO_HZ_COMMON */
577
578/*
579 * Called from scheduler_tick()
580 */
581void update_cpu_load_active(struct rq *this_rq)
582{
583 unsigned long load = get_rq_runnable_load(this_rq);
584 /*
585 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
586 */
587 this_rq->last_load_update_tick = jiffies;
588 __update_cpu_load(this_rq, load, 1);
589
590 calc_load_account_active(this_rq);
591}
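
The fixed-point machinery above can be exercised outside the kernel. The sketch below
is an editorial illustration, not part of the patch: it copies calc_load() verbatim,
mirrors the kernel's FSHIFT/EXP_* constants, and shows how avenrun converges while two
tasks stay runnable (one sample per LOAD_FREQ, roughly every 5 seconds).

#include <stdio.h>

#define FSHIFT		11
#define FIXED_1		(1UL << FSHIFT)
#define EXP_1		1884	/* 1/exp(5sec/1min) in fixed point */
#define EXP_5		2014	/* 1/exp(5sec/5min) */
#define EXP_15		2037	/* 1/exp(5sec/15min) */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	unsigned long active = 2 * FIXED_1;	/* pretend 2 tasks stay runnable */
	int i;

	for (i = 1; i <= 24; i++) {		/* 24 samples, roughly 2 minutes */
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
		if (i % 12 == 0)
			printf("~%d min: %lu.%02lu %lu.%02lu %lu.%02lu\n", i / 12,
			       avenrun[0] >> FSHIFT,
			       ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
			       avenrun[1] >> FSHIFT,
			       ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
			       avenrun[2] >> FSHIFT,
			       ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}
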
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 127a2c4cf4ab..01970c8e64df 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg)
399 (iter = next_task_group(iter)) && \ 399 (iter = next_task_group(iter)) && \
400 (rt_rq = iter->rt_rq[cpu_of(rq)]);) 400 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
401 401
402static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
403{
404 list_add_rcu(&rt_rq->leaf_rt_rq_list,
405 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
406}
407
408static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
409{
410 list_del_rcu(&rt_rq->leaf_rt_rq_list);
411}
412
413#define for_each_leaf_rt_rq(rt_rq, rq) \
414 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
415
416#define for_each_sched_rt_entity(rt_se) \ 402#define for_each_sched_rt_entity(rt_se) \
417 for (; rt_se; rt_se = rt_se->parent) 403 for (; rt_se; rt_se = rt_se->parent)
418 404
@@ -472,7 +458,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
472#ifdef CONFIG_SMP 458#ifdef CONFIG_SMP
473static inline const struct cpumask *sched_rt_period_mask(void) 459static inline const struct cpumask *sched_rt_period_mask(void)
474{ 460{
475 return cpu_rq(smp_processor_id())->rd->span; 461 return this_rq()->rd->span;
476} 462}
477#else 463#else
478static inline const struct cpumask *sched_rt_period_mask(void) 464static inline const struct cpumask *sched_rt_period_mask(void)
@@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t;
509#define for_each_rt_rq(rt_rq, iter, rq) \ 495#define for_each_rt_rq(rt_rq, iter, rq) \
510 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 496 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
511 497
512static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
513{
514}
515
516static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
517{
518}
519
520#define for_each_leaf_rt_rq(rt_rq, rq) \
521 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
522
523#define for_each_sched_rt_entity(rt_se) \ 498#define for_each_sched_rt_entity(rt_se) \
524 for (; rt_se; rt_se = NULL) 499 for (; rt_se; rt_se = NULL)
525 500
@@ -699,15 +674,6 @@ balanced:
699 } 674 }
700} 675}
701 676
702static void disable_runtime(struct rq *rq)
703{
704 unsigned long flags;
705
706 raw_spin_lock_irqsave(&rq->lock, flags);
707 __disable_runtime(rq);
708 raw_spin_unlock_irqrestore(&rq->lock, flags);
709}
710
711static void __enable_runtime(struct rq *rq) 677static void __enable_runtime(struct rq *rq)
712{ 678{
713 rt_rq_iter_t iter; 679 rt_rq_iter_t iter;
@@ -732,37 +698,6 @@ static void __enable_runtime(struct rq *rq)
732 } 698 }
733} 699}
734 700
735static void enable_runtime(struct rq *rq)
736{
737 unsigned long flags;
738
739 raw_spin_lock_irqsave(&rq->lock, flags);
740 __enable_runtime(rq);
741 raw_spin_unlock_irqrestore(&rq->lock, flags);
742}
743
744int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
745{
746 int cpu = (int)(long)hcpu;
747
748 switch (action) {
749 case CPU_DOWN_PREPARE:
750 case CPU_DOWN_PREPARE_FROZEN:
751 disable_runtime(cpu_rq(cpu));
752 return NOTIFY_OK;
753
754 case CPU_DOWN_FAILED:
755 case CPU_DOWN_FAILED_FROZEN:
756 case CPU_ONLINE:
757 case CPU_ONLINE_FROZEN:
758 enable_runtime(cpu_rq(cpu));
759 return NOTIFY_OK;
760
761 default:
762 return NOTIFY_DONE;
763 }
764}
765
766static int balance_runtime(struct rt_rq *rt_rq) 701static int balance_runtime(struct rt_rq *rt_rq)
767{ 702{
768 int more = 0; 703 int more = 0;
@@ -926,7 +861,7 @@ static void update_curr_rt(struct rq *rq)
926 if (curr->sched_class != &rt_sched_class) 861 if (curr->sched_class != &rt_sched_class)
927 return; 862 return;
928 863
929 delta_exec = rq->clock_task - curr->se.exec_start; 864 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
930 if (unlikely((s64)delta_exec <= 0)) 865 if (unlikely((s64)delta_exec <= 0))
931 return; 866 return;
932 867
@@ -936,7 +871,7 @@ static void update_curr_rt(struct rq *rq)
936 curr->se.sum_exec_runtime += delta_exec; 871 curr->se.sum_exec_runtime += delta_exec;
937 account_group_exec_runtime(curr, delta_exec); 872 account_group_exec_runtime(curr, delta_exec);
938 873
939 curr->se.exec_start = rq->clock_task; 874 curr->se.exec_start = rq_clock_task(rq);
940 cpuacct_charge(curr, delta_exec); 875 cpuacct_charge(curr, delta_exec);
941 876
942 sched_rt_avg_update(rq, delta_exec); 877 sched_rt_avg_update(rq, delta_exec);
@@ -1106,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1106 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 1041 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
1107 return; 1042 return;
1108 1043
1109 if (!rt_rq->rt_nr_running)
1110 list_add_leaf_rt_rq(rt_rq);
1111
1112 if (head) 1044 if (head)
1113 list_add(&rt_se->run_list, queue); 1045 list_add(&rt_se->run_list, queue);
1114 else 1046 else
@@ -1128,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1128 __clear_bit(rt_se_prio(rt_se), array->bitmap); 1060 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1129 1061
1130 dec_rt_tasks(rt_se, rt_rq); 1062 dec_rt_tasks(rt_se, rt_rq);
1131 if (!rt_rq->rt_nr_running)
1132 list_del_leaf_rt_rq(rt_rq);
1133} 1063}
1134 1064
1135/* 1065/*
@@ -1385,7 +1315,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1385 } while (rt_rq); 1315 } while (rt_rq);
1386 1316
1387 p = rt_task_of(rt_se); 1317 p = rt_task_of(rt_se);
1388 p->se.exec_start = rq->clock_task; 1318 p->se.exec_start = rq_clock_task(rq);
1389 1319
1390 return p; 1320 return p;
1391} 1321}
@@ -1434,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1434 return 0; 1364 return 0;
1435} 1365}
1436 1366
1437/* Return the second highest RT task, NULL otherwise */ 1367/*
1438static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) 1368 * Return the highest-priority pushable task of the rq that can run on
 1369 * the given cpu, or NULL if there is none
1370 */
1371static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1439{ 1372{
1440 struct task_struct *next = NULL; 1373 struct plist_head *head = &rq->rt.pushable_tasks;
1441 struct sched_rt_entity *rt_se; 1374 struct task_struct *p;
1442 struct rt_prio_array *array;
1443 struct rt_rq *rt_rq;
1444 int idx;
1445
1446 for_each_leaf_rt_rq(rt_rq, rq) {
1447 array = &rt_rq->active;
1448 idx = sched_find_first_bit(array->bitmap);
1449next_idx:
1450 if (idx >= MAX_RT_PRIO)
1451 continue;
1452 if (next && next->prio <= idx)
1453 continue;
1454 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1455 struct task_struct *p;
1456 1375
1457 if (!rt_entity_is_task(rt_se)) 1376 if (!has_pushable_tasks(rq))
1458 continue; 1377 return NULL;
1459 1378
1460 p = rt_task_of(rt_se); 1379 plist_for_each_entry(p, head, pushable_tasks) {
1461 if (pick_rt_task(rq, p, cpu)) { 1380 if (pick_rt_task(rq, p, cpu))
1462 next = p; 1381 return p;
1463 break;
1464 }
1465 }
1466 if (!next) {
1467 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1468 goto next_idx;
1469 }
1470 } 1382 }
1471 1383
1472 return next; 1384 return NULL;
1473} 1385}
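
/*
 * Editor's note on the plist walk above (a sketch, not part of the patch):
 * rt_rq->pushable_tasks is a priority-sorted list, so plist_for_each_entry()
 * visits tasks from the highest RT priority (lowest numeric prio) downwards.
 * The enqueue side keys each node on the task's prio, roughly:
 *
 *	plist_node_init(&p->pushable_tasks, p->prio);
 *	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 *
 * which is why the first task that passes pick_rt_task() is already the
 * best pullable candidate and the old per-priority bitmap scan can go.
 */
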
1474 1386
1475static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1387static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
@@ -1743,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq)
1743 double_lock_balance(this_rq, src_rq); 1655 double_lock_balance(this_rq, src_rq);
1744 1656
1745 /* 1657 /*
1746 * Are there still pullable RT tasks? 1658 * We can only pull a task that is pushable
 1659 * on its rq, nothing else.
1747 */ 1660 */
1748 if (src_rq->rt.rt_nr_running <= 1) 1661 p = pick_highest_pushable_task(src_rq, this_cpu);
1749 goto skip;
1750
1751 p = pick_next_highest_task_rt(src_rq, this_cpu);
1752 1662
1753 /* 1663 /*
1754 * Do we have an RT task that preempts 1664 * Do we have an RT task that preempts
@@ -2037,7 +1947,7 @@ static void set_curr_task_rt(struct rq *rq)
2037{ 1947{
2038 struct task_struct *p = rq->curr; 1948 struct task_struct *p = rq->curr;
2039 1949
2040 p->se.exec_start = rq->clock_task; 1950 p->se.exec_start = rq_clock_task(rq);
2041 1951
2042 /* The running task is never eligible for pushing */ 1952 /* The running task is never eligible for pushing */
2043 dequeue_pushable_task(rq, p); 1953 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce39224d6155..ef0a7b2439dd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -10,8 +10,16 @@
10#include "cpupri.h" 10#include "cpupri.h"
11#include "cpuacct.h" 11#include "cpuacct.h"
12 12
13struct rq;
14
13extern __read_mostly int scheduler_running; 15extern __read_mostly int scheduler_running;
14 16
17extern unsigned long calc_load_update;
18extern atomic_long_t calc_load_tasks;
19
20extern long calc_load_fold_active(struct rq *this_rq);
21extern void update_cpu_load_active(struct rq *this_rq);
22
15/* 23/*
16 * Convert user-nice values [ -20 ... 0 ... 19 ] 24 * Convert user-nice values [ -20 ... 0 ... 19 ]
17 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 25 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -140,10 +148,11 @@ struct task_group {
140 struct cfs_rq **cfs_rq; 148 struct cfs_rq **cfs_rq;
141 unsigned long shares; 149 unsigned long shares;
142 150
143 atomic_t load_weight; 151#ifdef CONFIG_SMP
144 atomic64_t load_avg; 152 atomic_long_t load_avg;
145 atomic_t runnable_avg; 153 atomic_t runnable_avg;
146#endif 154#endif
155#endif
147 156
148#ifdef CONFIG_RT_GROUP_SCHED 157#ifdef CONFIG_RT_GROUP_SCHED
149 struct sched_rt_entity **rt_se; 158 struct sched_rt_entity **rt_se;
@@ -261,26 +270,21 @@ struct cfs_rq {
261#endif 270#endif
262 271
263#ifdef CONFIG_SMP 272#ifdef CONFIG_SMP
264/*
265 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
266 * removed when useful for applications beyond shares distribution (e.g.
267 * load-balance).
268 */
269#ifdef CONFIG_FAIR_GROUP_SCHED
270 /* 273 /*
271 * CFS Load tracking 274 * CFS Load tracking
272 * Under CFS, load is tracked on a per-entity basis and aggregated up. 275 * Under CFS, load is tracked on a per-entity basis and aggregated up.
273 * This allows for the description of both thread and group usage (in 276 * This allows for the description of both thread and group usage (in
274 * the FAIR_GROUP_SCHED case). 277 * the FAIR_GROUP_SCHED case).
275 */ 278 */
276 u64 runnable_load_avg, blocked_load_avg; 279 unsigned long runnable_load_avg, blocked_load_avg;
277 atomic64_t decay_counter, removed_load; 280 atomic64_t decay_counter;
278 u64 last_decay; 281 u64 last_decay;
279#endif /* CONFIG_FAIR_GROUP_SCHED */ 282 atomic_long_t removed_load;
280/* These always depend on CONFIG_FAIR_GROUP_SCHED */ 283
281#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
285 /* Required to track per-cpu representation of a task_group */
282 u32 tg_runnable_contrib; 286 u32 tg_runnable_contrib;
283 u64 tg_load_contrib; 287 unsigned long tg_load_contrib;
284#endif /* CONFIG_FAIR_GROUP_SCHED */ 288#endif /* CONFIG_FAIR_GROUP_SCHED */
285 289
286 /* 290 /*
@@ -353,7 +357,6 @@ struct rt_rq {
353 unsigned long rt_nr_boosted; 357 unsigned long rt_nr_boosted;
354 358
355 struct rq *rq; 359 struct rq *rq;
356 struct list_head leaf_rt_rq_list;
357 struct task_group *tg; 360 struct task_group *tg;
358#endif 361#endif
359}; 362};
@@ -540,6 +543,16 @@ DECLARE_PER_CPU(struct rq, runqueues);
540#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 543#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
541#define raw_rq() (&__raw_get_cpu_var(runqueues)) 544#define raw_rq() (&__raw_get_cpu_var(runqueues))
542 545
546static inline u64 rq_clock(struct rq *rq)
547{
548 return rq->clock;
549}
550
551static inline u64 rq_clock_task(struct rq *rq)
552{
553 return rq->clock_task;
554}
555
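
/*
 * Editor's sketch (hypothetical helper, not part of the patch): the rest of
 * the series converts raw rq->clock / rq->clock_task reads to these
 * accessors. A typical consumer computes an execution delta as in the rt.c
 * and stop_task.c hunks below, relying on the caller to have refreshed the
 * clock (update_rq_clock()) under rq->lock.
 */
static inline u64 example_exec_delta(struct rq *rq, struct task_struct *curr)
{
	return rq_clock_task(rq) - curr->se.exec_start;
}
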
543#ifdef CONFIG_SMP 556#ifdef CONFIG_SMP
544 557
545#define rcu_dereference_check_sched_domain(p) \ 558#define rcu_dereference_check_sched_domain(p) \
@@ -884,24 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
884#define WF_FORK 0x02 /* child wakeup after fork */ 897#define WF_FORK 0x02 /* child wakeup after fork */
885#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 898#define WF_MIGRATED 0x4 /* internal use, task got migrated */
886 899
887static inline void update_load_add(struct load_weight *lw, unsigned long inc)
888{
889 lw->weight += inc;
890 lw->inv_weight = 0;
891}
892
893static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
894{
895 lw->weight -= dec;
896 lw->inv_weight = 0;
897}
898
899static inline void update_load_set(struct load_weight *lw, unsigned long w)
900{
901 lw->weight = w;
902 lw->inv_weight = 0;
903}
904
905/* 900/*
906 * To aid in avoiding the subversion of "niceness" due to uneven distribution 901 * To aid in avoiding the subversion of "niceness" due to uneven distribution
907 * of tasks with abnormal "nice" values across CPUs the contribution that 902 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1028,17 +1023,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
1028extern void trigger_load_balance(struct rq *rq, int cpu); 1023extern void trigger_load_balance(struct rq *rq, int cpu);
1029extern void idle_balance(int this_cpu, struct rq *this_rq); 1024extern void idle_balance(int this_cpu, struct rq *this_rq);
1030 1025
1031/*
1032 * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
1033 * becomes useful in lb
1034 */
1035#if defined(CONFIG_FAIR_GROUP_SCHED)
1036extern void idle_enter_fair(struct rq *this_rq); 1026extern void idle_enter_fair(struct rq *this_rq);
1037extern void idle_exit_fair(struct rq *this_rq); 1027extern void idle_exit_fair(struct rq *this_rq);
1038#else
1039static inline void idle_enter_fair(struct rq *this_rq) {}
1040static inline void idle_exit_fair(struct rq *this_rq) {}
1041#endif
1042 1028
1043#else /* CONFIG_SMP */ 1029#else /* CONFIG_SMP */
1044 1030
@@ -1051,7 +1037,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
1051extern void sysrq_sched_debug_show(void); 1037extern void sysrq_sched_debug_show(void);
1052extern void sched_init_granularity(void); 1038extern void sched_init_granularity(void);
1053extern void update_max_interval(void); 1039extern void update_max_interval(void);
1054extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
1055extern void init_sched_rt_class(void); 1040extern void init_sched_rt_class(void);
1056extern void init_sched_fair_class(void); 1041extern void init_sched_fair_class(void);
1057 1042
@@ -1063,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
1063 1048
1064extern void update_idle_cpu_load(struct rq *this_rq); 1049extern void update_idle_cpu_load(struct rq *this_rq);
1065 1050
1051extern void init_task_runnable_average(struct task_struct *p);
1052
1066#ifdef CONFIG_PARAVIRT 1053#ifdef CONFIG_PARAVIRT
1067static inline u64 steal_ticks(u64 steal) 1054static inline u64 steal_ticks(u64 steal)
1068{ 1055{
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 2ef90a51ec5e..5aef494fc8b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct task_struct *t)
63{ 63{
64 unsigned long long now = task_rq(t)->clock, delta = 0; 64 unsigned long long now = rq_clock(task_rq(t)), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct task_struct *t)
81{ 81{
82 unsigned long long now = task_rq(t)->clock, delta = 0; 82 unsigned long long now = rq_clock(task_rq(t)), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -100,7 +100,7 @@ static inline void sched_info_queued(struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = task_rq(t)->clock; 103 t->sched_info.last_queued = rq_clock(task_rq(t));
104} 104}
105 105
106/* 106/*
@@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t)
112 */ 112 */
113static inline void sched_info_depart(struct task_struct *t) 113static inline void sched_info_depart(struct task_struct *t)
114{ 114{
115 unsigned long long delta = task_rq(t)->clock - 115 unsigned long long delta = rq_clock(task_rq(t)) -
116 t->sched_info.last_arrival; 116 t->sched_info.last_arrival;
117 117
118 rq_sched_info_depart(task_rq(t), delta); 118 rq_sched_info_depart(task_rq(t), delta);
@@ -162,6 +162,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
162 */ 162 */
163 163
164/** 164/**
165 * cputimer_running - return true if cputimer is running
166 *
167 * @tsk: Pointer to target task.
168 */
169static inline bool cputimer_running(struct task_struct *tsk)
170
171{
172 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
173
174 if (!cputimer->running)
175 return false;
176
177 /*
178 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
179 * in __exit_signal(), we won't account any further cputime consumed
180 * by that task to the signal struct, even though the task can still be
181 * ticking after __exit_signal().
182 *
183 * In order to keep a consistent behaviour between thread group cputime
184 * and thread group cputimer accounting, let's also ignore the cputime
185 * elapsing after __exit_signal() in any running thread group timer.
186 *
187 * This makes sure that POSIX CPU clocks and timers are synchronized, so
188 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
189 * clock delta is behind the expiring timer value.
190 */
191 if (unlikely(!tsk->sighand))
192 return false;
193
194 return true;
195}
196
197/**
165 * account_group_user_time - Maintain utime for a thread group. 198 * account_group_user_time - Maintain utime for a thread group.
166 * 199 *
167 * @tsk: Pointer to task structure. 200 * @tsk: Pointer to task structure.
@@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
176{ 209{
177 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 210 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
178 211
179 if (!cputimer->running) 212 if (!cputimer_running(tsk))
180 return; 213 return;
181 214
182 raw_spin_lock(&cputimer->lock); 215 raw_spin_lock(&cputimer->lock);
@@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
199{ 232{
200 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 233 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
201 234
202 if (!cputimer->running) 235 if (!cputimer_running(tsk))
203 return; 236 return;
204 237
205 raw_spin_lock(&cputimer->lock); 238 raw_spin_lock(&cputimer->lock);
@@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
222{ 255{
223 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 256 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
224 257
225 if (!cputimer->running) 258 if (!cputimer_running(tsk))
226 return; 259 return;
227 260
228 raw_spin_lock(&cputimer->lock); 261 raw_spin_lock(&cputimer->lock);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index da5eb5bed84a..e08fbeeb54b9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
28 struct task_struct *stop = rq->stop; 28 struct task_struct *stop = rq->stop;
29 29
30 if (stop && stop->on_rq) { 30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task; 31 stop->se.exec_start = rq_clock_task(rq);
32 return stop; 32 return stop;
33 } 33 }
34 34
@@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
57 struct task_struct *curr = rq->curr; 57 struct task_struct *curr = rq->curr;
58 u64 delta_exec; 58 u64 delta_exec;
59 59
60 delta_exec = rq->clock_task - curr->se.exec_start; 60 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0)) 61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0; 62 delta_exec = 0;
63 63
@@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
67 curr->se.sum_exec_runtime += delta_exec; 67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec); 68 account_group_exec_runtime(curr, delta_exec);
69 69
70 curr->se.exec_start = rq->clock_task; 70 curr->se.exec_start = rq_clock_task(rq);
71 cpuacct_charge(curr, delta_exec); 71 cpuacct_charge(curr, delta_exec);
72} 72}
73 73
@@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq)
79{ 79{
80 struct task_struct *stop = rq->stop; 80 struct task_struct *stop = rq->stop;
81 81
82 stop->se.exec_start = rq->clock_task; 82 stop->se.exec_start = rq_clock_task(rq);
83} 83}
84 84
85static void switched_to_stop(struct rq *rq, struct task_struct *p) 85static void switched_to_stop(struct rq *rq, struct task_struct *p)