Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  765
1 file changed, 124 insertions, 641 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8b335016c52..05c39f030314 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void)
370#ifdef CONFIG_SCHED_HRTICK 370#ifdef CONFIG_SCHED_HRTICK
371/* 371/*
372 * Use HR-timers to deliver accurate preemption points. 372 * Use HR-timers to deliver accurate preemption points.
373 *
374 * Its all a bit involved since we cannot program an hrt while holding the
375 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
376 * reschedule event.
377 *
378 * When we get rescheduled we reprogram the hrtick_timer outside of the
379 * rq->lock.
380 */ 373 */
381 374
382static void hrtick_clear(struct rq *rq) 375static void hrtick_clear(struct rq *rq)
@@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
404} 397}
405 398
406#ifdef CONFIG_SMP 399#ifdef CONFIG_SMP
400
401static int __hrtick_restart(struct rq *rq)
402{
403 struct hrtimer *timer = &rq->hrtick_timer;
404 ktime_t time = hrtimer_get_softexpires(timer);
405
406 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
407}
408
407/* 409/*
408 * called from hardirq (IPI) context 410 * called from hardirq (IPI) context
409 */ 411 */
@@ -412,7 +414,7 @@ static void __hrtick_start(void *arg)
412 struct rq *rq = arg; 414 struct rq *rq = arg;
413 415
414 raw_spin_lock(&rq->lock); 416 raw_spin_lock(&rq->lock);
415 hrtimer_restart(&rq->hrtick_timer); 417 __hrtick_restart(rq);
416 rq->hrtick_csd_pending = 0; 418 rq->hrtick_csd_pending = 0;
417 raw_spin_unlock(&rq->lock); 419 raw_spin_unlock(&rq->lock);
418} 420}
@@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay)
430 hrtimer_set_expires(timer, time); 432 hrtimer_set_expires(timer, time);
431 433
432 if (rq == this_rq()) { 434 if (rq == this_rq()) {
433 hrtimer_restart(timer); 435 __hrtick_restart(rq);
434 } else if (!rq->hrtick_csd_pending) { 436 } else if (!rq->hrtick_csd_pending) {
435 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 437 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
436 rq->hrtick_csd_pending = 1; 438 rq->hrtick_csd_pending = 1;
@@ -679,7 +681,7 @@ void sched_avg_update(struct rq *rq)
679{ 681{
680 s64 period = sched_avg_period(); 682 s64 period = sched_avg_period();
681 683
682 while ((s64)(rq->clock - rq->age_stamp) > period) { 684 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
683 /* 685 /*
684 * Inline assembly required to prevent the compiler 686 * Inline assembly required to prevent the compiler
685 * optimising this loop into a divmod call. 687 * optimising this loop into a divmod call.
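This hunk (and several below) converts open-coded reads of rq->clock and rq->clock_task to the rq_clock()/rq_clock_task() accessors. As a hedged sketch of what those accessors presumably look like in kernel/sched/sched.h (the helper names come from the hunks; the bodies are assumed):

/* Sketch only: thin wrappers around the per-rq clocks. */
static inline u64 rq_clock(struct rq *rq)
{
        return rq->clock;               /* raw per-rq clock, ns */
}

static inline u64 rq_clock_task(struct rq *rq)
{
        return rq->clock_task;          /* clock with IRQ/steal time removed */
}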
@@ -931,6 +933,8 @@ static int effective_prio(struct task_struct *p)
931/** 933/**
932 * task_curr - is this task currently executing on a CPU? 934 * task_curr - is this task currently executing on a CPU?
933 * @p: the task in question. 935 * @p: the task in question.
936 *
937 * Return: 1 if the task is currently executing. 0 otherwise.
934 */ 938 */
935inline int task_curr(const struct task_struct *p) 939inline int task_curr(const struct task_struct *p)
936{ 940{
@@ -1340,7 +1344,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1340 p->sched_class->task_woken(rq, p); 1344 p->sched_class->task_woken(rq, p);
1341 1345
1342 if (rq->idle_stamp) { 1346 if (rq->idle_stamp) {
1343 u64 delta = rq->clock - rq->idle_stamp; 1347 u64 delta = rq_clock(rq) - rq->idle_stamp;
1344 u64 max = 2*sysctl_sched_migration_cost; 1348 u64 max = 2*sysctl_sched_migration_cost;
1345 1349
1346 if (delta > max) 1350 if (delta > max)
@@ -1377,6 +1381,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1377 1381
1378 rq = __task_rq_lock(p); 1382 rq = __task_rq_lock(p);
1379 if (p->on_rq) { 1383 if (p->on_rq) {
1384 /* check_preempt_curr() may use rq clock */
1385 update_rq_clock(rq);
1380 ttwu_do_wakeup(rq, p, wake_flags); 1386 ttwu_do_wakeup(rq, p, wake_flags);
1381 ret = 1; 1387 ret = 1;
1382 } 1388 }
@@ -1478,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
1478 * the simpler "current->state = TASK_RUNNING" to mark yourself 1484 * the simpler "current->state = TASK_RUNNING" to mark yourself
1479 * runnable without the overhead of this. 1485 * runnable without the overhead of this.
1480 * 1486 *
1481 * Returns %true if @p was woken up, %false if it was already running 1487 * Return: %true if @p was woken up, %false if it was already running.
1482 * or @state didn't match @p's state. 1488 * or @state didn't match @p's state.
1483 */ 1489 */
1484static int 1490static int
@@ -1487,7 +1493,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1487 unsigned long flags; 1493 unsigned long flags;
1488 int cpu, success = 0; 1494 int cpu, success = 0;
1489 1495
1490 smp_wmb(); 1496 /*
1497 * If we are going to wake up a thread waiting for CONDITION we
1498 * need to ensure that CONDITION=1 done by the caller can not be
1499 * reordered with p->state check below. This pairs with mb() in
1500 * set_current_state() the waiting thread does.
1501 */
1502 smp_mb__before_spinlock();
1491 raw_spin_lock_irqsave(&p->pi_lock, flags); 1503 raw_spin_lock_irqsave(&p->pi_lock, flags);
1492 if (!(p->state & state)) 1504 if (!(p->state & state))
1493 goto out; 1505 goto out;
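The new comment above describes the classic sleep/wakeup pairing that the barrier protects. A minimal illustrative sketch of the two sides (CONDITION and waiter_task are placeholder names, not from this patch):

/* Waiter side: set_current_state() implies a full barrier, so the
 * condition test cannot be reordered before the state change. */
for (;;) {
        set_current_state(TASK_INTERRUPTIBLE);
        if (CONDITION)
                break;
        schedule();
}
__set_current_state(TASK_RUNNING);

/* Waker side: the store to CONDITION must not be reordered past the
 * p->state check inside try_to_wake_up(); smp_mb__before_spinlock()
 * above provides that ordering on the wakeup path. */
CONDITION = 1;
wake_up_process(waiter_task);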
@@ -1573,8 +1585,9 @@ out:
1573 * @p: The process to be woken up. 1585 * @p: The process to be woken up.
1574 * 1586 *
1575 * Attempt to wake up the nominated process and move it to the set of runnable 1587 * Attempt to wake up the nominated process and move it to the set of runnable
1576 * processes. Returns 1 if the process was woken up, 0 if it was already 1588 * processes.
1577 * running. 1589 *
1590 * Return: 1 if the process was woken up, 0 if it was already running.
1578 * 1591 *
1579 * It may be assumed that this function implies a write memory barrier before 1592 * It may be assumed that this function implies a write memory barrier before
1580 * changing the task state if and only if any tasks are woken up. 1593 * changing the task state if and only if any tasks are woken up.
@@ -1609,15 +1622,6 @@ static void __sched_fork(struct task_struct *p)
1609 p->se.vruntime = 0; 1622 p->se.vruntime = 0;
1610 INIT_LIST_HEAD(&p->se.group_node); 1623 INIT_LIST_HEAD(&p->se.group_node);
1611 1624
1612/*
1613 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
1614 * removed when useful for applications beyond shares distribution (e.g.
1615 * load-balance).
1616 */
1617#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1618 p->se.avg.runnable_avg_period = 0;
1619 p->se.avg.runnable_avg_sum = 0;
1620#endif
1621#ifdef CONFIG_SCHEDSTATS 1625#ifdef CONFIG_SCHEDSTATS
1622 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1626 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1623#endif 1627#endif
@@ -1761,6 +1765,8 @@ void wake_up_new_task(struct task_struct *p)
1761 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1765 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1762#endif 1766#endif
1763 1767
1768 /* Initialize new task's runnable average */
1769 init_task_runnable_average(p);
1764 rq = __task_rq_lock(p); 1770 rq = __task_rq_lock(p);
1765 activate_task(rq, p, 0); 1771 activate_task(rq, p, 0);
1766 p->on_rq = 1; 1772 p->on_rq = 1;
@@ -2069,575 +2075,6 @@ unsigned long nr_iowait_cpu(int cpu)
2069 return atomic_read(&this->nr_iowait); 2075 return atomic_read(&this->nr_iowait);
2070} 2076}
2071 2077
2072unsigned long this_cpu_load(void)
2073{
2074 struct rq *this = this_rq();
2075 return this->cpu_load[0];
2076}
2077
2078
2079/*
2080 * Global load-average calculations
2081 *
2082 * We take a distributed and async approach to calculating the global load-avg
2083 * in order to minimize overhead.
2084 *
2085 * The global load average is an exponentially decaying average of nr_running +
2086 * nr_uninterruptible.
2087 *
2088 * Once every LOAD_FREQ:
2089 *
2090 * nr_active = 0;
2091 * for_each_possible_cpu(cpu)
2092 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2093 *
2094 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2095 *
2096 * Due to a number of reasons the above turns in the mess below:
2097 *
2098 * - for_each_possible_cpu() is prohibitively expensive on machines with
2099 * serious number of cpus, therefore we need to take a distributed approach
2100 * to calculating nr_active.
2101 *
2102 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2103 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2104 *
2105 * So assuming nr_active := 0 when we start out -- true per definition, we
2106 * can simply take per-cpu deltas and fold those into a global accumulate
2107 * to obtain the same result. See calc_load_fold_active().
2108 *
2109 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2110 * across the machine, we assume 10 ticks is sufficient time for every
2111 * cpu to have completed this task.
2112 *
2113 * This places an upper-bound on the IRQ-off latency of the machine. Then
2114 * again, being late doesn't loose the delta, just wrecks the sample.
2115 *
2116 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2117 * this would add another cross-cpu cacheline miss and atomic operation
2118 * to the wakeup path. Instead we increment on whatever cpu the task ran
2119 * when it went into uninterruptible state and decrement on whatever cpu
2120 * did the wakeup. This means that only the sum of nr_uninterruptible over
2121 * all cpus yields the correct result.
2122 *
2123 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2124 */
2125
2126/* Variables and functions for calc_load */
2127static atomic_long_t calc_load_tasks;
2128static unsigned long calc_load_update;
2129unsigned long avenrun[3];
2130EXPORT_SYMBOL(avenrun); /* should be removed */
2131
2132/**
2133 * get_avenrun - get the load average array
2134 * @loads: pointer to dest load array
2135 * @offset: offset to add
2136 * @shift: shift count to shift the result left
2137 *
2138 * These values are estimates at best, so no need for locking.
2139 */
2140void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2141{
2142 loads[0] = (avenrun[0] + offset) << shift;
2143 loads[1] = (avenrun[1] + offset) << shift;
2144 loads[2] = (avenrun[2] + offset) << shift;
2145}
2146
2147static long calc_load_fold_active(struct rq *this_rq)
2148{
2149 long nr_active, delta = 0;
2150
2151 nr_active = this_rq->nr_running;
2152 nr_active += (long) this_rq->nr_uninterruptible;
2153
2154 if (nr_active != this_rq->calc_load_active) {
2155 delta = nr_active - this_rq->calc_load_active;
2156 this_rq->calc_load_active = nr_active;
2157 }
2158
2159 return delta;
2160}
2161
2162/*
2163 * a1 = a0 * e + a * (1 - e)
2164 */
2165static unsigned long
2166calc_load(unsigned long load, unsigned long exp, unsigned long active)
2167{
2168 load *= exp;
2169 load += active * (FIXED_1 - exp);
2170 load += 1UL << (FSHIFT - 1);
2171 return load >> FSHIFT;
2172}
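calc_load() above is plain fixed-point arithmetic. A standalone, userspace-runnable illustration of repeated 1-minute-average updates, assuming the usual kernel constants (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884):

#include <stdio.h>

#define FSHIFT  11                      /* bits of precision (assumed, as in the kernel) */
#define FIXED_1 (1 << FSHIFT)           /* 1.0 in fixed point */
#define EXP_1   1884                    /* ~ 1/exp(5s/1min) in fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);    /* round to nearest */
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun0 = 0;
        unsigned long active = 3 * FIXED_1;     /* 3 runnable+uninterruptible tasks */
        int i;

        /* Each iteration models one LOAD_FREQ (~5s) sample. */
        for (i = 0; i < 12; i++)
                avenrun0 = calc_load(avenrun0, EXP_1, active);

        /* After ~1 minute the average has covered about 1 - 1/e of the distance to 3. */
        printf("load1 ~ %lu.%02lu\n", avenrun0 >> FSHIFT,
               ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
}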
2173
2174#ifdef CONFIG_NO_HZ_COMMON
2175/*
2176 * Handle NO_HZ for the global load-average.
2177 *
2178 * Since the above described distributed algorithm to compute the global
2179 * load-average relies on per-cpu sampling from the tick, it is affected by
2180 * NO_HZ.
2181 *
2182 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2183 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2184 * when we read the global state.
2185 *
2186 * Obviously reality has to ruin such a delightfully simple scheme:
2187 *
2188 * - When we go NO_HZ idle during the window, we can negate our sample
2189 * contribution, causing under-accounting.
2190 *
2191 * We avoid this by keeping two idle-delta counters and flipping them
2192 * when the window starts, thus separating old and new NO_HZ load.
2193 *
2194 * The only trick is the slight shift in index flip for read vs write.
2195 *
2196 * 0s 5s 10s 15s
2197 * +10 +10 +10 +10
2198 * |-|-----------|-|-----------|-|-----------|-|
2199 * r:0 0 1 1 0 0 1 1 0
2200 * w:0 1 1 0 0 1 1 0 0
2201 *
2202 * This ensures we'll fold the old idle contribution in this window while
2203 * accumlating the new one.
2204 *
2205 * - When we wake up from NO_HZ idle during the window, we push up our
2206 * contribution, since we effectively move our sample point to a known
2207 * busy state.
2208 *
2209 * This is solved by pushing the window forward, and thus skipping the
2210 * sample, for this cpu (effectively using the idle-delta for this cpu which
2211 * was in effect at the time the window opened). This also solves the issue
2212 * of having to deal with a cpu having been in NOHZ idle for multiple
2213 * LOAD_FREQ intervals.
2214 *
2215 * When making the ILB scale, we should try to pull this in as well.
2216 */
2217static atomic_long_t calc_load_idle[2];
2218static int calc_load_idx;
2219
2220static inline int calc_load_write_idx(void)
2221{
2222 int idx = calc_load_idx;
2223
2224 /*
2225 * See calc_global_nohz(), if we observe the new index, we also
2226 * need to observe the new update time.
2227 */
2228 smp_rmb();
2229
2230 /*
2231 * If the folding window started, make sure we start writing in the
2232 * next idle-delta.
2233 */
2234 if (!time_before(jiffies, calc_load_update))
2235 idx++;
2236
2237 return idx & 1;
2238}
2239
2240static inline int calc_load_read_idx(void)
2241{
2242 return calc_load_idx & 1;
2243}
2244
2245void calc_load_enter_idle(void)
2246{
2247 struct rq *this_rq = this_rq();
2248 long delta;
2249
2250 /*
2251 * We're going into NOHZ mode, if there's any pending delta, fold it
2252 * into the pending idle delta.
2253 */
2254 delta = calc_load_fold_active(this_rq);
2255 if (delta) {
2256 int idx = calc_load_write_idx();
2257 atomic_long_add(delta, &calc_load_idle[idx]);
2258 }
2259}
2260
2261void calc_load_exit_idle(void)
2262{
2263 struct rq *this_rq = this_rq();
2264
2265 /*
2266 * If we're still before the sample window, we're done.
2267 */
2268 if (time_before(jiffies, this_rq->calc_load_update))
2269 return;
2270
2271 /*
2272 * We woke inside or after the sample window, this means we're already
2273 * accounted through the nohz accounting, so skip the entire deal and
2274 * sync up for the next window.
2275 */
2276 this_rq->calc_load_update = calc_load_update;
2277 if (time_before(jiffies, this_rq->calc_load_update + 10))
2278 this_rq->calc_load_update += LOAD_FREQ;
2279}
2280
2281static long calc_load_fold_idle(void)
2282{
2283 int idx = calc_load_read_idx();
2284 long delta = 0;
2285
2286 if (atomic_long_read(&calc_load_idle[idx]))
2287 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2288
2289 return delta;
2290}
2291
2292/**
2293 * fixed_power_int - compute: x^n, in O(log n) time
2294 *
2295 * @x: base of the power
2296 * @frac_bits: fractional bits of @x
2297 * @n: power to raise @x to.
2298 *
2299 * By exploiting the relation between the definition of the natural power
2300 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
2301 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
2302 * (where: n_i \elem {0, 1}, the binary vector representing n),
2303 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
2304 * of course trivially computable in O(log_2 n), the length of our binary
2305 * vector.
2306 */
2307static unsigned long
2308fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2309{
2310 unsigned long result = 1UL << frac_bits;
2311
2312 if (n) for (;;) {
2313 if (n & 1) {
2314 result *= x;
2315 result += 1UL << (frac_bits - 1);
2316 result >>= frac_bits;
2317 }
2318 n >>= 1;
2319 if (!n)
2320 break;
2321 x *= x;
2322 x += 1UL << (frac_bits - 1);
2323 x >>= frac_bits;
2324 }
2325
2326 return result;
2327}
2328
2329/*
2330 * a1 = a0 * e + a * (1 - e)
2331 *
2332 * a2 = a1 * e + a * (1 - e)
2333 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2334 * = a0 * e^2 + a * (1 - e) * (1 + e)
2335 *
2336 * a3 = a2 * e + a * (1 - e)
2337 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2338 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2339 *
2340 * ...
2341 *
2342 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
2343 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2344 * = a0 * e^n + a * (1 - e^n)
2345 *
2346 * [1] application of the geometric series:
2347 *
2348 * n 1 - x^(n+1)
2349 * S_n := \Sum x^i = -------------
2350 * i=0 1 - x
2351 */
2352static unsigned long
2353calc_load_n(unsigned long load, unsigned long exp,
2354 unsigned long active, unsigned int n)
2355{
2356
2357 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2358}
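By the geometric-series identity derived above, folding n missed windows with calc_load_n() must match applying calc_load() n times. A quick userspace check, assuming calc_load(), fixed_power_int() and calc_load_n() are copied verbatim into the same test file as the earlier sketch:

static void check_calc_load_n(void)
{
        unsigned long a = FIXED_1;              /* start from load 1.0 */
        unsigned long b = FIXED_1;
        unsigned long active = 4 * FIXED_1;     /* 4 active tasks */
        unsigned int i, n = 7;                  /* 7 missed LOAD_FREQ windows */

        for (i = 0; i < n; i++)
                a = calc_load(a, EXP_1, active);
        b = calc_load_n(b, EXP_1, active, n);

        /* The two agree apart from a small accumulated rounding difference. */
        printf("iterated=%lu folded=%lu\n", a, b);
}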
2359
2360/*
2361 * NO_HZ can leave us missing all per-cpu ticks calling
2362 * calc_load_account_active(), but since an idle CPU folds its delta into
2363 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
2364 * in the pending idle delta if our idle period crossed a load cycle boundary.
2365 *
2366 * Once we've updated the global active value, we need to apply the exponential
2367 * weights adjusted to the number of cycles missed.
2368 */
2369static void calc_global_nohz(void)
2370{
2371 long delta, active, n;
2372
2373 if (!time_before(jiffies, calc_load_update + 10)) {
2374 /*
2375 * Catch-up, fold however many we are behind still
2376 */
2377 delta = jiffies - calc_load_update - 10;
2378 n = 1 + (delta / LOAD_FREQ);
2379
2380 active = atomic_long_read(&calc_load_tasks);
2381 active = active > 0 ? active * FIXED_1 : 0;
2382
2383 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2384 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2385 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2386
2387 calc_load_update += n * LOAD_FREQ;
2388 }
2389
2390 /*
2391 * Flip the idle index...
2392 *
2393 * Make sure we first write the new time then flip the index, so that
2394 * calc_load_write_idx() will see the new time when it reads the new
2395 * index, this avoids a double flip messing things up.
2396 */
2397 smp_wmb();
2398 calc_load_idx++;
2399}
2400#else /* !CONFIG_NO_HZ_COMMON */
2401
2402static inline long calc_load_fold_idle(void) { return 0; }
2403static inline void calc_global_nohz(void) { }
2404
2405#endif /* CONFIG_NO_HZ_COMMON */
2406
2407/*
2408 * calc_load - update the avenrun load estimates 10 ticks after the
2409 * CPUs have updated calc_load_tasks.
2410 */
2411void calc_global_load(unsigned long ticks)
2412{
2413 long active, delta;
2414
2415 if (time_before(jiffies, calc_load_update + 10))
2416 return;
2417
2418 /*
2419 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2420 */
2421 delta = calc_load_fold_idle();
2422 if (delta)
2423 atomic_long_add(delta, &calc_load_tasks);
2424
2425 active = atomic_long_read(&calc_load_tasks);
2426 active = active > 0 ? active * FIXED_1 : 0;
2427
2428 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2429 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2430 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2431
2432 calc_load_update += LOAD_FREQ;
2433
2434 /*
2435 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2436 */
2437 calc_global_nohz();
2438}
2439
2440/*
2441 * Called from update_cpu_load() to periodically update this CPU's
2442 * active count.
2443 */
2444static void calc_load_account_active(struct rq *this_rq)
2445{
2446 long delta;
2447
2448 if (time_before(jiffies, this_rq->calc_load_update))
2449 return;
2450
2451 delta = calc_load_fold_active(this_rq);
2452 if (delta)
2453 atomic_long_add(delta, &calc_load_tasks);
2454
2455 this_rq->calc_load_update += LOAD_FREQ;
2456}
2457
2458/*
2459 * End of global load-average stuff
2460 */
2461
2462/*
2463 * The exact cpuload at various idx values, calculated at every tick would be
2464 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2465 *
2466 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
2467 * on nth tick when cpu may be busy, then we have:
2468 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2469 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2470 *
2471 * decay_load_missed() below does efficient calculation of
2472 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2473 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2474 *
2475 * The calculation is approximated on a 128 point scale.
2476 * degrade_zero_ticks is the number of ticks after which load at any
2477 * particular idx is approximated to be zero.
2478 * degrade_factor is a precomputed table, a row for each load idx.
2479 * Each column corresponds to degradation factor for a power of two ticks,
2480 * based on 128 point scale.
2481 * Example:
2482 * row 2, col 3 (=12) says that the degradation at load idx 2 after
2483 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2484 *
2485 * With this power of 2 load factors, we can degrade the load n times
2486 * by looking at 1 bits in n and doing as many mult/shift instead of
2487 * n mult/shifts needed by the exact degradation.
2488 */
2489#define DEGRADE_SHIFT 7
2490static const unsigned char
2491 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2492static const unsigned char
2493 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2494 {0, 0, 0, 0, 0, 0, 0, 0},
2495 {64, 32, 8, 0, 0, 0, 0, 0},
2496 {96, 72, 40, 12, 1, 0, 0},
2497 {112, 98, 75, 43, 15, 1, 0},
2498 {120, 112, 98, 76, 45, 16, 2} };
2499
2500/*
2501 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
2502 * would be when CPU is idle and so we just decay the old load without
2503 * adding any new load.
2504 */
2505static unsigned long
2506decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2507{
2508 int j = 0;
2509
2510 if (!missed_updates)
2511 return load;
2512
2513 if (missed_updates >= degrade_zero_ticks[idx])
2514 return 0;
2515
2516 if (idx == 1)
2517 return load >> missed_updates;
2518
2519 while (missed_updates) {
2520 if (missed_updates % 2)
2521 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2522
2523 missed_updates >>= 1;
2524 j++;
2525 }
2526 return load;
2527}
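The degrade_factor table above is just ((2^idx - 1) / 2^idx)^(2^j) on a 128-point scale, and decay_load_missed() combines table entries according to the set bits of missed_updates. A userspace illustration for idx = 2 (decay factor 3/4) and 8 missed ticks, using the table row shown above:

#include <stdio.h>

#define DEGRADE_SHIFT   7       /* 128-point scale, as above */

int main(void)
{
        /* Row for idx = 2: factors for 1, 2, 4, 8, ... missed ticks. */
        static const unsigned char degrade_idx2[] = {96, 72, 40, 12, 1, 0, 0, 0};
        unsigned long load = 1000;
        unsigned long missed = 8;       /* 8 = binary 1000, so only j = 3 is used */
        int j = 0;

        while (missed) {
                if (missed & 1)
                        load = (load * degrade_idx2[j]) >> DEGRADE_SHIFT;
                missed >>= 1;
                j++;
        }

        /* Exact factor: 1000 * (3/4)^8 ~= 100; the table gives 1000 * 12/128 = 93. */
        printf("decayed load = %lu\n", load);
        return 0;
}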
2528
2529/*
2530 * Update rq->cpu_load[] statistics. This function is usually called every
2531 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2532 * every tick. We fix it up based on jiffies.
2533 */
2534static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2535 unsigned long pending_updates)
2536{
2537 int i, scale;
2538
2539 this_rq->nr_load_updates++;
2540
2541 /* Update our load: */
2542 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2543 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2544 unsigned long old_load, new_load;
2545
2546 /* scale is effectively 1 << i now, and >> i divides by scale */
2547
2548 old_load = this_rq->cpu_load[i];
2549 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2550 new_load = this_load;
2551 /*
2552 * Round up the averaging division if load is increasing. This
2553 * prevents us from getting stuck on 9 if the load is 10, for
2554 * example.
2555 */
2556 if (new_load > old_load)
2557 new_load += scale - 1;
2558
2559 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2560 }
2561
2562 sched_avg_update(this_rq);
2563}
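The per-index update above is cpu_load[i] = (old * (2^i - 1) + new) / 2^i, and the round-up comment is easiest to see with numbers. A toy userspace illustration for idx 1 (half old, half new) with the values from the comment (9 and 10):

#include <stdio.h>

int main(void)
{
        unsigned long old_load = 9, this_load = 10;
        int i = 1, scale = 2;           /* cpu_load[1] */
        unsigned long plain, bumped;

        plain  = (old_load * (scale - 1) + this_load) >> i;            /* 9: stuck below 10 */
        /* The bump is applied because the load is rising (new > old). */
        bumped = (old_load * (scale - 1) + this_load + scale - 1) >> i; /* 10: converges */

        printf("without round-up: %lu, with round-up: %lu\n", plain, bumped);
        return 0;
}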
2564
2565#ifdef CONFIG_NO_HZ_COMMON
2566/*
2567 * There is no sane way to deal with nohz on smp when using jiffies because the
2568 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2569 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2570 *
2571 * Therefore we cannot use the delta approach from the regular tick since that
2572 * would seriously skew the load calculation. However we'll make do for those
2573 * updates happening while idle (nohz_idle_balance) or coming out of idle
2574 * (tick_nohz_idle_exit).
2575 *
2576 * This means we might still be one tick off for nohz periods.
2577 */
2578
2579/*
2580 * Called from nohz_idle_balance() to update the load ratings before doing the
2581 * idle balance.
2582 */
2583void update_idle_cpu_load(struct rq *this_rq)
2584{
2585 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2586 unsigned long load = this_rq->load.weight;
2587 unsigned long pending_updates;
2588
2589 /*
2590 * bail if there's load or we're actually up-to-date.
2591 */
2592 if (load || curr_jiffies == this_rq->last_load_update_tick)
2593 return;
2594
2595 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2596 this_rq->last_load_update_tick = curr_jiffies;
2597
2598 __update_cpu_load(this_rq, load, pending_updates);
2599}
2600
2601/*
2602 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2603 */
2604void update_cpu_load_nohz(void)
2605{
2606 struct rq *this_rq = this_rq();
2607 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2608 unsigned long pending_updates;
2609
2610 if (curr_jiffies == this_rq->last_load_update_tick)
2611 return;
2612
2613 raw_spin_lock(&this_rq->lock);
2614 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2615 if (pending_updates) {
2616 this_rq->last_load_update_tick = curr_jiffies;
2617 /*
2618 * We were idle, this means load 0, the current load might be
2619 * !0 due to remote wakeups and the sort.
2620 */
2621 __update_cpu_load(this_rq, 0, pending_updates);
2622 }
2623 raw_spin_unlock(&this_rq->lock);
2624}
2625#endif /* CONFIG_NO_HZ_COMMON */
2626
2627/*
2628 * Called from scheduler_tick()
2629 */
2630static void update_cpu_load_active(struct rq *this_rq)
2631{
2632 /*
2633 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2634 */
2635 this_rq->last_load_update_tick = jiffies;
2636 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2637
2638 calc_load_account_active(this_rq);
2639}
2640
2641#ifdef CONFIG_SMP 2078#ifdef CONFIG_SMP
2642 2079
2643/* 2080/*
@@ -2686,7 +2123,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2686 2123
2687 if (task_current(rq, p)) { 2124 if (task_current(rq, p)) {
2688 update_rq_clock(rq); 2125 update_rq_clock(rq);
2689 ns = rq->clock_task - p->se.exec_start; 2126 ns = rq_clock_task(rq) - p->se.exec_start;
2690 if ((s64)ns < 0) 2127 if ((s64)ns < 0)
2691 ns = 0; 2128 ns = 0;
2692 } 2129 }
@@ -2739,8 +2176,8 @@ void scheduler_tick(void)
2739 2176
2740 raw_spin_lock(&rq->lock); 2177 raw_spin_lock(&rq->lock);
2741 update_rq_clock(rq); 2178 update_rq_clock(rq);
2742 update_cpu_load_active(rq);
2743 curr->sched_class->task_tick(rq, curr, 0); 2179 curr->sched_class->task_tick(rq, curr, 0);
2180 update_cpu_load_active(rq);
2744 raw_spin_unlock(&rq->lock); 2181 raw_spin_unlock(&rq->lock);
2745 2182
2746 perf_event_task_tick(); 2183 perf_event_task_tick();
@@ -2763,6 +2200,8 @@ void scheduler_tick(void)
2763 * This makes sure that uptime, CFS vruntime, load 2200 * This makes sure that uptime, CFS vruntime, load
2764 * balancing, etc... continue to move forward, even 2201 * balancing, etc... continue to move forward, even
2765 * with a very low granularity. 2202 * with a very low granularity.
2203 *
2204 * Return: Maximum deferment in nanoseconds.
2766 */ 2205 */
2767u64 scheduler_tick_max_deferment(void) 2206u64 scheduler_tick_max_deferment(void)
2768{ 2207{
@@ -2966,6 +2405,12 @@ need_resched:
2966 if (sched_feat(HRTICK)) 2405 if (sched_feat(HRTICK))
2967 hrtick_clear(rq); 2406 hrtick_clear(rq);
2968 2407
2408 /*
2409 * Make sure that signal_pending_state()->signal_pending() below
2410 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2411 * done by the caller to avoid the race with signal_wake_up().
2412 */
2413 smp_mb__before_spinlock();
2969 raw_spin_lock_irq(&rq->lock); 2414 raw_spin_lock_irq(&rq->lock);
2970 2415
2971 switch_count = &prev->nivcsw; 2416 switch_count = &prev->nivcsw;
@@ -3368,8 +2813,8 @@ EXPORT_SYMBOL(wait_for_completion);
3368 * specified timeout to expire. The timeout is in jiffies. It is not 2813 * specified timeout to expire. The timeout is in jiffies. It is not
3369 * interruptible. 2814 * interruptible.
3370 * 2815 *
3371 * The return value is 0 if timed out, and positive (at least 1, or number of 2816 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
3372 * jiffies left till timeout) if completed. 2817 * till timeout) if completed.
3373 */ 2818 */
3374unsigned long __sched 2819unsigned long __sched
3375wait_for_completion_timeout(struct completion *x, unsigned long timeout) 2820wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -3401,8 +2846,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
3401 * specified timeout to expire. The timeout is in jiffies. It is not 2846 * specified timeout to expire. The timeout is in jiffies. It is not
3402 * interruptible. The caller is accounted as waiting for IO. 2847 * interruptible. The caller is accounted as waiting for IO.
3403 * 2848 *
3404 * The return value is 0 if timed out, and positive (at least 1, or number of 2849 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
3405 * jiffies left till timeout) if completed. 2850 * till timeout) if completed.
3406 */ 2851 */
3407unsigned long __sched 2852unsigned long __sched
3408wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) 2853wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
@@ -3418,7 +2863,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
3418 * This waits for completion of a specific task to be signaled. It is 2863 * This waits for completion of a specific task to be signaled. It is
3419 * interruptible. 2864 * interruptible.
3420 * 2865 *
3421 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 2866 * Return: -ERESTARTSYS if interrupted, 0 if completed.
3422 */ 2867 */
3423int __sched wait_for_completion_interruptible(struct completion *x) 2868int __sched wait_for_completion_interruptible(struct completion *x)
3424{ 2869{
@@ -3437,8 +2882,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
3437 * This waits for either a completion of a specific task to be signaled or for a 2882 * This waits for either a completion of a specific task to be signaled or for a
3438 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 2883 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3439 * 2884 *
3440 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 2885 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
3441 * positive (at least 1, or number of jiffies left till timeout) if completed. 2886 * or number of jiffies left till timeout) if completed.
3442 */ 2887 */
3443long __sched 2888long __sched
3444wait_for_completion_interruptible_timeout(struct completion *x, 2889wait_for_completion_interruptible_timeout(struct completion *x,
@@ -3455,7 +2900,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3455 * This waits to be signaled for completion of a specific task. It can be 2900 * This waits to be signaled for completion of a specific task. It can be
3456 * interrupted by a kill signal. 2901 * interrupted by a kill signal.
3457 * 2902 *
3458 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 2903 * Return: -ERESTARTSYS if interrupted, 0 if completed.
3459 */ 2904 */
3460int __sched wait_for_completion_killable(struct completion *x) 2905int __sched wait_for_completion_killable(struct completion *x)
3461{ 2906{
@@ -3475,8 +2920,8 @@ EXPORT_SYMBOL(wait_for_completion_killable);
3475 * signaled or for a specified timeout to expire. It can be 2920 * signaled or for a specified timeout to expire. It can be
3476 * interrupted by a kill signal. The timeout is in jiffies. 2921 * interrupted by a kill signal. The timeout is in jiffies.
3477 * 2922 *
3478 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 2923 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
3479 * positive (at least 1, or number of jiffies left till timeout) if completed. 2924 * or number of jiffies left till timeout) if completed.
3480 */ 2925 */
3481long __sched 2926long __sched
3482wait_for_completion_killable_timeout(struct completion *x, 2927wait_for_completion_killable_timeout(struct completion *x,
@@ -3490,7 +2935,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3490 * try_wait_for_completion - try to decrement a completion without blocking 2935 * try_wait_for_completion - try to decrement a completion without blocking
3491 * @x: completion structure 2936 * @x: completion structure
3492 * 2937 *
3493 * Returns: 0 if a decrement cannot be done without blocking 2938 * Return: 0 if a decrement cannot be done without blocking
3494 * 1 if a decrement succeeded. 2939 * 1 if a decrement succeeded.
3495 * 2940 *
3496 * If a completion is being used as a counting completion, 2941 * If a completion is being used as a counting completion,
@@ -3517,7 +2962,7 @@ EXPORT_SYMBOL(try_wait_for_completion);
3517 * completion_done - Test to see if a completion has any waiters 2962 * completion_done - Test to see if a completion has any waiters
3518 * @x: completion structure 2963 * @x: completion structure
3519 * 2964 *
3520 * Returns: 0 if there are waiters (wait_for_completion() in progress) 2965 * Return: 0 if there are waiters (wait_for_completion() in progress)
3521 * 1 if there are no waiters. 2966 * 1 if there are no waiters.
3522 * 2967 *
3523 */ 2968 */
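The kerneldoc changes above only reword the Return: sections; the completion API itself is unchanged. For reference, an uncompiled sketch of the pattern these helpers serve (my_work_done, my_producer and my_consumer are illustrative names):

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static DECLARE_COMPLETION(my_work_done);        /* illustrative */

static void my_producer(void)
{
        /* ... produce the result ... */
        complete(&my_work_done);                /* wake one waiter */
}

static int my_consumer(void)
{
        /* Sleeps until complete() or one second passes; per the Return:
         * text above, 0 means timeout, >0 is the jiffies remaining. */
        unsigned long left = wait_for_completion_timeout(&my_work_done, HZ);

        return left ? 0 : -ETIMEDOUT;
}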
@@ -3754,7 +3199,7 @@ SYSCALL_DEFINE1(nice, int, increment)
3754 * task_prio - return the priority value of a given task. 3199 * task_prio - return the priority value of a given task.
3755 * @p: the task in question. 3200 * @p: the task in question.
3756 * 3201 *
3757 * This is the priority value as seen by users in /proc. 3202 * Return: The priority value as seen by users in /proc.
3758 * RT tasks are offset by -200. Normal tasks are centered 3203 * RT tasks are offset by -200. Normal tasks are centered
3759 * around 0, value goes from -16 to +15. 3204 * around 0, value goes from -16 to +15.
3760 */ 3205 */
@@ -3766,6 +3211,8 @@ int task_prio(const struct task_struct *p)
3766/** 3211/**
3767 * task_nice - return the nice value of a given task. 3212 * task_nice - return the nice value of a given task.
3768 * @p: the task in question. 3213 * @p: the task in question.
3214 *
3215 * Return: The nice value [ -20 ... 0 ... 19 ].
3769 */ 3216 */
3770int task_nice(const struct task_struct *p) 3217int task_nice(const struct task_struct *p)
3771{ 3218{
@@ -3776,6 +3223,8 @@ EXPORT_SYMBOL(task_nice);
3776/** 3223/**
3777 * idle_cpu - is a given cpu idle currently? 3224 * idle_cpu - is a given cpu idle currently?
3778 * @cpu: the processor in question. 3225 * @cpu: the processor in question.
3226 *
3227 * Return: 1 if the CPU is currently idle. 0 otherwise.
3779 */ 3228 */
3780int idle_cpu(int cpu) 3229int idle_cpu(int cpu)
3781{ 3230{
@@ -3798,6 +3247,8 @@ int idle_cpu(int cpu)
3798/** 3247/**
3799 * idle_task - return the idle task for a given cpu. 3248 * idle_task - return the idle task for a given cpu.
3800 * @cpu: the processor in question. 3249 * @cpu: the processor in question.
3250 *
3251 * Return: The idle task for the cpu @cpu.
3801 */ 3252 */
3802struct task_struct *idle_task(int cpu) 3253struct task_struct *idle_task(int cpu)
3803{ 3254{
@@ -3807,6 +3258,8 @@ struct task_struct *idle_task(int cpu)
3807/** 3258/**
3808 * find_process_by_pid - find a process with a matching PID value. 3259 * find_process_by_pid - find a process with a matching PID value.
3809 * @pid: the pid in question. 3260 * @pid: the pid in question.
3261 *
3262 * The task of @pid, if found. %NULL otherwise.
3810 */ 3263 */
3811static struct task_struct *find_process_by_pid(pid_t pid) 3264static struct task_struct *find_process_by_pid(pid_t pid)
3812{ 3265{
@@ -4004,6 +3457,8 @@ recheck:
4004 * @policy: new policy. 3457 * @policy: new policy.
4005 * @param: structure containing the new RT priority. 3458 * @param: structure containing the new RT priority.
4006 * 3459 *
3460 * Return: 0 on success. An error code otherwise.
3461 *
4007 * NOTE that the task may be already dead. 3462 * NOTE that the task may be already dead.
4008 */ 3463 */
4009int sched_setscheduler(struct task_struct *p, int policy, 3464int sched_setscheduler(struct task_struct *p, int policy,
@@ -4023,6 +3478,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4023 * current context has permission. For example, this is needed in 3478 * current context has permission. For example, this is needed in
4024 * stop_machine(): we create temporary high priority worker threads, 3479 * stop_machine(): we create temporary high priority worker threads,
4025 * but our caller might not have that capability. 3480 * but our caller might not have that capability.
3481 *
3482 * Return: 0 on success. An error code otherwise.
4026 */ 3483 */
4027int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3484int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4028 const struct sched_param *param) 3485 const struct sched_param *param)
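As the comment above notes, sched_setscheduler_nocheck() exists for in-kernel callers such as stop_machine()'s worker threads. A hedged sketch of that usage pattern (the kthread parameter and helper name are illustrative):

#include <linux/sched.h>

static void make_kthread_fifo(struct task_struct *kthread)
{
        /* MAX_RT_PRIO - 1 is the highest FIFO priority. */
        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

        /* Skip the capability/RLIMIT checks: the policy decision is the
         * kernel's, not the (possibly unprivileged) current task's. */
        sched_setscheduler_nocheck(kthread, SCHED_FIFO, &param);
}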
@@ -4057,6 +3514,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4057 * @pid: the pid in question. 3514 * @pid: the pid in question.
4058 * @policy: new policy. 3515 * @policy: new policy.
4059 * @param: structure containing the new RT priority. 3516 * @param: structure containing the new RT priority.
3517 *
3518 * Return: 0 on success. An error code otherwise.
4060 */ 3519 */
4061SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3520SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4062 struct sched_param __user *, param) 3521 struct sched_param __user *, param)
@@ -4072,6 +3531,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4072 * sys_sched_setparam - set/change the RT priority of a thread 3531 * sys_sched_setparam - set/change the RT priority of a thread
4073 * @pid: the pid in question. 3532 * @pid: the pid in question.
4074 * @param: structure containing the new RT priority. 3533 * @param: structure containing the new RT priority.
3534 *
3535 * Return: 0 on success. An error code otherwise.
4075 */ 3536 */
4076SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3537SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4077{ 3538{
@@ -4081,6 +3542,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4081/** 3542/**
4082 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3543 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4083 * @pid: the pid in question. 3544 * @pid: the pid in question.
3545 *
3546 * Return: On success, the policy of the thread. Otherwise, a negative error
3547 * code.
4084 */ 3548 */
4085SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3549SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4086{ 3550{
@@ -4107,6 +3571,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4107 * sys_sched_getparam - get the RT priority of a thread 3571 * sys_sched_getparam - get the RT priority of a thread
4108 * @pid: the pid in question. 3572 * @pid: the pid in question.
4109 * @param: structure containing the RT priority. 3573 * @param: structure containing the RT priority.
3574 *
3575 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
3576 * code.
4110 */ 3577 */
4111SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3578SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4112{ 3579{
@@ -4231,6 +3698,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4231 * @pid: pid of the process 3698 * @pid: pid of the process
4232 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3699 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4233 * @user_mask_ptr: user-space pointer to the new cpu mask 3700 * @user_mask_ptr: user-space pointer to the new cpu mask
3701 *
3702 * Return: 0 on success. An error code otherwise.
4234 */ 3703 */
4235SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 3704SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4236 unsigned long __user *, user_mask_ptr) 3705 unsigned long __user *, user_mask_ptr)
@@ -4282,6 +3751,8 @@ out_unlock:
4282 * @pid: pid of the process 3751 * @pid: pid of the process
4283 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3752 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4284 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3753 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3754 *
3755 * Return: 0 on success. An error code otherwise.
4285 */ 3756 */
4286SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 3757SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4287 unsigned long __user *, user_mask_ptr) 3758 unsigned long __user *, user_mask_ptr)
@@ -4316,6 +3787,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4316 * 3787 *
4317 * This function yields the current CPU to other tasks. If there are no 3788 * This function yields the current CPU to other tasks. If there are no
4318 * other threads running on this CPU then this function will return. 3789 * other threads running on this CPU then this function will return.
3790 *
3791 * Return: 0.
4319 */ 3792 */
4320SYSCALL_DEFINE0(sched_yield) 3793SYSCALL_DEFINE0(sched_yield)
4321{ 3794{
@@ -4441,7 +3914,7 @@ EXPORT_SYMBOL(yield);
4441 * It's the caller's job to ensure that the target task struct 3914 * It's the caller's job to ensure that the target task struct
4442 * can't go away on us before we can do any checks. 3915 * can't go away on us before we can do any checks.
4443 * 3916 *
4444 * Returns: 3917 * Return:
4445 * true (>0) if we indeed boosted the target task. 3918 * true (>0) if we indeed boosted the target task.
4446 * false (0) if we failed to boost the target. 3919 * false (0) if we failed to boost the target.
4447 * -ESRCH if there's no task to yield to. 3920 * -ESRCH if there's no task to yield to.
@@ -4544,8 +4017,9 @@ long __sched io_schedule_timeout(long timeout)
4544 * sys_sched_get_priority_max - return maximum RT priority. 4017 * sys_sched_get_priority_max - return maximum RT priority.
4545 * @policy: scheduling class. 4018 * @policy: scheduling class.
4546 * 4019 *
4547 * this syscall returns the maximum rt_priority that can be used 4020 * Return: On success, this syscall returns the maximum
4548 * by a given scheduling class. 4021 * rt_priority that can be used by a given scheduling class.
4022 * On failure, a negative error code is returned.
4549 */ 4023 */
4550SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4024SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4551{ 4025{
@@ -4569,8 +4043,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4569 * sys_sched_get_priority_min - return minimum RT priority. 4043 * sys_sched_get_priority_min - return minimum RT priority.
4570 * @policy: scheduling class. 4044 * @policy: scheduling class.
4571 * 4045 *
4572 * this syscall returns the minimum rt_priority that can be used 4046 * Return: On success, this syscall returns the minimum
4573 * by a given scheduling class. 4047 * rt_priority that can be used by a given scheduling class.
4048 * On failure, a negative error code is returned.
4574 */ 4049 */
4575SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4050SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4576{ 4051{
@@ -4596,6 +4071,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4596 * 4071 *
4597 * this syscall writes the default timeslice value of a given process 4072 * this syscall writes the default timeslice value of a given process
4598 * into the user-space timespec buffer. A value of '0' means infinity. 4073 * into the user-space timespec buffer. A value of '0' means infinity.
4074 *
4075 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
4076 * an error code.
4599 */ 4077 */
4600SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4078SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4601 struct timespec __user *, interval) 4079 struct timespec __user *, interval)
@@ -4705,7 +4183,7 @@ void show_state_filter(unsigned long state_filter)
4705 debug_show_all_locks(); 4183 debug_show_all_locks();
4706} 4184}
4707 4185
4708void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4186void init_idle_bootup_task(struct task_struct *idle)
4709{ 4187{
4710 idle->sched_class = &idle_sched_class; 4188 idle->sched_class = &idle_sched_class;
4711} 4189}
@@ -4718,7 +4196,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4718 * NOTE: this function does not set the idle thread's NEED_RESCHED 4196 * NOTE: this function does not set the idle thread's NEED_RESCHED
4719 * flag, to make booting more robust. 4197 * flag, to make booting more robust.
4720 */ 4198 */
4721void __cpuinit init_idle(struct task_struct *idle, int cpu) 4199void init_idle(struct task_struct *idle, int cpu)
4722{ 4200{
4723 struct rq *rq = cpu_rq(cpu); 4201 struct rq *rq = cpu_rq(cpu);
4724 unsigned long flags; 4202 unsigned long flags;
@@ -4960,6 +4438,13 @@ static void migrate_tasks(unsigned int dead_cpu)
4960 */ 4438 */
4961 rq->stop = NULL; 4439 rq->stop = NULL;
4962 4440
4441 /*
4442 * put_prev_task() and pick_next_task() sched
4443 * class method both need to have an up-to-date
4444 * value of rq->clock[_task]
4445 */
4446 update_rq_clock(rq);
4447
4963 for ( ; ; ) { 4448 for ( ; ; ) {
4964 /* 4449 /*
4965 * There's this thread running, bail when that's the only 4450 * There's this thread running, bail when that's the only
@@ -5093,7 +4578,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5093 return table; 4578 return table;
5094} 4579}
5095 4580
5096static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4581static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5097{ 4582{
5098 struct ctl_table *entry, *table; 4583 struct ctl_table *entry, *table;
5099 struct sched_domain *sd; 4584 struct sched_domain *sd;
@@ -5195,7 +4680,7 @@ static void set_rq_offline(struct rq *rq)
5195 * migration_call - callback that gets triggered when a CPU is added. 4680 * migration_call - callback that gets triggered when a CPU is added.
5196 * Here we can start up the necessary migration thread for the new CPU. 4681 * Here we can start up the necessary migration thread for the new CPU.
5197 */ 4682 */
5198static int __cpuinit 4683static int
5199migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 4684migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5200{ 4685{
5201 int cpu = (long)hcpu; 4686 int cpu = (long)hcpu;
@@ -5249,12 +4734,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5249 * happens before everything else. This has to be lower priority than 4734 * happens before everything else. This has to be lower priority than
5250 * the notifier in the perf_event subsystem, though. 4735 * the notifier in the perf_event subsystem, though.
5251 */ 4736 */
5252static struct notifier_block __cpuinitdata migration_notifier = { 4737static struct notifier_block migration_notifier = {
5253 .notifier_call = migration_call, 4738 .notifier_call = migration_call,
5254 .priority = CPU_PRI_MIGRATION, 4739 .priority = CPU_PRI_MIGRATION,
5255}; 4740};
5256 4741
5257static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 4742static int sched_cpu_active(struct notifier_block *nfb,
5258 unsigned long action, void *hcpu) 4743 unsigned long action, void *hcpu)
5259{ 4744{
5260 switch (action & ~CPU_TASKS_FROZEN) { 4745 switch (action & ~CPU_TASKS_FROZEN) {
@@ -5267,7 +4752,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5267 } 4752 }
5268} 4753}
5269 4754
5270static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 4755static int sched_cpu_inactive(struct notifier_block *nfb,
5271 unsigned long action, void *hcpu) 4756 unsigned long action, void *hcpu)
5272{ 4757{
5273 switch (action & ~CPU_TASKS_FROZEN) { 4758 switch (action & ~CPU_TASKS_FROZEN) {
@@ -5907,7 +5392,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5907 get_group(cpu, sdd, &sd->groups); 5392 get_group(cpu, sdd, &sd->groups);
5908 atomic_inc(&sd->groups->ref); 5393 atomic_inc(&sd->groups->ref);
5909 5394
5910 if (cpu != cpumask_first(sched_domain_span(sd))) 5395 if (cpu != cpumask_first(span))
5911 return 0; 5396 return 0;
5912 5397
5913 lockdep_assert_held(&sched_domains_mutex); 5398 lockdep_assert_held(&sched_domains_mutex);
@@ -5917,12 +5402,12 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5917 5402
5918 for_each_cpu(i, span) { 5403 for_each_cpu(i, span) {
5919 struct sched_group *sg; 5404 struct sched_group *sg;
5920 int group = get_group(i, sdd, &sg); 5405 int group, j;
5921 int j;
5922 5406
5923 if (cpumask_test_cpu(i, covered)) 5407 if (cpumask_test_cpu(i, covered))
5924 continue; 5408 continue;
5925 5409
5410 group = get_group(i, sdd, &sg);
5926 cpumask_clear(sched_group_cpus(sg)); 5411 cpumask_clear(sched_group_cpus(sg));
5927 sg->sgp->power = 0; 5412 sg->sgp->power = 0;
5928 cpumask_setall(sched_group_mask(sg)); 5413 cpumask_setall(sched_group_mask(sg));
@@ -5960,7 +5445,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5960{ 5445{
5961 struct sched_group *sg = sd->groups; 5446 struct sched_group *sg = sd->groups;
5962 5447
5963 WARN_ON(!sd || !sg); 5448 WARN_ON(!sg);
5964 5449
5965 do { 5450 do {
5966 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5451 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
@@ -6125,6 +5610,9 @@ static struct sched_domain_topology_level default_topology[] = {
6125 5610
6126static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5611static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6127 5612
5613#define for_each_sd_topology(tl) \
5614 for (tl = sched_domain_topology; tl->init; tl++)
5615
6128#ifdef CONFIG_NUMA 5616#ifdef CONFIG_NUMA
6129 5617
6130static int sched_domains_numa_levels; 5618static int sched_domains_numa_levels;
@@ -6422,7 +5910,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6422 struct sched_domain_topology_level *tl; 5910 struct sched_domain_topology_level *tl;
6423 int j; 5911 int j;
6424 5912
6425 for (tl = sched_domain_topology; tl->init; tl++) { 5913 for_each_sd_topology(tl) {
6426 struct sd_data *sdd = &tl->data; 5914 struct sd_data *sdd = &tl->data;
6427 5915
6428 sdd->sd = alloc_percpu(struct sched_domain *); 5916 sdd->sd = alloc_percpu(struct sched_domain *);
@@ -6475,7 +5963,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
6475 struct sched_domain_topology_level *tl; 5963 struct sched_domain_topology_level *tl;
6476 int j; 5964 int j;
6477 5965
6478 for (tl = sched_domain_topology; tl->init; tl++) { 5966 for_each_sd_topology(tl) {
6479 struct sd_data *sdd = &tl->data; 5967 struct sd_data *sdd = &tl->data;
6480 5968
6481 for_each_cpu(j, cpu_map) { 5969 for_each_cpu(j, cpu_map) {
@@ -6503,9 +5991,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6503} 5991}
6504 5992
6505struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 5993struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6506 struct s_data *d, const struct cpumask *cpu_map, 5994 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6507 struct sched_domain_attr *attr, struct sched_domain *child, 5995 struct sched_domain *child, int cpu)
6508 int cpu)
6509{ 5996{
6510 struct sched_domain *sd = tl->init(tl, cpu); 5997 struct sched_domain *sd = tl->init(tl, cpu);
6511 if (!sd) 5998 if (!sd)
@@ -6516,8 +6003,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6516 sd->level = child->level + 1; 6003 sd->level = child->level + 1;
6517 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6004 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6518 child->parent = sd; 6005 child->parent = sd;
6006 sd->child = child;
6519 } 6007 }
6520 sd->child = child;
6521 set_domain_attribute(sd, attr); 6008 set_domain_attribute(sd, attr);
6522 6009
6523 return sd; 6010 return sd;
@@ -6530,7 +6017,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6530static int build_sched_domains(const struct cpumask *cpu_map, 6017static int build_sched_domains(const struct cpumask *cpu_map,
6531 struct sched_domain_attr *attr) 6018 struct sched_domain_attr *attr)
6532{ 6019{
6533 enum s_alloc alloc_state = sa_none; 6020 enum s_alloc alloc_state;
6534 struct sched_domain *sd; 6021 struct sched_domain *sd;
6535 struct s_data d; 6022 struct s_data d;
6536 int i, ret = -ENOMEM; 6023 int i, ret = -ENOMEM;
@@ -6544,18 +6031,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6544 struct sched_domain_topology_level *tl; 6031 struct sched_domain_topology_level *tl;
6545 6032
6546 sd = NULL; 6033 sd = NULL;
6547 for (tl = sched_domain_topology; tl->init; tl++) { 6034 for_each_sd_topology(tl) {
6548 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 6035 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6036 if (tl == sched_domain_topology)
6037 *per_cpu_ptr(d.sd, i) = sd;
6549 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6038 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6550 sd->flags |= SD_OVERLAP; 6039 sd->flags |= SD_OVERLAP;
6551 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6040 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6552 break; 6041 break;
6553 } 6042 }
6554
6555 while (sd->child)
6556 sd = sd->child;
6557
6558 *per_cpu_ptr(d.sd, i) = sd;
6559 } 6043 }
6560 6044
6561 /* Build the groups for the domains */ 6045 /* Build the groups for the domains */
@@ -6867,9 +6351,6 @@ void __init sched_init_smp(void)
6867 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6351 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6868 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6352 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6869 6353
6870 /* RT runtime code needs to handle some hotplug events */
6871 hotcpu_notifier(update_runtime, 0);
6872
6873 init_hrtick(); 6354 init_hrtick();
6874 6355
6875 /* Move init over to a non-isolated CPU */ 6356 /* Move init over to a non-isolated CPU */
@@ -7201,6 +6682,8 @@ void normalize_rt_tasks(void)
7201 * @cpu: the processor in question. 6682 * @cpu: the processor in question.
7202 * 6683 *
7203 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6684 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6685 *
6686 * Return: The current task for @cpu.
7204 */ 6687 */
7205struct task_struct *curr_task(int cpu) 6688struct task_struct *curr_task(int cpu)
7206{ 6689{