Diffstat (limited to 'kernel/sched/core.c')
| -rw-r--r-- | kernel/sched/core.c | 765 |
1 file changed, 124 insertions, 641 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8b335016c52..05c39f030314 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
| @@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void) | |||
| 370 | #ifdef CONFIG_SCHED_HRTICK | 370 | #ifdef CONFIG_SCHED_HRTICK |
| 371 | /* | 371 | /* |
| 372 | * Use HR-timers to deliver accurate preemption points. | 372 | * Use HR-timers to deliver accurate preemption points. |
| 373 | * | ||
| 374 | * It's all a bit involved since we cannot program an hrt while holding the | ||
| 375 | * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a | ||
| 376 | * reschedule event. | ||
| 377 | * | ||
| 378 | * When we get rescheduled we reprogram the hrtick_timer outside of the | ||
| 379 | * rq->lock. | ||
| 380 | */ | 373 | */ |
| 381 | 374 | ||
| 382 | static void hrtick_clear(struct rq *rq) | 375 | static void hrtick_clear(struct rq *rq) |
| @@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) | |||
| 404 | } | 397 | } |
| 405 | 398 | ||
| 406 | #ifdef CONFIG_SMP | 399 | #ifdef CONFIG_SMP |
| 400 | |||
| 401 | static int __hrtick_restart(struct rq *rq) | ||
| 402 | { | ||
| 403 | struct hrtimer *timer = &rq->hrtick_timer; | ||
| 404 | ktime_t time = hrtimer_get_softexpires(timer); | ||
| 405 | |||
| 406 | return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); | ||
| 407 | } | ||
| 408 | |||
| 407 | /* | 409 | /* |
| 408 | * called from hardirq (IPI) context | 410 | * called from hardirq (IPI) context |
| 409 | */ | 411 | */ |
| @@ -412,7 +414,7 @@ static void __hrtick_start(void *arg) | |||
| 412 | struct rq *rq = arg; | 414 | struct rq *rq = arg; |
| 413 | 415 | ||
| 414 | raw_spin_lock(&rq->lock); | 416 | raw_spin_lock(&rq->lock); |
| 415 | hrtimer_restart(&rq->hrtick_timer); | 417 | __hrtick_restart(rq); |
| 416 | rq->hrtick_csd_pending = 0; | 418 | rq->hrtick_csd_pending = 0; |
| 417 | raw_spin_unlock(&rq->lock); | 419 | raw_spin_unlock(&rq->lock); |
| 418 | } | 420 | } |
| @@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay) | |||
| 430 | hrtimer_set_expires(timer, time); | 432 | hrtimer_set_expires(timer, time); |
| 431 | 433 | ||
| 432 | if (rq == this_rq()) { | 434 | if (rq == this_rq()) { |
| 433 | hrtimer_restart(timer); | 435 | __hrtick_restart(rq); |
| 434 | } else if (!rq->hrtick_csd_pending) { | 436 | } else if (!rq->hrtick_csd_pending) { |
| 435 | __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); | 437 | __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); |
| 436 | rq->hrtick_csd_pending = 1; | 438 | rq->hrtick_csd_pending = 1; |
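The hrtick hunks above drop hrtimer_restart() in favour of a local __hrtick_restart(). A hedged sketch of why, assuming the stock helper in include/linux/hrtimer.h: the generic helper re-arms the timer in plain absolute mode, so the new wrapper presumably exists to keep the re-armed tick pinned to the rq's CPU.

/* generic helper (include/linux/hrtimer.h, assumed): plain ABS mode, not pinned */
static inline int hrtimer_restart(struct hrtimer *timer)
{
	return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}

/*
 * The scheduler-local __hrtick_restart() above re-arms with the same stored
 * expiry but HRTIMER_MODE_ABS_PINNED, so the tick cannot migrate off the
 * rq's CPU whether it is re-armed locally or from the IPI handler.
 */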
| @@ -679,7 +681,7 @@ void sched_avg_update(struct rq *rq) | |||
| 679 | { | 681 | { |
| 680 | s64 period = sched_avg_period(); | 682 | s64 period = sched_avg_period(); |
| 681 | 683 | ||
| 682 | while ((s64)(rq->clock - rq->age_stamp) > period) { | 684 | while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { |
| 683 | /* | 685 | /* |
| 684 | * Inline assembly required to prevent the compiler | 686 | * Inline assembly required to prevent the compiler |
| 685 | * optimising this loop into a divmod call. | 687 | * optimising this loop into a divmod call. |
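Several hunks in this diff (here and below) replace raw rq->clock / rq->clock_task reads with accessor calls. A minimal sketch of what those accessors are assumed to look like; they live in kernel/sched/sched.h, added elsewhere in this series, not in this file:

/* assumed accessors from kernel/sched/sched.h (not part of this hunk) */
static inline u64 rq_clock(struct rq *rq)
{
	return rq->clock;	/* updated by update_rq_clock() */
}

static inline u64 rq_clock_task(struct rq *rq)
{
	return rq->clock_task;	/* clock minus time accounted to IRQs/steal */
}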
| @@ -931,6 +933,8 @@ static int effective_prio(struct task_struct *p) | |||
| 931 | /** | 933 | /** |
| 932 | * task_curr - is this task currently executing on a CPU? | 934 | * task_curr - is this task currently executing on a CPU? |
| 933 | * @p: the task in question. | 935 | * @p: the task in question. |
| 936 | * | ||
| 937 | * Return: 1 if the task is currently executing. 0 otherwise. | ||
| 934 | */ | 938 | */ |
| 935 | inline int task_curr(const struct task_struct *p) | 939 | inline int task_curr(const struct task_struct *p) |
| 936 | { | 940 | { |
| @@ -1340,7 +1344,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | |||
| 1340 | p->sched_class->task_woken(rq, p); | 1344 | p->sched_class->task_woken(rq, p); |
| 1341 | 1345 | ||
| 1342 | if (rq->idle_stamp) { | 1346 | if (rq->idle_stamp) { |
| 1343 | u64 delta = rq->clock - rq->idle_stamp; | 1347 | u64 delta = rq_clock(rq) - rq->idle_stamp; |
| 1344 | u64 max = 2*sysctl_sched_migration_cost; | 1348 | u64 max = 2*sysctl_sched_migration_cost; |
| 1345 | 1349 | ||
| 1346 | if (delta > max) | 1350 | if (delta > max) |
| @@ -1377,6 +1381,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
| 1377 | 1381 | ||
| 1378 | rq = __task_rq_lock(p); | 1382 | rq = __task_rq_lock(p); |
| 1379 | if (p->on_rq) { | 1383 | if (p->on_rq) { |
| 1384 | /* check_preempt_curr() may use rq clock */ | ||
| 1385 | update_rq_clock(rq); | ||
| 1380 | ttwu_do_wakeup(rq, p, wake_flags); | 1386 | ttwu_do_wakeup(rq, p, wake_flags); |
| 1381 | ret = 1; | 1387 | ret = 1; |
| 1382 | } | 1388 | } |
| @@ -1478,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) | |||
| 1478 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 1484 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
| 1479 | * runnable without the overhead of this. | 1485 | * runnable without the overhead of this. |
| 1480 | * | 1486 | * |
| 1481 | * Returns %true if @p was woken up, %false if it was already running | 1487 | * Return: %true if @p was woken up, %false if it was already running. |
| 1482 | * or @state didn't match @p's state. | 1488 | * or @state didn't match @p's state. |
| 1483 | */ | 1489 | */ |
| 1484 | static int | 1490 | static int |
| @@ -1487,7 +1493,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 1487 | unsigned long flags; | 1493 | unsigned long flags; |
| 1488 | int cpu, success = 0; | 1494 | int cpu, success = 0; |
| 1489 | 1495 | ||
| 1490 | smp_wmb(); | 1496 | /* |
| 1497 | * If we are going to wake up a thread waiting for CONDITION we | ||
| 1498 | * need to ensure that CONDITION=1 done by the caller can not be | ||
| 1499 | * reordered with p->state check below. This pairs with mb() in | ||
| 1500 | * set_current_state() the waiting thread does. | ||
| 1501 | */ | ||
| 1502 | smp_mb__before_spinlock(); | ||
| 1491 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 1503 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 1492 | if (!(p->state & state)) | 1504 | if (!(p->state & state)) |
| 1493 | goto out; | 1505 | goto out; |
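The comment added above describes a classic waiter/waker pairing; a sketch of both sides, with CONDITION standing in for whatever flag the caller of the wakeup sets:

/*
 * Waiter:                                      Waker:
 *   set_current_state(TASK_UNINTERRUPTIBLE);     CONDITION = 1;
 *   if (!CONDITION)                              smp_mb__before_spinlock();
 *           schedule();                          raw_spin_lock(&p->pi_lock);
 *                                                if (!(p->state & state))
 *                                                        goto out;   (no wakeup needed)
 *
 * set_current_state() implies a full barrier, so the waiter cannot observe
 * CONDITION before its state write is visible; the barrier added here keeps
 * the waker's CONDITION store from being reordered past its p->state read,
 * closing the window where both sides decide not to act.
 */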
| @@ -1573,8 +1585,9 @@ out: | |||
| 1573 | * @p: The process to be woken up. | 1585 | * @p: The process to be woken up. |
| 1574 | * | 1586 | * |
| 1575 | * Attempt to wake up the nominated process and move it to the set of runnable | 1587 | * Attempt to wake up the nominated process and move it to the set of runnable |
| 1576 | * processes. Returns 1 if the process was woken up, 0 if it was already | 1588 | * processes. |
| 1577 | * running. | 1589 | * |
| 1590 | * Return: 1 if the process was woken up, 0 if it was already running. | ||
| 1578 | * | 1591 | * |
| 1579 | * It may be assumed that this function implies a write memory barrier before | 1592 | * It may be assumed that this function implies a write memory barrier before |
| 1580 | * changing the task state if and only if any tasks are woken up. | 1593 | * changing the task state if and only if any tasks are woken up. |
| @@ -1609,15 +1622,6 @@ static void __sched_fork(struct task_struct *p) | |||
| 1609 | p->se.vruntime = 0; | 1622 | p->se.vruntime = 0; |
| 1610 | INIT_LIST_HEAD(&p->se.group_node); | 1623 | INIT_LIST_HEAD(&p->se.group_node); |
| 1611 | 1624 | ||
| 1612 | /* | ||
| 1613 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
| 1614 | * removed when useful for applications beyond shares distribution (e.g. | ||
| 1615 | * load-balance). | ||
| 1616 | */ | ||
| 1617 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
| 1618 | p->se.avg.runnable_avg_period = 0; | ||
| 1619 | p->se.avg.runnable_avg_sum = 0; | ||
| 1620 | #endif | ||
| 1621 | #ifdef CONFIG_SCHEDSTATS | 1625 | #ifdef CONFIG_SCHEDSTATS |
| 1622 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1626 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
| 1623 | #endif | 1627 | #endif |
| @@ -1761,6 +1765,8 @@ void wake_up_new_task(struct task_struct *p) | |||
| 1761 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); | 1765 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
| 1762 | #endif | 1766 | #endif |
| 1763 | 1767 | ||
| 1768 | /* Initialize new task's runnable average */ | ||
| 1769 | init_task_runnable_average(p); | ||
| 1764 | rq = __task_rq_lock(p); | 1770 | rq = __task_rq_lock(p); |
| 1765 | activate_task(rq, p, 0); | 1771 | activate_task(rq, p, 0); |
| 1766 | p->on_rq = 1; | 1772 | p->on_rq = 1; |
| @@ -2069,575 +2075,6 @@ unsigned long nr_iowait_cpu(int cpu) | |||
| 2069 | return atomic_read(&this->nr_iowait); | 2075 | return atomic_read(&this->nr_iowait); |
| 2070 | } | 2076 | } |
| 2071 | 2077 | ||
| 2072 | unsigned long this_cpu_load(void) | ||
| 2073 | { | ||
| 2074 | struct rq *this = this_rq(); | ||
| 2075 | return this->cpu_load[0]; | ||
| 2076 | } | ||
| 2077 | |||
| 2078 | |||
| 2079 | /* | ||
| 2080 | * Global load-average calculations | ||
| 2081 | * | ||
| 2082 | * We take a distributed and async approach to calculating the global load-avg | ||
| 2083 | * in order to minimize overhead. | ||
| 2084 | * | ||
| 2085 | * The global load average is an exponentially decaying average of nr_running + | ||
| 2086 | * nr_uninterruptible. | ||
| 2087 | * | ||
| 2088 | * Once every LOAD_FREQ: | ||
| 2089 | * | ||
| 2090 | * nr_active = 0; | ||
| 2091 | * for_each_possible_cpu(cpu) | ||
| 2092 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
| 2093 | * | ||
| 2094 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
| 2095 | * | ||
| 2096 | * Due to a number of reasons the above turns in the mess below: | ||
| 2097 | * | ||
| 2098 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
| 2099 | * serious number of cpus, therefore we need to take a distributed approach | ||
| 2100 | * to calculating nr_active. | ||
| 2101 | * | ||
| 2102 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
| 2103 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
| 2104 | * | ||
| 2105 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
| 2106 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
| 2107 | * to obtain the same result. See calc_load_fold_active(). | ||
| 2108 | * | ||
| 2109 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
| 2110 | * across the machine, we assume 10 ticks is sufficient time for every | ||
| 2111 | * cpu to have completed this task. | ||
| 2112 | * | ||
| 2113 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
| 2114 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
| 2115 | * | ||
| 2116 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
| 2117 | * this would add another cross-cpu cacheline miss and atomic operation | ||
| 2118 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
| 2119 | * when it went into uninterruptible state and decrement on whatever cpu | ||
| 2120 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
| 2121 | * all cpus yields the correct result. | ||
| 2122 | * | ||
| 2123 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | ||
| 2124 | */ | ||
| 2125 | |||
| 2126 | /* Variables and functions for calc_load */ | ||
| 2127 | static atomic_long_t calc_load_tasks; | ||
| 2128 | static unsigned long calc_load_update; | ||
| 2129 | unsigned long avenrun[3]; | ||
| 2130 | EXPORT_SYMBOL(avenrun); /* should be removed */ | ||
| 2131 | |||
| 2132 | /** | ||
| 2133 | * get_avenrun - get the load average array | ||
| 2134 | * @loads: pointer to dest load array | ||
| 2135 | * @offset: offset to add | ||
| 2136 | * @shift: shift count to shift the result left | ||
| 2137 | * | ||
| 2138 | * These values are estimates at best, so no need for locking. | ||
| 2139 | */ | ||
| 2140 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
| 2141 | { | ||
| 2142 | loads[0] = (avenrun[0] + offset) << shift; | ||
| 2143 | loads[1] = (avenrun[1] + offset) << shift; | ||
| 2144 | loads[2] = (avenrun[2] + offset) << shift; | ||
| 2145 | } | ||
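For reference, a hedged sketch of how a consumer such as fs/proc/loadavg.c is assumed to use this (the FIXED_1/200 offset rounds the displayed value, and LOAD_INT/LOAD_FRAC come from include/linux/sched.h):

	unsigned long avnrun[3];

	get_avenrun(avnrun, FIXED_1/200, 0);
	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu\n",
		   LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
		   LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
		   LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));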
| 2146 | |||
| 2147 | static long calc_load_fold_active(struct rq *this_rq) | ||
| 2148 | { | ||
| 2149 | long nr_active, delta = 0; | ||
| 2150 | |||
| 2151 | nr_active = this_rq->nr_running; | ||
| 2152 | nr_active += (long) this_rq->nr_uninterruptible; | ||
| 2153 | |||
| 2154 | if (nr_active != this_rq->calc_load_active) { | ||
| 2155 | delta = nr_active - this_rq->calc_load_active; | ||
| 2156 | this_rq->calc_load_active = nr_active; | ||
| 2157 | } | ||
| 2158 | |||
| 2159 | return delta; | ||
| 2160 | } | ||
| 2161 | |||
| 2162 | /* | ||
| 2163 | * a1 = a0 * e + a * (1 - e) | ||
| 2164 | */ | ||
| 2165 | static unsigned long | ||
| 2166 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
| 2167 | { | ||
| 2168 | load *= exp; | ||
| 2169 | load += active * (FIXED_1 - exp); | ||
| 2170 | load += 1UL << (FSHIFT - 1); | ||
| 2171 | return load >> FSHIFT; | ||
| 2172 | } | ||
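calc_load() is the fixed-point form of a1 = a0*e + a*(1 - e). A small self-contained sketch (userspace, assuming the usual constants from include/linux/sched.h: FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884 = FIXED_1/exp(5s/1min)) showing how a steady two-task load converges:

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5sec/1min), assumed as in <linux/sched.h> */

/* same arithmetic as calc_load() above */
static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest */
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long load = 0;
	int i;

	/* two runnable tasks, sampled over one minute of ~5s LOAD_FREQ windows */
	for (i = 0; i < 12; i++)
		load = calc_load(load, EXP_1, 2 * FIXED_1);

	/* prints 1.26, i.e. roughly 2 * (1 - 1/e) after one minute */
	printf("%lu.%02lu\n", load >> FSHIFT, ((load & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}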
| 2173 | |||
| 2174 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 2175 | /* | ||
| 2176 | * Handle NO_HZ for the global load-average. | ||
| 2177 | * | ||
| 2178 | * Since the above described distributed algorithm to compute the global | ||
| 2179 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
| 2180 | * NO_HZ. | ||
| 2181 | * | ||
| 2182 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
| 2183 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
| 2184 | * when we read the global state. | ||
| 2185 | * | ||
| 2186 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
| 2187 | * | ||
| 2188 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
| 2189 | * contribution, causing under-accounting. | ||
| 2190 | * | ||
| 2191 | * We avoid this by keeping two idle-delta counters and flipping them | ||
| 2192 | * when the window starts, thus separating old and new NO_HZ load. | ||
| 2193 | * | ||
| 2194 | * The only trick is the slight shift in index flip for read vs write. | ||
| 2195 | * | ||
| 2196 | * 0s 5s 10s 15s | ||
| 2197 | * +10 +10 +10 +10 | ||
| 2198 | * |-|-----------|-|-----------|-|-----------|-| | ||
| 2199 | * r:0 0 1 1 0 0 1 1 0 | ||
| 2200 | * w:0 1 1 0 0 1 1 0 0 | ||
| 2201 | * | ||
| 2202 | * This ensures we'll fold the old idle contribution in this window while | ||
| 2203 | * accumulating the new one. | ||
| 2204 | * | ||
| 2205 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
| 2206 | * contribution, since we effectively move our sample point to a known | ||
| 2207 | * busy state. | ||
| 2208 | * | ||
| 2209 | * This is solved by pushing the window forward, and thus skipping the | ||
| 2210 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
| 2211 | * was in effect at the time the window opened). This also solves the issue | ||
| 2212 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
| 2213 | * LOAD_FREQ intervals. | ||
| 2214 | * | ||
| 2215 | * When making the ILB scale, we should try to pull this in as well. | ||
| 2216 | */ | ||
| 2217 | static atomic_long_t calc_load_idle[2]; | ||
| 2218 | static int calc_load_idx; | ||
| 2219 | |||
| 2220 | static inline int calc_load_write_idx(void) | ||
| 2221 | { | ||
| 2222 | int idx = calc_load_idx; | ||
| 2223 | |||
| 2224 | /* | ||
| 2225 | * See calc_global_nohz(), if we observe the new index, we also | ||
| 2226 | * need to observe the new update time. | ||
| 2227 | */ | ||
| 2228 | smp_rmb(); | ||
| 2229 | |||
| 2230 | /* | ||
| 2231 | * If the folding window started, make sure we start writing in the | ||
| 2232 | * next idle-delta. | ||
| 2233 | */ | ||
| 2234 | if (!time_before(jiffies, calc_load_update)) | ||
| 2235 | idx++; | ||
| 2236 | |||
| 2237 | return idx & 1; | ||
| 2238 | } | ||
| 2239 | |||
| 2240 | static inline int calc_load_read_idx(void) | ||
| 2241 | { | ||
| 2242 | return calc_load_idx & 1; | ||
| 2243 | } | ||
| 2244 | |||
| 2245 | void calc_load_enter_idle(void) | ||
| 2246 | { | ||
| 2247 | struct rq *this_rq = this_rq(); | ||
| 2248 | long delta; | ||
| 2249 | |||
| 2250 | /* | ||
| 2251 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
| 2252 | * into the pending idle delta. | ||
| 2253 | */ | ||
| 2254 | delta = calc_load_fold_active(this_rq); | ||
| 2255 | if (delta) { | ||
| 2256 | int idx = calc_load_write_idx(); | ||
| 2257 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
| 2258 | } | ||
| 2259 | } | ||
| 2260 | |||
| 2261 | void calc_load_exit_idle(void) | ||
| 2262 | { | ||
| 2263 | struct rq *this_rq = this_rq(); | ||
| 2264 | |||
| 2265 | /* | ||
| 2266 | * If we're still before the sample window, we're done. | ||
| 2267 | */ | ||
| 2268 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
| 2269 | return; | ||
| 2270 | |||
| 2271 | /* | ||
| 2272 | * We woke inside or after the sample window, this means we're already | ||
| 2273 | * accounted through the nohz accounting, so skip the entire deal and | ||
| 2274 | * sync up for the next window. | ||
| 2275 | */ | ||
| 2276 | this_rq->calc_load_update = calc_load_update; | ||
| 2277 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
| 2278 | this_rq->calc_load_update += LOAD_FREQ; | ||
| 2279 | } | ||
| 2280 | |||
| 2281 | static long calc_load_fold_idle(void) | ||
| 2282 | { | ||
| 2283 | int idx = calc_load_read_idx(); | ||
| 2284 | long delta = 0; | ||
| 2285 | |||
| 2286 | if (atomic_long_read(&calc_load_idle[idx])) | ||
| 2287 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
| 2288 | |||
| 2289 | return delta; | ||
| 2290 | } | ||
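Condensing the bucket logic above into one hedged picture of who touches which idle-delta slot:

/*
 * With LOAD_FREQ windows opening at 0s, 5s, 10s, ... as in the diagram above:
 *
 *   - calc_load_enter_idle() before the window opens writes slot  calc_load_idx & 1
 *   - calc_load_enter_idle() after the window opens writes slot  (calc_load_idx + 1) & 1
 *   - calc_load_fold_idle() (the global fold) always drains slot  calc_load_idx & 1
 *   - calc_global_nohz() then does calc_load_idx++, so the "new" slot becomes
 *     the one drained in the next window
 *
 * which is how old NO_HZ contributions get folded into this window while new
 * ones accumulate for the next.
 */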
| 2291 | |||
| 2292 | /** | ||
| 2293 | * fixed_power_int - compute: x^n, in O(log n) time | ||
| 2294 | * | ||
| 2295 | * @x: base of the power | ||
| 2296 | * @frac_bits: fractional bits of @x | ||
| 2297 | * @n: power to raise @x to. | ||
| 2298 | * | ||
| 2299 | * By exploiting the relation between the definition of the natural power | ||
| 2300 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
| 2301 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
| 2302 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
| 2303 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
| 2304 | * of course trivially computable in O(log_2 n), the length of our binary | ||
| 2305 | * vector. | ||
| 2306 | */ | ||
| 2307 | static unsigned long | ||
| 2308 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
| 2309 | { | ||
| 2310 | unsigned long result = 1UL << frac_bits; | ||
| 2311 | |||
| 2312 | if (n) for (;;) { | ||
| 2313 | if (n & 1) { | ||
| 2314 | result *= x; | ||
| 2315 | result += 1UL << (frac_bits - 1); | ||
| 2316 | result >>= frac_bits; | ||
| 2317 | } | ||
| 2318 | n >>= 1; | ||
| 2319 | if (!n) | ||
| 2320 | break; | ||
| 2321 | x *= x; | ||
| 2322 | x += 1UL << (frac_bits - 1); | ||
| 2323 | x >>= frac_bits; | ||
| 2324 | } | ||
| 2325 | |||
| 2326 | return result; | ||
| 2327 | } | ||
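A hedged worked example of fixed_power_int(): with frac_bits = 11, the value 1024 encodes 0.5, and square-and-multiply reproduces 0.5^3 = 0.125 (rounding terms left out of the arithmetic below):

/*
 * fixed_power_int(1024, 11, 3):  n = 3 is binary 11, result starts at 2048 (1.0)
 *
 *   bit 0 set:  result = 2048 * 1024 >> 11 = 1024   (0.5)
 *               x      = 1024 * 1024 >> 11 =  512   (0.25  = 0.5^2)
 *   bit 1 set:  result = 1024 *  512 >> 11 =  256   (0.125 = 0.5^3)
 *
 * For n this small the saving is nil, but in the catch-up path below n can be
 * large (many missed LOAD_FREQ windows) and the cost stays at ~2*log2(n)
 * multiplies instead of n.
 */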
| 2328 | |||
| 2329 | /* | ||
| 2330 | * a1 = a0 * e + a * (1 - e) | ||
| 2331 | * | ||
| 2332 | * a2 = a1 * e + a * (1 - e) | ||
| 2333 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
| 2334 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
| 2335 | * | ||
| 2336 | * a3 = a2 * e + a * (1 - e) | ||
| 2337 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
| 2338 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
| 2339 | * | ||
| 2340 | * ... | ||
| 2341 | * | ||
| 2342 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
| 2343 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
| 2344 | * = a0 * e^n + a * (1 - e^n) | ||
| 2345 | * | ||
| 2346 | * [1] application of the geometric series: | ||
| 2347 | * | ||
| 2348 | * n 1 - x^(n+1) | ||
| 2349 | * S_n := \Sum x^i = ------------- | ||
| 2350 | * i=0 1 - x | ||
| 2351 | */ | ||
| 2352 | static unsigned long | ||
| 2353 | calc_load_n(unsigned long load, unsigned long exp, | ||
| 2354 | unsigned long active, unsigned int n) | ||
| 2355 | { | ||
| 2356 | |||
| 2357 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
| 2358 | } | ||
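A hedged numeric check of the catch-up step, assuming EXP_1 = 1884: one fully idle minute (n = 12 windows, active == 0) decays a 1-minute average of exactly 1.0 by about a factor of e, in a single call:

/*
 * calc_load_n(2048, EXP_1, 0, 12)
 *     == calc_load(2048, fixed_power_int(1884, 11, 12), 0)
 *     == 2048 * (1884/2048)^12
 *     ~= 2048 * 0.368  ~=  750          i.e. ~0.37 in load-average terms
 *
 * which matches the closed form a_n = a_0 * e^n derived above, with
 * e = 1/exp(5s/60s), so n calc_load() iterations collapse into one call.
 */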
| 2359 | |||
| 2360 | /* | ||
| 2361 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
| 2362 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
| 2363 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
| 2364 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
| 2365 | * | ||
| 2366 | * Once we've updated the global active value, we need to apply the exponential | ||
| 2367 | * weights adjusted to the number of cycles missed. | ||
| 2368 | */ | ||
| 2369 | static void calc_global_nohz(void) | ||
| 2370 | { | ||
| 2371 | long delta, active, n; | ||
| 2372 | |||
| 2373 | if (!time_before(jiffies, calc_load_update + 10)) { | ||
| 2374 | /* | ||
| 2375 | * Catch-up, fold however many we are behind still | ||
| 2376 | */ | ||
| 2377 | delta = jiffies - calc_load_update - 10; | ||
| 2378 | n = 1 + (delta / LOAD_FREQ); | ||
| 2379 | |||
| 2380 | active = atomic_long_read(&calc_load_tasks); | ||
| 2381 | active = active > 0 ? active * FIXED_1 : 0; | ||
| 2382 | |||
| 2383 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
| 2384 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
| 2385 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
| 2386 | |||
| 2387 | calc_load_update += n * LOAD_FREQ; | ||
| 2388 | } | ||
| 2389 | |||
| 2390 | /* | ||
| 2391 | * Flip the idle index... | ||
| 2392 | * | ||
| 2393 | * Make sure we first write the new time then flip the index, so that | ||
| 2394 | * calc_load_write_idx() will see the new time when it reads the new | ||
| 2395 | * index, this avoids a double flip messing things up. | ||
| 2396 | */ | ||
| 2397 | smp_wmb(); | ||
| 2398 | calc_load_idx++; | ||
| 2399 | } | ||
| 2400 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
| 2401 | |||
| 2402 | static inline long calc_load_fold_idle(void) { return 0; } | ||
| 2403 | static inline void calc_global_nohz(void) { } | ||
| 2404 | |||
| 2405 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
| 2406 | |||
| 2407 | /* | ||
| 2408 | * calc_load - update the avenrun load estimates 10 ticks after the | ||
| 2409 | * CPUs have updated calc_load_tasks. | ||
| 2410 | */ | ||
| 2411 | void calc_global_load(unsigned long ticks) | ||
| 2412 | { | ||
| 2413 | long active, delta; | ||
| 2414 | |||
| 2415 | if (time_before(jiffies, calc_load_update + 10)) | ||
| 2416 | return; | ||
| 2417 | |||
| 2418 | /* | ||
| 2419 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
| 2420 | */ | ||
| 2421 | delta = calc_load_fold_idle(); | ||
| 2422 | if (delta) | ||
| 2423 | atomic_long_add(delta, &calc_load_tasks); | ||
| 2424 | |||
| 2425 | active = atomic_long_read(&calc_load_tasks); | ||
| 2426 | active = active > 0 ? active * FIXED_1 : 0; | ||
| 2427 | |||
| 2428 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); | ||
| 2429 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); | ||
| 2430 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | ||
| 2431 | |||
| 2432 | calc_load_update += LOAD_FREQ; | ||
| 2433 | |||
| 2434 | /* | ||
| 2435 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. | ||
| 2436 | */ | ||
| 2437 | calc_global_nohz(); | ||
| 2438 | } | ||
| 2439 | |||
| 2440 | /* | ||
| 2441 | * Called from update_cpu_load() to periodically update this CPU's | ||
| 2442 | * active count. | ||
| 2443 | */ | ||
| 2444 | static void calc_load_account_active(struct rq *this_rq) | ||
| 2445 | { | ||
| 2446 | long delta; | ||
| 2447 | |||
| 2448 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
| 2449 | return; | ||
| 2450 | |||
| 2451 | delta = calc_load_fold_active(this_rq); | ||
| 2452 | if (delta) | ||
| 2453 | atomic_long_add(delta, &calc_load_tasks); | ||
| 2454 | |||
| 2455 | this_rq->calc_load_update += LOAD_FREQ; | ||
| 2456 | } | ||
| 2457 | |||
| 2458 | /* | ||
| 2459 | * End of global load-average stuff | ||
| 2460 | */ | ||
| 2461 | |||
| 2462 | /* | ||
| 2463 | * The exact cpuload at various idx values, calculated at every tick would be | ||
| 2464 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
| 2465 | * | ||
| 2466 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
| 2467 | * on nth tick when cpu may be busy, then we have: | ||
| 2468 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
| 2469 | * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
| 2470 | * | ||
| 2471 | * decay_load_missed() below does efficient calculation of | ||
| 2472 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
| 2473 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
| 2474 | * | ||
| 2475 | * The calculation is approximated on a 128 point scale. | ||
| 2476 | * degrade_zero_ticks is the number of ticks after which load at any | ||
| 2477 | * particular idx is approximated to be zero. | ||
| 2478 | * degrade_factor is a precomputed table, a row for each load idx. | ||
| 2479 | * Each column corresponds to degradation factor for a power of two ticks, | ||
| 2480 | * based on 128 point scale. | ||
| 2481 | * Example: | ||
| 2482 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
| 2483 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
| 2484 | * | ||
| 2485 | * With this power of 2 load factors, we can degrade the load n times | ||
| 2486 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
| 2487 | * n mult/shifts needed by the exact degradation. | ||
| 2488 | */ | ||
| 2489 | #define DEGRADE_SHIFT 7 | ||
| 2490 | static const unsigned char | ||
| 2491 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
| 2492 | static const unsigned char | ||
| 2493 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
| 2494 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
| 2495 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
| 2496 | {96, 72, 40, 12, 1, 0, 0}, | ||
| 2497 | {112, 98, 75, 43, 15, 1, 0}, | ||
| 2498 | {120, 112, 98, 76, 45, 16, 2} }; | ||
| 2499 | |||
| 2500 | /* | ||
| 2501 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
| 2502 | * would be when CPU is idle and so we just decay the old load without | ||
| 2503 | * adding any new load. | ||
| 2504 | */ | ||
| 2505 | static unsigned long | ||
| 2506 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
| 2507 | { | ||
| 2508 | int j = 0; | ||
| 2509 | |||
| 2510 | if (!missed_updates) | ||
| 2511 | return load; | ||
| 2512 | |||
| 2513 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
| 2514 | return 0; | ||
| 2515 | |||
| 2516 | if (idx == 1) | ||
| 2517 | return load >> missed_updates; | ||
| 2518 | |||
| 2519 | while (missed_updates) { | ||
| 2520 | if (missed_updates % 2) | ||
| 2521 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
| 2522 | |||
| 2523 | missed_updates >>= 1; | ||
| 2524 | j++; | ||
| 2525 | } | ||
| 2526 | return load; | ||
| 2527 | } | ||
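A hedged spot-check of decay_load_missed() against the exact factors it approximates, using the table entry the comment above singles out (idx 2, i.e. a per-tick factor of 3/4):

/*
 * decay_load_missed(1000, 8, 2):  missed = 8 has only bit 3 set, so the loop
 * applies degrade_factor[2][3] == 12 exactly once:
 *
 *   1000 * 12 >> 7  ==  93           (exact (3/4)^8 * 1000  ~= 100)
 *
 * decay_load_missed(1000, 5, 2):  bits 0 and 2 set, two steps:
 *
 *   1000 * 96 >> 7  == 750,   750 * 40 >> 7  == 234    (exact (3/4)^5 * 1000 ~= 237)
 *
 * so the 128-point table trades a little precision for one multiply/shift per
 * set bit of 'missed' instead of one per missed tick.
 */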
| 2528 | |||
| 2529 | /* | ||
| 2530 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
| 2531 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | ||
| 2532 | * every tick. We fix it up based on jiffies. | ||
| 2533 | */ | ||
| 2534 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | ||
| 2535 | unsigned long pending_updates) | ||
| 2536 | { | ||
| 2537 | int i, scale; | ||
| 2538 | |||
| 2539 | this_rq->nr_load_updates++; | ||
| 2540 | |||
| 2541 | /* Update our load: */ | ||
| 2542 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
| 2543 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
| 2544 | unsigned long old_load, new_load; | ||
| 2545 | |||
| 2546 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
| 2547 | |||
| 2548 | old_load = this_rq->cpu_load[i]; | ||
| 2549 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
| 2550 | new_load = this_load; | ||
| 2551 | /* | ||
| 2552 | * Round up the averaging division if load is increasing. This | ||
| 2553 | * prevents us from getting stuck on 9 if the load is 10, for | ||
| 2554 | * example. | ||
| 2555 | */ | ||
| 2556 | if (new_load > old_load) | ||
| 2557 | new_load += scale - 1; | ||
| 2558 | |||
| 2559 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
| 2560 | } | ||
| 2561 | |||
| 2562 | sched_avg_update(this_rq); | ||
| 2563 | } | ||
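The inner loop above implements cpu_load[i] = ((2^i - 1) * old + new) / 2^i with a round-up when load rises. A hedged one-tick example for a CPU coming out of idle with this_load = 1024:

/*
 * old cpu_load[] = {0, 0, 0, 0, 0}, this_load = 1024, pending_updates = 1:
 *
 *   i = 0:  cpu_load[0] = 1024                           (fast track)
 *   i = 1:  (0 * 1  + (1024 + 1))  >> 1  ==  512         ('+ scale - 1' since load rose)
 *   i = 2:  (0 * 3  + (1024 + 3))  >> 2  ==  256
 *   i = 3:  (0 * 7  + (1024 + 7))  >> 3  ==  128
 *   i = 4:  (0 * 15 + (1024 + 15)) >> 4  ==   64
 *
 * Higher indexes react progressively more slowly; the round-up term is what
 * lets them eventually reach 1024 instead of stalling just below it.
 */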
| 2564 | |||
| 2565 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 2566 | /* | ||
| 2567 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
| 2568 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
| 2569 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
| 2570 | * | ||
| 2571 | * Therefore we cannot use the delta approach from the regular tick since that | ||
| 2572 | * would seriously skew the load calculation. However we'll make do for those | ||
| 2573 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
| 2574 | * (tick_nohz_idle_exit). | ||
| 2575 | * | ||
| 2576 | * This means we might still be one tick off for nohz periods. | ||
| 2577 | */ | ||
| 2578 | |||
| 2579 | /* | ||
| 2580 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
| 2581 | * idle balance. | ||
| 2582 | */ | ||
| 2583 | void update_idle_cpu_load(struct rq *this_rq) | ||
| 2584 | { | ||
| 2585 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
| 2586 | unsigned long load = this_rq->load.weight; | ||
| 2587 | unsigned long pending_updates; | ||
| 2588 | |||
| 2589 | /* | ||
| 2590 | * bail if there's load or we're actually up-to-date. | ||
| 2591 | */ | ||
| 2592 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
| 2593 | return; | ||
| 2594 | |||
| 2595 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
| 2596 | this_rq->last_load_update_tick = curr_jiffies; | ||
| 2597 | |||
| 2598 | __update_cpu_load(this_rq, load, pending_updates); | ||
| 2599 | } | ||
| 2600 | |||
| 2601 | /* | ||
| 2602 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
| 2603 | */ | ||
| 2604 | void update_cpu_load_nohz(void) | ||
| 2605 | { | ||
| 2606 | struct rq *this_rq = this_rq(); | ||
| 2607 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
| 2608 | unsigned long pending_updates; | ||
| 2609 | |||
| 2610 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
| 2611 | return; | ||
| 2612 | |||
| 2613 | raw_spin_lock(&this_rq->lock); | ||
| 2614 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
| 2615 | if (pending_updates) { | ||
| 2616 | this_rq->last_load_update_tick = curr_jiffies; | ||
| 2617 | /* | ||
| 2618 | * We were idle, this means load 0, the current load might be | ||
| 2619 | * !0 due to remote wakeups and the sort. | ||
| 2620 | */ | ||
| 2621 | __update_cpu_load(this_rq, 0, pending_updates); | ||
| 2622 | } | ||
| 2623 | raw_spin_unlock(&this_rq->lock); | ||
| 2624 | } | ||
| 2625 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
| 2626 | |||
| 2627 | /* | ||
| 2628 | * Called from scheduler_tick() | ||
| 2629 | */ | ||
| 2630 | static void update_cpu_load_active(struct rq *this_rq) | ||
| 2631 | { | ||
| 2632 | /* | ||
| 2633 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
| 2634 | */ | ||
| 2635 | this_rq->last_load_update_tick = jiffies; | ||
| 2636 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
| 2637 | |||
| 2638 | calc_load_account_active(this_rq); | ||
| 2639 | } | ||
| 2640 | |||
| 2641 | #ifdef CONFIG_SMP | 2078 | #ifdef CONFIG_SMP |
| 2642 | 2079 | ||
| 2643 | /* | 2080 | /* |
| @@ -2686,7 +2123,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
| 2686 | 2123 | ||
| 2687 | if (task_current(rq, p)) { | 2124 | if (task_current(rq, p)) { |
| 2688 | update_rq_clock(rq); | 2125 | update_rq_clock(rq); |
| 2689 | ns = rq->clock_task - p->se.exec_start; | 2126 | ns = rq_clock_task(rq) - p->se.exec_start; |
| 2690 | if ((s64)ns < 0) | 2127 | if ((s64)ns < 0) |
| 2691 | ns = 0; | 2128 | ns = 0; |
| 2692 | } | 2129 | } |
| @@ -2739,8 +2176,8 @@ void scheduler_tick(void) | |||
| 2739 | 2176 | ||
| 2740 | raw_spin_lock(&rq->lock); | 2177 | raw_spin_lock(&rq->lock); |
| 2741 | update_rq_clock(rq); | 2178 | update_rq_clock(rq); |
| 2742 | update_cpu_load_active(rq); | ||
| 2743 | curr->sched_class->task_tick(rq, curr, 0); | 2179 | curr->sched_class->task_tick(rq, curr, 0); |
| 2180 | update_cpu_load_active(rq); | ||
| 2744 | raw_spin_unlock(&rq->lock); | 2181 | raw_spin_unlock(&rq->lock); |
| 2745 | 2182 | ||
| 2746 | perf_event_task_tick(); | 2183 | perf_event_task_tick(); |
| @@ -2763,6 +2200,8 @@ void scheduler_tick(void) | |||
| 2763 | * This makes sure that uptime, CFS vruntime, load | 2200 | * This makes sure that uptime, CFS vruntime, load |
| 2764 | * balancing, etc... continue to move forward, even | 2201 | * balancing, etc... continue to move forward, even |
| 2765 | * with a very low granularity. | 2202 | * with a very low granularity. |
| 2203 | * | ||
| 2204 | * Return: Maximum deferment in nanoseconds. | ||
| 2766 | */ | 2205 | */ |
| 2767 | u64 scheduler_tick_max_deferment(void) | 2206 | u64 scheduler_tick_max_deferment(void) |
| 2768 | { | 2207 | { |
| @@ -2966,6 +2405,12 @@ need_resched: | |||
| 2966 | if (sched_feat(HRTICK)) | 2405 | if (sched_feat(HRTICK)) |
| 2967 | hrtick_clear(rq); | 2406 | hrtick_clear(rq); |
| 2968 | 2407 | ||
| 2408 | /* | ||
| 2409 | * Make sure that signal_pending_state()->signal_pending() below | ||
| 2410 | * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) | ||
| 2411 | * done by the caller to avoid the race with signal_wake_up(). | ||
| 2412 | */ | ||
| 2413 | smp_mb__before_spinlock(); | ||
| 2969 | raw_spin_lock_irq(&rq->lock); | 2414 | raw_spin_lock_irq(&rq->lock); |
| 2970 | 2415 | ||
| 2971 | switch_count = &prev->nivcsw; | 2416 | switch_count = &prev->nivcsw; |
| @@ -3368,8 +2813,8 @@ EXPORT_SYMBOL(wait_for_completion); | |||
| 3368 | * specified timeout to expire. The timeout is in jiffies. It is not | 2813 | * specified timeout to expire. The timeout is in jiffies. It is not |
| 3369 | * interruptible. | 2814 | * interruptible. |
| 3370 | * | 2815 | * |
| 3371 | * The return value is 0 if timed out, and positive (at least 1, or number of | 2816 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left |
| 3372 | * jiffies left till timeout) if completed. | 2817 | * till timeout) if completed. |
| 3373 | */ | 2818 | */ |
| 3374 | unsigned long __sched | 2819 | unsigned long __sched |
| 3375 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 2820 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
| @@ -3401,8 +2846,8 @@ EXPORT_SYMBOL(wait_for_completion_io); | |||
| 3401 | * specified timeout to expire. The timeout is in jiffies. It is not | 2846 | * specified timeout to expire. The timeout is in jiffies. It is not |
| 3402 | * interruptible. The caller is accounted as waiting for IO. | 2847 | * interruptible. The caller is accounted as waiting for IO. |
| 3403 | * | 2848 | * |
| 3404 | * The return value is 0 if timed out, and positive (at least 1, or number of | 2849 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left |
| 3405 | * jiffies left till timeout) if completed. | 2850 | * till timeout) if completed. |
| 3406 | */ | 2851 | */ |
| 3407 | unsigned long __sched | 2852 | unsigned long __sched |
| 3408 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) | 2853 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) |
| @@ -3418,7 +2863,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout); | |||
| 3418 | * This waits for completion of a specific task to be signaled. It is | 2863 | * This waits for completion of a specific task to be signaled. It is |
| 3419 | * interruptible. | 2864 | * interruptible. |
| 3420 | * | 2865 | * |
| 3421 | * The return value is -ERESTARTSYS if interrupted, 0 if completed. | 2866 | * Return: -ERESTARTSYS if interrupted, 0 if completed. |
| 3422 | */ | 2867 | */ |
| 3423 | int __sched wait_for_completion_interruptible(struct completion *x) | 2868 | int __sched wait_for_completion_interruptible(struct completion *x) |
| 3424 | { | 2869 | { |
| @@ -3437,8 +2882,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
| 3437 | * This waits for either a completion of a specific task to be signaled or for a | 2882 | * This waits for either a completion of a specific task to be signaled or for a |
| 3438 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 2883 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
| 3439 | * | 2884 | * |
| 3440 | * The return value is -ERESTARTSYS if interrupted, 0 if timed out, | 2885 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, |
| 3441 | * positive (at least 1, or number of jiffies left till timeout) if completed. | 2886 | * or number of jiffies left till timeout) if completed. |
| 3442 | */ | 2887 | */ |
| 3443 | long __sched | 2888 | long __sched |
| 3444 | wait_for_completion_interruptible_timeout(struct completion *x, | 2889 | wait_for_completion_interruptible_timeout(struct completion *x, |
| @@ -3455,7 +2900,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | |||
| 3455 | * This waits to be signaled for completion of a specific task. It can be | 2900 | * This waits to be signaled for completion of a specific task. It can be |
| 3456 | * interrupted by a kill signal. | 2901 | * interrupted by a kill signal. |
| 3457 | * | 2902 | * |
| 3458 | * The return value is -ERESTARTSYS if interrupted, 0 if completed. | 2903 | * Return: -ERESTARTSYS if interrupted, 0 if completed. |
| 3459 | */ | 2904 | */ |
| 3460 | int __sched wait_for_completion_killable(struct completion *x) | 2905 | int __sched wait_for_completion_killable(struct completion *x) |
| 3461 | { | 2906 | { |
| @@ -3475,8 +2920,8 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
| 3475 | * signaled or for a specified timeout to expire. It can be | 2920 | * signaled or for a specified timeout to expire. It can be |
| 3476 | * interrupted by a kill signal. The timeout is in jiffies. | 2921 | * interrupted by a kill signal. The timeout is in jiffies. |
| 3477 | * | 2922 | * |
| 3478 | * The return value is -ERESTARTSYS if interrupted, 0 if timed out, | 2923 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, |
| 3479 | * positive (at least 1, or number of jiffies left till timeout) if completed. | 2924 | * or number of jiffies left till timeout) if completed. |
| 3480 | */ | 2925 | */ |
| 3481 | long __sched | 2926 | long __sched |
| 3482 | wait_for_completion_killable_timeout(struct completion *x, | 2927 | wait_for_completion_killable_timeout(struct completion *x, |
| @@ -3490,7 +2935,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); | |||
| 3490 | * try_wait_for_completion - try to decrement a completion without blocking | 2935 | * try_wait_for_completion - try to decrement a completion without blocking |
| 3491 | * @x: completion structure | 2936 | * @x: completion structure |
| 3492 | * | 2937 | * |
| 3493 | * Returns: 0 if a decrement cannot be done without blocking | 2938 | * Return: 0 if a decrement cannot be done without blocking |
| 3494 | * 1 if a decrement succeeded. | 2939 | * 1 if a decrement succeeded. |
| 3495 | * | 2940 | * |
| 3496 | * If a completion is being used as a counting completion, | 2941 | * If a completion is being used as a counting completion, |
| @@ -3517,7 +2962,7 @@ EXPORT_SYMBOL(try_wait_for_completion); | |||
| 3517 | * completion_done - Test to see if a completion has any waiters | 2962 | * completion_done - Test to see if a completion has any waiters |
| 3518 | * @x: completion structure | 2963 | * @x: completion structure |
| 3519 | * | 2964 | * |
| 3520 | * Returns: 0 if there are waiters (wait_for_completion() in progress) | 2965 | * Return: 0 if there are waiters (wait_for_completion() in progress) |
| 3521 | * 1 if there are no waiters. | 2966 | * 1 if there are no waiters. |
| 3522 | * | 2967 | * |
| 3523 | */ | 2968 | */ |
| @@ -3754,7 +3199,7 @@ SYSCALL_DEFINE1(nice, int, increment) | |||
| 3754 | * task_prio - return the priority value of a given task. | 3199 | * task_prio - return the priority value of a given task. |
| 3755 | * @p: the task in question. | 3200 | * @p: the task in question. |
| 3756 | * | 3201 | * |
| 3757 | * This is the priority value as seen by users in /proc. | 3202 | * Return: The priority value as seen by users in /proc. |
| 3758 | * RT tasks are offset by -200. Normal tasks are centered | 3203 | * RT tasks are offset by -200. Normal tasks are centered |
| 3759 | * around 0, value goes from -16 to +15. | 3204 | * around 0, value goes from -16 to +15. |
| 3760 | */ | 3205 | */ |
| @@ -3766,6 +3211,8 @@ int task_prio(const struct task_struct *p) | |||
| 3766 | /** | 3211 | /** |
| 3767 | * task_nice - return the nice value of a given task. | 3212 | * task_nice - return the nice value of a given task. |
| 3768 | * @p: the task in question. | 3213 | * @p: the task in question. |
| 3214 | * | ||
| 3215 | * Return: The nice value [ -20 ... 0 ... 19 ]. | ||
| 3769 | */ | 3216 | */ |
| 3770 | int task_nice(const struct task_struct *p) | 3217 | int task_nice(const struct task_struct *p) |
| 3771 | { | 3218 | { |
| @@ -3776,6 +3223,8 @@ EXPORT_SYMBOL(task_nice); | |||
| 3776 | /** | 3223 | /** |
| 3777 | * idle_cpu - is a given cpu idle currently? | 3224 | * idle_cpu - is a given cpu idle currently? |
| 3778 | * @cpu: the processor in question. | 3225 | * @cpu: the processor in question. |
| 3226 | * | ||
| 3227 | * Return: 1 if the CPU is currently idle. 0 otherwise. | ||
| 3779 | */ | 3228 | */ |
| 3780 | int idle_cpu(int cpu) | 3229 | int idle_cpu(int cpu) |
| 3781 | { | 3230 | { |
| @@ -3798,6 +3247,8 @@ int idle_cpu(int cpu) | |||
| 3798 | /** | 3247 | /** |
| 3799 | * idle_task - return the idle task for a given cpu. | 3248 | * idle_task - return the idle task for a given cpu. |
| 3800 | * @cpu: the processor in question. | 3249 | * @cpu: the processor in question. |
| 3250 | * | ||
| 3251 | * Return: The idle task for the cpu @cpu. | ||
| 3801 | */ | 3252 | */ |
| 3802 | struct task_struct *idle_task(int cpu) | 3253 | struct task_struct *idle_task(int cpu) |
| 3803 | { | 3254 | { |
| @@ -3807,6 +3258,8 @@ struct task_struct *idle_task(int cpu) | |||
| 3807 | /** | 3258 | /** |
| 3808 | * find_process_by_pid - find a process with a matching PID value. | 3259 | * find_process_by_pid - find a process with a matching PID value. |
| 3809 | * @pid: the pid in question. | 3260 | * @pid: the pid in question. |
| 3261 | * | ||
| 3262 | * The task of @pid, if found. %NULL otherwise. | ||
| 3810 | */ | 3263 | */ |
| 3811 | static struct task_struct *find_process_by_pid(pid_t pid) | 3264 | static struct task_struct *find_process_by_pid(pid_t pid) |
| 3812 | { | 3265 | { |
| @@ -4004,6 +3457,8 @@ recheck: | |||
| 4004 | * @policy: new policy. | 3457 | * @policy: new policy. |
| 4005 | * @param: structure containing the new RT priority. | 3458 | * @param: structure containing the new RT priority. |
| 4006 | * | 3459 | * |
| 3460 | * Return: 0 on success. An error code otherwise. | ||
| 3461 | * | ||
| 4007 | * NOTE that the task may be already dead. | 3462 | * NOTE that the task may be already dead. |
| 4008 | */ | 3463 | */ |
| 4009 | int sched_setscheduler(struct task_struct *p, int policy, | 3464 | int sched_setscheduler(struct task_struct *p, int policy, |
| @@ -4023,6 +3478,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
| 4023 | * current context has permission. For example, this is needed in | 3478 | * current context has permission. For example, this is needed in |
| 4024 | * stop_machine(): we create temporary high priority worker threads, | 3479 | * stop_machine(): we create temporary high priority worker threads, |
| 4025 | * but our caller might not have that capability. | 3480 | * but our caller might not have that capability. |
| 3481 | * | ||
| 3482 | * Return: 0 on success. An error code otherwise. | ||
| 4026 | */ | 3483 | */ |
| 4027 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 3484 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
| 4028 | const struct sched_param *param) | 3485 | const struct sched_param *param) |
| @@ -4057,6 +3514,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
| 4057 | * @pid: the pid in question. | 3514 | * @pid: the pid in question. |
| 4058 | * @policy: new policy. | 3515 | * @policy: new policy. |
| 4059 | * @param: structure containing the new RT priority. | 3516 | * @param: structure containing the new RT priority. |
| 3517 | * | ||
| 3518 | * Return: 0 on success. An error code otherwise. | ||
| 4060 | */ | 3519 | */ |
| 4061 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | 3520 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, |
| 4062 | struct sched_param __user *, param) | 3521 | struct sched_param __user *, param) |
| @@ -4072,6 +3531,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | |||
| 4072 | * sys_sched_setparam - set/change the RT priority of a thread | 3531 | * sys_sched_setparam - set/change the RT priority of a thread |
| 4073 | * @pid: the pid in question. | 3532 | * @pid: the pid in question. |
| 4074 | * @param: structure containing the new RT priority. | 3533 | * @param: structure containing the new RT priority. |
| 3534 | * | ||
| 3535 | * Return: 0 on success. An error code otherwise. | ||
| 4075 | */ | 3536 | */ |
| 4076 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | 3537 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) |
| 4077 | { | 3538 | { |
| @@ -4081,6 +3542,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | |||
| 4081 | /** | 3542 | /** |
| 4082 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 3543 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
| 4083 | * @pid: the pid in question. | 3544 | * @pid: the pid in question. |
| 3545 | * | ||
| 3546 | * Return: On success, the policy of the thread. Otherwise, a negative error | ||
| 3547 | * code. | ||
| 4084 | */ | 3548 | */ |
| 4085 | SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | 3549 | SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) |
| 4086 | { | 3550 | { |
| @@ -4107,6 +3571,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
| 4107 | * sys_sched_getparam - get the RT priority of a thread | 3571 | * sys_sched_getparam - get the RT priority of a thread |
| 4108 | * @pid: the pid in question. | 3572 | * @pid: the pid in question. |
| 4109 | * @param: structure containing the RT priority. | 3573 | * @param: structure containing the RT priority. |
| 3574 | * | ||
| 3575 | * Return: On success, 0 and the RT priority is in @param. Otherwise, an error | ||
| 3576 | * code. | ||
| 4110 | */ | 3577 | */ |
| 4111 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | 3578 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) |
| 4112 | { | 3579 | { |
| @@ -4231,6 +3698,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | |||
| 4231 | * @pid: pid of the process | 3698 | * @pid: pid of the process |
| 4232 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 3699 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
| 4233 | * @user_mask_ptr: user-space pointer to the new cpu mask | 3700 | * @user_mask_ptr: user-space pointer to the new cpu mask |
| 3701 | * | ||
| 3702 | * Return: 0 on success. An error code otherwise. | ||
| 4234 | */ | 3703 | */ |
| 4235 | SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, | 3704 | SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, |
| 4236 | unsigned long __user *, user_mask_ptr) | 3705 | unsigned long __user *, user_mask_ptr) |
| @@ -4282,6 +3751,8 @@ out_unlock: | |||
| 4282 | * @pid: pid of the process | 3751 | * @pid: pid of the process |
| 4283 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 3752 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
| 4284 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 3753 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
| 3754 | * | ||
| 3755 | * Return: 0 on success. An error code otherwise. | ||
| 4285 | */ | 3756 | */ |
| 4286 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | 3757 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, |
| 4287 | unsigned long __user *, user_mask_ptr) | 3758 | unsigned long __user *, user_mask_ptr) |
| @@ -4316,6 +3787,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | |||
| 4316 | * | 3787 | * |
| 4317 | * This function yields the current CPU to other tasks. If there are no | 3788 | * This function yields the current CPU to other tasks. If there are no |
| 4318 | * other threads running on this CPU then this function will return. | 3789 | * other threads running on this CPU then this function will return. |
| 3790 | * | ||
| 3791 | * Return: 0. | ||
| 4319 | */ | 3792 | */ |
| 4320 | SYSCALL_DEFINE0(sched_yield) | 3793 | SYSCALL_DEFINE0(sched_yield) |
| 4321 | { | 3794 | { |
| @@ -4441,7 +3914,7 @@ EXPORT_SYMBOL(yield); | |||
| 4441 | * It's the caller's job to ensure that the target task struct | 3914 | * It's the caller's job to ensure that the target task struct |
| 4442 | * can't go away on us before we can do any checks. | 3915 | * can't go away on us before we can do any checks. |
| 4443 | * | 3916 | * |
| 4444 | * Returns: | 3917 | * Return: |
| 4445 | * true (>0) if we indeed boosted the target task. | 3918 | * true (>0) if we indeed boosted the target task. |
| 4446 | * false (0) if we failed to boost the target. | 3919 | * false (0) if we failed to boost the target. |
| 4447 | * -ESRCH if there's no task to yield to. | 3920 | * -ESRCH if there's no task to yield to. |
| @@ -4544,8 +4017,9 @@ long __sched io_schedule_timeout(long timeout) | |||
| 4544 | * sys_sched_get_priority_max - return maximum RT priority. | 4017 | * sys_sched_get_priority_max - return maximum RT priority. |
| 4545 | * @policy: scheduling class. | 4018 | * @policy: scheduling class. |
| 4546 | * | 4019 | * |
| 4547 | * this syscall returns the maximum rt_priority that can be used | 4020 | * Return: On success, this syscall returns the maximum |
| 4548 | * by a given scheduling class. | 4021 | * rt_priority that can be used by a given scheduling class. |
| 4022 | * On failure, a negative error code is returned. | ||
| 4549 | */ | 4023 | */ |
| 4550 | SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | 4024 | SYSCALL_DEFINE1(sched_get_priority_max, int, policy) |
| 4551 | { | 4025 | { |
| @@ -4569,8 +4043,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | |||
| 4569 | * sys_sched_get_priority_min - return minimum RT priority. | 4043 | * sys_sched_get_priority_min - return minimum RT priority. |
| 4570 | * @policy: scheduling class. | 4044 | * @policy: scheduling class. |
| 4571 | * | 4045 | * |
| 4572 | * this syscall returns the minimum rt_priority that can be used | 4046 | * Return: On success, this syscall returns the minimum |
| 4573 | * by a given scheduling class. | 4047 | * rt_priority that can be used by a given scheduling class. |
| 4048 | * On failure, a negative error code is returned. | ||
| 4574 | */ | 4049 | */ |
| 4575 | SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | 4050 | SYSCALL_DEFINE1(sched_get_priority_min, int, policy) |
| 4576 | { | 4051 | { |
| @@ -4596,6 +4071,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | |||
| 4596 | * | 4071 | * |
| 4597 | * this syscall writes the default timeslice value of a given process | 4072 | * this syscall writes the default timeslice value of a given process |
| 4598 | * into the user-space timespec buffer. A value of '0' means infinity. | 4073 | * into the user-space timespec buffer. A value of '0' means infinity. |
| 4074 | * | ||
| 4075 | * Return: On success, 0 and the timeslice is in @interval. Otherwise, | ||
| 4076 | * an error code. | ||
| 4599 | */ | 4077 | */ |
| 4600 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | 4078 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, |
| 4601 | struct timespec __user *, interval) | 4079 | struct timespec __user *, interval) |
| @@ -4705,7 +4183,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 4705 | debug_show_all_locks(); | 4183 | debug_show_all_locks(); |
| 4706 | } | 4184 | } |
| 4707 | 4185 | ||
| 4708 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) | 4186 | void init_idle_bootup_task(struct task_struct *idle) |
| 4709 | { | 4187 | { |
| 4710 | idle->sched_class = &idle_sched_class; | 4188 | idle->sched_class = &idle_sched_class; |
| 4711 | } | 4189 | } |
| @@ -4718,7 +4196,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle) | |||
| 4718 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 4196 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
| 4719 | * flag, to make booting more robust. | 4197 | * flag, to make booting more robust. |
| 4720 | */ | 4198 | */ |
| 4721 | void __cpuinit init_idle(struct task_struct *idle, int cpu) | 4199 | void init_idle(struct task_struct *idle, int cpu) |
| 4722 | { | 4200 | { |
| 4723 | struct rq *rq = cpu_rq(cpu); | 4201 | struct rq *rq = cpu_rq(cpu); |
| 4724 | unsigned long flags; | 4202 | unsigned long flags; |
| @@ -4960,6 +4438,13 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
| 4960 | */ | 4438 | */ |
| 4961 | rq->stop = NULL; | 4439 | rq->stop = NULL; |
| 4962 | 4440 | ||
| 4441 | /* | ||
| 4442 | * put_prev_task() and pick_next_task() sched | ||
| 4443 | * class method both need to have an up-to-date | ||
| 4444 | * value of rq->clock[_task] | ||
| 4445 | */ | ||
| 4446 | update_rq_clock(rq); | ||
| 4447 | |||
| 4963 | for ( ; ; ) { | 4448 | for ( ; ; ) { |
| 4964 | /* | 4449 | /* |
| 4965 | * There's this thread running, bail when that's the only | 4450 | * There's this thread running, bail when that's the only |
| @@ -5093,7 +4578,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
| 5093 | return table; | 4578 | return table; |
| 5094 | } | 4579 | } |
| 5095 | 4580 | ||
| 5096 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | 4581 | static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) |
| 5097 | { | 4582 | { |
| 5098 | struct ctl_table *entry, *table; | 4583 | struct ctl_table *entry, *table; |
| 5099 | struct sched_domain *sd; | 4584 | struct sched_domain *sd; |
| @@ -5195,7 +4680,7 @@ static void set_rq_offline(struct rq *rq) | |||
| 5195 | * migration_call - callback that gets triggered when a CPU is added. | 4680 | * migration_call - callback that gets triggered when a CPU is added. |
| 5196 | * Here we can start up the necessary migration thread for the new CPU. | 4681 | * Here we can start up the necessary migration thread for the new CPU. |
| 5197 | */ | 4682 | */ |
| 5198 | static int __cpuinit | 4683 | static int |
| 5199 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | 4684 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
| 5200 | { | 4685 | { |
| 5201 | int cpu = (long)hcpu; | 4686 | int cpu = (long)hcpu; |
| @@ -5249,12 +4734,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 5249 | * happens before everything else. This has to be lower priority than | 4734 | * happens before everything else. This has to be lower priority than |
| 5250 | * the notifier in the perf_event subsystem, though. | 4735 | * the notifier in the perf_event subsystem, though. |
| 5251 | */ | 4736 | */ |
| 5252 | static struct notifier_block __cpuinitdata migration_notifier = { | 4737 | static struct notifier_block migration_notifier = { |
| 5253 | .notifier_call = migration_call, | 4738 | .notifier_call = migration_call, |
| 5254 | .priority = CPU_PRI_MIGRATION, | 4739 | .priority = CPU_PRI_MIGRATION, |
| 5255 | }; | 4740 | }; |
| 5256 | 4741 | ||
| 5257 | static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | 4742 | static int sched_cpu_active(struct notifier_block *nfb, |
| 5258 | unsigned long action, void *hcpu) | 4743 | unsigned long action, void *hcpu) |
| 5259 | { | 4744 | { |
| 5260 | switch (action & ~CPU_TASKS_FROZEN) { | 4745 | switch (action & ~CPU_TASKS_FROZEN) { |
| @@ -5267,7 +4752,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | |||
| 5267 | } | 4752 | } |
| 5268 | } | 4753 | } |
| 5269 | 4754 | ||
| 5270 | static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, | 4755 | static int sched_cpu_inactive(struct notifier_block *nfb, |
| 5271 | unsigned long action, void *hcpu) | 4756 | unsigned long action, void *hcpu) |
| 5272 | { | 4757 | { |
| 5273 | switch (action & ~CPU_TASKS_FROZEN) { | 4758 | switch (action & ~CPU_TASKS_FROZEN) { |
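Beyond dropping the `__cpuinit` annotations, these hunks show the era's CPU-hotplug notifier pattern: a notifier_block with a callback and priority, and a switch on `action & ~CPU_TASKS_FROZEN`. A hedged, module-style sketch of that pattern (all names hypothetical, not part of the patch; compiles only against a kernel of this vintage, where register_cpu_notifier() is the registration API):

```c
/* Hypothetical hotplug notifier following the pattern used above. */
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int example_cpu_notify(struct notifier_block *nfb,
			      unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		pr_info("cpu %d came online\n", cpu);
		return NOTIFY_OK;
	case CPU_DOWN_PREPARE:
		pr_info("cpu %d about to go down\n", cpu);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static struct notifier_block example_cpu_nb = {
	.notifier_call	= example_cpu_notify,
	.priority	= 0,
};

static int __init example_init(void)
{
	return register_cpu_notifier(&example_cpu_nb);
}
core_initcall(example_init);
```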
| @@ -5907,7 +5392,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5907 | get_group(cpu, sdd, &sd->groups); | 5392 | get_group(cpu, sdd, &sd->groups); |
| 5908 | atomic_inc(&sd->groups->ref); | 5393 | atomic_inc(&sd->groups->ref); |
| 5909 | 5394 | ||
| 5910 | if (cpu != cpumask_first(sched_domain_span(sd))) | 5395 | if (cpu != cpumask_first(span)) |
| 5911 | return 0; | 5396 | return 0; |
| 5912 | 5397 | ||
| 5913 | lockdep_assert_held(&sched_domains_mutex); | 5398 | lockdep_assert_held(&sched_domains_mutex); |
| @@ -5917,12 +5402,12 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5917 | 5402 | ||
| 5918 | for_each_cpu(i, span) { | 5403 | for_each_cpu(i, span) { |
| 5919 | struct sched_group *sg; | 5404 | struct sched_group *sg; |
| 5920 | int group = get_group(i, sdd, &sg); | 5405 | int group, j; |
| 5921 | int j; | ||
| 5922 | 5406 | ||
| 5923 | if (cpumask_test_cpu(i, covered)) | 5407 | if (cpumask_test_cpu(i, covered)) |
| 5924 | continue; | 5408 | continue; |
| 5925 | 5409 | ||
| 5410 | group = get_group(i, sdd, &sg); | ||
| 5926 | cpumask_clear(sched_group_cpus(sg)); | 5411 | cpumask_clear(sched_group_cpus(sg)); |
| 5927 | sg->sgp->power = 0; | 5412 | sg->sgp->power = 0; |
| 5928 | cpumask_setall(sched_group_mask(sg)); | 5413 | cpumask_setall(sched_group_mask(sg)); |
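The reorder simply defers the get_group() lookup until after the `covered` check, so it is skipped for CPUs already placed in a group. The loop's overall shape, walk the span, skip covered CPUs, build one group per representative CPU and mark all of its members covered, is easier to see in a small userspace analogue (hypothetical and greatly simplified: membership here is just "same group id").

```c
/* Userspace analogue of the grouping loop: CPUs sharing a group id end
 * up in one group; the "covered" mask ensures each group is built once,
 * and the id lookup only happens for uncovered CPUs. */
#include <stdio.h>

#define NR_CPUS 8

static int get_group(int cpu)
{
	return cpu / 2;			/* e.g. SMT siblings share a group */
}

int main(void)
{
	unsigned int covered = 0;

	for (int i = 0; i < NR_CPUS; i++) {
		unsigned int members = 0;

		if (covered & (1u << i))
			continue;	/* already part of an earlier group */

		int group = get_group(i);	/* lookup only when needed */

		for (int j = 0; j < NR_CPUS; j++) {
			if (get_group(j) != group)
				continue;
			members |= 1u << j;
			covered |= 1u << j;
		}
		printf("group %d: members 0x%02x\n", group, members);
	}
	return 0;
}
```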
| @@ -5960,7 +5445,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 5960 | { | 5445 | { |
| 5961 | struct sched_group *sg = sd->groups; | 5446 | struct sched_group *sg = sd->groups; |
| 5962 | 5447 | ||
| 5963 | WARN_ON(!sd || !sg); | 5448 | WARN_ON(!sg); |
| 5964 | 5449 | ||
| 5965 | do { | 5450 | do { |
| 5966 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | 5451 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); |
| @@ -6125,6 +5610,9 @@ static struct sched_domain_topology_level default_topology[] = { | |||
| 6125 | 5610 | ||
| 6126 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 5611 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
| 6127 | 5612 | ||
| 5613 | #define for_each_sd_topology(tl) \ | ||
| 5614 | for (tl = sched_domain_topology; tl->init; tl++) | ||
| 5615 | |||
| 6128 | #ifdef CONFIG_NUMA | 5616 | #ifdef CONFIG_NUMA |
| 6129 | 5617 | ||
| 6130 | static int sched_domains_numa_levels; | 5618 | static int sched_domains_numa_levels; |
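for_each_sd_topology() is a convenience wrapper around the existing "walk the table until the entry with a NULL ->init" idiom; the later hunks convert the three open-coded loops to it. The same sentinel-terminated-table plus iterator-macro pattern in a runnable userspace sketch (names hypothetical):

```c
/* Sentinel-terminated table walked by a small iterator macro, mirroring
 * the for_each_sd_topology() idiom. */
#include <stdio.h>

struct topology_level {
	const char *name;
	int (*init)(const struct topology_level *tl);	/* NULL terminates */
};

#define for_each_topology_level(tl, table) \
	for ((tl) = (table); (tl)->init; (tl)++)

static int init_level(const struct topology_level *tl)
{
	printf("initialising %s level\n", tl->name);
	return 0;
}

static const struct topology_level default_topology[] = {
	{ "SMT",  init_level },
	{ "MC",   init_level },
	{ "DIE",  init_level },
	{ NULL,   NULL },		/* sentinel: ->init == NULL */
};

int main(void)
{
	const struct topology_level *tl;

	for_each_topology_level(tl, default_topology)
		tl->init(tl);
	return 0;
}
```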
| @@ -6422,7 +5910,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
| 6422 | struct sched_domain_topology_level *tl; | 5910 | struct sched_domain_topology_level *tl; |
| 6423 | int j; | 5911 | int j; |
| 6424 | 5912 | ||
| 6425 | for (tl = sched_domain_topology; tl->init; tl++) { | 5913 | for_each_sd_topology(tl) { |
| 6426 | struct sd_data *sdd = &tl->data; | 5914 | struct sd_data *sdd = &tl->data; |
| 6427 | 5915 | ||
| 6428 | sdd->sd = alloc_percpu(struct sched_domain *); | 5916 | sdd->sd = alloc_percpu(struct sched_domain *); |
| @@ -6475,7 +5963,7 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
| 6475 | struct sched_domain_topology_level *tl; | 5963 | struct sched_domain_topology_level *tl; |
| 6476 | int j; | 5964 | int j; |
| 6477 | 5965 | ||
| 6478 | for (tl = sched_domain_topology; tl->init; tl++) { | 5966 | for_each_sd_topology(tl) { |
| 6479 | struct sd_data *sdd = &tl->data; | 5967 | struct sd_data *sdd = &tl->data; |
| 6480 | 5968 | ||
| 6481 | for_each_cpu(j, cpu_map) { | 5969 | for_each_cpu(j, cpu_map) { |
| @@ -6503,9 +5991,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
| 6503 | } | 5991 | } |
| 6504 | 5992 | ||
| 6505 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | 5993 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, |
| 6506 | struct s_data *d, const struct cpumask *cpu_map, | 5994 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
| 6507 | struct sched_domain_attr *attr, struct sched_domain *child, | 5995 | struct sched_domain *child, int cpu) |
| 6508 | int cpu) | ||
| 6509 | { | 5996 | { |
| 6510 | struct sched_domain *sd = tl->init(tl, cpu); | 5997 | struct sched_domain *sd = tl->init(tl, cpu); |
| 6511 | if (!sd) | 5998 | if (!sd) |
| @@ -6516,8 +6003,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
| 6516 | sd->level = child->level + 1; | 6003 | sd->level = child->level + 1; |
| 6517 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | 6004 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
| 6518 | child->parent = sd; | 6005 | child->parent = sd; |
| 6006 | sd->child = child; | ||
| 6519 | } | 6007 | } |
| 6520 | sd->child = child; | ||
| 6521 | set_domain_attribute(sd, attr); | 6008 | set_domain_attribute(sd, attr); |
| 6522 | 6009 | ||
| 6523 | return sd; | 6010 | return sd; |
| @@ -6530,7 +6017,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
| 6530 | static int build_sched_domains(const struct cpumask *cpu_map, | 6017 | static int build_sched_domains(const struct cpumask *cpu_map, |
| 6531 | struct sched_domain_attr *attr) | 6018 | struct sched_domain_attr *attr) |
| 6532 | { | 6019 | { |
| 6533 | enum s_alloc alloc_state = sa_none; | 6020 | enum s_alloc alloc_state; |
| 6534 | struct sched_domain *sd; | 6021 | struct sched_domain *sd; |
| 6535 | struct s_data d; | 6022 | struct s_data d; |
| 6536 | int i, ret = -ENOMEM; | 6023 | int i, ret = -ENOMEM; |
| @@ -6544,18 +6031,15 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
| 6544 | struct sched_domain_topology_level *tl; | 6031 | struct sched_domain_topology_level *tl; |
| 6545 | 6032 | ||
| 6546 | sd = NULL; | 6033 | sd = NULL; |
| 6547 | for (tl = sched_domain_topology; tl->init; tl++) { | 6034 | for_each_sd_topology(tl) { |
| 6548 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); | 6035 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); |
| 6036 | if (tl == sched_domain_topology) | ||
| 6037 | *per_cpu_ptr(d.sd, i) = sd; | ||
| 6549 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | 6038 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) |
| 6550 | sd->flags |= SD_OVERLAP; | 6039 | sd->flags |= SD_OVERLAP; |
| 6551 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | 6040 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) |
| 6552 | break; | 6041 | break; |
| 6553 | } | 6042 | } |
| 6554 | |||
| 6555 | while (sd->child) | ||
| 6556 | sd = sd->child; | ||
| 6557 | |||
| 6558 | *per_cpu_ptr(d.sd, i) = sd; | ||
| 6559 | } | 6043 | } |
| 6560 | 6044 | ||
| 6561 | /* Build the groups for the domains */ | 6045 | /* Build the groups for the domains */ |
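With the s_data parameter gone from build_sched_domain(), the loop above records the per-CPU base domain the moment it builds the first (lowest) topology level, instead of walking back down the child links after the loop. A userspace analogue of that change (hypothetical names; the real code stores the pointer in a per-CPU area):

```c
/* Build a bottom-up chain of "domains" and record the lowest level as
 * the base while building it, rather than descending ->child afterwards. */
#include <stdio.h>
#include <stdlib.h>

struct domain {
	const char *name;
	struct domain *parent;
	struct domain *child;
};

static const char *levels[] = { "SMT", "MC", "DIE" };

int main(void)
{
	struct domain *sd = NULL, *base = NULL;

	for (size_t i = 0; i < sizeof(levels) / sizeof(levels[0]); i++) {
		struct domain *d = calloc(1, sizeof(*d));

		if (!d)
			return 1;
		d->name = levels[i];
		if (sd) {		/* link to the level below */
			sd->parent = d;
			d->child = sd;
		}
		if (i == 0)		/* was: walk sd->child after the loop */
			base = d;
		sd = d;
	}

	for (struct domain *d = base; d; d = d->parent)
		printf("%s%s", d->name, d->parent ? " -> " : "\n");
	return 0;
}
```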
| @@ -6867,9 +6351,6 @@ void __init sched_init_smp(void) | |||
| 6867 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); | 6351 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
| 6868 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); | 6352 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); |
| 6869 | 6353 | ||
| 6870 | /* RT runtime code needs to handle some hotplug events */ | ||
| 6871 | hotcpu_notifier(update_runtime, 0); | ||
| 6872 | |||
| 6873 | init_hrtick(); | 6354 | init_hrtick(); |
| 6874 | 6355 | ||
| 6875 | /* Move init over to a non-isolated CPU */ | 6356 | /* Move init over to a non-isolated CPU */ |
| @@ -7201,6 +6682,8 @@ void normalize_rt_tasks(void) | |||
| 7201 | * @cpu: the processor in question. | 6682 | * @cpu: the processor in question. |
| 7202 | * | 6683 | * |
| 7203 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6684 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
| 6685 | * | ||
| 6686 | * Return: The current task for @cpu. | ||
| 7204 | */ | 6687 | */ |
| 7205 | struct task_struct *curr_task(int cpu) | 6688 | struct task_struct *curr_task(int cpu) |
| 7206 | { | 6689 | { |
