Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	605
1 files changed, 234 insertions, 371 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index d9db3fb17573..a455dca884a6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -119,8 +119,6 @@
  */
 #define RUNTIME_INF ((u64)~0ULL)
 
-static void double_rq_lock(struct rq *rq1, struct rq *rq2);
-
 static inline int rt_policy(int policy)
 {
 	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 
 #else
 
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
-	return 1;
-}
-#endif
-
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 static inline struct task_group *task_group(struct task_struct *p)
 {
@@ -514,14 +505,6 @@ struct root_domain {
 #ifdef CONFIG_SMP
 	struct cpupri cpupri;
 #endif
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	/*
-	 * Preferred wake up cpu nominated by sched_mc balance that will be
-	 * used when most cpus are idle in the system indicating overall very
-	 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
-	 */
-	unsigned int sched_mc_preferred_wakeup_cpu;
-#endif
 };
 
 /*
@@ -646,9 +629,10 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
+static inline
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
+	rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -692,20 +676,15 @@ inline void update_rq_clock(struct rq *rq)
 
 /**
  * runqueue_is_locked
+ * @cpu: the processor in question.
  *
  * Returns true if the current cpu runqueue is locked.
  * This interface allows printk to be called with the runqueue lock
  * held and know whether or not it is OK to wake up the klogd.
  */
-int runqueue_is_locked(void)
+int runqueue_is_locked(int cpu)
 {
-	int cpu = get_cpu();
-	struct rq *rq = cpu_rq(cpu);
-	int ret;
-
-	ret = spin_is_locked(&rq->lock);
-	put_cpu();
-	return ret;
+	return spin_is_locked(&cpu_rq(cpu)->lock);
 }
 
 /*
@@ -802,7 +781,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp)
 	return single_open(filp, sched_feat_show, NULL);
 }
 
-static struct file_operations sched_feat_fops = {
+static const struct file_operations sched_feat_fops = {
 	.open = sched_feat_open,
 	.write = sched_feat_write,
 	.read = seq_read,
@@ -1509,8 +1488,65 @@ static int tg_nop(struct task_group *tg, void *data)
 #endif
 
 #ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+	return cpu_rq(cpu)->load.weight;
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static unsigned long source_load(int cpu, int type)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long total = weighted_cpuload(cpu);
+
+	if (type == 0 || !sched_feat(LB_BIAS))
+		return total;
+
+	return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ */
+static unsigned long target_load(int cpu, int type)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long total = weighted_cpuload(cpu);
+
+	if (type == 0 || !sched_feat(LB_BIAS))
+		return total;
+
+	return max(rq->cpu_load[type-1], total);
+}
+
+static struct sched_group *group_of(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+
+	if (!sd)
+		return NULL;
+
+	return sd->groups;
+}
+
+static unsigned long power_of(int cpu)
+{
+	struct sched_group *group = group_of(cpu);
+
+	if (!group)
+		return SCHED_LOAD_SCALE;
+
+	return group->cpu_power;
+}
+
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1528,11 +1564,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-struct update_shares_data {
-	unsigned long rq_weight[NR_CPUS];
-};
-
-static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+static __read_mostly unsigned long *update_shares_data;
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
@@ -1542,12 +1574,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 static void update_group_shares_cpu(struct task_group *tg, int cpu,
 				    unsigned long sd_shares,
 				    unsigned long sd_rq_weight,
-				    struct update_shares_data *usd)
+				    unsigned long *usd_rq_weight)
 {
 	unsigned long shares, rq_weight;
 	int boost = 0;
 
-	rq_weight = usd->rq_weight[cpu];
+	rq_weight = usd_rq_weight[cpu];
 	if (!rq_weight) {
 		boost = 1;
 		rq_weight = NICE_0_LOAD;
@@ -1582,7 +1614,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
 static int tg_shares_up(struct task_group *tg, void *data)
 {
 	unsigned long weight, rq_weight = 0, shares = 0;
-	struct update_shares_data *usd;
+	unsigned long *usd_rq_weight;
 	struct sched_domain *sd = data;
 	unsigned long flags;
 	int i;
@@ -1591,11 +1623,11 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		return 0;
 
 	local_irq_save(flags);
-	usd = &__get_cpu_var(update_shares_data);
+	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
 
 	for_each_cpu(i, sched_domain_span(sd)) {
 		weight = tg->cfs_rq[i]->load.weight;
-		usd->rq_weight[i] = weight;
+		usd_rq_weight[i] = weight;
 
 		/*
 		 * If there are currently no tasks on the cpu pretend there
@@ -1616,7 +1648,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		shares = tg->shares;
 
 	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
 
 	local_irq_restore(flags);
 
@@ -1695,6 +1727,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #ifdef CONFIG_PREEMPT
 
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
 /*
  * fair double_lock_balance: Safely acquires both rq->locks in a fair
  * way at the expense of forcing extra atomic operations in all
@@ -1959,13 +1993,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 }
 
 #ifdef CONFIG_SMP
-
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
-	return cpu_rq(cpu)->load.weight;
-}
-
 /*
  * Is this task likely cache-hot:
  */
@@ -2023,7 +2050,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
 #endif
-		perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
 				     1, 1, NULL, 0);
 	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2239,185 +2266,6 @@ void kick_process(struct task_struct *p)
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
-
-/*
- * Return a low guess at the load of a migration-source cpu weighted
- * according to the scheduling class and "nice" value.
- *
- * We want to under-estimate the load of migration sources, to
- * balance conservatively.
- */
-static unsigned long source_load(int cpu, int type)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
-
-	if (type == 0 || !sched_feat(LB_BIAS))
-		return total;
-
-	return min(rq->cpu_load[type-1], total);
-}
-
-/*
- * Return a high guess at the load of a migration-target cpu weighted
- * according to the scheduling class and "nice" value.
- */
-static unsigned long target_load(int cpu, int type)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
-
-	if (type == 0 || !sched_feat(LB_BIAS))
-		return total;
-
-	return max(rq->cpu_load[type-1], total);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- */
-static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
-{
-	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
-	unsigned long min_load = ULONG_MAX, this_load = 0;
-	int load_idx = sd->forkexec_idx;
-	int imbalance = 100 + (sd->imbalance_pct-100)/2;
-
-	do {
-		unsigned long load, avg_load;
-		int local_group;
-		int i;
-
-		/* Skip over this group if it has no CPUs allowed */
-		if (!cpumask_intersects(sched_group_cpus(group),
-					&p->cpus_allowed))
-			continue;
-
-		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_cpus(group));
-
-		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
-
-		for_each_cpu(i, sched_group_cpus(group)) {
-			/* Bias balancing toward cpus of our domain */
-			if (local_group)
-				load = source_load(i, load_idx);
-			else
-				load = target_load(i, load_idx);
-
-			avg_load += load;
-		}
-
-		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
-
-		if (local_group) {
-			this_load = avg_load;
-			this = group;
-		} else if (avg_load < min_load) {
-			min_load = avg_load;
-			idlest = group;
-		}
-	} while (group = group->next, group != sd->groups);
-
-	if (!idlest || 100*this_load < imbalance*min_load)
-		return NULL;
-	return idlest;
-}
-
-/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
- */
-static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
-{
-	unsigned long load, min_load = ULONG_MAX;
-	int idlest = -1;
-	int i;
-
-	/* Traverse only the allowed CPUs */
-	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
-		load = weighted_cpuload(i);
-
-		if (load < min_load || (load == min_load && i == this_cpu)) {
-			min_load = load;
-			idlest = i;
-		}
-	}
-
-	return idlest;
-}
-
-/*
- * sched_balance_self: balance the current task (running on cpu) in domains
- * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
- * SD_BALANCE_EXEC.
- *
- * Balance, ie. select the least loaded group.
- *
- * Returns the target CPU number, or the same CPU if no balancing is needed.
- *
- * preempt must be disabled.
- */
-static int sched_balance_self(int cpu, int flag)
-{
-	struct task_struct *t = current;
-	struct sched_domain *tmp, *sd = NULL;
-
-	for_each_domain(cpu, tmp) {
-		/*
-		 * If power savings logic is enabled for a domain, stop there.
-		 */
-		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
-			break;
-		if (tmp->flags & flag)
-			sd = tmp;
-	}
-
-	if (sd)
-		update_shares(sd);
-
-	while (sd) {
-		struct sched_group *group;
-		int new_cpu, weight;
-
-		if (!(sd->flags & flag)) {
-			sd = sd->child;
-			continue;
-		}
-
-		group = find_idlest_group(sd, t, cpu);
-		if (!group) {
-			sd = sd->child;
-			continue;
-		}
-
-		new_cpu = find_idlest_cpu(group, t, cpu);
-		if (new_cpu == -1 || new_cpu == cpu) {
-			/* Now try balancing at a lower domain level of cpu */
-			sd = sd->child;
-			continue;
-		}
-
-		/* Now try balancing at a lower domain level of new_cpu */
-		cpu = new_cpu;
-		weight = cpumask_weight(sched_domain_span(sd));
-		sd = NULL;
-		for_each_domain(cpu, tmp) {
-			if (weight <= cpumask_weight(sched_domain_span(tmp)))
-				break;
-			if (tmp->flags & flag)
-				sd = tmp;
-		}
-		/* while loop will break here if sd == NULL */
-	}
-
-	return cpu;
-}
-
 #endif /* CONFIG_SMP */
 
 /**
@@ -2455,37 +2303,22 @@ void task_oncpu_function_call(struct task_struct *p,
  *
  * returns failure only if the task is already active.
  */
-static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
+static int try_to_wake_up(struct task_struct *p, unsigned int state,
+			  int wake_flags)
 {
 	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
-	long old_state;
-	struct rq *rq;
+	struct rq *rq, *orig_rq;
 
 	if (!sched_feat(SYNC_WAKEUPS))
-		sync = 0;
-
-#ifdef CONFIG_SMP
-	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
-		struct sched_domain *sd;
-
-		this_cpu = raw_smp_processor_id();
-		cpu = task_cpu(p);
+		wake_flags &= ~WF_SYNC;
 
-		for_each_domain(this_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				update_shares(sd);
-				break;
-			}
-		}
-	}
-#endif
+	this_cpu = get_cpu();
 
 	smp_wmb();
-	rq = task_rq_lock(p, &flags);
+	rq = orig_rq = task_rq_lock(p, &flags);
 	update_rq_clock(rq);
-	old_state = p->state;
-	if (!(old_state & state))
+	if (!(p->state & state))
 		goto out;
 
 	if (p->se.on_rq)
@@ -2493,27 +2326,33 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 	cpu = task_cpu(p);
 	orig_cpu = cpu;
-	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-	cpu = p->sched_class->select_task_rq(p, sync);
-	if (cpu != orig_cpu) {
+	/*
+	 * In order to handle concurrent wakeups and release the rq->lock
+	 * we put the task in TASK_WAKING state.
+	 *
+	 * First fix up the nr_uninterruptible count:
+	 */
+	if (task_contributes_to_load(p))
+		rq->nr_uninterruptible--;
+	p->state = TASK_WAKING;
+	task_rq_unlock(rq, &flags);
+
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+	if (cpu != orig_cpu)
 		set_task_cpu(p, cpu);
-		task_rq_unlock(rq, &flags);
-		/* might preempt at this point */
-		rq = task_rq_lock(p, &flags);
-		old_state = p->state;
-		if (!(old_state & state))
-			goto out;
-		if (p->se.on_rq)
-			goto out_running;
 
-		this_cpu = smp_processor_id();
-		cpu = task_cpu(p);
-	}
+	rq = task_rq_lock(p, &flags);
+
+	if (rq != orig_rq)
+		update_rq_clock(rq);
+
+	WARN_ON(p->state != TASK_WAKING);
+	cpu = task_cpu(p);
 
 #ifdef CONFIG_SCHEDSTATS
 	schedstat_inc(rq, ttwu_count);
@@ -2533,7 +2372,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 out_activate:
 #endif /* CONFIG_SMP */
 	schedstat_inc(p, se.nr_wakeups);
-	if (sync)
+	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.nr_wakeups_sync);
 	if (orig_cpu != cpu)
 		schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2562,7 +2401,7 @@ out_activate:
 
 out_running:
 	trace_sched_wakeup(rq, p, success);
-	check_preempt_curr(rq, p, sync);
+	check_preempt_curr(rq, p, wake_flags);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2571,6 +2410,7 @@ out_running:
 #endif
 out:
 	task_rq_unlock(rq, &flags);
+	put_cpu();
 
 	return success;
 }
@@ -2613,6 +2453,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.avg_overlap = 0;
 	p->se.start_runtime = 0;
 	p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
+	p->se.avg_running = 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start = 0;
@@ -2674,28 +2515,18 @@ void sched_fork(struct task_struct *p, int clone_flags)
 
 	__sched_fork(p);
 
-#ifdef CONFIG_SMP
-	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
-#endif
-	set_task_cpu(p, cpu);
-
-	/*
-	 * Make sure we do not leak PI boosting priority to the child.
-	 */
-	p->prio = current->normal_prio;
-
 	/*
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
 			p->policy = SCHED_NORMAL;
-
-		if (p->normal_prio < DEFAULT_PRIO)
-			p->prio = DEFAULT_PRIO;
+			p->normal_prio = p->static_prio;
+		}
 
 		if (PRIO_TO_NICE(p->static_prio) < 0) {
 			p->static_prio = NICE_TO_PRIO(0);
+			p->normal_prio = p->static_prio;
 			set_load_weight(p);
 		}
 
@@ -2706,9 +2537,19 @@ void sched_fork(struct task_struct *p, int clone_flags)
 		p->sched_reset_on_fork = 0;
 	}
 
+	/*
+	 * Make sure we do not leak PI boosting priority to the child.
+	 */
+	p->prio = current->normal_prio;
+
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
+#ifdef CONFIG_SMP
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+#endif
+	set_task_cpu(p, cpu);
+
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2741,8 +2582,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	BUG_ON(p->state != TASK_RUNNING);
 	update_rq_clock(rq);
 
-	p->prio = effective_prio(p);
-
 	if (!p->sched_class->task_new || !current->se.on_rq) {
 		activate_task(rq, p, 0);
 	} else {
@@ -2754,7 +2593,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		inc_nr_running(rq);
 	}
 	trace_sched_wakeup_new(rq, p, 1);
-	check_preempt_curr(rq, p, 0);
+	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
@@ -2878,7 +2717,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
-	perf_counter_task_sched_in(current, cpu_of(rq));
+	perf_event_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 
 	fire_sched_in_preempt_notifiers(current);
@@ -3064,6 +2903,19 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
+unsigned long nr_iowait_cpu(void)
+{
+	struct rq *this = this_rq();
+	return atomic_read(&this->nr_iowait);
+}
+
+unsigned long this_cpu_load(void)
+{
+	struct rq *this = this_rq();
+	return this->cpu_load[0];
+}
+
+
 /* Variables and functions for calc_load */
 static atomic_long_t calc_load_tasks;
 static unsigned long calc_load_update;
@@ -3263,7 +3115,7 @@ out:
 void sched_exec(void)
 {
 	int new_cpu, this_cpu = get_cpu();
-	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
+	new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
 	put_cpu();
 	if (new_cpu != this_cpu)
 		sched_migrate_task(current, new_cpu);
@@ -3683,11 +3535,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 	*imbalance = sds->min_load_per_task;
 	sds->busiest = sds->group_min;
 
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-			group_first_cpu(sds->group_leader);
-	}
-
 	return 1;
 
 }
@@ -3711,7 +3558,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return SCHED_LOAD_SCALE;
+}
+
+unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return default_scale_freq_power(sd, cpu);
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
 {
 	unsigned long weight = cpumask_weight(sched_domain_span(sd));
 	unsigned long smt_gain = sd->smt_gain;
@@ -3721,6 +3579,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
 	return smt_gain;
 }
 
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+	return default_scale_smt_power(sd, cpu);
+}
+
 unsigned long scale_rt_power(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -3745,10 +3608,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 	unsigned long power = SCHED_LOAD_SCALE;
 	struct sched_group *sdg = sd->groups;
 
-	/* here we could scale based on cpufreq */
+	if (sched_feat(ARCH_POWER))
+		power *= arch_scale_freq_power(sd, cpu);
+	else
+		power *= default_scale_freq_power(sd, cpu);
+
+	power >>= SCHED_LOAD_SHIFT;
 
 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
-		power *= arch_scale_smt_power(sd, cpu);
+		if (sched_feat(ARCH_POWER))
+			power *= arch_scale_smt_power(sd, cpu);
+		else
+			power *= default_scale_smt_power(sd, cpu);
+
 		power >>= SCHED_LOAD_SHIFT;
 	}
 
@@ -3785,6 +3657,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: The sched_domain whose statistics are to be updated.
  * @group: sched_group whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
@@ -4161,26 +4034,6 @@ ret:
 	return NULL;
 }
 
-static struct sched_group *group_of(int cpu)
-{
-	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
-
-	if (!sd)
-		return NULL;
-
-	return sd->groups;
-}
-
-static unsigned long power_of(int cpu)
-{
-	struct sched_group *group = group_of(cpu);
-
-	if (!group)
-		return SCHED_LOAD_SCALE;
-
-	return group->cpu_power;
-}
-
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
@@ -5239,17 +5092,16 @@ void account_idle_time(cputime_t cputime)
  */
 void account_process_tick(struct task_struct *p, int user_tick)
 {
-	cputime_t one_jiffy = jiffies_to_cputime(1);
-	cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();
 
 	if (user_tick)
-		account_user_time(p, one_jiffy, one_jiffy_scaled);
+		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
-		account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
 				    one_jiffy_scaled);
 	else
-		account_idle_time(one_jiffy);
+		account_idle_time(cputime_one_jiffy);
 }
 
 /*
@@ -5353,7 +5205,7 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	spin_unlock(&rq->lock);
 
-	perf_counter_task_tick(curr, cpu);
+	perf_event_task_tick(curr, cpu);
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
@@ -5465,14 +5317,13 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
+static void put_prev_task(struct rq *rq, struct task_struct *p)
 {
-	if (prev->state == TASK_RUNNING) {
-		u64 runtime = prev->se.sum_exec_runtime;
+	u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
 
-		runtime -= prev->se.prev_sum_exec_runtime;
-		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+	update_avg(&p->se.avg_running, runtime);
 
+	if (p->state == TASK_RUNNING) {
 		/*
 		 * In order to avoid avg_overlap growing stale when we are
 		 * indeed overlapping and hence not getting put to sleep, grow
@@ -5482,9 +5333,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
 		 * correlates to the amount of cache footprint a task can
 		 * build up.
 		 */
-		update_avg(&prev->se.avg_overlap, runtime);
+		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+		update_avg(&p->se.avg_overlap, runtime);
+	} else {
+		update_avg(&p->se.avg_running, 0);
 	}
-	prev->sched_class->put_prev_task(rq, prev);
+	p->sched_class->put_prev_task(rq, p);
 }
 
 /*
@@ -5567,7 +5421,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
-		perf_counter_task_sched_out(prev, next, cpu);
+		perf_event_task_sched_out(prev, next, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
@@ -5716,10 +5570,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
 
 #endif /* CONFIG_PREEMPT */
 
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
-	return try_to_wake_up(curr->private, mode, sync);
+	return try_to_wake_up(curr->private, mode, wake_flags);
 }
 EXPORT_SYMBOL(default_wake_function);
 
@@ -5733,14 +5587,14 @@ EXPORT_SYMBOL(default_wake_function);
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-			int nr_exclusive, int sync, void *key)
+			int nr_exclusive, int wake_flags, void *key)
 {
 	wait_queue_t *curr, *next;
 
 	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
 		unsigned flags = curr->flags;
 
-		if (curr->func(curr, mode, sync, key) &&
+		if (curr->func(curr, mode, wake_flags, key) &&
 				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
 	}
@@ -5801,16 +5655,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
 {
 	unsigned long flags;
-	int sync = 1;
+	int wake_flags = WF_SYNC;
 
 	if (unlikely(!q))
 		return;
 
 	if (unlikely(!nr_exclusive))
-		sync = 0;
+		wake_flags = 0;
 
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive, sync, key);
+	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -6866,9 +6720,6 @@ EXPORT_SYMBOL(yield);
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
- *
- * But don't do that if it is a deliberate, throttling IO wait (this task
- * has set its backing_dev_info: the queue against which it should throttle)
  */
 void __sched io_schedule(void)
 {
@@ -6977,23 +6828,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 	if (retval)
 		goto out_unlock;
 
-	/*
-	 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
-	 * tasks that are on an otherwise idle runqueue:
-	 */
-	time_slice = 0;
-	if (p->policy == SCHED_RR) {
-		time_slice = DEF_TIMESLICE;
-	} else if (p->policy != SCHED_FIFO) {
-		struct sched_entity *se = &p->se;
-		unsigned long flags;
-		struct rq *rq;
+	time_slice = p->sched_class->get_rr_interval(p);
 
-		rq = task_rq_lock(p, &flags);
-		if (rq->cfs.load.weight)
-			time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-		task_rq_unlock(rq, &flags);
-	}
 	read_unlock(&tasklist_lock);
 	jiffies_to_timespec(time_slice, &t);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -7844,7 +7680,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 /*
  * Register at high priority so that task migration (migrate_all_tasks)
  * happens before everything else. This has to be lower priority than
- * the notifier in the perf_counter subsystem, though.
+ * the notifier in the perf_event subsystem, though.
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
@@ -8000,9 +7836,7 @@ static int sd_degenerate(struct sched_domain *sd)
 	}
 
 	/* Following flags don't use groups */
-	if (sd->flags & (SD_WAKE_IDLE |
-			 SD_WAKE_AFFINE |
-			 SD_WAKE_BALANCE))
+	if (sd->flags & (SD_WAKE_AFFINE))
 		return 0;
 
 	return 1;
@@ -8019,10 +7853,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
 		return 0;
 
-	/* Does parent contain flags not in child? */
-	/* WAKE_BALANCE is a subset of WAKE_AFFINE */
-	if (cflags & SD_WAKE_AFFINE)
-		pflags &= ~SD_WAKE_BALANCE;
 	/* Flags needing groups don't count if only 1 group in parent */
 	if (parent->groups == parent->groups->next) {
 		pflags &= ~(SD_LOAD_BALANCE |
@@ -8708,10 +8538,10 @@ static void set_domain_attribute(struct sched_domain *sd,
 		request = attr->relax_domain_level;
 	if (request < sd->level) {
 		/* turn off idle balance on this domain */
-		sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
+		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	} else {
 		/* turn on idle balance on this domain */
-		sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
+		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
 }
 
@@ -9329,6 +9159,7 @@ void __init sched_init_smp(void)
 	cpumask_var_t non_isolated_cpus;
 
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
+	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
 #if defined(CONFIG_NUMA)
 	sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -9360,7 +9191,6 @@ void __init sched_init_smp(void)
 	sched_init_granularity();
 	free_cpumask_var(non_isolated_cpus);
 
-	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 	init_sched_rt_class();
 }
 #else
@@ -9573,6 +9403,10 @@ void __init sched_init(void)
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_GROUP_SCHED */
 
+#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
+	update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
+					    __alignof__(unsigned long));
+#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -9707,7 +9541,7 @@ void __init sched_init(void)
 	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
-	perf_counter_init();
+	perf_event_init();
 
 	scheduler_running = 1;
 }
@@ -10479,7 +10313,7 @@ static int sched_rt_global_constraints(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 int sched_rt_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int ret;
@@ -10490,7 +10324,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
 	old_period = sysctl_sched_rt_period;
 	old_runtime = sysctl_sched_rt_runtime;
 
-	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (!ret && write) {
 		ret = sched_rt_global_constraints();
@@ -10544,8 +10378,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 }
 
 static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-		      struct task_struct *tsk)
+cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10555,15 +10388,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	if (tsk->sched_class != &fair_sched_class)
 		return -EINVAL;
 #endif
+	return 0;
+}
 
+static int
+cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		      struct task_struct *tsk, bool threadgroup)
+{
+	int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
+	if (retval)
+		return retval;
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+			retval = cpu_cgroup_can_attach_task(cgrp, c);
+			if (retval) {
+				rcu_read_unlock();
+				return retval;
+			}
+		}
+		rcu_read_unlock();
+	}
 	return 0;
 }
 
 static void
 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-		  struct cgroup *old_cont, struct task_struct *tsk)
+		  struct cgroup *old_cont, struct task_struct *tsk,
+		  bool threadgroup)
 {
 	sched_move_task(tsk);
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+			sched_move_task(c);
+		}
+		rcu_read_unlock();
+	}
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED