Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	492
1 file changed, 342 insertions(+), 150 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index cbb3a0eee58e..ccacdbdecf45 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
@@ -124,7 +127,7 @@
 
 static inline int rt_policy(int policy)
 {
-	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+	if (policy == SCHED_FIFO || policy == SCHED_RR)
 		return 1;
 	return 0;
 }
@@ -292,8 +295,8 @@ static DEFINE_SPINLOCK(task_group_lock);
  * (The default weight is 1024 - so there's no practical
  * limitation from this.)
  */
-#define MIN_SHARES	2
-#define MAX_SHARES	(1UL << (18 + SCHED_LOAD_RESOLUTION))
+#define MIN_SHARES	(1UL << 1)
+#define MAX_SHARES	(1UL << 18)
 
 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
 #endif
@@ -422,6 +425,7 @@ struct rt_rq {
  */
 struct root_domain {
 	atomic_t refcount;
+	atomic_t rto_count;
 	struct rcu_head rcu;
 	cpumask_var_t span;
 	cpumask_var_t online;
@@ -431,7 +435,6 @@ struct root_domain {
 	 * one runnable RT task.
 	 */
 	cpumask_var_t rto_mask;
-	atomic_t rto_count;
 	struct cpupri cpupri;
 };
 
@@ -528,6 +531,12 @@ struct rq {
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64 prev_irq_time;
 #endif
+#ifdef CONFIG_PARAVIRT
+	u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	u64 prev_steal_time_rq;
+#endif
 
 	/* calc_load related fields */
 	unsigned long calc_load_update;
@@ -581,7 +590,6 @@ static inline int cpu_of(struct rq *rq)
 
 #define rcu_dereference_check_sched_domain(p) \
 	rcu_dereference_check((p), \
-			      rcu_read_lock_held() || \
 			      lockdep_is_held(&sched_domains_mutex))
 
 /*
@@ -605,10 +613,10 @@ static inline int cpu_of(struct rq *rq)
 /*
  * Return the group to which this tasks belongs.
  *
- * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
- * holds that lock for each task it moves into the cgroup. Therefore
- * by holding that lock, we pin the task to the current cgroup.
+ * We use task_subsys_state_check() and extend the RCU verification with
+ * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
+ * task it moves into the cgroup. Therefore by holding either of those locks,
+ * we pin the task to the current cgroup.
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
@@ -616,7 +624,8 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct cgroup_subsys_state *css;
 
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
			lockdep_is_held(&p->pi_lock) ||
+			lockdep_is_held(&task_rq(p)->lock));
 	tg = container_of(css, struct task_group, css);
 
 	return autogroup_task_group(p, tg);
@@ -1567,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	return rq->avg_load_per_task;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parents load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
-	unsigned long load;
-	long cpu = (long)data;
-
-	if (!tg->parent) {
-		load = cpu_rq(cpu)->load.weight;
-	} else {
-		load = tg->parent->cfs_rq[cpu]->h_load;
-		load *= tg->se[cpu]->load.weight;
-		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
-	}
-
-	tg->cfs_rq[cpu]->h_load = load;
-
-	return 0;
-}
-
-static void update_h_load(long cpu)
-{
-	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
-
-#endif
-
 #ifdef CONFIG_PREEMPT
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1952,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
 {
-	s64 irq_delta;
+	if (unlikely(steal > NSEC_PER_SEC))
+		return div_u64(steal, TICK_NSEC);
 
+	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
 	/*
@@ -1978,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	if (static_branch((&paravirt_steal_rq_enabled))) {
+		u64 st;
+
+		steal = paravirt_steal_clock(cpu_of(rq));
+		steal -= rq->prev_steal_time_rq;
+
+		if (unlikely(steal > delta))
+			steal = delta;
+
+		st = steal_ticks(steal);
+		steal = st * TICK_NSEC;
+
+		rq->prev_steal_time_rq += steal;
+
+		delta -= steal;
+	}
+#endif
+
 	rq->clock_task += delta;
 
-	if (irq_delta && sched_feat(NONIRQ_POWER))
-		sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+		sched_rt_avg_update(rq, irq_delta + steal);
+#endif
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 static int irqtime_account_hi_update(void)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2018,12 +2036,7 @@ static int irqtime_account_si_update(void)
 
 #define sched_clock_irqtime (0)
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-	rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2200,6 +2213,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
 
 #ifdef CONFIG_LOCKDEP
+	/*
+	 * The caller should hold either p->pi_lock or rq->lock, when changing
+	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+	 *
+	 * sched_move_task() holds both and thus holding either pins the cgroup,
+	 * see set_task_rq().
+	 *
+	 * Furthermore, all task_rq users should acquire both locks, see
+	 * task_rq_lock().
+	 */
 	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
 				      lockdep_is_held(&task_rq(p)->lock)));
 #endif
@@ -2209,7 +2232,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	if (task_cpu(p) != new_cpu) {
 		p->se.nr_migrations++;
-		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
 	}
 
 	__set_task_cpu(p, new_cpu);
@@ -2447,6 +2470,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 		}
 		rcu_read_unlock();
 	}
+
+	if (wake_flags & WF_MIGRATED)
+		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+
 #endif /* CONFIG_SMP */
 
 	schedstat_inc(rq, ttwu_count);
@@ -2455,9 +2482,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.statistics.nr_wakeups_sync);
 
-	if (cpu != task_cpu(p))
-		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
 #endif /* CONFIG_SCHEDSTATS */
 }
 
@@ -2485,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 
-	if (unlikely(rq->idle_stamp)) {
+	if (rq->idle_stamp) {
 		u64 delta = rq->clock - rq->idle_stamp;
 		u64 max = 2*sysctl_sched_migration_cost;
 
@@ -2532,13 +2556,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+static void sched_ttwu_do_pending(struct task_struct *list)
 {
 	struct rq *rq = this_rq();
-	struct task_struct *list = xchg(&rq->wake_list, NULL);
-
-	if (!list)
-		return;
 
 	raw_spin_lock(&rq->lock);
 
@@ -2551,9 +2571,45 @@ static void sched_ttwu_pending(void)
 	raw_spin_unlock(&rq->lock);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void sched_ttwu_pending(void)
+{
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	sched_ttwu_do_pending(list);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
 void scheduler_ipi(void)
 {
-	sched_ttwu_pending();
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	/*
+	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+	 * traditionally all their work was done from the interrupt return
+	 * path. Now that we actually do some work, we need to make sure
+	 * we do call them.
+	 *
+	 * Some archs already do call them, luckily irq_enter/exit nest
+	 * properly.
+	 *
+	 * Arguably we should visit all archs and update all handlers,
+	 * however a fair share of IPIs are still resched only so this would
+	 * somewhat pessimize the simple resched case.
+	 */
+	irq_enter();
+	sched_ttwu_do_pending(list);
+	irq_exit();
 }
 
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
@@ -2600,6 +2656,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 
 #if defined(CONFIG_SMP)
 	if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+		sched_clock_cpu(cpu); /* sync clocks x-cpu */
 		ttwu_queue_remote(p, cpu);
 		return;
 	}
@@ -2674,8 +2731,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		p->sched_class->task_waking(p);
 
 	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (task_cpu(p) != cpu)
+	if (task_cpu(p) != cpu) {
+		wake_flags |= WF_MIGRATED;
 		set_task_cpu(p, cpu);
+	}
 #endif /* CONFIG_SMP */
 
 	ttwu_queue(p, cpu);
@@ -2839,7 +2898,7 @@ void sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP)
 	p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
@@ -3830,6 +3889,25 @@ void account_idle_time(cputime_t cputime)
 		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 }
 
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+	if (static_branch(&paravirt_steal_enabled)) {
+		u64 steal, st = 0;
+
+		steal = paravirt_steal_clock(smp_processor_id());
+		steal -= this_rq()->prev_steal_time;
+
+		st = steal_ticks(steal);
+		this_rq()->prev_steal_time += st * TICK_NSEC;
+
+		account_steal_time(st);
+		return st;
+	}
+#endif
+	return false;
+}
+
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3861,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
+	if (steal_account_process_tick())
+		return;
+
 	if (irqtime_account_hi_update()) {
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	} else if (irqtime_account_si_update()) {
@@ -3914,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
 		return;
 	}
 
+	if (steal_account_process_tick())
+		return;
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -4291,11 +4375,8 @@ EXPORT_SYMBOL(schedule);
 
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 {
-	bool ret = false;
-
-	rcu_read_lock();
 	if (lock->owner != owner)
-		goto fail;
+		return false;
 
 	/*
 	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4305,11 +4386,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 	 */
 	barrier();
 
-	ret = owner->on_cpu;
-fail:
-	rcu_read_unlock();
-
-	return ret;
+	return owner->on_cpu;
 }
 
 /*
@@ -4321,21 +4398,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 	if (!sched_feat(OWNER_SPIN))
 		return 0;
 
+	rcu_read_lock();
 	while (owner_running(lock, owner)) {
 		if (need_resched())
-			return 0;
+			break;
 
 		arch_mutex_cpu_relax();
 	}
+	rcu_read_unlock();
 
 	/*
-	 * If the owner changed to another task there is likely
-	 * heavy contention, stop spinning.
+	 * We break out the loop above on need_resched() and when the
+	 * owner changed, which is a sign for heavy contention. Return
+	 * success only when lock->owner is NULL.
 	 */
-	if (lock->owner)
-		return 0;
-
-	return 1;
+	return lock->owner == NULL;
 }
 #endif
 
@@ -6542,7 +6619,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!group->cpu_power) {
+		if (!group->sgp->power) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -6566,9 +6643,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
 		printk(KERN_CONT " %s", str);
-		if (group->cpu_power != SCHED_POWER_SCALE) {
+		if (group->sgp->power != SCHED_POWER_SCALE) {
 			printk(KERN_CONT " (cpu_power = %d)",
-				group->cpu_power);
+				group->sgp->power);
 		}
 
 		group = group->next;
@@ -6759,11 +6836,39 @@ static struct root_domain *alloc_rootdomain(void)
 	return rd;
 }
 
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+	struct sched_group *tmp, *first;
+
+	if (!sg)
+		return;
+
+	first = sg;
+	do {
+		tmp = sg->next;
+
+		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+			kfree(sg->sgp);
+
+		kfree(sg);
+		sg = tmp;
+	} while (sg != first);
+}
+
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref))
+
+	/*
+	 * If its an overlapping domain it has private groups, iterate and
+	 * nuke them all.
+	 */
+	if (sd->flags & SD_OVERLAP) {
+		free_sched_groups(sd->groups, 1);
+	} else if (atomic_dec_and_test(&sd->groups->ref)) {
+		kfree(sd->groups->sgp);
 		kfree(sd->groups);
+	}
 	kfree(sd);
 }
 
@@ -6930,6 +7035,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
+	struct sched_group_power **__percpu sgp;
 };
 
 struct s_data {
@@ -6949,15 +7055,73 @@ struct sched_domain_topology_level;
 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 
+#define SDTL_OVERLAP	0x01
+
 struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
+	int flags;
 	struct sd_data data;
 };
 
-/*
- * Assumes the sched_domain tree is fully constructed
- */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered = sched_domains_tmpmask;
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *child;
+	int i;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct cpumask *sg_span;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				GFP_KERNEL, cpu_to_node(i));
+
+		if (!sg)
+			goto fail;
+
+		sg_span = sched_group_cpus(sg);
+
+		child = *per_cpu_ptr(sdd->sd, i);
+		if (child->child) {
+			child = child->child;
+			cpumask_copy(sg_span, sched_domain_span(child));
+		} else
+			cpumask_set_cpu(i, sg_span);
+
+		cpumask_or(covered, covered, sg_span);
+
+		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+		atomic_inc(&sg->sgp->ref);
+
+		if (cpumask_test_cpu(cpu, sg_span))
+			groups = sg;
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+		last->next = first;
+	}
+	sd->groups = groups;
+
+	return 0;
+
+fail:
+	free_sched_groups(first, 0);
+
+	return -ENOMEM;
+}
+
 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 {
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -6966,24 +7130,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
 
-	if (sg)
+	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
+		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+	}
 
 	return cpu;
 }
 
 /*
- * build_sched_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
  * build_sched_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
  */
-static void
-build_sched_groups(struct sched_domain *sd)
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
 {
 	struct sched_group *first = NULL, *last = NULL;
 	struct sd_data *sdd = sd->private;
@@ -6991,6 +7155,12 @@ build_sched_groups(struct sched_domain *sd)
 	struct cpumask *covered;
 	int i;
 
+	get_group(cpu, sdd, &sd->groups);
+	atomic_inc(&sd->groups->ref);
+
+	if (cpu != cpumask_first(sched_domain_span(sd)))
+		return 0;
+
 	lockdep_assert_held(&sched_domains_mutex);
 	covered = sched_domains_tmpmask;
 
@@ -7005,7 +7175,7 @@ build_sched_groups(struct sched_domain *sd)
 			continue;
 
 		cpumask_clear(sched_group_cpus(sg));
-		sg->cpu_power = 0;
+		sg->sgp->power = 0;
 
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
@@ -7022,6 +7192,8 @@ build_sched_groups(struct sched_domain *sd)
 		last = sg;
 	}
 	last->next = first;
+
+	return 0;
 }
 
 /*
@@ -7036,12 +7208,17 @@ build_sched_groups(struct sched_domain *sd)
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
-	WARN_ON(!sd || !sd->groups);
+	struct sched_group *sg = sd->groups;
 
-	if (cpu != group_first_cpu(sd->groups))
-		return;
+	WARN_ON(!sd || !sg);
 
-	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+	do {
+		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+		sg = sg->next;
+	} while (sg != sd->groups);
+
+	if (cpu != group_first_cpu(sg))
+		return;
 
 	update_group_power(sd, cpu);
 }
@@ -7162,15 +7339,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 static void claim_allocations(int cpu, struct sched_domain *sd)
 {
 	struct sd_data *sdd = sd->private;
-	struct sched_group *sg = sd->groups;
 
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
 
-	if (cpu == cpumask_first(sched_group_cpus(sg))) {
-		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
-	}
+
+	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -7195,7 +7372,7 @@ static struct sched_domain_topology_level default_topology[] = {
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
 #ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, },
+	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
 	{ sd_init_ALLNODES, cpu_allnodes_mask, },
 #endif
 	{ NULL, },
@@ -7219,9 +7396,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 		if (!sdd->sg)
 			return -ENOMEM;
 
+		sdd->sgp = alloc_percpu(struct sched_group_power *);
+		if (!sdd->sgp)
+			return -ENOMEM;
+
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
 			struct sched_group *sg;
+			struct sched_group_power *sgp;
 
 			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
@@ -7236,6 +7418,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 				return -ENOMEM;
 
 			*per_cpu_ptr(sdd->sg, j) = sg;
+
+			sgp = kzalloc_node(sizeof(struct sched_group_power),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sgp)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sgp, j) = sgp;
 		}
 	}
 
@@ -7251,11 +7440,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		struct sd_data *sdd = &tl->data;
 
 		for_each_cpu(j, cpu_map) {
-			kfree(*per_cpu_ptr(sdd->sd, j));
+			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd && (sd->flags & SD_OVERLAP))
+				free_sched_groups(sd->groups, 0);
 			kfree(*per_cpu_ptr(sdd->sg, j));
+			kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
 		free_percpu(sdd->sd);
 		free_percpu(sdd->sg);
+		free_percpu(sdd->sgp);
 	}
 }
 
@@ -7301,8 +7494,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 		struct sched_domain_topology_level *tl;
 
 		sd = NULL;
-		for (tl = sched_domain_topology; tl->init; tl++)
+		for (tl = sched_domain_topology; tl->init; tl++) {
 			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+				sd->flags |= SD_OVERLAP;
+			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+				break;
+		}
 
 		while (sd->child)
 			sd = sd->child;
@@ -7314,13 +7512,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
-			get_group(i, sd->private, &sd->groups);
-			atomic_inc(&sd->groups->ref);
-
-			if (i != cpumask_first(sched_domain_span(sd)))
-				continue;
-
-			build_sched_groups(sd);
+			if (sd->flags & SD_OVERLAP) {
+				if (build_overlap_sched_groups(sd, i))
+					goto error;
+			} else {
+				if (build_sched_groups(sd, i))
+					goto error;
+			}
 		}
 	}
 
@@ -7730,18 +7928,14 @@ int in_sched_functions(unsigned long addr)
 		&& addr < (unsigned long)__sched_text_end);
 }
 
-static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+static void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
 	INIT_LIST_HEAD(&cfs_rq->tasks);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	cfs_rq->rq = rq;
-	/* allow initial update_cfs_load() to truncate */
-#ifdef CONFIG_SMP
-	cfs_rq->load_stamp = 1;
-#endif
-#endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
 }
 
 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -7757,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	/* delimiter for bitsearch: */
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
 	rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#ifdef CONFIG_SMP
 	rt_rq->highest_prio.next = MAX_RT_PRIO;
-#endif
-#endif
-#ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
-	plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
+	plist_head_init(&rt_rq->pushable_tasks);
 #endif
 
 	rt_rq->rt_time = 0;
 	rt_rq->rt_throttled = 0;
 	rt_rq->rt_runtime = 0;
 	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-
-#ifdef CONFIG_RT_GROUP_SCHED
-	rt_rq->rt_nr_boosted = 0;
-	rt_rq->rq = rq;
-#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7786,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 				struct sched_entity *parent)
 {
 	struct rq *rq = cpu_rq(cpu);
-	tg->cfs_rq[cpu] = cfs_rq;
-	init_cfs_rq(cfs_rq, rq);
+
 	cfs_rq->tg = tg;
+	cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+	/* allow initial update_cfs_load() to truncate */
+	cfs_rq->load_stamp = 1;
+#endif
 
+	tg->cfs_rq[cpu] = cfs_rq;
 	tg->se[cpu] = se;
+
 	/* se could be NULL for root_task_group */
 	if (!se)
 		return;
@@ -7813,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 {
 	struct rq *rq = cpu_rq(cpu);
 
-	tg->rt_rq[cpu] = rt_rq;
-	init_rt_rq(rt_rq, rq);
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+	rt_rq->rt_nr_boosted = 0;
+	rt_rq->rq = rq;
 	rt_rq->tg = tg;
-	rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 
+	tg->rt_rq[cpu] = rt_rq;
 	tg->rt_se[cpu] = rt_se;
+
 	if (!rt_se)
 		return;
 
@@ -7900,7 +8093,7 @@ void __init sched_init(void)
 		rq->nr_running = 0;
 		rq->calc_load_active = 0;
 		rq->calc_load_update = jiffies + LOAD_FREQ;
-		init_cfs_rq(&rq->cfs, rq);
+		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = root_task_group_load;
@@ -7971,7 +8164,7 @@ void __init sched_init(void)
 #endif
 
 #ifdef CONFIG_RT_MUTEXES
-	plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
+	plist_head_init(&init_task.pi_waiters);
 #endif
 
 	/*
@@ -8014,7 +8207,7 @@ void __init sched_init(void)
 	scheduler_running = 1;
 }
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
 	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8024,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
-#ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8046,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 	if (irqs_disabled())
 		print_irqtrace_events(current);
 	dump_stack();
-#endif
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
@@ -8205,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		if (!se)
 			goto err_free_rq;
 
+		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 	}
 
@@ -8232,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
-#else /* !CONFG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
 {
 }
@@ -8253,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg)
 {
 	int i;
 
-	destroy_rt_bandwidth(&tg->rt_bandwidth);
+	if (tg->rt_se)
+		destroy_rt_bandwidth(&tg->rt_bandwidth);
 
 	for_each_possible_cpu(i) {
 		if (tg->rt_rq)
@@ -8294,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 		if (!rt_se)
 			goto err_free_rq;
 
+		init_rt_rq(rt_rq, cpu_rq(i));
+		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 	}
 
@@ -8435,10 +8630,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	if (!tg->se[0])
 		return -EINVAL;
 
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
+	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
 
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)