Diffstat (limited to 'kernel/sched/core.c')
 -rw-r--r--  kernel/sched/core.c  432
 1 file changed, 257 insertions, 175 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 268a45ea238c..c6b98793d647 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,6 +90,22 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
+#ifdef smp_mb__before_atomic
+void __smp_mb__before_atomic(void)
+{
+	smp_mb__before_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__before_atomic);
+#endif
+
+#ifdef smp_mb__after_atomic
+void __smp_mb__after_atomic(void)
+{
+	smp_mb__after_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__after_atomic);
+#endif
+
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	unsigned long delta;
@@ -506,6 +522,39 @@ static inline void init_hrtick(void)
 #endif	/* CONFIG_SCHED_HRTICK */
 
 /*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val)						\
+({	typeof(*(ptr)) __old, __val = *(ptr);				\
+	for (;;) {							\
+		__old = cmpxchg((ptr), __val, __val | (val));		\
+		if (__old == __val)					\
+			break;						\
+		__val = __old;						\
+	}								\
+	__old;								\
+})
+
+#ifdef TIF_POLLING_NRFLAG
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	struct thread_info *ti = task_thread_info(p);
+	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	set_tsk_need_resched(p);
+	return true;
+}
+#endif
+
+/*
  * resched_task - mark a task 'to be rescheduled now'.
  *
  * On UP this means the setting of the need_resched flag, on SMP it
@@ -521,17 +570,15 @@ void resched_task(struct task_struct *p)
 	if (test_tsk_need_resched(p))
 		return;
 
-	set_tsk_need_resched(p);
-
 	cpu = task_cpu(p);
+
 	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(p);
 		set_preempt_need_resched();
 		return;
 	}
 
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(p))
+	if (set_nr_and_not_polling(p))
 		smp_send_reschedule(cpu);
 }
 
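Why fetch_or() returns the old value: a single atomic RMW both sets TIF_NEED_RESCHED and tells the caller whether TIF_POLLING_NRFLAG was already set, which is what lets resched_task() drop the separate smp_mb()/tsk_is_polling() pair above. A minimal userspace sketch of the same pattern, with made-up flag values and a GCC builtin standing in for the kernel's cmpxchg():

#include <stdio.h>

#define TIF_NEED_RESCHED	(1u << 1)	/* illustrative values only */
#define TIF_POLLING_NRFLAG	(1u << 2)

/* cmpxchg-based fetch_or, mirroring the kernel macro: returns the OLD word. */
static unsigned int fetch_or(unsigned int *ptr, unsigned int val)
{
	unsigned int old = *ptr, prev;

	for (;;) {
		prev = __sync_val_compare_and_swap(ptr, old, old | val);
		if (prev == old)
			return prev;
		old = prev;
	}
}

/* Set NEED_RESCHED; report true only if the target was NOT polling (IPI needed). */
static int set_nr_and_not_polling(unsigned int *flags)
{
	return !(fetch_or(flags, TIF_NEED_RESCHED) & TIF_POLLING_NRFLAG);
}

int main(void)
{
	unsigned int polling = TIF_POLLING_NRFLAG;	/* idle task spinning on its flags */
	unsigned int running = 0;			/* task that needs a real IPI */

	printf("polling task -> send IPI? %d\n", set_nr_and_not_polling(&polling));	/* 0 */
	printf("running task -> send IPI? %d\n", set_nr_and_not_polling(&running));	/* 1 */
	return 0;
}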
@@ -1320,7 +1367,7 @@ out:
 		 * leave kernel.
 		 */
 		if (p->mm && printk_ratelimit()) {
-			printk_sched("process %d (%s) no longer affine to cpu%d\n",
+			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
 					task_pid_nr(p), p->comm, cpu);
 		}
 	}
@@ -2192,7 +2239,7 @@ static inline void post_schedule(struct rq *rq)
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
  */
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
@@ -2592,8 +2639,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 	if (likely(prev->sched_class == class &&
 		   rq->nr_running == rq->cfs.h_nr_running)) {
 		p = fair_sched_class.pick_next_task(rq, prev);
-		if (likely(p && p != RETRY_TASK))
-			return p;
+		if (unlikely(p == RETRY_TASK))
+			goto again;
+
+		/* assumes fair_sched_class->next == idle_sched_class */
+		if (unlikely(!p))
+			p = idle_sched_class.pick_next_task(rq, prev);
+
+		return p;
 	}
 
 again:
@@ -2741,7 +2794,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 		blk_schedule_flush_plug(tsk);
 }
 
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
 {
 	struct task_struct *tsk = current;
 
@@ -2751,7 +2804,7 @@ asmlinkage void __sched schedule(void)
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage void __sched schedule_user(void)
+asmlinkage __visible void __sched schedule_user(void)
 {
 	/*
 	 * If we come here after a random call to set_need_resched(),
@@ -2783,7 +2836,7 @@ void __sched schedule_preempt_disabled(void)
  * off of preempt_enable. Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
 {
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
@@ -2813,7 +2866,7 @@ EXPORT_SYMBOL(preempt_schedule);
  * Note, that this is called and return with irqs disabled. This will
  * protect us against recursive calling from irq.
  */
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
 	enum ctx_state prev_state;
 
@@ -2996,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
 int can_nice(const struct task_struct *p, const int nice)
 {
 	/* convert nice value [19,-20] to rlimit style value [1,40] */
-	int nice_rlim = 20 - nice;
+	int nice_rlim = nice_to_rlimit(nice);
 
 	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
 		capable(CAP_SYS_NICE));
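nice_to_rlimit() is a new helper (added to include/linux/sched/prio.h by the same series) that names the open-coded 20 - nice conversion; assuming the obvious MAX_NICE - nice + 1 definition, the mapping from nice [19,-20] to rlimit-style [1,40] is unchanged. A quick standalone check:

#include <stdio.h>

#define MAX_NICE  19
#define MIN_NICE -20

/* Assumed definition; the real helper lives in include/linux/sched/prio.h. */
static long nice_to_rlimit(long nice)
{
	return MAX_NICE - nice + 1;	/* identical to the old "20 - nice" */
}

int main(void)
{
	long nice;

	for (nice = MIN_NICE; nice <= MAX_NICE; nice++)
		if (nice_to_rlimit(nice) != 20 - nice)
			printf("mismatch at nice=%ld\n", nice);	/* never triggers */

	printf("nice  19 -> rlimit %ld\n", nice_to_rlimit(19));		/* 1  */
	printf("nice -20 -> rlimit %ld\n", nice_to_rlimit(-20));	/* 40 */
	return 0;
}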
@@ -3020,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
 	 * We don't have to worry. Conceptually one call occurs first
 	 * and we have a single winner.
 	 */
-	if (increment < -40)
-		increment = -40;
-	if (increment > 40)
-		increment = 40;
-
+	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
 	nice = task_nice(current) + increment;
-	if (nice < MIN_NICE)
-		nice = MIN_NICE;
-	if (nice > MAX_NICE)
-		nice = MAX_NICE;
 
+	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
 	if (increment < 0 && !can_nice(current, nice))
 		return -EPERM;
 
@@ -3124,6 +3170,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
 	dl_se->dl_throttled = 0;
 	dl_se->dl_new = 1;
+	dl_se->dl_yielded = 0;
 }
 
 static void __setscheduler_params(struct task_struct *p,
@@ -3188,17 +3235,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
  * We ask for the deadline not being zero, and greater or equal
  * than the runtime, as well as the period of being zero or
  * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
- * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
  */
 static bool
 __checkparam_dl(const struct sched_attr *attr)
 {
-	return attr && attr->sched_deadline != 0 &&
-		(attr->sched_period == 0 ||
-		(s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
-		(s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
-		attr->sched_runtime >= (2 << (DL_SCALE - 1));
+	/* deadline != 0 */
+	if (attr->sched_deadline == 0)
+		return false;
+
+	/*
+	 * Since we truncate DL_SCALE bits, make sure we're at least
+	 * that big.
+	 */
+	if (attr->sched_runtime < (1ULL << DL_SCALE))
+		return false;
+
+	/*
+	 * Since we use the MSB for wrap-around and sign issues, make
+	 * sure it's not set (mind that period can be equal to zero).
+	 */
+	if (attr->sched_deadline & (1ULL << 63) ||
+	    attr->sched_period & (1ULL << 63))
+		return false;
+
+	/* runtime <= deadline <= period (if period != 0) */
+	if ((attr->sched_period != 0 &&
+	     attr->sched_period < attr->sched_deadline) ||
+	    attr->sched_deadline < attr->sched_runtime)
+		return false;
+
+	return true;
 }
 
 /*
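In concrete terms the rewritten checks accept the usual runtime <= deadline <= period triples given in nanoseconds, and reject anything below the 1 << DL_SCALE resolution (DL_SCALE is 10, so roughly 1us) or with bit 63 set. A standalone restatement of the predicate with a few sample parameter sets, for illustration only:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define DL_SCALE 10	/* matches kernel/sched/sched.h */

struct dl_params { uint64_t runtime, deadline, period; };

/* Re-statement of the checks above, outside the kernel, for illustration. */
static bool checkparam_dl(const struct dl_params *p)
{
	if (p->deadline == 0)
		return false;
	if (p->runtime < (1ULL << DL_SCALE))
		return false;
	if ((p->deadline & (1ULL << 63)) || (p->period & (1ULL << 63)))
		return false;
	if ((p->period != 0 && p->period < p->deadline) ||
	    p->deadline < p->runtime)
		return false;
	return true;
}

int main(void)
{
	struct dl_params ok   = { 10000000, 30000000, 100000000 };	/* 10ms/30ms/100ms: valid  */
	struct dl_params tiny = { 500, 30000000, 100000000 };		/* runtime below 1us: no   */
	struct dl_params rev  = { 10000000, 5000000, 100000000 };	/* deadline < runtime: no  */

	printf("%d %d %d\n", checkparam_dl(&ok), checkparam_dl(&tiny), checkparam_dl(&rev));
	return 0;	/* prints: 1 0 0 */
}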
@@ -3596,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
 	 */
 	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
 
-out:
-	return ret;
+	return 0;
 
 err_size:
 	put_user(sizeof(*attr), &uattr->size);
-	ret = -E2BIG;
-	goto out;
+	return -E2BIG;
 }
 
 /**
@@ -3639,6 +3707,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
  * sys_sched_setattr - same as above, but with extended sched_attr
  * @pid: the pid in question.
  * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
  */
 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 			       unsigned int, flags)
@@ -3650,8 +3719,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 	if (!uattr || pid < 0 || flags)
 		return -EINVAL;
 
-	if (sched_copy_attr(uattr, &attr))
-		return -EFAULT;
+	retval = sched_copy_attr(uattr, &attr);
+	if (retval)
+		return retval;
+
+	if ((int)attr.sched_policy < 0)
+		return -EINVAL;
 
 	rcu_read_lock();
 	retval = -ESRCH;
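For context, glibc in this era has no sched_setattr() wrapper, so userspace invokes the syscall directly with a hand-rolled struct sched_attr (layout mirroring the uapi header, SCHED_DEADLINE = 6; __NR_sched_setattr needs 3.14+ kernel headers). A hypothetical caller looks roughly like the sketch below; passing a negative sched_policy here is exactly what the new (int)attr.sched_policy < 0 test rejects with -EINVAL:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>	/* __NR_sched_setattr, present with 3.14+ headers */

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6
#endif

struct sched_attr {			/* mirrors include/uapi/linux/sched.h */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* ns */
	uint64_t sched_deadline;	/* ns */
	uint64_t sched_period;		/* ns */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size	    = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  = 10 * 1000 * 1000;		/* 10ms of budget ...  */
	attr.sched_deadline = 30 * 1000 * 1000;		/* ... within 30ms ... */
	attr.sched_period   = 100 * 1000 * 1000;	/* ... every 100ms     */

	if (syscall(__NR_sched_setattr, 0 /* current task */, &attr, 0))
		perror("sched_setattr");	/* typically needs root/CAP_SYS_NICE */
	return 0;
}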
@@ -3701,7 +3774,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
  */
 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
-	struct sched_param lp;
+	struct sched_param lp = { .sched_priority = 0 };
 	struct task_struct *p;
 	int retval;
 
@@ -3718,11 +3791,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 	if (retval)
 		goto out_unlock;
 
-	if (task_has_dl_policy(p)) {
-		retval = -EINVAL;
-		goto out_unlock;
-	}
-	lp.sched_priority = p->rt_priority;
+	if (task_has_rt_policy(p))
+		lp.sched_priority = p->rt_priority;
 	rcu_read_unlock();
 
 	/*
@@ -3760,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 
 		for (; addr < end; addr++) {
 			if (*addr)
-				goto err_size;
+				return -EFBIG;
 		}
 
 		attr->size = usize;
@@ -3770,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 	if (ret)
 		return -EFAULT;
 
-out:
-	return ret;
-
-err_size:
-	ret = -E2BIG;
-	goto out;
+	return 0;
 }
 
 /**
@@ -3783,6 +3848,7 @@ err_size:
  * @pid: the pid in question.
  * @uattr: structure containing the extended parameters.
  * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
  */
 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		unsigned int, size, unsigned int, flags)
@@ -4051,6 +4117,7 @@ static void __cond_resched(void)
 
 int __sched _cond_resched(void)
 {
+	rcu_cond_resched();
 	if (should_resched()) {
 		__cond_resched();
 		return 1;
@@ -4069,15 +4136,18 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
+	bool need_rcu_resched = rcu_should_resched();
 	int resched = should_resched();
 	int ret = 0;
 
 	lockdep_assert_held(lock);
 
-	if (spin_needbreak(lock) || resched) {
+	if (spin_needbreak(lock) || resched || need_rcu_resched) {
 		spin_unlock(lock);
 		if (resched)
 			__cond_resched();
+		else if (unlikely(need_rcu_resched))
+			rcu_resched();
 		else
 			cpu_relax();
 		ret = 1;
@@ -4091,6 +4161,7 @@ int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
+	rcu_cond_resched();  /* BH disabled OK, just recording QSes. */
 	if (should_resched()) {
 		local_bh_enable();
 		__cond_resched();
@@ -5039,11 +5110,20 @@ static struct notifier_block migration_notifier = {
 	.priority = CPU_PRI_MIGRATION,
 };
 
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+	int cpu = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+	rq->age_stamp = sched_clock_cpu(cpu);
+}
+
 static int sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_STARTING:
+		set_cpu_rq_start_time();
+		return NOTIFY_OK;
 	case CPU_DOWN_FAILED:
 		set_cpu_active((long)hcpu, true);
 		return NOTIFY_OK;
@@ -5252,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
 			 SD_BALANCE_FORK |
 			 SD_BALANCE_EXEC |
 			 SD_SHARE_CPUPOWER |
-			 SD_SHARE_PKG_RESOURCES)) {
+			 SD_SHARE_PKG_RESOURCES |
+			 SD_SHARE_POWERDOMAIN)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5283,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_EXEC |
 				SD_SHARE_CPUPOWER |
 				SD_SHARE_PKG_RESOURCES |
-				SD_PREFER_SIBLING);
+				SD_PREFER_SIBLING |
+				SD_SHARE_POWERDOMAIN);
 		if (nr_node_ids == 1)
 			pflags &= ~SD_SERIALIZE;
 	}
@@ -5557,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
-	return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
-	struct sched_domain **__percpu sd;
-	struct sched_group **__percpu sg;
-	struct sched_group_power **__percpu sgp;
-};
-
 struct s_data {
 	struct sched_domain ** __percpu sd;
 	struct root_domain	*rd;
@@ -5580,21 +5651,6 @@ enum s_alloc {
 	sa_none,
 };
 
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP		0x01
-
-struct sched_domain_topology_level {
-	sched_domain_init_f init;
-	sched_domain_mask_f mask;
-	int		    flags;
-	int		    numa_level;
-	struct sd_data      data;
-};
-
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
@@ -5762,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 			continue;
 
 		group = get_group(i, sdd, &sg);
-		cpumask_clear(sched_group_cpus(sg));
-		sg->sgp->power = 0;
 		cpumask_setall(sched_group_mask(sg));
 
 		for_each_cpu(j, span) {
@@ -5813,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
 }
 
-int __weak arch_sd_sibling_asym_packing(void)
-{
-	return 0*SD_ASYM_PACKING;
-}
-
 /*
  * Initializers for schedule domains
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type)		sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type)		do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type)						\
-static noinline struct sched_domain *					\
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
-{									\
-	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 	\
-	*sd = SD_##type##_INIT;						\
-	SD_INIT_NAME(sd, type);						\
-	sd->private = &tl->data;					\
-	return sd;							\
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
 static int default_relax_domain_level = -1;
 int sched_domain_level_max;
 
@@ -5938,97 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 	*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 }
 
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
-	return topology_thread_cpumask(cpu);
-}
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
-	{ sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
-	{ sd_init_BOOK, cpu_book_mask, },
-#endif
-	{ sd_init_CPU, cpu_cpu_mask, },
-	{ NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl)			\
-	for (tl = sched_domain_topology; tl->init; tl++)
-
 #ifdef CONFIG_NUMA
-
 static int sched_domains_numa_levels;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
+#endif
 
-static inline int sd_local_flags(int level)
-{
-	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
-		return 0;
-
-	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUPOWER      - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA                - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING        - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS		\
+	(SD_SHARE_CPUPOWER |		\
+	 SD_SHARE_PKG_RESOURCES |	\
+	 SD_NUMA |			\
+	 SD_ASYM_PACKING |		\
+	 SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
 {
 	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-	int level = tl->numa_level;
-	int sd_weight = cpumask_weight(
-			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+	int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+#endif
+
+	sd_weight = cpumask_weight(tl->mask(cpu));
+
+	if (tl->sd_flags)
+		sd_flags = (*tl->sd_flags)();
+	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+			"wrong sd_flags in topology description\n"))
+		sd_flags &= ~TOPOLOGY_SD_FLAGS;
 
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
 		.busy_factor		= 32,
 		.imbalance_pct		= 125,
-		.cache_nice_tries	= 2,
-		.busy_idx		= 3,
-		.idle_idx		= 2,
+
+		.cache_nice_tries	= 0,
+		.busy_idx		= 0,
+		.idle_idx		= 0,
 		.newidle_idx		= 0,
 		.wake_idx		= 0,
 		.forkexec_idx		= 0,
 
 		.flags			= 1*SD_LOAD_BALANCE
 					| 1*SD_BALANCE_NEWIDLE
-					| 0*SD_BALANCE_EXEC
-					| 0*SD_BALANCE_FORK
+					| 1*SD_BALANCE_EXEC
+					| 1*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
-					| 0*SD_WAKE_AFFINE
+					| 1*SD_WAKE_AFFINE
 					| 0*SD_SHARE_CPUPOWER
 					| 0*SD_SHARE_PKG_RESOURCES
-					| 1*SD_SERIALIZE
+					| 0*SD_SERIALIZE
 					| 0*SD_PREFER_SIBLING
-					| 1*SD_NUMA
-					| sd_local_flags(level)
+					| 0*SD_NUMA
+					| sd_flags
 					,
+
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
+		.smt_gain		= 0,
+		.max_newidle_lb_cost	= 0,
+		.next_decay_max_lb_cost	= jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+		.name			= tl->name,
+#endif
 	};
-	SD_INIT_NAME(sd, NUMA);
-	sd->private = &tl->data;
 
 	/*
-	 * Ugly hack to pass state to sd_numa_mask()...
+	 * Convert topological properties into behaviour.
 	 */
-	sched_domains_curr_level = tl->numa_level;
+
+	if (sd->flags & SD_SHARE_CPUPOWER) {
+		sd->imbalance_pct = 110;
+		sd->smt_gain = 1178; /* ~15% */
+
+	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		sd->imbalance_pct = 117;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+	} else if (sd->flags & SD_NUMA) {
+		sd->cache_nice_tries = 2;
+		sd->busy_idx = 3;
+		sd->idle_idx = 2;
+
+		sd->flags |= SD_SERIALIZE;
+		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+			sd->flags &= ~(SD_BALANCE_EXEC |
+				       SD_BALANCE_FORK |
+				       SD_WAKE_AFFINE);
+		}
+
+#endif
+	} else {
+		sd->flags |= SD_PREFER_SIBLING;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+		sd->idle_idx = 1;
+	}
+
+	sd->private = &tl->data;
 
 	return sd;
 }
 
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl)			\
+	for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+	sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
 static const struct cpumask *sd_numa_mask(int cpu)
 {
 	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
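With sd_init() driven purely by the topology table, an architecture that wants different levels no longer patches SD_*_INIT templates; it installs its own table through set_sched_topology(). The sketch below is illustrative only: cpu_cluster_mask()/cpu_cluster_flags() and the CLS level are invented names, and struct sched_domain_topology_level plus the SD_INIT_NAME()/cpu_smt_*() helpers now live in include/linux/sched.h:

/* Hypothetical arch code, not part of this patch. */
#include <linux/sched.h>
#include <linux/topology.h>

static inline int cpu_cluster_flags(void)
{
	/* cores in a cluster share caches and a power domain */
	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
}

static const struct cpumask *cpu_cluster_mask(int cpu)
{
	return topology_core_cpumask(cpu);	/* stand-in for a real cluster mask */
}

static struct sched_domain_topology_level my_arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
	{ cpu_cluster_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static void __init my_arch_init_sched_topology(void)
{
	/* must run before the scheduler builds its domains */
	set_sched_topology(my_arch_topology);
}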
@@ -6172,7 +6250,10 @@ static void sched_init_numa(void)
 		}
 	}
 
-	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+	/* Compute default topology size */
+	for (i = 0; sched_domain_topology[i].mask; i++);
+
+	tl = kzalloc((i + level + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
@@ -6180,18 +6261,19 @@ static void sched_init_numa(void)
 	/*
 	 * Copy the default topology bits..
 	 */
-	for (i = 0; default_topology[i].init; i++)
-		tl[i] = default_topology[i];
+	for (i = 0; sched_domain_topology[i].mask; i++)
+		tl[i] = sched_domain_topology[i];
 
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
 	for (j = 0; j < level; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
-			.init = sd_numa_init,
 			.mask = sd_numa_mask,
+			.sd_flags = cpu_numa_flags,
 			.flags = SDTL_OVERLAP,
 			.numa_level = j,
+			SD_INIT_NAME(NUMA)
 		};
 	}
 
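The new sizing simply counts whatever table is currently installed instead of hard-coding ARRAY_SIZE(default_topology): with SMT and MC configured, the default table above has three entries (SMT, MC, DIE), so with, say, two NUMA distance levels i ends up 3 and kzalloc() gets 3 + 2 + 1 slots, the trailing +1 preserving the NULL terminator. The counting-and-extend idiom in isolation (plain C sketch, not kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct level { const char *name; };	/* stand-in for sched_domain_topology_level */

int main(void)
{
	struct level base[] = { {"SMT"}, {"MC"}, {"DIE"}, { NULL } };
	int numa_levels = 2, i;
	struct level *tl;

	for (i = 0; base[i].name; i++)	/* count entries, terminator excluded */
		;
	tl = calloc(i + numa_levels + 1, sizeof(*tl));	/* +1 keeps the terminator */
	memcpy(tl, base, i * sizeof(*tl));

	printf("copied %d entries, room left for %d NUMA levels\n", i, numa_levels);
	free(tl);
	return 0;
}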
@@ -6349,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
 {
-	struct sched_domain *sd = tl->init(tl, cpu);
+	struct sched_domain *sd = sd_init(tl, cpu);
 	if (!sd)
 		return child;
 
@@ -6919,6 +7001,7 @@ void __init sched_init(void)
 	if (cpu_isolated_map == NULL)
 		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
+	set_cpu_rq_start_time();
 #endif
 	init_sched_fair_class();
 
@@ -7586,7 +7669,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
-	struct task_group *parent = css_tg(css_parent(css));
+	struct task_group *parent = css_tg(css->parent);
 
 	if (parent)
 		sched_online_group(tg, parent);
@@ -7717,8 +7800,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	/* restart the period timer (if active) to handle new period expiry */
 	if (runtime_enabled && cfs_b->timer_active) {
 		/* force a reprogram */
-		cfs_b->timer_active = 0;
-		__start_cfs_bandwidth(cfs_b);
+		__start_cfs_bandwidth(cfs_b, true);
 	}
 	raw_spin_unlock_irq(&cfs_b->lock);
 