aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c211
1 files changed, 169 insertions, 42 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addcb..ad1962dc0aa2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,7 +201,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 201 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 202 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 203 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205} 205}
206 206
207static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 207static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
@@ -600,7 +600,6 @@ struct rq {
600 /* BKL stats */ 600 /* BKL stats */
601 unsigned int bkl_count; 601 unsigned int bkl_count;
602#endif 602#endif
603 struct lock_class_key rq_lock_key;
604}; 603};
605 604
606static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 605static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -809,9 +808,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
809 808
810/* 809/*
811 * ratelimit for updating the group shares. 810 * ratelimit for updating the group shares.
812 * default: 0.5ms 811 * default: 0.25ms
813 */ 812 */
814const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; 813unsigned int sysctl_sched_shares_ratelimit = 250000;
815 814
816/* 815/*
817 * period over which we measure -rt task cpu usage in us. 816 * period over which we measure -rt task cpu usage in us.
@@ -834,7 +833,7 @@ static inline u64 global_rt_period(void)
834 833
835static inline u64 global_rt_runtime(void) 834static inline u64 global_rt_runtime(void)
836{ 835{
837 if (sysctl_sched_rt_period < 0) 836 if (sysctl_sched_rt_runtime < 0)
838 return RUNTIME_INF; 837 return RUNTIME_INF;
839 838
840 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 839 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
@@ -1088,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1088 return NOTIFY_DONE; 1087 return NOTIFY_DONE;
1089} 1088}
1090 1089
1091static void init_hrtick(void) 1090static __init void init_hrtick(void)
1092{ 1091{
1093 hotcpu_notifier(hotplug_hrtick, 0); 1092 hotcpu_notifier(hotplug_hrtick, 0);
1094} 1093}
@@ -1120,7 +1119,7 @@ static void init_rq_hrtick(struct rq *rq)
1120 1119
1121 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1120 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1122 rq->hrtick_timer.function = hrtick; 1121 rq->hrtick_timer.function = hrtick;
1123 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1122 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1124} 1123}
1125#else 1124#else
1126static inline void hrtick_clear(struct rq *rq) 1125static inline void hrtick_clear(struct rq *rq)
@@ -2759,10 +2758,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2759 } else { 2758 } else {
2760 if (rq1 < rq2) { 2759 if (rq1 < rq2) {
2761 spin_lock(&rq1->lock); 2760 spin_lock(&rq1->lock);
2762 spin_lock(&rq2->lock); 2761 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2763 } else { 2762 } else {
2764 spin_lock(&rq2->lock); 2763 spin_lock(&rq2->lock);
2765 spin_lock(&rq1->lock); 2764 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2766 } 2765 }
2767 } 2766 }
2768 update_rq_clock(rq1); 2767 update_rq_clock(rq1);
@@ -2805,14 +2804,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2805 if (busiest < this_rq) { 2804 if (busiest < this_rq) {
2806 spin_unlock(&this_rq->lock); 2805 spin_unlock(&this_rq->lock);
2807 spin_lock(&busiest->lock); 2806 spin_lock(&busiest->lock);
2808 spin_lock(&this_rq->lock); 2807 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2809 ret = 1; 2808 ret = 1;
2810 } else 2809 } else
2811 spin_lock(&busiest->lock); 2810 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2812 } 2811 }
2813 return ret; 2812 return ret;
2814} 2813}
2815 2814
2815static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2816 __releases(busiest->lock)
2817{
2818 spin_unlock(&busiest->lock);
2819 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2820}
2821
2816/* 2822/*
2817 * If dest_cpu is allowed for this process, migrate the task to it. 2823 * If dest_cpu is allowed for this process, migrate the task to it.
2818 * This is accomplished by forcing the cpu_allowed mask to only 2824 * This is accomplished by forcing the cpu_allowed mask to only
@@ -3637,7 +3643,7 @@ redo:
3637 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3643 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3638 imbalance, sd, CPU_NEWLY_IDLE, 3644 imbalance, sd, CPU_NEWLY_IDLE,
3639 &all_pinned); 3645 &all_pinned);
3640 spin_unlock(&busiest->lock); 3646 double_unlock_balance(this_rq, busiest);
3641 3647
3642 if (unlikely(all_pinned)) { 3648 if (unlikely(all_pinned)) {
3643 cpu_clear(cpu_of(busiest), *cpus); 3649 cpu_clear(cpu_of(busiest), *cpus);
@@ -3752,7 +3758,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3752 else 3758 else
3753 schedstat_inc(sd, alb_failed); 3759 schedstat_inc(sd, alb_failed);
3754 } 3760 }
3755 spin_unlock(&target_rq->lock); 3761 double_unlock_balance(busiest_rq, target_rq);
3756} 3762}
3757 3763
3758#ifdef CONFIG_NO_HZ 3764#ifdef CONFIG_NO_HZ
@@ -4173,6 +4179,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4173} 4179}
4174 4180
4175/* 4181/*
4182 * Use precise platform statistics if available:
4183 */
4184#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4185cputime_t task_utime(struct task_struct *p)
4186{
4187 return p->utime;
4188}
4189
4190cputime_t task_stime(struct task_struct *p)
4191{
4192 return p->stime;
4193}
4194#else
4195cputime_t task_utime(struct task_struct *p)
4196{
4197 clock_t utime = cputime_to_clock_t(p->utime),
4198 total = utime + cputime_to_clock_t(p->stime);
4199 u64 temp;
4200
4201 /*
4202 * Use CFS's precise accounting:
4203 */
4204 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4205
4206 if (total) {
4207 temp *= utime;
4208 do_div(temp, total);
4209 }
4210 utime = (clock_t)temp;
4211
4212 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4213 return p->prev_utime;
4214}
4215
4216cputime_t task_stime(struct task_struct *p)
4217{
4218 clock_t stime;
4219
4220 /*
4221 * Use CFS's precise accounting. (we subtract utime from
4222 * the total, to make sure the total observed by userspace
4223 * grows monotonically - apps rely on that):
4224 */
4225 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4226 cputime_to_clock_t(task_utime(p));
4227
4228 if (stime >= 0)
4229 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4230
4231 return p->prev_stime;
4232}
4233#endif
4234
4235inline cputime_t task_gtime(struct task_struct *p)
4236{
4237 return p->gtime;
4238}
4239
4240/*
4176 * This function gets called by the timer code, with HZ frequency. 4241 * This function gets called by the timer code, with HZ frequency.
4177 * We call it with interrupts disabled. 4242 * We call it with interrupts disabled.
4178 * 4243 *
@@ -4663,6 +4728,52 @@ int __sched wait_for_completion_killable(struct completion *x)
4663} 4728}
4664EXPORT_SYMBOL(wait_for_completion_killable); 4729EXPORT_SYMBOL(wait_for_completion_killable);
4665 4730
4731/**
4732 * try_wait_for_completion - try to decrement a completion without blocking
4733 * @x: completion structure
4734 *
4735 * Returns: 0 if a decrement cannot be done without blocking
4736 * 1 if a decrement succeeded.
4737 *
4738 * If a completion is being used as a counting completion,
4739 * attempt to decrement the counter without blocking. This
4740 * enables us to avoid waiting if the resource the completion
4741 * is protecting is not available.
4742 */
4743bool try_wait_for_completion(struct completion *x)
4744{
4745 int ret = 1;
4746
4747 spin_lock_irq(&x->wait.lock);
4748 if (!x->done)
4749 ret = 0;
4750 else
4751 x->done--;
4752 spin_unlock_irq(&x->wait.lock);
4753 return ret;
4754}
4755EXPORT_SYMBOL(try_wait_for_completion);
4756
4757/**
4758 * completion_done - Test to see if a completion has any waiters
4759 * @x: completion structure
4760 *
4761 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4762 * 1 if there are no waiters.
4763 *
4764 */
4765bool completion_done(struct completion *x)
4766{
4767 int ret = 1;
4768
4769 spin_lock_irq(&x->wait.lock);
4770 if (!x->done)
4771 ret = 0;
4772 spin_unlock_irq(&x->wait.lock);
4773 return ret;
4774}
4775EXPORT_SYMBOL(completion_done);
4776
4666static long __sched 4777static long __sched
4667sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4778sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4668{ 4779{
@@ -5004,19 +5115,21 @@ recheck:
5004 return -EPERM; 5115 return -EPERM;
5005 } 5116 }
5006 5117
5118 if (user) {
5007#ifdef CONFIG_RT_GROUP_SCHED 5119#ifdef CONFIG_RT_GROUP_SCHED
5008 /* 5120 /*
5009 * Do not allow realtime tasks into groups that have no runtime 5121 * Do not allow realtime tasks into groups that have no runtime
5010 * assigned. 5122 * assigned.
5011 */ 5123 */
5012 if (user 5124 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
5013 && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5125 return -EPERM;
5014 return -EPERM;
5015#endif 5126#endif
5016 5127
5017 retval = security_task_setscheduler(p, policy, param); 5128 retval = security_task_setscheduler(p, policy, param);
5018 if (retval) 5129 if (retval)
5019 return retval; 5130 return retval;
5131 }
5132
5020 /* 5133 /*
5021 * make sure no PI-waiters arrive (or leave) while we are 5134 * make sure no PI-waiters arrive (or leave) while we are
5022 * changing the priority of the task: 5135 * changing the priority of the task:
@@ -5732,6 +5845,8 @@ static inline void sched_init_granularity(void)
5732 sysctl_sched_latency = limit; 5845 sysctl_sched_latency = limit;
5733 5846
5734 sysctl_sched_wakeup_granularity *= factor; 5847 sysctl_sched_wakeup_granularity *= factor;
5848
5849 sysctl_sched_shares_ratelimit *= factor;
5735} 5850}
5736 5851
5737#ifdef CONFIG_SMP 5852#ifdef CONFIG_SMP
@@ -7581,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7581 * and partition_sched_domains() will fallback to the single partition 7696 * and partition_sched_domains() will fallback to the single partition
7582 * 'fallback_doms', it also forces the domains to be rebuilt. 7697 * 'fallback_doms', it also forces the domains to be rebuilt.
7583 * 7698 *
7699 * If doms_new==NULL it will be replaced with cpu_online_map.
7700 * ndoms_new==0 is a special case for destroying existing domains.
7701 * It will not create the default domain.
7702 *
7584 * Call with hotplug lock held 7703 * Call with hotplug lock held
7585 */ 7704 */
7586void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7705void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7587 struct sched_domain_attr *dattr_new) 7706 struct sched_domain_attr *dattr_new)
7588{ 7707{
7589 int i, j; 7708 int i, j, n;
7590 7709
7591 mutex_lock(&sched_domains_mutex); 7710 mutex_lock(&sched_domains_mutex);
7592 7711
7593 /* always unregister in case we don't destroy any domains */ 7712 /* always unregister in case we don't destroy any domains */
7594 unregister_sched_domain_sysctl(); 7713 unregister_sched_domain_sysctl();
7595 7714
7596 if (doms_new == NULL) 7715 n = doms_new ? ndoms_new : 0;
7597 ndoms_new = 0;
7598 7716
7599 /* Destroy deleted domains */ 7717 /* Destroy deleted domains */
7600 for (i = 0; i < ndoms_cur; i++) { 7718 for (i = 0; i < ndoms_cur; i++) {
7601 for (j = 0; j < ndoms_new; j++) { 7719 for (j = 0; j < n; j++) {
7602 if (cpus_equal(doms_cur[i], doms_new[j]) 7720 if (cpus_equal(doms_cur[i], doms_new[j])
7603 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7721 && dattrs_equal(dattr_cur, i, dattr_new, j))
7604 goto match1; 7722 goto match1;
@@ -7611,7 +7729,6 @@ match1:
7611 7729
7612 if (doms_new == NULL) { 7730 if (doms_new == NULL) {
7613 ndoms_cur = 0; 7731 ndoms_cur = 0;
7614 ndoms_new = 1;
7615 doms_new = &fallback_doms; 7732 doms_new = &fallback_doms;
7616 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7733 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7617 dattr_new = NULL; 7734 dattr_new = NULL;
@@ -7648,8 +7765,13 @@ match2:
7648int arch_reinit_sched_domains(void) 7765int arch_reinit_sched_domains(void)
7649{ 7766{
7650 get_online_cpus(); 7767 get_online_cpus();
7768
7769 /* Destroy domains first to force the rebuild */
7770 partition_sched_domains(0, NULL, NULL);
7771
7651 rebuild_sched_domains(); 7772 rebuild_sched_domains();
7652 put_online_cpus(); 7773 put_online_cpus();
7774
7653 return 0; 7775 return 0;
7654} 7776}
7655 7777
@@ -7671,34 +7793,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7671} 7793}
7672 7794
7673#ifdef CONFIG_SCHED_MC 7795#ifdef CONFIG_SCHED_MC
7674static ssize_t sched_mc_power_savings_show(struct sys_device *dev, 7796static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7675 struct sysdev_attribute *attr, char *page) 7797 char *page)
7676{ 7798{
7677 return sprintf(page, "%u\n", sched_mc_power_savings); 7799 return sprintf(page, "%u\n", sched_mc_power_savings);
7678} 7800}
7679static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 7801static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7680 struct sysdev_attribute *attr,
7681 const char *buf, size_t count) 7802 const char *buf, size_t count)
7682{ 7803{
7683 return sched_power_savings_store(buf, count, 0); 7804 return sched_power_savings_store(buf, count, 0);
7684} 7805}
7685static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 7806static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7686 sched_mc_power_savings_store); 7807 sched_mc_power_savings_show,
7808 sched_mc_power_savings_store);
7687#endif 7809#endif
7688 7810
7689#ifdef CONFIG_SCHED_SMT 7811#ifdef CONFIG_SCHED_SMT
7690static ssize_t sched_smt_power_savings_show(struct sys_device *dev, 7812static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7691 struct sysdev_attribute *attr, char *page) 7813 char *page)
7692{ 7814{
7693 return sprintf(page, "%u\n", sched_smt_power_savings); 7815 return sprintf(page, "%u\n", sched_smt_power_savings);
7694} 7816}
7695static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 7817static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7696 struct sysdev_attribute *attr,
7697 const char *buf, size_t count) 7818 const char *buf, size_t count)
7698{ 7819{
7699 return sched_power_savings_store(buf, count, 1); 7820 return sched_power_savings_store(buf, count, 1);
7700} 7821}
7701static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 7822static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7823 sched_smt_power_savings_show,
7702 sched_smt_power_savings_store); 7824 sched_smt_power_savings_store);
7703#endif 7825#endif
7704 7826
@@ -7733,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb,
7733 case CPU_ONLINE_FROZEN: 7855 case CPU_ONLINE_FROZEN:
7734 case CPU_DEAD: 7856 case CPU_DEAD:
7735 case CPU_DEAD_FROZEN: 7857 case CPU_DEAD_FROZEN:
7736 partition_sched_domains(0, NULL, NULL); 7858 partition_sched_domains(1, NULL, NULL);
7737 return NOTIFY_OK; 7859 return NOTIFY_OK;
7738 7860
7739 default: 7861 default:
@@ -7998,7 +8120,6 @@ void __init sched_init(void)
7998 8120
7999 rq = cpu_rq(i); 8121 rq = cpu_rq(i);
8000 spin_lock_init(&rq->lock); 8122 spin_lock_init(&rq->lock);
8001 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8002 rq->nr_running = 0; 8123 rq->nr_running = 0;
8003 init_cfs_rq(&rq->cfs, rq); 8124 init_cfs_rq(&rq->cfs, rq);
8004 init_rt_rq(&rq->rt, rq); 8125 init_rt_rq(&rq->rt, rq);
@@ -8455,8 +8576,8 @@ struct task_group *sched_create_group(struct task_group *parent)
8455 WARN_ON(!parent); /* root should already exist */ 8576 WARN_ON(!parent); /* root should already exist */
8456 8577
8457 tg->parent = parent; 8578 tg->parent = parent;
8458 list_add_rcu(&tg->siblings, &parent->children);
8459 INIT_LIST_HEAD(&tg->children); 8579 INIT_LIST_HEAD(&tg->children);
8580 list_add_rcu(&tg->siblings, &parent->children);
8460 spin_unlock_irqrestore(&task_group_lock, flags); 8581 spin_unlock_irqrestore(&task_group_lock, flags);
8461 8582
8462 return tg; 8583 return tg;
@@ -8788,6 +8909,9 @@ static int sched_rt_global_constraints(void)
8788 u64 rt_runtime, rt_period; 8909 u64 rt_runtime, rt_period;
8789 int ret = 0; 8910 int ret = 0;
8790 8911
8912 if (sysctl_sched_rt_period <= 0)
8913 return -EINVAL;
8914
8791 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8915 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8792 rt_runtime = tg->rt_bandwidth.rt_runtime; 8916 rt_runtime = tg->rt_bandwidth.rt_runtime;
8793 8917
@@ -8804,6 +8928,9 @@ static int sched_rt_global_constraints(void)
8804 unsigned long flags; 8928 unsigned long flags;
8805 int i; 8929 int i;
8806 8930
8931 if (sysctl_sched_rt_period <= 0)
8932 return -EINVAL;
8933
8807 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 8934 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8808 for_each_possible_cpu(i) { 8935 for_each_possible_cpu(i) {
8809 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 8936 struct rt_rq *rt_rq = &cpu_rq(i)->rt;