-rw-r--r--  include/linux/sched.h          |   1
-rw-r--r--  include/linux/sched/sysctl.h   |  13
-rw-r--r--  kernel/sched/core.c            | 441
-rw-r--r--  kernel/sched/deadline.c        |  46
-rw-r--r--  kernel/sched/sched.h           |  76
-rw-r--r--  kernel/sysctl.c                |  14
6 files changed, 555 insertions(+), 36 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 13c53a99920f..a196cb7fc6f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1104,6 +1104,7 @@ struct sched_dl_entity {
 	u64 dl_runtime;		/* maximum runtime for each instance	*/
 	u64 dl_deadline;	/* relative deadline of each instance	*/
 	u64 dl_period;		/* separation of two instances (period) */
+	u64 dl_bw;		/* dl_runtime / dl_deadline		*/
 
 	/*
 	 * Actual scheduling parameters. Initialized with the values above,
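The new dl_bw field caches a task's bandwidth (its runtime over its period, or over its deadline when no period is given) as a Q20 fixed-point fraction, i.e. scaled by 1 << 20, matching the to_ratio() helper added to kernel/sched/core.c below. A stand-alone user-space sketch of that arithmetic, illustrative only and not part of the patch:

#include <stdio.h>
#include <stdint.h>

/* Illustrative re-implementation of the kernel's to_ratio() math. */
static uint64_t ratio_q20(uint64_t period_ns, uint64_t runtime_ns)
{
	return (runtime_ns << 20) / period_ns;
}

int main(void)
{
	/* 10 ms of runtime every 100 ms -> 10% of one CPU. */
	uint64_t bw = ratio_q20(100000000ULL, 10000000ULL);

	printf("dl_bw = %llu (%.1f%% of 1 << 20)\n",
	       (unsigned long long)bw, bw * 100.0 / (1 << 20));
	return 0;
}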
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 31e0193cb0c5..8070a83dbedc 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -81,6 +81,15 @@ static inline unsigned int get_sysctl_timer_migration(void)
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
+/*
+ * control SCHED_DEADLINE reservations:
+ *
+ *  /proc/sys/kernel/sched_dl_period_us
+ *  /proc/sys/kernel/sched_dl_runtime_us
+ */
+extern unsigned int sysctl_sched_dl_period;
+extern int sysctl_sched_dl_runtime;
+
 #ifdef CONFIG_CFS_BANDWIDTH
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
@@ -99,4 +108,8 @@ extern int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+int sched_dl_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
 #endif /* _SCHED_SYSCTL_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 599ee3b11b44..c7c68e6b5c51 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,6 +296,15 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
+/*
+ * Maximum bandwidth available for all -deadline tasks and groups
+ * (if group scheduling is configured) on each CPU.
+ *
+ * default: 5%
+ */
+unsigned int sysctl_sched_dl_period = 1000000;
+int sysctl_sched_dl_runtime = 50000;
+
 
 
 /*
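With these defaults, to_ratio() (added further down) gives (50000 << 20) / 1000000 = 52428, i.e. just under 0.05 * (1 << 20); that is where the 5% figure in the comment comes from. The sysctls are expressed in microseconds and converted to nanoseconds by global_dl_period()/global_dl_runtime(), which leaves the ratio unchanged.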
@@ -1856,6 +1865,111 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	return 0;
 }
 
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+	if (runtime == RUNTIME_INF)
+		return 1ULL << 20;
+
+	/*
+	 * Doing this here saves a lot of checks in all
+	 * the calling paths, and returning zero seems
+	 * safe for them anyway.
+	 */
+	if (period == 0)
+		return 0;
+
+	return div64_u64(runtime << 20, period);
+}
+
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+	return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int __dl_span_weight(struct rq *rq)
+{
+	return cpumask_weight(rq->rd->span);
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+	return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int __dl_span_weight(struct rq *rq)
+{
+	return 1;
+}
+#endif
+
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+	dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+	dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+	return dl_b->bw != -1 &&
+	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+		       const struct sched_attr *attr)
+{
+
+	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+	u64 period = attr->sched_period;
+	u64 runtime = attr->sched_runtime;
+	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+	int cpus = __dl_span_weight(task_rq(p));
+	int err = -1;
+
+	if (new_bw == p->dl.dl_bw)
+		return 0;
+
+	/*
+	 * Either if a task, enters, leave, or stays -deadline but changes
+	 * its parameters, we may need to update accordingly the total
+	 * allocated bandwidth of the container.
+	 */
+	raw_spin_lock(&dl_b->lock);
+	if (dl_policy(policy) && !task_has_dl_policy(p) &&
+	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+		__dl_add(dl_b, new_bw);
+		err = 0;
+	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
+		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+		__dl_clear(dl_b, p->dl.dl_bw);
+		__dl_add(dl_b, new_bw);
+		err = 0;
+	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+		__dl_clear(dl_b, p->dl.dl_bw);
+		err = 0;
+	}
+	raw_spin_unlock(&dl_b->lock);
+
+	return err;
+}
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
 /*
  * wake_up_new_task - wake up a newly created task for the first time.
  *
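A worked example of the admission test above, assuming the 5% default and a 4-CPU root domain: dl_b->bw = 52428 and cpus = 4, so the available pool is 52428 * 4 = 209712. A task requesting 10 ms every 100 ms has new_bw = to_ratio(100 ms, 10 ms) = 104857 and is admitted as long as total_bw leaves room; one requesting 30 ms every 100 ms has new_bw = 314572 > 209712, so __dl_overflow() is true, dl_overflow() returns -1, and __sched_setscheduler() fails with -EBUSY (see the hunk further down).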
@@ -3053,6 +3167,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_deadline = attr->sched_deadline;
 	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
 	dl_se->flags = attr->sched_flags;
+	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
 	dl_se->dl_throttled = 0;
 	dl_se->dl_new = 1;
 }
@@ -3101,7 +3216,9 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
  * This function validates the new parameters of a -deadline task.
  * We ask for the deadline not being zero, and greater or equal
  * than the runtime, as well as the period of being zero or
- * greater than deadline.
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution (1us); we
+ * check sched_runtime only since it is always the smaller one.
  */
 static bool
 __checkparam_dl(const struct sched_attr *attr)
@@ -3109,7 +3226,8 @@ __checkparam_dl(const struct sched_attr *attr)
 	return attr && attr->sched_deadline != 0 &&
 		(attr->sched_period == 0 ||
 		(s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
-		(s64)(attr->sched_deadline - attr->sched_runtime ) >= 0;
+		(s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
+		attr->sched_runtime >= (2 << (DL_SCALE - 1));
 }
 
 /*
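For reference, 2 << (DL_SCALE - 1) is 2 << 9 = 1024 ns, so any sched_runtime shorter than roughly 1 us is rejected here. For example, runtime = 500 ns fails regardless of the other fields, while runtime = 10 us, deadline = 100 us, period = 0 passes (a zero period is later taken to mean period = deadline in __setparam_dl()).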
@@ -3250,8 +3368,8 @@ recheck:
 	}
 change:
 
-#ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
 		/*
 		 * Do not allow realtime tasks into groups that have no runtime
 		 * assigned.
@@ -3262,8 +3380,33 @@ change:
 			task_rq_unlock(rq, p, &flags);
 			return -EPERM;
 		}
-	}
 #endif
+#ifdef CONFIG_SMP
+		if (dl_bandwidth_enabled() && dl_policy(policy)) {
+			cpumask_t *span = rq->rd->span;
+			cpumask_t act_affinity;
+
+			/*
+			 * cpus_allowed mask is statically initialized with
+			 * CPU_MASK_ALL, span is instead dynamic. Here we
+			 * compute the "dynamic" affinity of a task.
+			 */
+			cpumask_and(&act_affinity, &p->cpus_allowed,
+				    cpu_active_mask);
+
+			/*
+			 * Don't allow tasks with an affinity mask smaller than
+			 * the entire root_domain to become SCHED_DEADLINE. We
+			 * will also fail if there's no bandwidth available.
+			 */
+			if (!cpumask_equal(&act_affinity, span) ||
+			    rq->rd->dl_bw.bw == 0) {
+				task_rq_unlock(rq, p, &flags);
+				return -EPERM;
+			}
+		}
+#endif
+	}
 
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3271,6 +3414,18 @@ change:
 		task_rq_unlock(rq, p, &flags);
 		goto recheck;
 	}
+
+	/*
+	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
+	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+	 * is available.
+	 */
+	if ((dl_policy(policy) || dl_task(p)) &&
+	    dl_overflow(p, policy, attr)) {
+		task_rq_unlock(rq, p, &flags);
+		return -EBUSY;
+	}
+
 	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
@@ -3705,6 +3860,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	if (retval)
 		goto out_unlock;
 
+	/*
+	 * Since bandwidth control happens on root_domain basis,
+	 * if admission test is enabled, we only admit -deadline
+	 * tasks allowed to run on all the CPUs in the task's
+	 * root_domain.
+	 */
+#ifdef CONFIG_SMP
+	if (task_has_dl_policy(p)) {
+		const struct cpumask *span = task_rq(p)->rd->span;
+
+		if (dl_bandwidth_enabled() &&
+		    !cpumask_equal(in_mask, span)) {
+			retval = -EBUSY;
+			goto out_unlock;
+		}
+	}
+#endif
+
 	cpuset_cpus_allowed(p, cpus_allowed);
 	cpumask_and(new_mask, in_mask, cpus_allowed);
 again:
@@ -4359,6 +4532,42 @@ out:
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 /*
+ * When dealing with a -deadline task, we have to check if moving it to
+ * a new CPU is possible or not. In fact, this is only true iff there
+ * is enough bandwidth available on such CPU, otherwise we want the
+ * whole migration procedure to fail over.
+ */
+static inline
+bool set_task_cpu_dl(struct task_struct *p, unsigned int cpu)
+{
+	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+	struct dl_bw *cpu_b = dl_bw_of(cpu);
+	int ret = 1;
+	u64 bw;
+
+	if (dl_b == cpu_b)
+		return 1;
+
+	raw_spin_lock(&dl_b->lock);
+	raw_spin_lock(&cpu_b->lock);
+
+	bw = cpu_b->bw * cpumask_weight(cpu_rq(cpu)->rd->span);
+	if (dl_bandwidth_enabled() &&
+	    bw < cpu_b->total_bw + p->dl.dl_bw) {
+		ret = 0;
+		goto unlock;
+	}
+	dl_b->total_bw -= p->dl.dl_bw;
+	cpu_b->total_bw += p->dl.dl_bw;
+
+unlock:
+	raw_spin_unlock(&cpu_b->lock);
+	raw_spin_unlock(&dl_b->lock);
+
+	return ret;
+}
+
+/*
  * Move (not current) task off this cpu, onto dest cpu. We're doing
  * this because either it can't run here any more (set_cpus_allowed()
  * away from this CPU, or CPU going down), or because we're
@@ -4390,6 +4599,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 		goto fail;
 
 	/*
+	 * If p is -deadline, proceed only if there is enough
+	 * bandwidth available on dest_cpu
+	 */
+	if (unlikely(dl_task(p)) && !set_task_cpu_dl(p, dest_cpu))
+		goto fail;
+
+	/*
 	 * If we're not on a rq, the next wake-up will ensure we're
 	 * placed properly.
 	 */
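Worked example of the migration check above, under the same 5% / 4-CPU assumptions used earlier: the destination capacity is bw = 52428 * 4 = 209712. Moving a 10% task (dl_bw = 104857) into a destination root_domain whose total_bw is already 150000 would require 254857 > 209712, so set_task_cpu_dl() returns 0 and __migrate_task() bails out; if it fits, the task's bandwidth is simply subtracted from the source dl_bw accounting and added to the destination one.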
@@ -5128,6 +5344,8 @@ static int init_rootdomain(struct root_domain *rd)
 	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
 		goto free_dlo_mask;
 
+	init_dl_bw(&rd->dl_bw);
+
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_rto_mask;
 	return 0;
@@ -6557,13 +6775,15 @@ void __init sched_init(void)
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 	}
 
+	init_rt_bandwidth(&def_rt_bandwidth,
+			global_rt_period(), global_rt_runtime());
+	init_dl_bandwidth(&def_dl_bandwidth,
+			global_dl_period(), global_dl_runtime());
+
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 #endif
 
-	init_rt_bandwidth(&def_rt_bandwidth,
-			global_rt_period(), global_rt_runtime());
-
 #ifdef CONFIG_RT_GROUP_SCHED
 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
 			global_rt_period(), global_rt_runtime());
@@ -6966,16 +7186,6 @@ void sched_move_task(struct task_struct *tsk)
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
-	if (runtime == RUNTIME_INF)
-		return 1ULL << 20;
-
-	return div64_u64(runtime << 20, period);
-}
-#endif
-
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
  * Ensure that the real time constraints are schedulable.
@@ -7149,10 +7359,48 @@ static long sched_group_rt_period(struct task_group *tg)
 	do_div(rt_period_us, NSEC_PER_USEC);
 	return rt_period_us;
 }
+#endif /* CONFIG_RT_GROUP_SCHED */
 
+/*
+ * Coupling of -rt and -deadline bandwidth.
+ *
+ * Here we check if the new -rt bandwidth value is consistent
+ * with the system settings for the bandwidth available
+ * to -deadline tasks.
+ *
+ * IOW, we want to enforce that
+ *
+ *   rt_bandwidth + dl_bandwidth <= 100%
+ *
+ * is always true.
+ */
+static bool __sched_rt_dl_global_constraints(u64 rt_bw)
+{
+	unsigned long flags;
+	u64 dl_bw;
+	bool ret;
+
+	raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, flags);
+	if (global_rt_runtime() == RUNTIME_INF ||
+	    global_dl_runtime() == RUNTIME_INF) {
+		ret = true;
+		goto unlock;
+	}
+
+	dl_bw = to_ratio(def_dl_bandwidth.dl_period,
+			 def_dl_bandwidth.dl_runtime);
+
+	ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF);
+unlock:
+	raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, flags);
+
+	return ret;
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
 static int sched_rt_global_constraints(void)
 {
-	u64 runtime, period;
+	u64 runtime, period, bw;
 	int ret = 0;
 
 	if (sysctl_sched_rt_period <= 0)
@@ -7167,6 +7415,10 @@ static int sched_rt_global_constraints(void)
 	if (runtime > period && runtime != RUNTIME_INF)
 		return -EINVAL;
 
+	bw = to_ratio(period, runtime);
+	if (!__sched_rt_dl_global_constraints(bw))
+		return -EINVAL;
+
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	ret = __rt_schedulable(NULL, 0, 0);
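With the default knobs this constraint holds with almost no slack: rt_bw = to_ratio(1 s, 950 ms) = 996147 and dl_bw = 52428, and 996147 + 52428 = 1048575 <= to_ratio(RUNTIME_INF, RUNTIME_INF) = 1 << 20 = 1048576. Raising either the -rt or the -dl share past that sum makes the corresponding handler return -EINVAL.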
@@ -7189,19 +7441,19 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 static int sched_rt_global_constraints(void)
 {
 	unsigned long flags;
-	int i;
+	int i, ret = 0;
+	u64 bw;
 
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
-	/*
-	 * There's always some RT tasks in the root group
-	 * -- migration, kstopmachine etc..
-	 */
-	if (sysctl_sched_rt_runtime == 0)
-		return -EBUSY;
-
 	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+	bw = to_ratio(global_rt_period(), global_rt_runtime());
+	if (!__sched_rt_dl_global_constraints(bw)) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
 
@@ -7209,12 +7461,93 @@ static int sched_rt_global_constraints(void)
 		rt_rq->rt_runtime = global_rt_runtime();
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
+unlock:
 	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 
-	return 0;
+	return ret;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+/*
+ * Coupling of -dl and -rt bandwidth.
+ *
+ * Here we check, while setting the system wide bandwidth available
+ * for -dl tasks and groups, if the new values are consistent with
+ * the system settings for the bandwidth available to -rt entities.
+ *
+ * IOW, we want to enforce that
+ *
+ *   rt_bandwidth + dl_bandwidth <= 100%
+ *
+ * is always true.
+ */
+static bool __sched_dl_rt_global_constraints(u64 dl_bw)
+{
+	u64 rt_bw;
+	bool ret;
+
+	raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock);
+	if (global_dl_runtime() == RUNTIME_INF ||
+	    global_rt_runtime() == RUNTIME_INF) {
+		ret = true;
+		goto unlock;
+	}
+
+	rt_bw = to_ratio(ktime_to_ns(def_rt_bandwidth.rt_period),
+			 def_rt_bandwidth.rt_runtime);
+
+	ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF);
+unlock:
+	raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock);
+
+	return ret;
+}
+
+static bool __sched_dl_global_constraints(u64 runtime, u64 period)
+{
+	if (!period || (runtime != RUNTIME_INF && runtime > period))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int sched_dl_global_constraints(void)
+{
+	u64 runtime = global_dl_runtime();
+	u64 period = global_dl_period();
+	u64 new_bw = to_ratio(period, runtime);
+	int ret, i;
+
+	ret = __sched_dl_global_constraints(runtime, period);
+	if (ret)
+		return ret;
+
+	if (!__sched_dl_rt_global_constraints(new_bw))
+		return -EINVAL;
+
+	/*
+	 * Here we want to check the bandwidth not being set to some
+	 * value smaller than the currently allocated bandwidth in
+	 * any of the root_domains.
+	 *
+	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+	 * cycling on root_domains... Discussion on different/better
+	 * solutions is welcome!
+	 */
+	for_each_possible_cpu(i) {
+		struct dl_bw *dl_b = dl_bw_of(i);
+
+		raw_spin_lock(&dl_b->lock);
+		if (new_bw < dl_b->total_bw) {
+			raw_spin_unlock(&dl_b->lock);
+			return -EBUSY;
+		}
+		raw_spin_unlock(&dl_b->lock);
+	}
+
+	return 0;
+}
+
 int sched_rr_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -7264,6 +7597,60 @@ int sched_rt_handler(struct ctl_table *table, int write,
 	return ret;
 }
 
+int sched_dl_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+	int old_period, old_runtime;
+	static DEFINE_MUTEX(mutex);
+	unsigned long flags;
+
+	mutex_lock(&mutex);
+	old_period = sysctl_sched_dl_period;
+	old_runtime = sysctl_sched_dl_runtime;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (!ret && write) {
+		raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock,
+				      flags);
+
+		ret = sched_dl_global_constraints();
+		if (ret) {
+			sysctl_sched_dl_period = old_period;
+			sysctl_sched_dl_runtime = old_runtime;
+		} else {
+			u64 new_bw;
+			int i;
+
+			def_dl_bandwidth.dl_period = global_dl_period();
+			def_dl_bandwidth.dl_runtime = global_dl_runtime();
+			if (global_dl_runtime() == RUNTIME_INF)
+				new_bw = -1;
+			else
+				new_bw = to_ratio(global_dl_period(),
+						  global_dl_runtime());
+			/*
+			 * FIXME: As above...
+			 */
+			for_each_possible_cpu(i) {
+				struct dl_bw *dl_b = dl_bw_of(i);
+
+				raw_spin_lock(&dl_b->lock);
+				dl_b->bw = new_bw;
+				raw_spin_unlock(&dl_b->lock);
+			}
+		}
+
+		raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock,
+					   flags);
+	}
+	mutex_unlock(&mutex);
+
+	return ret;
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
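To make the handler concrete: writing -1 to sched_dl_runtime_us makes global_dl_runtime() return RUNTIME_INF, so every root_domain's dl_b->bw becomes -1 and dl_bandwidth_enabled()/__dl_overflow() stop rejecting requests. Writing, say, 100000 with the default 1000000 us period recomputes new_bw = to_ratio(1 s, 100 ms) = 104857 (about 10%) for every dl_bw, but only after sched_dl_global_constraints() has checked the new value against the -rt share and against each root_domain's already allocated total_bw.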
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7f6de4316990..802188fb6338 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,8 @@
  */
 #include "sched.h"
 
+struct dl_bandwidth def_dl_bandwidth;
+
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
 {
 	return container_of(dl_se, struct task_struct, dl);
@@ -46,6 +48,27 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
 	return dl_rq->rb_leftmost == &dl_se->rb_node;
 }
 
+void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
+{
+	raw_spin_lock_init(&dl_b->dl_runtime_lock);
+	dl_b->dl_period = period;
+	dl_b->dl_runtime = runtime;
+}
+
+extern unsigned long to_ratio(u64 period, u64 runtime);
+
+void init_dl_bw(struct dl_bw *dl_b)
+{
+	raw_spin_lock_init(&dl_b->lock);
+	raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
+	if (global_dl_runtime() == RUNTIME_INF)
+		dl_b->bw = -1;
+	else
+		dl_b->bw = to_ratio(global_dl_period(), global_dl_runtime());
+	raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
+	dl_b->total_bw = 0;
+}
+
 void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
 {
 	dl_rq->rb_root = RB_ROOT;
@@ -57,6 +80,8 @@ void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
 	dl_rq->dl_nr_migratory = 0;
 	dl_rq->overloaded = 0;
 	dl_rq->pushable_dl_tasks_root = RB_ROOT;
+#else
+	init_dl_bw(&dl_rq->dl_bw);
 #endif
 }
 
@@ -359,8 +384,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
 	 * of anything below microseconds resolution is actually fiction
 	 * (but still we want to give the user that illusion >;).
 	 */
-	left = (pi_se->dl_period >> 10) * (dl_se->runtime >> 10);
-	right = ((dl_se->deadline - t) >> 10) * (pi_se->dl_runtime >> 10);
+	left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+	right = ((dl_se->deadline - t) >> DL_SCALE) *
+		(pi_se->dl_runtime >> DL_SCALE);
 
 	return dl_time_before(right, left);
 }
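Up to the DL_SCALE shifts (which only keep the two products within 64 bits), left is dl_period * runtime and right is (deadline - t) * dl_runtime, so the entity is flagged as overflowing exactly when runtime / (deadline - t) > dl_runtime / dl_period. For instance, with a 10 ms / 100 ms reservation, 4 ms of runtime left and 30 ms to the deadline, 4/30 (about 13%) exceeds the reserved 10%, right < left, and the wakeup path hands out a fresh deadline and full runtime instead of letting the task run at that higher density.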
@@ -911,8 +937,8 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
 	 * In the unlikely case current and p have the same deadline
 	 * let us try to decide what's the best thing to do...
 	 */
-	if ((s64)(p->dl.deadline - rq->curr->dl.deadline) == 0 &&
-	    !need_resched())
+	if ((p->dl.deadline == rq->curr->dl.deadline) &&
+	    !test_tsk_need_resched(rq->curr))
 		check_preempt_equal_dl(rq, p);
 #endif /* CONFIG_SMP */
 }
@@ -1000,6 +1026,14 @@ static void task_fork_dl(struct task_struct *p)
 static void task_dead_dl(struct task_struct *p)
 {
 	struct hrtimer *timer = &p->dl.dl_timer;
+	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+	/*
+	 * Since we are TASK_DEAD we won't slip out of the domain!
+	 */
+	raw_spin_lock_irq(&dl_b->lock);
+	dl_b->total_bw -= p->dl.dl_bw;
+	raw_spin_unlock_irq(&dl_b->lock);
 
 	hrtimer_cancel(timer);
 }
@@ -1226,7 +1260,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
 	BUG_ON(task_current(rq, p));
 	BUG_ON(p->nr_cpus_allowed <= 1);
 
-	BUG_ON(!p->se.on_rq);
+	BUG_ON(!p->on_rq);
 	BUG_ON(!dl_task(p));
 
 	return p;
@@ -1373,7 +1407,7 @@ static int pull_dl_task(struct rq *this_rq)
 			   dl_time_before(p->dl.deadline,
 					  this_rq->dl.earliest_dl.curr))) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!p->se.on_rq);
+			WARN_ON(!p->on_rq);
 
 			/*
 			 * Then we pull iff p has actually an earlier
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 52453a2d0a79..ad4f4fbd002e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -74,6 +74,13 @@ extern void update_cpu_load_active(struct rq *this_rq);
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
 
 /*
+ * Single value that decides SCHED_DEADLINE internal math precision.
+ * 10 -> just above 1us
+ * 9  -> just above 0.5us
+ */
+#define DL_SCALE (10)
+
+/*
  * These are the 'tuning knobs' of the scheduler:
  */
 
@@ -107,7 +114,7 @@ static inline int task_has_dl_policy(struct task_struct *p)
 	return dl_policy(p->policy);
 }
 
-static inline int dl_time_before(u64 a, u64 b)
+static inline bool dl_time_before(u64 a, u64 b)
 {
 	return (s64)(a - b) < 0;
 }
@@ -115,8 +122,8 @@ static inline int dl_time_before(u64 a, u64 b)
 /*
  * Tells if entity @a should preempt entity @b.
  */
-static inline
-int dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+static inline bool
+dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
 {
 	return dl_time_before(a->deadline, b->deadline);
 }
@@ -136,6 +143,50 @@ struct rt_bandwidth {
 	u64 rt_runtime;
 	struct hrtimer rt_period_timer;
 };
+/*
+ * To keep the bandwidth of -deadline tasks and groups under control
+ * we need some place where:
+ *  - store the maximum -deadline bandwidth of the system (the group);
+ *  - cache the fraction of that bandwidth that is currently allocated.
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", neither we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * meaning that:
+ *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
+ *  - dl_total_bw array contains, in the i-eth element, the currently
+ *    allocated bandwidth on the i-eth CPU.
+ * Moreover, groups consume bandwidth on each CPU, while tasks only
+ * consume bandwidth on the CPU they're running on.
+ * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
+ * that will be shown the next time the proc or cgroup controls will
+ * be read. It on its turn can be changed by writing on its own
+ * control.
+ */
+struct dl_bandwidth {
+	raw_spinlock_t dl_runtime_lock;
+	u64 dl_runtime;
+	u64 dl_period;
+};
+
+static inline int dl_bandwidth_enabled(void)
+{
+	return sysctl_sched_dl_runtime >= 0;
+}
+
+extern struct dl_bw *dl_bw_of(int i);
+
+struct dl_bw {
+	raw_spinlock_t lock;
+	u64 bw, total_bw;
+};
+
+static inline u64 global_dl_period(void);
+static inline u64 global_dl_runtime(void);
 
 extern struct mutex sched_domains_mutex;
 
@@ -423,6 +474,8 @@ struct dl_rq {
 	 */
 	struct rb_root pushable_dl_tasks_root;
 	struct rb_node *pushable_dl_tasks_leftmost;
+#else
+	struct dl_bw dl_bw;
 #endif
 };
 
@@ -449,6 +502,7 @@ struct root_domain {
 	 */
 	cpumask_var_t dlo_mask;
 	atomic_t dlo_count;
+	struct dl_bw dl_bw;
 
 	/*
 	 * The "RT overload" flag: it gets set if a CPU has more than
@@ -897,7 +951,18 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
+static inline u64 global_dl_period(void)
+{
+	return (u64)sysctl_sched_dl_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_dl_runtime(void)
+{
+	if (sysctl_sched_dl_runtime < 0)
+		return RUNTIME_INF;
 
+	return (u64)sysctl_sched_dl_runtime * NSEC_PER_USEC;
+}
 
 static inline int task_current(struct rq *rq, struct task_struct *p)
 {
@@ -1145,6 +1210,7 @@ extern void update_max_interval(void);
 extern void init_sched_dl_class(void);
 extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
+extern void init_sched_dl_class(void);
 
 extern void resched_task(struct task_struct *p);
 extern void resched_cpu(int cpu);
@@ -1152,8 +1218,12 @@ extern void resched_cpu(int cpu);
 extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
 
+extern struct dl_bandwidth def_dl_bandwidth;
+extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 
+unsigned long to_ratio(u64 period, u64 runtime);
+
 extern void update_idle_cpu_load(struct rq *this_rq);
 
 extern void init_task_runnable_average(struct task_struct *p);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8da99f905cf..c7fb0790ac63 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -414,6 +414,20 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rr_handler,
 	},
+	{
+		.procname	= "sched_dl_period_us",
+		.data		= &sysctl_sched_dl_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_dl_handler,
+	},
+	{
+		.procname	= "sched_dl_runtime_us",
+		.data		= &sysctl_sched_dl_runtime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= sched_dl_handler,
+	},
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",
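Taken together, the patch exposes two knobs, /proc/sys/kernel/sched_dl_period_us (default 1000000) and /proc/sys/kernel/sched_dl_runtime_us (default 50000, with -1 disabling admission control), and admission-tests every SCHED_DEADLINE request against them. A minimal user-space sketch of such a request follows; the sched_attr layout matches the one introduced by this series, and SYS_sched_setattr is assumed to be available from the installed kernel headers (the caller also needs the appropriate privileges):

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif

/* Local copy of the uapi structure consumed by sched_setattr(). */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* ns */
	uint64_t sched_deadline;	/* ns */
	uint64_t sched_period;		/* ns */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size           = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  = 10000000;		/*  10 ms */
	attr.sched_deadline = 100000000;	/* 100 ms */
	attr.sched_period   = 100000000;	/* 100 ms */

	/* SYS_sched_setattr is assumed to come from recent kernel headers. */
	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");	/* EBUSY: admission test failed */

	return 0;
}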