author     Dario Faggioli <raistlin@linux.it>    2013-11-07 08:43:45 -0500
committer  Ingo Molnar <mingo@kernel.org>        2014-01-13 07:46:42 -0500
commit     332ac17ef5bfcff4766dfdfd3b4cdf10b8f8f155 (patch)
tree       84c6663542da4310c5c555afaac88ac9b696fe4b
parent     2d3d891d3344159d5b452a645e355bbe29591e8b (diff)
sched/deadline: Add bandwidth management for SCHED_DEADLINE tasks
In order for deadline scheduling to be effective and useful, it is
important to have some method of keeping the allocation of the
available CPU bandwidth to tasks and task groups under control. This
is usually called "admission control", and if it is not performed at
all, no guarantee can be given on the actual scheduling of the
-deadline tasks.

Since RT-throttling was introduced, each task group has had a
bandwidth associated with it, calculated as a certain amount of
runtime over a period. Moreover, to make it possible to manipulate
such bandwidth, readable/writable controls have been added to both
procfs (for system wide settings) and cgroupfs (for per-group
settings).

Therefore, the same interface is being used for controlling the
bandwidth distribution to -deadline tasks and task groups, i.e., new
controls but with similar names, equivalent meaning and the same
usage paradigm are added.

However, more discussion is needed in order to figure out how we want
to manage SCHED_DEADLINE bandwidth at the task group level. Therefore,
this patch adds a less sophisticated, but actually very sensible,
mechanism to ensure that a certain utilization cap is not overcome
per each root_domain (the single rq for !SMP configurations).

Another main difference between deadline bandwidth management and
RT-throttling is that -deadline tasks have bandwidth on their own
(while -rt ones don't!), and thus we don't need a higher-level
throttling mechanism to enforce the desired bandwidth.

This patch, therefore:

 - adds system wide deadline bandwidth management by means of:
    * /proc/sys/kernel/sched_dl_runtime_us,
    * /proc/sys/kernel/sched_dl_period_us,
   that determine (i.e., runtime / period) the total bandwidth
   available on each CPU of each root_domain for -deadline tasks;

 - couples the RT and deadline bandwidth management, i.e., enforces
   that the sum of how much bandwidth is being devoted to -rt and
   -deadline tasks stays below 100%.

This means that, for a root_domain comprising M CPUs, -deadline tasks
can be created as long as the sum of their bandwidths stays below:

   M * (sched_dl_runtime_us / sched_dl_period_us)

It is also possible to disable this bandwidth management logic, and
be thus free to oversubscribe the system up to any arbitrary level.

Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-12-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
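[Editor's note: as a rough illustration of the admission test described above
(not part of the patch itself), the per-root_domain check can be sketched in
plain userspace C using the same <<20 fixed-point ratio the kernel's
to_ratio() helper uses. The CPU count, task parameters and helper below are
made-up example values, not anything taken from the kernel sources.]

#include <stdint.h>
#include <stdio.h>

/* Mirror of the kernel's to_ratio(): bandwidth as a Q20 fixed-point fraction. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
        if (period == 0)
                return 0;
        return (runtime << 20) / period;
}

int main(void)
{
        /* Hypothetical root_domain: 4 CPUs, default 5% cap (50000us / 1000000us). */
        int M = 4;
        uint64_t cap = to_ratio(1000000, 50000);   /* sched_dl_runtime_us / sched_dl_period_us */
        uint64_t total_bw = 0;

        /* Try to admit five tasks, each asking for 50ms every 1s (5% each). */
        for (int i = 0; i < 5; i++) {
                uint64_t new_bw = to_ratio(1000000, 50000);

                if (total_bw + new_bw > (uint64_t)M * cap) {
                        printf("task %d rejected (sum would exceed M * runtime/period)\n", i);
                        continue;
                }
                total_bw += new_bw;
                printf("task %d admitted, total_bw now %llu of %llu\n", i,
                       (unsigned long long)total_bw,
                       (unsigned long long)((uint64_t)M * cap));
        }
        return 0;
}

With these numbers the first four 5% tasks fit under the 4 * 5% cap and the
fifth is rejected, which is exactly the role dl_overflow() plays in the patch
below.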
 include/linux/sched.h        |   1
 include/linux/sched/sysctl.h |  13
 kernel/sched/core.c          | 441
 kernel/sched/deadline.c      |  46
 kernel/sched/sched.h         |  76
 kernel/sysctl.c              |  14
 6 files changed, 555 insertions(+), 36 deletions(-)
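[Editor's note: for context on how the new admission control surfaces to
userspace, here is a minimal, hypothetical test program, not part of this
commit. It requests a SCHED_DEADLINE reservation through the sched_setattr()
syscall introduced earlier in this patch series; a failure with EBUSY
corresponds to the dl_overflow() rejection added below. The struct sched_attr
layout and SYS_sched_setattr are assumed to match the 3.14-era uapi/glibc
headers; adjust for your toolchain.]

/* Build: gcc -o dl_admit dl_admit.c ; run as root on a 3.14+ kernel. */
#define _GNU_SOURCE
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched.h>        /* SCHED_DEADLINE */

/* Userspace copy of the attr layout documented for sched_setattr(2). */
struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;         /* nanoseconds */
        uint64_t sched_deadline;
        uint64_t sched_period;
};

static int sched_setattr(pid_t pid, const struct sched_attr *attr, unsigned int flags)
{
        return syscall(SYS_sched_setattr, pid, attr, flags);
}

int main(void)
{
        struct sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.sched_policy   = SCHED_DEADLINE;
        attr.sched_runtime  = 500ULL * 1000 * 1000;     /* 500ms every second: 50% */
        attr.sched_deadline = 1000ULL * 1000 * 1000;
        attr.sched_period   = 1000ULL * 1000 * 1000;

        if (sched_setattr(0, &attr, 0) < 0) {
                /* EBUSY means the admission control added by this patch refused the bandwidth. */
                perror("sched_setattr");
                return 1;
        }
        printf("admitted as SCHED_DEADLINE\n");
        return 0;
}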
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 13c53a99920f..a196cb7fc6f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1104,6 +1104,7 @@ struct sched_dl_entity {
 	u64 dl_runtime;		/* maximum runtime for each instance	*/
 	u64 dl_deadline;	/* relative deadline of each instance	*/
 	u64 dl_period;		/* separation of two instances (period) */
+	u64 dl_bw;		/* dl_runtime / dl_deadline		*/
 
 	/*
 	 * Actual scheduling parameters. Initialized with the values above,
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 31e0193cb0c5..8070a83dbedc 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -81,6 +81,15 @@ static inline unsigned int get_sysctl_timer_migration(void)
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
+/*
+ * control SCHED_DEADLINE reservations:
+ *
+ *  /proc/sys/kernel/sched_dl_period_us
+ *  /proc/sys/kernel/sched_dl_runtime_us
+ */
+extern unsigned int sysctl_sched_dl_period;
+extern int sysctl_sched_dl_runtime;
+
 #ifdef CONFIG_CFS_BANDWIDTH
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
@@ -99,4 +108,8 @@ extern int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+int sched_dl_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
 #endif /* _SCHED_SYSCTL_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 599ee3b11b44..c7c68e6b5c51 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,6 +296,15 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
+/*
+ * Maximum bandwidth available for all -deadline tasks and groups
+ * (if group scheduling is configured) on each CPU.
+ *
+ * default: 5%
+ */
+unsigned int sysctl_sched_dl_period = 1000000;
+int sysctl_sched_dl_runtime = 50000;
+
 
 
 /*
@@ -1856,6 +1865,111 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	return 0;
 }
 
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+	if (runtime == RUNTIME_INF)
+		return 1ULL << 20;
+
+	/*
+	 * Doing this here saves a lot of checks in all
+	 * the calling paths, and returning zero seems
+	 * safe for them anyway.
+	 */
+	if (period == 0)
+		return 0;
+
+	return div64_u64(runtime << 20, period);
+}
+
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+	return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int __dl_span_weight(struct rq *rq)
+{
+	return cpumask_weight(rq->rd->span);
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+	return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int __dl_span_weight(struct rq *rq)
+{
+	return 1;
+}
+#endif
+
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+	dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+	dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+	return dl_b->bw != -1 &&
+	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+		       const struct sched_attr *attr)
+{
+
+	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+	u64 period = attr->sched_period;
+	u64 runtime = attr->sched_runtime;
+	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+	int cpus = __dl_span_weight(task_rq(p));
+	int err = -1;
+
+	if (new_bw == p->dl.dl_bw)
+		return 0;
+
+	/*
+	 * Whether a task enters, leaves, or stays -deadline but changes
+	 * its parameters, we may need to update accordingly the total
+	 * allocated bandwidth of the container.
+	 */
+	raw_spin_lock(&dl_b->lock);
+	if (dl_policy(policy) && !task_has_dl_policy(p) &&
+	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+		__dl_add(dl_b, new_bw);
+		err = 0;
+	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
+		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+		__dl_clear(dl_b, p->dl.dl_bw);
+		__dl_add(dl_b, new_bw);
+		err = 0;
+	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+		__dl_clear(dl_b, p->dl.dl_bw);
+		err = 0;
+	}
+	raw_spin_unlock(&dl_b->lock);
+
+	return err;
+}
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
 /*
  * wake_up_new_task - wake up a newly created task for the first time.
  *
@@ -3053,6 +3167,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_deadline = attr->sched_deadline;
 	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
 	dl_se->flags = attr->sched_flags;
+	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
 	dl_se->dl_throttled = 0;
 	dl_se->dl_new = 1;
 }
@@ -3101,7 +3216,9 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
  * This function validates the new parameters of a -deadline task.
  * We ask for the deadline not being zero, and greater or equal
  * than the runtime, as well as the period of being zero or
- * greater than deadline.
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution (1us); we
+ * check sched_runtime only since it is always the smaller one.
  */
 static bool
 __checkparam_dl(const struct sched_attr *attr)
@@ -3109,7 +3226,8 @@ __checkparam_dl(const struct sched_attr *attr)
 	return attr && attr->sched_deadline != 0 &&
 	       (attr->sched_period == 0 ||
 	       (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
-	       (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0;
+	       (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
+	       attr->sched_runtime >= (2 << (DL_SCALE - 1));
 }
 
 /*
@@ -3250,8 +3368,8 @@ recheck:
 	}
 change:
 
-#ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
 		/*
 		 * Do not allow realtime tasks into groups that have no runtime
 		 * assigned.
@@ -3262,8 +3380,33 @@ change:
 			task_rq_unlock(rq, p, &flags);
 			return -EPERM;
 		}
-	}
 #endif
+#ifdef CONFIG_SMP
+		if (dl_bandwidth_enabled() && dl_policy(policy)) {
+			cpumask_t *span = rq->rd->span;
+			cpumask_t act_affinity;
+
+			/*
+			 * cpus_allowed mask is statically initialized with
+			 * CPU_MASK_ALL, span is instead dynamic. Here we
+			 * compute the "dynamic" affinity of a task.
+			 */
+			cpumask_and(&act_affinity, &p->cpus_allowed,
+				    cpu_active_mask);
+
+			/*
+			 * Don't allow tasks with an affinity mask smaller than
+			 * the entire root_domain to become SCHED_DEADLINE. We
+			 * will also fail if there's no bandwidth available.
+			 */
+			if (!cpumask_equal(&act_affinity, span) ||
+			    rq->rd->dl_bw.bw == 0) {
+				task_rq_unlock(rq, p, &flags);
+				return -EPERM;
+			}
+		}
+#endif
+	}
 
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3271,6 +3414,18 @@ change:
 		task_rq_unlock(rq, p, &flags);
 		goto recheck;
 	}
+
+	/*
+	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
+	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+	 * is available.
+	 */
+	if ((dl_policy(policy) || dl_task(p)) &&
+	    dl_overflow(p, policy, attr)) {
+		task_rq_unlock(rq, p, &flags);
+		return -EBUSY;
+	}
+
 	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
@@ -3705,6 +3860,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	if (retval)
 		goto out_unlock;
 
+	/*
+	 * Since bandwidth control happens on root_domain basis,
+	 * if admission test is enabled, we only admit -deadline
+	 * tasks allowed to run on all the CPUs in the task's
+	 * root_domain.
+	 */
+#ifdef CONFIG_SMP
+	if (task_has_dl_policy(p)) {
+		const struct cpumask *span = task_rq(p)->rd->span;
+
+		if (dl_bandwidth_enabled() &&
+		    !cpumask_equal(in_mask, span)) {
+			retval = -EBUSY;
+			goto out_unlock;
+		}
+	}
+#endif
+
 	cpuset_cpus_allowed(p, cpus_allowed);
 	cpumask_and(new_mask, in_mask, cpus_allowed);
 again:
@@ -4359,6 +4532,42 @@ out:
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 /*
+ * When dealing with a -deadline task, we have to check if moving it to
+ * a new CPU is possible or not. In fact, this is only true iff there
+ * is enough bandwidth available on such CPU, otherwise we want the
+ * whole migration procedure to fail over.
+ */
+static inline
+bool set_task_cpu_dl(struct task_struct *p, unsigned int cpu)
+{
+	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+	struct dl_bw *cpu_b = dl_bw_of(cpu);
+	int ret = 1;
+	u64 bw;
+
+	if (dl_b == cpu_b)
+		return 1;
+
+	raw_spin_lock(&dl_b->lock);
+	raw_spin_lock(&cpu_b->lock);
+
+	bw = cpu_b->bw * cpumask_weight(cpu_rq(cpu)->rd->span);
+	if (dl_bandwidth_enabled() &&
+	    bw < cpu_b->total_bw + p->dl.dl_bw) {
+		ret = 0;
+		goto unlock;
+	}
+	dl_b->total_bw -= p->dl.dl_bw;
+	cpu_b->total_bw += p->dl.dl_bw;
+
+unlock:
+	raw_spin_unlock(&cpu_b->lock);
+	raw_spin_unlock(&dl_b->lock);
+
+	return ret;
+}
+
+/*
  * Move (not current) task off this cpu, onto dest cpu. We're doing
  * this because either it can't run here any more (set_cpus_allowed()
  * away from this CPU, or CPU going down), or because we're
@@ -4390,6 +4599,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 		goto fail;
 
 	/*
+	 * If p is -deadline, proceed only if there is enough
+	 * bandwidth available on dest_cpu
+	 */
+	if (unlikely(dl_task(p)) && !set_task_cpu_dl(p, dest_cpu))
+		goto fail;
+
+	/*
 	 * If we're not on a rq, the next wake-up will ensure we're
 	 * placed properly.
 	 */
@@ -5128,6 +5344,8 @@ static int init_rootdomain(struct root_domain *rd)
 	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
 		goto free_dlo_mask;
 
+	init_dl_bw(&rd->dl_bw);
+
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_rto_mask;
 	return 0;
@@ -6557,13 +6775,15 @@ void __init sched_init(void)
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 	}
 
+	init_rt_bandwidth(&def_rt_bandwidth,
+			global_rt_period(), global_rt_runtime());
+	init_dl_bandwidth(&def_dl_bandwidth,
+			global_dl_period(), global_dl_runtime());
+
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 #endif
 
-	init_rt_bandwidth(&def_rt_bandwidth,
-			global_rt_period(), global_rt_runtime());
-
 #ifdef CONFIG_RT_GROUP_SCHED
 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
 			global_rt_period(), global_rt_runtime());
@@ -6966,16 +7186,6 @@ void sched_move_task(struct task_struct *tsk)
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
-	if (runtime == RUNTIME_INF)
-		return 1ULL << 20;
-
-	return div64_u64(runtime << 20, period);
-}
-#endif
-
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
  * Ensure that the real time constraints are schedulable.
@@ -7149,10 +7359,48 @@ static long sched_group_rt_period(struct task_group *tg)
 	do_div(rt_period_us, NSEC_PER_USEC);
 	return rt_period_us;
 }
+#endif /* CONFIG_RT_GROUP_SCHED */
 
+/*
+ * Coupling of -rt and -deadline bandwidth.
+ *
+ * Here we check if the new -rt bandwidth value is consistent
+ * with the system settings for the bandwidth available
+ * to -deadline tasks.
+ *
+ * IOW, we want to enforce that
+ *
+ *   rt_bandwidth + dl_bandwidth <= 100%
+ *
+ * is always true.
+ */
+static bool __sched_rt_dl_global_constraints(u64 rt_bw)
+{
+	unsigned long flags;
+	u64 dl_bw;
+	bool ret;
+
+	raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, flags);
+	if (global_rt_runtime() == RUNTIME_INF ||
+	    global_dl_runtime() == RUNTIME_INF) {
+		ret = true;
+		goto unlock;
+	}
+
+	dl_bw = to_ratio(def_dl_bandwidth.dl_period,
+			 def_dl_bandwidth.dl_runtime);
+
+	ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF);
+unlock:
+	raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, flags);
+
+	return ret;
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
 static int sched_rt_global_constraints(void)
 {
-	u64 runtime, period;
+	u64 runtime, period, bw;
 	int ret = 0;
 
 	if (sysctl_sched_rt_period <= 0)
@@ -7167,6 +7415,10 @@ static int sched_rt_global_constraints(void)
 	if (runtime > period && runtime != RUNTIME_INF)
 		return -EINVAL;
 
+	bw = to_ratio(period, runtime);
+	if (!__sched_rt_dl_global_constraints(bw))
+		return -EINVAL;
+
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	ret = __rt_schedulable(NULL, 0, 0);
@@ -7189,19 +7441,19 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 static int sched_rt_global_constraints(void)
 {
 	unsigned long flags;
-	int i;
+	int i, ret = 0;
+	u64 bw;
 
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
-	/*
-	 * There's always some RT tasks in the root group
-	 * -- migration, kstopmachine etc..
-	 */
-	if (sysctl_sched_rt_runtime == 0)
-		return -EBUSY;
-
 	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+	bw = to_ratio(global_rt_period(), global_rt_runtime());
+	if (!__sched_rt_dl_global_constraints(bw)) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
 
@@ -7209,12 +7461,93 @@ static int sched_rt_global_constraints(void)
 		rt_rq->rt_runtime = global_rt_runtime();
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
+unlock:
 	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 
-	return 0;
+	return ret;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+/*
+ * Coupling of -dl and -rt bandwidth.
+ *
+ * Here we check, while setting the system wide bandwidth available
+ * for -dl tasks and groups, if the new values are consistent with
+ * the system settings for the bandwidth available to -rt entities.
+ *
+ * IOW, we want to enforce that
+ *
+ *   rt_bandwidth + dl_bandwidth <= 100%
+ *
+ * is always true.
+ */
+static bool __sched_dl_rt_global_constraints(u64 dl_bw)
+{
+	u64 rt_bw;
+	bool ret;
+
+	raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock);
+	if (global_dl_runtime() == RUNTIME_INF ||
+	    global_rt_runtime() == RUNTIME_INF) {
+		ret = true;
+		goto unlock;
+	}
+
+	rt_bw = to_ratio(ktime_to_ns(def_rt_bandwidth.rt_period),
+			 def_rt_bandwidth.rt_runtime);
+
+	ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF);
+unlock:
+	raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock);
+
+	return ret;
+}
+
+static bool __sched_dl_global_constraints(u64 runtime, u64 period)
+{
+	if (!period || (runtime != RUNTIME_INF && runtime > period))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int sched_dl_global_constraints(void)
+{
+	u64 runtime = global_dl_runtime();
+	u64 period = global_dl_period();
+	u64 new_bw = to_ratio(period, runtime);
+	int ret, i;
+
+	ret = __sched_dl_global_constraints(runtime, period);
+	if (ret)
+		return ret;
+
+	if (!__sched_dl_rt_global_constraints(new_bw))
+		return -EINVAL;
+
+	/*
+	 * Here we want to check the bandwidth not being set to some
+	 * value smaller than the currently allocated bandwidth in
+	 * any of the root_domains.
+	 *
+	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+	 * cycling on root_domains... Discussion on different/better
+	 * solutions is welcome!
+	 */
+	for_each_possible_cpu(i) {
+		struct dl_bw *dl_b = dl_bw_of(i);
+
+		raw_spin_lock(&dl_b->lock);
+		if (new_bw < dl_b->total_bw) {
+			raw_spin_unlock(&dl_b->lock);
+			return -EBUSY;
+		}
+		raw_spin_unlock(&dl_b->lock);
+	}
+
+	return 0;
+}
+
 int sched_rr_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -7264,6 +7597,60 @@ int sched_rt_handler(struct ctl_table *table, int write,
 	return ret;
 }
 
+int sched_dl_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+	int old_period, old_runtime;
+	static DEFINE_MUTEX(mutex);
+	unsigned long flags;
+
+	mutex_lock(&mutex);
+	old_period = sysctl_sched_dl_period;
+	old_runtime = sysctl_sched_dl_runtime;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (!ret && write) {
+		raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock,
+				      flags);
+
+		ret = sched_dl_global_constraints();
+		if (ret) {
+			sysctl_sched_dl_period = old_period;
+			sysctl_sched_dl_runtime = old_runtime;
+		} else {
+			u64 new_bw;
+			int i;
+
+			def_dl_bandwidth.dl_period = global_dl_period();
+			def_dl_bandwidth.dl_runtime = global_dl_runtime();
+			if (global_dl_runtime() == RUNTIME_INF)
+				new_bw = -1;
+			else
+				new_bw = to_ratio(global_dl_period(),
+						  global_dl_runtime());
+			/*
+			 * FIXME: As above...
+			 */
+			for_each_possible_cpu(i) {
+				struct dl_bw *dl_b = dl_bw_of(i);
+
+				raw_spin_lock(&dl_b->lock);
+				dl_b->bw = new_bw;
+				raw_spin_unlock(&dl_b->lock);
+			}
+		}
+
+		raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock,
+					   flags);
+	}
+	mutex_unlock(&mutex);
+
+	return ret;
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7f6de4316990..802188fb6338 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,8 @@
  */
 #include "sched.h"
 
+struct dl_bandwidth def_dl_bandwidth;
+
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
 {
 	return container_of(dl_se, struct task_struct, dl);
@@ -46,6 +48,27 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
 	return dl_rq->rb_leftmost == &dl_se->rb_node;
 }
 
+void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
+{
+	raw_spin_lock_init(&dl_b->dl_runtime_lock);
+	dl_b->dl_period = period;
+	dl_b->dl_runtime = runtime;
+}
+
+extern unsigned long to_ratio(u64 period, u64 runtime);
+
+void init_dl_bw(struct dl_bw *dl_b)
+{
+	raw_spin_lock_init(&dl_b->lock);
+	raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
+	if (global_dl_runtime() == RUNTIME_INF)
+		dl_b->bw = -1;
+	else
+		dl_b->bw = to_ratio(global_dl_period(), global_dl_runtime());
+	raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
+	dl_b->total_bw = 0;
+}
+
 void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
 {
 	dl_rq->rb_root = RB_ROOT;
@@ -57,6 +80,8 @@ void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
 	dl_rq->dl_nr_migratory = 0;
 	dl_rq->overloaded = 0;
 	dl_rq->pushable_dl_tasks_root = RB_ROOT;
+#else
+	init_dl_bw(&dl_rq->dl_bw);
 #endif
 }
 
@@ -359,8 +384,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
 	 * of anything below microseconds resolution is actually fiction
 	 * (but still we want to give the user that illusion >;).
 	 */
-	left = (pi_se->dl_period >> 10) * (dl_se->runtime >> 10);
-	right = ((dl_se->deadline - t) >> 10) * (pi_se->dl_runtime >> 10);
+	left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+	right = ((dl_se->deadline - t) >> DL_SCALE) *
+		(pi_se->dl_runtime >> DL_SCALE);
 
 	return dl_time_before(right, left);
 }
@@ -911,8 +937,8 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
 	 * In the unlikely case current and p have the same deadline
 	 * let us try to decide what's the best thing to do...
 	 */
-	if ((s64)(p->dl.deadline - rq->curr->dl.deadline) == 0 &&
-	    !need_resched())
+	if ((p->dl.deadline == rq->curr->dl.deadline) &&
+	    !test_tsk_need_resched(rq->curr))
 		check_preempt_equal_dl(rq, p);
 #endif /* CONFIG_SMP */
 }
@@ -1000,6 +1026,14 @@ static void task_fork_dl(struct task_struct *p)
 static void task_dead_dl(struct task_struct *p)
 {
 	struct hrtimer *timer = &p->dl.dl_timer;
+	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+	/*
+	 * Since we are TASK_DEAD we won't slip out of the domain!
+	 */
+	raw_spin_lock_irq(&dl_b->lock);
+	dl_b->total_bw -= p->dl.dl_bw;
+	raw_spin_unlock_irq(&dl_b->lock);
 
 	hrtimer_cancel(timer);
 }
@@ -1226,7 +1260,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
 	BUG_ON(task_current(rq, p));
 	BUG_ON(p->nr_cpus_allowed <= 1);
 
-	BUG_ON(!p->se.on_rq);
+	BUG_ON(!p->on_rq);
 	BUG_ON(!dl_task(p));
 
 	return p;
@@ -1373,7 +1407,7 @@ static int pull_dl_task(struct rq *this_rq)
 		    dl_time_before(p->dl.deadline,
 				   this_rq->dl.earliest_dl.curr))) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!p->se.on_rq);
+			WARN_ON(!p->on_rq);
 
 			/*
 			 * Then we pull iff p has actually an earlier
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 52453a2d0a79..ad4f4fbd002e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -74,6 +74,13 @@ extern void update_cpu_load_active(struct rq *this_rq);
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
 
 /*
+ * Single value that decides SCHED_DEADLINE internal math precision.
+ * 10 -> just above 1us
+ * 9  -> just above 0.5us
+ */
+#define DL_SCALE (10)
+
+/*
  * These are the 'tuning knobs' of the scheduler:
  */
 
@@ -107,7 +114,7 @@ static inline int task_has_dl_policy(struct task_struct *p)
 	return dl_policy(p->policy);
 }
 
-static inline int dl_time_before(u64 a, u64 b)
+static inline bool dl_time_before(u64 a, u64 b)
 {
 	return (s64)(a - b) < 0;
 }
@@ -115,8 +122,8 @@ static inline int dl_time_before(u64 a, u64 b)
 /*
  * Tells if entity @a should preempt entity @b.
  */
-static inline
-int dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+static inline bool
+dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
 {
 	return dl_time_before(a->deadline, b->deadline);
 }
@@ -136,6 +143,50 @@ struct rt_bandwidth {
 	u64		rt_runtime;
 	struct hrtimer	rt_period_timer;
 };
+/*
+ * To keep the bandwidth of -deadline tasks and groups under control
+ * we need some place where:
+ *  - store the maximum -deadline bandwidth of the system (the group);
+ *  - cache the fraction of that bandwidth that is currently allocated.
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", neither we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * meaning that:
+ *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
+ *  - dl_total_bw array contains, in the i-th element, the currently
+ *    allocated bandwidth on the i-th CPU.
+ * Moreover, groups consume bandwidth on each CPU, while tasks only
+ * consume bandwidth on the CPU they're running on.
+ * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
+ * that will be shown the next time the proc or cgroup controls will
+ * be read. It in turn can be changed by writing on its own
+ * control.
+ */
+struct dl_bandwidth {
+	raw_spinlock_t dl_runtime_lock;
+	u64 dl_runtime;
+	u64 dl_period;
+};
+
+static inline int dl_bandwidth_enabled(void)
+{
+	return sysctl_sched_dl_runtime >= 0;
+}
+
+extern struct dl_bw *dl_bw_of(int i);
+
+struct dl_bw {
+	raw_spinlock_t lock;
+	u64 bw, total_bw;
+};
+
+static inline u64 global_dl_period(void);
+static inline u64 global_dl_runtime(void);
 
 extern struct mutex sched_domains_mutex;
 
@@ -423,6 +474,8 @@ struct dl_rq {
 	 */
 	struct rb_root pushable_dl_tasks_root;
 	struct rb_node *pushable_dl_tasks_leftmost;
+#else
+	struct dl_bw dl_bw;
 #endif
 };
 
@@ -449,6 +502,7 @@ struct root_domain {
 	 */
 	cpumask_var_t dlo_mask;
 	atomic_t dlo_count;
+	struct dl_bw dl_bw;
 
 	/*
 	 * The "RT overload" flag: it gets set if a CPU has more than
@@ -897,7 +951,18 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
+static inline u64 global_dl_period(void)
+{
+	return (u64)sysctl_sched_dl_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_dl_runtime(void)
+{
+	if (sysctl_sched_dl_runtime < 0)
+		return RUNTIME_INF;
 
+	return (u64)sysctl_sched_dl_runtime * NSEC_PER_USEC;
+}
 
 static inline int task_current(struct rq *rq, struct task_struct *p)
 {
@@ -1145,6 +1210,7 @@ extern void update_max_interval(void);
 extern void init_sched_dl_class(void);
 extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
+extern void init_sched_dl_class(void);
 
 extern void resched_task(struct task_struct *p);
 extern void resched_cpu(int cpu);
@@ -1152,8 +1218,12 @@ extern void resched_cpu(int cpu);
 extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
 
+extern struct dl_bandwidth def_dl_bandwidth;
+extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 
+unsigned long to_ratio(u64 period, u64 runtime);
+
 extern void update_idle_cpu_load(struct rq *this_rq);
 
 extern void init_task_runnable_average(struct task_struct *p);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8da99f905cf..c7fb0790ac63 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -414,6 +414,20 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rr_handler,
 	},
+	{
+		.procname	= "sched_dl_period_us",
+		.data		= &sysctl_sched_dl_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_dl_handler,
+	},
+	{
+		.procname	= "sched_dl_runtime_us",
+		.data		= &sysctl_sched_dl_runtime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= sched_dl_handler,
+	},
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",