Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 441 |
1 file changed, 414 insertions, 27 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 599ee3b11b44..c7c68e6b5c51 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,6 +296,15 @@ __read_mostly int scheduler_running; | |||
296 | */ | 296 | */ |
297 | int sysctl_sched_rt_runtime = 950000; | 297 | int sysctl_sched_rt_runtime = 950000; |
298 | 298 | ||
299 | /* | ||
300 | * Maximum bandwidth available for all -deadline tasks and groups | ||
301 | * (if group scheduling is configured) on each CPU. | ||
302 | * | ||
303 | * default: 5% | ||
304 | */ | ||
305 | unsigned int sysctl_sched_dl_period = 1000000; | ||
306 | int sysctl_sched_dl_runtime = 50000; | ||
307 | |||
299 | 308 | ||
300 | 309 | ||
301 | /* | 310 | /* |
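For reference, the defaults introduced above give the -deadline class at most 50000/1000000 = 5% of each CPU, matching the comment, while the default sysctl_sched_rt_runtime of 950000 just above it is 95% of the usual 1000000us rt period; together the two classes account for exactly 100% of the available bandwidth, which is what the rt/dl coupling checks added further down in this patch enforce.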
@@ -1856,6 +1865,111 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1856 | return 0; | 1865 | return 0; |
1857 | } | 1866 | } |
1858 | 1867 | ||
1868 | unsigned long to_ratio(u64 period, u64 runtime) | ||
1869 | { | ||
1870 | if (runtime == RUNTIME_INF) | ||
1871 | return 1ULL << 20; | ||
1872 | |||
1873 | /* | ||
1874 | * Doing this here saves a lot of checks in all | ||
1875 | * the calling paths, and returning zero seems | ||
1876 | * safe for them anyway. | ||
1877 | */ | ||
1878 | if (period == 0) | ||
1879 | return 0; | ||
1880 | |||
1881 | return div64_u64(runtime << 20, period); | ||
1882 | } | ||
1883 | |||
1884 | #ifdef CONFIG_SMP | ||
1885 | inline struct dl_bw *dl_bw_of(int i) | ||
1886 | { | ||
1887 | return &cpu_rq(i)->rd->dl_bw; | ||
1888 | } | ||
1889 | |||
1890 | static inline int __dl_span_weight(struct rq *rq) | ||
1891 | { | ||
1892 | return cpumask_weight(rq->rd->span); | ||
1893 | } | ||
1894 | #else | ||
1895 | inline struct dl_bw *dl_bw_of(int i) | ||
1896 | { | ||
1897 | return &cpu_rq(i)->dl.dl_bw; | ||
1898 | } | ||
1899 | |||
1900 | static inline int __dl_span_weight(struct rq *rq) | ||
1901 | { | ||
1902 | return 1; | ||
1903 | } | ||
1904 | #endif | ||
1905 | |||
1906 | static inline | ||
1907 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
1908 | { | ||
1909 | dl_b->total_bw -= tsk_bw; | ||
1910 | } | ||
1911 | |||
1912 | static inline | ||
1913 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
1914 | { | ||
1915 | dl_b->total_bw += tsk_bw; | ||
1916 | } | ||
1917 | |||
1918 | static inline | ||
1919 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
1920 | { | ||
1921 | return dl_b->bw != -1 && | ||
1922 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
1923 | } | ||
1924 | |||
1925 | /* | ||
1926 | * We must be sure that accepting a new task (or changing the parameters | ||
1927 | * of an existing one) is consistent with the bandwidth constraints. | ||
1928 | * If so, this function also updates the currently allocated bandwidth | ||
1929 | * to reflect the new situation. | ||
1930 | * | ||
1931 | * This function is called while holding p's rq->lock. | ||
1932 | */ | ||
1933 | static int dl_overflow(struct task_struct *p, int policy, | ||
1934 | const struct sched_attr *attr) | ||
1935 | { | ||
1936 | |||
1937 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
1938 | u64 period = attr->sched_period; | ||
1939 | u64 runtime = attr->sched_runtime; | ||
1940 | u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; | ||
1941 | int cpus = __dl_span_weight(task_rq(p)); | ||
1942 | int err = -1; | ||
1943 | |||
1944 | if (new_bw == p->dl.dl_bw) | ||
1945 | return 0; | ||
1946 | |||
1947 | /* | ||
1948 | * Whether a task enters, leaves, or stays -deadline but changes | ||
1949 | * its parameters, we may need to update the total allocated | ||
1950 | * bandwidth of the container accordingly. | ||
1951 | */ | ||
1952 | raw_spin_lock(&dl_b->lock); | ||
1953 | if (dl_policy(policy) && !task_has_dl_policy(p) && | ||
1954 | !__dl_overflow(dl_b, cpus, 0, new_bw)) { | ||
1955 | __dl_add(dl_b, new_bw); | ||
1956 | err = 0; | ||
1957 | } else if (dl_policy(policy) && task_has_dl_policy(p) && | ||
1958 | !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { | ||
1959 | __dl_clear(dl_b, p->dl.dl_bw); | ||
1960 | __dl_add(dl_b, new_bw); | ||
1961 | err = 0; | ||
1962 | } else if (!dl_policy(policy) && task_has_dl_policy(p)) { | ||
1963 | __dl_clear(dl_b, p->dl.dl_bw); | ||
1964 | err = 0; | ||
1965 | } | ||
1966 | raw_spin_unlock(&dl_b->lock); | ||
1967 | |||
1968 | return err; | ||
1969 | } | ||
1970 | |||
1971 | extern void init_dl_bw(struct dl_bw *dl_b); | ||
1972 | |||
1859 | /* | 1973 | /* |
1860 | * wake_up_new_task - wake up a newly created task for the first time. | 1974 | * wake_up_new_task - wake up a newly created task for the first time. |
1861 | * | 1975 | * |
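The helpers above do all of the admission arithmetic in a 20-bit fixed-point format, where 1 << 20 represents 100% of a CPU. A minimal userspace sketch of the same arithmetic, handy for checking parameter choices by hand (the names below are illustrative, not kernel API):

#include <stdio.h>
#include <stdint.h>

/* Mirror of to_ratio(): utilization = runtime/period scaled by 2^20. */
static uint64_t ratio(uint64_t period, uint64_t runtime)
{
	if (period == 0)
		return 0;
	return (runtime << 20) / period;
}

int main(void)
{
	uint64_t cap  = ratio(1000000, 50000);          /* default per-CPU cap: 5%      */
	uint64_t task = ratio(100000000, 10000000);     /* candidate task: 10ms/100ms   */
	int cpus = 4;                                   /* CPUs in the root_domain      */
	uint64_t total = 0;                             /* bandwidth already allocated  */

	/* Same shape as __dl_overflow(): admit only if the new total
	 * utilization stays within cap * cpus. */
	int admit = (total + task) <= cap * cpus;
	printf("cap=%llu task=%llu admit=%d\n",
	       (unsigned long long)cap, (unsigned long long)task, admit);
	return 0;
}

With the defaults this prints cap=52428 task=104857 admit=1, i.e. a 10% task fits comfortably in a 4-CPU root_domain capped at 5% per CPU.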
@@ -3053,6 +3167,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) | |||
3053 | dl_se->dl_deadline = attr->sched_deadline; | 3167 | dl_se->dl_deadline = attr->sched_deadline; |
3054 | dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; | 3168 | dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; |
3055 | dl_se->flags = attr->sched_flags; | 3169 | dl_se->flags = attr->sched_flags; |
3170 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); | ||
3056 | dl_se->dl_throttled = 0; | 3171 | dl_se->dl_throttled = 0; |
3057 | dl_se->dl_new = 1; | 3172 | dl_se->dl_new = 1; |
3058 | } | 3173 | } |
@@ -3101,7 +3216,9 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) | |||
3101 | * This function validates the new parameters of a -deadline task. | 3216 | * This function validates the new parameters of a -deadline task. |
3102 | * We ask for the deadline not being zero, and greater or equal | 3217 | * We ask for the deadline not being zero, and greater or equal |
3103 | * than the runtime, as well as the period of being zero or | 3218 | * than the runtime, as well as the period of being zero or |
3104 | * greater than deadline. | 3219 | * greater than deadline. Furthermore, we have to be sure that |
3220 | * user parameters are above the internal resolution (1us); we | ||
3221 | * check sched_runtime only since it is always the smaller one. | ||
3105 | */ | 3222 | */ |
3106 | static bool | 3223 | static bool |
3107 | __checkparam_dl(const struct sched_attr *attr) | 3224 | __checkparam_dl(const struct sched_attr *attr) |
@@ -3109,7 +3226,8 @@ __checkparam_dl(const struct sched_attr *attr) | |||
3109 | return attr && attr->sched_deadline != 0 && | 3226 | return attr && attr->sched_deadline != 0 && |
3110 | (attr->sched_period == 0 || | 3227 | (attr->sched_period == 0 || |
3111 | (s64)(attr->sched_period - attr->sched_deadline) >= 0) && | 3228 | (s64)(attr->sched_period - attr->sched_deadline) >= 0) && |
3112 | (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0; | 3229 | (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && |
3230 | attr->sched_runtime >= (2 << (DL_SCALE - 1)); | ||
3113 | } | 3231 | } |
3114 | 3232 | ||
3115 | /* | 3233 | /* |
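Restated outside the kernel, the constraint is 0 < runtime <= deadline <= period (with period == 0 meaning "use the deadline"), plus the roughly 1us floor on the runtime. A sketch, assuming DL_SCALE is 10 as in the mainline scheduler headers, so the floor works out to 2 << 9 = 1024ns:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10	/* assumed value; the real constant is defined in the scheduler headers */

/* Mirrors __checkparam_dl() for experimenting with parameter choices (all values in ns). */
static bool checkparam_dl(uint64_t runtime, uint64_t deadline, uint64_t period)
{
	return deadline != 0 &&
	       (period == 0 || (int64_t)(period - deadline) >= 0) &&
	       (int64_t)(deadline - runtime) >= 0 &&
	       runtime >= (2 << (DL_SCALE - 1));
}

int main(void)
{
	/* 10ms runtime, 30ms deadline, 100ms period: accepted. */
	printf("%d\n", checkparam_dl(10000000, 30000000, 100000000));
	/* 500ns runtime is below the 1us resolution: rejected. */
	printf("%d\n", checkparam_dl(500, 1000000, 0));
	return 0;
}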
@@ -3250,8 +3368,8 @@ recheck: | |||
3250 | } | 3368 | } |
3251 | change: | 3369 | change: |
3252 | 3370 | ||
3253 | #ifdef CONFIG_RT_GROUP_SCHED | ||
3254 | if (user) { | 3371 | if (user) { |
3372 | #ifdef CONFIG_RT_GROUP_SCHED | ||
3255 | /* | 3373 | /* |
3256 | * Do not allow realtime tasks into groups that have no runtime | 3374 | * Do not allow realtime tasks into groups that have no runtime |
3257 | * assigned. | 3375 | * assigned. |
@@ -3262,8 +3380,33 @@ change: | |||
3262 | task_rq_unlock(rq, p, &flags); | 3380 | task_rq_unlock(rq, p, &flags); |
3263 | return -EPERM; | 3381 | return -EPERM; |
3264 | } | 3382 | } |
3265 | } | ||
3266 | #endif | 3383 | #endif |
3384 | #ifdef CONFIG_SMP | ||
3385 | if (dl_bandwidth_enabled() && dl_policy(policy)) { | ||
3386 | cpumask_t *span = rq->rd->span; | ||
3387 | cpumask_t act_affinity; | ||
3388 | |||
3389 | /* | ||
3390 | * The cpus_allowed mask is statically initialized to | ||
3391 | * CPU_MASK_ALL, while span is dynamic. Here we | ||
3392 | * compute the "dynamic" affinity of a task. | ||
3393 | */ | ||
3394 | cpumask_and(&act_affinity, &p->cpus_allowed, | ||
3395 | cpu_active_mask); | ||
3396 | |||
3397 | /* | ||
3398 | * Don't allow tasks with an affinity mask smaller than | ||
3399 | * the entire root_domain to become SCHED_DEADLINE. We | ||
3400 | * will also fail if there's no bandwidth available. | ||
3401 | */ | ||
3402 | if (!cpumask_equal(&act_affinity, span) || | ||
3403 | rq->rd->dl_bw.bw == 0) { | ||
3404 | task_rq_unlock(rq, p, &flags); | ||
3405 | return -EPERM; | ||
3406 | } | ||
3407 | } | ||
3408 | #endif | ||
3409 | } | ||
3267 | 3410 | ||
3268 | /* recheck policy now with rq lock held */ | 3411 | /* recheck policy now with rq lock held */ |
3269 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 3412 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
@@ -3271,6 +3414,18 @@ change: | |||
3271 | task_rq_unlock(rq, p, &flags); | 3414 | task_rq_unlock(rq, p, &flags); |
3272 | goto recheck; | 3415 | goto recheck; |
3273 | } | 3416 | } |
3417 | |||
3418 | /* | ||
3419 | * If setscheduling to SCHED_DEADLINE (or changing the parameters | ||
3420 | * of a SCHED_DEADLINE task) we need to check if enough bandwidth | ||
3421 | * is available. | ||
3422 | */ | ||
3423 | if ((dl_policy(policy) || dl_task(p)) && | ||
3424 | dl_overflow(p, policy, attr)) { | ||
3425 | task_rq_unlock(rq, p, &flags); | ||
3426 | return -EBUSY; | ||
3427 | } | ||
3428 | |||
3274 | on_rq = p->on_rq; | 3429 | on_rq = p->on_rq; |
3275 | running = task_current(rq, p); | 3430 | running = task_current(rq, p); |
3276 | if (on_rq) | 3431 | if (on_rq) |
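From userspace, the new failure modes can be observed through the sched_setattr() syscall introduced alongside this series (a sketch; there is no glibc wrapper, and the struct layout below is the one assumed by this patch set): EPERM if the task's affinity is narrower than the whole root_domain or the caller lacks the needed privileges, EBUSY if dl_overflow() finds that the requested runtime/period does not fit in the remaining bandwidth.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif

/* Layout assumed to match the kernel's struct sched_attr from this series. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* ns */
	uint64_t sched_deadline;	/* ns */
	uint64_t sched_period;		/* ns */
};

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	= 10 * 1000 * 1000,	/* 10ms of runtime...       */
		.sched_deadline	= 30 * 1000 * 1000,	/* ...within a 30ms deadline */
		.sched_period	= 100 * 1000 * 1000,	/* ...every 100ms            */
	};

	/* __NR_sched_setattr is assumed to be wired up by the same series. */
	if (syscall(__NR_sched_setattr, 0, &attr, 0) < 0)
		perror("sched_setattr");	/* EBUSY: no bandwidth; EPERM: affinity/privileges */
	return 0;
}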
@@ -3705,6 +3860,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
3705 | if (retval) | 3860 | if (retval) |
3706 | goto out_unlock; | 3861 | goto out_unlock; |
3707 | 3862 | ||
3863 | /* | ||
3864 | * Since bandwidth control happens on a per-root_domain basis, | ||
3865 | * if the admission test is enabled, we only admit -deadline | ||
3866 | * tasks that are allowed to run on all the CPUs in the task's | ||
3867 | * root_domain. | ||
3868 | */ | ||
3869 | #ifdef CONFIG_SMP | ||
3870 | if (task_has_dl_policy(p)) { | ||
3871 | const struct cpumask *span = task_rq(p)->rd->span; | ||
3872 | |||
3873 | if (dl_bandwidth_enabled() && | ||
3874 | !cpumask_equal(in_mask, span)) { | ||
3875 | retval = -EBUSY; | ||
3876 | goto out_unlock; | ||
3877 | } | ||
3878 | } | ||
3879 | #endif | ||
3880 | |||
3708 | cpuset_cpus_allowed(p, cpus_allowed); | 3881 | cpuset_cpus_allowed(p, cpus_allowed); |
3709 | cpumask_and(new_mask, in_mask, cpus_allowed); | 3882 | cpumask_and(new_mask, in_mask, cpus_allowed); |
3710 | again: | 3883 | again: |
@@ -4359,6 +4532,42 @@ out: | |||
4359 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); | 4532 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); |
4360 | 4533 | ||
4361 | /* | 4534 | /* |
4535 | * When dealing with a -deadline task, we have to check if moving it to | ||
4536 | * a new CPU is possible or not. In fact, the move is only possible if | ||
4537 | * there is enough bandwidth available on that CPU; otherwise we want | ||
4538 | * the whole migration procedure to fail. | ||
4539 | */ | ||
4540 | static inline | ||
4541 | bool set_task_cpu_dl(struct task_struct *p, unsigned int cpu) | ||
4542 | { | ||
4543 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
4544 | struct dl_bw *cpu_b = dl_bw_of(cpu); | ||
4545 | int ret = 1; | ||
4546 | u64 bw; | ||
4547 | |||
4548 | if (dl_b == cpu_b) | ||
4549 | return 1; | ||
4550 | |||
4551 | raw_spin_lock(&dl_b->lock); | ||
4552 | raw_spin_lock(&cpu_b->lock); | ||
4553 | |||
4554 | bw = cpu_b->bw * cpumask_weight(cpu_rq(cpu)->rd->span); | ||
4555 | if (dl_bandwidth_enabled() && | ||
4556 | bw < cpu_b->total_bw + p->dl.dl_bw) { | ||
4557 | ret = 0; | ||
4558 | goto unlock; | ||
4559 | } | ||
4560 | dl_b->total_bw -= p->dl.dl_bw; | ||
4561 | cpu_b->total_bw += p->dl.dl_bw; | ||
4562 | |||
4563 | unlock: | ||
4564 | raw_spin_unlock(&cpu_b->lock); | ||
4565 | raw_spin_unlock(&dl_b->lock); | ||
4566 | |||
4567 | return ret; | ||
4568 | } | ||
4569 | |||
4570 | /* | ||
4362 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 4571 | * Move (not current) task off this cpu, onto dest cpu. We're doing |
4363 | * this because either it can't run here any more (set_cpus_allowed() | 4572 | * this because either it can't run here any more (set_cpus_allowed() |
4364 | * away from this CPU, or CPU going down), or because we're | 4573 | * away from this CPU, or CPU going down), or because we're |
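In concrete terms, with the 5% default cap a destination root_domain spanning 4 CPUs offers bw = 52428 * 4 = 209712 units of 2^-20 bandwidth; set_task_cpu_dl() lets the migration proceed only if the destination's total_bw plus the task's dl_bw still fits under that figure, and then moves the task's share from the source accounting to the destination's.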
@@ -4390,6 +4599,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
4390 | goto fail; | 4599 | goto fail; |
4391 | 4600 | ||
4392 | /* | 4601 | /* |
4602 | * If p is -deadline, proceed only if there is enough | ||
4603 | * bandwidth available on dest_cpu | ||
4604 | */ | ||
4605 | if (unlikely(dl_task(p)) && !set_task_cpu_dl(p, dest_cpu)) | ||
4606 | goto fail; | ||
4607 | |||
4608 | /* | ||
4393 | * If we're not on a rq, the next wake-up will ensure we're | 4609 | * If we're not on a rq, the next wake-up will ensure we're |
4394 | * placed properly. | 4610 | * placed properly. |
4395 | */ | 4611 | */ |
@@ -5128,6 +5344,8 @@ static int init_rootdomain(struct root_domain *rd) | |||
5128 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | 5344 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) |
5129 | goto free_dlo_mask; | 5345 | goto free_dlo_mask; |
5130 | 5346 | ||
5347 | init_dl_bw(&rd->dl_bw); | ||
5348 | |||
5131 | if (cpupri_init(&rd->cpupri) != 0) | 5349 | if (cpupri_init(&rd->cpupri) != 0) |
5132 | goto free_rto_mask; | 5350 | goto free_rto_mask; |
5133 | return 0; | 5351 | return 0; |
@@ -6557,13 +6775,15 @@ void __init sched_init(void) | |||
6557 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 6775 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
6558 | } | 6776 | } |
6559 | 6777 | ||
6778 | init_rt_bandwidth(&def_rt_bandwidth, | ||
6779 | global_rt_period(), global_rt_runtime()); | ||
6780 | init_dl_bandwidth(&def_dl_bandwidth, | ||
6781 | global_dl_period(), global_dl_runtime()); | ||
6782 | |||
6560 | #ifdef CONFIG_SMP | 6783 | #ifdef CONFIG_SMP |
6561 | init_defrootdomain(); | 6784 | init_defrootdomain(); |
6562 | #endif | 6785 | #endif |
6563 | 6786 | ||
6564 | init_rt_bandwidth(&def_rt_bandwidth, | ||
6565 | global_rt_period(), global_rt_runtime()); | ||
6566 | |||
6567 | #ifdef CONFIG_RT_GROUP_SCHED | 6787 | #ifdef CONFIG_RT_GROUP_SCHED |
6568 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 6788 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
6569 | global_rt_period(), global_rt_runtime()); | 6789 | global_rt_period(), global_rt_runtime()); |
@@ -6966,16 +7186,6 @@ void sched_move_task(struct task_struct *tsk) | |||
6966 | } | 7186 | } |
6967 | #endif /* CONFIG_CGROUP_SCHED */ | 7187 | #endif /* CONFIG_CGROUP_SCHED */ |
6968 | 7188 | ||
6969 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) | ||
6970 | static unsigned long to_ratio(u64 period, u64 runtime) | ||
6971 | { | ||
6972 | if (runtime == RUNTIME_INF) | ||
6973 | return 1ULL << 20; | ||
6974 | |||
6975 | return div64_u64(runtime << 20, period); | ||
6976 | } | ||
6977 | #endif | ||
6978 | |||
6979 | #ifdef CONFIG_RT_GROUP_SCHED | 7189 | #ifdef CONFIG_RT_GROUP_SCHED |
6980 | /* | 7190 | /* |
6981 | * Ensure that the real time constraints are schedulable. | 7191 | * Ensure that the real time constraints are schedulable. |
@@ -7149,10 +7359,48 @@ static long sched_group_rt_period(struct task_group *tg) | |||
7149 | do_div(rt_period_us, NSEC_PER_USEC); | 7359 | do_div(rt_period_us, NSEC_PER_USEC); |
7150 | return rt_period_us; | 7360 | return rt_period_us; |
7151 | } | 7361 | } |
7362 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
7152 | 7363 | ||
7364 | /* | ||
7365 | * Coupling of -rt and -deadline bandwidth. | ||
7366 | * | ||
7367 | * Here we check if the new -rt bandwidth value is consistent | ||
7368 | * with the system settings for the bandwidth available | ||
7369 | * to -deadline tasks. | ||
7370 | * | ||
7371 | * IOW, we want to enforce that | ||
7372 | * | ||
7373 | * rt_bandwidth + dl_bandwidth <= 100% | ||
7374 | * | ||
7375 | * is always true. | ||
7376 | */ | ||
7377 | static bool __sched_rt_dl_global_constraints(u64 rt_bw) | ||
7378 | { | ||
7379 | unsigned long flags; | ||
7380 | u64 dl_bw; | ||
7381 | bool ret; | ||
7382 | |||
7383 | raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, flags); | ||
7384 | if (global_rt_runtime() == RUNTIME_INF || | ||
7385 | global_dl_runtime() == RUNTIME_INF) { | ||
7386 | ret = true; | ||
7387 | goto unlock; | ||
7388 | } | ||
7389 | |||
7390 | dl_bw = to_ratio(def_dl_bandwidth.dl_period, | ||
7391 | def_dl_bandwidth.dl_runtime); | ||
7392 | |||
7393 | ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF); | ||
7394 | unlock: | ||
7395 | raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, flags); | ||
7396 | |||
7397 | return ret; | ||
7398 | } | ||
7399 | |||
7400 | #ifdef CONFIG_RT_GROUP_SCHED | ||
7153 | static int sched_rt_global_constraints(void) | 7401 | static int sched_rt_global_constraints(void) |
7154 | { | 7402 | { |
7155 | u64 runtime, period; | 7403 | u64 runtime, period, bw; |
7156 | int ret = 0; | 7404 | int ret = 0; |
7157 | 7405 | ||
7158 | if (sysctl_sched_rt_period <= 0) | 7406 | if (sysctl_sched_rt_period <= 0) |
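With the defaults in this file, the check works out to to_ratio(1000000, 950000) = 996147 for -rt plus to_ratio(1000000, 50000) = 52428 for -deadline, a total of 1048575, which is just within to_ratio(RUNTIME_INF, RUNTIME_INF) = 1 << 20 = 1048576; raising either class beyond that combined budget is rejected here or by the symmetric check further down.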
@@ -7167,6 +7415,10 @@ static int sched_rt_global_constraints(void) | |||
7167 | if (runtime > period && runtime != RUNTIME_INF) | 7415 | if (runtime > period && runtime != RUNTIME_INF) |
7168 | return -EINVAL; | 7416 | return -EINVAL; |
7169 | 7417 | ||
7418 | bw = to_ratio(period, runtime); | ||
7419 | if (!__sched_rt_dl_global_constraints(bw)) | ||
7420 | return -EINVAL; | ||
7421 | |||
7170 | mutex_lock(&rt_constraints_mutex); | 7422 | mutex_lock(&rt_constraints_mutex); |
7171 | read_lock(&tasklist_lock); | 7423 | read_lock(&tasklist_lock); |
7172 | ret = __rt_schedulable(NULL, 0, 0); | 7424 | ret = __rt_schedulable(NULL, 0, 0); |
@@ -7189,19 +7441,19 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) | |||
7189 | static int sched_rt_global_constraints(void) | 7441 | static int sched_rt_global_constraints(void) |
7190 | { | 7442 | { |
7191 | unsigned long flags; | 7443 | unsigned long flags; |
7192 | int i; | 7444 | int i, ret = 0; |
7445 | u64 bw; | ||
7193 | 7446 | ||
7194 | if (sysctl_sched_rt_period <= 0) | 7447 | if (sysctl_sched_rt_period <= 0) |
7195 | return -EINVAL; | 7448 | return -EINVAL; |
7196 | 7449 | ||
7197 | /* | ||
7198 | * There's always some RT tasks in the root group | ||
7199 | * -- migration, kstopmachine etc.. | ||
7200 | */ | ||
7201 | if (sysctl_sched_rt_runtime == 0) | ||
7202 | return -EBUSY; | ||
7203 | |||
7204 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 7450 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
7451 | bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
7452 | if (!__sched_rt_dl_global_constraints(bw)) { | ||
7453 | ret = -EINVAL; | ||
7454 | goto unlock; | ||
7455 | } | ||
7456 | |||
7205 | for_each_possible_cpu(i) { | 7457 | for_each_possible_cpu(i) { |
7206 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | 7458 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; |
7207 | 7459 | ||
@@ -7209,12 +7461,93 @@ static int sched_rt_global_constraints(void) | |||
7209 | rt_rq->rt_runtime = global_rt_runtime(); | 7461 | rt_rq->rt_runtime = global_rt_runtime(); |
7210 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 7462 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
7211 | } | 7463 | } |
7464 | unlock: | ||
7212 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | 7465 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); |
7213 | 7466 | ||
7214 | return 0; | 7467 | return ret; |
7215 | } | 7468 | } |
7216 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7469 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7217 | 7470 | ||
7471 | /* | ||
7472 | * Coupling of -dl and -rt bandwidth. | ||
7473 | * | ||
7474 | * Here we check, while setting the system wide bandwidth available | ||
7475 | * for -dl tasks and groups, if the new values are consistent with | ||
7476 | * the system settings for the bandwidth available to -rt entities. | ||
7477 | * | ||
7478 | * IOW, we want to enforce that | ||
7479 | * | ||
7480 | * rt_bandwidth + dl_bandwidth <= 100% | ||
7481 | * | ||
7482 | * is always true. | ||
7483 | */ | ||
7484 | static bool __sched_dl_rt_global_constraints(u64 dl_bw) | ||
7485 | { | ||
7486 | u64 rt_bw; | ||
7487 | bool ret; | ||
7488 | |||
7489 | raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock); | ||
7490 | if (global_dl_runtime() == RUNTIME_INF || | ||
7491 | global_rt_runtime() == RUNTIME_INF) { | ||
7492 | ret = true; | ||
7493 | goto unlock; | ||
7494 | } | ||
7495 | |||
7496 | rt_bw = to_ratio(ktime_to_ns(def_rt_bandwidth.rt_period), | ||
7497 | def_rt_bandwidth.rt_runtime); | ||
7498 | |||
7499 | ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF); | ||
7500 | unlock: | ||
7501 | raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock); | ||
7502 | |||
7503 | return ret; | ||
7504 | } | ||
7505 | |||
7506 | static int __sched_dl_global_constraints(u64 runtime, u64 period) | ||
7507 | { | ||
7508 | if (!period || (runtime != RUNTIME_INF && runtime > period)) | ||
7509 | return -EINVAL; | ||
7510 | |||
7511 | return 0; | ||
7512 | } | ||
7513 | |||
7514 | static int sched_dl_global_constraints(void) | ||
7515 | { | ||
7516 | u64 runtime = global_dl_runtime(); | ||
7517 | u64 period = global_dl_period(); | ||
7518 | u64 new_bw = to_ratio(period, runtime); | ||
7519 | int ret, i; | ||
7520 | |||
7521 | ret = __sched_dl_global_constraints(runtime, period); | ||
7522 | if (ret) | ||
7523 | return ret; | ||
7524 | |||
7525 | if (!__sched_dl_rt_global_constraints(new_bw)) | ||
7526 | return -EINVAL; | ||
7527 | |||
7528 | /* | ||
7529 | * Here we want to check that the bandwidth is not being set to a | ||
7530 | * value smaller than the currently allocated bandwidth in | ||
7531 | * any of the root_domains. | ||
7532 | * | ||
7533 | * FIXME: Cycling over all the CPUs is overkill, but simpler than | ||
7534 | * cycling over the root_domains... Discussion on different/better | ||
7535 | * solutions is welcome! | ||
7536 | */ | ||
7537 | for_each_possible_cpu(i) { | ||
7538 | struct dl_bw *dl_b = dl_bw_of(i); | ||
7539 | |||
7540 | raw_spin_lock(&dl_b->lock); | ||
7541 | if (new_bw < dl_b->total_bw) { | ||
7542 | raw_spin_unlock(&dl_b->lock); | ||
7543 | return -EBUSY; | ||
7544 | } | ||
7545 | raw_spin_unlock(&dl_b->lock); | ||
7546 | } | ||
7547 | |||
7548 | return 0; | ||
7549 | } | ||
7550 | |||
7218 | int sched_rr_handler(struct ctl_table *table, int write, | 7551 | int sched_rr_handler(struct ctl_table *table, int write, |
7219 | void __user *buffer, size_t *lenp, | 7552 | void __user *buffer, size_t *lenp, |
7220 | loff_t *ppos) | 7553 | loff_t *ppos) |
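As a worked example of this constraint, lowering sysctl_sched_dl_runtime from 50000 to 10000 (1% of the 1000000us period) drops new_bw from 52428 to 10485; if any root_domain's dl_b->total_bw already exceeds that, the write fails with -EBUSY and the handler below restores the previous sysctl values.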
@@ -7264,6 +7597,60 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
7264 | return ret; | 7597 | return ret; |
7265 | } | 7598 | } |
7266 | 7599 | ||
7600 | int sched_dl_handler(struct ctl_table *table, int write, | ||
7601 | void __user *buffer, size_t *lenp, | ||
7602 | loff_t *ppos) | ||
7603 | { | ||
7604 | int ret; | ||
7605 | int old_period, old_runtime; | ||
7606 | static DEFINE_MUTEX(mutex); | ||
7607 | unsigned long flags; | ||
7608 | |||
7609 | mutex_lock(&mutex); | ||
7610 | old_period = sysctl_sched_dl_period; | ||
7611 | old_runtime = sysctl_sched_dl_runtime; | ||
7612 | |||
7613 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
7614 | |||
7615 | if (!ret && write) { | ||
7616 | raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, | ||
7617 | flags); | ||
7618 | |||
7619 | ret = sched_dl_global_constraints(); | ||
7620 | if (ret) { | ||
7621 | sysctl_sched_dl_period = old_period; | ||
7622 | sysctl_sched_dl_runtime = old_runtime; | ||
7623 | } else { | ||
7624 | u64 new_bw; | ||
7625 | int i; | ||
7626 | |||
7627 | def_dl_bandwidth.dl_period = global_dl_period(); | ||
7628 | def_dl_bandwidth.dl_runtime = global_dl_runtime(); | ||
7629 | if (global_dl_runtime() == RUNTIME_INF) | ||
7630 | new_bw = -1; | ||
7631 | else | ||
7632 | new_bw = to_ratio(global_dl_period(), | ||
7633 | global_dl_runtime()); | ||
7634 | /* | ||
7635 | * FIXME: As above... | ||
7636 | */ | ||
7637 | for_each_possible_cpu(i) { | ||
7638 | struct dl_bw *dl_b = dl_bw_of(i); | ||
7639 | |||
7640 | raw_spin_lock(&dl_b->lock); | ||
7641 | dl_b->bw = new_bw; | ||
7642 | raw_spin_unlock(&dl_b->lock); | ||
7643 | } | ||
7644 | } | ||
7645 | |||
7646 | raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, | ||
7647 | flags); | ||
7648 | } | ||
7649 | mutex_unlock(&mutex); | ||
7650 | |||
7651 | return ret; | ||
7652 | } | ||
7653 | |||
7267 | #ifdef CONFIG_CGROUP_SCHED | 7654 | #ifdef CONFIG_CGROUP_SCHED |
7268 | 7655 | ||
7269 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) | 7656 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |