author     Juri Lelli <juri.lelli@arm.com>    2014-09-19 05:22:40 -0400
committer  Ingo Molnar <mingo@kernel.org>     2014-10-28 05:47:58 -0400
commit     7f51412a415d87ea8598d14722fb31e4f5701257 (patch)
tree       1b3f90cb539185177143a1bf37e5f4f8d86b64bb /kernel/sched
parent     d9aade7ae1d283097a3f626790e7c325a5c69007 (diff)
sched/deadline: Fix bandwidth check/update when migrating tasks between exclusive cpusets
Exclusive cpusets are the only way users can restrict SCHED_DEADLINE tasks' affinity (performing what is commonly called clustered scheduling). Unfortunately, this is currently broken for two reasons:

 - No check is performed when the user tries to attach a task to an
   exclusive cpuset (recall that exclusive cpusets have an associated
   maximum allowed bandwidth).

 - Bandwidths of source and destination cpusets are not correctly
   updated after a task is migrated between them.

This patch fixes both things at once, as they are opposite faces of the same coin.

The check is performed in cpuset_can_attach(), as there aren't any points of failure after that function. The update is split in two halves: we first reserve bandwidth in the destination cpuset, after we pass the check in cpuset_can_attach(), and we then release bandwidth from the source cpuset when the task's affinity is actually changed. Even if there can be time windows when sched_setattr() may erroneously fail in the source cpuset, we are fine with it, as we can't perform an atomic update of both cpusets at once.

Reported-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Reported-by: Vincent Legout <vincent@legout.info>
Signed-off-by: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dario Faggioli <raistlin@linux.it>
Cc: Michael Trimarchi <michael@amarulasolutions.com>
Cc: Fabio Checconi <fchecconi@gmail.com>
Cc: luca.abeni@unitn.it
Cc: Li Zefan <lizefan@huawei.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: cgroups@vger.kernel.org
Link: http://lkml.kernel.org/r/1411118561-26323-3-git-send-email-juri.lelli@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
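The check the commit message refers to boils down to __dl_overflow() (moved into kernel/sched/sched.h below): an exclusive cpuset spanning cpus CPUs with a per-CPU cap bw can hold at most bw * cpus of total SCHED_DEADLINE bandwidth. The following user-space sketch replays that arithmetic; the fixed-point scale (1 << 20 per CPU, as to_ratio() produces), the 95% cap, and the 0.4-utilization tasks are illustrative assumptions, not values taken from this patch.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/* Mirror of struct dl_bw from kernel/sched/sched.h (bw == -1 means "no limit"). */
struct dl_bw {
	u64 bw, total_bw;
};

/* Same test as __dl_overflow(): admit new_bw only if the cpuset-wide
 * cap (bw * cpus) is not exceeded. */
static int dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
{
	return dl_b->bw != (u64)-1 &&
	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
	/* Assumed fixed point: 1 << 20 == one full CPU; 0.95 is the
	 * default SCHED_DEADLINE cap. */
	struct dl_bw dst = { .bw = (95 * (1 << 20)) / 100, .total_bw = 0 };
	u64 task_bw = (4 * (1 << 20)) / 10;	/* utilization 0.4 */
	int cpus = 2;				/* CPUs in the destination cpuset */
	int i;

	for (i = 1; i <= 6; i++) {
		if (dl_overflow(&dst, cpus, 0, task_bw)) {
			printf("task %d rejected (-EBUSY)\n", i);
		} else {
			dst.total_bw += task_bw;	/* what __dl_add() does */
			printf("task %d admitted, total_bw/cap = %llu/%llu\n",
			       i, (unsigned long long)dst.total_bw,
			       (unsigned long long)(dst.bw * cpus));
		}
	}
	return 0;
}

With these numbers, four 0.4-utilization tasks fit on the two-CPU cpuset (1.6 <= 1.9), and the fifth is rejected with -EBUSY.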
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c      70
-rw-r--r--  kernel/sched/deadline.c  25
-rw-r--r--  kernel/sched/sched.h     19
3 files changed, 93 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5c067fd66db9..9993feeb8b10 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2034,25 +2034,6 @@ static inline int dl_bw_cpus(int i)
 }
 #endif
 
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
-	dl_b->total_bw -= tsk_bw;
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
-	dl_b->total_bw += tsk_bw;
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
-	return dl_b->bw != -1 &&
-	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
 /*
  * We must be sure that accepting a new task (or allowing changing the
  * parameters of an existing one) is consistent with the bandwidth
@@ -4669,6 +4650,57 @@ void init_idle(struct task_struct *idle, int cpu)
 #endif
 }
 
+int task_can_attach(struct task_struct *p,
+		    const struct cpumask *cs_cpus_allowed)
+{
+	int ret = 0;
+
+	/*
+	 * Kthreads which disallow setaffinity shouldn't be moved
+	 * to a new cpuset; we don't want to change their cpu
+	 * affinity and isolating such threads by their set of
+	 * allowed nodes is unnecessary. Thus, cpusets are not
+	 * applicable for such threads. This prevents checking for
+	 * success of set_cpus_allowed_ptr() on all attached tasks
+	 * before cpus_allowed may be changed.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+#ifdef CONFIG_SMP
+	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+					      cs_cpus_allowed)) {
+		unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+							cs_cpus_allowed);
+		struct dl_bw *dl_b = dl_bw_of(dest_cpu);
+		bool overflow;
+		int cpus;
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		cpus = dl_bw_cpus(dest_cpu);
+		overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+		if (overflow)
+			ret = -EBUSY;
+		else {
+			/*
+			 * We reserve space for this task in the destination
+			 * root_domain, as we can't fail after this point.
+			 * We will free resources in the source root_domain
+			 * later on (see set_cpus_allowed_dl()).
+			 */
+			__dl_add(dl_b, p->dl.dl_bw);
+		}
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+	}
+#endif
+out:
+	return ret;
+}
+
 #ifdef CONFIG_SMP
 /*
  * move_queued_task - move a queued task to new rq.
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9d1e76a21297..8aaa971ffecd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1517,10 +1517,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 				const struct cpumask *new_mask)
 {
 	struct rq *rq;
+	struct root_domain *src_rd;
 	int weight;
 
 	BUG_ON(!dl_task(p));
 
+	rq = task_rq(p);
+	src_rd = rq->rd;
+	/*
+	 * Migrating a SCHED_DEADLINE task between exclusive
+	 * cpusets (different root_domains) entails a bandwidth
+	 * update. We already made space for us in the destination
+	 * domain (see cpuset_can_attach()).
+	 */
+	if (!cpumask_intersects(src_rd->span, new_mask)) {
+		struct dl_bw *src_dl_b;
+
+		src_dl_b = dl_bw_of(cpu_of(rq));
+		/*
+		 * We now free resources of the root_domain we are migrating
+		 * off. In the worst case, sched_setattr() may temporary fail
+		 * until we complete the update.
+		 */
+		raw_spin_lock(&src_dl_b->lock);
+		__dl_clear(src_dl_b, p->dl.dl_bw);
+		raw_spin_unlock(&src_dl_b->lock);
+	}
+
 	/*
 	 * Update only if the task is actually running (i.e.,
 	 * it is on the rq AND it is not throttled).
@@ -1537,8 +1560,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 	if ((p->nr_cpus_allowed > 1) == (weight > 1))
 		return;
 
-	rq = task_rq(p);
-
 	/*
 	 * The process used to be able to migrate OR it can now migrate
 	 */
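Together, task_can_attach() above and this hunk implement the two-halves update from the commit message: bandwidth is reserved in the destination root_domain while the attach can still fail, and released from the source root_domain only once the affinity actually changes. The user-space sketch below simulates that ordering under the same fixed-point assumption as before; the cpuset sizes and utilizations are made up, locking is omitted, and the overflow check is simplified to the old_bw == 0 case.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

struct dl_bw { u64 bw, total_bw; };	/* per-root_domain accounting */

#define BW_UNIT (1u << 20)		/* assumed fixed point: one full CPU */

static int dl_overflow(struct dl_bw *dl_b, int cpus, u64 new_bw)
{
	return dl_b->bw * cpus < dl_b->total_bw + new_bw;
}

int main(void)
{
	/* Two "exclusive cpusets", 2 CPUs each, 95% cap apiece; the
	 * migrating task uses 0.5 CPU and starts out charged to src. */
	struct dl_bw src = { .bw = BW_UNIT * 95 / 100, .total_bw = BW_UNIT / 2 };
	struct dl_bw dst = { .bw = BW_UNIT * 95 / 100, .total_bw = 0 };
	u64 task_bw = BW_UNIT / 2;
	int cpus = 2;

	/* Phase 1: cpuset_can_attach() -> task_can_attach():
	 * admission test plus reservation in the destination. */
	if (dl_overflow(&dst, cpus, task_bw)) {
		puts("attach rejected: -EBUSY");
		return 1;
	}
	dst.total_bw += task_bw;	/* what __dl_add() does on the destination */

	/* Window: the task is charged to both domains, so a sched_setattr()
	 * in the source may transiently see less free bandwidth than will
	 * be available once the migration completes. */
	printf("during migration: src=%llu dst=%llu\n",
	       (unsigned long long)src.total_bw,
	       (unsigned long long)dst.total_bw);

	/* Phase 2: set_cpus_allowed_dl() on the actual affinity change:
	 * release from the source root_domain. */
	src.total_bw -= task_bw;	/* what __dl_clear() does on the source */
	printf("after migration:  src=%llu dst=%llu\n",
	       (unsigned long long)src.total_bw,
	       (unsigned long long)dst.total_bw);
	return 0;
}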
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 57aacea1cbdf..ec3917c5f898 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -176,6 +176,25 @@ struct dl_bw {
 	u64 bw, total_bw;
 };
 
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+	dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+	dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+	return dl_b->bw != -1 &&
+	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
 extern struct mutex sched_domains_mutex;
 
 #ifdef CONFIG_CGROUP_SCHED