diff options
author | Juri Lelli <juri.lelli@arm.com> | 2014-09-19 05:22:40 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2014-10-28 05:47:58 -0400 |
commit | 7f51412a415d87ea8598d14722fb31e4f5701257 (patch) | |
tree | 1b3f90cb539185177143a1bf37e5f4f8d86b64bb | |
parent | d9aade7ae1d283097a3f626790e7c325a5c69007 (diff) |
sched/deadline: Fix bandwidth check/update when migrating tasks between exclusive cpusets
Exclusive cpusets are the only way users can restrict SCHED_DEADLINE tasks
affinity (performing what is commonly called clustered scheduling).
Unfortunately, this is currently broken for two reasons:
- No check is performed when the user tries to attach a task to
an exclusive cpuset (recall that exclusive cpusets have an
associated maximum allowed bandwidth).
- Bandwidths of source and destination cpusets are not correctly
updated after a task is migrated between them.
This patch fixes both things at once, as they are opposite faces
of the same coin.
The check is performed in cpuset_can_attach(), as there aren't any
points of failure after that function. The update is split in two
halves. We first reserve bandwidth in the destination cpuset, after
we pass the check in cpuset_can_attach(). And we then release
bandwidth from the source cpuset when the task's affinity is
actually changed. Even if there can be time windows when sched_setattr()
may erroneously fail in the source cpuset, we are fine with it, as
we can't perform an atomic update of both cpusets at once.
Reported-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Reported-by: Vincent Legout <vincent@legout.info>
Signed-off-by: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dario Faggioli <raistlin@linux.it>
Cc: Michael Trimarchi <michael@amarulasolutions.com>
Cc: Fabio Checconi <fchecconi@gmail.com>
Cc: michael@amarulasolutions.com
Cc: luca.abeni@unitn.it
Cc: Li Zefan <lizefan@huawei.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: cgroups@vger.kernel.org
Link: http://lkml.kernel.org/r/1411118561-26323-3-git-send-email-juri.lelli@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- | include/linux/sched.h | 2 | ||||
-rw-r--r-- | kernel/cpuset.c | 13 | ||||
-rw-r--r-- | kernel/sched/core.c | 70 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 25 | ||||
-rw-r--r-- | kernel/sched/sched.h | 19 |
5 files changed, 97 insertions, 32 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 5e344bbe63ec..1d1fa081d44f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -2052,6 +2052,8 @@ static inline void tsk_restore_flags(struct task_struct *task, | |||
2052 | task->flags |= orig_flags & flags; | 2052 | task->flags |= orig_flags & flags; |
2053 | } | 2053 | } |
2054 | 2054 | ||
2055 | extern int task_can_attach(struct task_struct *p, | ||
2056 | const struct cpumask *cs_cpus_allowed); | ||
2055 | #ifdef CONFIG_SMP | 2057 | #ifdef CONFIG_SMP |
2056 | extern void do_set_cpus_allowed(struct task_struct *p, | 2058 | extern void do_set_cpus_allowed(struct task_struct *p, |
2057 | const struct cpumask *new_mask); | 2059 | const struct cpumask *new_mask); |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1f107c74087b..7af8577fc8f8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1429,17 +1429,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, | |||
1429 | goto out_unlock; | 1429 | goto out_unlock; |
1430 | 1430 | ||
1431 | cgroup_taskset_for_each(task, tset) { | 1431 | cgroup_taskset_for_each(task, tset) { |
1432 | /* | 1432 | ret = task_can_attach(task, cs->cpus_allowed); |
1433 | * Kthreads which disallow setaffinity shouldn't be moved | 1433 | if (ret) |
1434 | * to a new cpuset; we don't want to change their cpu | ||
1435 | * affinity and isolating such threads by their set of | ||
1436 | * allowed nodes is unnecessary. Thus, cpusets are not | ||
1437 | * applicable for such threads. This prevents checking for | ||
1438 | * success of set_cpus_allowed_ptr() on all attached tasks | ||
1439 | * before cpus_allowed may be changed. | ||
1440 | */ | ||
1441 | ret = -EINVAL; | ||
1442 | if (task->flags & PF_NO_SETAFFINITY) | ||
1443 | goto out_unlock; | 1434 | goto out_unlock; |
1444 | ret = security_task_setscheduler(task); | 1435 | ret = security_task_setscheduler(task); |
1445 | if (ret) | 1436 | if (ret) |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c067fd66db9..9993feeb8b10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -2034,25 +2034,6 @@ static inline int dl_bw_cpus(int i) | |||
2034 | } | 2034 | } |
2035 | #endif | 2035 | #endif |
2036 | 2036 | ||
2037 | static inline | ||
2038 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
2039 | { | ||
2040 | dl_b->total_bw -= tsk_bw; | ||
2041 | } | ||
2042 | |||
2043 | static inline | ||
2044 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
2045 | { | ||
2046 | dl_b->total_bw += tsk_bw; | ||
2047 | } | ||
2048 | |||
2049 | static inline | ||
2050 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
2051 | { | ||
2052 | return dl_b->bw != -1 && | ||
2053 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
2054 | } | ||
2055 | |||
2056 | /* | 2037 | /* |
2057 | * We must be sure that accepting a new task (or allowing changing the | 2038 | * We must be sure that accepting a new task (or allowing changing the |
2058 | * parameters of an existing one) is consistent with the bandwidth | 2039 | * parameters of an existing one) is consistent with the bandwidth |
@@ -4669,6 +4650,57 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4669 | #endif | 4650 | #endif |
4670 | } | 4651 | } |
4671 | 4652 | ||
4653 | int task_can_attach(struct task_struct *p, | ||
4654 | const struct cpumask *cs_cpus_allowed) | ||
4655 | { | ||
4656 | int ret = 0; | ||
4657 | |||
4658 | /* | ||
4659 | * Kthreads which disallow setaffinity shouldn't be moved | ||
4660 | * to a new cpuset; we don't want to change their cpu | ||
4661 | * affinity and isolating such threads by their set of | ||
4662 | * allowed nodes is unnecessary. Thus, cpusets are not | ||
4663 | * applicable for such threads. This prevents checking for | ||
4664 | * success of set_cpus_allowed_ptr() on all attached tasks | ||
4665 | * before cpus_allowed may be changed. | ||
4666 | */ | ||
4667 | if (p->flags & PF_NO_SETAFFINITY) { | ||
4668 | ret = -EINVAL; | ||
4669 | goto out; | ||
4670 | } | ||
4671 | |||
4672 | #ifdef CONFIG_SMP | ||
4673 | if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, | ||
4674 | cs_cpus_allowed)) { | ||
4675 | unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, | ||
4676 | cs_cpus_allowed); | ||
4677 | struct dl_bw *dl_b = dl_bw_of(dest_cpu); | ||
4678 | bool overflow; | ||
4679 | int cpus; | ||
4680 | unsigned long flags; | ||
4681 | |||
4682 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
4683 | cpus = dl_bw_cpus(dest_cpu); | ||
4684 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); | ||
4685 | if (overflow) | ||
4686 | ret = -EBUSY; | ||
4687 | else { | ||
4688 | /* | ||
4689 | * We reserve space for this task in the destination | ||
4690 | * root_domain, as we can't fail after this point. | ||
4691 | * We will free resources in the source root_domain | ||
4692 | * later on (see set_cpus_allowed_dl()). | ||
4693 | */ | ||
4694 | __dl_add(dl_b, p->dl.dl_bw); | ||
4695 | } | ||
4696 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
4697 | |||
4698 | } | ||
4699 | #endif | ||
4700 | out: | ||
4701 | return ret; | ||
4702 | } | ||
4703 | |||
4672 | #ifdef CONFIG_SMP | 4704 | #ifdef CONFIG_SMP |
4673 | /* | 4705 | /* |
4674 | * move_queued_task - move a queued task to new rq. | 4706 | * move_queued_task - move a queued task to new rq. |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9d1e76a21297..8aaa971ffecd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -1517,10 +1517,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
1517 | const struct cpumask *new_mask) | 1517 | const struct cpumask *new_mask) |
1518 | { | 1518 | { |
1519 | struct rq *rq; | 1519 | struct rq *rq; |
1520 | struct root_domain *src_rd; | ||
1520 | int weight; | 1521 | int weight; |
1521 | 1522 | ||
1522 | BUG_ON(!dl_task(p)); | 1523 | BUG_ON(!dl_task(p)); |
1523 | 1524 | ||
1525 | rq = task_rq(p); | ||
1526 | src_rd = rq->rd; | ||
1527 | /* | ||
1528 | * Migrating a SCHED_DEADLINE task between exclusive | ||
1529 | * cpusets (different root_domains) entails a bandwidth | ||
1530 | * update. We already made space for us in the destination | ||
1531 | * domain (see cpuset_can_attach()). | ||
1532 | */ | ||
1533 | if (!cpumask_intersects(src_rd->span, new_mask)) { | ||
1534 | struct dl_bw *src_dl_b; | ||
1535 | |||
1536 | src_dl_b = dl_bw_of(cpu_of(rq)); | ||
1537 | /* | ||
1538 | * We now free resources of the root_domain we are migrating | ||
1539 | * off. In the worst case, sched_setattr() may temporary fail | ||
1540 | * until we complete the update. | ||
1541 | */ | ||
1542 | raw_spin_lock(&src_dl_b->lock); | ||
1543 | __dl_clear(src_dl_b, p->dl.dl_bw); | ||
1544 | raw_spin_unlock(&src_dl_b->lock); | ||
1545 | } | ||
1546 | |||
1524 | /* | 1547 | /* |
1525 | * Update only if the task is actually running (i.e., | 1548 | * Update only if the task is actually running (i.e., |
1526 | * it is on the rq AND it is not throttled). | 1549 | * it is on the rq AND it is not throttled). |
@@ -1537,8 +1560,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
1537 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) | 1560 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
1538 | return; | 1561 | return; |
1539 | 1562 | ||
1540 | rq = task_rq(p); | ||
1541 | |||
1542 | /* | 1563 | /* |
1543 | * The process used to be able to migrate OR it can now migrate | 1564 | * The process used to be able to migrate OR it can now migrate |
1544 | */ | 1565 | */ |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 57aacea1cbdf..ec3917c5f898 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -176,6 +176,25 @@ struct dl_bw { | |||
176 | u64 bw, total_bw; | 176 | u64 bw, total_bw; |
177 | }; | 177 | }; |
178 | 178 | ||
179 | static inline | ||
180 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
181 | { | ||
182 | dl_b->total_bw -= tsk_bw; | ||
183 | } | ||
184 | |||
185 | static inline | ||
186 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
187 | { | ||
188 | dl_b->total_bw += tsk_bw; | ||
189 | } | ||
190 | |||
191 | static inline | ||
192 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
193 | { | ||
194 | return dl_b->bw != -1 && | ||
195 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
196 | } | ||
197 | |||
179 | extern struct mutex sched_domains_mutex; | 198 | extern struct mutex sched_domains_mutex; |
180 | 199 | ||
181 | #ifdef CONFIG_CGROUP_SCHED | 200 | #ifdef CONFIG_CGROUP_SCHED |