author    Juri Lelli <juri.lelli@gmail.com>    2013-11-07 08:43:47 -0500
committer Ingo Molnar <mingo@kernel.org>       2014-01-13 07:46:46 -0500
commit    6bfd6d72f51c51177676f2b1ba113fe0a85fdae4
tree      8c3c4c49f18ba3218da4274623b50da0a317f2d6
parent    332ac17ef5bfcff4766dfdfd3b4cdf10b8f8f155
sched/deadline: speed up SCHED_DEADLINE pushes with a push-heap
Data from tests confirmed that the original active load balancing logic
scaled with neither the number of CPUs nor the number of tasks (as
sched_rt does).

Here we provide a global data structure that keeps track of the deadlines
of the running tasks in the system. The structure is composed of a bitmask
showing the free CPUs and a max-heap, needed when the system is heavily
loaded.

The implementation and concurrent access scheme are kept simple by design.
Even so, our measurements show that we can compete with sched_rt on large
multi-CPU machines [1].

Only the push path is addressed here; extending this structure to pull
decisions as well is straightforward. However, we are currently evaluating
different data structures (in order to decrease/avoid contention) that
could solve both problems. We are also going to re-run the tests taking
into account the recent changes inside cpupri [2].

 [1] http://retis.sssup.it/~jlelli/papers/Ospert11Lelli.pdf
 [2] http://www.spinics.net/lists/linux-rt-users/msg06778.html

Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-14-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
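The two-level scheme described above can be sketched in a few lines of
plain C. This is a minimal userspace illustration, not the kernel code
added by the patch: push_set()/push_find(), the fixed NCPUS array size and
the raw unsigned-long bitmask are stand-ins for cpudl_set()/cpudl_find(),
the cpumask API and the per-root-domain spinlock, and only insertion or a
deadline increase is handled.

/*
 * Minimal userspace sketch of the push-heap idea -- NOT the kernel code.
 * A max-heap keyed by each CPU's earliest deadline, plus a bitmask of
 * CPUs that currently run no -deadline task.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NCPUS		4
#define IDX_INVALID	-1

struct item { uint64_t dl; int cpu; };

static struct item heap[NCPUS];		/* max-heap, ordered by deadline */
static int heap_size;
static int cpu_to_idx[NCPUS];		/* CPU -> position in the heap */
static unsigned long free_cpus;		/* bit i set: CPU i runs no -dl task */

static int parent(int i) { return (i - 1) >> 1; }

static void heap_swap(int a, int b)
{
	struct item t = heap[a];

	heap[a] = heap[b];
	heap[b] = t;
	cpu_to_idx[heap[a].cpu] = a;
	cpu_to_idx[heap[b].cpu] = b;
}

/* Record that @cpu now runs a -dl task whose earliest deadline is @dl. */
static void push_set(int cpu, uint64_t dl)
{
	int i = cpu_to_idx[cpu];

	if (i == IDX_INVALID) {
		i = heap_size++;
		heap[i].cpu = cpu;
		cpu_to_idx[cpu] = i;
		free_cpus &= ~(1UL << cpu);
	}
	heap[i].dl = dl;
	while (i > 0 && heap[parent(i)].dl < heap[i].dl) {
		heap_swap(i, parent(i));
		i = parent(i);
	}
}

/* Where can a task with deadline @dl be pushed? Any free CPU first, else
 * the CPU running the latest deadline, if that deadline is later than @dl. */
static int push_find(uint64_t dl)
{
	if (free_cpus)
		return __builtin_ctzl(free_cpus);
	if (heap_size && dl < heap[0].dl)
		return heap[0].cpu;
	return -1;
}

int main(void)
{
	memset(cpu_to_idx, IDX_INVALID, sizeof(cpu_to_idx));
	free_cpus = (1UL << NCPUS) - 1;

	push_set(0, 4000);
	push_set(1, 1500);
	push_set(2, 1000);
	printf("dl 2000 -> CPU %d (CPU 3 still free)\n", push_find(2000));
	push_set(3, 3000);
	printf("dl 2000 -> CPU %d (heap maximum, dl 4000)\n", push_find(2000));
	printf("dl 5000 -> CPU %d (no later deadline anywhere)\n", push_find(5000));
	return 0;
}

As long as some CPU runs no -deadline task the bitmask answers in O(1);
only under full -dl load do the O(log n) heap updates matter, and the heap
maximum replaces the per-CPU scan that latest_cpu_find() used to perform
on every push decision.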
-rw-r--r--  kernel/sched/Makefile        2
-rw-r--r--  kernel/sched/core.c          3
-rw-r--r--  kernel/sched/cpudeadline.c   216
-rw-r--r--  kernel/sched/cpudeadline.h   33
-rw-r--r--  kernel/sched/deadline.c      53
-rw-r--r--  kernel/sched/sched.h         2
6 files changed, 269 insertions(+), 40 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index b039035a9376..9a95c8c2af2a 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -14,7 +14,7 @@ endif
 obj-y += core.o proc.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
 obj-y += wait.o completion.o
-obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c7c68e6b5c51..e30356d6b31f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5287,6 +5287,7 @@ static void free_rootdomain(struct rcu_head *rcu)
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
 
 	cpupri_cleanup(&rd->cpupri);
+	cpudl_cleanup(&rd->cpudl);
 	free_cpumask_var(rd->dlo_mask);
 	free_cpumask_var(rd->rto_mask);
 	free_cpumask_var(rd->online);
@@ -5345,6 +5346,8 @@ static int init_rootdomain(struct root_domain *rd)
 		goto free_dlo_mask;
 
 	init_dl_bw(&rd->dl_bw);
+	if (cpudl_init(&rd->cpudl) != 0)
+		goto free_dlo_mask;
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_rto_mask;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 000000000000..3bcade554343
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,216 @@
+/*
+ * kernel/sched/cpudl.c
+ *
+ * Global CPU deadline management
+ *
+ * Author: Juri Lelli <j.lelli@sssup.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include "cpudeadline.h"
+
+static inline int parent(int i)
+{
+	return (i - 1) >> 1;
+}
+
+static inline int left_child(int i)
+{
+	return (i << 1) + 1;
+}
+
+static inline int right_child(int i)
+{
+	return (i << 1) + 2;
+}
+
+static inline int dl_time_before(u64 a, u64 b)
+{
+	return (s64)(a - b) < 0;
+}
+
+void cpudl_exchange(struct cpudl *cp, int a, int b)
+{
+	int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
+
+	swap(cp->elements[a], cp->elements[b]);
+	swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
+}
+
+void cpudl_heapify(struct cpudl *cp, int idx)
+{
+	int l, r, largest;
+
+	/* adapted from lib/prio_heap.c */
+	while(1) {
+		l = left_child(idx);
+		r = right_child(idx);
+		largest = idx;
+
+		if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
+							cp->elements[l].dl))
+			largest = l;
+		if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
+							cp->elements[r].dl))
+			largest = r;
+		if (largest == idx)
+			break;
+
+		/* Push idx down the heap one level and bump one up */
+		cpudl_exchange(cp, largest, idx);
+		idx = largest;
+	}
+}
+
+void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
+{
+	WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID);
+
+	if (dl_time_before(new_dl, cp->elements[idx].dl)) {
+		cp->elements[idx].dl = new_dl;
+		cpudl_heapify(cp, idx);
+	} else {
+		cp->elements[idx].dl = new_dl;
+		while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+					cp->elements[idx].dl)) {
+			cpudl_exchange(cp, idx, parent(idx));
+			idx = parent(idx);
+		}
+	}
+}
+
+static inline int cpudl_maximum(struct cpudl *cp)
+{
+	return cp->elements[0].cpu;
+}
+
+/*
+ * cpudl_find - find the best (later-dl) CPU in the system
+ * @cp: the cpudl max-heap context
+ * @p: the task
+ * @later_mask: a mask to fill in with the selected CPUs (or NULL)
+ *
+ * Returns: int - best CPU (heap maximum if suitable)
+ */
+int cpudl_find(struct cpudl *cp, struct task_struct *p,
+	       struct cpumask *later_mask)
+{
+	int best_cpu = -1;
+	const struct sched_dl_entity *dl_se = &p->dl;
+
+	if (later_mask && cpumask_and(later_mask, cp->free_cpus,
+			&p->cpus_allowed) && cpumask_and(later_mask,
+			later_mask, cpu_active_mask)) {
+		best_cpu = cpumask_any(later_mask);
+		goto out;
+	} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
+			dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+		best_cpu = cpudl_maximum(cp);
+		if (later_mask)
+			cpumask_set_cpu(best_cpu, later_mask);
+	}
+
+out:
+	WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1);
+
+	return best_cpu;
+}
+
+/*
+ * cpudl_set - update the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target cpu
+ * @dl: the new earliest deadline for this cpu
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+{
+	int old_idx, new_cpu;
+	unsigned long flags;
+
+	WARN_ON(cpu > num_present_cpus());
+
+	raw_spin_lock_irqsave(&cp->lock, flags);
+	old_idx = cp->cpu_to_idx[cpu];
+	if (!is_valid) {
+		/* remove item */
+		if (old_idx == IDX_INVALID) {
+			/*
+			 * Nothing to remove if old_idx was invalid.
+			 * This could happen if a rq_offline_dl is
+			 * called for a CPU without -dl tasks running.
+			 */
+			goto out;
+		}
+		new_cpu = cp->elements[cp->size - 1].cpu;
+		cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
+		cp->elements[old_idx].cpu = new_cpu;
+		cp->size--;
+		cp->cpu_to_idx[new_cpu] = old_idx;
+		cp->cpu_to_idx[cpu] = IDX_INVALID;
+		while (old_idx > 0 && dl_time_before(
+				cp->elements[parent(old_idx)].dl,
+				cp->elements[old_idx].dl)) {
+			cpudl_exchange(cp, old_idx, parent(old_idx));
+			old_idx = parent(old_idx);
+		}
+		cpumask_set_cpu(cpu, cp->free_cpus);
+		cpudl_heapify(cp, old_idx);
+
+		goto out;
+	}
+
+	if (old_idx == IDX_INVALID) {
+		cp->size++;
+		cp->elements[cp->size - 1].dl = 0;
+		cp->elements[cp->size - 1].cpu = cpu;
+		cp->cpu_to_idx[cpu] = cp->size - 1;
+		cpudl_change_key(cp, cp->size - 1, dl);
+		cpumask_clear_cpu(cpu, cp->free_cpus);
+	} else {
+		cpudl_change_key(cp, old_idx, dl);
+	}
+
+out:
+	raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
+/*
+ * cpudl_init - initialize the cpudl structure
+ * @cp: the cpudl max-heap context
+ */
+int cpudl_init(struct cpudl *cp)
+{
+	int i;
+
+	memset(cp, 0, sizeof(*cp));
+	raw_spin_lock_init(&cp->lock);
+	cp->size = 0;
+	for (i = 0; i < NR_CPUS; i++)
+		cp->cpu_to_idx[i] = IDX_INVALID;
+	if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_setall(cp->free_cpus);
+
+	return 0;
+}
+
+/*
+ * cpudl_cleanup - clean up the cpudl structure
+ * @cp: the cpudl max-heap context
+ */
+void cpudl_cleanup(struct cpudl *cp)
+{
+	/*
+	 * nothing to do for the moment
+	 */
+}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 000000000000..a202789a412c
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_CPUDL_H
+#define _LINUX_CPUDL_H
+
+#include <linux/sched.h>
+
+#define IDX_INVALID -1
+
+struct array_item {
+	u64 dl;
+	int cpu;
+};
+
+struct cpudl {
+	raw_spinlock_t lock;
+	int size;
+	int cpu_to_idx[NR_CPUS];
+	struct array_item elements[NR_CPUS];
+	cpumask_var_t free_cpus;
+};
+
+
+#ifdef CONFIG_SMP
+int cpudl_find(struct cpudl *cp, struct task_struct *p,
+	       struct cpumask *later_mask);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
+int cpudl_init(struct cpudl *cp);
+void cpudl_cleanup(struct cpudl *cp);
+#else
+#define cpudl_set(cp, cpu, dl) do { } while (0)
+#define cpudl_init() do { } while (0)
+#endif /* CONFIG_SMP */
+
+#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 802188fb6338..0c6b1d089cd4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,8 @@
  */
 #include "sched.h"
 
+#include <linux/slab.h>
+
 struct dl_bandwidth def_dl_bandwidth;
 
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -640,6 +642,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 	 */
 		dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
 		dl_rq->earliest_dl.curr = deadline;
+		cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
 	} else if (dl_rq->earliest_dl.next == 0 ||
 		   dl_time_before(deadline, dl_rq->earliest_dl.next)) {
 		/*
@@ -663,6 +666,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 	if (!dl_rq->dl_nr_running) {
 		dl_rq->earliest_dl.curr = 0;
 		dl_rq->earliest_dl.next = 0;
+		cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
 	} else {
 		struct rb_node *leftmost = dl_rq->rb_leftmost;
 		struct sched_dl_entity *entry;
@@ -670,6 +674,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 		entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
 		dl_rq->earliest_dl.curr = entry->deadline;
 		dl_rq->earliest_dl.next = next_deadline(rq);
+		cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
 	}
 }
 
@@ -855,9 +860,6 @@ static void yield_task_dl(struct rq *rq)
 #ifdef CONFIG_SMP
 
 static int find_later_rq(struct task_struct *task);
-static int latest_cpu_find(struct cpumask *span,
-			   struct task_struct *task,
-			   struct cpumask *later_mask);
 
 static int
 select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
@@ -904,7 +906,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	 * let's hope p can move out.
 	 */
 	if (rq->curr->nr_cpus_allowed == 1 ||
-	    latest_cpu_find(rq->rd->span, rq->curr, NULL) == -1)
+	    cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
 		return;
 
 	/*
@@ -912,7 +914,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	 * see if it is pushed or pulled somewhere else.
 	 */
 	if (p->nr_cpus_allowed != 1 &&
-	    latest_cpu_find(rq->rd->span, p, NULL) != -1)
+	    cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
 		return;
 
 	resched_task(rq->curr);
@@ -1085,39 +1087,6 @@ next_node:
 	return NULL;
 }
 
-static int latest_cpu_find(struct cpumask *span,
-			   struct task_struct *task,
-			   struct cpumask *later_mask)
-{
-	const struct sched_dl_entity *dl_se = &task->dl;
-	int cpu, found = -1, best = 0;
-	u64 max_dl = 0;
-
-	for_each_cpu(cpu, span) {
-		struct rq *rq = cpu_rq(cpu);
-		struct dl_rq *dl_rq = &rq->dl;
-
-		if (cpumask_test_cpu(cpu, &task->cpus_allowed) &&
-		    (!dl_rq->dl_nr_running || dl_time_before(dl_se->deadline,
-		     dl_rq->earliest_dl.curr))) {
-			if (later_mask)
-				cpumask_set_cpu(cpu, later_mask);
-			if (!best && !dl_rq->dl_nr_running) {
-				best = 1;
-				found = cpu;
-			} else if (!best &&
-				   dl_time_before(max_dl,
-						  dl_rq->earliest_dl.curr)) {
-				max_dl = dl_rq->earliest_dl.curr;
-				found = cpu;
-			}
-		} else if (later_mask)
-			cpumask_clear_cpu(cpu, later_mask);
-	}
-
-	return found;
-}
-
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
 
 static int find_later_rq(struct task_struct *task)
@@ -1134,7 +1103,8 @@ static int find_later_rq(struct task_struct *task)
 	if (task->nr_cpus_allowed == 1)
 		return -1;
 
-	best_cpu = latest_cpu_find(task_rq(task)->rd->span, task, later_mask);
+	best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
+			task, later_mask);
 	if (best_cpu == -1)
 		return -1;
 
@@ -1510,6 +1480,9 @@ static void rq_online_dl(struct rq *rq)
 {
 	if (rq->dl.overloaded)
 		dl_set_overload(rq);
+
+	if (rq->dl.dl_nr_running > 0)
+		cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
 }
 
 /* Assumes rq->lock is held */
@@ -1517,6 +1490,8 @@ static void rq_offline_dl(struct rq *rq)
 {
 	if (rq->dl.overloaded)
 		dl_clear_overload(rq);
+
+	cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
 }
 
 void init_sched_dl_class(void)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ad4f4fbd002e..2b7421db6c41 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 
 #include "cpupri.h"
+#include "cpudeadline.h"
 #include "cpuacct.h"
 
 struct rq;
@@ -503,6 +504,7 @@ struct root_domain {
 	cpumask_var_t dlo_mask;
 	atomic_t dlo_count;
 	struct dl_bw dl_bw;
+	struct cpudl cpudl;
 
 	/*
 	 * The "RT overload" flag: it gets set if a CPU has more than