author    Gregory Haskins <ghaskins@novell.com>    2008-05-12 15:21:01 -0400
committer Ingo Molnar <mingo@elte.hu>              2008-06-06 09:19:28 -0400
commit    6e0534f278199f1e3dd1049b9bc19a7a5b87ada1
tree      25f4da14ec32927742db9f599ac779b4e83d1763 /kernel
parent    f333fdc9098b71e2687e4e9b6349fcb352960d66
sched: use a 2-d bitmap for searching lowest-pri CPU
The current code uses a linear algorithm which causes scaling issues on
larger SMP machines. This patch replaces that algorithm with a
2-dimensional bitmap to reduce latencies in the wake-up path.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
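To make the new scheme easier to follow before reading the diff, here is a
minimal userspace sketch of the two-level lookup. It is an illustration only,
not the kernel code: it assumes at most 64 CPUs so a CPU mask fits in one
uint64_t, it walks every level with a plain loop where the kernel walks only
the set bits via find_first_bit()/find_next_bit(), and it omits the per-vector
locking that the real cpupri_set() takes.

#include <stdint.h>

#define NR_LEVELS 102	/* IDLE, NORMAL, RT0 .. RT99 */

struct cpupri_sketch {
	uint64_t level_active[(NR_LEVELS + 63) / 64];	/* 1st dimension: which levels are populated   */
	uint64_t cpus_at_level[NR_LEVELS];		/* 2nd dimension: which CPUs sit at each level */
};

/*
 * Return a mask of CPUs that run at a lower level than 'task_level' and
 * that the task is also allowed on, or 0 if there is none.
 */
uint64_t lowest_cpus_sketch(const struct cpupri_sketch *cp,
			    int task_level, uint64_t allowed)
{
	int level;

	for (level = 0; level < task_level; level++) {
		uint64_t word = cp->level_active[level / 64];

		if (!(word & (1ULL << (level % 64))))
			continue;	/* no CPU currently at this level */

		if (cp->cpus_at_level[level] & allowed)
			return cp->cpus_at_level[level] & allowed;
	}

	return 0;
}

Without affinity restrictions the first populated level below the task's own
level is a hit, which is where the "two bit searches" O(1) behaviour described
in the new file's header comment comes from; with restrictions the walk can in
principle visit all 102 levels, matching the stated O(min(102, nr_domcpus))
worst case.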
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile       |   1
-rw-r--r--  kernel/sched.c        |   7
-rw-r--r--  kernel/sched_cpupri.c | 174
-rw-r--r--  kernel/sched_cpupri.h |  36
-rw-r--r--  kernel/sched_rt.c     |  98
5 files changed, 239 insertions(+), 77 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..ecdd2d335639 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_SMP) += sched_cpupri.o

 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/sched.c b/kernel/sched.c
index aa960b84b881..8a1257b65560 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -74,6 +74,8 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>

+#include "sched_cpupri.h"
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -450,6 +452,9 @@ struct root_domain {
 	 */
 	cpumask_t rto_mask;
 	atomic_t rto_count;
+#ifdef CONFIG_SMP
+	struct cpupri cpupri;
+#endif
 };

 /*
@@ -6392,6 +6397,8 @@ static void init_rootdomain(struct root_domain *rd)

 	cpus_clear(rd->span);
 	cpus_clear(rd->online);
+
+	cpupri_init(&rd->cpupri);
 }

 static void init_defrootdomain(void)
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
+/*
+ * kernel/sched_cpupri.c
+ *
+ * CPU priority management
+ *
+ * Copyright (C) 2007-2008 Novell
+ *
+ * Author: Gregory Haskins <ghaskins@novell.com>
+ *
+ * This code tracks the priority of each CPU so that global migration
+ * decisions are easy to calculate. Each CPU can be in a state as follows:
+ *
+ *             (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ * going from the lowest priority to the highest. CPUs in the INVALID state
+ * are not eligible for routing. The system maintains this state with
+ * a 2 dimensional bitmap (the first for priority class, the second for cpus
+ * in that class). Therefore a typical application without affinity
+ * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ * searches). For tasks with affinity restrictions, the algorithm has a
+ * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * yields the worst case search is fairly contrived.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include "sched_cpupri.h"
+
+/* Convert between a 140 based task->prio, and our 102 based cpupri */
+static int convert_prio(int prio)
+{
+	int cpupri;
+
+	if (prio == CPUPRI_INVALID)
+		cpupri = CPUPRI_INVALID;
+	else if (prio == MAX_PRIO)
+		cpupri = CPUPRI_IDLE;
+	else if (prio >= MAX_RT_PRIO)
+		cpupri = CPUPRI_NORMAL;
+	else
+		cpupri = MAX_RT_PRIO - prio + 1;
+
+	return cpupri;
+}
+
+#define for_each_cpupri_active(array, idx)                   \
+	for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
+	     idx < CPUPRI_NR_PRIORITIES;                         \
+	     idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
+
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invocation. By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times. While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Returns: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+		cpumask_t *lowest_mask)
+{
+	int idx = 0;
+	int task_pri = convert_prio(p->prio);
+
+	for_each_cpupri_active(cp->pri_active, idx) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
+		cpumask_t mask;
+
+		if (idx >= task_pri)
+			break;
+
+		cpus_and(mask, p->cpus_allowed, vec->mask);
+
+		if (cpus_empty(mask))
+			continue;
+
+		*lowest_mask = mask;
+		return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cp: The cpupri context
+ * @cpu: The target cpu
+ * @pri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+	int *currpri = &cp->cpu_to_pri[cpu];
+	int oldpri = *currpri;
+	unsigned long flags;
+
+	newpri = convert_prio(newpri);
+
+	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
+
+	if (newpri == oldpri)
+		return;
+
+	/*
+	 * If the cpu was currently mapped to a different value, we
+	 * first need to unmap the old value
+	 */
+	if (likely(oldpri != CPUPRI_INVALID)) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
+
+		spin_lock_irqsave(&vec->lock, flags);
+
+		vec->count--;
+		if (!vec->count)
+			clear_bit(oldpri, cp->pri_active);
+		cpu_clear(cpu, vec->mask);
+
+		spin_unlock_irqrestore(&vec->lock, flags);
+	}
+
+	if (likely(newpri != CPUPRI_INVALID)) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+
+		spin_lock_irqsave(&vec->lock, flags);
+
+		cpu_set(cpu, vec->mask);
+		vec->count++;
+		if (vec->count == 1)
+			set_bit(newpri, cp->pri_active);
+
+		spin_unlock_irqrestore(&vec->lock, flags);
+	}
+
+	*currpri = newpri;
+}
+
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Returns: (void)
+ */
+void cpupri_init(struct cpupri *cp)
+{
+	int i;
+
+	memset(cp, 0, sizeof(*cp));
+
+	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+		spin_lock_init(&vec->lock);
+		vec->count = 0;
+		cpus_clear(vec->mask);
+	}
+
+	for_each_possible_cpu(i)
+		cp->cpu_to_pri[i] = CPUPRI_INVALID;
+}
+
+
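For reference, the priority mapping that convert_prio() performs above works
out as follows. This is a small standalone restatement for illustration only;
it is not part of the patch and assumes MAX_RT_PRIO == 100 and MAX_PRIO == 140,
their values in this tree.

#include <assert.h>

#define MAX_RT_PRIO 100		/* assumption: value in this tree */
#define MAX_PRIO    140		/* assumption: MAX_RT_PRIO + 40 nice levels */

/* Same mapping as convert_prio(): 140-based task->prio -> 102-based cpupri */
static int convert_prio_sketch(int prio)
{
	if (prio == -1)
		return -1;			/* CPUPRI_INVALID */
	if (prio == MAX_PRIO)
		return 0;			/* CPUPRI_IDLE    */
	if (prio >= MAX_RT_PRIO)
		return 1;			/* CPUPRI_NORMAL  */
	return MAX_RT_PRIO - prio + 1;		/* RT tasks       */
}

int main(void)
{
	assert(convert_prio_sketch(140) == 0);		/* idle CPU           */
	assert(convert_prio_sketch(120) == 1);		/* nice-0 SCHED_OTHER */
	assert(convert_prio_sketch(99)  == 2);		/* lowest RT prio     */
	assert(convert_prio_sketch(0)   == 101);	/* highest RT prio    */
	return 0;
}

Note the inversion: in task->prio terms a smaller number means higher priority,
while in cpupri terms a larger index means higher priority, which is why
cpupri_find() can return as soon as it reaches the first populated index below
task_pri.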
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..0b6a3d110fac
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+
+#include <linux/sched.h>
+
+#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO
+#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG
+
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE     0
+#define CPUPRI_NORMAL   1
+/* values 2-101 are RT priorities 0-99 */
+
+struct cpupri_vec {
+	spinlock_t lock;
+	int        count;
+	cpumask_t  mask;
+};
+
+struct cpupri {
+	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+	long              pri_active[CPUPRI_NR_PRI_WORDS];
+	int               cpu_to_pri[NR_CPUS];
+};
+
+#ifdef CONFIG_SMP
+int  cpupri_find(struct cpupri *cp,
+		 struct task_struct *p, cpumask_t *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+void cpupri_init(struct cpupri *cp);
+#else
+#define cpupri_set(cp, cpu, pri) do { } while (0)
+#define cpupri_init() do { } while (0)
+#endif
+
+#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fefed39fafd8..44b06d75416e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -391,8 +391,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (rt_se_prio(rt_se) < rt_rq->highest_prio)
+	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
 		rt_rq->highest_prio = rt_se_prio(rt_se);
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se));
+	}
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1) {
@@ -416,6 +419,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
+#ifdef CONFIG_SMP
+	int highest_prio = rt_rq->highest_prio;
+#endif
+
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running--;
@@ -439,6 +446,11 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 		rq->rt.rt_nr_migratory--;
 	}

+	if (rt_rq->highest_prio != highest_prio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
+	}
+
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -763,73 +775,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)

 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);

-static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
-{
-	int lowest_prio = -1;
-	int lowest_cpu  = -1;
-	int count       = 0;
-	int cpu;
-
-	cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
-
-	/*
-	 * Scan each rq for the lowest prio.
-	 */
-	for_each_cpu_mask(cpu, *lowest_mask) {
-		struct rq *rq = cpu_rq(cpu);
-
-		/* We look for lowest RT prio or non-rt CPU */
-		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-			/*
-			 * if we already found a low RT queue
-			 * and now we found this non-rt queue
-			 * clear the mask and set our bit.
-			 * Otherwise just return the queue as is
-			 * and the count==1 will cause the algorithm
-			 * to use the first bit found.
-			 */
-			if (lowest_cpu != -1) {
-				cpus_clear(*lowest_mask);
-				cpu_set(rq->cpu, *lowest_mask);
-			}
-			return 1;
-		}
-
-		/* no locking for now */
-		if ((rq->rt.highest_prio > task->prio)
-		    && (rq->rt.highest_prio >= lowest_prio)) {
-			if (rq->rt.highest_prio > lowest_prio) {
-				/* new low - clear old data */
-				lowest_prio = rq->rt.highest_prio;
-				lowest_cpu = cpu;
-				count = 0;
-			}
-			count++;
-		} else
-			cpu_clear(cpu, *lowest_mask);
-	}
-
-	/*
-	 * Clear out all the set bits that represent
-	 * runqueues that were of higher prio than
-	 * the lowest_prio.
-	 */
-	if (lowest_cpu > 0) {
-		/*
-		 * Perhaps we could add another cpumask op to
-		 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
-		 * Then that could be optimized to use memset and such.
-		 */
-		for_each_cpu_mask(cpu, *lowest_mask) {
-			if (cpu >= lowest_cpu)
-				break;
-			cpu_clear(cpu, *lowest_mask);
-		}
-	}
-
-	return count;
-}
-
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 {
 	int first;
@@ -851,17 +796,12 @@ static int find_lowest_rq(struct task_struct *task)
 	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
-	int count    = find_lowest_cpus(task, lowest_mask);

-	if (!count)
-		return -1; /* No targets found */
+	if (task->rt.nr_cpus_allowed == 1)
+		return -1; /* No other targets possible */

-	/*
-	 * There is no sense in performing an optimal search if only one
-	 * target is found.
-	 */
-	if (count == 1)
-		return first_cpu(*lowest_mask);
+	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+		return -1; /* No targets found */

 	/*
 	 * At this point we have built a mask of cpus representing the
@@ -1218,6 +1158,8 @@ static void join_domain_rt(struct rq *rq)
 {
 	if (rq->rt.overloaded)
 		rt_set_overload(rq);
+
+	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
 }

 /* Assumes rq->lock is held */
@@ -1225,6 +1167,8 @@ static void leave_domain_rt(struct rq *rq)
 {
 	if (rq->rt.overloaded)
 		rt_clear_overload(rq);
+
+	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
 }

 /*
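As a companion to the lookup sketch after the commit message, here is the
corresponding update step in the same simplified userspace model, mirroring
what cpupri_set() does when a runqueue's highest priority changes. Again an
illustration only: at most 64 CPUs, no per-vector locking, and the active bit
is set unconditionally rather than only on the 0 -> 1 count transition.

#include <stdint.h>

#define NR_LEVELS      102
#define NR_CPUS_SKETCH 64

struct cpupri_sketch {
	uint64_t level_active[(NR_LEVELS + 63) / 64];
	uint64_t cpus_at_level[NR_LEVELS];
	int	 cpu_level[NR_CPUS_SKETCH];	/* -1 means INVALID */
};

void set_cpu_level_sketch(struct cpupri_sketch *cp, int cpu, int newlevel)
{
	int oldlevel = cp->cpu_level[cpu];

	if (newlevel == oldlevel)
		return;

	/* Unmap the CPU from its old level first ... */
	if (oldlevel != -1) {
		cp->cpus_at_level[oldlevel] &= ~(1ULL << cpu);
		if (!cp->cpus_at_level[oldlevel])
			cp->level_active[oldlevel / 64] &=
				~(1ULL << (oldlevel % 64));
	}

	/* ... then map it into the new one. */
	if (newlevel != -1) {
		cp->cpus_at_level[newlevel] |= 1ULL << cpu;
		cp->level_active[newlevel / 64] |= 1ULL << (newlevel % 64);
	}

	cp->cpu_level[cpu] = newlevel;
}

Because the unmap happens before the map, a concurrent lookup can at worst
miss the CPU for a moment rather than see it at two levels at once, which is
consistent with the benign race described in the cpupri_find() comment above.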