1 files changed, 63 insertions, 26 deletions
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1b4138..a86cf9d9eb11 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
        return cpupri;
 }
-#define for_each_cpupri_active(array, idx)                    \
-        for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
 /**
 * cpupri_find - find the best (lowest-pri) CPU in the system
 * @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
        int                  idx      = 0;
        int                  task_pri = convert_prio(p->prio);
-        for_each_cpupri_active(cp->pri_active, idx) {
+        if (task_pri >= MAX_RT_PRIO)
-                struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
+                return 0;
-                if (idx >= task_pri)
+        for (idx = 0; idx < task_pri; idx++) {
-                        break;
+                struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
+                int skip = 0;
+                if (!atomic_read(&(vec)->count))
+                        skip = 1;
+                /*
+                 * When looking at the vector, we need to read the counter,
+                 * do a memory barrier, then read the mask.
+                 *
+                 * Note: This is still all racy, but we can deal with it.
+                 *  Ideally, we only want to look at masks that are set.
+                 *
+                 *  If a mask is not set, then the only thing wrong is that we
+                 *  did a little more work than necessary.
+                 *
+                 *  If we read a zero count but the mask is set, because of the
+                 *  memory barriers, that can only happen when the highest prio
+                 *  task for a run queue has left the run queue, in which case,
+                 *  it will be followed by a pull. If the task we are processing
+                 *  fails to find a proper place to go, that pull request will
+                 *  pull this task if the run queue is running at a lower
+                 *  priority.
+                 */
+                smp_rmb();
+                /* Need to do the rmb for every iteration */
+                if (skip)
+                        continue;
                if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
                        continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 {
        int                 *currpri = &cp->cpu_to_pri[cpu];
        int                  oldpri  = *currpri;
-        unsigned long        flags;
+        int                  do_mb = 0;
        newpri = convert_prio(newpri);
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
         * If the cpu was currently mapped to a different value, we
         * need to map it to the new value then remove the old value.
         * Note, we must add the new value first, otherwise we risk the
-         * cpu being cleared from pri_active, and this cpu could be
+         * cpu being missed by the priority loop in cpupri_find.
-         * missed for a push or pull.
         */
        if (likely(newpri != CPUPRI_INVALID)) {
                struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
-                raw_spin_lock_irqsave(&vec->lock, flags);
                cpumask_set_cpu(cpu, vec->mask);
-                vec->count++;
+                /*
-                if (vec->count == 1)
+                 * When adding a new vector, we update the mask first,
-                        set_bit(newpri, cp->pri_active);
+                 * do a write memory barrier, and then update the count, to
+                 * make sure the vector is visible when count is set.
-                raw_spin_unlock_irqrestore(&vec->lock, flags);
+                 */
+                smp_mb__before_atomic_inc();
+                atomic_inc(&(vec)->count);
+                do_mb = 1;
        }
        if (likely(oldpri != CPUPRI_INVALID)) {
                struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
-                raw_spin_lock_irqsave(&vec->lock, flags);
+                /*
+                 * Because the order of modification of the vec->count
-                vec->count--;
+                 * is important, we must make sure that the update
-                if (!vec->count)
+                 * of the new prio is seen before we decrement the
-                        clear_bit(oldpri, cp->pri_active);
+                 * old prio. This makes sure that the loop sees
+                 * one or the other when we raise the priority of
+                 * the run queue. We don't care about when we lower the
+                 * priority, as that will trigger an rt pull anyway.
+                 *
+                 * We only need to do a memory barrier if we updated
+                 * the new priority vec.
+                 */
+                if (do_mb)
+                        smp_mb__after_atomic_inc();
+                /*
+                 * When removing from the vector, we decrement the counter first,
+                 * do a memory barrier and then clear the mask.
+                 */
+                atomic_dec(&(vec)->count);
+                smp_mb__after_atomic_inc();
                cpumask_clear_cpu(cpu, vec->mask);
-                raw_spin_unlock_irqrestore(&vec->lock, flags);
        }
        *currpri = newpri;
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
        for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
                struct cpupri_vec *vec = &cp->pri_to_cpu[i];
-                raw_spin_lock_init(&vec->lock);
+                atomic_set(&vec->count, 0);
-                vec->count = 0;
                if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
                        goto cleanup;
        }

diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1b4138..a86cf9d9eb11 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
47	return cpupri;	47	return cpupri;
48	}	48	}
49		49
50	#define for_each_cpupri_active(array, idx) \
51	for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
52
53	/**	50	/**
54	* cpupri_find - find the best (lowest-pri) CPU in the system	51	* cpupri_find - find the best (lowest-pri) CPU in the system
55	* @cp: The cpupri context	52	* @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
71	int idx = 0;	68	int idx = 0;
72	int task_pri = convert_prio(p->prio);	69	int task_pri = convert_prio(p->prio);
73		70
74	for_each_cpupri_active(cp->pri_active, idx) {	71	if (task_pri >= MAX_RT_PRIO)
75	struct cpupri_vec *vec = &cp->pri_to_cpu[idx];	72	return 0;
76		73
77	if (idx >= task_pri)	74	for (idx = 0; idx < task_pri; idx++) {
78	break;	75	struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
		76	int skip = 0;
		77
		78	if (!atomic_read(&(vec)->count))
		79	skip = 1;
		80	/*
		81	* When looking at the vector, we need to read the counter,
		82	* do a memory barrier, then read the mask.
		83	*
		84	* Note: This is still all racy, but we can deal with it.
		85	* Ideally, we only want to look at masks that are set.
		86	*
		87	* If a mask is not set, then the only thing wrong is that we
		88	* did a little more work than necessary.
		89	*
		90	* If we read a zero count but the mask is set, because of the
		91	* memory barriers, that can only happen when the highest prio
		92	* task for a run queue has left the run queue, in which case,
		93	* it will be followed by a pull. If the task we are processing
		94	* fails to find a proper place to go, that pull request will
		95	* pull this task if the run queue is running at a lower
		96	* priority.
		97	*/
		98	smp_rmb();
		99
		100	/* Need to do the rmb for every iteration */
		101	if (skip)
		102	continue;
79		103
80	if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)	104	if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
81	continue;	105	continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
115	{	139	{
116	int *currpri = &cp->cpu_to_pri[cpu];	140	int *currpri = &cp->cpu_to_pri[cpu];
117	int oldpri = *currpri;	141	int oldpri = *currpri;
118	unsigned long flags;	142	int do_mb = 0;
119		143
120	newpri = convert_prio(newpri);	144	newpri = convert_prio(newpri);
121		145
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
128	* If the cpu was currently mapped to a different value, we	152	* If the cpu was currently mapped to a different value, we
129	* need to map it to the new value then remove the old value.	153	* need to map it to the new value then remove the old value.
130	* Note, we must add the new value first, otherwise we risk the	154	* Note, we must add the new value first, otherwise we risk the
131	* cpu being cleared from pri_active, and this cpu could be	155	* cpu being missed by the priority loop in cpupri_find.
132	* missed for a push or pull.
133	*/	156	*/
134	if (likely(newpri != CPUPRI_INVALID)) {	157	if (likely(newpri != CPUPRI_INVALID)) {
135	struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];	158	struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136		159
137	raw_spin_lock_irqsave(&vec->lock, flags);
138
139	cpumask_set_cpu(cpu, vec->mask);	160	cpumask_set_cpu(cpu, vec->mask);
140	vec->count++;	161	/*
141	if (vec->count == 1)	162	* When adding a new vector, we update the mask first,
142	set_bit(newpri, cp->pri_active);	163	* do a write memory barrier, and then update the count, to
143		164	* make sure the vector is visible when count is set.
144	raw_spin_unlock_irqrestore(&vec->lock, flags);	165	*/
		166	smp_mb__before_atomic_inc();
		167	atomic_inc(&(vec)->count);
		168	do_mb = 1;
145	}	169	}
146	if (likely(oldpri != CPUPRI_INVALID)) {	170	if (likely(oldpri != CPUPRI_INVALID)) {
147	struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];	171	struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
148		172
149	raw_spin_lock_irqsave(&vec->lock, flags);	173	/*
150		174	* Because the order of modification of the vec->count
151	vec->count--;	175	* is important, we must make sure that the update
152	if (!vec->count)	176	* of the new prio is seen before we decrement the
153	clear_bit(oldpri, cp->pri_active);	177	* old prio. This makes sure that the loop sees
		178	* one or the other when we raise the priority of
		179	* the run queue. We don't care about when we lower the
		180	* priority, as that will trigger an rt pull anyway.
		181	*
		182	* We only need to do a memory barrier if we updated
		183	* the new priority vec.
		184	*/
		185	if (do_mb)
		186	smp_mb__after_atomic_inc();
		187
		188	/*
		189	* When removing from the vector, we decrement the counter first,
		190	* do a memory barrier and then clear the mask.
		191	*/
		192	atomic_dec(&(vec)->count);
		193	smp_mb__after_atomic_inc();
154	cpumask_clear_cpu(cpu, vec->mask);	194	cpumask_clear_cpu(cpu, vec->mask);
155
156	raw_spin_unlock_irqrestore(&vec->lock, flags);
157	}	195	}
158		196
159	*currpri = newpri;	197	*currpri = newpri;
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
175	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {	213	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
176	struct cpupri_vec *vec = &cp->pri_to_cpu[i];	214	struct cpupri_vec *vec = &cp->pri_to_cpu[i];
177		215
178	raw_spin_lock_init(&vec->lock);	216	atomic_set(&vec->count, 0);
179	vec->count = 0;
180	if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))	217	if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
181	goto cleanup;	218	goto cleanup;
182	}	219	}