aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGregory Haskins <ghaskins@novell.com>2008-05-12 15:21:01 -0400
committerIngo Molnar <mingo@elte.hu>2008-06-06 09:19:28 -0400
commit6e0534f278199f1e3dd1049b9bc19a7a5b87ada1 (patch)
tree25f4da14ec32927742db9f599ac779b4e83d1763
parentf333fdc9098b71e2687e4e9b6349fcb352960d66 (diff)
sched: use a 2-d bitmap for searching lowest-pri CPU
The current code uses a linear algorithm which causes scaling issues on larger SMP machines. This patch replaces that algorithm with a 2-dimensional bitmap to reduce latencies in the wake-up path. Signed-off-by: Gregory Haskins <ghaskins@novell.com> Acked-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/sched.c7
-rw-r--r--kernel/sched_cpupri.c174
-rw-r--r--kernel/sched_cpupri.h36
-rw-r--r--kernel/sched_rt.c98
5 files changed, 239 insertions, 77 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..ecdd2d335639 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
70obj-$(CONFIG_MARKERS) += marker.o 70obj-$(CONFIG_MARKERS) += marker.o
71obj-$(CONFIG_LATENCYTOP) += latencytop.o 71obj-$(CONFIG_LATENCYTOP) += latencytop.o
72obj-$(CONFIG_SMP) += sched_cpupri.o
72 73
73ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 74ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
74# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 75# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/sched.c b/kernel/sched.c
index aa960b84b881..8a1257b65560 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -74,6 +74,8 @@
74#include <asm/tlb.h> 74#include <asm/tlb.h>
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77#include "sched_cpupri.h"
78
77/* 79/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 80 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 81 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -450,6 +452,9 @@ struct root_domain {
450 */ 452 */
451 cpumask_t rto_mask; 453 cpumask_t rto_mask;
452 atomic_t rto_count; 454 atomic_t rto_count;
455#ifdef CONFIG_SMP
456 struct cpupri cpupri;
457#endif
453}; 458};
454 459
455/* 460/*
@@ -6392,6 +6397,8 @@ static void init_rootdomain(struct root_domain *rd)
6392 6397
6393 cpus_clear(rd->span); 6398 cpus_clear(rd->span);
6394 cpus_clear(rd->online); 6399 cpus_clear(rd->online);
6400
6401 cpupri_init(&rd->cpupri);
6395} 6402}
6396 6403
6397static void init_defrootdomain(void) 6404static void init_defrootdomain(void)
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
1/*
2 * kernel/sched_cpupri.c
3 *
4 * CPU priority management
5 *
6 * Copyright (C) 2007-2008 Novell
7 *
8 * Author: Gregory Haskins <ghaskins@novell.com>
9 *
10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */
29
30#include "sched_cpupri.h"
31
32/* Convert between a 140 based task->prio, and our 102 based cpupri */
33static int convert_prio(int prio)
34{
35 int cpupri;
36
37 if (prio == CPUPRI_INVALID)
38 cpupri = CPUPRI_INVALID;
39 else if (prio == MAX_PRIO)
40 cpupri = CPUPRI_IDLE;
41 else if (prio >= MAX_RT_PRIO)
42 cpupri = CPUPRI_NORMAL;
43 else
44 cpupri = MAX_RT_PRIO - prio + 1;
45
46 return cpupri;
47}
48
/*
 * for_each_cpupri_active - iterate over the set bits of a cpupri bitmap
 * @array: bitmap holding CPUPRI_NR_PRIORITIES bits (e.g. cp->pri_active)
 * @idx:   integer lvalue that receives the index of each set bit in turn
 *
 * The walk starts at the first set bit and advances with find_next_bit()
 * until the returned index reaches CPUPRI_NR_PRIORITIES.  Every macro
 * argument and the loop bound are parenthesized so that the expansion does
 * not depend on operator precedence at the call site.
 */
#define for_each_cpupri_active(array, idx)				\
	for ((idx) = find_first_bit((array), (CPUPRI_NR_PRIORITIES));	\
	     (idx) < (CPUPRI_NR_PRIORITIES);				\
	     (idx) = find_next_bit((array), (CPUPRI_NR_PRIORITIES), (idx) + 1))
53
54/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context
57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs
59 *
60 * Note: This function returns the recommended CPUs as calculated during the
61 * current invokation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current
65 * priority configuration.
66 *
67 * Returns: (int)bool - CPUs were found
68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask)
71{
72 int idx = 0;
73 int task_pri = convert_prio(p->prio);
74
75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78
79 if (idx >= task_pri)
80 break;
81
82 cpus_and(mask, p->cpus_allowed, vec->mask);
83
84 if (cpus_empty(mask))
85 continue;
86
87 *lowest_mask = mask;
88 return 1;
89 }
90
91 return 0;
92}
93
94/**
95 * cpupri_set - update the cpu priority setting
96 * @cp: The cpupri context
97 * @cpu: The target cpu
98 * @pri: The priority (INVALID-RT99) to assign to this CPU
99 *
100 * Note: Assumes cpu_rq(cpu)->lock is locked
101 *
102 * Returns: (void)
103 */
104void cpupri_set(struct cpupri *cp, int cpu, int newpri)
105{
106 int *currpri = &cp->cpu_to_pri[cpu];
107 int oldpri = *currpri;
108 unsigned long flags;
109
110 newpri = convert_prio(newpri);
111
112 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
113
114 if (newpri == oldpri)
115 return;
116
117 /*
118 * If the cpu was currently mapped to a different value, we
119 * first need to unmap the old value
120 */
121 if (likely(oldpri != CPUPRI_INVALID)) {
122 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
123
124 spin_lock_irqsave(&vec->lock, flags);
125
126 vec->count--;
127 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask);
130
131 spin_unlock_irqrestore(&vec->lock, flags);
132 }
133
134 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136
137 spin_lock_irqsave(&vec->lock, flags);
138
139 cpu_set(cpu, vec->mask);
140 vec->count++;
141 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active);
143
144 spin_unlock_irqrestore(&vec->lock, flags);
145 }
146
147 *currpri = newpri;
148}
149
150/**
151 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context
153 *
154 * Returns: (void)
155 */
156void cpupri_init(struct cpupri *cp)
157{
158 int i;
159
160 memset(cp, 0, sizeof(*cp));
161
162 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
163 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
164
165 spin_lock_init(&vec->lock);
166 vec->count = 0;
167 cpus_clear(vec->mask);
168 }
169
170 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID;
172}
173
174
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..0b6a3d110fac
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
1#ifndef _LINUX_CPUPRI_H
2#define _LINUX_CPUPRI_H
3
4#include <linux/sched.h>
5
6#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO
7#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG
8
9#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */
13
14struct cpupri_vec {
15 spinlock_t lock;
16 int count;
17 cpumask_t mask;
18};
19
20struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS];
24};
25
26#ifdef CONFIG_SMP
27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp);
31#else
32#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0)
34#endif
35
36#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fefed39fafd8..44b06d75416e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -391,8 +391,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
391 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 391 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
392 rt_rq->rt_nr_running++; 392 rt_rq->rt_nr_running++;
393#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 393#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
394 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 394 if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
395 struct rq *rq = rq_of_rt_rq(rt_rq);
395 rt_rq->highest_prio = rt_se_prio(rt_se); 396 rt_rq->highest_prio = rt_se_prio(rt_se);
397 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se));
398 }
396#endif 399#endif
397#ifdef CONFIG_SMP 400#ifdef CONFIG_SMP
398 if (rt_se->nr_cpus_allowed > 1) { 401 if (rt_se->nr_cpus_allowed > 1) {
@@ -416,6 +419,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
416static inline 419static inline
417void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 420void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
418{ 421{
422#ifdef CONFIG_SMP
423 int highest_prio = rt_rq->highest_prio;
424#endif
425
419 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 426 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
420 WARN_ON(!rt_rq->rt_nr_running); 427 WARN_ON(!rt_rq->rt_nr_running);
421 rt_rq->rt_nr_running--; 428 rt_rq->rt_nr_running--;
@@ -439,6 +446,11 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
439 rq->rt.rt_nr_migratory--; 446 rq->rt.rt_nr_migratory--;
440 } 447 }
441 448
449 if (rt_rq->highest_prio != highest_prio) {
450 struct rq *rq = rq_of_rt_rq(rt_rq);
451 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
452 }
453
442 update_rt_migration(rq_of_rt_rq(rt_rq)); 454 update_rt_migration(rq_of_rt_rq(rt_rq));
443#endif /* CONFIG_SMP */ 455#endif /* CONFIG_SMP */
444#ifdef CONFIG_RT_GROUP_SCHED 456#ifdef CONFIG_RT_GROUP_SCHED
@@ -763,73 +775,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
763 775
764static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 776static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
765 777
766static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
767{
768 int lowest_prio = -1;
769 int lowest_cpu = -1;
770 int count = 0;
771 int cpu;
772
773 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
774
775 /*
776 * Scan each rq for the lowest prio.
777 */
778 for_each_cpu_mask(cpu, *lowest_mask) {
779 struct rq *rq = cpu_rq(cpu);
780
781 /* We look for lowest RT prio or non-rt CPU */
782 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
783 /*
784 * if we already found a low RT queue
785 * and now we found this non-rt queue
786 * clear the mask and set our bit.
787 * Otherwise just return the queue as is
788 * and the count==1 will cause the algorithm
789 * to use the first bit found.
790 */
791 if (lowest_cpu != -1) {
792 cpus_clear(*lowest_mask);
793 cpu_set(rq->cpu, *lowest_mask);
794 }
795 return 1;
796 }
797
798 /* no locking for now */
799 if ((rq->rt.highest_prio > task->prio)
800 && (rq->rt.highest_prio >= lowest_prio)) {
801 if (rq->rt.highest_prio > lowest_prio) {
802 /* new low - clear old data */
803 lowest_prio = rq->rt.highest_prio;
804 lowest_cpu = cpu;
805 count = 0;
806 }
807 count++;
808 } else
809 cpu_clear(cpu, *lowest_mask);
810 }
811
812 /*
813 * Clear out all the set bits that represent
814 * runqueues that were of higher prio than
815 * the lowest_prio.
816 */
817 if (lowest_cpu > 0) {
818 /*
819 * Perhaps we could add another cpumask op to
820 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
821 * Then that could be optimized to use memset and such.
822 */
823 for_each_cpu_mask(cpu, *lowest_mask) {
824 if (cpu >= lowest_cpu)
825 break;
826 cpu_clear(cpu, *lowest_mask);
827 }
828 }
829
830 return count;
831}
832
833static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 778static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
834{ 779{
835 int first; 780 int first;
@@ -851,17 +796,12 @@ static int find_lowest_rq(struct task_struct *task)
851 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 796 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
852 int this_cpu = smp_processor_id(); 797 int this_cpu = smp_processor_id();
853 int cpu = task_cpu(task); 798 int cpu = task_cpu(task);
854 int count = find_lowest_cpus(task, lowest_mask);
855 799
856 if (!count) 800 if (task->rt.nr_cpus_allowed == 1)
857 return -1; /* No targets found */ 801 return -1; /* No other targets possible */
858 802
859 /* 803 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
860 * There is no sense in performing an optimal search if only one 804 return -1; /* No targets found */
861 * target is found.
862 */
863 if (count == 1)
864 return first_cpu(*lowest_mask);
865 805
866 /* 806 /*
867 * At this point we have built a mask of cpus representing the 807 * At this point we have built a mask of cpus representing the
@@ -1218,6 +1158,8 @@ static void join_domain_rt(struct rq *rq)
1218{ 1158{
1219 if (rq->rt.overloaded) 1159 if (rq->rt.overloaded)
1220 rt_set_overload(rq); 1160 rt_set_overload(rq);
1161
1162 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
1221} 1163}
1222 1164
1223/* Assumes rq->lock is held */ 1165/* Assumes rq->lock is held */
@@ -1225,6 +1167,8 @@ static void leave_domain_rt(struct rq *rq)
1225{ 1167{
1226 if (rq->rt.overloaded) 1168 if (rq->rt.overloaded)
1227 rt_clear_overload(rq); 1169 rt_clear_overload(rq);
1170
1171 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1228} 1172}
1229 1173
1230/* 1174/*