diff options
-rw-r--r-- | Documentation/arm/vlocks.txt | 211 | ||||
-rw-r--r-- | arch/arm/common/vlock.S | 108 | ||||
-rw-r--r-- | arch/arm/common/vlock.h | 29 |
3 files changed, 348 insertions, 0 deletions
diff --git a/Documentation/arm/vlocks.txt b/Documentation/arm/vlocks.txt new file mode 100644 index 000000000000..415960a9bab0 --- /dev/null +++ b/Documentation/arm/vlocks.txt | |||
@@ -0,0 +1,211 @@ | |||
1 | vlocks for Bare-Metal Mutual Exclusion | ||
2 | ====================================== | ||
3 | |||
4 | Voting Locks, or "vlocks" provide a simple low-level mutual exclusion | ||
5 | mechanism, with reasonable but minimal requirements on the memory | ||
6 | system. | ||
7 | |||
8 | These are intended to be used to coordinate critical activity among CPUs | ||
9 | which are otherwise non-coherent, in situations where the hardware | ||
10 | provides no other mechanism to support this and ordinary spinlocks | ||
11 | cannot be used. | ||
12 | |||
13 | |||
14 | vlocks make use of the atomicity provided by the memory system for | ||
15 | writes to a single memory location. To arbitrate, every CPU "votes for | ||
16 | itself", by storing a unique number to a common memory location. The | ||
17 | final value seen in that memory location when all the votes have been | ||
18 | cast identifies the winner. | ||
19 | |||
20 | In order to make sure that the election produces an unambiguous result | ||
21 | in finite time, a CPU will only enter the election in the first place if | ||
22 | no winner has been chosen and the election does not appear to have | ||
23 | started yet. | ||
24 | |||
25 | |||
26 | Algorithm | ||
27 | --------- | ||
28 | |||
29 | The easiest way to explain the vlocks algorithm is with some pseudo-code: | ||
30 | |||
31 | |||
32 | int currently_voting[NR_CPUS] = { 0, }; | ||
33 | int last_vote = -1; /* no votes yet */ | ||
34 | |||
35 | bool vlock_trylock(int this_cpu) | ||
36 | { | ||
37 | /* signal our desire to vote */ | ||
38 | currently_voting[this_cpu] = 1; | ||
39 | if (last_vote != -1) { | ||
40 | /* someone already volunteered himself */ | ||
41 | currently_voting[this_cpu] = 0; | ||
42 | return false; /* not ourself */ | ||
43 | } | ||
44 | |||
45 | /* let's suggest ourself */ | ||
46 | last_vote = this_cpu; | ||
47 | currently_voting[this_cpu] = 0; | ||
48 | |||
49 | /* then wait until everyone else is done voting */ | ||
50 | for_each_cpu(i) { | ||
51 | while (currently_voting[i] != 0) | ||
52 | /* wait */; | ||
53 | } | ||
54 | |||
55 | /* result */ | ||
56 | if (last_vote == this_cpu) | ||
57 | return true; /* we won */ | ||
58 | return false; | ||
59 | } | ||
60 | |||
61 | bool vlock_unlock(void) | ||
62 | { | ||
63 | last_vote = -1; | ||
64 | } | ||
65 | |||
66 | |||
67 | The currently_voting[] array provides a way for the CPUs to determine | ||
68 | whether an election is in progress, and plays a role analogous to the | ||
69 | "entering" array in Lamport's bakery algorithm [1]. | ||
70 | |||
71 | However, once the election has started, the underlying memory system | ||
72 | atomicity is used to pick the winner. This avoids the need for a static | ||
73 | priority rule to act as a tie-breaker, or any counters which could | ||
74 | overflow. | ||
75 | |||
76 | As long as the last_vote variable is globally visible to all CPUs, it | ||
77 | will contain only one value that won't change once every CPU has cleared | ||
78 | its currently_voting flag. | ||
79 | |||
80 | |||
81 | Features and limitations | ||
82 | ------------------------ | ||
83 | |||
84 | * vlocks are not intended to be fair. In the contended case, it is the | ||
85 | _last_ CPU which attempts to get the lock which will be most likely | ||
86 | to win. | ||
87 | |||
88 | vlocks are therefore best suited to situations where it is necessary | ||
89 | to pick a unique winner, but it does not matter which CPU actually | ||
90 | wins. | ||
91 | |||
92 | * Like other similar mechanisms, vlocks will not scale well to a large | ||
93 | number of CPUs. | ||
94 | |||
95 | vlocks can be cascaded in a voting hierarchy to permit better scaling | ||
96 | if necessary, as in the following hypothetical example for 4096 CPUs: | ||
97 | |||
98 | /* first level: local election */ | ||
99 | my_town = towns[(this_cpu >> 4) & 0xf]; | ||
100 | I_won = vlock_trylock(my_town, this_cpu & 0xf); | ||
101 | if (I_won) { | ||
102 | /* we won the town election, let's go for the state */ | ||
103 | my_state = states[(this_cpu >> 8) & 0xf]; | ||
104 | I_won = vlock_lock(my_state, this_cpu & 0xf)); | ||
105 | if (I_won) { | ||
106 | /* and so on */ | ||
107 | I_won = vlock_lock(the_whole_country, this_cpu & 0xf]; | ||
108 | if (I_won) { | ||
109 | /* ... */ | ||
110 | } | ||
111 | vlock_unlock(the_whole_country); | ||
112 | } | ||
113 | vlock_unlock(my_state); | ||
114 | } | ||
115 | vlock_unlock(my_town); | ||
116 | |||
117 | |||
118 | ARM implementation | ||
119 | ------------------ | ||
120 | |||
121 | The current ARM implementation [2] contains some optimisations beyond | ||
122 | the basic algorithm: | ||
123 | |||
124 | * By packing the members of the currently_voting array close together, | ||
125 | we can read the whole array in one transaction (providing the number | ||
126 | of CPUs potentially contending the lock is small enough). This | ||
127 | reduces the number of round-trips required to external memory. | ||
128 | |||
129 | In the ARM implementation, this means that we can use a single load | ||
130 | and comparison: | ||
131 | |||
132 | LDR Rt, [Rn] | ||
133 | CMP Rt, #0 | ||
134 | |||
135 | ...in place of code equivalent to: | ||
136 | |||
137 | LDRB Rt, [Rn] | ||
138 | CMP Rt, #0 | ||
139 | LDRBEQ Rt, [Rn, #1] | ||
140 | CMPEQ Rt, #0 | ||
141 | LDRBEQ Rt, [Rn, #2] | ||
142 | CMPEQ Rt, #0 | ||
143 | LDRBEQ Rt, [Rn, #3] | ||
144 | CMPEQ Rt, #0 | ||
145 | |||
146 | This cuts down on the fast-path latency, as well as potentially | ||
147 | reducing bus contention in contended cases. | ||
148 | |||
149 | The optimisation relies on the fact that the ARM memory system | ||
150 | guarantees coherency between overlapping memory accesses of | ||
151 | different sizes, similarly to many other architectures. Note that | ||
152 | we do not care which element of currently_voting appears in which | ||
153 | bits of Rt, so there is no need to worry about endianness in this | ||
154 | optimisation. | ||
155 | |||
156 | If there are too many CPUs to read the currently_voting array in | ||
157 | one transaction then multiple transations are still required. The | ||
158 | implementation uses a simple loop of word-sized loads for this | ||
159 | case. The number of transactions is still fewer than would be | ||
160 | required if bytes were loaded individually. | ||
161 | |||
162 | |||
163 | In principle, we could aggregate further by using LDRD or LDM, but | ||
164 | to keep the code simple this was not attempted in the initial | ||
165 | implementation. | ||
166 | |||
167 | |||
168 | * vlocks are currently only used to coordinate between CPUs which are | ||
169 | unable to enable their caches yet. This means that the | ||
170 | implementation removes many of the barriers which would be required | ||
171 | when executing the algorithm in cached memory. | ||
172 | |||
173 | packing of the currently_voting array does not work with cached | ||
174 | memory unless all CPUs contending the lock are cache-coherent, due | ||
175 | to cache writebacks from one CPU clobbering values written by other | ||
176 | CPUs. (Though if all the CPUs are cache-coherent, you should be | ||
177 | probably be using proper spinlocks instead anyway). | ||
178 | |||
179 | |||
180 | * The "no votes yet" value used for the last_vote variable is 0 (not | ||
181 | -1 as in the pseudocode). This allows statically-allocated vlocks | ||
182 | to be implicitly initialised to an unlocked state simply by putting | ||
183 | them in .bss. | ||
184 | |||
185 | An offset is added to each CPU's ID for the purpose of setting this | ||
186 | variable, so that no CPU uses the value 0 for its ID. | ||
187 | |||
188 | |||
189 | Colophon | ||
190 | -------- | ||
191 | |||
192 | Originally created and documented by Dave Martin for Linaro Limited, for | ||
193 | use in ARM-based big.LITTLE platforms, with review and input gratefully | ||
194 | received from Nicolas Pitre and Achin Gupta. Thanks to Nicolas for | ||
195 | grabbing most of this text out of the relevant mail thread and writing | ||
196 | up the pseudocode. | ||
197 | |||
198 | Copyright (C) 2012-2013 Linaro Limited | ||
199 | Distributed under the terms of Version 2 of the GNU General Public | ||
200 | License, as defined in linux/COPYING. | ||
201 | |||
202 | |||
203 | References | ||
204 | ---------- | ||
205 | |||
206 | [1] Lamport, L. "A New Solution of Dijkstra's Concurrent Programming | ||
207 | Problem", Communications of the ACM 17, 8 (August 1974), 453-455. | ||
208 | |||
209 | http://en.wikipedia.org/wiki/Lamport%27s_bakery_algorithm | ||
210 | |||
211 | [2] linux/arch/arm/common/vlock.S, www.kernel.org. | ||
diff --git a/arch/arm/common/vlock.S b/arch/arm/common/vlock.S new file mode 100644 index 000000000000..ff198583f683 --- /dev/null +++ b/arch/arm/common/vlock.S | |||
@@ -0,0 +1,108 @@ | |||
1 | /* | ||
2 | * vlock.S - simple voting lock implementation for ARM | ||
3 | * | ||
4 | * Created by: Dave Martin, 2012-08-16 | ||
5 | * Copyright: (C) 2012-2013 Linaro Limited | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * | ||
17 | * This algorithm is described in more detail in | ||
18 | * Documentation/arm/vlocks.txt. | ||
19 | */ | ||
20 | |||
21 | #include <linux/linkage.h> | ||
22 | #include "vlock.h" | ||
23 | |||
24 | /* Select different code if voting flags can fit in a single word. */ | ||
25 | #if VLOCK_VOTING_SIZE > 4 | ||
26 | #define FEW(x...) | ||
27 | #define MANY(x...) x | ||
28 | #else | ||
29 | #define FEW(x...) x | ||
30 | #define MANY(x...) | ||
31 | #endif | ||
32 | |||
33 | @ voting lock for first-man coordination | ||
34 | |||
35 | .macro voting_begin rbase:req, rcpu:req, rscratch:req | ||
36 | mov \rscratch, #1 | ||
37 | strb \rscratch, [\rbase, \rcpu] | ||
38 | dmb | ||
39 | .endm | ||
40 | |||
41 | .macro voting_end rbase:req, rcpu:req, rscratch:req | ||
42 | dmb | ||
43 | mov \rscratch, #0 | ||
44 | strb \rscratch, [\rbase, \rcpu] | ||
45 | dsb | ||
46 | sev | ||
47 | .endm | ||
48 | |||
49 | /* | ||
50 | * The vlock structure must reside in Strongly-Ordered or Device memory. | ||
51 | * This implementation deliberately eliminates most of the barriers which | ||
52 | * would be required for other memory types, and assumes that independent | ||
53 | * writes to neighbouring locations within a cacheline do not interfere | ||
54 | * with one another. | ||
55 | */ | ||
56 | |||
57 | @ r0: lock structure base | ||
58 | @ r1: CPU ID (0-based index within cluster) | ||
59 | ENTRY(vlock_trylock) | ||
60 | add r1, r1, #VLOCK_VOTING_OFFSET | ||
61 | |||
62 | voting_begin r0, r1, r2 | ||
63 | |||
64 | ldrb r2, [r0, #VLOCK_OWNER_OFFSET] @ check whether lock is held | ||
65 | cmp r2, #VLOCK_OWNER_NONE | ||
66 | bne trylock_fail @ fail if so | ||
67 | |||
68 | @ Control dependency implies strb not observable before previous ldrb. | ||
69 | |||
70 | strb r1, [r0, #VLOCK_OWNER_OFFSET] @ submit my vote | ||
71 | |||
72 | voting_end r0, r1, r2 @ implies DMB | ||
73 | |||
74 | @ Wait for the current round of voting to finish: | ||
75 | |||
76 | MANY( mov r3, #VLOCK_VOTING_OFFSET ) | ||
77 | 0: | ||
78 | MANY( ldr r2, [r0, r3] ) | ||
79 | FEW( ldr r2, [r0, #VLOCK_VOTING_OFFSET] ) | ||
80 | cmp r2, #0 | ||
81 | wfene | ||
82 | bne 0b | ||
83 | MANY( add r3, r3, #4 ) | ||
84 | MANY( cmp r3, #VLOCK_VOTING_OFFSET + VLOCK_VOTING_SIZE ) | ||
85 | MANY( bne 0b ) | ||
86 | |||
87 | @ Check who won: | ||
88 | |||
89 | dmb | ||
90 | ldrb r2, [r0, #VLOCK_OWNER_OFFSET] | ||
91 | eor r0, r1, r2 @ zero if I won, else nonzero | ||
92 | bx lr | ||
93 | |||
94 | trylock_fail: | ||
95 | voting_end r0, r1, r2 | ||
96 | mov r0, #1 @ nonzero indicates that I lost | ||
97 | bx lr | ||
98 | ENDPROC(vlock_trylock) | ||
99 | |||
100 | @ r0: lock structure base | ||
101 | ENTRY(vlock_unlock) | ||
102 | dmb | ||
103 | mov r1, #VLOCK_OWNER_NONE | ||
104 | strb r1, [r0, #VLOCK_OWNER_OFFSET] | ||
105 | dsb | ||
106 | sev | ||
107 | bx lr | ||
108 | ENDPROC(vlock_unlock) | ||
diff --git a/arch/arm/common/vlock.h b/arch/arm/common/vlock.h new file mode 100644 index 000000000000..3b441475a59b --- /dev/null +++ b/arch/arm/common/vlock.h | |||
@@ -0,0 +1,29 @@ | |||
1 | /* | ||
2 | * vlock.h - simple voting lock implementation | ||
3 | * | ||
4 | * Created by: Dave Martin, 2012-08-16 | ||
5 | * Copyright: (C) 2012-2013 Linaro Limited | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | */ | ||
16 | |||
17 | #ifndef __VLOCK_H | ||
18 | #define __VLOCK_H | ||
19 | |||
20 | #include <asm/mcpm.h> | ||
21 | |||
22 | /* Offsets and sizes are rounded to a word (4 bytes) */ | ||
23 | #define VLOCK_OWNER_OFFSET 0 | ||
24 | #define VLOCK_VOTING_OFFSET 4 | ||
25 | #define VLOCK_VOTING_SIZE ((MAX_CPUS_PER_CLUSTER + 3) / 4 * 4) | ||
26 | #define VLOCK_SIZE (VLOCK_VOTING_OFFSET + VLOCK_VOTING_SIZE) | ||
27 | #define VLOCK_OWNER_NONE 0 | ||
28 | |||
29 | #endif /* ! __VLOCK_H */ | ||