author		Philipp Reisner <philipp.reisner@linbit.com>	2009-09-25 19:07:19 -0400
committer	Jens Axboe <jens.axboe@oracle.com>	2009-10-01 15:17:49 -0400
commit		b411b3637fa71fce9cf2acf0639009500f5892fe (patch)
tree		6b88e5202e0f137fef50e95b0441bcafdbf91990 /include/linux/lru_cache.h
parent		1a35e0f6443f4266dad4c569c55c57a9032596fa (diff)
The DRBD driver
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'include/linux/lru_cache.h')
-rw-r--r--	include/linux/lru_cache.h	294
1 files changed, 294 insertions, 0 deletions
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
new file mode 100644
index 000000000000..3a2b2d9b0472
--- /dev/null
+++ b/include/linux/lru_cache.h
@@ -0,0 +1,294 @@
/*
   lru_cache.h

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#ifndef LRU_CACHE_H
#define LRU_CACHE_H

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/string.h> /* for memset */
#include <linux/seq_file.h>

/*
This header file (and its .c file; see there for the kernel-doc of the
functions) defines a helper framework to easily keep track of index:label
associations, and of changes to an "active set" of objects, as well as
pending transactions, to persistently record those changes.

We use an LRU policy if it is necessary to "cool down" a region currently in
the active set before we can "heat" a previously unused region.

Because of this latter property, it is called "lru_cache".
As it actually Tracks Objects in an Active SeT, we could also call it
toast (incidentally that is what may happen to the data on the
backend storage upon next resync, if we don't get it right).

What for?

We replicate IO (more or less synchronously) to local and remote disk.

For crash recovery after replication node failure,
we need to resync all regions that have been target of in-flight WRITE IO
(in use, or "hot", regions), as we don't know whether or not those WRITEs
have made it to stable storage.

To avoid a "full resync", we need to persistently track these regions.

This is known as a "write intent log", and can be implemented as an on-disk
(coarse or fine grained) bitmap, or other meta data.

To avoid the overhead of frequent extra writes to this meta data area,
usually the condition is softened to regions that _may_ have been target of
in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
bitmap, trading frequency of meta data transactions against amount of
(possibly unnecessary) resync traffic.

If we set a hard limit on the area that may be "hot" at any given time, we
limit the amount of resync traffic needed for crash recovery.

For recovery after replication link failure,
we need to resync all blocks that have been changed on the other replica
in the mean time, or, if both replicas have been changed independently [*],
all blocks that have been changed on either replica in the mean time.
[*] usually as a result of a cluster split-brain and insufficient protection,
but there are valid use cases to do this on purpose.

Tracking those blocks can be implemented as a "dirty bitmap".
Having it fine-grained reduces the amount of resync traffic.
It should also be persistent, to allow for reboots (or crashes)
while the replication link is down.

There are various possible implementations for persistently storing
write intent log information, three of which are mentioned here.

"Chunk dirtying"
The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
To reduce the frequency of bitmap updates for write-intent log purposes,
one could dirty "chunks" (of some size) of the (fine grained) on-disk bitmap
at a time, while keeping the in-memory "dirty" bitmap as clean as possible,
flushing it to disk again when a previously "hot" (and on-disk dirtied as
full chunk) area "cools down" again (no IO in flight anymore, and none
expected in the near future either).

"Explicit (coarse) write intent bitmap"
Another implementation could choose a (probably coarse) explicit bitmap for
write-intent log purposes, in addition to the fine-grained dirty bitmap.

"Activity log"
Yet another implementation may keep track of the hot regions by starting
with an empty set, and writing down a journal of region numbers that have
become "hot", or have "cooled down" again.

To be able to use a ring buffer for this journal of changes to the active
set, we not only record the actual changes to that set, but also record the
unchanged members of the set in a round-robin fashion. To do so, we use a
fixed (but configurable) number of slots which we can identify by index, and
associate region numbers (labels) with these indices.
For each transaction recording a change to the active set, we record the
change itself (index: -old_label, +new_label), and which index is associated
with which label (index: current_label) within a certain sliding window that
is moved further over the available indices with each such transaction.

Thus, for crash recovery, if the ring buffer is sufficiently large, we can
accurately reconstruct the active set.

"Sufficiently large" depends only on the maximum number of active objects
and the size of the sliding window recording "index: current_label"
associations within each transaction.

This is what we call the "activity log".

Currently we need one activity log transaction per single label change,
which does not give much benefit over the "dirty chunks of bitmap" approach,
other than potentially fewer seeks.

We plan to change the transaction format to support multiple changes per
transaction, which then would reduce several (disjoint, "random") updates to
the bitmap into one transaction to the activity log ring buffer.
*/
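
/*
 * To make the transaction layout described above a little more concrete,
 * here is a rough sketch of what a single activity log transaction record
 * could contain.  This is purely illustrative: it is NOT the on-disk format
 * DRBD actually uses, and all names in it are made up.
 *
 *	struct al_transaction_sketch {
 *		u32 magic;		// identifies a valid record
 *		u32 tr_number;		// position in the ring buffer
 *		u32 updated_index;	// the slot whose label changes here
 *		u32 new_label;		// (index: -old_label, +new_label)
 *		u32 window_start;	// first slot of the sliding window
 *		u32 window_labels[N];	// "index: current_label" for the next
 *					// N slots, advancing round robin
 *	};
 *
 * Replaying the ring buffer from oldest to newest record then yields the
 * label of every slot, i.e. the active set at the time of the crash.
 */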

/* this defines an element in a tracked set
 * .colision is for hash table lookup.
 * When we process a new IO request, we know its sector, thus can deduce the
 * region number (label) easily.  To do the label -> object lookup without a
 * full list walk, we use a simple hash table.
 *
 * .list is on one of three lists:
 *  in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
 *     lru: unused but ready to be reused or recycled
 *          (refcnt == 0, lc_number != LC_FREE),
 *    free: unused but ready to be recycled
 *          (refcnt == 0, lc_number == LC_FREE),
 *
 * an element is said to be "in the active set"
 * if it is either on "in_use" or "lru", i.e. lc_number != LC_FREE.
 *
 * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
 * (total memory usage 2 pages), and up to 3833 elements on the act_log
 * lru_cache, totalling ~215 kB on a 64-bit architecture, ~53 pages.
 *
 * We usually do not actually free these objects again, but only "recycle"
 * them, as the change "index: -old_label, +LC_FREE" would need a transaction
 * as well.  This also means that using a kmem_cache to allocate the objects
 * from wastes some resources, but it avoids high order page allocations in
 * kmalloc.
 */
struct lc_element {
	struct hlist_node colision;
	struct list_head list;		/* LRU list or free list */
	unsigned refcnt;
	/* back "pointer" into lc->lc_element[index],
	 * for paranoia, and for "lc_index_of" */
	unsigned lc_index;
	/* if we want to track a larger set of objects,
	 * it needs to become an arch independent u64 */
	unsigned lc_number;

	/* special label when on free list */
#define LC_FREE (~0U)
};
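
/*
 * A struct lc_element is not used on its own, but embedded into the object
 * the caller actually wants to track.  A minimal sketch of such a tracked
 * object (the name "app_extent" and its members are made up for illustration,
 * this is not a DRBD structure):
 *
 *	struct app_extent {
 *		struct lc_element lce;		// embedded bookkeeping element
 *		unsigned long private_state;	// whatever the caller needs
 *	};
 *
 * Given a struct lc_element *e handed out by this framework, the enclosing
 * object is recovered with the lc_entry() macro defined at the end of this
 * header (a thin wrapper around container_of()):
 *
 *	struct app_extent *ext = lc_entry(e, struct app_extent, lce);
 */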

struct lru_cache {
	/* the least recently used item is kept at lru->prev */
	struct list_head lru;
	struct list_head free;
	struct list_head in_use;

	/* the pre-created kmem cache to allocate the objects from */
	struct kmem_cache *lc_cache;

	/* size of tracked objects, used to memset(,0,) them in lc_reset */
	size_t element_size;
	/* offset of struct lc_element member in the tracked object */
	size_t element_off;

	/* number of elements (indices) */
	unsigned int nr_elements;
	/* Arbitrary limit on maximum tracked objects.  Practical limit is much
	 * lower due to allocation failures, probably.  For typical use cases,
	 * nr_elements should be a few thousand at most.
	 * This also limits the maximum value of lc_element.lc_index, allowing
	 * the 8 high bits of .lc_index to be overloaded with flags in the
	 * future. */
#define LC_MAX_ACTIVE	(1<<24)

	/* statistics */
	unsigned used; /* number of elements currently on the in_use list */
	unsigned long hits, misses, starving, dirty, changed;

	/* see below: flag-bits for lru_cache */
	unsigned long flags;

	/* when changing the label of an index element */
	unsigned int new_number;

	/* for paranoia when changing the label of an index element */
	struct lc_element *changing_element;

	void *lc_private;
	const char *name;

	/* both arrays below have nr_elements entries */
	struct hlist_head *lc_slot;
	struct lc_element **lc_element;
};
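
/*
 * element_size and element_off are simply the sizeof() of the enclosing
 * structure and the offsetof() of the embedded struct lc_element, as passed
 * to lc_create() (declared below).  A hedged sketch, reusing the made-up
 * "struct app_extent" from the example above; the names and the element
 * count are illustrative only:
 *
 *	struct kmem_cache *cache;
 *	struct lru_cache *lc;
 *
 *	cache = kmem_cache_create("app_extents",
 *				  sizeof(struct app_extent), 0, 0, NULL);
 *	lc = lc_create("app_active_set", cache, 61,
 *		       sizeof(struct app_extent),
 *		       offsetof(struct app_extent, lce));
 *	if (!lc)
 *		// handle the failure, kmem_cache_destroy(cache), ...
 *
 * (61 happens to be the element count DRBD uses for its resync lru_cache,
 * as mentioned in the comment above struct lc_element.)
 */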


/* flag-bits for lru_cache */
enum {
	/* debugging aid, to catch concurrent access early.
	 * The user needs to guarantee exclusive access by proper locking! */
	__LC_PARANOIA,
	/* if we need to change the set, but currently there is a changing
	 * transaction pending, we are "dirty", and must defer further
	 * changing requests */
	__LC_DIRTY,
	/* if we need to change the set, but currently there is no free nor
	 * unused element available, we are "starving", and must not give out
	 * further references, to guarantee that eventually some refcnt will
	 * drop to zero and we will be able to make progress again, changing
	 * the set, writing the transaction.
	 * If the statistics say we are frequently starving,
	 * nr_elements is too small. */
	__LC_STARVING,
};
#define LC_PARANOIA (1<<__LC_PARANOIA)
#define LC_DIRTY (1<<__LC_DIRTY)
#define LC_STARVING (1<<__LC_STARVING)

extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
		unsigned e_count, size_t e_size, size_t e_off);
extern void lc_reset(struct lru_cache *lc);
extern void lc_destroy(struct lru_cache *lc);
extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
extern void lc_del(struct lru_cache *lc, struct lc_element *element);

extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
extern void lc_changed(struct lru_cache *lc, struct lc_element *e);

struct seq_file;
extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);

extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
				void (*detail) (struct seq_file *, struct lc_element *));
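
/*
 * A rough sketch of the calling pattern the declarations above are meant
 * for, inferred from the comments in this file; the transaction write is
 * only a placeholder, and "enr" is the caller's region number (label):
 *
 *	struct lc_element *e;
 *
 *	e = lc_get(lc, enr);	// NULL if dirty or starving; the caller
 *				// has to back off and retry later then
 *	if (e && e->lc_number != enr) {
 *		// persistently record the label change (the "transaction")
 *		lc_changed(lc, e);	// then commit it in the cache
 *	}
 *	// region "enr" is now in the active set; do the actual work
 *	lc_put(lc, e);		// drop the reference again when done
 */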

/**
 * lc_try_lock - can be used to stop lc_get() from changing the tracked set
 * @lc: the lru cache to operate on
 *
 * Note that the reference counts and order on the active and lru lists may
 * still change.  Returns true if we acquired the lock.
 */
static inline int lc_try_lock(struct lru_cache *lc)
{
	return !test_and_set_bit(__LC_DIRTY, &lc->flags);
}

/**
 * lc_unlock - unlock @lc, allow lc_get() to change the set again
 * @lc: the lru cache to operate on
 */
static inline void lc_unlock(struct lru_cache *lc)
{
	clear_bit(__LC_DIRTY, &lc->flags);
	smp_mb__after_clear_bit();
}
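
/*
 * Typical use of the pair above is to temporarily freeze the tracked set,
 * e.g. while its current contents are written out.  A hedged sketch (the
 * write-out step is a placeholder, not part of this API):
 *
 *	if (lc_try_lock(lc)) {
 *		// walk/serialize the active set; lc_get() cannot change it now
 *		lc_unlock(lc);
 *	} else {
 *		// a change is already in flight, try again later
 *	}
 */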

static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
{
	struct lc_element *e = lc_find(lc, enr);
	return e && e->refcnt;
}

#define lc_entry(ptr, type, member) \
	container_of(ptr, type, member)

extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);
extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);

#endif