diff options
author | Heinz Mauelshagen <hjm@redhat.com> | 2008-10-21 12:45:06 -0400 |
---|---|---|
committer | Alasdair G Kergon <agk@redhat.com> | 2008-10-21 12:45:06 -0400 |
commit | 1f965b19437017cea6d3f3f46acdc5acae5fd011 (patch) | |
tree | f70fd0684d1afbde7f0031a6f8cb6aa58880723c /drivers/md | |
parent | f3e1d26ede3fb15c06904d700f1d7b21bba2215e (diff) |
dm raid1: separate region_hash interface part1
Separate the region hash code from raid1 so it can be shared by forthcoming
targets. Use BUG_ON() for failed async dm_io() calls.
Signed-off-by: Heinz Mauelshagen <hjm@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Makefile | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 789 | ||||
-rw-r--r-- | drivers/md/dm-region-hash.c | 704 |
3 files changed, 808 insertions, 687 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f1ef33dfd8cf..1c615804ea76 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -34,7 +34,7 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | |||
34 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | 34 | obj-$(CONFIG_DM_DELAY) += dm-delay.o |
35 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 35 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
36 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | 36 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o |
37 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o | 37 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
38 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 38 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
39 | 39 | ||
40 | quiet_cmd_unroll = UNROLL $@ | 40 | quiet_cmd_unroll = UNROLL $@ |
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index f358853af5cf..92dcc06832a4 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -1,31 +1,30 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2003 Sistina Software Limited. | 2 | * Copyright (C) 2003 Sistina Software Limited. |
3 | * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. | ||
3 | * | 4 | * |
4 | * This file is released under the GPL. | 5 | * This file is released under the GPL. |
5 | */ | 6 | */ |
6 | 7 | ||
7 | #include <linux/device-mapper.h> | ||
8 | |||
9 | #include "dm-bio-list.h" | 8 | #include "dm-bio-list.h" |
10 | #include "dm-bio-record.h" | 9 | #include "dm-bio-record.h" |
11 | 10 | ||
12 | #include <linux/ctype.h> | ||
13 | #include <linux/init.h> | 11 | #include <linux/init.h> |
14 | #include <linux/mempool.h> | 12 | #include <linux/mempool.h> |
15 | #include <linux/module.h> | 13 | #include <linux/module.h> |
16 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
17 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
18 | #include <linux/time.h> | ||
19 | #include <linux/vmalloc.h> | ||
20 | #include <linux/workqueue.h> | 16 | #include <linux/workqueue.h> |
21 | #include <linux/log2.h> | 17 | #include <linux/device-mapper.h> |
22 | #include <linux/hardirq.h> | ||
23 | #include <linux/dm-io.h> | 18 | #include <linux/dm-io.h> |
24 | #include <linux/dm-dirty-log.h> | 19 | #include <linux/dm-dirty-log.h> |
25 | #include <linux/dm-kcopyd.h> | 20 | #include <linux/dm-kcopyd.h> |
21 | #include <linux/dm-region-hash.h> | ||
26 | 22 | ||
27 | #define DM_MSG_PREFIX "raid1" | 23 | #define DM_MSG_PREFIX "raid1" |
24 | |||
25 | #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ | ||
28 | #define DM_IO_PAGES 64 | 26 | #define DM_IO_PAGES 64 |
27 | #define DM_KCOPYD_PAGES 64 | ||
29 | 28 | ||
30 | #define DM_RAID1_HANDLE_ERRORS 0x01 | 29 | #define DM_RAID1_HANDLE_ERRORS 0x01 |
31 | #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) | 30 | #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) |
@@ -33,87 +32,6 @@ | |||
33 | static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); | 32 | static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); |
34 | 33 | ||
35 | /*----------------------------------------------------------------- | 34 | /*----------------------------------------------------------------- |
36 | * Region hash | ||
37 | * | ||
38 | * The mirror splits itself up into discrete regions. Each | ||
39 | * region can be in one of three states: clean, dirty, | ||
40 | * nosync. There is no need to put clean regions in the hash. | ||
41 | * | ||
42 | * In addition to being present in the hash table a region _may_ | ||
43 | * be present on one of three lists. | ||
44 | * | ||
45 | * clean_regions: Regions on this list have no io pending to | ||
46 | * them, they are in sync, we are no longer interested in them, | ||
47 | * they are dull. rh_update_states() will remove them from the | ||
48 | * hash table. | ||
49 | * | ||
50 | * quiesced_regions: These regions have been spun down, ready | ||
51 | * for recovery. rh_recovery_start() will remove regions from | ||
52 | * this list and hand them to kmirrord, which will schedule the | ||
53 | * recovery io with kcopyd. | ||
54 | * | ||
55 | * recovered_regions: Regions that kcopyd has successfully | ||
56 | * recovered. rh_update_states() will now schedule any delayed | ||
57 | * io, up the recovery_count, and remove the region from the | ||
58 | * hash. | ||
59 | * | ||
60 | * There are 2 locks: | ||
61 | * A rw spin lock 'hash_lock' protects just the hash table, | ||
62 | * this is never held in write mode from interrupt context, | ||
63 | * which I believe means that we only have to disable irqs when | ||
64 | * doing a write lock. | ||
65 | * | ||
66 | * An ordinary spin lock 'region_lock' that protects the three | ||
67 | * lists in the region_hash, with the 'state', 'list' and | ||
68 | * 'bhs_delayed' fields of the regions. This is used from irq | ||
69 | * context, so all other uses will have to suspend local irqs. | ||
70 | *---------------------------------------------------------------*/ | ||
71 | struct mirror_set; | ||
72 | struct region_hash { | ||
73 | struct mirror_set *ms; | ||
74 | uint32_t region_size; | ||
75 | unsigned region_shift; | ||
76 | |||
77 | /* holds persistent region state */ | ||
78 | struct dm_dirty_log *log; | ||
79 | |||
80 | /* hash table */ | ||
81 | rwlock_t hash_lock; | ||
82 | mempool_t *region_pool; | ||
83 | unsigned int mask; | ||
84 | unsigned int nr_buckets; | ||
85 | struct list_head *buckets; | ||
86 | |||
87 | spinlock_t region_lock; | ||
88 | atomic_t recovery_in_flight; | ||
89 | struct semaphore recovery_count; | ||
90 | struct list_head clean_regions; | ||
91 | struct list_head quiesced_regions; | ||
92 | struct list_head recovered_regions; | ||
93 | struct list_head failed_recovered_regions; | ||
94 | }; | ||
95 | |||
96 | enum { | ||
97 | RH_CLEAN, | ||
98 | RH_DIRTY, | ||
99 | RH_NOSYNC, | ||
100 | RH_RECOVERING | ||
101 | }; | ||
102 | |||
103 | struct region { | ||
104 | struct region_hash *rh; /* FIXME: can we get rid of this ? */ | ||
105 | region_t key; | ||
106 | int state; | ||
107 | |||
108 | struct list_head hash_list; | ||
109 | struct list_head list; | ||
110 | |||
111 | atomic_t pending; | ||
112 | struct bio_list delayed_bios; | ||
113 | }; | ||
114 | |||
115 | |||
116 | /*----------------------------------------------------------------- | ||
117 | * Mirror set structures. | 35 | * Mirror set structures. |
118 | *---------------------------------------------------------------*/ | 36 | *---------------------------------------------------------------*/ |
119 | enum dm_raid1_error { | 37 | enum dm_raid1_error { |
@@ -133,8 +51,7 @@ struct mirror { | |||
133 | struct mirror_set { | 51 | struct mirror_set { |
134 | struct dm_target *ti; | 52 | struct dm_target *ti; |
135 | struct list_head list; | 53 | struct list_head list; |
136 | struct region_hash rh; | 54 | |
137 | struct dm_kcopyd_client *kcopyd_client; | ||
138 | uint64_t features; | 55 | uint64_t features; |
139 | 56 | ||
140 | spinlock_t lock; /* protects the lists */ | 57 | spinlock_t lock; /* protects the lists */ |
@@ -142,6 +59,8 @@ struct mirror_set { | |||
142 | struct bio_list writes; | 59 | struct bio_list writes; |
143 | struct bio_list failures; | 60 | struct bio_list failures; |
144 | 61 | ||
62 | struct dm_region_hash *rh; | ||
63 | struct dm_kcopyd_client *kcopyd_client; | ||
145 | struct dm_io_client *io_client; | 64 | struct dm_io_client *io_client; |
146 | mempool_t *read_record_pool; | 65 | mempool_t *read_record_pool; |
147 | 66 | ||
@@ -160,25 +79,14 @@ struct mirror_set { | |||
160 | 79 | ||
161 | struct work_struct trigger_event; | 80 | struct work_struct trigger_event; |
162 | 81 | ||
163 | unsigned int nr_mirrors; | 82 | unsigned nr_mirrors; |
164 | struct mirror mirror[0]; | 83 | struct mirror mirror[0]; |
165 | }; | 84 | }; |
166 | 85 | ||
167 | /* | 86 | static void wakeup_mirrord(void *context) |
168 | * Conversion fns | ||
169 | */ | ||
170 | static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio) | ||
171 | { | 87 | { |
172 | return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift; | 88 | struct mirror_set *ms = context; |
173 | } | ||
174 | 89 | ||
175 | static inline sector_t region_to_sector(struct region_hash *rh, region_t region) | ||
176 | { | ||
177 | return region << rh->region_shift; | ||
178 | } | ||
179 | |||
180 | static void wake(struct mirror_set *ms) | ||
181 | { | ||
182 | queue_work(ms->kmirrord_wq, &ms->kmirrord_work); | 90 | queue_work(ms->kmirrord_wq, &ms->kmirrord_work); |
183 | } | 91 | } |
184 | 92 | ||
@@ -187,7 +95,7 @@ static void delayed_wake_fn(unsigned long data) | |||
187 | struct mirror_set *ms = (struct mirror_set *) data; | 95 | struct mirror_set *ms = (struct mirror_set *) data; |
188 | 96 | ||
189 | clear_bit(0, &ms->timer_pending); | 97 | clear_bit(0, &ms->timer_pending); |
190 | wake(ms); | 98 | wakeup_mirrord(ms); |
191 | } | 99 | } |
192 | 100 | ||
193 | static void delayed_wake(struct mirror_set *ms) | 101 | static void delayed_wake(struct mirror_set *ms) |
@@ -201,473 +109,34 @@ static void delayed_wake(struct mirror_set *ms) | |||
201 | add_timer(&ms->timer); | 109 | add_timer(&ms->timer); |
202 | } | 110 | } |
203 | 111 | ||
204 | /* FIXME move this */ | 112 | static void wakeup_all_recovery_waiters(void *context) |
205 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); | ||
206 | |||
207 | #define MIN_REGIONS 64 | ||
208 | #define MAX_RECOVERY 1 | ||
209 | static int rh_init(struct region_hash *rh, struct mirror_set *ms, | ||
210 | struct dm_dirty_log *log, uint32_t region_size, | ||
211 | region_t nr_regions) | ||
212 | { | ||
213 | unsigned int nr_buckets, max_buckets; | ||
214 | size_t i; | ||
215 | |||
216 | /* | ||
217 | * Calculate a suitable number of buckets for our hash | ||
218 | * table. | ||
219 | */ | ||
220 | max_buckets = nr_regions >> 6; | ||
221 | for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) | ||
222 | ; | ||
223 | nr_buckets >>= 1; | ||
224 | |||
225 | rh->ms = ms; | ||
226 | rh->log = log; | ||
227 | rh->region_size = region_size; | ||
228 | rh->region_shift = ffs(region_size) - 1; | ||
229 | rwlock_init(&rh->hash_lock); | ||
230 | rh->mask = nr_buckets - 1; | ||
231 | rh->nr_buckets = nr_buckets; | ||
232 | |||
233 | rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); | ||
234 | if (!rh->buckets) { | ||
235 | DMERR("unable to allocate region hash memory"); | ||
236 | return -ENOMEM; | ||
237 | } | ||
238 | |||
239 | for (i = 0; i < nr_buckets; i++) | ||
240 | INIT_LIST_HEAD(rh->buckets + i); | ||
241 | |||
242 | spin_lock_init(&rh->region_lock); | ||
243 | sema_init(&rh->recovery_count, 0); | ||
244 | atomic_set(&rh->recovery_in_flight, 0); | ||
245 | INIT_LIST_HEAD(&rh->clean_regions); | ||
246 | INIT_LIST_HEAD(&rh->quiesced_regions); | ||
247 | INIT_LIST_HEAD(&rh->recovered_regions); | ||
248 | INIT_LIST_HEAD(&rh->failed_recovered_regions); | ||
249 | |||
250 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, | ||
251 | sizeof(struct region)); | ||
252 | if (!rh->region_pool) { | ||
253 | vfree(rh->buckets); | ||
254 | rh->buckets = NULL; | ||
255 | return -ENOMEM; | ||
256 | } | ||
257 | |||
258 | return 0; | ||
259 | } | ||
260 | |||
261 | static void rh_exit(struct region_hash *rh) | ||
262 | { | ||
263 | unsigned int h; | ||
264 | struct region *reg, *nreg; | ||
265 | |||
266 | BUG_ON(!list_empty(&rh->quiesced_regions)); | ||
267 | for (h = 0; h < rh->nr_buckets; h++) { | ||
268 | list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) { | ||
269 | BUG_ON(atomic_read(&reg->pending)); | ||
270 | mempool_free(reg, rh->region_pool); | ||
271 | } | ||
272 | } | ||
273 | |||
274 | if (rh->log) | ||
275 | dm_dirty_log_destroy(rh->log); | ||
276 | if (rh->region_pool) | ||
277 | mempool_destroy(rh->region_pool); | ||
278 | vfree(rh->buckets); | ||
279 | } | ||
280 | |||
281 | #define RH_HASH_MULT 2654435387U | ||
282 | |||
283 | static inline unsigned int rh_hash(struct region_hash *rh, region_t region) | ||
284 | { | ||
285 | return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask; | ||
286 | } | ||
287 | |||
288 | static struct region *__rh_lookup(struct region_hash *rh, region_t region) | ||
289 | { | ||
290 | struct region *reg; | ||
291 | |||
292 | list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list) | ||
293 | if (reg->key == region) | ||
294 | return reg; | ||
295 | |||
296 | return NULL; | ||
297 | } | ||
298 | |||
299 | static void __rh_insert(struct region_hash *rh, struct region *reg) | ||
300 | { | ||
301 | unsigned int h = rh_hash(rh, reg->key); | ||
302 | list_add(&reg->hash_list, rh->buckets + h); | ||
303 | } | ||
304 | |||
305 | static struct region *__rh_alloc(struct region_hash *rh, region_t region) | ||
306 | { | ||
307 | struct region *reg, *nreg; | ||
308 | |||
309 | read_unlock(&rh->hash_lock); | ||
310 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); | ||
311 | if (unlikely(!nreg)) | ||
312 | nreg = kmalloc(sizeof(struct region), GFP_NOIO); | ||
313 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | ||
314 | RH_CLEAN : RH_NOSYNC; | ||
315 | nreg->rh = rh; | ||
316 | nreg->key = region; | ||
317 | |||
318 | INIT_LIST_HEAD(&nreg->list); | ||
319 | |||
320 | atomic_set(&nreg->pending, 0); | ||
321 | bio_list_init(&nreg->delayed_bios); | ||
322 | write_lock_irq(&rh->hash_lock); | ||
323 | |||
324 | reg = __rh_lookup(rh, region); | ||
325 | if (reg) | ||
326 | /* we lost the race */ | ||
327 | mempool_free(nreg, rh->region_pool); | ||
328 | |||
329 | else { | ||
330 | __rh_insert(rh, nreg); | ||
331 | if (nreg->state == RH_CLEAN) { | ||
332 | spin_lock(&rh->region_lock); | ||
333 | list_add(&nreg->list, &rh->clean_regions); | ||
334 | spin_unlock(&rh->region_lock); | ||
335 | } | ||
336 | reg = nreg; | ||
337 | } | ||
338 | write_unlock_irq(&rh->hash_lock); | ||
339 | read_lock(&rh->hash_lock); | ||
340 | |||
341 | return reg; | ||
342 | } | ||
343 | |||
344 | static inline struct region *__rh_find(struct region_hash *rh, region_t region) | ||
345 | { | ||
346 | struct region *reg; | ||
347 | |||
348 | reg = __rh_lookup(rh, region); | ||
349 | if (!reg) | ||
350 | reg = __rh_alloc(rh, region); | ||
351 | |||
352 | return reg; | ||
353 | } | ||
354 | |||
355 | static int rh_state(struct region_hash *rh, region_t region, int may_block) | ||
356 | { | ||
357 | int r; | ||
358 | struct region *reg; | ||
359 | |||
360 | read_lock(&rh->hash_lock); | ||
361 | reg = __rh_lookup(rh, region); | ||
362 | read_unlock(&rh->hash_lock); | ||
363 | |||
364 | if (reg) | ||
365 | return reg->state; | ||
366 | |||
367 | /* | ||
368 | * The region wasn't in the hash, so we fall back to the | ||
369 | * dirty log. | ||
370 | */ | ||
371 | r = rh->log->type->in_sync(rh->log, region, may_block); | ||
372 | |||
373 | /* | ||
374 | * Any error from the dirty log (eg. -EWOULDBLOCK) gets | ||
375 | * taken as a RH_NOSYNC | ||
376 | */ | ||
377 | return r == 1 ? RH_CLEAN : RH_NOSYNC; | ||
378 | } | ||
379 | |||
380 | static inline int rh_in_sync(struct region_hash *rh, | ||
381 | region_t region, int may_block) | ||
382 | { | ||
383 | int state = rh_state(rh, region, may_block); | ||
384 | return state == RH_CLEAN || state == RH_DIRTY; | ||
385 | } | ||
386 | |||
387 | static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list) | ||
388 | { | ||
389 | struct bio *bio; | ||
390 | |||
391 | while ((bio = bio_list_pop(bio_list))) { | ||
392 | queue_bio(ms, bio, WRITE); | ||
393 | } | ||
394 | } | ||
395 | |||
396 | static void complete_resync_work(struct region *reg, int success) | ||
397 | { | ||
398 | struct region_hash *rh = reg->rh; | ||
399 | |||
400 | rh->log->type->set_region_sync(rh->log, reg->key, success); | ||
401 | |||
402 | /* | ||
403 | * Dispatch the bios before we call 'wake_up_all'. | ||
404 | * This is important because if we are suspending, | ||
405 | * we want to know that recovery is complete and | ||
406 | * the work queue is flushed. If we wake_up_all | ||
407 | * before we dispatch_bios (queue bios and call wake()), | ||
408 | * then we risk suspending before the work queue | ||
409 | * has been properly flushed. | ||
410 | */ | ||
411 | dispatch_bios(rh->ms, &reg->delayed_bios); | ||
412 | if (atomic_dec_and_test(&rh->recovery_in_flight)) | ||
413 | wake_up_all(&_kmirrord_recovery_stopped); | ||
414 | up(&rh->recovery_count); | ||
415 | } | ||
416 | |||
417 | static void rh_update_states(struct region_hash *rh) | ||
418 | { | ||
419 | struct region *reg, *next; | ||
420 | |||
421 | LIST_HEAD(clean); | ||
422 | LIST_HEAD(recovered); | ||
423 | LIST_HEAD(failed_recovered); | ||
424 | |||
425 | /* | ||
426 | * Quickly grab the lists. | ||
427 | */ | ||
428 | write_lock_irq(&rh->hash_lock); | ||
429 | spin_lock(&rh->region_lock); | ||
430 | if (!list_empty(&rh->clean_regions)) { | ||
431 | list_splice_init(&rh->clean_regions, &clean); | ||
432 | |||
433 | list_for_each_entry(reg, &clean, list) | ||
434 | list_del(&reg->hash_list); | ||
435 | } | ||
436 | |||
437 | if (!list_empty(&rh->recovered_regions)) { | ||
438 | list_splice_init(&rh->recovered_regions, &recovered); | ||
439 | |||
440 | list_for_each_entry (reg, &recovered, list) | ||
441 | list_del(&reg->hash_list); | ||
442 | } | ||
443 | |||
444 | if (!list_empty(&rh->failed_recovered_regions)) { | ||
445 | list_splice_init(&rh->failed_recovered_regions, | ||
446 | &failed_recovered); | ||
447 | |||
448 | list_for_each_entry(reg, &failed_recovered, list) | ||
449 | list_del(&reg->hash_list); | ||
450 | } | ||
451 | |||
452 | spin_unlock(&rh->region_lock); | ||
453 | write_unlock_irq(&rh->hash_lock); | ||
454 | |||
455 | /* | ||
456 | * All the regions on the recovered and clean lists have | ||
457 | * now been pulled out of the system, so no need to do | ||
458 | * any more locking. | ||
459 | */ | ||
460 | list_for_each_entry_safe (reg, next, &recovered, list) { | ||
461 | rh->log->type->clear_region(rh->log, reg->key); | ||
462 | complete_resync_work(reg, 1); | ||
463 | mempool_free(reg, rh->region_pool); | ||
464 | } | ||
465 | |||
466 | list_for_each_entry_safe(reg, next, &failed_recovered, list) { | ||
467 | complete_resync_work(reg, errors_handled(rh->ms) ? 0 : 1); | ||
468 | mempool_free(reg, rh->region_pool); | ||
469 | } | ||
470 | |||
471 | list_for_each_entry_safe(reg, next, &clean, list) { | ||
472 | rh->log->type->clear_region(rh->log, reg->key); | ||
473 | mempool_free(reg, rh->region_pool); | ||
474 | } | ||
475 | |||
476 | rh->log->type->flush(rh->log); | ||
477 | } | ||
478 | |||
479 | static void rh_inc(struct region_hash *rh, region_t region) | ||
480 | { | ||
481 | struct region *reg; | ||
482 | |||
483 | read_lock(&rh->hash_lock); | ||
484 | reg = __rh_find(rh, region); | ||
485 | |||
486 | spin_lock_irq(&rh->region_lock); | ||
487 | atomic_inc(&reg->pending); | ||
488 | |||
489 | if (reg->state == RH_CLEAN) { | ||
490 | reg->state = RH_DIRTY; | ||
491 | list_del_init(&reg->list); /* take off the clean list */ | ||
492 | spin_unlock_irq(&rh->region_lock); | ||
493 | |||
494 | rh->log->type->mark_region(rh->log, reg->key); | ||
495 | } else | ||
496 | spin_unlock_irq(&rh->region_lock); | ||
497 | |||
498 | |||
499 | read_unlock(&rh->hash_lock); | ||
500 | } | ||
501 | |||
502 | static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios) | ||
503 | { | 113 | { |
504 | struct bio *bio; | 114 | wake_up_all(&_kmirrord_recovery_stopped); |
505 | |||
506 | for (bio = bios->head; bio; bio = bio->bi_next) | ||
507 | rh_inc(rh, bio_to_region(rh, bio)); | ||
508 | } | 115 | } |
509 | 116 | ||
510 | static void rh_dec(struct region_hash *rh, region_t region) | 117 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) |
511 | { | 118 | { |
512 | unsigned long flags; | 119 | unsigned long flags; |
513 | struct region *reg; | ||
514 | int should_wake = 0; | 120 | int should_wake = 0; |
121 | struct bio_list *bl; | ||
515 | 122 | ||
516 | read_lock(&rh->hash_lock); | 123 | bl = (rw == WRITE) ? &ms->writes : &ms->reads; |
517 | reg = __rh_lookup(rh, region); | 124 | spin_lock_irqsave(&ms->lock, flags); |
518 | read_unlock(&rh->hash_lock); | 125 | should_wake = !(bl->head); |
519 | 126 | bio_list_add(bl, bio); | |
520 | spin_lock_irqsave(&rh->region_lock, flags); | 127 | spin_unlock_irqrestore(&ms->lock, flags); |
521 | if (atomic_dec_and_test(&reg->pending)) { | ||
522 | /* | ||
523 | * There is no pending I/O for this region. | ||
524 | * We can move the region to corresponding list for next action. | ||
525 | * At this point, the region is not yet connected to any list. | ||
526 | * | ||
527 | * If the state is RH_NOSYNC, the region should be kept off | ||
528 | * from clean list. | ||
529 | * The hash entry for RH_NOSYNC will remain in memory | ||
530 | * until the region is recovered or the map is reloaded. | ||
531 | */ | ||
532 | |||
533 | /* do nothing for RH_NOSYNC */ | ||
534 | if (reg->state == RH_RECOVERING) { | ||
535 | list_add_tail(&reg->list, &rh->quiesced_regions); | ||
536 | } else if (reg->state == RH_DIRTY) { | ||
537 | reg->state = RH_CLEAN; | ||
538 | list_add(&reg->list, &rh->clean_regions); | ||
539 | } | ||
540 | should_wake = 1; | ||
541 | } | ||
542 | spin_unlock_irqrestore(&rh->region_lock, flags); | ||
543 | 128 | ||
544 | if (should_wake) | 129 | if (should_wake) |
545 | wake(rh->ms); | 130 | wakeup_mirrord(ms); |
546 | } | 131 | } |
547 | 132 | ||
548 | /* | 133 | static void dispatch_bios(void *context, struct bio_list *bio_list) |
549 | * Starts quiescing a region in preparation for recovery. | ||
550 | */ | ||
551 | static int __rh_recovery_prepare(struct region_hash *rh) | ||
552 | { | 134 | { |
553 | int r; | 135 | struct mirror_set *ms = context; |
554 | struct region *reg; | 136 | struct bio *bio; |
555 | region_t region; | ||
556 | |||
557 | /* | ||
558 | * Ask the dirty log what's next. | ||
559 | */ | ||
560 | r = rh->log->type->get_resync_work(rh->log, &region); | ||
561 | if (r <= 0) | ||
562 | return r; | ||
563 | |||
564 | /* | ||
565 | * Get this region, and start it quiescing by setting the | ||
566 | * recovering flag. | ||
567 | */ | ||
568 | read_lock(&rh->hash_lock); | ||
569 | reg = __rh_find(rh, region); | ||
570 | read_unlock(&rh->hash_lock); | ||
571 | |||
572 | spin_lock_irq(&rh->region_lock); | ||
573 | reg->state = RH_RECOVERING; | ||
574 | |||
575 | /* Already quiesced ? */ | ||
576 | if (atomic_read(&reg->pending)) | ||
577 | list_del_init(&reg->list); | ||
578 | else | ||
579 | list_move(&reg->list, &rh->quiesced_regions); | ||
580 | |||
581 | spin_unlock_irq(&rh->region_lock); | ||
582 | |||
583 | return 1; | ||
584 | } | ||
585 | |||
586 | static void rh_recovery_prepare(struct region_hash *rh) | ||
587 | { | ||
588 | /* Extra reference to avoid race with rh_stop_recovery */ | ||
589 | atomic_inc(&rh->recovery_in_flight); | ||
590 | |||
591 | while (!down_trylock(&rh->recovery_count)) { | ||
592 | atomic_inc(&rh->recovery_in_flight); | ||
593 | if (__rh_recovery_prepare(rh) <= 0) { | ||
594 | atomic_dec(&rh->recovery_in_flight); | ||
595 | up(&rh->recovery_count); | ||
596 | break; | ||
597 | } | ||
598 | } | ||
599 | |||
600 | /* Drop the extra reference */ | ||
601 | if (atomic_dec_and_test(&rh->recovery_in_flight)) | ||
602 | wake_up_all(&_kmirrord_recovery_stopped); | ||
603 | } | ||
604 | |||
605 | /* | ||
606 | * Returns any quiesced regions. | ||
607 | */ | ||
608 | static struct region *rh_recovery_start(struct region_hash *rh) | ||
609 | { | ||
610 | struct region *reg = NULL; | ||
611 | |||
612 | spin_lock_irq(&rh->region_lock); | ||
613 | if (!list_empty(&rh->quiesced_regions)) { | ||
614 | reg = list_entry(rh->quiesced_regions.next, | ||
615 | struct region, list); | ||
616 | list_del_init(&reg->list); /* remove from the quiesced list */ | ||
617 | } | ||
618 | spin_unlock_irq(&rh->region_lock); | ||
619 | |||
620 | return reg; | ||
621 | } | ||
622 | |||
623 | static void rh_recovery_end(struct region *reg, int success) | ||
624 | { | ||
625 | struct region_hash *rh = reg->rh; | ||
626 | |||
627 | spin_lock_irq(&rh->region_lock); | ||
628 | if (success) | ||
629 | list_add(&reg->list, &reg->rh->recovered_regions); | ||
630 | else { | ||
631 | reg->state = RH_NOSYNC; | ||
632 | list_add(&reg->list, &reg->rh->failed_recovered_regions); | ||
633 | } | ||
634 | spin_unlock_irq(&rh->region_lock); | ||
635 | |||
636 | wake(rh->ms); | ||
637 | } | ||
638 | |||
639 | static int rh_flush(struct region_hash *rh) | ||
640 | { | ||
641 | return rh->log->type->flush(rh->log); | ||
642 | } | ||
643 | |||
644 | static void rh_delay(struct region_hash *rh, struct bio *bio) | ||
645 | { | ||
646 | struct region *reg; | ||
647 | |||
648 | read_lock(&rh->hash_lock); | ||
649 | reg = __rh_find(rh, bio_to_region(rh, bio)); | ||
650 | bio_list_add(&reg->delayed_bios, bio); | ||
651 | read_unlock(&rh->hash_lock); | ||
652 | } | ||
653 | |||
654 | static void rh_stop_recovery(struct region_hash *rh) | ||
655 | { | ||
656 | int i; | ||
657 | |||
658 | /* wait for any recovering regions */ | ||
659 | for (i = 0; i < MAX_RECOVERY; i++) | ||
660 | down(&rh->recovery_count); | ||
661 | } | ||
662 | |||
663 | static void rh_start_recovery(struct region_hash *rh) | ||
664 | { | ||
665 | int i; | ||
666 | |||
667 | for (i = 0; i < MAX_RECOVERY; i++) | ||
668 | up(&rh->recovery_count); | ||
669 | 137 | ||
670 | wake(rh->ms); | 138 | while ((bio = bio_list_pop(bio_list))) |
139 | queue_bio(ms, bio, WRITE); | ||
671 | } | 140 | } |
672 | 141 | ||
673 | #define MIN_READ_RECORDS 20 | 142 | #define MIN_READ_RECORDS 20 |
@@ -777,8 +246,8 @@ out: | |||
777 | static void recovery_complete(int read_err, unsigned long write_err, | 246 | static void recovery_complete(int read_err, unsigned long write_err, |
778 | void *context) | 247 | void *context) |
779 | { | 248 | { |
780 | struct region *reg = (struct region *)context; | 249 | struct dm_region *reg = context; |
781 | struct mirror_set *ms = reg->rh->ms; | 250 | struct mirror_set *ms = dm_rh_region_context(reg); |
782 | int m, bit = 0; | 251 | int m, bit = 0; |
783 | 252 | ||
784 | if (read_err) { | 253 | if (read_err) { |
@@ -804,31 +273,33 @@ static void recovery_complete(int read_err, unsigned long write_err, | |||
804 | } | 273 | } |
805 | } | 274 | } |
806 | 275 | ||
807 | rh_recovery_end(reg, !(read_err || write_err)); | 276 | dm_rh_recovery_end(reg, !(read_err || write_err)); |
808 | } | 277 | } |
809 | 278 | ||
810 | static int recover(struct mirror_set *ms, struct region *reg) | 279 | static int recover(struct mirror_set *ms, struct dm_region *reg) |
811 | { | 280 | { |
812 | int r; | 281 | int r; |
813 | unsigned int i; | 282 | unsigned i; |
814 | struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; | 283 | struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; |
815 | struct mirror *m; | 284 | struct mirror *m; |
816 | unsigned long flags = 0; | 285 | unsigned long flags = 0; |
286 | region_t key = dm_rh_get_region_key(reg); | ||
287 | sector_t region_size = dm_rh_get_region_size(ms->rh); | ||
817 | 288 | ||
818 | /* fill in the source */ | 289 | /* fill in the source */ |
819 | m = get_default_mirror(ms); | 290 | m = get_default_mirror(ms); |
820 | from.bdev = m->dev->bdev; | 291 | from.bdev = m->dev->bdev; |
821 | from.sector = m->offset + region_to_sector(reg->rh, reg->key); | 292 | from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); |
822 | if (reg->key == (ms->nr_regions - 1)) { | 293 | if (key == (ms->nr_regions - 1)) { |
823 | /* | 294 | /* |
824 | * The final region may be smaller than | 295 | * The final region may be smaller than |
825 | * region_size. | 296 | * region_size. |
826 | */ | 297 | */ |
827 | from.count = ms->ti->len & (reg->rh->region_size - 1); | 298 | from.count = ms->ti->len & (region_size - 1); |
828 | if (!from.count) | 299 | if (!from.count) |
829 | from.count = reg->rh->region_size; | 300 | from.count = region_size; |
830 | } else | 301 | } else |
831 | from.count = reg->rh->region_size; | 302 | from.count = region_size; |
832 | 303 | ||
833 | /* fill in the destinations */ | 304 | /* fill in the destinations */ |
834 | for (i = 0, dest = to; i < ms->nr_mirrors; i++) { | 305 | for (i = 0, dest = to; i < ms->nr_mirrors; i++) { |
@@ -837,7 +308,7 @@ static int recover(struct mirror_set *ms, struct region *reg) | |||
837 | 308 | ||
838 | m = ms->mirror + i; | 309 | m = ms->mirror + i; |
839 | dest->bdev = m->dev->bdev; | 310 | dest->bdev = m->dev->bdev; |
840 | dest->sector = m->offset + region_to_sector(reg->rh, reg->key); | 311 | dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key); |
841 | dest->count = from.count; | 312 | dest->count = from.count; |
842 | dest++; | 313 | dest++; |
843 | } | 314 | } |
@@ -854,22 +325,22 @@ static int recover(struct mirror_set *ms, struct region *reg) | |||
854 | 325 | ||
855 | static void do_recovery(struct mirror_set *ms) | 326 | static void do_recovery(struct mirror_set *ms) |
856 | { | 327 | { |
328 | struct dm_region *reg; | ||
329 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | ||
857 | int r; | 330 | int r; |
858 | struct region *reg; | ||
859 | struct dm_dirty_log *log = ms->rh.log; | ||
860 | 331 | ||
861 | /* | 332 | /* |
862 | * Start quiescing some regions. | 333 | * Start quiescing some regions. |
863 | */ | 334 | */ |
864 | rh_recovery_prepare(&ms->rh); | 335 | dm_rh_recovery_prepare(ms->rh); |
865 | 336 | ||
866 | /* | 337 | /* |
867 | * Copy any already quiesced regions. | 338 | * Copy any already quiesced regions. |
868 | */ | 339 | */ |
869 | while ((reg = rh_recovery_start(&ms->rh))) { | 340 | while ((reg = dm_rh_recovery_start(ms->rh))) { |
870 | r = recover(ms, reg); | 341 | r = recover(ms, reg); |
871 | if (r) | 342 | if (r) |
872 | rh_recovery_end(reg, 0); | 343 | dm_rh_recovery_end(reg, 0); |
873 | } | 344 | } |
874 | 345 | ||
875 | /* | 346 | /* |
@@ -910,9 +381,10 @@ static int default_ok(struct mirror *m) | |||
910 | 381 | ||
911 | static int mirror_available(struct mirror_set *ms, struct bio *bio) | 382 | static int mirror_available(struct mirror_set *ms, struct bio *bio) |
912 | { | 383 | { |
913 | region_t region = bio_to_region(&ms->rh, bio); | 384 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
385 | region_t region = dm_rh_bio_to_region(ms->rh, bio); | ||
914 | 386 | ||
915 | if (ms->rh.log->type->in_sync(ms->rh.log, region, 0)) | 387 | if (log->type->in_sync(log, region, 0)) |
916 | return choose_mirror(ms, bio->bi_sector) ? 1 : 0; | 388 | return choose_mirror(ms, bio->bi_sector) ? 1 : 0; |
917 | 389 | ||
918 | return 0; | 390 | return 0; |
@@ -986,7 +458,14 @@ static void read_async_bio(struct mirror *m, struct bio *bio) | |||
986 | 458 | ||
987 | map_region(&io, m, bio); | 459 | map_region(&io, m, bio); |
988 | bio_set_m(bio, m); | 460 | bio_set_m(bio, m); |
989 | (void) dm_io(&io_req, 1, &io, NULL); | 461 | BUG_ON(dm_io(&io_req, 1, &io, NULL)); |
462 | } | ||
463 | |||
464 | static inline int region_in_sync(struct mirror_set *ms, region_t region, | ||
465 | int may_block) | ||
466 | { | ||
467 | int state = dm_rh_get_state(ms->rh, region, may_block); | ||
468 | return state == DM_RH_CLEAN || state == DM_RH_DIRTY; | ||
990 | } | 469 | } |
991 | 470 | ||
992 | static void do_reads(struct mirror_set *ms, struct bio_list *reads) | 471 | static void do_reads(struct mirror_set *ms, struct bio_list *reads) |
@@ -996,13 +475,13 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) | |||
996 | struct mirror *m; | 475 | struct mirror *m; |
997 | 476 | ||
998 | while ((bio = bio_list_pop(reads))) { | 477 | while ((bio = bio_list_pop(reads))) { |
999 | region = bio_to_region(&ms->rh, bio); | 478 | region = dm_rh_bio_to_region(ms->rh, bio); |
1000 | m = get_default_mirror(ms); | 479 | m = get_default_mirror(ms); |
1001 | 480 | ||
1002 | /* | 481 | /* |
1003 | * We can only read balance if the region is in sync. | 482 | * We can only read balance if the region is in sync. |
1004 | */ | 483 | */ |
1005 | if (likely(rh_in_sync(&ms->rh, region, 1))) | 484 | if (likely(region_in_sync(ms, region, 1))) |
1006 | m = choose_mirror(ms, bio->bi_sector); | 485 | m = choose_mirror(ms, bio->bi_sector); |
1007 | else if (m && atomic_read(&m->error_count)) | 486 | else if (m && atomic_read(&m->error_count)) |
1008 | m = NULL; | 487 | m = NULL; |
@@ -1025,57 +504,6 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) | |||
1025 | * NOSYNC: increment pending, just write to the default mirror | 504 | * NOSYNC: increment pending, just write to the default mirror |
1026 | *---------------------------------------------------------------*/ | 505 | *---------------------------------------------------------------*/ |
1027 | 506 | ||
1028 | /* __bio_mark_nosync | ||
1029 | * @ms | ||
1030 | * @bio | ||
1031 | * @done | ||
1032 | * @error | ||
1033 | * | ||
1034 | * The bio was written on some mirror(s) but failed on other mirror(s). | ||
1035 | * We can successfully endio the bio but should avoid the region being | ||
1036 | * marked clean by setting the state RH_NOSYNC. | ||
1037 | * | ||
1038 | * This function is _not_ safe in interrupt context! | ||
1039 | */ | ||
1040 | static void __bio_mark_nosync(struct mirror_set *ms, | ||
1041 | struct bio *bio, unsigned done, int error) | ||
1042 | { | ||
1043 | unsigned long flags; | ||
1044 | struct region_hash *rh = &ms->rh; | ||
1045 | struct dm_dirty_log *log = ms->rh.log; | ||
1046 | struct region *reg; | ||
1047 | region_t region = bio_to_region(rh, bio); | ||
1048 | int recovering = 0; | ||
1049 | |||
1050 | /* We must inform the log that the sync count has changed. */ | ||
1051 | log->type->set_region_sync(log, region, 0); | ||
1052 | ms->in_sync = 0; | ||
1053 | |||
1054 | read_lock(&rh->hash_lock); | ||
1055 | reg = __rh_find(rh, region); | ||
1056 | read_unlock(&rh->hash_lock); | ||
1057 | |||
1058 | /* region hash entry should exist because write was in-flight */ | ||
1059 | BUG_ON(!reg); | ||
1060 | BUG_ON(!list_empty(&reg->list)); | |||
1061 | |||
1062 | spin_lock_irqsave(&rh->region_lock, flags); | ||
1063 | /* | ||
1064 | * Possible cases: | ||
1065 | * 1) RH_DIRTY | ||
1066 | * 2) RH_NOSYNC: was dirty, other preceding writes failed | |||
1067 | * 3) RH_RECOVERING: flushing pending writes | ||
1068 | * Either case, the region should have not been connected to list. | ||
1069 | */ | ||
1070 | recovering = (reg->state == RH_RECOVERING); | ||
1071 | reg->state = RH_NOSYNC; | ||
1072 | BUG_ON(!list_empty(&reg->list)); | |||
1073 | spin_unlock_irqrestore(&rh->region_lock, flags); | ||
1074 | |||
1075 | bio_endio(bio, error); | ||
1076 | if (recovering) | ||
1077 | complete_resync_work(reg, 0); | ||
1078 | } | ||
1079 | 507 | ||
1080 | static void write_callback(unsigned long error, void *context) | 508 | static void write_callback(unsigned long error, void *context) |
1081 | { | 509 | { |
@@ -1120,7 +548,7 @@ static void write_callback(unsigned long error, void *context) | |||
1120 | bio_list_add(&ms->failures, bio); | 548 | bio_list_add(&ms->failures, bio); |
1121 | spin_unlock_irqrestore(&ms->lock, flags); | 549 | spin_unlock_irqrestore(&ms->lock, flags); |
1122 | if (should_wake) | 550 | if (should_wake) |
1123 | wake(ms); | 551 | wakeup_mirrord(ms); |
1124 | return; | 552 | return; |
1125 | } | 553 | } |
1126 | out: | 554 | out: |
@@ -1150,7 +578,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
1150 | */ | 578 | */ |
1151 | bio_set_m(bio, get_default_mirror(ms)); | 579 | bio_set_m(bio, get_default_mirror(ms)); |
1152 | 580 | ||
1153 | (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); | 581 | BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); |
1154 | } | 582 | } |
1155 | 583 | ||
1156 | static void do_writes(struct mirror_set *ms, struct bio_list *writes) | 584 | static void do_writes(struct mirror_set *ms, struct bio_list *writes) |
@@ -1170,18 +598,19 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
1170 | bio_list_init(&recover); | 598 | bio_list_init(&recover); |
1171 | 599 | ||
1172 | while ((bio = bio_list_pop(writes))) { | 600 | while ((bio = bio_list_pop(writes))) { |
1173 | state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1); | 601 | state = dm_rh_get_state(ms->rh, |
602 | dm_rh_bio_to_region(ms->rh, bio), 1); | ||
1174 | switch (state) { | 603 | switch (state) { |
1175 | case RH_CLEAN: | 604 | case DM_RH_CLEAN: |
1176 | case RH_DIRTY: | 605 | case DM_RH_DIRTY: |
1177 | this_list = &sync; | 606 | this_list = &sync; |
1178 | break; | 607 | break; |
1179 | 608 | ||
1180 | case RH_NOSYNC: | 609 | case DM_RH_NOSYNC: |
1181 | this_list = &nosync; | 610 | this_list = &nosync; |
1182 | break; | 611 | break; |
1183 | 612 | ||
1184 | case RH_RECOVERING: | 613 | case DM_RH_RECOVERING: |
1185 | this_list = &recover; | 614 | this_list = &recover; |
1186 | break; | 615 | break; |
1187 | } | 616 | } |
@@ -1194,9 +623,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
1194 | * be written to (writes to recover regions are going to | 623 | * be written to (writes to recover regions are going to |
1195 | * be delayed). | 624 | * be delayed). |
1196 | */ | 625 | */ |
1197 | rh_inc_pending(&ms->rh, &sync); | 626 | dm_rh_inc_pending(ms->rh, &sync); |
1198 | rh_inc_pending(&ms->rh, &nosync); | 627 | dm_rh_inc_pending(ms->rh, &nosync); |
1199 | ms->log_failure = rh_flush(&ms->rh) ? 1 : 0; | 628 | ms->log_failure = dm_rh_flush(ms->rh) ? 1 : 0; |
1200 | 629 | ||
1201 | /* | 630 | /* |
1202 | * Dispatch io. | 631 | * Dispatch io. |
@@ -1205,13 +634,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
1205 | spin_lock_irq(&ms->lock); | 634 | spin_lock_irq(&ms->lock); |
1206 | bio_list_merge(&ms->failures, &sync); | 635 | bio_list_merge(&ms->failures, &sync); |
1207 | spin_unlock_irq(&ms->lock); | 636 | spin_unlock_irq(&ms->lock); |
1208 | wake(ms); | 637 | wakeup_mirrord(ms); |
1209 | } else | 638 | } else |
1210 | while ((bio = bio_list_pop(&sync))) | 639 | while ((bio = bio_list_pop(&sync))) |
1211 | do_write(ms, bio); | 640 | do_write(ms, bio); |
1212 | 641 | ||
1213 | while ((bio = bio_list_pop(&recover))) | 642 | while ((bio = bio_list_pop(&recover))) |
1214 | rh_delay(&ms->rh, bio); | 643 | dm_rh_delay(ms->rh, bio); |
1215 | 644 | ||
1216 | while ((bio = bio_list_pop(&nosync))) { | 645 | while ((bio = bio_list_pop(&nosync))) { |
1217 | map_bio(get_default_mirror(ms), bio); | 646 | map_bio(get_default_mirror(ms), bio); |
@@ -1228,7 +657,8 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) | |||
1228 | 657 | ||
1229 | if (!ms->log_failure) { | 658 | if (!ms->log_failure) { |
1230 | while ((bio = bio_list_pop(failures))) | 659 | while ((bio = bio_list_pop(failures))) |
1231 | __bio_mark_nosync(ms, bio, bio->bi_size, 0); | 660 | ms->in_sync = 0; |
661 | dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); | ||
1232 | return; | 662 | return; |
1233 | } | 663 | } |
1234 | 664 | ||
@@ -1281,8 +711,8 @@ static void trigger_event(struct work_struct *work) | |||
1281 | *---------------------------------------------------------------*/ | 711 | *---------------------------------------------------------------*/ |
1282 | static void do_mirror(struct work_struct *work) | 712 | static void do_mirror(struct work_struct *work) |
1283 | { | 713 | { |
1284 | struct mirror_set *ms =container_of(work, struct mirror_set, | 714 | struct mirror_set *ms = container_of(work, struct mirror_set, |
1285 | kmirrord_work); | 715 | kmirrord_work); |
1286 | struct bio_list reads, writes, failures; | 716 | struct bio_list reads, writes, failures; |
1287 | unsigned long flags; | 717 | unsigned long flags; |
1288 | 718 | ||
@@ -1295,7 +725,7 @@ static void do_mirror(struct work_struct *work) | |||
1295 | bio_list_init(&ms->failures); | 725 | bio_list_init(&ms->failures); |
1296 | spin_unlock_irqrestore(&ms->lock, flags); | 726 | spin_unlock_irqrestore(&ms->lock, flags); |
1297 | 727 | ||
1298 | rh_update_states(&ms->rh); | 728 | dm_rh_update_states(ms->rh, errors_handled(ms)); |
1299 | do_recovery(ms); | 729 | do_recovery(ms); |
1300 | do_reads(ms, &reads); | 730 | do_reads(ms, &reads); |
1301 | do_writes(ms, &writes); | 731 | do_writes(ms, &writes); |
@@ -1304,7 +734,6 @@ static void do_mirror(struct work_struct *work) | |||
1304 | dm_table_unplug_all(ms->ti->table); | 734 | dm_table_unplug_all(ms->ti->table); |
1305 | } | 735 | } |
1306 | 736 | ||
1307 | |||
1308 | /*----------------------------------------------------------------- | 737 | /*----------------------------------------------------------------- |
1309 | * Target functions | 738 | * Target functions |
1310 | *---------------------------------------------------------------*/ | 739 | *---------------------------------------------------------------*/ |
@@ -1351,7 +780,11 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, | |||
1351 | return NULL; | 780 | return NULL; |
1352 | } | 781 | } |
1353 | 782 | ||
1354 | if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { | 783 | ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord, |
784 | wakeup_all_recovery_waiters, | ||
785 | ms->ti->begin, MAX_RECOVERY, | ||
786 | dl, region_size, ms->nr_regions); | ||
787 | if (IS_ERR(ms->rh)) { | ||
1355 | ti->error = "Error creating dirty region hash"; | 788 | ti->error = "Error creating dirty region hash"; |
1356 | dm_io_client_destroy(ms->io_client); | 789 | dm_io_client_destroy(ms->io_client); |
1357 | mempool_destroy(ms->read_record_pool); | 790 | mempool_destroy(ms->read_record_pool); |
@@ -1369,7 +802,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti, | |||
1369 | dm_put_device(ti, ms->mirror[m].dev); | 802 | dm_put_device(ti, ms->mirror[m].dev); |
1370 | 803 | ||
1371 | dm_io_client_destroy(ms->io_client); | 804 | dm_io_client_destroy(ms->io_client); |
1372 | rh_exit(&ms->rh); | 805 | dm_region_hash_destroy(ms->rh); |
1373 | mempool_destroy(ms->read_record_pool); | 806 | mempool_destroy(ms->read_record_pool); |
1374 | kfree(ms); | 807 | kfree(ms); |
1375 | } | 808 | } |
@@ -1409,10 +842,10 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, | |||
1409 | * Create dirty log: log_type #log_params <log_params> | 842 | * Create dirty log: log_type #log_params <log_params> |
1410 | */ | 843 | */ |
1411 | static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, | 844 | static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, |
1412 | unsigned int argc, char **argv, | 845 | unsigned argc, char **argv, |
1413 | unsigned int *args_used) | 846 | unsigned *args_used) |
1414 | { | 847 | { |
1415 | unsigned int param_count; | 848 | unsigned param_count; |
1416 | struct dm_dirty_log *dl; | 849 | struct dm_dirty_log *dl; |
1417 | 850 | ||
1418 | if (argc < 2) { | 851 | if (argc < 2) { |
@@ -1543,7 +976,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1543 | } | 976 | } |
1544 | 977 | ||
1545 | ti->private = ms; | 978 | ti->private = ms; |
1546 | ti->split_io = ms->rh.region_size; | 979 | ti->split_io = dm_rh_get_region_size(ms->rh); |
1547 | 980 | ||
1548 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); | 981 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); |
1549 | if (!ms->kmirrord_wq) { | 982 | if (!ms->kmirrord_wq) { |
@@ -1578,11 +1011,11 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1578 | goto err_destroy_wq; | 1011 | goto err_destroy_wq; |
1579 | } | 1012 | } |
1580 | 1013 | ||
1581 | r = dm_kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); | 1014 | r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client); |
1582 | if (r) | 1015 | if (r) |
1583 | goto err_destroy_wq; | 1016 | goto err_destroy_wq; |
1584 | 1017 | ||
1585 | wake(ms); | 1018 | wakeup_mirrord(ms); |
1586 | return 0; | 1019 | return 0; |
1587 | 1020 | ||
1588 | err_destroy_wq: | 1021 | err_destroy_wq: |
@@ -1603,22 +1036,6 @@ static void mirror_dtr(struct dm_target *ti) | |||
1603 | free_context(ms, ti, ms->nr_mirrors); | 1036 | free_context(ms, ti, ms->nr_mirrors); |
1604 | } | 1037 | } |
1605 | 1038 | ||
1606 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) | ||
1607 | { | ||
1608 | unsigned long flags; | ||
1609 | int should_wake = 0; | ||
1610 | struct bio_list *bl; | ||
1611 | |||
1612 | bl = (rw == WRITE) ? &ms->writes : &ms->reads; | ||
1613 | spin_lock_irqsave(&ms->lock, flags); | ||
1614 | should_wake = !(bl->head); | ||
1615 | bio_list_add(bl, bio); | ||
1616 | spin_unlock_irqrestore(&ms->lock, flags); | ||
1617 | |||
1618 | if (should_wake) | ||
1619 | wake(ms); | ||
1620 | } | ||
1621 | |||
1622 | /* | 1039 | /* |
1623 | * Mirror mapping function | 1040 | * Mirror mapping function |
1624 | */ | 1041 | */ |
@@ -1629,16 +1046,16 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, | |||
1629 | struct mirror *m; | 1046 | struct mirror *m; |
1630 | struct mirror_set *ms = ti->private; | 1047 | struct mirror_set *ms = ti->private; |
1631 | struct dm_raid1_read_record *read_record = NULL; | 1048 | struct dm_raid1_read_record *read_record = NULL; |
1049 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | ||
1632 | 1050 | ||
1633 | if (rw == WRITE) { | 1051 | if (rw == WRITE) { |
1634 | /* Save region for mirror_end_io() handler */ | 1052 | /* Save region for mirror_end_io() handler */ |
1635 | map_context->ll = bio_to_region(&ms->rh, bio); | 1053 | map_context->ll = dm_rh_bio_to_region(ms->rh, bio); |
1636 | queue_bio(ms, bio, rw); | 1054 | queue_bio(ms, bio, rw); |
1637 | return DM_MAPIO_SUBMITTED; | 1055 | return DM_MAPIO_SUBMITTED; |
1638 | } | 1056 | } |
1639 | 1057 | ||
1640 | r = ms->rh.log->type->in_sync(ms->rh.log, | 1058 | r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); |
1641 | bio_to_region(&ms->rh, bio), 0); | ||
1642 | if (r < 0 && r != -EWOULDBLOCK) | 1059 | if (r < 0 && r != -EWOULDBLOCK) |
1643 | return r; | 1060 | return r; |
1644 | 1061 | ||
@@ -1686,7 +1103,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, | |||
1686 | * We need to dec pending if this was a write. | 1103 | * We need to dec pending if this was a write. |
1687 | */ | 1104 | */ |
1688 | if (rw == WRITE) { | 1105 | if (rw == WRITE) { |
1689 | rh_dec(&ms->rh, map_context->ll); | 1106 | dm_rh_dec(ms->rh, map_context->ll); |
1690 | return error; | 1107 | return error; |
1691 | } | 1108 | } |
1692 | 1109 | ||
@@ -1742,7 +1159,7 @@ out: | |||
1742 | static void mirror_presuspend(struct dm_target *ti) | 1159 | static void mirror_presuspend(struct dm_target *ti) |
1743 | { | 1160 | { |
1744 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1161 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1745 | struct dm_dirty_log *log = ms->rh.log; | 1162 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1746 | 1163 | ||
1747 | atomic_set(&ms->suspend, 1); | 1164 | atomic_set(&ms->suspend, 1); |
1748 | 1165 | ||
@@ -1750,10 +1167,10 @@ static void mirror_presuspend(struct dm_target *ti) | |||
1750 | * We must finish up all the work that we've | 1167 | * We must finish up all the work that we've |
1751 | * generated (i.e. recovery work). | 1168 | * generated (i.e. recovery work). |
1752 | */ | 1169 | */ |
1753 | rh_stop_recovery(&ms->rh); | 1170 | dm_rh_stop_recovery(ms->rh); |
1754 | 1171 | ||
1755 | wait_event(_kmirrord_recovery_stopped, | 1172 | wait_event(_kmirrord_recovery_stopped, |
1756 | !atomic_read(&ms->rh.recovery_in_flight)); | 1173 | !dm_rh_recovery_in_flight(ms->rh)); |
1757 | 1174 | ||
1758 | if (log->type->presuspend && log->type->presuspend(log)) | 1175 | if (log->type->presuspend && log->type->presuspend(log)) |
1759 | /* FIXME: need better error handling */ | 1176 | /* FIXME: need better error handling */ |
@@ -1771,7 +1188,7 @@ static void mirror_presuspend(struct dm_target *ti) | |||
1771 | static void mirror_postsuspend(struct dm_target *ti) | 1188 | static void mirror_postsuspend(struct dm_target *ti) |
1772 | { | 1189 | { |
1773 | struct mirror_set *ms = ti->private; | 1190 | struct mirror_set *ms = ti->private; |
1774 | struct dm_dirty_log *log = ms->rh.log; | 1191 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1775 | 1192 | ||
1776 | if (log->type->postsuspend && log->type->postsuspend(log)) | 1193 | if (log->type->postsuspend && log->type->postsuspend(log)) |
1777 | /* FIXME: need better error handling */ | 1194 | /* FIXME: need better error handling */ |
@@ -1781,13 +1198,13 @@ static void mirror_postsuspend(struct dm_target *ti) | |||
1781 | static void mirror_resume(struct dm_target *ti) | 1198 | static void mirror_resume(struct dm_target *ti) |
1782 | { | 1199 | { |
1783 | struct mirror_set *ms = ti->private; | 1200 | struct mirror_set *ms = ti->private; |
1784 | struct dm_dirty_log *log = ms->rh.log; | 1201 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1785 | 1202 | ||
1786 | atomic_set(&ms->suspend, 0); | 1203 | atomic_set(&ms->suspend, 0); |
1787 | if (log->type->resume && log->type->resume(log)) | 1204 | if (log->type->resume && log->type->resume(log)) |
1788 | /* FIXME: need better error handling */ | 1205 | /* FIXME: need better error handling */ |
1789 | DMWARN("log resume failed"); | 1206 | DMWARN("log resume failed"); |
1790 | rh_start_recovery(&ms->rh); | 1207 | dm_rh_start_recovery(ms->rh); |
1791 | } | 1208 | } |
1792 | 1209 | ||
1793 | /* | 1210 | /* |
@@ -1819,7 +1236,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type, | |||
1819 | { | 1236 | { |
1820 | unsigned int m, sz = 0; | 1237 | unsigned int m, sz = 0; |
1821 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1238 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1822 | struct dm_dirty_log *log = ms->rh.log; | 1239 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1823 | char buffer[ms->nr_mirrors + 1]; | 1240 | char buffer[ms->nr_mirrors + 1]; |
1824 | 1241 | ||
1825 | switch (type) { | 1242 | switch (type) { |
@@ -1832,15 +1249,15 @@ static int mirror_status(struct dm_target *ti, status_type_t type, | |||
1832 | buffer[m] = '\0'; | 1249 | buffer[m] = '\0'; |
1833 | 1250 | ||
1834 | DMEMIT("%llu/%llu 1 %s ", | 1251 | DMEMIT("%llu/%llu 1 %s ", |
1835 | (unsigned long long)log->type->get_sync_count(ms->rh.log), | 1252 | (unsigned long long)log->type->get_sync_count(log), |
1836 | (unsigned long long)ms->nr_regions, buffer); | 1253 | (unsigned long long)ms->nr_regions, buffer); |
1837 | 1254 | ||
1838 | sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz); | 1255 | sz += log->type->status(log, type, result+sz, maxlen-sz); |
1839 | 1256 | ||
1840 | break; | 1257 | break; |
1841 | 1258 | ||
1842 | case STATUSTYPE_TABLE: | 1259 | case STATUSTYPE_TABLE: |
1843 | sz = log->type->status(ms->rh.log, type, result, maxlen); | 1260 | sz = log->type->status(log, type, result, maxlen); |
1844 | 1261 | ||
1845 | DMEMIT("%d", ms->nr_mirrors); | 1262 | DMEMIT("%d", ms->nr_mirrors); |
1846 | for (m = 0; m < ms->nr_mirrors; m++) | 1263 | for (m = 0; m < ms->nr_mirrors; m++) |
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c new file mode 100644 index 000000000000..59f8d9df9e1a --- /dev/null +++ b/drivers/md/dm-region-hash.c | |||
@@ -0,0 +1,704 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software Limited. | ||
3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include <linux/dm-dirty-log.h> | ||
9 | #include <linux/dm-region-hash.h> | ||
10 | |||
11 | #include <linux/ctype.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/vmalloc.h> | ||
15 | |||
16 | #include "dm.h" | ||
17 | #include "dm-bio-list.h" | ||
18 | |||
19 | #define DM_MSG_PREFIX "region hash" | ||
20 | |||
21 | /*----------------------------------------------------------------- | ||
22 | * Region hash | ||
23 | * | ||
24 | * The mirror splits itself up into discrete regions. Each | ||
25 | * region can be in one of three states: clean, dirty, | ||
26 | * nosync. There is no need to put clean regions in the hash. | ||
27 | * | ||
28 | * In addition to being present in the hash table a region _may_ | ||
29 | * be present on one of three lists. | ||
30 | * | ||
31 | * clean_regions: Regions on this list have no io pending to | ||
32 | * them, they are in sync, we are no longer interested in them, | ||
33 | * they are dull. dm_rh_update_states() will remove them from the | ||
34 | * hash table. | ||
35 | * | ||
36 | * quiesced_regions: These regions have been spun down, ready | ||
37 | * for recovery. dm_rh_recovery_start() will remove regions from | ||
38 | * this list and hand them to kmirrord, which will schedule the | ||
39 | * recovery io with kcopyd. | ||
40 | * | ||
41 | * recovered_regions: Regions that kcopyd has successfully | ||
42 | * recovered. dm_rh_update_states() will now schedule any delayed | ||
43 | * io, up the recovery_count, and remove the region from the | ||
44 | * hash. | ||
45 | * | ||
46 | * There are 2 locks: | ||
47 | * A rw spin lock 'hash_lock' protects just the hash table, | ||
48 | * this is never held in write mode from interrupt context, | ||
49 | * which I believe means that we only have to disable irqs when | ||
50 | * doing a write lock. | ||
51 | * | ||
52 | * An ordinary spin lock 'region_lock' that protects the three | ||
53 | * lists in the region_hash, with the 'state', 'list' and | ||
54 | * 'delayed_bios' fields of the regions. This is used from irq | ||
55 | * context, so all other uses will have to suspend local irqs. | ||
56 | *---------------------------------------------------------------*/ | ||
57 | struct dm_region_hash { | ||
58 | uint32_t region_size; | ||
59 | unsigned region_shift; | ||
60 | |||
61 | /* holds persistent region state */ | ||
62 | struct dm_dirty_log *log; | ||
63 | |||
64 | /* hash table */ | ||
65 | rwlock_t hash_lock; | ||
66 | mempool_t *region_pool; | ||
67 | unsigned mask; | ||
68 | unsigned nr_buckets; | ||
69 | unsigned prime; | ||
70 | unsigned shift; | ||
71 | struct list_head *buckets; | ||
72 | |||
73 | unsigned max_recovery; /* Max # of regions to recover in parallel */ | ||
74 | |||
75 | spinlock_t region_lock; | ||
76 | atomic_t recovery_in_flight; | ||
77 | struct semaphore recovery_count; | ||
78 | struct list_head clean_regions; | ||
79 | struct list_head quiesced_regions; | ||
80 | struct list_head recovered_regions; | ||
81 | struct list_head failed_recovered_regions; | ||
82 | |||
83 | void *context; | ||
84 | sector_t target_begin; | ||
85 | |||
86 | /* Callback function to schedule bios writes */ | ||
87 | void (*dispatch_bios)(void *context, struct bio_list *bios); | ||
88 | |||
89 | /* Callback function to wakeup callers worker thread. */ | ||
90 | void (*wakeup_workers)(void *context); | ||
91 | |||
92 | /* Callback function to wakeup callers recovery waiters. */ | ||
93 | void (*wakeup_all_recovery_waiters)(void *context); | ||
94 | }; | ||
95 | |||
96 | struct dm_region { | ||
97 | struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ | ||
98 | region_t key; | ||
99 | int state; | ||
100 | |||
101 | struct list_head hash_list; | ||
102 | struct list_head list; | ||
103 | |||
104 | atomic_t pending; | ||
105 | struct bio_list delayed_bios; | ||
106 | }; | ||
107 | |||
108 | /* | ||
109 | * Conversion fns | ||
110 | */ | ||
111 | static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector) | ||
112 | { | ||
113 | return sector >> rh->region_shift; | ||
114 | } | ||
115 | |||
116 | sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) | ||
117 | { | ||
118 | return region << rh->region_shift; | ||
119 | } | ||
120 | EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); | ||
121 | |||
122 | region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) | ||
123 | { | ||
124 | return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); | ||
125 | } | ||
126 | EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); | ||
127 | |||
128 | void *dm_rh_region_context(struct dm_region *reg) | ||
129 | { | ||
130 | return reg->rh->context; | ||
131 | } | ||
132 | EXPORT_SYMBOL_GPL(dm_rh_region_context); | ||
133 | |||
134 | region_t dm_rh_get_region_key(struct dm_region *reg) | ||
135 | { | ||
136 | return reg->key; | ||
137 | } | ||
138 | EXPORT_SYMBOL_GPL(dm_rh_get_region_key); | ||
139 | |||
140 | sector_t dm_rh_get_region_size(struct dm_region_hash *rh) | ||
141 | { | ||
142 | return rh->region_size; | ||
143 | } | ||
144 | EXPORT_SYMBOL_GPL(dm_rh_get_region_size); | ||
145 | |||
146 | /* | ||
147 | * FIXME: shall we pass in a structure instead of all these args to | ||
148 | * dm_region_hash_create()???? | ||
149 | */ | ||
150 | #define RH_HASH_MULT 2654435387U | ||
151 | #define RH_HASH_SHIFT 12 | ||
152 | |||
153 | #define MIN_REGIONS 64 | ||
154 | struct dm_region_hash *dm_region_hash_create( | ||
155 | void *context, void (*dispatch_bios)(void *context, | ||
156 | struct bio_list *bios), | ||
157 | void (*wakeup_workers)(void *context), | ||
158 | void (*wakeup_all_recovery_waiters)(void *context), | ||
159 | sector_t target_begin, unsigned max_recovery, | ||
160 | struct dm_dirty_log *log, uint32_t region_size, | ||
161 | region_t nr_regions) | ||
162 | { | ||
163 | struct dm_region_hash *rh; | ||
164 | unsigned nr_buckets, max_buckets; | ||
165 | size_t i; | ||
166 | |||
167 | /* | ||
168 | * Calculate a suitable number of buckets for our hash | ||
169 | * table. | ||
170 | */ | ||
171 | max_buckets = nr_regions >> 6; | ||
172 | for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) | ||
173 | ; | ||
174 | nr_buckets >>= 1; | ||
175 | |||
176 | rh = kmalloc(sizeof(*rh), GFP_KERNEL); | ||
177 | if (!rh) { | ||
178 | DMERR("unable to allocate region hash memory"); | ||
179 | return ERR_PTR(-ENOMEM); | ||
180 | } | ||
181 | |||
182 | rh->context = context; | ||
183 | rh->dispatch_bios = dispatch_bios; | ||
184 | rh->wakeup_workers = wakeup_workers; | ||
185 | rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; | ||
186 | rh->target_begin = target_begin; | ||
187 | rh->max_recovery = max_recovery; | ||
188 | rh->log = log; | ||
189 | rh->region_size = region_size; | ||
190 | rh->region_shift = ffs(region_size) - 1; | ||
191 | rwlock_init(&rh->hash_lock); | ||
192 | rh->mask = nr_buckets - 1; | ||
193 | rh->nr_buckets = nr_buckets; | ||
194 | |||
195 | rh->shift = RH_HASH_SHIFT; | ||
196 | rh->prime = RH_HASH_MULT; | ||
197 | |||
198 | rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); | ||
199 | if (!rh->buckets) { | ||
200 | DMERR("unable to allocate region hash bucket memory"); | ||
201 | kfree(rh); | ||
202 | return ERR_PTR(-ENOMEM); | ||
203 | } | ||
204 | |||
205 | for (i = 0; i < nr_buckets; i++) | ||
206 | INIT_LIST_HEAD(rh->buckets + i); | ||
207 | |||
208 | spin_lock_init(&rh->region_lock); | ||
209 | sema_init(&rh->recovery_count, 0); | ||
210 | atomic_set(&rh->recovery_in_flight, 0); | ||
211 | INIT_LIST_HEAD(&rh->clean_regions); | ||
212 | INIT_LIST_HEAD(&rh->quiesced_regions); | ||
213 | INIT_LIST_HEAD(&rh->recovered_regions); | ||
214 | INIT_LIST_HEAD(&rh->failed_recovered_regions); | ||
215 | |||
216 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, | ||
217 | sizeof(struct dm_region)); | ||
218 | if (!rh->region_pool) { | ||
219 | vfree(rh->buckets); | ||
220 | kfree(rh); | ||
221 | rh = ERR_PTR(-ENOMEM); | ||
222 | } | ||
223 | |||
224 | return rh; | ||
225 | } | ||
226 | EXPORT_SYMBOL_GPL(dm_region_hash_create); | ||
227 | |||
228 | void dm_region_hash_destroy(struct dm_region_hash *rh) | ||
229 | { | ||
230 | unsigned h; | ||
231 | struct dm_region *reg, *nreg; | ||
232 | |||
233 | BUG_ON(!list_empty(&rh->quiesced_regions)); | ||
234 | for (h = 0; h < rh->nr_buckets; h++) { | ||
235 | list_for_each_entry_safe(reg, nreg, rh->buckets + h, | ||
236 | hash_list) { | ||
237 | BUG_ON(atomic_read(&reg->pending)); | ||
238 | mempool_free(reg, rh->region_pool); | ||
239 | } | ||
240 | } | ||
241 | |||
242 | if (rh->log) | ||
243 | dm_dirty_log_destroy(rh->log); | ||
244 | |||
245 | if (rh->region_pool) | ||
246 | mempool_destroy(rh->region_pool); | ||
247 | |||
248 | vfree(rh->buckets); | ||
249 | kfree(rh); | ||
250 | } | ||
251 | EXPORT_SYMBOL_GPL(dm_region_hash_destroy); | ||
252 | |||
253 | struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh) | ||
254 | { | ||
255 | return rh->log; | ||
256 | } | ||
257 | EXPORT_SYMBOL_GPL(dm_rh_dirty_log); | ||
258 | |||
259 | static unsigned rh_hash(struct dm_region_hash *rh, region_t region) | ||
260 | { | ||
261 | return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask; | ||
262 | } | ||
263 | |||
264 | static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region) | ||
265 | { | ||
266 | struct dm_region *reg; | ||
267 | struct list_head *bucket = rh->buckets + rh_hash(rh, region); | ||
268 | |||
269 | list_for_each_entry(reg, bucket, hash_list) | ||
270 | if (reg->key == region) | ||
271 | return reg; | ||
272 | |||
273 | return NULL; | ||
274 | } | ||
275 | |||
276 | static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg) | ||
277 | { | ||
278 | list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key)); | ||
279 | } | ||
280 | |||
281 | static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) | ||
282 | { | ||
283 | struct dm_region *reg, *nreg; | ||
284 | |||
285 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); | ||
286 | if (unlikely(!nreg)) | ||
287 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO); | ||
288 | |||
289 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | ||
290 | DM_RH_CLEAN : DM_RH_NOSYNC; | ||
291 | nreg->rh = rh; | ||
292 | nreg->key = region; | ||
293 | INIT_LIST_HEAD(&nreg->list); | ||
294 | atomic_set(&nreg->pending, 0); | ||
295 | bio_list_init(&nreg->delayed_bios); | ||
296 | |||
297 | write_lock_irq(&rh->hash_lock); | ||
298 | reg = __rh_lookup(rh, region); | ||
299 | if (reg) | ||
300 | /* We lost the race. */ | ||
301 | mempool_free(nreg, rh->region_pool); | ||
302 | else { | ||
303 | __rh_insert(rh, nreg); | ||
304 | if (nreg->state == DM_RH_CLEAN) { | ||
305 | spin_lock(&rh->region_lock); | ||
306 | list_add(&nreg->list, &rh->clean_regions); | ||
307 | spin_unlock(&rh->region_lock); | ||
308 | } | ||
309 | |||
310 | reg = nreg; | ||
311 | } | ||
312 | write_unlock_irq(&rh->hash_lock); | ||
313 | |||
314 | return reg; | ||
315 | } | ||
316 | |||
317 | static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region) | ||
318 | { | ||
319 | struct dm_region *reg; | ||
320 | |||
321 | reg = __rh_lookup(rh, region); | ||
322 | if (!reg) { | ||
323 | read_unlock(&rh->hash_lock); | ||
324 | reg = __rh_alloc(rh, region); | ||
325 | read_lock(&rh->hash_lock); | ||
326 | } | ||
327 | |||
328 | return reg; | ||
329 | } | ||
330 | |||
331 | int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block) | ||
332 | { | ||
333 | int r; | ||
334 | struct dm_region *reg; | ||
335 | |||
336 | read_lock(&rh->hash_lock); | ||
337 | reg = __rh_lookup(rh, region); | ||
338 | read_unlock(&rh->hash_lock); | ||
339 | |||
340 | if (reg) | ||
341 | return reg->state; | ||
342 | |||
343 | /* | ||
344 | * The region wasn't in the hash, so we fall back to the | ||
345 | * dirty log. | ||
346 | */ | ||
347 | r = rh->log->type->in_sync(rh->log, region, may_block); | ||
348 | |||
349 | /* | ||
350 | * Any error from the dirty log (eg. -EWOULDBLOCK) gets | ||
351 | * taken as a DM_RH_NOSYNC | ||
352 | */ | ||
353 | return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC; | ||
354 | } | ||
355 | EXPORT_SYMBOL_GPL(dm_rh_get_state); | ||
356 | |||
357 | static void complete_resync_work(struct dm_region *reg, int success) | ||
358 | { | ||
359 | struct dm_region_hash *rh = reg->rh; | ||
360 | |||
361 | rh->log->type->set_region_sync(rh->log, reg->key, success); | ||
362 | |||
363 | /* | ||
364 | * Dispatch the bios before we call 'wake_up_all'. | ||
365 | * This is important because if we are suspending, | ||
366 | * we want to know that recovery is complete and | ||
367 | * the work queue is flushed. If we wake_up_all | ||
368 | * before we dispatch_bios (queue bios and call wake()), | ||
369 | * then we risk suspending before the work queue | ||
370 | * has been properly flushed. | ||
371 | */ | ||
372 | rh->dispatch_bios(rh->context, ®->delayed_bios); | ||
373 | if (atomic_dec_and_test(&rh->recovery_in_flight)) | ||
374 | rh->wakeup_all_recovery_waiters(rh->context); | ||
375 | up(&rh->recovery_count); | ||
376 | } | ||
377 | |||
378 | /* dm_rh_mark_nosync | ||
379 | * @ms | ||
380 | * @bio | ||
381 | * @done | ||
382 | * @error | ||
383 | * | ||
384 | * The bio was written on some mirror(s) but failed on other mirror(s). | ||
385 | * We can successfully endio the bio but should avoid the region being | ||
386 | * marked clean by setting the state DM_RH_NOSYNC. | ||
387 | * | ||
388 | * This function is _not_ safe in interrupt context! | ||
389 | */ | ||
390 | void dm_rh_mark_nosync(struct dm_region_hash *rh, | ||
391 | struct bio *bio, unsigned done, int error) | ||
392 | { | ||
393 | unsigned long flags; | ||
394 | struct dm_dirty_log *log = rh->log; | ||
395 | struct dm_region *reg; | ||
396 | region_t region = dm_rh_bio_to_region(rh, bio); | ||
397 | int recovering = 0; | ||
398 | |||
399 | /* We must inform the log that the sync count has changed. */ | ||
400 | log->type->set_region_sync(log, region, 0); | ||
401 | |||
402 | read_lock(&rh->hash_lock); | ||
403 | reg = __rh_find(rh, region); | ||
404 | read_unlock(&rh->hash_lock); | ||
405 | |||
406 | /* region hash entry should exist because write was in-flight */ | ||
407 | BUG_ON(!reg); | ||
408 | BUG_ON(!list_empty(®->list)); | ||
409 | |||
410 | spin_lock_irqsave(&rh->region_lock, flags); | ||
411 | /* | ||
412 | * Possible cases: | ||
413 | * 1) DM_RH_DIRTY | ||
414 | * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed | ||
415 | * 3) DM_RH_RECOVERING: flushing pending writes | ||
416 | * Either case, the region should have not been connected to list. | ||
417 | */ | ||
418 | recovering = (reg->state == DM_RH_RECOVERING); | ||
419 | reg->state = DM_RH_NOSYNC; | ||
420 | BUG_ON(!list_empty(®->list)); | ||
421 | spin_unlock_irqrestore(&rh->region_lock, flags); | ||
422 | |||
423 | bio_endio(bio, error); | ||
424 | if (recovering) | ||
425 | complete_resync_work(reg, 0); | ||
426 | } | ||
427 | EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); | ||
428 | |||
429 | void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) | ||
430 | { | ||
431 | struct dm_region *reg, *next; | ||
432 | |||
433 | LIST_HEAD(clean); | ||
434 | LIST_HEAD(recovered); | ||
435 | LIST_HEAD(failed_recovered); | ||
436 | |||
437 | /* | ||
438 | * Quickly grab the lists. | ||
439 | */ | ||
440 | write_lock_irq(&rh->hash_lock); | ||
441 | spin_lock(&rh->region_lock); | ||
442 | if (!list_empty(&rh->clean_regions)) { | ||
443 | list_splice_init(&rh->clean_regions, &clean); | ||
444 | |||
445 | list_for_each_entry(reg, &clean, list) | ||
446 | list_del(®->hash_list); | ||
447 | } | ||
448 | |||
449 | if (!list_empty(&rh->recovered_regions)) { | ||
450 | list_splice_init(&rh->recovered_regions, &recovered); | ||
451 | |||
452 | list_for_each_entry(reg, &recovered, list) | ||
453 | list_del(®->hash_list); | ||
454 | } | ||
455 | |||
456 | if (!list_empty(&rh->failed_recovered_regions)) { | ||
457 | list_splice_init(&rh->failed_recovered_regions, | ||
458 | &failed_recovered); | ||
459 | |||
460 | list_for_each_entry(reg, &failed_recovered, list) | ||
461 | list_del(®->hash_list); | ||
462 | } | ||
463 | |||
464 | spin_unlock(&rh->region_lock); | ||
465 | write_unlock_irq(&rh->hash_lock); | ||
466 | |||
467 | /* | ||
468 | * All the regions on the recovered and clean lists have | ||
469 | * now been pulled out of the system, so no need to do | ||
470 | * any more locking. | ||
471 | */ | ||
472 | list_for_each_entry_safe(reg, next, &recovered, list) { | ||
473 | rh->log->type->clear_region(rh->log, reg->key); | ||
474 | complete_resync_work(reg, 1); | ||
475 | mempool_free(reg, rh->region_pool); | ||
476 | } | ||
477 | |||
478 | list_for_each_entry_safe(reg, next, &failed_recovered, list) { | ||
479 | complete_resync_work(reg, errors_handled ? 0 : 1); | ||
480 | mempool_free(reg, rh->region_pool); | ||
481 | } | ||
482 | |||
483 | list_for_each_entry_safe(reg, next, &clean, list) { | ||
484 | rh->log->type->clear_region(rh->log, reg->key); | ||
485 | mempool_free(reg, rh->region_pool); | ||
486 | } | ||
487 | |||
488 | rh->log->type->flush(rh->log); | ||
489 | } | ||
490 | EXPORT_SYMBOL_GPL(dm_rh_update_states); | ||
491 | |||
492 | static void rh_inc(struct dm_region_hash *rh, region_t region) | ||
493 | { | ||
494 | struct dm_region *reg; | ||
495 | |||
496 | read_lock(&rh->hash_lock); | ||
497 | reg = __rh_find(rh, region); | ||
498 | |||
499 | spin_lock_irq(&rh->region_lock); | ||
500 | atomic_inc(®->pending); | ||
501 | |||
502 | if (reg->state == DM_RH_CLEAN) { | ||
503 | reg->state = DM_RH_DIRTY; | ||
504 | list_del_init(®->list); /* take off the clean list */ | ||
505 | spin_unlock_irq(&rh->region_lock); | ||
506 | |||
507 | rh->log->type->mark_region(rh->log, reg->key); | ||
508 | } else | ||
509 | spin_unlock_irq(&rh->region_lock); | ||
510 | |||
511 | |||
512 | read_unlock(&rh->hash_lock); | ||
513 | } | ||
514 | |||
515 | void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) | ||
516 | { | ||
517 | struct bio *bio; | ||
518 | |||
519 | for (bio = bios->head; bio; bio = bio->bi_next) | ||
520 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); | ||
521 | } | ||
522 | EXPORT_SYMBOL_GPL(dm_rh_inc_pending); | ||
523 | |||
524 | void dm_rh_dec(struct dm_region_hash *rh, region_t region) | ||
525 | { | ||
526 | unsigned long flags; | ||
527 | struct dm_region *reg; | ||
528 | int should_wake = 0; | ||
529 | |||
530 | read_lock(&rh->hash_lock); | ||
531 | reg = __rh_lookup(rh, region); | ||
532 | read_unlock(&rh->hash_lock); | ||
533 | |||
534 | spin_lock_irqsave(&rh->region_lock, flags); | ||
535 | if (atomic_dec_and_test(®->pending)) { | ||
536 | /* | ||
537 | * There is no pending I/O for this region. | ||
538 | * We can move the region to corresponding list for next action. | ||
539 | * At this point, the region is not yet connected to any list. | ||
540 | * | ||
541 | * If the state is DM_RH_NOSYNC, the region should be kept off | ||
542 | * from clean list. | ||
543 | * The hash entry for DM_RH_NOSYNC will remain in memory | ||
544 | * until the region is recovered or the map is reloaded. | ||
545 | */ | ||
546 | |||
547 | /* do nothing for DM_RH_NOSYNC */ | ||
548 | if (reg->state == DM_RH_RECOVERING) { | ||
549 | list_add_tail(®->list, &rh->quiesced_regions); | ||
550 | } else if (reg->state == DM_RH_DIRTY) { | ||
551 | reg->state = DM_RH_CLEAN; | ||
552 | list_add(®->list, &rh->clean_regions); | ||
553 | } | ||
554 | should_wake = 1; | ||
555 | } | ||
556 | spin_unlock_irqrestore(&rh->region_lock, flags); | ||
557 | |||
558 | if (should_wake) | ||
559 | rh->wakeup_workers(rh->context); | ||
560 | } | ||
561 | EXPORT_SYMBOL_GPL(dm_rh_dec); | ||
562 | |||
563 | /* | ||
564 | * Starts quiescing a region in preparation for recovery. | ||
565 | */ | ||
566 | static int __rh_recovery_prepare(struct dm_region_hash *rh) | ||
567 | { | ||
568 | int r; | ||
569 | region_t region; | ||
570 | struct dm_region *reg; | ||
571 | |||
572 | /* | ||
573 | * Ask the dirty log what's next. | ||
574 | */ | ||
575 | r = rh->log->type->get_resync_work(rh->log, ®ion); | ||
576 | if (r <= 0) | ||
577 | return r; | ||
578 | |||
579 | /* | ||
580 | * Get this region, and start it quiescing by setting the | ||
581 | * recovering flag. | ||
582 | */ | ||
583 | read_lock(&rh->hash_lock); | ||
584 | reg = __rh_find(rh, region); | ||
585 | read_unlock(&rh->hash_lock); | ||
586 | |||
587 | spin_lock_irq(&rh->region_lock); | ||
588 | reg->state = DM_RH_RECOVERING; | ||
589 | |||
590 | /* Already quiesced ? */ | ||
591 | if (atomic_read(®->pending)) | ||
592 | list_del_init(®->list); | ||
593 | else | ||
594 | list_move(®->list, &rh->quiesced_regions); | ||
595 | |||
596 | spin_unlock_irq(&rh->region_lock); | ||
597 | |||
598 | return 1; | ||
599 | } | ||
600 | |||
601 | void dm_rh_recovery_prepare(struct dm_region_hash *rh) | ||
602 | { | ||
603 | /* Extra reference to avoid race with dm_rh_stop_recovery */ | ||
604 | atomic_inc(&rh->recovery_in_flight); | ||
605 | |||
606 | while (!down_trylock(&rh->recovery_count)) { | ||
607 | atomic_inc(&rh->recovery_in_flight); | ||
608 | if (__rh_recovery_prepare(rh) <= 0) { | ||
609 | atomic_dec(&rh->recovery_in_flight); | ||
610 | up(&rh->recovery_count); | ||
611 | break; | ||
612 | } | ||
613 | } | ||
614 | |||
615 | /* Drop the extra reference */ | ||
616 | if (atomic_dec_and_test(&rh->recovery_in_flight)) | ||
617 | rh->wakeup_all_recovery_waiters(rh->context); | ||
618 | } | ||
619 | EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); | ||
620 | |||
621 | /* | ||
622 | * Returns any quiesced regions. | ||
623 | */ | ||
624 | struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh) | ||
625 | { | ||
626 | struct dm_region *reg = NULL; | ||
627 | |||
628 | spin_lock_irq(&rh->region_lock); | ||
629 | if (!list_empty(&rh->quiesced_regions)) { | ||
630 | reg = list_entry(rh->quiesced_regions.next, | ||
631 | struct dm_region, list); | ||
632 | list_del_init(®->list); /* remove from the quiesced list */ | ||
633 | } | ||
634 | spin_unlock_irq(&rh->region_lock); | ||
635 | |||
636 | return reg; | ||
637 | } | ||
638 | EXPORT_SYMBOL_GPL(dm_rh_recovery_start); | ||
639 | |||
640 | void dm_rh_recovery_end(struct dm_region *reg, int success) | ||
641 | { | ||
642 | struct dm_region_hash *rh = reg->rh; | ||
643 | |||
644 | spin_lock_irq(&rh->region_lock); | ||
645 | if (success) | ||
646 | list_add(®->list, ®->rh->recovered_regions); | ||
647 | else { | ||
648 | reg->state = DM_RH_NOSYNC; | ||
649 | list_add(®->list, ®->rh->failed_recovered_regions); | ||
650 | } | ||
651 | spin_unlock_irq(&rh->region_lock); | ||
652 | |||
653 | rh->wakeup_workers(rh->context); | ||
654 | } | ||
655 | EXPORT_SYMBOL_GPL(dm_rh_recovery_end); | ||
656 | |||
657 | /* Return recovery in flight count. */ | ||
658 | int dm_rh_recovery_in_flight(struct dm_region_hash *rh) | ||
659 | { | ||
660 | return atomic_read(&rh->recovery_in_flight); | ||
661 | } | ||
662 | EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); | ||
663 | |||
664 | int dm_rh_flush(struct dm_region_hash *rh) | ||
665 | { | ||
666 | return rh->log->type->flush(rh->log); | ||
667 | } | ||
668 | EXPORT_SYMBOL_GPL(dm_rh_flush); | ||
669 | |||
670 | void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) | ||
671 | { | ||
672 | struct dm_region *reg; | ||
673 | |||
674 | read_lock(&rh->hash_lock); | ||
675 | reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); | ||
676 | bio_list_add(®->delayed_bios, bio); | ||
677 | read_unlock(&rh->hash_lock); | ||
678 | } | ||
679 | EXPORT_SYMBOL_GPL(dm_rh_delay); | ||
680 | |||
681 | void dm_rh_stop_recovery(struct dm_region_hash *rh) | ||
682 | { | ||
683 | int i; | ||
684 | |||
685 | /* wait for any recovering regions */ | ||
686 | for (i = 0; i < rh->max_recovery; i++) | ||
687 | down(&rh->recovery_count); | ||
688 | } | ||
689 | EXPORT_SYMBOL_GPL(dm_rh_stop_recovery); | ||
690 | |||
691 | void dm_rh_start_recovery(struct dm_region_hash *rh) | ||
692 | { | ||
693 | int i; | ||
694 | |||
695 | for (i = 0; i < rh->max_recovery; i++) | ||
696 | up(&rh->recovery_count); | ||
697 | |||
698 | rh->wakeup_workers(rh->context); | ||
699 | } | ||
700 | EXPORT_SYMBOL_GPL(dm_rh_start_recovery); | ||
701 | |||
/* Module metadata: description string, author contact and GPL license tag. */
702 | MODULE_DESCRIPTION(DM_NAME " region hash"); | ||
703 | MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>"); | ||
704 | MODULE_LICENSE("GPL"); | ||