diff options
author | NeilBrown <neilb@suse.de> | 2009-12-13 20:49:49 -0500 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2009-12-13 20:49:49 -0500 |
commit | a2826aa92e2e14db372eda01d333267258944033 (patch) | |
tree | 9cdd3329205bf480a4782705a3db1738e3faae44 | |
parent | efa593390e70b0e3c39f6b2dca8876b6b1461e41 (diff) |
md: support barrier requests on all personalities.
Previously barriers were only supported on RAID1. This is because
other levels require synchronisation across all devices and so needed
a different approach.
Here is that approach.
When a barrier arrives, we send a zero-length barrier to every active
device. When that completes - and if the original request was not
empty - we submit the barrier request itself (with the barrier flag
cleared) and then submit a fresh load of zero length barriers.
The barrier request itself is asynchronous, but any subsequent
request will block until the barrier completes.
The reason for clearing the barrier flag is that a barrier request is
allowed to fail. If we pass a non-empty barrier through a striping
raid level it is conceivable that part of it could succeed and part
could fail. That would be way too hard to deal with.
So if the first run of zero length barriers succeeds, we assume all is
sufficiently well that we send the request and ignore errors in the
second run of barriers.
RAID5 needs extra care as write requests may not have been submitted
to the underlying devices yet. So we flush the stripe cache before
proceeding with the barrier.
Note that the second set of zero-length barriers are submitted
immediately after the original request is submitted. Thus when
a personality finds mddev->barrier to be set during make_request,
it should not return from make_request until the corresponding
per-device request(s) have been queued.
That will be done in later patches.
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Andre Noll <maan@systemlinux.org>
-rw-r--r-- | drivers/md/linear.c | 2 | ||||
-rw-r--r-- | drivers/md/md.c | 105 | ||||
-rw-r--r-- | drivers/md/md.h | 12 | ||||
-rw-r--r-- | drivers/md/multipath.c | 2 | ||||
-rw-r--r-- | drivers/md/raid0.c | 2 | ||||
-rw-r--r-- | drivers/md/raid10.c | 2 | ||||
-rw-r--r-- | drivers/md/raid5.c | 8 |
7 files changed, 126 insertions, 7 deletions
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 1ceceb334d5e..3b3f77c4f249 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -292,7 +292,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
292 | int cpu; | 292 | int cpu; |
293 | 293 | ||
294 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 294 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
295 | bio_endio(bio, -EOPNOTSUPP); | 295 | md_barrier_request(mddev, bio); |
296 | return 0; | 296 | return 0; |
297 | } | 297 | } |
298 | 298 | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index d79a40649799..569f25183db6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -213,12 +213,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) | |||
213 | return 0; | 213 | return 0; |
214 | } | 214 | } |
215 | rcu_read_lock(); | 215 | rcu_read_lock(); |
216 | if (mddev->suspended) { | 216 | if (mddev->suspended || mddev->barrier) { |
217 | DEFINE_WAIT(__wait); | 217 | DEFINE_WAIT(__wait); |
218 | for (;;) { | 218 | for (;;) { |
219 | prepare_to_wait(&mddev->sb_wait, &__wait, | 219 | prepare_to_wait(&mddev->sb_wait, &__wait, |
220 | TASK_UNINTERRUPTIBLE); | 220 | TASK_UNINTERRUPTIBLE); |
221 | if (!mddev->suspended) | 221 | if (!mddev->suspended && !mddev->barrier) |
222 | break; | 222 | break; |
223 | rcu_read_unlock(); | 223 | rcu_read_unlock(); |
224 | schedule(); | 224 | schedule(); |
@@ -260,10 +260,110 @@ static void mddev_resume(mddev_t *mddev) | |||
260 | 260 | ||
261 | int mddev_congested(mddev_t *mddev, int bits) | 261 | int mddev_congested(mddev_t *mddev, int bits) |
262 | { | 262 | { |
263 | if (mddev->barrier) | ||
264 | return 1; | ||
263 | return mddev->suspended; | 265 | return mddev->suspended; |
264 | } | 266 | } |
265 | EXPORT_SYMBOL(mddev_congested); | 267 | EXPORT_SYMBOL(mddev_congested); |
266 | 268 | ||
269 | /* | ||
270 | * Generic barrier handling for md | ||
271 | */ | ||
272 | |||
273 | #define POST_REQUEST_BARRIER ((void*)1) | ||
274 | |||
275 | static void md_end_barrier(struct bio *bio, int err) | ||
276 | { | ||
277 | mdk_rdev_t *rdev = bio->bi_private; | ||
278 | mddev_t *mddev = rdev->mddev; | ||
279 | if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) | ||
280 | set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); | ||
281 | |||
282 | rdev_dec_pending(rdev, mddev); | ||
283 | |||
284 | if (atomic_dec_and_test(&mddev->flush_pending)) { | ||
285 | if (mddev->barrier == POST_REQUEST_BARRIER) { | ||
286 | /* This was a post-request barrier */ | ||
287 | mddev->barrier = NULL; | ||
288 | wake_up(&mddev->sb_wait); | ||
289 | } else | ||
290 | /* The pre-request barrier has finished */ | ||
291 | schedule_work(&mddev->barrier_work); | ||
292 | } | ||
293 | bio_put(bio); | ||
294 | } | ||
295 | |||
296 | static void submit_barriers(mddev_t *mddev) | ||
297 | { | ||
298 | mdk_rdev_t *rdev; | ||
299 | |||
300 | rcu_read_lock(); | ||
301 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | ||
302 | if (rdev->raid_disk >= 0 && | ||
303 | !test_bit(Faulty, &rdev->flags)) { | ||
304 | /* Take two references, one is dropped | ||
305 | * when request finishes, one after | ||
306 | * we reclaim rcu_read_lock | ||
307 | */ | ||
308 | struct bio *bi; | ||
309 | atomic_inc(&rdev->nr_pending); | ||
310 | atomic_inc(&rdev->nr_pending); | ||
311 | rcu_read_unlock(); | ||
312 | bi = bio_alloc(GFP_KERNEL, 0); | ||
313 | bi->bi_end_io = md_end_barrier; | ||
314 | bi->bi_private = rdev; | ||
315 | bi->bi_bdev = rdev->bdev; | ||
316 | atomic_inc(&mddev->flush_pending); | ||
317 | submit_bio(WRITE_BARRIER, bi); | ||
318 | rcu_read_lock(); | ||
319 | rdev_dec_pending(rdev, mddev); | ||
320 | } | ||
321 | rcu_read_unlock(); | ||
322 | } | ||
323 | |||
324 | static void md_submit_barrier(struct work_struct *ws) | ||
325 | { | ||
326 | mddev_t *mddev = container_of(ws, mddev_t, barrier_work); | ||
327 | struct bio *bio = mddev->barrier; | ||
328 | |||
329 | atomic_set(&mddev->flush_pending, 1); | ||
330 | |||
331 | if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) | ||
332 | bio_endio(bio, -EOPNOTSUPP); | ||
333 | else if (bio->bi_size == 0) | ||
334 | /* an empty barrier - all done */ | ||
335 | bio_endio(bio, 0); | ||
336 | else { | ||
337 | bio->bi_rw &= ~(1<<BIO_RW_BARRIER); | ||
338 | if (mddev->pers->make_request(mddev->queue, bio)) | ||
339 | generic_make_request(bio); | ||
340 | mddev->barrier = POST_REQUEST_BARRIER; | ||
341 | submit_barriers(mddev); | ||
342 | } | ||
343 | if (atomic_dec_and_test(&mddev->flush_pending)) { | ||
344 | mddev->barrier = NULL; | ||
345 | wake_up(&mddev->sb_wait); | ||
346 | } | ||
347 | } | ||
348 | |||
349 | void md_barrier_request(mddev_t *mddev, struct bio *bio) | ||
350 | { | ||
351 | spin_lock_irq(&mddev->write_lock); | ||
352 | wait_event_lock_irq(mddev->sb_wait, | ||
353 | !mddev->barrier, | ||
354 | mddev->write_lock, /*nothing*/); | ||
355 | mddev->barrier = bio; | ||
356 | spin_unlock_irq(&mddev->write_lock); | ||
357 | |||
358 | atomic_set(&mddev->flush_pending, 1); | ||
359 | INIT_WORK(&mddev->barrier_work, md_submit_barrier); | ||
360 | |||
361 | submit_barriers(mddev); | ||
362 | |||
363 | if (atomic_dec_and_test(&mddev->flush_pending)) | ||
364 | schedule_work(&mddev->barrier_work); | ||
365 | } | ||
366 | EXPORT_SYMBOL(md_barrier_request); | ||
267 | 367 | ||
268 | static inline mddev_t *mddev_get(mddev_t *mddev) | 368 | static inline mddev_t *mddev_get(mddev_t *mddev) |
269 | { | 369 | { |
@@ -371,6 +471,7 @@ static mddev_t * mddev_find(dev_t unit) | |||
371 | atomic_set(&new->openers, 0); | 471 | atomic_set(&new->openers, 0); |
372 | atomic_set(&new->active_io, 0); | 472 | atomic_set(&new->active_io, 0); |
373 | spin_lock_init(&new->write_lock); | 473 | spin_lock_init(&new->write_lock); |
474 | atomic_set(&new->flush_pending, 0); | ||
374 | init_waitqueue_head(&new->sb_wait); | 475 | init_waitqueue_head(&new->sb_wait); |
375 | init_waitqueue_head(&new->recovery_wait); | 476 | init_waitqueue_head(&new->recovery_wait); |
376 | new->reshape_position = MaxSector; | 477 | new->reshape_position = MaxSector; |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 87430fea2875..cb036868a9e9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -292,6 +292,17 @@ struct mddev_s | |||
292 | struct mutex bitmap_mutex; | 292 | struct mutex bitmap_mutex; |
293 | 293 | ||
294 | struct list_head all_mddevs; | 294 | struct list_head all_mddevs; |
295 | |||
296 | /* Generic barrier handling. | ||
297 | * If there is a pending barrier request, all other | ||
298 | * writes are blocked while the devices are flushed. | ||
299 | * The last to finish a flush schedules a worker to | ||
300 | * submit the barrier request (without the barrier flag), | ||
301 | * then submit more flush requests. | ||
302 | */ | ||
303 | struct bio *barrier; | ||
304 | atomic_t flush_pending; | ||
305 | struct work_struct barrier_work; | ||
295 | }; | 306 | }; |
296 | 307 | ||
297 | 308 | ||
@@ -432,6 +443,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); | |||
432 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); | 443 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); |
433 | 444 | ||
434 | extern int mddev_congested(mddev_t *mddev, int bits); | 445 | extern int mddev_congested(mddev_t *mddev, int bits); |
446 | extern void md_barrier_request(mddev_t *mddev, struct bio *bio); | ||
435 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | 447 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, |
436 | sector_t sector, int size, struct page *page); | 448 | sector_t sector, int size, struct page *page); |
437 | extern void md_super_wait(mddev_t *mddev); | 449 | extern void md_super_wait(mddev_t *mddev); |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index ee7646f974a0..cbc0a99f3796 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -145,7 +145,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) | |||
145 | int cpu; | 145 | int cpu; |
146 | 146 | ||
147 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 147 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
148 | bio_endio(bio, -EOPNOTSUPP); | 148 | md_barrier_request(mddev, bio); |
149 | return 0; | 149 | return 0; |
150 | } | 150 | } |
151 | 151 | ||
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d3a4ce06015a..122d07af5b54 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -453,7 +453,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) | |||
453 | int cpu; | 453 | int cpu; |
454 | 454 | ||
455 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 455 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
456 | bio_endio(bio, -EOPNOTSUPP); | 456 | md_barrier_request(mddev, bio); |
457 | return 0; | 457 | return 0; |
458 | } | 458 | } |
459 | 459 | ||
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c2cb7b87b440..2fbf867f8b30 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -804,7 +804,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
804 | mdk_rdev_t *blocked_rdev; | 804 | mdk_rdev_t *blocked_rdev; |
805 | 805 | ||
806 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 806 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
807 | bio_endio(bio, -EOPNOTSUPP); | 807 | md_barrier_request(mddev, bio); |
808 | return 0; | 808 | return 0; |
809 | } | 809 | } |
810 | 810 | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 34cb065f6d66..8c9395f2028f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -3865,7 +3865,13 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3865 | int cpu, remaining; | 3865 | int cpu, remaining; |
3866 | 3866 | ||
3867 | if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { | 3867 | if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { |
3868 | bio_endio(bi, -EOPNOTSUPP); | 3868 | /* Drain all pending writes. We only really need |
3869 | * to ensure they have been submitted, but this is | ||
3870 | * easier. | ||
3871 | */ | ||
3872 | mddev->pers->quiesce(mddev, 1); | ||
3873 | mddev->pers->quiesce(mddev, 0); | ||
3874 | md_barrier_request(mddev, bi); | ||
3869 | return 0; | 3875 | return 0; |
3870 | } | 3876 | } |
3871 | 3877 | ||