author     NeilBrown <neilb@suse.de>	2009-12-13 20:49:49 -0500
committer  NeilBrown <neilb@suse.de>	2009-12-13 20:49:49 -0500
commit     a2826aa92e2e14db372eda01d333267258944033 (patch)
tree       9cdd3329205bf480a4782705a3db1738e3faae44
parent     efa593390e70b0e3c39f6b2dca8876b6b1461e41 (diff)
md: support barrier requests on all personalities.
Previously barriers were only supported on RAID1. This is because other
levels require synchronisation across all devices and so needed a
different approach. Here is that approach.

When a barrier arrives, we send a zero-length barrier to every active
device. When that completes - and if the original request was not
empty - we submit the barrier request itself (with the barrier flag
cleared) and then submit a fresh load of zero-length barriers.

The barrier request itself is asynchronous, but any subsequent request
will block until the barrier completes.

The reason for clearing the barrier flag is that a barrier request is
allowed to fail. If we pass a non-empty barrier through a striping raid
level it is conceivable that part of it could succeed and part could
fail. That would be way too hard to deal with. So if the first run of
zero-length barriers succeeds, we assume all is sufficiently well that
we send the request and ignore errors in the second run of barriers.

RAID5 needs extra care as write requests may not have been submitted to
the underlying devices yet. So we flush the stripe cache before
proceeding with the barrier.

Note that the second set of zero-length barriers is submitted
immediately after the original request is submitted. Thus when a
personality finds mddev->barrier to be set during make_request, it
should not return from make_request until the corresponding per-device
request(s) have been queued. That will be done in later patches.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Andre Noll <maan@systemlinux.org>
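[Editor's note] For orientation before reading the diff, the flow the patch
implements can be sketched roughly as follows. This is a simplified,
non-authoritative outline of the md_barrier_request()/md_submit_barrier()
path added in md.c below, not the actual code: locking, error handling and
reference counting are omitted, and wait_for_previous_barrier() is a
placeholder for the write_lock/sb_wait handshake in the real implementation.

	void md_barrier_request(mddev_t *mddev, struct bio *bio)
	{
		/* Only one barrier may be in flight at a time;
		 * wait_for_previous_barrier() is a placeholder, not a
		 * real function in this patch. */
		wait_for_previous_barrier(mddev);
		mddev->barrier = bio;

		/* Step 1: send a zero-length barrier to every active
		 * device.  md_end_barrier() schedules
		 * md_submit_barrier() once they have all completed. */
		submit_barriers(mddev);
	}

	static void md_submit_barrier(struct work_struct *ws)
	{
		mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
		struct bio *bio = mddev->barrier;

		if (bio->bi_size == 0)
			/* An empty barrier: flushing the devices was
			 * the whole job. */
			bio_endio(bio, 0);
		else {
			/* Step 2: resubmit the original request without
			 * the barrier flag, then a second round of
			 * zero-length barriers (POST_REQUEST_BARRIER). */
			bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
			generic_make_request(bio);
			mddev->barrier = POST_REQUEST_BARRIER;
			submit_barriers(mddev);
		}
	}

Any bio that arrives while mddev->barrier is non-NULL simply waits in
md_make_request(), which is what blocks subsequent requests until the
barrier completes.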
-rw-r--r--  drivers/md/linear.c    |   2
-rw-r--r--  drivers/md/md.c        | 105
-rw-r--r--  drivers/md/md.h        |  12
-rw-r--r--  drivers/md/multipath.c |   2
-rw-r--r--  drivers/md/raid0.c     |   2
-rw-r--r--  drivers/md/raid10.c    |   2
-rw-r--r--  drivers/md/raid5.c     |   8
7 files changed, 126 insertions, 7 deletions
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 1ceceb334d5e..3b3f77c4f249 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -292,7 +292,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d79a40649799..569f25183db6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -213,12 +213,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 		return 0;
 	}
 	rcu_read_lock();
-	if (mddev->suspended) {
+	if (mddev->suspended || mddev->barrier) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended)
+			if (!mddev->suspended && !mddev->barrier)
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -260,10 +260,110 @@ static void mddev_resume(mddev_t *mddev)
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
+	if (mddev->barrier)
+		return 1;
 	return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
+/*
+ * Generic barrier handling for md
+ */
+
+#define POST_REQUEST_BARRIER ((void*)1)
+
+static void md_end_barrier(struct bio *bio, int err)
+{
+	mdk_rdev_t *rdev = bio->bi_private;
+	mddev_t *mddev = rdev->mddev;
+	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
+		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
+
+	rdev_dec_pending(rdev, mddev);
+
+	if (atomic_dec_and_test(&mddev->flush_pending)) {
+		if (mddev->barrier == POST_REQUEST_BARRIER) {
+			/* This was a post-request barrier */
+			mddev->barrier = NULL;
+			wake_up(&mddev->sb_wait);
+		} else
+			/* The pre-request barrier has finished */
+			schedule_work(&mddev->barrier_work);
+	}
+	bio_put(bio);
+}
+
+static void submit_barriers(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(Faulty, &rdev->flags)) {
+			/* Take two references, one is dropped
+			 * when request finishes, one after
+			 * we reclaim rcu_read_lock
+			 */
+			struct bio *bi;
+			atomic_inc(&rdev->nr_pending);
+			atomic_inc(&rdev->nr_pending);
+			rcu_read_unlock();
+			bi = bio_alloc(GFP_KERNEL, 0);
+			bi->bi_end_io = md_end_barrier;
+			bi->bi_private = rdev;
+			bi->bi_bdev = rdev->bdev;
+			atomic_inc(&mddev->flush_pending);
+			submit_bio(WRITE_BARRIER, bi);
+			rcu_read_lock();
+			rdev_dec_pending(rdev, mddev);
+		}
+	rcu_read_unlock();
+}
+
+static void md_submit_barrier(struct work_struct *ws)
+{
+	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
+	struct bio *bio = mddev->barrier;
+
+	atomic_set(&mddev->flush_pending, 1);
+
+	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+		bio_endio(bio, -EOPNOTSUPP);
+	else if (bio->bi_size == 0)
+		/* an empty barrier - all done */
+		bio_endio(bio, 0);
+	else {
+		bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
+		if (mddev->pers->make_request(mddev->queue, bio))
+			generic_make_request(bio);
+		mddev->barrier = POST_REQUEST_BARRIER;
+		submit_barriers(mddev);
+	}
+	if (atomic_dec_and_test(&mddev->flush_pending)) {
+		mddev->barrier = NULL;
+		wake_up(&mddev->sb_wait);
+	}
+}
+
+void md_barrier_request(mddev_t *mddev, struct bio *bio)
+{
+	spin_lock_irq(&mddev->write_lock);
+	wait_event_lock_irq(mddev->sb_wait,
+			    !mddev->barrier,
+			    mddev->write_lock, /*nothing*/);
+	mddev->barrier = bio;
+	spin_unlock_irq(&mddev->write_lock);
+
+	atomic_set(&mddev->flush_pending, 1);
+	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+
+	submit_barriers(mddev);
+
+	if (atomic_dec_and_test(&mddev->flush_pending))
+		schedule_work(&mddev->barrier_work);
+}
+EXPORT_SYMBOL(md_barrier_request);
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
@@ -371,6 +471,7 @@ static mddev_t * mddev_find(dev_t unit)
 	atomic_set(&new->openers, 0);
 	atomic_set(&new->active_io, 0);
 	spin_lock_init(&new->write_lock);
+	atomic_set(&new->flush_pending, 0);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
 	new->reshape_position = MaxSector;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 87430fea2875..cb036868a9e9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -292,6 +292,17 @@ struct mddev_s
 	struct mutex			bitmap_mutex;
 
 	struct list_head		all_mddevs;
+
+	/* Generic barrier handling.
+	 * If there is a pending barrier request, all other
+	 * writes are blocked while the devices are flushed.
+	 * The last to finish a flush schedules a worker to
+	 * submit the barrier request (without the barrier flag),
+	 * then submit more flush requests.
+	 */
+	struct bio *barrier;
+	atomic_t flush_pending;
+	struct work_struct barrier_work;
 };
 
 
@@ -432,6 +443,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
 
 extern int mddev_congested(mddev_t *mddev, int bits);
+extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index ee7646f974a0..cbc0a99f3796 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -145,7 +145,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d3a4ce06015a..122d07af5b54 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -453,7 +453,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c2cb7b87b440..2fbf867f8b30 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -804,7 +804,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	mdk_rdev_t *blocked_rdev;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 34cb065f6d66..8c9395f2028f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3865,7 +3865,13 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	int cpu, remaining;
 
 	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
-		bio_endio(bi, -EOPNOTSUPP);
+		/* Drain all pending writes.  We only really need
+		 * to ensure they have been submitted, but this is
+		 * easier.
+		 */
+		mddev->pers->quiesce(mddev, 1);
+		mddev->pers->quiesce(mddev, 0);
+		md_barrier_request(mddev, bi);
 		return 0;
 	}
 