author		NeilBrown <neilb@suse.de>	2009-12-13 20:49:49 -0500
committer	NeilBrown <neilb@suse.de>	2009-12-13 20:49:49 -0500
commit		a2826aa92e2e14db372eda01d333267258944033 (patch)
tree		9cdd3329205bf480a4782705a3db1738e3faae44 /drivers/md/md.c
parent		efa593390e70b0e3c39f6b2dca8876b6b1461e41 (diff)
md: support barrier requests on all personalities.
Previously barriers were only supported on RAID1. This is because other levels require synchronisation across all devices and so needed a different approach. Here is that approach.

When a barrier arrives, we send a zero-length barrier to every active device. When that completes - and if the original request was not empty - we submit the barrier request itself (with the barrier flag cleared) and then submit a fresh load of zero-length barriers.

The barrier request itself is asynchronous, but any subsequent request will block until the barrier completes.

The reason for clearing the barrier flag is that a barrier request is allowed to fail. If we pass a non-empty barrier through a striping raid level it is conceivable that part of it could succeed and part could fail. That would be way too hard to deal with. So if the first run of zero-length barriers succeeds, we assume all is sufficiently well that we send the request and ignore errors in the second run of barriers.

RAID5 needs extra care, as write requests may not have been submitted to the underlying devices yet. So we flush the stripe cache before proceeding with the barrier.

Note that the second set of zero-length barriers is submitted immediately after the original request is submitted. Thus when a personality finds mddev->barrier to be set during make_request, it should not return from make_request until the corresponding per-device request(s) have been queued. That will be done in later patches.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Andre Noll <maan@systemlinux.org>
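The per-personality hook-up is deferred to those later patches. As a rough, hypothetical sketch (not part of this commit), a personality's make_request would be expected to divert barrier bios to the new helper along these lines, using the bio_rw_flagged()/BIO_RW_BARRIER interfaces of this kernel generation:

/* Hypothetical example only -- the real per-level changes land in later
 * patches.  A personality hands a barrier bio to the generic md code and
 * lets md_barrier_request() run the flush / submit / flush sequence.
 */
static int example_make_request(struct request_queue *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;

	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
		md_barrier_request(mddev, bio);
		return 0;
	}

	/* ... normal handling of non-barrier bios for this level ... */
	return 0;
}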
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--	drivers/md/md.c	105
1 file changed, 103 insertions, 2 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d79a40649799..569f25183db6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -213,12 +213,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 		return 0;
 	}
 	rcu_read_lock();
-	if (mddev->suspended) {
+	if (mddev->suspended || mddev->barrier) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended)
+			if (!mddev->suspended && !mddev->barrier)
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -260,10 +260,110 @@ static void mddev_resume(mddev_t *mddev)
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
+	if (mddev->barrier)
+		return 1;
 	return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
+/*
+ * Generic barrier handling for md
+ */
+
+#define POST_REQUEST_BARRIER ((void*)1)
+
+static void md_end_barrier(struct bio *bio, int err)
+{
+	mdk_rdev_t *rdev = bio->bi_private;
+	mddev_t *mddev = rdev->mddev;
+	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
+		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
+
+	rdev_dec_pending(rdev, mddev);
+
+	if (atomic_dec_and_test(&mddev->flush_pending)) {
+		if (mddev->barrier == POST_REQUEST_BARRIER) {
+			/* This was a post-request barrier */
+			mddev->barrier = NULL;
+			wake_up(&mddev->sb_wait);
+		} else
+			/* The pre-request barrier has finished */
+			schedule_work(&mddev->barrier_work);
+	}
+	bio_put(bio);
+}
+
+static void submit_barriers(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(Faulty, &rdev->flags)) {
+			/* Take two references, one is dropped
+			 * when request finishes, one after
+			 * we reclaim rcu_read_lock
+			 */
+			struct bio *bi;
+			atomic_inc(&rdev->nr_pending);
+			atomic_inc(&rdev->nr_pending);
+			rcu_read_unlock();
+			bi = bio_alloc(GFP_KERNEL, 0);
+			bi->bi_end_io = md_end_barrier;
+			bi->bi_private = rdev;
+			bi->bi_bdev = rdev->bdev;
+			atomic_inc(&mddev->flush_pending);
+			submit_bio(WRITE_BARRIER, bi);
+			rcu_read_lock();
+			rdev_dec_pending(rdev, mddev);
+		}
+	rcu_read_unlock();
+}
+
+static void md_submit_barrier(struct work_struct *ws)
+{
+	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
+	struct bio *bio = mddev->barrier;
+
+	atomic_set(&mddev->flush_pending, 1);
+
+	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+		bio_endio(bio, -EOPNOTSUPP);
+	else if (bio->bi_size == 0)
+		/* an empty barrier - all done */
+		bio_endio(bio, 0);
+	else {
+		bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
+		if (mddev->pers->make_request(mddev->queue, bio))
+			generic_make_request(bio);
+		mddev->barrier = POST_REQUEST_BARRIER;
+		submit_barriers(mddev);
+	}
+	if (atomic_dec_and_test(&mddev->flush_pending)) {
+		mddev->barrier = NULL;
+		wake_up(&mddev->sb_wait);
+	}
+}
+
+void md_barrier_request(mddev_t *mddev, struct bio *bio)
+{
+	spin_lock_irq(&mddev->write_lock);
+	wait_event_lock_irq(mddev->sb_wait,
+			    !mddev->barrier,
+			    mddev->write_lock, /*nothing*/);
+	mddev->barrier = bio;
+	spin_unlock_irq(&mddev->write_lock);
+
+	atomic_set(&mddev->flush_pending, 1);
+	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+
+	submit_barriers(mddev);
+
+	if (atomic_dec_and_test(&mddev->flush_pending))
+		schedule_work(&mddev->barrier_work);
+}
+EXPORT_SYMBOL(md_barrier_request);
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
@@ -371,6 +471,7 @@ static mddev_t * mddev_find(dev_t unit)
 	atomic_set(&new->openers, 0);
 	atomic_set(&new->active_io, 0);
 	spin_lock_init(&new->write_lock);
+	atomic_set(&new->flush_pending, 0);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
 	new->reshape_position = MaxSector;
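The functions above rely on three per-array fields (mddev->barrier, mddev->flush_pending, mddev->barrier_work) and on md_barrier_request() being visible to the personalities. Those declarations live in drivers/md/md.h, which this md.c-only diffstat does not show; the companion hunk of the same commit is assumed to add something along these lines (sketch for orientation only, comments mine):

/* Assumed additions to struct mddev_s in drivers/md/md.h */
	struct bio		*barrier;	/* barrier bio in flight, or
						 * POST_REQUEST_BARRIER while the
						 * trailing zero-length barriers run */
	atomic_t		flush_pending;	/* zero-length barriers still outstanding */
	struct work_struct	barrier_work;	/* executes md_submit_barrier() */

/* Assumed prototype exported to the personalities */
extern void md_barrier_request(mddev_t *mddev, struct bio *bio);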