about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/bitmap.c 5
-rw-r--r-- drivers/md/md.c 99
-rw-r--r-- drivers/md/raid1.c 134
3 files changed, 177 insertions, 61 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 220273e81ed6..51315302a85e 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -301,7 +301,7 @@ static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wai
301 page); 301 page);
302 302
303 if (wait) 303 if (wait)
304 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 304 md_super_wait(mddev);
305 return 0; 305 return 0;
306} 306}
307 307
@@ -828,8 +828,7 @@ int bitmap_unplug(struct bitmap *bitmap)
828 wake_up_process(bitmap->writeback_daemon->tsk)); 828 wake_up_process(bitmap->writeback_daemon->tsk));
829 spin_unlock_irq(&bitmap->write_lock); 829 spin_unlock_irq(&bitmap->write_lock);
830 } else 830 } else
831 wait_event(bitmap->mddev->sb_wait, 831 md_super_wait(bitmap->mddev);
832 atomic_read(&bitmap->mddev->pending_writes)==0);
833 } 832 }
834 return 0; 833 return 0;
835} 834}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index caa4add00c1b..199016932de5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -330,18 +330,46 @@ static void free_disk_sb(mdk_rdev_t * rdev)
330static int super_written(struct bio *bio, unsigned int bytes_done, int error) 330static int super_written(struct bio *bio, unsigned int bytes_done, int error)
331{ 331{
332 mdk_rdev_t *rdev = bio->bi_private; 332 mdk_rdev_t *rdev = bio->bi_private;
333 mddev_t *mddev = rdev->mddev;
333 if (bio->bi_size) 334 if (bio->bi_size)
334 return 1; 335 return 1;
335 336
336 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 337 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
337 md_error(rdev->mddev, rdev); 338 md_error(mddev, rdev);
338 339
339 if (atomic_dec_and_test(&rdev->mddev->pending_writes)) 340 if (atomic_dec_and_test(&mddev->pending_writes))
340 wake_up(&rdev->mddev->sb_wait); 341 wake_up(&mddev->sb_wait);
341 bio_put(bio); 342 bio_put(bio);
342 return 0; 343 return 0;
343} 344}
344 345
346static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
347{
348 struct bio *bio2 = bio->bi_private;
349 mdk_rdev_t *rdev = bio2->bi_private;
350 mddev_t *mddev = rdev->mddev;
351 if (bio->bi_size)
352 return 1;
353
354 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
355 error == -EOPNOTSUPP) {
356 unsigned long flags;
357 /* barriers don't appear to be supported :-( */
358 set_bit(BarriersNotsupp, &rdev->flags);
359 mddev->barriers_work = 0;
360 spin_lock_irqsave(&mddev->write_lock, flags);
361 bio2->bi_next = mddev->biolist;
362 mddev->biolist = bio2;
363 spin_unlock_irqrestore(&mddev->write_lock, flags);
364 wake_up(&mddev->sb_wait);
365 bio_put(bio);
366 return 0;
367 }
368 bio_put(bio2);
369 bio->bi_private = rdev;
370 return super_written(bio, bytes_done, error);
371}
372
345void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 373void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
346 sector_t sector, int size, struct page *page) 374 sector_t sector, int size, struct page *page)
347{ 375{
@@ -350,16 +378,54 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
350 * and decrement it on completion, waking up sb_wait 378 * and decrement it on completion, waking up sb_wait
351 * if zero is reached. 379 * if zero is reached.
352 * If an error occurred, call md_error 380 * If an error occurred, call md_error
381 *
382 * As we might need to resubmit the request if BIO_RW_BARRIER
383 * causes ENOTSUPP, we allocate a spare bio...
353 */ 384 */
354 struct bio *bio = bio_alloc(GFP_NOIO, 1); 385 struct bio *bio = bio_alloc(GFP_NOIO, 1);
386 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
355 387
356 bio->bi_bdev = rdev->bdev; 388 bio->bi_bdev = rdev->bdev;
357 bio->bi_sector = sector; 389 bio->bi_sector = sector;
358 bio_add_page(bio, page, size, 0); 390 bio_add_page(bio, page, size, 0);
359 bio->bi_private = rdev; 391 bio->bi_private = rdev;
360 bio->bi_end_io = super_written; 392 bio->bi_end_io = super_written;
393 bio->bi_rw = rw;
394
361 atomic_inc(&mddev->pending_writes); 395 atomic_inc(&mddev->pending_writes);
362 submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio); 396 if (!test_bit(BarriersNotsupp, &rdev->flags)) {
397 struct bio *rbio;
398 rw |= (1<<BIO_RW_BARRIER);
399 rbio = bio_clone(bio, GFP_NOIO);
400 rbio->bi_private = bio;
401 rbio->bi_end_io = super_written_barrier;
402 submit_bio(rw, rbio);
403 } else
404 submit_bio(rw, bio);
405}
406
407void md_super_wait(mddev_t *mddev)
408{
409 /* wait for all superblock writes that were scheduled to complete.
410 * if any had to be retried (due to BARRIER problems), retry them
411 */
412 DEFINE_WAIT(wq);
413 for(;;) {
414 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
415 if (atomic_read(&mddev->pending_writes)==0)
416 break;
417 while (mddev->biolist) {
418 struct bio *bio;
419 spin_lock_irq(&mddev->write_lock);
420 bio = mddev->biolist;
421 mddev->biolist = bio->bi_next ;
422 bio->bi_next = NULL;
423 spin_unlock_irq(&mddev->write_lock);
424 submit_bio(bio->bi_rw, bio);
425 }
426 schedule();
427 }
428 finish_wait(&mddev->sb_wait, &wq);
363} 429}
364 430
365static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 431static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
@@ -1382,7 +1448,7 @@ static void md_update_sb(mddev_t * mddev)
1382 int sync_req; 1448 int sync_req;
1383 1449
1384repeat: 1450repeat:
1385 spin_lock(&mddev->write_lock); 1451 spin_lock_irq(&mddev->write_lock);
1386 sync_req = mddev->in_sync; 1452 sync_req = mddev->in_sync;
1387 mddev->utime = get_seconds(); 1453 mddev->utime = get_seconds();
1388 mddev->events ++; 1454 mddev->events ++;
@@ -1405,11 +1471,11 @@ repeat:
1405 */ 1471 */
1406 if (!mddev->persistent) { 1472 if (!mddev->persistent) {
1407 mddev->sb_dirty = 0; 1473 mddev->sb_dirty = 0;
1408 spin_unlock(&mddev->write_lock); 1474 spin_unlock_irq(&mddev->write_lock);
1409 wake_up(&mddev->sb_wait); 1475 wake_up(&mddev->sb_wait);
1410 return; 1476 return;
1411 } 1477 }
1412 spin_unlock(&mddev->write_lock); 1478 spin_unlock_irq(&mddev->write_lock);
1413 1479
1414 dprintk(KERN_INFO 1480 dprintk(KERN_INFO
1415 "md: updating %s RAID superblock on device (in sync %d)\n", 1481 "md: updating %s RAID superblock on device (in sync %d)\n",
@@ -1437,17 +1503,17 @@ repeat:
1437 /* only need to write one superblock... */ 1503 /* only need to write one superblock... */
1438 break; 1504 break;
1439 } 1505 }
1440 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1506 md_super_wait(mddev);
1441 /* if there was a failure, sb_dirty was set to 1, and we re-write super */ 1507 /* if there was a failure, sb_dirty was set to 1, and we re-write super */
1442 1508
1443 spin_lock(&mddev->write_lock); 1509 spin_lock_irq(&mddev->write_lock);
1444 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { 1510 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
1445 /* have to write it out again */ 1511 /* have to write it out again */
1446 spin_unlock(&mddev->write_lock); 1512 spin_unlock_irq(&mddev->write_lock);
1447 goto repeat; 1513 goto repeat;
1448 } 1514 }
1449 mddev->sb_dirty = 0; 1515 mddev->sb_dirty = 0;
1450 spin_unlock(&mddev->write_lock); 1516 spin_unlock_irq(&mddev->write_lock);
1451 wake_up(&mddev->sb_wait); 1517 wake_up(&mddev->sb_wait);
1452 1518
1453} 1519}
@@ -1989,6 +2055,7 @@ static int do_md_run(mddev_t * mddev)
1989 2055
1990 mddev->recovery = 0; 2056 mddev->recovery = 0;
1991 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 2057 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
2058 mddev->barriers_work = 1;
1992 2059
1993 /* before we start the array running, initialise the bitmap */ 2060 /* before we start the array running, initialise the bitmap */
1994 err = bitmap_create(mddev); 2061 err = bitmap_create(mddev);
@@ -2107,7 +2174,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
2107 mddev->ro = 1; 2174 mddev->ro = 1;
2108 } else { 2175 } else {
2109 bitmap_flush(mddev); 2176 bitmap_flush(mddev);
2110 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 2177 md_super_wait(mddev);
2111 if (mddev->ro) 2178 if (mddev->ro)
2112 set_disk_ro(disk, 0); 2179 set_disk_ro(disk, 0);
2113 blk_queue_make_request(mddev->queue, md_fail_request); 2180 blk_queue_make_request(mddev->queue, md_fail_request);
@@ -3796,13 +3863,13 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
3796 3863
3797 atomic_inc(&mddev->writes_pending); 3864 atomic_inc(&mddev->writes_pending);
3798 if (mddev->in_sync) { 3865 if (mddev->in_sync) {
3799 spin_lock(&mddev->write_lock); 3866 spin_lock_irq(&mddev->write_lock);
3800 if (mddev->in_sync) { 3867 if (mddev->in_sync) {
3801 mddev->in_sync = 0; 3868 mddev->in_sync = 0;
3802 mddev->sb_dirty = 1; 3869 mddev->sb_dirty = 1;
3803 md_wakeup_thread(mddev->thread); 3870 md_wakeup_thread(mddev->thread);
3804 } 3871 }
3805 spin_unlock(&mddev->write_lock); 3872 spin_unlock_irq(&mddev->write_lock);
3806 } 3873 }
3807 wait_event(mddev->sb_wait, mddev->sb_dirty==0); 3874 wait_event(mddev->sb_wait, mddev->sb_dirty==0);
3808} 3875}
@@ -4112,7 +4179,7 @@ void md_check_recovery(mddev_t *mddev)
4112 if (mddev_trylock(mddev)==0) { 4179 if (mddev_trylock(mddev)==0) {
4113 int spares =0; 4180 int spares =0;
4114 4181
4115 spin_lock(&mddev->write_lock); 4182 spin_lock_irq(&mddev->write_lock);
4116 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 4183 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
4117 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 4184 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
4118 mddev->in_sync = 1; 4185 mddev->in_sync = 1;
@@ -4120,7 +4187,7 @@ void md_check_recovery(mddev_t *mddev)
4120 } 4187 }
4121 if (mddev->safemode == 1) 4188 if (mddev->safemode == 1)
4122 mddev->safemode = 0; 4189 mddev->safemode = 0;
4123 spin_unlock(&mddev->write_lock); 4190 spin_unlock_irq(&mddev->write_lock);
4124 4191
4125 if (mddev->sb_dirty) 4192 if (mddev->sb_dirty)
4126 md_update_sb(mddev); 4193 md_update_sb(mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb6b866c28f5..1cbf51fbd43f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -301,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
301{ 301{
302 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 302 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
303 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 303 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
304 int mirror, behind; 304 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
305 conf_t *conf = mddev_to_conf(r1_bio->mddev); 305 conf_t *conf = mddev_to_conf(r1_bio->mddev);
306 306
307 if (bio->bi_size) 307 if (bio->bi_size)
@@ -311,47 +311,54 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
311 if (r1_bio->bios[mirror] == bio) 311 if (r1_bio->bios[mirror] == bio)
312 break; 312 break;
313 313
314 /* 314 if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
315 * this branch is our 'one mirror IO has finished' event handler: 315 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
316 */ 316 set_bit(R1BIO_BarrierRetry, &r1_bio->state);
317 if (!uptodate) { 317 r1_bio->mddev->barriers_work = 0;
318 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 318 } else {
319 /* an I/O failed, we can't clear the bitmap */
320 set_bit(R1BIO_Degraded, &r1_bio->state);
321 } else
322 /* 319 /*
323 * Set R1BIO_Uptodate in our master bio, so that 320 * this branch is our 'one mirror IO has finished' event handler:
324 * we will return a good error code for to the higher
325 * levels even if IO on some other mirrored buffer fails.
326 *
327 * The 'master' represents the composite IO operation to
328 * user-side. So if something waits for IO, then it will
329 * wait for the 'master' bio.
330 */ 321 */
331 set_bit(R1BIO_Uptodate, &r1_bio->state); 322 r1_bio->bios[mirror] = NULL;
332 323 bio_put(bio);
333 update_head_pos(mirror, r1_bio); 324 if (!uptodate) {
334 325 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
335 behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 326 /* an I/O failed, we can't clear the bitmap */
336 if (behind) { 327 set_bit(R1BIO_Degraded, &r1_bio->state);
337 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 328 } else
338 atomic_dec(&r1_bio->behind_remaining); 329 /*
339 330 * Set R1BIO_Uptodate in our master bio, so that
340 /* In behind mode, we ACK the master bio once the I/O has safely 331 * we will return a good error code for to the higher
341 * reached all non-writemostly disks. Setting the Returned bit 332 * levels even if IO on some other mirrored buffer fails.
342 * ensures that this gets done only once -- we don't ever want to 333 *
343 * return -EIO here, instead we'll wait */ 334 * The 'master' represents the composite IO operation to
344 335 * user-side. So if something waits for IO, then it will
345 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && 336 * wait for the 'master' bio.
346 test_bit(R1BIO_Uptodate, &r1_bio->state)) { 337 */
347 /* Maybe we can return now */ 338 set_bit(R1BIO_Uptodate, &r1_bio->state);
348 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 339
349 struct bio *mbio = r1_bio->master_bio; 340 update_head_pos(mirror, r1_bio);
350 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", 341
351 (unsigned long long) mbio->bi_sector, 342 if (behind) {
352 (unsigned long long) mbio->bi_sector + 343 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
353 (mbio->bi_size >> 9) - 1); 344 atomic_dec(&r1_bio->behind_remaining);
354 bio_endio(mbio, mbio->bi_size, 0); 345
346 /* In behind mode, we ACK the master bio once the I/O has safely
347 * reached all non-writemostly disks. Setting the Returned bit
348 * ensures that this gets done only once -- we don't ever want to
349 * return -EIO here, instead we'll wait */
350
351 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
352 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
353 /* Maybe we can return now */
354 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
355 struct bio *mbio = r1_bio->master_bio;
356 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
357 (unsigned long long) mbio->bi_sector,
358 (unsigned long long) mbio->bi_sector +
359 (mbio->bi_size >> 9) - 1);
360 bio_endio(mbio, mbio->bi_size, 0);
361 }
355 } 362 }
356 } 363 }
357 } 364 }
@@ -361,8 +368,16 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
361 * already. 368 * already.
362 */ 369 */
363 if (atomic_dec_and_test(&r1_bio->remaining)) { 370 if (atomic_dec_and_test(&r1_bio->remaining)) {
371 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
372 reschedule_retry(r1_bio);
373 /* Don't dec_pending yet, we want to hold
374 * the reference over the retry
375 */
376 return 0;
377 }
364 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 378 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
365 /* free extra copy of the data pages */ 379 /* free extra copy of the data pages */
380/* FIXME bio has been freed!!! */
366 int i = bio->bi_vcnt; 381 int i = bio->bi_vcnt;
367 while (i--) 382 while (i--)
368 __free_page(bio->bi_io_vec[i].bv_page); 383 __free_page(bio->bi_io_vec[i].bv_page);
@@ -648,8 +663,9 @@ static int make_request(request_queue_t *q, struct bio * bio)
648 struct bio_list bl; 663 struct bio_list bl;
649 struct page **behind_pages = NULL; 664 struct page **behind_pages = NULL;
650 const int rw = bio_data_dir(bio); 665 const int rw = bio_data_dir(bio);
666 int do_barriers;
651 667
652 if (unlikely(bio_barrier(bio))) { 668 if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
653 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 669 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
654 return 0; 670 return 0;
655 } 671 }
@@ -759,6 +775,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
759 atomic_set(&r1_bio->remaining, 0); 775 atomic_set(&r1_bio->remaining, 0);
760 atomic_set(&r1_bio->behind_remaining, 0); 776 atomic_set(&r1_bio->behind_remaining, 0);
761 777
778 do_barriers = bio->bi_rw & BIO_RW_BARRIER;
779 if (do_barriers)
780 set_bit(R1BIO_Barrier, &r1_bio->state);
781
762 bio_list_init(&bl); 782 bio_list_init(&bl);
763 for (i = 0; i < disks; i++) { 783 for (i = 0; i < disks; i++) {
764 struct bio *mbio; 784 struct bio *mbio;
@@ -771,7 +791,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
771 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 791 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
772 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 792 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
773 mbio->bi_end_io = raid1_end_write_request; 793 mbio->bi_end_io = raid1_end_write_request;
774 mbio->bi_rw = WRITE; 794 mbio->bi_rw = WRITE | do_barriers;
775 mbio->bi_private = r1_bio; 795 mbio->bi_private = r1_bio;
776 796
777 if (behind_pages) { 797 if (behind_pages) {
@@ -1153,6 +1173,36 @@ static void raid1d(mddev_t *mddev)
1153 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1173 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1154 sync_request_write(mddev, r1_bio); 1174 sync_request_write(mddev, r1_bio);
1155 unplug = 1; 1175 unplug = 1;
1176 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1177 /* some requests in the r1bio were BIO_RW_BARRIER
1178 * requests which failed with -ENOTSUPP. Hohumm..
1179 * Better resubmit without the barrier.
1180 * We know which devices to resubmit for, because
1181 * all others have had their bios[] entry cleared.
1182 */
1183 int i;
1184 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1185 clear_bit(R1BIO_Barrier, &r1_bio->state);
1186 for (i=0; i < conf->raid_disks; i++)
1187 if (r1_bio->bios[i]) {
1188 struct bio_vec *bvec;
1189 int j;
1190
1191 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1192 /* copy pages from the failed bio, as
1193 * this might be a write-behind device */
1194 __bio_for_each_segment(bvec, bio, j, 0)
1195 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1196 bio_put(r1_bio->bios[i]);
1197 bio->bi_sector = r1_bio->sector +
1198 conf->mirrors[i].rdev->data_offset;
1199 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1200 bio->bi_end_io = raid1_end_write_request;
1201 bio->bi_rw = WRITE;
1202 bio->bi_private = r1_bio;
1203 r1_bio->bios[i] = bio;
1204 generic_make_request(bio);
1205 }
1156 } else { 1206 } else {
1157 int disk; 1207 int disk;
1158 bio = r1_bio->bios[r1_bio->read_disk]; 1208 bio = r1_bio->bios[r1_bio->read_disk];