author     NeilBrown <neilb@cse.unsw.edu.au>          2005-06-21 20:17:23 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2005-06-21 22:07:46 -0400
commit     191ea9b2c7cc3ebbe0678834ab710d7d95ad3f9a (patch)
tree       25ccd0d191742f4e25f37784370520d254aacc12 /drivers
parent     aa3163f81654fa057039258e32a6811147bf0c14 (diff)
[PATCH] md: raid1 support for bitmap intent logging
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
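[Editor's note: this patch is the raid1 half of md's bitmap intent logging. The bitmap keeps one bit per chunk of the array: a chunk's bit must be dirty on disk before any write to that chunk is issued, and it is cleared only once every mirror has acknowledged the write (failed writes, or writes while the array is degraded, leave the bit set). After a crash, resync then only needs to visit the still-dirty chunks. Below is a minimal userspace model of that invariant, not the kernel API; intent_mark(), intent_clear() and chunk_of() are hypothetical stand-ins for the bitmap_startwrite()/bitmap_endwrite() calls the patch adds.

/*
 * Userspace model of md's bitmap intent logging (hypothetical names,
 * not the kernel API). One bit per chunk: set before writing, cleared
 * only when every mirror completed the write successfully.
 */
#include <stdio.h>

#define CHUNK_SECTORS 64          /* sectors covered by one bit */
#define N_CHUNKS      128
#define N_MIRRORS     2

static unsigned char intent[N_CHUNKS / 8];   /* the on-disk intent bitmap */

static unsigned chunk_of(unsigned long long sector)
{
	return (unsigned)(sector / CHUNK_SECTORS) % N_CHUNKS;
}

static void intent_mark(unsigned chunk)      /* like bitmap_startwrite() */
{
	intent[chunk / 8] |= 1u << (chunk % 8);
	/* must reach stable storage before the data writes are issued */
}

static void intent_clear(unsigned chunk, int all_ok) /* like bitmap_endwrite() */
{
	if (all_ok)                  /* degraded/failed writes leave the bit set */
		intent[chunk / 8] &= ~(1u << (chunk % 8));
}

int main(void)
{
	unsigned long long sector = 1000;
	unsigned chunk = chunk_of(sector);
	int ok[N_MIRRORS] = { 1, 1 }, all_ok = 1, i;

	intent_mark(chunk);          /* 1. log the intent first */
	for (i = 0; i < N_MIRRORS; i++)      /* 2. write each mirror */
		all_ok &= ok[i];
	intent_clear(chunk, all_ok); /* 3. clear only on full success */

	printf("chunk %u dirty: %d\n", chunk,
	       !!(intent[chunk / 8] & (1u << (chunk % 8))));
	return 0;
}

The same bits drive recovery: the sync_request() changes below consult bitmap_start_sync() and skip any block whose bit was never left dirty.]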
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/raid1.c  182
1 file changed, 154 insertions(+), 28 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3c5c916cb09e..0fd4c3bfc851 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -12,6 +12,15 @@
  * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
  *
+ * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
+ * bitmapped intelligence in resync:
+ *
+ *    - bitmap marked during normal i/o
+ *    - bitmap used to skip nondirty blocks during sync
+ *
+ * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
+ * - persistent bitmap code
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2, or (at your option)
@@ -22,7 +31,16 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include "dm-bio-list.h"
 #include <linux/raid/raid1.h>
+#include <linux/raid/bitmap.h>
+
+#define DEBUG 0
+#if DEBUG
+#define PRINTK(x...) printk(x)
+#else
+#define PRINTK(x...)
+#endif
 
 /*
  * Number of guaranteed r1bios in case of extreme VM load:
@@ -287,9 +305,11 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate)
+	if (!uptodate) {
 		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-	else
+		/* an I/O failed, we can't clear the bitmap */
+		set_bit(R1BIO_Degraded, &r1_bio->state);
+	} else
 		/*
 		 * Set R1BIO_Uptodate in our master bio, so that
 		 * we will return a good error code for to the higher
@@ -309,6 +329,10 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
+		/* clear the bitmap if all writes complete successfully */
+		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+				r1_bio->sectors,
+				!test_bit(R1BIO_Degraded, &r1_bio->state));
 		md_write_end(r1_bio->mddev);
 		raid_end_bio_io(r1_bio);
 	}
@@ -458,7 +482,10 @@ static void unplug_slaves(mddev_t *mddev)
 
 static void raid1_unplug(request_queue_t *q)
 {
-	unplug_slaves(q->queuedata);
+	mddev_t *mddev = q->queuedata;
+
+	unplug_slaves(mddev);
+	md_wakeup_thread(mddev->thread);
 }
 
 static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -501,16 +528,16 @@ static void device_barrier(conf_t *conf, sector_t sect)
 {
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
-			    conf->resync_lock, unplug_slaves(conf->mddev));
+			    conf->resync_lock, raid1_unplug(conf->mddev->queue));
 
 	if (!conf->barrier++) {
 		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-				    conf->resync_lock, unplug_slaves(conf->mddev));
+				    conf->resync_lock, raid1_unplug(conf->mddev->queue));
 		if (conf->nr_pending)
 			BUG();
 	}
 	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock, unplug_slaves(conf->mddev));
+			    conf->resync_lock, raid1_unplug(conf->mddev->queue));
 	conf->next_resync = sect;
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -522,8 +549,12 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
 	struct bio *read_bio;
-	int i, disks;
+	int i, targets = 0, disks;
 	mdk_rdev_t *rdev;
+	struct bitmap *bitmap = mddev->bitmap;
+	unsigned long flags;
+	struct bio_list bl;
+
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -554,7 +585,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 
 	r1_bio->master_bio = bio;
 	r1_bio->sectors = bio->bi_size >> 9;
-
+	r1_bio->state = 0;
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
 
@@ -597,6 +628,13 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * bios[x] to bio
 	 */
 	disks = conf->raid_disks;
+#if 0
+	{ static int first=1;
+	if (first) printk("First Write sector %llu disks %d\n",
+			  (unsigned long long)r1_bio->sector, disks);
+	first = 0;
+	}
+#endif
 	rcu_read_lock();
 	for (i = 0; i < disks; i++) {
 		if ((rdev=conf->mirrors[i].rdev) != NULL &&
@@ -607,13 +645,21 @@ static int make_request(request_queue_t *q, struct bio * bio)
 				r1_bio->bios[i] = NULL;
 			} else
 				r1_bio->bios[i] = bio;
+			targets++;
 		} else
 			r1_bio->bios[i] = NULL;
 	}
 	rcu_read_unlock();
 
-	atomic_set(&r1_bio->remaining, 1);
+	if (targets < conf->raid_disks) {
+		/* array is degraded, we will not clear the bitmap
+		 * on I/O completion (see raid1_end_write_request) */
+		set_bit(R1BIO_Degraded, &r1_bio->state);
+	}
+
+	atomic_set(&r1_bio->remaining, 0);
 
+	bio_list_init(&bl);
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
 		if (!r1_bio->bios[i])
@@ -629,14 +675,23 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		mbio->bi_private = r1_bio;
 
 		atomic_inc(&r1_bio->remaining);
-		generic_make_request(mbio);
-	}
 
-	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		md_write_end(mddev);
-		raid_end_bio_io(r1_bio);
+		bio_list_add(&bl, mbio);
 	}
 
+	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors);
+	spin_lock_irqsave(&conf->device_lock, flags);
+	bio_list_merge(&conf->pending_bio_list, &bl);
+	bio_list_init(&bl);
+
+	blk_plug_device(mddev->queue);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+
+#if 0
+	while ((bio = bio_list_pop(&bl)) != NULL)
+		generic_make_request(bio);
+#endif
+
 	return 0;
 }
 
@@ -716,7 +771,7 @@ static void close_sync(conf_t *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
-			    conf->resync_lock, unplug_slaves(conf->mddev));
+			    conf->resync_lock, raid1_unplug(conf->mddev->queue));
 	spin_unlock_irq(&conf->resync_lock);
 
 	if (conf->barrier) BUG();
@@ -830,10 +885,11 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 	 * or re-read if the read failed.
 	 * We don't do much here, just schedule handling by raid1d
 	 */
-	if (!uptodate)
+	if (!uptodate) {
 		md_error(r1_bio->mddev,
 			 conf->mirrors[r1_bio->read_disk].rdev);
-	else
+		set_bit(R1BIO_Degraded, &r1_bio->state);
+	} else
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 	rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
 	reschedule_retry(r1_bio);
@@ -857,8 +913,10 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
 			mirror = i;
 			break;
 		}
-	if (!uptodate)
+	if (!uptodate) {
 		md_error(mddev, conf->mirrors[mirror].rdev);
+		set_bit(R1BIO_Degraded, &r1_bio->state);
+	}
 	update_head_pos(mirror, r1_bio);
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
@@ -878,6 +936,9 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 
 	bio = r1_bio->bios[r1_bio->read_disk];
 
+/*
+	if (r1_bio->sector == 0) printk("First sync write startss\n");
+*/
 	/*
 	 * schedule writes
 	 */
@@ -905,10 +966,12 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 		atomic_inc(&conf->mirrors[i].rdev->nr_pending);
 		atomic_inc(&r1_bio->remaining);
 		md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
+
 		generic_make_request(wbio);
 	}
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
+		/* if we're here, all write(s) have completed, so clean up */
 		md_done_sync(mddev, r1_bio->sectors, 1);
 		put_buf(r1_bio);
 	}
@@ -937,6 +1000,26 @@ static void raid1d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 		spin_lock_irqsave(&conf->device_lock, flags);
+
+		if (conf->pending_bio_list.head) {
+			bio = bio_list_get(&conf->pending_bio_list);
+			blk_remove_plug(mddev->queue);
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
+			if (bitmap_unplug(mddev->bitmap) != 0)
+				printk("%s: bitmap file write failed!\n", mdname(mddev));
+
+			while (bio) { /* submit pending writes */
+				struct bio *next = bio->bi_next;
+				bio->bi_next = NULL;
+				generic_make_request(bio);
+				bio = next;
+			}
+			unplug = 1;
+
+			continue;
+		}
+
 		if (list_empty(head))
 			break;
 		r1_bio = list_entry(head->prev, r1bio_t, retry_list);
@@ -1020,17 +1103,43 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	int disk;
 	int i;
 	int write_targets = 0;
+	int sync_blocks;
 
 	if (!conf->r1buf_pool)
+	{
+/*
+		printk("sync start - bitmap %p\n", mddev->bitmap);
+*/
 		if (init_resync(conf))
 			return 0;
+	}
 
 	max_sector = mddev->size << 1;
 	if (sector_nr >= max_sector) {
+		/* If we aborted, we need to abort the
+		 * sync on the 'current' bitmap chunk (there will
+		 * only be one in raid1 resync.
+		 * We can find the current addess in mddev->curr_resync
+		 */
+		if (!conf->fullsync) {
+			if (mddev->curr_resync < max_sector)
+				bitmap_end_sync(mddev->bitmap,
+						mddev->curr_resync,
+						&sync_blocks, 1);
+			bitmap_close_sync(mddev->bitmap);
+		}
+		if (mddev->curr_resync >= max_sector)
+			conf->fullsync = 0;
 		close_sync(conf);
 		return 0;
 	}
 
+	if (!conf->fullsync &&
+	    !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks)) {
+		/* We can skip this block, and probably several more */
+		*skipped = 1;
+		return sync_blocks;
+	}
 	/*
 	 * If there is non-resync activity waiting for us then
 	 * put in a delay to throttle resync.
@@ -1069,6 +1178,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 
 	r1_bio->mddev = mddev;
 	r1_bio->sector = sector_nr;
+	r1_bio->state = 0;
 	set_bit(R1BIO_IsSync, &r1_bio->state);
 	r1_bio->read_disk = disk;
 
@@ -1103,6 +1213,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 		bio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		bio->bi_private = r1_bio;
 	}
+
+	if (write_targets + 1 < conf->raid_disks)
+		/* array degraded, can't clear bitmap */
+		set_bit(R1BIO_Degraded, &r1_bio->state);
+
 	if (write_targets == 0) {
 		/* There is nowhere to write, so all non-sync
 		 * drives must be failed - so we are finished
@@ -1122,6 +1237,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 		len = (max_sector - sector_nr) << 9;
 		if (len == 0)
 			break;
+		if (!conf->fullsync && sync_blocks == 0)
+			if (!bitmap_start_sync(mddev->bitmap,
+					       sector_nr, &sync_blocks))
+				break;
+		if (sync_blocks < (PAGE_SIZE>>9))
+			BUG();
+		if (len > (sync_blocks<<9)) len = sync_blocks<<9;
+
 		for (i=0 ; i < conf->raid_disks; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io) {
@@ -1144,6 +1267,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 		}
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
+		sync_blocks -= (len>>9);
 	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
 bio_full:
 	bio = r1_bio->bios[disk];
@@ -1232,6 +1356,9 @@ static int run(mddev_t *mddev)
 	init_waitqueue_head(&conf->wait_idle);
 	init_waitqueue_head(&conf->wait_resume);
 
+	bio_list_init(&conf->pending_bio_list);
+	bio_list_init(&conf->flushing_bio_list);
+
 	if (!conf->working_disks) {
 		printk(KERN_ERR "raid1: no operational mirrors for %s\n",
 		       mdname(mddev));
@@ -1260,16 +1387,15 @@ static int run(mddev_t *mddev)
 	conf->last_used = j;
 
 
-
-	{
-		mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
-		if (!mddev->thread) {
-			printk(KERN_ERR
-			       "raid1: couldn't allocate thread for %s\n",
-			       mdname(mddev));
-			goto out_free_conf;
-		}
+	mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
+	if (!mddev->thread) {
+		printk(KERN_ERR
+		       "raid1: couldn't allocate thread for %s\n",
+		       mdname(mddev));
+		goto out_free_conf;
 	}
+	if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+
 	printk(KERN_INFO
 	       "raid1: raid set %s active with %d out of %d mirrors\n",
 	       mdname(mddev), mddev->raid_disks - mddev->degraded,
@@ -1394,7 +1520,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	spin_lock_irq(&conf->resync_lock);
 	conf->barrier++;
 	wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-			    conf->resync_lock, unplug_slaves(mddev));
+			    conf->resync_lock, raid1_unplug(mddev->queue));
 	spin_unlock_irq(&conf->resync_lock);
 
 	/* ok, everything is stopped */
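[Editor's note: the raid1d hunk above encodes the write-ahead rule that makes the intent log safe: make_request() only queues write bios on pending_bio_list, and raid1d pushes the dirty bitmap pages out with bitmap_unplug() before handing the queued bios to generic_make_request(). Below is a minimal userspace sketch of that ordering, again with hypothetical names (queue_write, flush_intent_log, raid1d_flush) and a pretend-durable flush in place of real disk I/O.

/* Hypothetical model of raid1d's write-ahead ordering: the intent log
 * must be durable before any data write it covers is submitted. */
#include <stdio.h>

#define MAX_PENDING 16

struct pending { int sector; };

static struct pending pending_list[MAX_PENDING];
static int n_pending;
static int log_durable;                 /* 1 once intent bits hit disk */

static void queue_write(int sector)     /* make_request(): defer the bio */
{
	if (n_pending < MAX_PENDING) {
		pending_list[n_pending].sector = sector;
		n_pending++;
	}
	log_durable = 0;                /* new dirty bits, log stale again */
}

static void flush_intent_log(void)      /* like bitmap_unplug() */
{
	log_durable = 1;                /* pretend fsync of bitmap pages */
}

static void raid1d_flush(void)          /* raid1d: flush log, then data */
{
	int i;

	flush_intent_log();             /* 1. intent bits reach disk */
	for (i = 0; i < n_pending; i++) /* 2. only now may data follow */
		printf("submit sector %d (log durable: %d)\n",
		       pending_list[i].sector, log_durable);
	n_pending = 0;
}

int main(void)
{
	queue_write(8);
	queue_write(72);
	raid1d_flush();
	return 0;
}

If the machine dies between the log flush and the data writes, the bits are conservatively dirty and resync simply rewrites those chunks; the reverse order could lose the record that a chunk was in flight.]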