author		majianpeng <majianpeng@gmail.com>	2013-11-15 01:55:02 -0500
committer	NeilBrown <neilb@suse.de>	2013-11-18 23:19:18 -0500
commit		79ef3a8aa1cb1523cc231c9a90a278333c21f761 (patch)
tree		f5895697b8f2153fe8c0af2ff6b90967e7ab6cbb /drivers/md/raid1.c
parent		8e005f7c0276317cfa8fcb0291a0df57f9ef832c (diff)
raid1: Rewrite the implementation of iobarrier.
There is an iobarrier in raid1 because of contention between normal IO and
resync IO.  It suspends all normal IO when resync/recovery happens.

However, if the normal IO is outside the resync window, there is no
contention.  So this patch changes the barrier mechanism to only block IO
that could contend with the resync that is currently happening.

We partition the whole space into five parts.

|---------|-----------|------------|----------------|-------|
        start    next_resync  start_next_window   end_window

start + RESYNC_WINDOW = next_resync
next_resync + NEXT_NORMALIO_DISTANCE = start_next_window
start_next_window + NEXT_NORMALIO_DISTANCE = end_window

Firstly we introduce some concepts:

1 - RESYNC_WINDOW: For resync, there are at most 32 resync requests at the
    same time.  A sync request is RESYNC_BLOCK_SIZE (64*1024) bytes, so the
    RESYNC_WINDOW is 32 * RESYNC_BLOCK_SIZE, that is 2MB.
2 - NEXT_NORMALIO_DISTANCE: the distance between next_resync and
    start_next_window.  It is also the distance between start_next_window
    and end_window.  It is currently 3 * RESYNC_WINDOW_SIZE but could be
    tuned if this turned out not to be optimal.
3 - next_resync: the next sector at which we will do sync IO.
4 - start: a position which is at most RESYNC_WINDOW before next_resync.
5 - start_next_window: a position which is NEXT_NORMALIO_DISTANCE beyond
    next_resync.  Normal IO after this position doesn't need to wait for
    resync IO to complete.
6 - end_window: a position which is 2 * NEXT_NORMALIO_DISTANCE beyond
    next_resync.  This also doesn't need to wait, but is counted
    differently.
7 - current_window_requests: the count of normal IO between
    start_next_window and end_window.
8 - next_window_requests: the count of normal IO after end_window.

Normal IO will be partitioned into four types:

NormIO1: the end sector of the bio is smaller than or equal to start.
NormIO2: the start sector of the bio is greater than or equal to
         end_window.
NormIO3: the start sector of the bio is greater than or equal to
         start_next_window.
NormIO4: the bio lies between start_next_window and end_window.

|--------|-----------|------------------|----------------|------------|
      start     next_resync     start_next_window    end_window
 NormIO1      NormIO4        NormIO4          NormIO3        NormIO2

For NormIO1, we don't need any io barrier.
For NormIO4, we use a similar approach to the original iobarrier
mechanism: the normal IO and the resync IO must be kept separate.
For NormIO2/3, we add two fields to struct r1conf,
"current_window_requests" and "next_window_requests", which count the
active requests in the two windows.  For these, we don't wait for resync
IO to complete.
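To make the classification concrete, here is a minimal user-space sketch of
how a write bio would fall into these four types.  It is an illustration
only, not code from the patch: classify_normal_io() and the enum are
invented names, the constants follow the values given above, and it assumes
next_resync >= RESYNC_WINDOW_SECTORS with start_next_window at its default
position of next_resync + NEXT_NORMALIO_DISTANCE.

#include <stdint.h>

typedef uint64_t sector_t;

#define RESYNC_BLOCK_SIZE       (64 * 1024)
#define RESYNC_WINDOW           (32 * RESYNC_BLOCK_SIZE)        /* 2MB */
#define RESYNC_WINDOW_SECTORS   (RESYNC_WINDOW >> 9)            /* 4096 */
#define NEXT_NORMALIO_DISTANCE  (3 * RESYNC_WINDOW_SECTORS)

enum norm_io { NORM_IO1, NORM_IO2, NORM_IO3, NORM_IO4 };

/* Classify a bio spanning [bi_sector, bi_sector + sectors). */
static enum norm_io classify_normal_io(sector_t bi_sector, sector_t sectors,
                                       sector_t next_resync)
{
        sector_t start = next_resync - RESYNC_WINDOW_SECTORS;
        sector_t start_next_window = next_resync + NEXT_NORMALIO_DISTANCE;
        sector_t end_window = start_next_window + NEXT_NORMALIO_DISTANCE;

        if (bi_sector + sectors <= start)
                return NORM_IO1;  /* needs no barrier at all */
        if (bi_sector >= end_window)
                return NORM_IO2;  /* counted in next_window_requests */
        if (bi_sector >= start_next_window)
                return NORM_IO3;  /* counted in current_window_requests */
        return NORM_IO4;          /* must wait, as with the old barrier */
}

In the patch itself these comparisons are only made for WRITE bios, folded
into need_to_wait_for_sync() and the counter updates in wait_barrier();
reads wait only while the array is frozen.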
For the resync action: if there are NormIO4s, we must wait for them; if
not, we can proceed.  But if the resync action reaches start_next_window
and current_window_requests > 0 (that is, there are NormIO3s), we must
wait until current_window_requests becomes zero.  When
current_window_requests becomes zero, start_next_window also moves
forward, and current_window_requests is replaced by next_window_requests.

There is a problem of when and how to change from NormIO2 to NormIO3;
only then can the sync action progress.  We add a field to struct r1conf,
"start_next_window":

A: if start_next_window == MaxSector, there are no NormIO2/3s, so we set
   start_next_window = next_resync + NEXT_NORMALIO_DISTANCE.
B: if current_window_requests == 0 && next_window_requests != 0,
   start_next_window moves to end_window.

There is another problem: how to differentiate an old NormIO2 (which has
become a NormIO3) from a new NormIO2.  For example, suppose there are many
bios which are NormIO2 and one bio which is NormIO3.  When the NormIO3
completes first, the NormIO2 bios become NormIO3s.

We add a field to struct r1bio, also called "start_next_window", to record
the value of conf->start_next_window at the time wait_barrier() is called
in make_request().  In allow_barrier() we compare it against
conf->start_next_window.  If r1_bio->start_next_window ==
conf->start_next_window, there was no transition between NormIO2 and
NormIO3.  If r1_bio->start_next_window != conf->start_next_window, there
was a transition; there can only have been one, so the bio must be an old
NormIO2.

For one bio there may be many r1bios, so we must make sure all the
r1bio->start_next_window values are the same.  If we meet a blocked device
in make_request(), we have to call allow_barrier() and then wait_barrier()
again, so the value of conf->start_next_window may change between the two
calls.  If the r1bios of one bio carried different start_next_window
values, the accounting for that bio would depend on whichever r1bio
completed last, which would be wrong.  To avoid this, we wait for the
previous r1bios to complete first.

Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
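Putting rules A and B together with the old-NormIO2 check, the release-side
bookkeeping can be summarised in a small self-contained user-space model.
Again, this is a sketch rather than the kernel code: model_conf,
model_release() and MODEL_MAX_SECTOR are invented names, and the
authoritative version is allow_barrier() in the diff below.

#include <stdint.h>

typedef uint64_t sector_t;

#define RESYNC_WINDOW_SECTORS   ((32 * 64 * 1024) >> 9)
#define NEXT_NORMALIO_DISTANCE  (3 * RESYNC_WINDOW_SECTORS)
#define MODEL_MAX_SECTOR        ((sector_t)~0ULL)  /* stands in for MaxSector */

struct model_conf {
        sector_t start_next_window;
        int current_window_requests;
        int next_window_requests;
};

/*
 * snw is the conf->start_next_window value that wait_barrier() returned
 * when the request started (the patch records it in r1_bio).
 */
static void model_release(struct model_conf *conf, sector_t snw,
                          sector_t bi_sector)
{
        if (!snw)
                return;  /* the request was never in a counted window */

        if (snw == conf->start_next_window) {
                /* no NormIO2 -> NormIO3 transition since the request began */
                if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
                    <= bi_sector)
                        conf->next_window_requests--;
                else
                        conf->current_window_requests--;
        } else {
                /* at most one transition can have happened, so this must
                 * be an old NormIO2 that is now a NormIO3 */
                conf->current_window_requests--;
        }

        if (!conf->current_window_requests) {
                if (conf->next_window_requests) {
                        /* rule B: start_next_window moves on to end_window */
                        conf->current_window_requests =
                                conf->next_window_requests;
                        conf->next_window_requests = 0;
                        conf->start_next_window += NEXT_NORMALIO_DISTANCE;
                } else {
                        /* nothing left in either window; rule A re-seeds
                         * start_next_window on the next counted request */
                        conf->start_next_window = MODEL_MAX_SECTOR;
                }
        }
}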
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--	drivers/md/raid1.c	128
1 file changed, 115 insertions, 13 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 78da3392f577..d9ee4edd7a53 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+                          sector_t bi_sector);
 static void lower_barrier(struct r1conf *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -227,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
         struct bio *bio = r1_bio->master_bio;
         int done;
         struct r1conf *conf = r1_bio->mddev->private;
+        sector_t start_next_window = r1_bio->start_next_window;
+        sector_t bi_sector = bio->bi_sector;
 
         if (bio->bi_phys_segments) {
                 unsigned long flags;
@@ -234,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
                 bio->bi_phys_segments--;
                 done = (bio->bi_phys_segments == 0);
                 spin_unlock_irqrestore(&conf->device_lock, flags);
+                /*
+                 * make_request() might be waiting for
+                 * bi_phys_segments to decrease
+                 */
+                wake_up(&conf->wait_barrier);
         } else
                 done = 1;
 
@@ -245,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
                 * Wake up any possible resync thread that waits for the device
                 * to go idle.
                 */
-                allow_barrier(conf);
+                allow_barrier(conf, start_next_window, bi_sector);
         }
 }
 
@@ -827,10 +835,19 @@ static void raise_barrier(struct r1conf *conf)
         /* block any new IO from starting */
         conf->barrier++;
 
-        /* Now wait for all pending IO to complete */
+        /* For these conditions we must wait:
+         * A: while the array is in frozen state
+         * B: while barrier >= RESYNC_DEPTH, meaning resync reach
+         *    the max count which allowed.
+         * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
+         *    next resync will reach to the window which normal bios are
+         *    handling.
+         */
         wait_event_lock_irq(conf->wait_barrier,
                             !conf->array_frozen &&
-                            !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+                            conf->barrier < RESYNC_DEPTH &&
+                            (conf->start_next_window >=
+                             conf->next_resync + RESYNC_SECTORS),
                             conf->resync_lock);
 
         spin_unlock_irq(&conf->resync_lock);
@@ -846,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
         wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r1conf *conf)
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
+{
+        bool wait = false;
+
+        if (conf->array_frozen || !bio)
+                wait = true;
+        else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+                if (conf->next_resync < RESYNC_WINDOW_SECTORS)
+                        wait = true;
+                else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
+                                >= bio_end_sector(bio)) ||
+                         (conf->next_resync + NEXT_NORMALIO_DISTANCE
+                                <= bio->bi_sector))
+                        wait = false;
+                else
+                        wait = true;
+        }
+
+        return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
 {
+        sector_t sector = 0;
+
         spin_lock_irq(&conf->resync_lock);
-        if (conf->barrier) {
+        if (need_to_wait_for_sync(conf, bio)) {
                 conf->nr_waiting++;
                 /* Wait for the barrier to drop.
                  * However if there are already pending
@@ -863,21 +903,65 @@ static void wait_barrier(struct r1conf *conf)
                 wait_event_lock_irq(conf->wait_barrier,
                                     !conf->array_frozen &&
                                     (!conf->barrier ||
-                                    (conf->nr_pending &&
+                                    ((conf->start_next_window <
+                                      conf->next_resync + RESYNC_SECTORS) &&
                                      current->bio_list &&
                                      !bio_list_empty(current->bio_list))),
                                     conf->resync_lock);
                 conf->nr_waiting--;
         }
+
+        if (bio && bio_data_dir(bio) == WRITE) {
+                if (conf->next_resync + NEXT_NORMALIO_DISTANCE
+                    <= bio->bi_sector) {
+                        if (conf->start_next_window == MaxSector)
+                                conf->start_next_window =
+                                        conf->next_resync +
+                                        NEXT_NORMALIO_DISTANCE;
+
+                        if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+                            <= bio->bi_sector)
+                                conf->next_window_requests++;
+                        else
+                                conf->current_window_requests++;
+                }
+                if (bio->bi_sector >= conf->start_next_window)
+                        sector = conf->start_next_window;
+        }
+
         conf->nr_pending++;
         spin_unlock_irq(&conf->resync_lock);
+        return sector;
 }
 
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+                          sector_t bi_sector)
 {
         unsigned long flags;
+
         spin_lock_irqsave(&conf->resync_lock, flags);
         conf->nr_pending--;
+        if (start_next_window) {
+                if (start_next_window == conf->start_next_window) {
+                        if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+                            <= bi_sector)
+                                conf->next_window_requests--;
+                        else
+                                conf->current_window_requests--;
+                } else
+                        conf->current_window_requests--;
+
+                if (!conf->current_window_requests) {
+                        if (conf->next_window_requests) {
+                                conf->current_window_requests =
+                                        conf->next_window_requests;
+                                conf->next_window_requests = 0;
+                                conf->start_next_window +=
+                                        NEXT_NORMALIO_DISTANCE;
+                        } else
+                                conf->start_next_window = MaxSector;
+                }
+        }
         spin_unlock_irqrestore(&conf->resync_lock, flags);
         wake_up(&conf->wait_barrier);
 }
@@ -1012,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         int first_clone;
         int sectors_handled;
         int max_sectors;
+        sector_t start_next_window;
 
         /*
          * Register the new request and wait if the reconstruction
@@ -1041,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
                 finish_wait(&conf->wait_barrier, &w);
         }
 
-        wait_barrier(conf);
+        start_next_window = wait_barrier(conf, bio);
 
         bitmap = mddev->bitmap;
 
@@ -1162,6 +1247,7 @@ read_again:
 
         disks = conf->raid_disks * 2;
  retry_write:
+        r1_bio->start_next_window = start_next_window;
         blocked_rdev = NULL;
         rcu_read_lock();
         max_sectors = r1_bio->sectors;
@@ -1230,14 +1316,24 @@ read_again:
         if (unlikely(blocked_rdev)) {
                 /* Wait for this device to become unblocked */
                 int j;
+                sector_t old = start_next_window;
 
                 for (j = 0; j < i; j++)
                         if (r1_bio->bios[j])
                                 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
                 r1_bio->state = 0;
-                allow_barrier(conf);
+                allow_barrier(conf, start_next_window, bio->bi_sector);
                 md_wait_for_blocked_rdev(blocked_rdev, mddev);
-                wait_barrier(conf);
+                start_next_window = wait_barrier(conf, bio);
+                /*
+                 * We must make sure the multi r1bios of bio have
+                 * the same value of bi_phys_segments
+                 */
+                if (bio->bi_phys_segments && old &&
+                    old != start_next_window)
+                        /* Wait for the former r1bio(s) to complete */
+                        wait_event(conf->wait_barrier,
+                                   bio->bi_phys_segments == 1);
                 goto retry_write;
         }
 
@@ -1437,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-        wait_barrier(conf);
-        allow_barrier(conf);
+        wait_barrier(conf, NULL);
+        allow_barrier(conf, 0, 0);
 
         mempool_destroy(conf->r1buf_pool);
         conf->r1buf_pool = NULL;
+
+        conf->next_resync = 0;
+        conf->start_next_window = MaxSector;
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -2713,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
         conf->pending_count = 0;
         conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+        conf->start_next_window = MaxSector;
+        conf->current_window_requests = conf->next_window_requests = 0;
+
         err = -EIO;
         for (i = 0; i < conf->raid_disks * 2; i++) {
 