author     Linus Torvalds <torvalds@linux-foundation.org>  2013-11-20 16:05:25 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-11-20 16:05:25 -0500
commit     6d6e352c80f22c446d933ca8103e02bac1f09129
tree       248a6a7ebc5ea95986da5bccdd6d75b255cf28e4
parent     b4789b8e6be3151a955ade74872822f30e8cd914
parent     60aaf933854511630e16be4efe0f96485e132de4
Merge tag 'md/3.13' of git://neil.brown.name/md
Pull md update from Neil Brown:
 "Mostly optimisations and obscure bug fixes.
   - raid5 gets less lock contention
   - raid1 gets less contention between normal-io and resync-io during
     resync"

* tag 'md/3.13' of git://neil.brown.name/md:
  md/raid5: Use conf->device_lock protect changing of multi-thread resources.
  md/raid5: Before freeing old multi-thread worker, it should flush them.
  md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE.
  UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
  raid1: Rewrite the implementation of iobarrier.
  raid1: Add some macros to make code clearly.
  raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
  raid1: Add a field array_frozen to indicate whether raid in freeze state.
  md: Convert use of typedef ctl_table to struct ctl_table
  md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
  md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
  md: fix some places where mddev_lock return value is not checked.
  raid5: Retry R5_ReadNoMerge flag when hit a read error.
  raid5: relieve lock contention in get_active_stripe()
  raid5: relieve lock contention in get_active_stripe()
  wait: add wait_event_cmd()
  md/raid5.c: add proper locking to error path of raid5_start_reshape.
  md: fix calculation of stacking limits on level change.
  raid5: Use slow_path to release stripe when mddev->thread is null
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c  162
1 file changed, 132 insertions(+), 30 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index af6681b19776..1e5a540995e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			   sector_t bi_sector);
 static void lower_barrier(struct r1conf *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 }
 
 #define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
+#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
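
The new macros derive the resync window from the block size and depth rather than hard-coding it. A quick standalone check (plain userspace C, not kernel code) confirms the values: the derived RESYNC_WINDOW is the same 2 MiB the old literal encoded, the window is 4096 sectors, and normal writes are kept NEXT_NORMALIO_DISTANCE = 12288 sectors (6 MiB) clear of the resync point.

/* Userspace sanity check of the new macro values; not kernel code. */
#include <assert.h>

#define RESYNC_BLOCK_SIZE      (64 * 1024)
#define RESYNC_DEPTH           32
#define RESYNC_WINDOW          (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS  (RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)

int main(void)
{
	assert(RESYNC_WINDOW == 2048 * 1024);    /* 2 MiB, same as before */
	assert(RESYNC_WINDOW_SECTORS == 4096);   /* 2 MiB / 512-byte sectors */
	assert(NEXT_NORMALIO_DISTANCE == 12288); /* 6 MiB worth of sectors */
	return 0;
}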
@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct bio *bio = r1_bio->master_bio;
 	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
+	sector_t start_next_window = r1_bio->start_next_window;
+	sector_t bi_sector = bio->bi_sector;
 
 	if (bio->bi_phys_segments) {
 		unsigned long flags;
@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		bio->bi_phys_segments--;
 		done = (bio->bi_phys_segments == 0);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/*
+		 * make_request() might be waiting for
+		 * bi_phys_segments to decrease
+		 */
+		wake_up(&conf->wait_barrier);
 	} else
 		done = 1;
 
@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	 * Wake up any possible resync thread that waits for the device
 	 * to go idle.
 	 */
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bi_sector);
 	}
 }
 
@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf)
  * there is no normal IO happeing.  It must arrange to call
  * lower_barrier when the particular background IO completes.
  */
-#define RESYNC_DEPTH 32
-
 static void raise_barrier(struct r1conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf)
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* Now wait for all pending IO to complete */
+	/* For these conditions we must wait:
+	 * A: while the array is in frozen state
+	 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
+	 *    the max count which allowed.
+	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
+	 *    next resync will reach to the window which normal bios are
+	 *    handling.
+	 */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    !conf->array_frozen &&
+			    conf->barrier < RESYNC_DEPTH &&
+			    (conf->start_next_window >=
+			     conf->next_resync + RESYNC_SECTORS),
 			    conf->resync_lock);
 
 	spin_unlock_irq(&conf->resync_lock);
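
raise_barrier() now sleeps on three conditions instead of waiting for all pending I/O to drain. A minimal sketch of that predicate (hypothetical userspace harness; the field names mirror the diff, everything else is elided):

/* Userspace model of the raise_barrier() wait condition; not kernel code.
 * Mirrors conditions A, B and C above: not frozen, barrier depth below
 * the cap, and the next resync chunk clear of the region where normal
 * writes are currently admitted. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RESYNC_DEPTH   32
#define RESYNC_SECTORS 128	/* one 64 KiB resync block in sectors */

static bool resync_may_proceed(bool array_frozen, int barrier,
			       uint64_t start_next_window,
			       uint64_t next_resync)
{
	return !array_frozen &&
	       barrier < RESYNC_DEPTH &&
	       start_next_window >= next_resync + RESYNC_SECTORS;
}

int main(void)
{
	/* resync at 4096 proceeds while writes start at 16384... */
	printf("%d\n", resync_may_proceed(false, 1, 16384, 4096));
	/* ...but not once it would run into the write window. */
	printf("%d\n", resync_may_proceed(false, 1, 16384, 16300));
	return 0;
}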
@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r1conf *conf)
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
 {
+	bool wait = false;
+
+	if (conf->array_frozen || !bio)
+		wait = true;
+	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync < RESYNC_WINDOW_SECTORS)
+			wait = true;
+		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
+				>= bio_end_sector(bio)) ||
+			 (conf->next_resync + NEXT_NORMALIO_DISTANCE
+				<= bio->bi_sector))
+			wait = false;
+		else
+			wait = true;
+	}
+
+	return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+{
+	sector_t sector = 0;
+
 	spin_lock_irq(&conf->resync_lock);
-	if (conf->barrier) {
+	if (need_to_wait_for_sync(conf, bio)) {
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
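
The positional test in need_to_wait_for_sync() is the heart of the rewrite: during a resync, only writes that land near the resync position have to wait. A userspace sketch of just that test (harness and values hypothetical; the conf->barrier and array_frozen checks are omitted here):

/* Userspace model of the positional check only; not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RESYNC_WINDOW_SECTORS  4096
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)

static bool write_must_wait(uint64_t next_resync,
			    uint64_t start, uint64_t end)
{
	if (next_resync < RESYNC_WINDOW_SECTORS)
		return true;	/* resync barely started: be conservative */
	if (next_resync - RESYNC_WINDOW_SECTORS >= end)
		return false;	/* write entirely behind the resync window */
	if (next_resync + NEXT_NORMALIO_DISTANCE <= start)
		return false;	/* write far ahead of the resync position */
	return true;		/* write overlaps the protected region */
}

int main(void)
{
	/* resync at sector 100000: a write at 50000 proceeds (0), one
	 * at 100100 must wait (1), one at 200000 proceeds (0). */
	printf("%d %d %d\n",
	       write_must_wait(100000, 50000, 50008),
	       write_must_wait(100000, 100100, 100108),
	       write_must_wait(100000, 200000, 200008));
	return 0;
}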
@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf)
 		 * count down.
 		 */
 		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (conf->nr_pending &&
+				    !conf->array_frozen &&
+				    (!conf->barrier ||
+				    ((conf->start_next_window <
+				      conf->next_resync + RESYNC_SECTORS) &&
 				     current->bio_list &&
-				     !bio_list_empty(current->bio_list)),
+				     !bio_list_empty(current->bio_list))),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 	}
+
+	if (bio && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync + NEXT_NORMALIO_DISTANCE
+		    <= bio->bi_sector) {
+			if (conf->start_next_window == MaxSector)
+				conf->start_next_window =
+					conf->next_resync +
+					NEXT_NORMALIO_DISTANCE;
+
+			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+			    <= bio->bi_sector)
+				conf->next_window_requests++;
+			else
+				conf->current_window_requests++;
+		}
+		if (bio->bi_sector >= conf->start_next_window)
+			sector = conf->start_next_window;
+	}
+
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
+	return sector;
 }
 
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector)
 {
 	unsigned long flags;
+
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->nr_pending--;
+	if (start_next_window) {
+		if (start_next_window == conf->start_next_window) {
+			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+			    <= bi_sector)
+				conf->next_window_requests--;
+			else
+				conf->current_window_requests--;
+		} else
+			conf->current_window_requests--;
+
+		if (!conf->current_window_requests) {
+			if (conf->next_window_requests) {
+				conf->current_window_requests =
+					conf->next_window_requests;
+				conf->next_window_requests = 0;
+				conf->start_next_window +=
+					NEXT_NORMALIO_DISTANCE;
+			} else
+				conf->start_next_window = MaxSector;
+		}
+	}
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
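
allow_barrier() now also retires the request from its window. A simplified single-threaded model of that book-keeping (hypothetical userspace code, no locking): each write is counted against either the current or the next window, and when the current window drains, the next one is promoted and start_next_window slides forward by NEXT_NORMALIO_DISTANCE.

/* Userspace model of the allow_barrier() window accounting; not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define NEXT_NORMALIO_DISTANCE 12288
#define MaxSector ((uint64_t)~0ULL)

struct window {
	uint64_t start_next_window;
	int current_window_requests;
	int next_window_requests;
};

static void window_put(struct window *w, uint64_t snw, uint64_t sector)
{
	if (!snw)
		return;			/* request never joined a window */
	if (snw == w->start_next_window &&
	    w->start_next_window + NEXT_NORMALIO_DISTANCE <= sector)
		w->next_window_requests--;
	else
		w->current_window_requests--;

	if (!w->current_window_requests) {
		if (w->next_window_requests) {
			/* promote the next window */
			w->current_window_requests = w->next_window_requests;
			w->next_window_requests = 0;
			w->start_next_window += NEXT_NORMALIO_DISTANCE;
		} else
			w->start_next_window = MaxSector; /* no writers */
	}
}

int main(void)
{
	struct window w = {
		.start_next_window = 20000,
		.current_window_requests = 1,
		.next_window_requests = 2,
	};
	window_put(&w, 20000, 20000);	/* last current-window write ends */
	/* prints start_next_window=32288 current=2 next=0 */
	printf("start_next_window=%llu current=%d next=%d\n",
	       (unsigned long long)w.start_next_window,
	       w.current_window_requests, w.next_window_requests);
	return 0;
}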
@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quite.
-	 * We increment barrier and nr_waiting, and then
-	 * wait until nr_pending match nr_queued+extra
+	 * We wait until nr_pending match nr_queued+extra
 	 * This is called in the context of one normal IO request
 	 * that has failed. Thus any sync request that might be pending
 	 * will be blocked by nr_pending, and we need to wait for
@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier++;
-	conf->nr_waiting++;
+	conf->array_frozen = 1;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
 				conf->nr_pending == conf->nr_queued+extra,
 				conf->resync_lock,
@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
-	conf->nr_waiting--;
+	conf->array_frozen = 0;
 	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
 }
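
With the dedicated array_frozen flag, freezing no longer overloads barrier and nr_waiting, and callers such as stop() and raid1_quiesce() further down switch to freeze_array()/unfreeze_array(). A minimal model of the freeze completion condition (hypothetical userspace sketch; kernel types and locking elided):

/* Userspace model of the freeze_array() completion test; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct conf_model {
	int  nr_pending;	/* normal I/O in flight */
	int  nr_queued;		/* failed I/O parked for the handler */
	bool array_frozen;	/* the new explicit freeze state */
};

/* freeze completes once everything in flight is parked on nr_queued
 * (plus 'extra' requests the freezing caller itself still holds). */
static bool freeze_complete(const struct conf_model *c, int extra)
{
	return c->nr_pending == c->nr_queued + extra;
}

int main(void)
{
	struct conf_model c = { .nr_pending = 3, .nr_queued = 2 };

	c.array_frozen = true;			/* freeze_array(conf, 1) */
	printf("complete? %d\n", freeze_complete(&c, 1)); /* 1: 3 == 2+1 */
	c.array_frozen = false;			/* unfreeze_array(conf) */
	return 0;
}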
@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
+	sector_t start_next_window;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 		finish_wait(&conf->wait_barrier, &w);
 	}
 
-	wait_barrier(conf);
+	start_next_window = wait_barrier(conf, bio);
 
 	bitmap = mddev->bitmap;
 
@@ -1163,6 +1247,7 @@ read_again:
 
 	disks = conf->raid_disks * 2;
  retry_write:
+	r1_bio->start_next_window = start_next_window;
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r1_bio->sectors;
@@ -1231,14 +1316,24 @@ read_again:
 	if (unlikely(blocked_rdev)) {
 		/* Wait for this device to become unblocked */
 		int j;
+		sector_t old = start_next_window;
 
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bio->bi_sector);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
+		start_next_window = wait_barrier(conf, bio);
+		/*
+		 * We must make sure the multi r1bios of bio have
+		 * the same value of bi_phys_segments
+		 */
+		if (bio->bi_phys_segments && old &&
+		    old != start_next_window)
+			/* Wait for the former r1bio(s) to complete */
+			wait_event(conf->wait_barrier,
+				   bio->bi_phys_segments == 1);
 		goto retry_write;
 	}
 
@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-	wait_barrier(conf);
-	allow_barrier(conf);
+	wait_barrier(conf, NULL);
+	allow_barrier(conf, 0, 0);
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
+
+	conf->next_resync = 0;
+	conf->start_next_window = MaxSector;
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	conf->start_next_window = MaxSector;
+	conf->current_window_requests = conf->next_window_requests = 0;
+
 	err = -EIO;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev)
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
 
-	raise_barrier(conf);
-	lower_barrier(conf);
+	freeze_array(conf, 0);
+	unfreeze_array(conf);
 
 	md_unregister_thread(&mddev->thread);
 	if (conf->r1bio_pool)
@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
 		wake_up(&conf->wait_barrier);
 		break;
 	case 1:
-		raise_barrier(conf);
+		freeze_array(conf, 0);
 		break;
 	case 0:
-		lower_barrier(conf);
+		unfreeze_array(conf);
 		break;
 	}
 }
@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev)
 		mddev->new_chunk_sectors = 0;
 		conf = setup_conf(mddev);
 		if (!IS_ERR(conf))
-			conf->barrier = 1;
+			/* Array must appear to be quiesced */
+			conf->array_frozen = 1;
 		return conf;
 	}
 	return ERR_PTR(-EINVAL);