Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c | 473
1 file changed, 261 insertions, 212 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ad5c9483bd50..40297fd17f7e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -71,9 +71,8 @@
71 */ 71 */
72static int max_queued_requests = 1024; 72static int max_queued_requests = 1024;
73 73
74static void allow_barrier(struct r1conf *conf, sector_t start_next_window, 74static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
75 sector_t bi_sector); 75static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
76static void lower_barrier(struct r1conf *conf);
77 76
78#define raid1_log(md, fmt, args...) \ 77#define raid1_log(md, fmt, args...) \
79 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) 78 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
@@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
100#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) 99#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
101#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) 100#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
102#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) 101#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
103#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
104 102
105static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 103static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
106{ 104{
@@ -215,7 +213,7 @@ static void put_buf(struct r1bio *r1_bio)
215 213
216 mempool_free(r1_bio, conf->r1buf_pool); 214 mempool_free(r1_bio, conf->r1buf_pool);
217 215
218 lower_barrier(conf); 216 lower_barrier(conf, r1_bio->sector);
219} 217}
220 218
221static void reschedule_retry(struct r1bio *r1_bio) 219static void reschedule_retry(struct r1bio *r1_bio)
@@ -223,10 +221,12 @@ static void reschedule_retry(struct r1bio *r1_bio)
223 unsigned long flags; 221 unsigned long flags;
224 struct mddev *mddev = r1_bio->mddev; 222 struct mddev *mddev = r1_bio->mddev;
225 struct r1conf *conf = mddev->private; 223 struct r1conf *conf = mddev->private;
224 int idx;
226 225
226 idx = sector_to_idx(r1_bio->sector);
227 spin_lock_irqsave(&conf->device_lock, flags); 227 spin_lock_irqsave(&conf->device_lock, flags);
228 list_add(&r1_bio->retry_list, &conf->retry_list); 228 list_add(&r1_bio->retry_list, &conf->retry_list);
229 conf->nr_queued ++; 229 conf->nr_queued[idx]++;
230 spin_unlock_irqrestore(&conf->device_lock, flags); 230 spin_unlock_irqrestore(&conf->device_lock, flags);
231 231
232 wake_up(&conf->wait_barrier); 232 wake_up(&conf->wait_barrier);
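reschedule_retry() above is the first user of sector_to_idx(); that helper and the BARRIER_* constants live in raid1.h and are not part of this diff. As a rough, assumed sketch of the idea (userspace C, with a toy hash standing in for the kernel's hash_long() and assumed constant values): a sector's barrier-unit number is hashed down to one of BARRIER_BUCKETS_NR per-bucket counters.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Assumed values; the real definitions are in raid1.h, not in this diff:
 * 64MB barrier units (1 << 17 sectors of 512 bytes) hashed into
 * 1 << 10 buckets. */
#define BARRIER_UNIT_SECTOR_BITS 17
#define BARRIER_BUCKETS_NR_BITS  10
#define BARRIER_BUCKETS_NR       (1 << BARRIER_BUCKETS_NR_BITS)

/* Toy stand-in for the kernel's hash_long(); illustration only. */
static int toy_hash(uint64_t val, unsigned int bits)
{
        return (int)((val * 0x9E3779B97F4A7C15ULL) >> (64 - bits));
}

static int sector_to_idx(sector_t sector)
{
        /* sectors in the same barrier unit land on the same counter */
        return toy_hash(sector >> BARRIER_UNIT_SECTOR_BITS,
                        BARRIER_BUCKETS_NR_BITS);
}

int main(void)
{
        sector_t samples[] = { 0, 4096, 1 << 17, (sector_t)500 << 17 };
        int i;

        for (i = 0; i < 4; i++)
                printf("sector %llu -> bucket %d of %d\n",
                       (unsigned long long)samples[i],
                       sector_to_idx(samples[i]), BARRIER_BUCKETS_NR);
        return 0;
}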
@@ -243,7 +243,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
243 struct bio *bio = r1_bio->master_bio; 243 struct bio *bio = r1_bio->master_bio;
244 int done; 244 int done;
245 struct r1conf *conf = r1_bio->mddev->private; 245 struct r1conf *conf = r1_bio->mddev->private;
246 sector_t start_next_window = r1_bio->start_next_window;
247 sector_t bi_sector = bio->bi_iter.bi_sector; 246 sector_t bi_sector = bio->bi_iter.bi_sector;
248 247
249 if (bio->bi_phys_segments) { 248 if (bio->bi_phys_segments) {
@@ -269,7 +268,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
269 * Wake up any possible resync thread that waits for the device 268 * Wake up any possible resync thread that waits for the device
270 * to go idle. 269 * to go idle.
271 */ 270 */
272 allow_barrier(conf, start_next_window, bi_sector); 271 allow_barrier(conf, bi_sector);
273 } 272 }
274} 273}
275 274
@@ -517,6 +516,25 @@ static void raid1_end_write_request(struct bio *bio)
517 bio_put(to_put); 516 bio_put(to_put);
518} 517}
519 518
519static sector_t align_to_barrier_unit_end(sector_t start_sector,
520 sector_t sectors)
521{
522 sector_t len;
523
524 WARN_ON(sectors == 0);
525 /*
526 * len is the number of sectors from start_sector to end of the
527 * barrier unit which start_sector belongs to.
528 */
529 len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
530 start_sector;
531
532 if (len > sectors)
533 len = sectors;
534
535 return len;
536}
537
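A quick standalone check of the helper above (BARRIER_UNIT_SECTOR_SIZE is defined in raid1.h rather than in this diff; 1 << 17 sectors, i.e. 64MB, is assumed here):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

#define BARRIER_UNIT_SECTOR_SIZE (1ULL << 17)        /* assumed value */
/* kernel-style round_up() for power-of-two alignment */
#define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)

static sector_t align_to_barrier_unit_end(sector_t start_sector,
                                          sector_t sectors)
{
        sector_t len;

        /* sectors from start_sector to the end of its barrier unit */
        len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
                start_sector;
        if (len > sectors)
                len = sectors;
        return len;
}

int main(void)
{
        /* 100 sectors before a unit boundary: the request is clamped to
         * 100 sectors so it never crosses into the next unit */
        printf("%llu\n", (unsigned long long)
               align_to_barrier_unit_end(BARRIER_UNIT_SECTOR_SIZE - 100,
                                         1024));   /* prints 100 */
        /* exactly on a boundary: the full 1024 sectors fit in one unit */
        printf("%llu\n", (unsigned long long)
               align_to_barrier_unit_end(BARRIER_UNIT_SECTOR_SIZE,
                                         1024));   /* prints 1024 */
        return 0;
}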
520/* 538/*
521 * This routine returns the disk from which the requested read should 539 * This routine returns the disk from which the requested read should
522 * be done. There is a per-array 'next expected sequential IO' sector 540 * be done. There is a per-array 'next expected sequential IO' sector
@@ -813,168 +831,168 @@ static void flush_pending_writes(struct r1conf *conf)
813 */ 831 */
814static void raise_barrier(struct r1conf *conf, sector_t sector_nr) 832static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
815{ 833{
834 int idx = sector_to_idx(sector_nr);
835
816 spin_lock_irq(&conf->resync_lock); 836 spin_lock_irq(&conf->resync_lock);
817 837
818 /* Wait until no block IO is waiting */ 838 /* Wait until no block IO is waiting */
819 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, 839 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting[idx],
820 conf->resync_lock); 840 conf->resync_lock);
821 841
822 /* block any new IO from starting */ 842 /* block any new IO from starting */
823 conf->barrier++; 843 conf->barrier[idx]++;
824 conf->next_resync = sector_nr;
825 844
826 /* For these conditions we must wait: 845 /* For these conditions we must wait:
827 * A: while the array is in frozen state 846 * A: while the array is in frozen state
828 * B: while barrier >= RESYNC_DEPTH, meaning resync reach 847 * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
829 * the max count which allowed. 848 * exists in the corresponding I/O barrier bucket.
830 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning 849 * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning the
831 * next resync will reach to the window which normal bios are 850 * max allowed resync count is reached on the current I/O barrier bucket.
832 * handling.
833 * D: while there are any active requests in the current window.
834 */ 851 */
835 wait_event_lock_irq(conf->wait_barrier, 852 wait_event_lock_irq(conf->wait_barrier,
836 !conf->array_frozen && 853 !conf->array_frozen &&
837 conf->barrier < RESYNC_DEPTH && 854 !conf->nr_pending[idx] &&
838 conf->current_window_requests == 0 && 855 conf->barrier[idx] < RESYNC_DEPTH,
839 (conf->start_next_window >=
840 conf->next_resync + RESYNC_SECTORS),
841 conf->resync_lock); 856 conf->resync_lock);
842 857
843 conf->nr_pending++; 858 conf->nr_pending[idx]++;
844 spin_unlock_irq(&conf->resync_lock); 859 spin_unlock_irq(&conf->resync_lock);
845} 860}
846 861
847static void lower_barrier(struct r1conf *conf) 862static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
848{ 863{
849 unsigned long flags; 864 unsigned long flags;
850 BUG_ON(conf->barrier <= 0); 865 int idx = sector_to_idx(sector_nr);
866
867 BUG_ON(conf->barrier[idx] <= 0);
868
851 spin_lock_irqsave(&conf->resync_lock, flags); 869 spin_lock_irqsave(&conf->resync_lock, flags);
852 conf->barrier--; 870 conf->barrier[idx]--;
853 conf->nr_pending--; 871 conf->nr_pending[idx]--;
854 spin_unlock_irqrestore(&conf->resync_lock, flags); 872 spin_unlock_irqrestore(&conf->resync_lock, flags);
855 wake_up(&conf->wait_barrier); 873 wake_up(&conf->wait_barrier);
856} 874}
857 875
858static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) 876static void _wait_barrier(struct r1conf *conf, int idx)
859{ 877{
860 bool wait = false; 878 spin_lock_irq(&conf->resync_lock);
861 879 if (conf->array_frozen || conf->barrier[idx]) {
862 if (conf->array_frozen || !bio) 880 conf->nr_waiting[idx]++;
863 wait = true; 881 /* Wait for the barrier to drop. */
864 else if (conf->barrier && bio_data_dir(bio) == WRITE) { 882 wait_event_lock_irq(
865 if ((conf->mddev->curr_resync_completed 883 conf->wait_barrier,
866 >= bio_end_sector(bio)) || 884 !conf->array_frozen && !conf->barrier[idx],
867 (conf->start_next_window + NEXT_NORMALIO_DISTANCE 885 conf->resync_lock);
868 <= bio->bi_iter.bi_sector)) 886 conf->nr_waiting[idx]--;
869 wait = false;
870 else
871 wait = true;
872 } 887 }
873 888
874 return wait; 889 conf->nr_pending[idx]++;
890 spin_unlock_irq(&conf->resync_lock);
875} 891}
876 892
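To make the per-bucket wait conditions in raise_barrier() and _wait_barrier() concrete, here is a minimal single-threaded model (no locking or wakeups, and the nr_waiting bookkeeping is omitted; RESYNC_DEPTH and the bucket count are assumed values, not taken from this diff). The point of the patch shows up in the last line: a barrier raised on one bucket no longer blocks regular I/O aimed at a different bucket.

#include <stdio.h>
#include <stdbool.h>

#define BARRIER_BUCKETS_NR 1024   /* assumed */
#define RESYNC_DEPTH       32     /* assumed */

static int  nr_pending[BARRIER_BUCKETS_NR];
static int  barrier[BARRIER_BUCKETS_NR];
static bool array_frozen;

/* condition raise_barrier() waits for on bucket idx */
static bool resync_may_proceed(int idx)
{
        return !array_frozen && !nr_pending[idx] &&
               barrier[idx] < RESYNC_DEPTH;
}

/* condition _wait_barrier() waits for before letting regular I/O in */
static bool io_may_proceed(int idx)
{
        return !array_frozen && !barrier[idx];
}

int main(void)
{
        int idx = 7;

        nr_pending[idx]++;      /* a regular write is in flight */
        printf("resync may start on bucket %d: %d\n",
               idx, resync_may_proceed(idx));              /* 0 */

        nr_pending[idx]--;      /* the write finishes (allow_barrier) */
        barrier[idx]++;         /* resync raises the barrier */
        printf("new I/O may enter bucket %d: %d\n",
               idx, io_may_proceed(idx));                  /* 0 */
        printf("new I/O may enter bucket %d: %d\n",
               idx + 1, io_may_proceed(idx + 1));          /* 1 */
        return 0;
}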
877static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) 893static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
878{ 894{
879 sector_t sector = 0; 895 int idx = sector_to_idx(sector_nr);
880 896
881 spin_lock_irq(&conf->resync_lock); 897 spin_lock_irq(&conf->resync_lock);
882 if (need_to_wait_for_sync(conf, bio)) { 898 if (conf->array_frozen) {
883 conf->nr_waiting++; 899 conf->nr_waiting[idx]++;
884 /* Wait for the barrier to drop. 900 /* Wait for array to unfreeze */
885 * However if there are already pending 901 wait_event_lock_irq(
886 * requests (preventing the barrier from 902 conf->wait_barrier,
887 * rising completely), and the 903 !conf->array_frozen,
888 * per-process bio queue isn't empty, 904 conf->resync_lock);
889 * then don't wait, as we need to empty 905 conf->nr_waiting[idx]--;
890 * that queue to allow conf->start_next_window
891 * to increase.
892 */
893 raid1_log(conf->mddev, "wait barrier");
894 wait_event_lock_irq(conf->wait_barrier,
895 !conf->array_frozen &&
896 (!conf->barrier ||
897 ((conf->start_next_window <
898 conf->next_resync + RESYNC_SECTORS) &&
899 current->bio_list &&
900 !bio_list_empty(current->bio_list))),
901 conf->resync_lock);
902 conf->nr_waiting--;
903 }
904
905 if (bio && bio_data_dir(bio) == WRITE) {
906 if (bio->bi_iter.bi_sector >= conf->next_resync) {
907 if (conf->start_next_window == MaxSector)
908 conf->start_next_window =
909 conf->next_resync +
910 NEXT_NORMALIO_DISTANCE;
911
912 if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
913 <= bio->bi_iter.bi_sector)
914 conf->next_window_requests++;
915 else
916 conf->current_window_requests++;
917 sector = conf->start_next_window;
918 }
919 } 906 }
920 907
921 conf->nr_pending++; 908 conf->nr_pending[idx]++;
922 spin_unlock_irq(&conf->resync_lock); 909 spin_unlock_irq(&conf->resync_lock);
923 return sector;
924} 910}
925 911
926static void allow_barrier(struct r1conf *conf, sector_t start_next_window, 912static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
927 sector_t bi_sector) 913{
914 int idx = sector_to_idx(sector_nr);
915
916 _wait_barrier(conf, idx);
917}
918
919static void wait_all_barriers(struct r1conf *conf)
920{
921 int idx;
922
923 for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
924 _wait_barrier(conf, idx);
925}
926
927static void _allow_barrier(struct r1conf *conf, int idx)
928{ 928{
929 unsigned long flags; 929 unsigned long flags;
930 930
931 spin_lock_irqsave(&conf->resync_lock, flags); 931 spin_lock_irqsave(&conf->resync_lock, flags);
932 conf->nr_pending--; 932 conf->nr_pending[idx]--;
933 if (start_next_window) {
934 if (start_next_window == conf->start_next_window) {
935 if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
936 <= bi_sector)
937 conf->next_window_requests--;
938 else
939 conf->current_window_requests--;
940 } else
941 conf->current_window_requests--;
942
943 if (!conf->current_window_requests) {
944 if (conf->next_window_requests) {
945 conf->current_window_requests =
946 conf->next_window_requests;
947 conf->next_window_requests = 0;
948 conf->start_next_window +=
949 NEXT_NORMALIO_DISTANCE;
950 } else
951 conf->start_next_window = MaxSector;
952 }
953 }
954 spin_unlock_irqrestore(&conf->resync_lock, flags); 933 spin_unlock_irqrestore(&conf->resync_lock, flags);
955 wake_up(&conf->wait_barrier); 934 wake_up(&conf->wait_barrier);
956} 935}
957 936
937static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
938{
939 int idx = sector_to_idx(sector_nr);
940
941 _allow_barrier(conf, idx);
942}
943
944static void allow_all_barriers(struct r1conf *conf)
945{
946 int idx;
947
948 for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
949 _allow_barrier(conf, idx);
950}
951
952/* conf->resync_lock should be held */
953static int get_unqueued_pending(struct r1conf *conf)
954{
955 int idx, ret;
956
957 for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
958 ret += conf->nr_pending[idx] - conf->nr_queued[idx];
959
960 return ret;
961}
962
958static void freeze_array(struct r1conf *conf, int extra) 963static void freeze_array(struct r1conf *conf, int extra)
959{ 964{
960 /* stop syncio and normal IO and wait for everything to 965 /* Stop sync I/O and normal I/O and wait for everything to
961 * go quiet. 966 * go quiet.
962 * We wait until nr_pending match nr_queued+extra 967 * This is called in two situations:
963 * This is called in the context of one normal IO request 968 * 1) management command handlers (reshape, remove disk, quiesce).
964 * that has failed. Thus any sync request that might be pending 969 * 2) one normal I/O request failed.
965 * will be blocked by nr_pending, and we need to wait for 970
966 * pending IO requests to complete or be queued for re-try. 971 * After array_frozen is set to 1, new sync IO will be blocked at
967 * Thus the number queued (nr_queued) plus this request (extra) 972 * raise_barrier(), and new normal I/O will be blocked at _wait_barrier()
968 * must match the number of pending IOs (nr_pending) before 973 * or wait_read_barrier(). The in-flight I/Os will either complete or be
969 * we continue. 974 * queued. When everything goes quiet, there are only queued I/Os left.
975
976 * Every in-flight I/O contributes to conf->nr_pending[idx], where idx is
977 * the barrier bucket index which this I/O request hits. When all sync and
978 * normal I/O are queued, sum of all conf->nr_pending[] will match sum
979 * of all conf->nr_queued[]. But normal I/O failure is an exception,
980 * in handle_read_error(), we may call freeze_array() before trying to
981 * fix the read error. In this case, the error read I/O is not queued,
982 * so get_unqueued_pending() == 1.
983 *
984 * Therefore before this function returns, we need to wait until
985 * get_unqueued_pending(conf) becomes equal to extra. For
986 * normal I/O context, extra is 1; in all other situations extra is 0.
970 */ 987 */
971 spin_lock_irq(&conf->resync_lock); 988 spin_lock_irq(&conf->resync_lock);
972 conf->array_frozen = 1; 989 conf->array_frozen = 1;
973 raid1_log(conf->mddev, "wait freeze"); 990 raid1_log(conf->mddev, "wait freeze");
974 wait_event_lock_irq_cmd(conf->wait_barrier, 991 wait_event_lock_irq_cmd(
975 conf->nr_pending == conf->nr_queued+extra, 992 conf->wait_barrier,
976 conf->resync_lock, 993 get_unqueued_pending(conf) == extra,
977 flush_pending_writes(conf)); 994 conf->resync_lock,
995 flush_pending_writes(conf));
978 spin_unlock_irq(&conf->resync_lock); 996 spin_unlock_irq(&conf->resync_lock);
979} 997}
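A tiny worked example of the accounting freeze_array() now waits on, following get_unqueued_pending() above (userspace arithmetic only; the bucket count is assumed):

#include <stdio.h>

#define BARRIER_BUCKETS_NR 1024   /* assumed */

static int nr_pending[BARRIER_BUCKETS_NR];
static int nr_queued[BARRIER_BUCKETS_NR];

static int get_unqueued_pending(void)
{
        int idx, ret;

        for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
                ret += nr_pending[idx] - nr_queued[idx];
        return ret;
}

int main(void)
{
        /* Three I/Os in flight across two buckets; two of them already
         * queued for retry.  The one left unqueued is the failed read
         * whose error path called freeze_array() with extra == 1, so the
         * wait condition get_unqueued_pending() == extra is satisfied. */
        nr_pending[3] = 2;  nr_queued[3] = 1;
        nr_pending[9] = 1;  nr_queued[9] = 1;

        printf("unqueued pending = %d\n", get_unqueued_pending());  /* 1 */
        return 0;
}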
980static void unfreeze_array(struct r1conf *conf) 998static void unfreeze_array(struct r1conf *conf)
@@ -1070,11 +1088,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
1070 kfree(plug); 1088 kfree(plug);
1071} 1089}
1072 1090
1073static void raid1_read_request(struct mddev *mddev, struct bio *bio, 1091static inline struct r1bio *
1074 struct r1bio *r1_bio) 1092alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
1093{
1094 struct r1conf *conf = mddev->private;
1095 struct r1bio *r1_bio;
1096
1097 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1098
1099 r1_bio->master_bio = bio;
1100 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1101 r1_bio->state = 0;
1102 r1_bio->mddev = mddev;
1103 r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1104
1105 return r1_bio;
1106}
1107
1108static void raid1_read_request(struct mddev *mddev, struct bio *bio)
1075{ 1109{
1076 struct r1conf *conf = mddev->private; 1110 struct r1conf *conf = mddev->private;
1077 struct raid1_info *mirror; 1111 struct raid1_info *mirror;
1112 struct r1bio *r1_bio;
1078 struct bio *read_bio; 1113 struct bio *read_bio;
1079 struct bitmap *bitmap = mddev->bitmap; 1114 struct bitmap *bitmap = mddev->bitmap;
1080 const int op = bio_op(bio); 1115 const int op = bio_op(bio);
@@ -1083,8 +1118,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
1083 int max_sectors; 1118 int max_sectors;
1084 int rdisk; 1119 int rdisk;
1085 1120
1086 wait_barrier(conf, bio); 1121 /*
1122 * Still need barrier for READ in case the whole
1123 * array is frozen.
1124 */
1125 wait_read_barrier(conf, bio->bi_iter.bi_sector);
1126
1127 r1_bio = alloc_r1bio(mddev, bio, 0);
1087 1128
1129 /*
1130 * We might need to issue multiple reads to different
1131 * devices if there are bad blocks around, so we keep
1132 * track of the number of reads in bio->bi_phys_segments.
1133 * If this is 0, there is only one r1_bio and no locking
1134 * will be needed when requests complete. If it is
1135 * non-zero, then it is the number of not-completed requests.
1136 */
1137 bio->bi_phys_segments = 0;
1138 bio_clear_flag(bio, BIO_SEG_VALID);
1139
1140 /*
1141 * make_request() can abort the operation when read-ahead is being
1142 * used and no empty request is available.
1143 */
1088read_again: 1144read_again:
1089 rdisk = read_balance(conf, r1_bio, &max_sectors); 1145 rdisk = read_balance(conf, r1_bio, &max_sectors);
1090 1146
@@ -1106,7 +1162,6 @@ read_again:
1106 atomic_read(&bitmap->behind_writes) == 0); 1162 atomic_read(&bitmap->behind_writes) == 0);
1107 } 1163 }
1108 r1_bio->read_disk = rdisk; 1164 r1_bio->read_disk = rdisk;
1109 r1_bio->start_next_window = 0;
1110 1165
1111 read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); 1166 read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
1112 bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, 1167 bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
@@ -1151,22 +1206,16 @@ read_again:
1151 */ 1206 */
1152 reschedule_retry(r1_bio); 1207 reschedule_retry(r1_bio);
1153 1208
1154 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1209 r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
1155
1156 r1_bio->master_bio = bio;
1157 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1158 r1_bio->state = 0;
1159 r1_bio->mddev = mddev;
1160 r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1161 goto read_again; 1210 goto read_again;
1162 } else 1211 } else
1163 generic_make_request(read_bio); 1212 generic_make_request(read_bio);
1164} 1213}
1165 1214
1166static void raid1_write_request(struct mddev *mddev, struct bio *bio, 1215static void raid1_write_request(struct mddev *mddev, struct bio *bio)
1167 struct r1bio *r1_bio)
1168{ 1216{
1169 struct r1conf *conf = mddev->private; 1217 struct r1conf *conf = mddev->private;
1218 struct r1bio *r1_bio;
1170 int i, disks; 1219 int i, disks;
1171 struct bitmap *bitmap = mddev->bitmap; 1220 struct bitmap *bitmap = mddev->bitmap;
1172 unsigned long flags; 1221 unsigned long flags;
@@ -1180,7 +1229,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1180 int first_clone; 1229 int first_clone;
1181 int sectors_handled; 1230 int sectors_handled;
1182 int max_sectors; 1231 int max_sectors;
1183 sector_t start_next_window;
1184 1232
1185 /* 1233 /*
1186 * Register the new request and wait if the reconstruction 1234 * Register the new request and wait if the reconstruction
@@ -1216,7 +1264,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1216 } 1264 }
1217 finish_wait(&conf->wait_barrier, &w); 1265 finish_wait(&conf->wait_barrier, &w);
1218 } 1266 }
1219 start_next_window = wait_barrier(conf, bio); 1267 wait_barrier(conf, bio->bi_iter.bi_sector);
1268
1269 r1_bio = alloc_r1bio(mddev, bio, 0);
1270
1271 /* We might need to issue multiple writes to different
1272 * devices if there are bad blocks around, so we keep
1273 * track of the number of writes in bio->bi_phys_segments.
1274 * If this is 0, there is only one r1_bio and no locking
1275 * will be needed when requests complete. If it is
1276 * non-zero, then it is the number of not-completed requests.
1277 */
1278 bio->bi_phys_segments = 0;
1279 bio_clear_flag(bio, BIO_SEG_VALID);
1220 1280
1221 if (conf->pending_count >= max_queued_requests) { 1281 if (conf->pending_count >= max_queued_requests) {
1222 md_wakeup_thread(mddev->thread); 1282 md_wakeup_thread(mddev->thread);
@@ -1237,7 +1297,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1237 1297
1238 disks = conf->raid_disks * 2; 1298 disks = conf->raid_disks * 2;
1239 retry_write: 1299 retry_write:
1240 r1_bio->start_next_window = start_next_window;
1241 blocked_rdev = NULL; 1300 blocked_rdev = NULL;
1242 rcu_read_lock(); 1301 rcu_read_lock();
1243 max_sectors = r1_bio->sectors; 1302 max_sectors = r1_bio->sectors;
@@ -1304,25 +1363,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1304 if (unlikely(blocked_rdev)) { 1363 if (unlikely(blocked_rdev)) {
1305 /* Wait for this device to become unblocked */ 1364 /* Wait for this device to become unblocked */
1306 int j; 1365 int j;
1307 sector_t old = start_next_window;
1308 1366
1309 for (j = 0; j < i; j++) 1367 for (j = 0; j < i; j++)
1310 if (r1_bio->bios[j]) 1368 if (r1_bio->bios[j])
1311 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1369 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1312 r1_bio->state = 0; 1370 r1_bio->state = 0;
1313 allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); 1371 allow_barrier(conf, bio->bi_iter.bi_sector);
1314 raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); 1372 raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1315 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1373 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1316 start_next_window = wait_barrier(conf, bio); 1374 wait_barrier(conf, bio->bi_iter.bi_sector);
1317 /*
1318 * We must make sure the multi r1bios of bio have
1319 * the same value of bi_phys_segments
1320 */
1321 if (bio->bi_phys_segments && old &&
1322 old != start_next_window)
1323 /* Wait for the former r1bio(s) to complete */
1324 wait_event(conf->wait_barrier,
1325 bio->bi_phys_segments == 1);
1326 goto retry_write; 1375 goto retry_write;
1327 } 1376 }
1328 1377
@@ -1440,12 +1489,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1440 /* We need another r1_bio. It has already been counted 1489 /* We need another r1_bio. It has already been counted
1441 * in bio->bi_phys_segments 1490 * in bio->bi_phys_segments
1442 */ 1491 */
1443 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1492 r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
1444 r1_bio->master_bio = bio;
1445 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1446 r1_bio->state = 0;
1447 r1_bio->mddev = mddev;
1448 r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1449 goto retry_write; 1493 goto retry_write;
1450 } 1494 }
1451 1495
@@ -1457,36 +1501,25 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1457 1501
1458static void raid1_make_request(struct mddev *mddev, struct bio *bio) 1502static void raid1_make_request(struct mddev *mddev, struct bio *bio)
1459{ 1503{
1460 struct r1conf *conf = mddev->private; 1504 struct bio *split;
1461 struct r1bio *r1_bio; 1505 sector_t sectors;
1462 1506
1463 /* 1507 /* if bio exceeds barrier unit boundary, split it */
1464 * make_request() can abort the operation when read-ahead is being 1508 do {
1465 * used and no empty request is available. 1509 sectors = align_to_barrier_unit_end(
1466 * 1510 bio->bi_iter.bi_sector, bio_sectors(bio));
1467 */ 1511 if (sectors < bio_sectors(bio)) {
1468 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1512 split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
1469 1513 bio_chain(split, bio);
1470 r1_bio->master_bio = bio; 1514 } else {
1471 r1_bio->sectors = bio_sectors(bio); 1515 split = bio;
1472 r1_bio->state = 0; 1516 }
1473 r1_bio->mddev = mddev;
1474 r1_bio->sector = bio->bi_iter.bi_sector;
1475
1476 /*
1477 * We might need to issue multiple reads to different devices if there
1478 * are bad blocks around, so we keep track of the number of reads in
1479 * bio->bi_phys_segments. If this is 0, there is only one r1_bio and
1480 * no locking will be needed when requests complete. If it is
1481 * non-zero, then it is the number of not-completed requests.
1482 */
1483 bio->bi_phys_segments = 0;
1484 bio_clear_flag(bio, BIO_SEG_VALID);
1485 1517
1486 if (bio_data_dir(bio) == READ) 1518 if (bio_data_dir(split) == READ)
1487 raid1_read_request(mddev, bio, r1_bio); 1519 raid1_read_request(mddev, split);
1488 else 1520 else
1489 raid1_write_request(mddev, bio, r1_bio); 1521 raid1_write_request(mddev, split);
1522 } while (split != bio);
1490} 1523}
1491 1524
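For illustration, a standalone walk-through of the split loop above for a request that crosses one barrier-unit boundary (bio_split()/bio_chain() replaced by plain arithmetic; BARRIER_UNIT_SECTOR_SIZE again assumed to be 1 << 17 sectors):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

#define BARRIER_UNIT_SECTOR_SIZE (1ULL << 17)        /* assumed value */
#define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)

static sector_t align_to_barrier_unit_end(sector_t start, sector_t sectors)
{
        sector_t len = round_up(start + 1, BARRIER_UNIT_SECTOR_SIZE) - start;

        return len < sectors ? len : sectors;
}

int main(void)
{
        /* a 1MB bio (2048 sectors) starting 512 sectors before a
         * barrier-unit boundary */
        sector_t start = BARRIER_UNIT_SECTOR_SIZE - 512;
        sector_t left  = 2048;

        while (left) {
                sector_t chunk = align_to_barrier_unit_end(start, left);

                /* in the driver each piece becomes its own r1_bio, so
                 * every r1_bio maps to exactly one barrier bucket */
                printf("submit %llu sectors at %llu\n",
                       (unsigned long long)chunk,
                       (unsigned long long)start);
                start += chunk;
                left  -= chunk;
        }
        return 0;
}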
1492static void raid1_status(struct seq_file *seq, struct mddev *mddev) 1525static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -1577,19 +1610,11 @@ static void print_conf(struct r1conf *conf)
1577 1610
1578static void close_sync(struct r1conf *conf) 1611static void close_sync(struct r1conf *conf)
1579{ 1612{
1580 wait_barrier(conf, NULL); 1613 wait_all_barriers(conf);
1581 allow_barrier(conf, 0, 0); 1614 allow_all_barriers(conf);
1582 1615
1583 mempool_destroy(conf->r1buf_pool); 1616 mempool_destroy(conf->r1buf_pool);
1584 conf->r1buf_pool = NULL; 1617 conf->r1buf_pool = NULL;
1585
1586 spin_lock_irq(&conf->resync_lock);
1587 conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE;
1588 conf->start_next_window = MaxSector;
1589 conf->current_window_requests +=
1590 conf->next_window_requests;
1591 conf->next_window_requests = 0;
1592 spin_unlock_irq(&conf->resync_lock);
1593} 1618}
1594 1619
1595static int raid1_spare_active(struct mddev *mddev) 1620static int raid1_spare_active(struct mddev *mddev)
@@ -2337,8 +2362,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
2337 2362
2338static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) 2363static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
2339{ 2364{
2340 int m; 2365 int m, idx;
2341 bool fail = false; 2366 bool fail = false;
2367
2342 for (m = 0; m < conf->raid_disks * 2 ; m++) 2368 for (m = 0; m < conf->raid_disks * 2 ; m++)
2343 if (r1_bio->bios[m] == IO_MADE_GOOD) { 2369 if (r1_bio->bios[m] == IO_MADE_GOOD) {
2344 struct md_rdev *rdev = conf->mirrors[m].rdev; 2370 struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2364,7 +2390,8 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
2364 if (fail) { 2390 if (fail) {
2365 spin_lock_irq(&conf->device_lock); 2391 spin_lock_irq(&conf->device_lock);
2366 list_add(&r1_bio->retry_list, &conf->bio_end_io_list); 2392 list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
2367 conf->nr_queued++; 2393 idx = sector_to_idx(r1_bio->sector);
2394 conf->nr_queued[idx]++;
2368 spin_unlock_irq(&conf->device_lock); 2395 spin_unlock_irq(&conf->device_lock);
2369 md_wakeup_thread(conf->mddev->thread); 2396 md_wakeup_thread(conf->mddev->thread);
2370 } else { 2397 } else {
@@ -2460,15 +2487,8 @@ read_more:
2460 generic_make_request(bio); 2487 generic_make_request(bio);
2461 bio = NULL; 2488 bio = NULL;
2462 2489
2463 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 2490 r1_bio = alloc_r1bio(mddev, mbio, sectors_handled);
2464
2465 r1_bio->master_bio = mbio;
2466 r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
2467 r1_bio->state = 0;
2468 set_bit(R1BIO_ReadError, &r1_bio->state); 2491 set_bit(R1BIO_ReadError, &r1_bio->state);
2469 r1_bio->mddev = mddev;
2470 r1_bio->sector = mbio->bi_iter.bi_sector +
2471 sectors_handled;
2472 2492
2473 goto read_more; 2493 goto read_more;
2474 } else { 2494 } else {
@@ -2487,6 +2507,7 @@ static void raid1d(struct md_thread *thread)
2487 struct r1conf *conf = mddev->private; 2507 struct r1conf *conf = mddev->private;
2488 struct list_head *head = &conf->retry_list; 2508 struct list_head *head = &conf->retry_list;
2489 struct blk_plug plug; 2509 struct blk_plug plug;
2510 int idx;
2490 2511
2491 md_check_recovery(mddev); 2512 md_check_recovery(mddev);
2492 2513
@@ -2494,17 +2515,17 @@ static void raid1d(struct md_thread *thread)
2494 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 2515 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2495 LIST_HEAD(tmp); 2516 LIST_HEAD(tmp);
2496 spin_lock_irqsave(&conf->device_lock, flags); 2517 spin_lock_irqsave(&conf->device_lock, flags);
2497 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 2518 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
2498 while (!list_empty(&conf->bio_end_io_list)) { 2519 list_splice_init(&conf->bio_end_io_list, &tmp);
2499 list_move(conf->bio_end_io_list.prev, &tmp);
2500 conf->nr_queued--;
2501 }
2502 }
2503 spin_unlock_irqrestore(&conf->device_lock, flags); 2520 spin_unlock_irqrestore(&conf->device_lock, flags);
2504 while (!list_empty(&tmp)) { 2521 while (!list_empty(&tmp)) {
2505 r1_bio = list_first_entry(&tmp, struct r1bio, 2522 r1_bio = list_first_entry(&tmp, struct r1bio,
2506 retry_list); 2523 retry_list);
2507 list_del(&r1_bio->retry_list); 2524 list_del(&r1_bio->retry_list);
2525 idx = sector_to_idx(r1_bio->sector);
2526 spin_lock_irqsave(&conf->device_lock, flags);
2527 conf->nr_queued[idx]--;
2528 spin_unlock_irqrestore(&conf->device_lock, flags);
2508 if (mddev->degraded) 2529 if (mddev->degraded)
2509 set_bit(R1BIO_Degraded, &r1_bio->state); 2530 set_bit(R1BIO_Degraded, &r1_bio->state);
2510 if (test_bit(R1BIO_WriteError, &r1_bio->state)) 2531 if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2525,7 +2546,8 @@ static void raid1d(struct md_thread *thread)
2525 } 2546 }
2526 r1_bio = list_entry(head->prev, struct r1bio, retry_list); 2547 r1_bio = list_entry(head->prev, struct r1bio, retry_list);
2527 list_del(head->prev); 2548 list_del(head->prev);
2528 conf->nr_queued--; 2549 idx = sector_to_idx(r1_bio->sector);
2550 conf->nr_queued[idx]--;
2529 spin_unlock_irqrestore(&conf->device_lock, flags); 2551 spin_unlock_irqrestore(&conf->device_lock, flags);
2530 2552
2531 mddev = r1_bio->mddev; 2553 mddev = r1_bio->mddev;
@@ -2564,7 +2586,6 @@ static int init_resync(struct r1conf *conf)
2564 conf->poolinfo); 2586 conf->poolinfo);
2565 if (!conf->r1buf_pool) 2587 if (!conf->r1buf_pool)
2566 return -ENOMEM; 2588 return -ENOMEM;
2567 conf->next_resync = 0;
2568 return 0; 2589 return 0;
2569} 2590}
2570 2591
@@ -2593,6 +2614,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2593 int still_degraded = 0; 2614 int still_degraded = 0;
2594 int good_sectors = RESYNC_SECTORS; 2615 int good_sectors = RESYNC_SECTORS;
2595 int min_bad = 0; /* number of sectors that are bad in all devices */ 2616 int min_bad = 0; /* number of sectors that are bad in all devices */
2617 int idx = sector_to_idx(sector_nr);
2596 2618
2597 if (!conf->r1buf_pool) 2619 if (!conf->r1buf_pool)
2598 if (init_resync(conf)) 2620 if (init_resync(conf))
@@ -2642,7 +2664,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2642 * If there is non-resync activity waiting for a turn, then let it 2664 * If there is non-resync activity waiting for a turn, then let it
2643 * though before starting on this new sync request. 2665 * though before starting on this new sync request.
2644 */ 2666 */
2645 if (conf->nr_waiting) 2667 if (conf->nr_waiting[idx])
2646 schedule_timeout_uninterruptible(1); 2668 schedule_timeout_uninterruptible(1);
2647 2669
2648 /* we are incrementing sector_nr below. To be safe, we check against 2670 /* we are incrementing sector_nr below. To be safe, we check against
@@ -2669,6 +2691,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2669 r1_bio->sector = sector_nr; 2691 r1_bio->sector = sector_nr;
2670 r1_bio->state = 0; 2692 r1_bio->state = 0;
2671 set_bit(R1BIO_IsSync, &r1_bio->state); 2693 set_bit(R1BIO_IsSync, &r1_bio->state);
2694 /* make sure good_sectors won't go across barrier unit boundary */
2695 good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
2672 2696
2673 for (i = 0; i < conf->raid_disks * 2; i++) { 2697 for (i = 0; i < conf->raid_disks * 2; i++) {
2674 struct md_rdev *rdev; 2698 struct md_rdev *rdev;
@@ -2899,6 +2923,26 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2899 if (!conf) 2923 if (!conf)
2900 goto abort; 2924 goto abort;
2901 2925
2926 conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
2927 sizeof(int), GFP_KERNEL);
2928 if (!conf->nr_pending)
2929 goto abort;
2930
2931 conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
2932 sizeof(int), GFP_KERNEL);
2933 if (!conf->nr_waiting)
2934 goto abort;
2935
2936 conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
2937 sizeof(int), GFP_KERNEL);
2938 if (!conf->nr_queued)
2939 goto abort;
2940
2941 conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
2942 sizeof(int), GFP_KERNEL);
2943 if (!conf->barrier)
2944 goto abort;
2945
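These four allocations pair with the r1conf fields changing from single counters to per-bucket arrays; that struct change lives in raid1.h and is not part of this diff, so the following is only an assumed sketch of its shape, with calloc() as a userspace stand-in for the kcalloc() calls above:

#include <stdio.h>
#include <stdlib.h>

#define BARRIER_BUCKETS_NR 1024   /* assumed; the real value comes from raid1.h */

/* Assumed shape: the former scalar nr_pending/nr_waiting/nr_queued/barrier
 * counters become one slot per barrier bucket, allocated in setup_conf()
 * and freed in raid1_free(), as the hunks above and below show. */
struct bucket_counters {
        int *nr_pending;
        int *nr_waiting;
        int *nr_queued;
        int *barrier;
};

int main(void)
{
        struct bucket_counters c;

        c.nr_pending = calloc(BARRIER_BUCKETS_NR, sizeof(int));
        c.nr_waiting = calloc(BARRIER_BUCKETS_NR, sizeof(int));
        c.nr_queued  = calloc(BARRIER_BUCKETS_NR, sizeof(int));
        c.barrier    = calloc(BARRIER_BUCKETS_NR, sizeof(int));
        if (!c.nr_pending || !c.nr_waiting || !c.nr_queued || !c.barrier) {
                fprintf(stderr, "allocation failed\n");
                return 1;
        }
        printf("allocated 4 x %d bucket counters\n", BARRIER_BUCKETS_NR);

        free(c.nr_pending);
        free(c.nr_waiting);
        free(c.nr_queued);
        free(c.barrier);
        return 0;
}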
2902 conf->mirrors = kzalloc(sizeof(struct raid1_info) 2946 conf->mirrors = kzalloc(sizeof(struct raid1_info)
2903 * mddev->raid_disks * 2, 2947 * mddev->raid_disks * 2,
2904 GFP_KERNEL); 2948 GFP_KERNEL);
@@ -2954,9 +2998,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2954 conf->pending_count = 0; 2998 conf->pending_count = 0;
2955 conf->recovery_disabled = mddev->recovery_disabled - 1; 2999 conf->recovery_disabled = mddev->recovery_disabled - 1;
2956 3000
2957 conf->start_next_window = MaxSector;
2958 conf->current_window_requests = conf->next_window_requests = 0;
2959
2960 err = -EIO; 3001 err = -EIO;
2961 for (i = 0; i < conf->raid_disks * 2; i++) { 3002 for (i = 0; i < conf->raid_disks * 2; i++) {
2962 3003
@@ -2999,6 +3040,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2999 kfree(conf->mirrors); 3040 kfree(conf->mirrors);
3000 safe_put_page(conf->tmppage); 3041 safe_put_page(conf->tmppage);
3001 kfree(conf->poolinfo); 3042 kfree(conf->poolinfo);
3043 kfree(conf->nr_pending);
3044 kfree(conf->nr_waiting);
3045 kfree(conf->nr_queued);
3046 kfree(conf->barrier);
3002 kfree(conf); 3047 kfree(conf);
3003 } 3048 }
3004 return ERR_PTR(err); 3049 return ERR_PTR(err);
@@ -3100,6 +3145,10 @@ static void raid1_free(struct mddev *mddev, void *priv)
3100 kfree(conf->mirrors); 3145 kfree(conf->mirrors);
3101 safe_put_page(conf->tmppage); 3146 safe_put_page(conf->tmppage);
3102 kfree(conf->poolinfo); 3147 kfree(conf->poolinfo);
3148 kfree(conf->nr_pending);
3149 kfree(conf->nr_waiting);
3150 kfree(conf->nr_queued);
3151 kfree(conf->barrier);
3103 kfree(conf); 3152 kfree(conf);
3104} 3153}
3105 3154