path: root/drivers/md/raid1.c
author		Linus Torvalds <torvalds@linux-foundation.org>	2017-02-24 17:42:19 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-02-24 17:42:19 -0500
commit		a682e0035494c449e53a57d039f86f75b9e2fe67 (patch)
tree		382d6c2d4729e6ed8f697fd528209a2b4701b618 /drivers/md/raid1.c
parent		1802979ab1ee8ec5a72987ad518f5a91bf41cd89 (diff)
parent		1ec492232ed659acde8cc00b9ecc7529778e03e1 (diff)
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull md updates from Shaohua Li:
 "Mainly fixes bugs and improves performance:

  - Improve scalability for raid1 from Coly

  - Improve raid5-cache read performance, disk efficiency and IO pattern
    from Song and me

  - Fix a race condition of disk hotplug for linear from Coly

  - A few cleanup patches from Ming and Byungchul

  - Fix a memory leak from Neil

  - Fix WRITE SAME IO failure from me

  - Add doc for raid5-cache from me"

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (23 commits)
  md/raid1: fix write behind issues introduced by bio_clone_bioset_partial
  md/raid1: handle flush request correctly
  md/linear: shutup lockdep warnning
  md/raid1: fix a use-after-free bug
  RAID1: avoid unnecessary spin locks in I/O barrier code
  RAID1: a new I/O barrier implementation to remove resync window
  md/raid5: Don't reinvent the wheel but use existing llist API
  md: fast clone bio in bio_clone_mddev()
  md: remove unnecessary check on mddev
  md/raid1: use bio_clone_bioset_partial() in case of write behind
  md: fail if mddev->bio_set can't be created
  block: introduce bio_clone_bioset_partial()
  md: disable WRITE SAME if it fails in underlayer disks
  md/raid5-cache: exclude reclaiming stripes in reclaim check
  md/raid5-cache: stripe reclaim only counts valid stripes
  MD: add doc for raid5-cache
  Documentation: move MD related doc into a separate dir
  md: ensure md devices are freed before module is unloaded.
  md/r5cache: improve journal device efficiency
  md/r5cache: enable chunk_aligned_read with write back cache
  ...
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--	drivers/md/raid1.c	596
1 file changed, 366 insertions, 230 deletions
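
The raid1 scalability change in this pull ("RAID1: a new I/O barrier implementation to remove resync window") replaces the single sliding resync window with per-bucket barrier counters. The bucket mapping itself, sector_to_idx() and the BARRIER_* constants, lives in the companion drivers/md/raid1.h change and is not shown in this file's diff; a rough sketch of what the code below assumes (reconstructed from the series, so treat the exact values as an assumption) looks like:

	/* Sketch only - see drivers/md/raid1.h in the same series.
	 * Each 64MB "barrier unit" of the array is hashed into one of
	 * BARRIER_BUCKETS_NR buckets, and every bucket gets its own
	 * barrier/nr_pending/nr_waiting/nr_queued counter instead of
	 * one global set. */
	#define BARRIER_UNIT_SECTOR_BITS	17
	#define BARRIER_UNIT_SECTOR_SIZE	(1<<17)
	#define BARRIER_BUCKETS_NR_BITS		(PAGE_SHIFT - ilog2(sizeof(atomic_t)))
	#define BARRIER_BUCKETS_NR		(1<<BARRIER_BUCKETS_NR_BITS)

	static inline int sector_to_idx(sector_t sector)
	{
		return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
				 BARRIER_BUCKETS_NR_BITS);
	}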
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 830ff2b20346..7453d94eeed7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -71,9 +71,8 @@
71 */ 71 */
72static int max_queued_requests = 1024; 72static int max_queued_requests = 1024;
73 73
74static void allow_barrier(struct r1conf *conf, sector_t start_next_window, 74static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
75 sector_t bi_sector); 75static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
76static void lower_barrier(struct r1conf *conf);
77 76
78#define raid1_log(md, fmt, args...) \ 77#define raid1_log(md, fmt, args...) \
79 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) 78 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
@@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
100#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) 99#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
101#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) 100#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
102#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) 101#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
103#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
104 102
105static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 103static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
106{ 104{
@@ -205,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio)
205static void put_buf(struct r1bio *r1_bio) 203static void put_buf(struct r1bio *r1_bio)
206{ 204{
207 struct r1conf *conf = r1_bio->mddev->private; 205 struct r1conf *conf = r1_bio->mddev->private;
206 sector_t sect = r1_bio->sector;
208 int i; 207 int i;
209 208
210 for (i = 0; i < conf->raid_disks * 2; i++) { 209 for (i = 0; i < conf->raid_disks * 2; i++) {
@@ -215,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio)
215 214
216 mempool_free(r1_bio, conf->r1buf_pool); 215 mempool_free(r1_bio, conf->r1buf_pool);
217 216
218 lower_barrier(conf); 217 lower_barrier(conf, sect);
219} 218}
220 219
221static void reschedule_retry(struct r1bio *r1_bio) 220static void reschedule_retry(struct r1bio *r1_bio)
@@ -223,10 +222,12 @@ static void reschedule_retry(struct r1bio *r1_bio)
223 unsigned long flags; 222 unsigned long flags;
224 struct mddev *mddev = r1_bio->mddev; 223 struct mddev *mddev = r1_bio->mddev;
225 struct r1conf *conf = mddev->private; 224 struct r1conf *conf = mddev->private;
225 int idx;
226 226
227 idx = sector_to_idx(r1_bio->sector);
227 spin_lock_irqsave(&conf->device_lock, flags); 228 spin_lock_irqsave(&conf->device_lock, flags);
228 list_add(&r1_bio->retry_list, &conf->retry_list); 229 list_add(&r1_bio->retry_list, &conf->retry_list);
229 conf->nr_queued ++; 230 atomic_inc(&conf->nr_queued[idx]);
230 spin_unlock_irqrestore(&conf->device_lock, flags); 231 spin_unlock_irqrestore(&conf->device_lock, flags);
231 232
232 wake_up(&conf->wait_barrier); 233 wake_up(&conf->wait_barrier);
@@ -243,7 +244,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
243 struct bio *bio = r1_bio->master_bio; 244 struct bio *bio = r1_bio->master_bio;
244 int done; 245 int done;
245 struct r1conf *conf = r1_bio->mddev->private; 246 struct r1conf *conf = r1_bio->mddev->private;
246 sector_t start_next_window = r1_bio->start_next_window;
247 sector_t bi_sector = bio->bi_iter.bi_sector; 247 sector_t bi_sector = bio->bi_iter.bi_sector;
248 248
249 if (bio->bi_phys_segments) { 249 if (bio->bi_phys_segments) {
@@ -269,7 +269,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
269 * Wake up any possible resync thread that waits for the device 269 * Wake up any possible resync thread that waits for the device
270 * to go idle. 270 * to go idle.
271 */ 271 */
272 allow_barrier(conf, start_next_window, bi_sector); 272 allow_barrier(conf, bi_sector);
273 } 273 }
274} 274}
275 275
@@ -517,6 +517,25 @@ static void raid1_end_write_request(struct bio *bio)
517 bio_put(to_put); 517 bio_put(to_put);
518} 518}
519 519
520static sector_t align_to_barrier_unit_end(sector_t start_sector,
521 sector_t sectors)
522{
523 sector_t len;
524
525 WARN_ON(sectors == 0);
526 /*
527 * len is the number of sectors from start_sector to end of the
528 * barrier unit which start_sector belongs to.
529 */
530 len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
531 start_sector;
532
533 if (len > sectors)
534 len = sectors;
535
536 return len;
537}
538
520/* 539/*
521 * This routine returns the disk from which the requested read should 540 * This routine returns the disk from which the requested read should
522 * be done. There is a per-array 'next expected sequential IO' sector 541 * be done. There is a per-array 'next expected sequential IO' sector
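
A quick worked example of align_to_barrier_unit_end() above, using hypothetical numbers and the 64MB barrier unit (BARRIER_UNIT_SECTOR_SIZE == 131072 sectors) sketched near the top of this page:

	/* Illustration only, not part of the patch. */
	sector_t len = align_to_barrier_unit_end(130000, 8192);
	/* round_up(130000 + 1, 131072) == 131072, so
	 * len == 131072 - 130000 == 1072 (< 8192): the caller handles 1072
	 * sectors now and leaves the rest for the next barrier unit. */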
@@ -813,168 +832,228 @@ static void flush_pending_writes(struct r1conf *conf)
813 */ 832 */
814static void raise_barrier(struct r1conf *conf, sector_t sector_nr) 833static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
815{ 834{
835 int idx = sector_to_idx(sector_nr);
836
816 spin_lock_irq(&conf->resync_lock); 837 spin_lock_irq(&conf->resync_lock);
817 838
818 /* Wait until no block IO is waiting */ 839 /* Wait until no block IO is waiting */
819 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, 840 wait_event_lock_irq(conf->wait_barrier,
841 !atomic_read(&conf->nr_waiting[idx]),
820 conf->resync_lock); 842 conf->resync_lock);
821 843
822 /* block any new IO from starting */ 844 /* block any new IO from starting */
823 conf->barrier++; 845 atomic_inc(&conf->barrier[idx]);
824 conf->next_resync = sector_nr; 846 /*
847 * In raise_barrier() we firstly increase conf->barrier[idx] then
848 * check conf->nr_pending[idx]. In _wait_barrier() we firstly
849 * increase conf->nr_pending[idx] then check conf->barrier[idx].
850 * A memory barrier is needed here to make sure conf->nr_pending[idx] won't
851 * be fetched before conf->barrier[idx] is increased. Otherwise
852 * there will be a race between raise_barrier() and _wait_barrier().
853 */
854 smp_mb__after_atomic();
825 855
826 /* For these conditions we must wait: 856 /* For these conditions we must wait:
827 * A: while the array is in frozen state 857 * A: while the array is in frozen state
828 * B: while barrier >= RESYNC_DEPTH, meaning resync reach 858 * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
829 * the max count which allowed. 859 * exists in the corresponding I/O barrier bucket.
830 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning 860 * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning it reaches
831 * next resync will reach to the window which normal bios are 861 * the max resync count allowed on the current I/O barrier bucket.
832 * handling.
833 * D: while there are any active requests in the current window.
834 */ 862 */
835 wait_event_lock_irq(conf->wait_barrier, 863 wait_event_lock_irq(conf->wait_barrier,
836 !conf->array_frozen && 864 !conf->array_frozen &&
837 conf->barrier < RESYNC_DEPTH && 865 !atomic_read(&conf->nr_pending[idx]) &&
838 conf->current_window_requests == 0 && 866 atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
839 (conf->start_next_window >=
840 conf->next_resync + RESYNC_SECTORS),
841 conf->resync_lock); 867 conf->resync_lock);
842 868
843 conf->nr_pending++; 869 atomic_inc(&conf->nr_pending[idx]);
844 spin_unlock_irq(&conf->resync_lock); 870 spin_unlock_irq(&conf->resync_lock);
845} 871}
846 872
847static void lower_barrier(struct r1conf *conf) 873static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
848{ 874{
849 unsigned long flags; 875 int idx = sector_to_idx(sector_nr);
850 BUG_ON(conf->barrier <= 0); 876
851 spin_lock_irqsave(&conf->resync_lock, flags); 877 BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
852 conf->barrier--; 878
853 conf->nr_pending--; 879 atomic_dec(&conf->barrier[idx]);
854 spin_unlock_irqrestore(&conf->resync_lock, flags); 880 atomic_dec(&conf->nr_pending[idx]);
855 wake_up(&conf->wait_barrier); 881 wake_up(&conf->wait_barrier);
856} 882}
857 883
858static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) 884static void _wait_barrier(struct r1conf *conf, int idx)
859{ 885{
860 bool wait = false; 886 /*
887 * We need to increase conf->nr_pending[idx] very early here,
888 * then raise_barrier() can be blocked when it waits for
889 * conf->nr_pending[idx] to be 0. Then we can avoid holding
890 * conf->resync_lock when there is no barrier raised in same
891 * barrier unit bucket. Also if the array is frozen, I/O
892 * should be blocked until array is unfrozen.
893 */
894 atomic_inc(&conf->nr_pending[idx]);
895 /*
896 * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
897 * check conf->barrier[idx]. In raise_barrier() we firstly increase
898 * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
899 * barrier is necessary here to make sure conf->barrier[idx] won't be
900 * fetched before conf->nr_pending[idx] is increased. Otherwise there
901 * will be a race between _wait_barrier() and raise_barrier().
902 */
903 smp_mb__after_atomic();
861 904
862 if (conf->array_frozen || !bio) 905 /*
863 wait = true; 906 * Don't worry about checking two atomic_t variables at the same time
864 else if (conf->barrier && bio_data_dir(bio) == WRITE) { 907 * here. If, while we check conf->barrier[idx], the array is
865 if ((conf->mddev->curr_resync_completed 908 * frozen (conf->array_frozen is 1), and conf->barrier[idx] is
866 >= bio_end_sector(bio)) || 909 * 0, it is safe to return and make the I/O continue. Because the
867 (conf->start_next_window + NEXT_NORMALIO_DISTANCE 910 * array is frozen, all I/O returned here will eventually complete
868 <= bio->bi_iter.bi_sector)) 911 * or be queued, no race will happen. See code comment in
869 wait = false; 912 * freeze_array().
870 else 913 */
871 wait = true; 914 if (!READ_ONCE(conf->array_frozen) &&
872 } 915 !atomic_read(&conf->barrier[idx]))
916 return;
873 917
874 return wait; 918 /*
919 * After holding conf->resync_lock, conf->nr_pending[idx]
920 * should be decreased before waiting for barrier to drop.
921 * Otherwise, we may encounter a race condition because
922 * raise_barrier() might be waiting for conf->nr_pending[idx]
923 * to be 0 at the same time.
924 */
925 spin_lock_irq(&conf->resync_lock);
926 atomic_inc(&conf->nr_waiting[idx]);
927 atomic_dec(&conf->nr_pending[idx]);
928 /*
929 * In case freeze_array() is waiting for
930 * get_unqueued_pending() == extra
931 */
932 wake_up(&conf->wait_barrier);
933 /* Wait for the barrier in same barrier unit bucket to drop. */
934 wait_event_lock_irq(conf->wait_barrier,
935 !conf->array_frozen &&
936 !atomic_read(&conf->barrier[idx]),
937 conf->resync_lock);
938 atomic_inc(&conf->nr_pending[idx]);
939 atomic_dec(&conf->nr_waiting[idx]);
940 spin_unlock_irq(&conf->resync_lock);
875} 941}
876 942
877static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) 943static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
878{ 944{
879 sector_t sector = 0; 945 int idx = sector_to_idx(sector_nr);
880 946
881 spin_lock_irq(&conf->resync_lock); 947 /*
882 if (need_to_wait_for_sync(conf, bio)) { 948 * Very similar to _wait_barrier(). The difference is, for read
883 conf->nr_waiting++; 949 * I/O we don't need to wait for sync I/O, but if the whole array
884 /* Wait for the barrier to drop. 950 * is frozen, the read I/O still has to wait until the array is
885 * However if there are already pending 951 * unfrozen. Since there is no ordering requirement with
886 * requests (preventing the barrier from 952 * conf->barrier[idx] here, memory barrier is unnecessary as well.
887 * rising completely), and the 953 */
888 * per-process bio queue isn't empty, 954 atomic_inc(&conf->nr_pending[idx]);
889 * then don't wait, as we need to empty
890 * that queue to allow conf->start_next_window
891 * to increase.
892 */
893 raid1_log(conf->mddev, "wait barrier");
894 wait_event_lock_irq(conf->wait_barrier,
895 !conf->array_frozen &&
896 (!conf->barrier ||
897 ((conf->start_next_window <
898 conf->next_resync + RESYNC_SECTORS) &&
899 current->bio_list &&
900 !bio_list_empty(current->bio_list))),
901 conf->resync_lock);
902 conf->nr_waiting--;
903 }
904
905 if (bio && bio_data_dir(bio) == WRITE) {
906 if (bio->bi_iter.bi_sector >= conf->next_resync) {
907 if (conf->start_next_window == MaxSector)
908 conf->start_next_window =
909 conf->next_resync +
910 NEXT_NORMALIO_DISTANCE;
911
912 if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
913 <= bio->bi_iter.bi_sector)
914 conf->next_window_requests++;
915 else
916 conf->current_window_requests++;
917 sector = conf->start_next_window;
918 }
919 }
920 955
921 conf->nr_pending++; 956 if (!READ_ONCE(conf->array_frozen))
957 return;
958
959 spin_lock_irq(&conf->resync_lock);
960 atomic_inc(&conf->nr_waiting[idx]);
961 atomic_dec(&conf->nr_pending[idx]);
962 /*
963 * In case freeze_array() is waiting for
964 * get_unqueued_pending() == extra
965 */
966 wake_up(&conf->wait_barrier);
967 /* Wait for array to be unfrozen */
968 wait_event_lock_irq(conf->wait_barrier,
969 !conf->array_frozen,
970 conf->resync_lock);
971 atomic_inc(&conf->nr_pending[idx]);
972 atomic_dec(&conf->nr_waiting[idx]);
922 spin_unlock_irq(&conf->resync_lock); 973 spin_unlock_irq(&conf->resync_lock);
923 return sector;
924} 974}
925 975
926static void allow_barrier(struct r1conf *conf, sector_t start_next_window, 976static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
927 sector_t bi_sector)
928{ 977{
929 unsigned long flags; 978 int idx = sector_to_idx(sector_nr);
930 979
931 spin_lock_irqsave(&conf->resync_lock, flags); 980 _wait_barrier(conf, idx);
932 conf->nr_pending--; 981}
933 if (start_next_window) { 982
934 if (start_next_window == conf->start_next_window) { 983static void wait_all_barriers(struct r1conf *conf)
935 if (conf->start_next_window + NEXT_NORMALIO_DISTANCE 984{
936 <= bi_sector) 985 int idx;
937 conf->next_window_requests--; 986
938 else 987 for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
939 conf->current_window_requests--; 988 _wait_barrier(conf, idx);
940 } else 989}
941 conf->current_window_requests--; 990
942 991static void _allow_barrier(struct r1conf *conf, int idx)
943 if (!conf->current_window_requests) { 992{
944 if (conf->next_window_requests) { 993 atomic_dec(&conf->nr_pending[idx]);
945 conf->current_window_requests =
946 conf->next_window_requests;
947 conf->next_window_requests = 0;
948 conf->start_next_window +=
949 NEXT_NORMALIO_DISTANCE;
950 } else
951 conf->start_next_window = MaxSector;
952 }
953 }
954 spin_unlock_irqrestore(&conf->resync_lock, flags);
955 wake_up(&conf->wait_barrier); 994 wake_up(&conf->wait_barrier);
956} 995}
957 996
997static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
998{
999 int idx = sector_to_idx(sector_nr);
1000
1001 _allow_barrier(conf, idx);
1002}
1003
1004static void allow_all_barriers(struct r1conf *conf)
1005{
1006 int idx;
1007
1008 for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
1009 _allow_barrier(conf, idx);
1010}
1011
1012/* conf->resync_lock should be held */
1013static int get_unqueued_pending(struct r1conf *conf)
1014{
1015 int idx, ret;
1016
1017 for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
1018 ret += atomic_read(&conf->nr_pending[idx]) -
1019 atomic_read(&conf->nr_queued[idx]);
1020
1021 return ret;
1022}
1023
958static void freeze_array(struct r1conf *conf, int extra) 1024static void freeze_array(struct r1conf *conf, int extra)
959{ 1025{
960 /* stop syncio and normal IO and wait for everything to 1026 /* Stop sync I/O and normal I/O and wait for everything to
961 * go quiet. 1027 * go quiet.
962 * We wait until nr_pending match nr_queued+extra 1028 * This is called in two situations:
963 * This is called in the context of one normal IO request 1029 * 1) management command handlers (reshape, remove disk, quiesce).
964 * that has failed. Thus any sync request that might be pending 1030 * 2) one normal I/O request failed.
965 * will be blocked by nr_pending, and we need to wait for 1031
966 * pending IO requests to complete or be queued for re-try. 1032 * After array_frozen is set to 1, new sync IO will be blocked at
967 * Thus the number queued (nr_queued) plus this request (extra) 1033 * raise_barrier(), and new normal I/O will be blocked at _wait_barrier()
968 * must match the number of pending IOs (nr_pending) before 1034 * or wait_read_barrier(). The flying I/Os will either complete or be
969 * we continue. 1035 * queued. When everything goes quiet, there are only queued I/Os left.
1036
1037 * Every flying I/O contributes to a conf->nr_pending[idx], idx is the
1038 * barrier bucket index which this I/O request hits. When all sync and
1039 * normal I/O are queued, sum of all conf->nr_pending[] will match sum
1040 * of all conf->nr_queued[]. But normal I/O failure is an exception,
1041 * in handle_read_error(), we may call freeze_array() before trying to
1042 * fix the read error. In this case, the error read I/O is not queued,
1043 * so get_unqueued_pending() == 1.
1044 *
1045 * Therefore before this function returns, we need to wait until
1046 * get_unqueued_pending(conf) becomes equal to extra. For
1047 * normal I/O context, extra is 1; in all other situations extra is 0.
970 */ 1048 */
971 spin_lock_irq(&conf->resync_lock); 1049 spin_lock_irq(&conf->resync_lock);
972 conf->array_frozen = 1; 1050 conf->array_frozen = 1;
973 raid1_log(conf->mddev, "wait freeze"); 1051 raid1_log(conf->mddev, "wait freeze");
974 wait_event_lock_irq_cmd(conf->wait_barrier, 1052 wait_event_lock_irq_cmd(
975 conf->nr_pending == conf->nr_queued+extra, 1053 conf->wait_barrier,
976 conf->resync_lock, 1054 get_unqueued_pending(conf) == extra,
977 flush_pending_writes(conf)); 1055 conf->resync_lock,
1056 flush_pending_writes(conf));
978 spin_unlock_irq(&conf->resync_lock); 1057 spin_unlock_irq(&conf->resync_lock);
979} 1058}
980static void unfreeze_array(struct r1conf *conf) 1059static void unfreeze_array(struct r1conf *conf)
@@ -982,8 +1061,8 @@ static void unfreeze_array(struct r1conf *conf)
982 /* reverse the effect of the freeze */ 1061 /* reverse the effect of the freeze */
983 spin_lock_irq(&conf->resync_lock); 1062 spin_lock_irq(&conf->resync_lock);
984 conf->array_frozen = 0; 1063 conf->array_frozen = 0;
985 wake_up(&conf->wait_barrier);
986 spin_unlock_irq(&conf->resync_lock); 1064 spin_unlock_irq(&conf->resync_lock);
1065 wake_up(&conf->wait_barrier);
987} 1066}
988 1067
989/* duplicate the data pages for behind I/O 1068/* duplicate the data pages for behind I/O
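
Stripped of the slow paths, the lock-free fast path that the comments in raise_barrier() and _wait_barrier() above describe pairs the two sides like this (a sketch of the reasoning, not code from the patch):

	/*
	 * raise_barrier(conf, sector_nr)       _wait_barrier(conf, idx)
	 * ------------------------------       ------------------------------
	 * atomic_inc(&conf->barrier[idx]);     atomic_inc(&conf->nr_pending[idx]);
	 * smp_mb__after_atomic();              smp_mb__after_atomic();
	 * wait for nr_pending[idx] == 0        if (!conf->barrier[idx])
	 *                                              issue the I/O;
	 *                                      else take resync_lock and wait
	 *
	 * Whichever side increments second is guaranteed to observe the other
	 * side's increment, so resync and regular I/O can never both take the
	 * fast path in the same barrier bucket at the same time.
	 */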
@@ -1070,11 +1149,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
1070 kfree(plug); 1149 kfree(plug);
1071} 1150}
1072 1151
1073static void raid1_read_request(struct mddev *mddev, struct bio *bio, 1152static inline struct r1bio *
1074 struct r1bio *r1_bio) 1153alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
1154{
1155 struct r1conf *conf = mddev->private;
1156 struct r1bio *r1_bio;
1157
1158 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1159
1160 r1_bio->master_bio = bio;
1161 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1162 r1_bio->state = 0;
1163 r1_bio->mddev = mddev;
1164 r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1165
1166 return r1_bio;
1167}
1168
1169static void raid1_read_request(struct mddev *mddev, struct bio *bio)
1075{ 1170{
1076 struct r1conf *conf = mddev->private; 1171 struct r1conf *conf = mddev->private;
1077 struct raid1_info *mirror; 1172 struct raid1_info *mirror;
1173 struct r1bio *r1_bio;
1078 struct bio *read_bio; 1174 struct bio *read_bio;
1079 struct bitmap *bitmap = mddev->bitmap; 1175 struct bitmap *bitmap = mddev->bitmap;
1080 const int op = bio_op(bio); 1176 const int op = bio_op(bio);
@@ -1083,8 +1179,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
1083 int max_sectors; 1179 int max_sectors;
1084 int rdisk; 1180 int rdisk;
1085 1181
1086 wait_barrier(conf, bio); 1182 /*
1183 * Still need barrier for READ in case that whole
1184 * array is frozen.
1185 */
1186 wait_read_barrier(conf, bio->bi_iter.bi_sector);
1187
1188 r1_bio = alloc_r1bio(mddev, bio, 0);
1087 1189
1190 /*
1191 * We might need to issue multiple reads to different
1192 * devices if there are bad blocks around, so we keep
1193 * track of the number of reads in bio->bi_phys_segments.
1194 * If this is 0, there is only one r1_bio and no locking
1195 * will be needed when requests complete. If it is
1196 * non-zero, then it is the number of not-completed requests.
1197 */
1198 bio->bi_phys_segments = 0;
1199 bio_clear_flag(bio, BIO_SEG_VALID);
1200
1201 /*
1202 * make_request() can abort the operation when read-ahead is being
1203 * used and no empty request is available.
1204 */
1088read_again: 1205read_again:
1089 rdisk = read_balance(conf, r1_bio, &max_sectors); 1206 rdisk = read_balance(conf, r1_bio, &max_sectors);
1090 1207
@@ -1106,9 +1223,8 @@ read_again:
1106 atomic_read(&bitmap->behind_writes) == 0); 1223 atomic_read(&bitmap->behind_writes) == 0);
1107 } 1224 }
1108 r1_bio->read_disk = rdisk; 1225 r1_bio->read_disk = rdisk;
1109 r1_bio->start_next_window = 0;
1110 1226
1111 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1227 read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
1112 bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, 1228 bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
1113 max_sectors); 1229 max_sectors);
1114 1230
@@ -1151,22 +1267,16 @@ read_again:
1151 */ 1267 */
1152 reschedule_retry(r1_bio); 1268 reschedule_retry(r1_bio);
1153 1269
1154 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1270 r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
1155
1156 r1_bio->master_bio = bio;
1157 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1158 r1_bio->state = 0;
1159 r1_bio->mddev = mddev;
1160 r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1161 goto read_again; 1271 goto read_again;
1162 } else 1272 } else
1163 generic_make_request(read_bio); 1273 generic_make_request(read_bio);
1164} 1274}
1165 1275
1166static void raid1_write_request(struct mddev *mddev, struct bio *bio, 1276static void raid1_write_request(struct mddev *mddev, struct bio *bio)
1167 struct r1bio *r1_bio)
1168{ 1277{
1169 struct r1conf *conf = mddev->private; 1278 struct r1conf *conf = mddev->private;
1279 struct r1bio *r1_bio;
1170 int i, disks; 1280 int i, disks;
1171 struct bitmap *bitmap = mddev->bitmap; 1281 struct bitmap *bitmap = mddev->bitmap;
1172 unsigned long flags; 1282 unsigned long flags;
@@ -1176,7 +1286,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1176 int first_clone; 1286 int first_clone;
1177 int sectors_handled; 1287 int sectors_handled;
1178 int max_sectors; 1288 int max_sectors;
1179 sector_t start_next_window;
1180 1289
1181 /* 1290 /*
1182 * Register the new request and wait if the reconstruction 1291 * Register the new request and wait if the reconstruction
@@ -1212,7 +1321,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1212 } 1321 }
1213 finish_wait(&conf->wait_barrier, &w); 1322 finish_wait(&conf->wait_barrier, &w);
1214 } 1323 }
1215 start_next_window = wait_barrier(conf, bio); 1324 wait_barrier(conf, bio->bi_iter.bi_sector);
1325
1326 r1_bio = alloc_r1bio(mddev, bio, 0);
1327
1328 /* We might need to issue multiple writes to different
1329 * devices if there are bad blocks around, so we keep
1330 * track of the number of writes in bio->bi_phys_segments.
1331 * If this is 0, there is only one r1_bio and no locking
1332 * will be needed when requests complete. If it is
1333 * non-zero, then it is the number of not-completed requests.
1334 */
1335 bio->bi_phys_segments = 0;
1336 bio_clear_flag(bio, BIO_SEG_VALID);
1216 1337
1217 if (conf->pending_count >= max_queued_requests) { 1338 if (conf->pending_count >= max_queued_requests) {
1218 md_wakeup_thread(mddev->thread); 1339 md_wakeup_thread(mddev->thread);
@@ -1233,7 +1354,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1233 1354
1234 disks = conf->raid_disks * 2; 1355 disks = conf->raid_disks * 2;
1235 retry_write: 1356 retry_write:
1236 r1_bio->start_next_window = start_next_window;
1237 blocked_rdev = NULL; 1357 blocked_rdev = NULL;
1238 rcu_read_lock(); 1358 rcu_read_lock();
1239 max_sectors = r1_bio->sectors; 1359 max_sectors = r1_bio->sectors;
@@ -1300,25 +1420,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1300 if (unlikely(blocked_rdev)) { 1420 if (unlikely(blocked_rdev)) {
1301 /* Wait for this device to become unblocked */ 1421 /* Wait for this device to become unblocked */
1302 int j; 1422 int j;
1303 sector_t old = start_next_window;
1304 1423
1305 for (j = 0; j < i; j++) 1424 for (j = 0; j < i; j++)
1306 if (r1_bio->bios[j]) 1425 if (r1_bio->bios[j])
1307 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1426 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1308 r1_bio->state = 0; 1427 r1_bio->state = 0;
1309 allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); 1428 allow_barrier(conf, bio->bi_iter.bi_sector);
1310 raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); 1429 raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1311 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1430 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1312 start_next_window = wait_barrier(conf, bio); 1431 wait_barrier(conf, bio->bi_iter.bi_sector);
1313 /*
1314 * We must make sure the multi r1bios of bio have
1315 * the same value of bi_phys_segments
1316 */
1317 if (bio->bi_phys_segments && old &&
1318 old != start_next_window)
1319 /* Wait for the former r1bio(s) to complete */
1320 wait_event(conf->wait_barrier,
1321 bio->bi_phys_segments == 1);
1322 goto retry_write; 1432 goto retry_write;
1323 } 1433 }
1324 1434
@@ -1341,13 +1451,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1341 1451
1342 first_clone = 1; 1452 first_clone = 1;
1343 for (i = 0; i < disks; i++) { 1453 for (i = 0; i < disks; i++) {
1344 struct bio *mbio; 1454 struct bio *mbio = NULL;
1455 sector_t offset;
1345 if (!r1_bio->bios[i]) 1456 if (!r1_bio->bios[i])
1346 continue; 1457 continue;
1347 1458
1348 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1459 offset = r1_bio->sector - bio->bi_iter.bi_sector;
1349 bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
1350 max_sectors);
1351 1460
1352 if (first_clone) { 1461 if (first_clone) {
1353 /* do behind I/O ? 1462 /* do behind I/O ?
@@ -1357,8 +1466,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1357 if (bitmap && 1466 if (bitmap &&
1358 (atomic_read(&bitmap->behind_writes) 1467 (atomic_read(&bitmap->behind_writes)
1359 < mddev->bitmap_info.max_write_behind) && 1468 < mddev->bitmap_info.max_write_behind) &&
1360 !waitqueue_active(&bitmap->behind_wait)) 1469 !waitqueue_active(&bitmap->behind_wait)) {
1470 mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
1471 mddev->bio_set,
1472 offset << 9,
1473 max_sectors << 9);
1361 alloc_behind_pages(mbio, r1_bio); 1474 alloc_behind_pages(mbio, r1_bio);
1475 }
1362 1476
1363 bitmap_startwrite(bitmap, r1_bio->sector, 1477 bitmap_startwrite(bitmap, r1_bio->sector,
1364 r1_bio->sectors, 1478 r1_bio->sectors,
@@ -1366,6 +1480,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1366 &r1_bio->state)); 1480 &r1_bio->state));
1367 first_clone = 0; 1481 first_clone = 0;
1368 } 1482 }
1483
1484 if (!mbio) {
1485 if (r1_bio->behind_bvecs)
1486 mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
1487 mddev->bio_set,
1488 offset << 9,
1489 max_sectors << 9);
1490 else {
1491 mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
1492 bio_trim(mbio, offset, max_sectors);
1493 }
1494 }
1495
1369 if (r1_bio->behind_bvecs) { 1496 if (r1_bio->behind_bvecs) {
1370 struct bio_vec *bvec; 1497 struct bio_vec *bvec;
1371 int j; 1498 int j;
@@ -1385,8 +1512,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1385 conf->mirrors[i].rdev->data_offset); 1512 conf->mirrors[i].rdev->data_offset);
1386 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1513 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1387 mbio->bi_end_io = raid1_end_write_request; 1514 mbio->bi_end_io = raid1_end_write_request;
1388 mbio->bi_opf = bio_op(bio) | 1515 mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
1389 (bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA));
1390 if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && 1516 if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
1391 !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && 1517 !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
1392 conf->raid_disks - mddev->degraded > 1) 1518 conf->raid_disks - mddev->degraded > 1)
@@ -1427,12 +1553,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1427 /* We need another r1_bio. It has already been counted 1553 /* We need another r1_bio. It has already been counted
1428 * in bio->bi_phys_segments 1554 * in bio->bi_phys_segments
1429 */ 1555 */
1430 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1556 r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
1431 r1_bio->master_bio = bio;
1432 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1433 r1_bio->state = 0;
1434 r1_bio->mddev = mddev;
1435 r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1436 goto retry_write; 1557 goto retry_write;
1437 } 1558 }
1438 1559
@@ -1444,36 +1565,30 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
1444 1565
1445static void raid1_make_request(struct mddev *mddev, struct bio *bio) 1566static void raid1_make_request(struct mddev *mddev, struct bio *bio)
1446{ 1567{
1447 struct r1conf *conf = mddev->private; 1568 struct bio *split;
1448 struct r1bio *r1_bio; 1569 sector_t sectors;
1449 1570
1450 /* 1571 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1451 * make_request() can abort the operation when read-ahead is being 1572 md_flush_request(mddev, bio);
1452 * used and no empty request is available. 1573 return;
1453 * 1574 }
1454 */
1455 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1456
1457 r1_bio->master_bio = bio;
1458 r1_bio->sectors = bio_sectors(bio);
1459 r1_bio->state = 0;
1460 r1_bio->mddev = mddev;
1461 r1_bio->sector = bio->bi_iter.bi_sector;
1462 1575
1463 /* 1576 /* if bio exceeds barrier unit boundary, split it */
1464 * We might need to issue multiple reads to different devices if there 1577 do {
1465 * are bad blocks around, so we keep track of the number of reads in 1578 sectors = align_to_barrier_unit_end(
1466 * bio->bi_phys_segments. If this is 0, there is only one r1_bio and 1579 bio->bi_iter.bi_sector, bio_sectors(bio));
1467 * no locking will be needed when requests complete. If it is 1580 if (sectors < bio_sectors(bio)) {
1468 * non-zero, then it is the number of not-completed requests. 1581 split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
1469 */ 1582 bio_chain(split, bio);
1470 bio->bi_phys_segments = 0; 1583 } else {
1471 bio_clear_flag(bio, BIO_SEG_VALID); 1584 split = bio;
1585 }
1472 1586
1473 if (bio_data_dir(bio) == READ) 1587 if (bio_data_dir(split) == READ)
1474 raid1_read_request(mddev, bio, r1_bio); 1588 raid1_read_request(mddev, split);
1475 else 1589 else
1476 raid1_write_request(mddev, bio, r1_bio); 1590 raid1_write_request(mddev, split);
1591 } while (split != bio);
1477} 1592}
1478 1593
1479static void raid1_status(struct seq_file *seq, struct mddev *mddev) 1594static void raid1_status(struct seq_file *seq, struct mddev *mddev)
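
To make the split loop in raid1_make_request() above concrete, consider a hypothetical 1MB write that crosses a barrier unit boundary (the numbers assume the 64MB unit and are purely illustrative):

	/*
	 * bio: bi_sector == 131064, bio_sectors(bio) == 2048
	 *
	 * 1st pass: align_to_barrier_unit_end(131064, 2048) == 8, so
	 *           bio_split() carves off an 8-sector child, bio_chain()s it
	 *           to the parent and it is issued against bucket
	 *           sector_to_idx(131064).
	 * 2nd pass: the parent now starts at sector 131072 with 2040 sectors,
	 *           entirely inside the next unit, so split == bio and the
	 *           loop terminates.
	 */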
@@ -1564,19 +1679,11 @@ static void print_conf(struct r1conf *conf)
1564 1679
1565static void close_sync(struct r1conf *conf) 1680static void close_sync(struct r1conf *conf)
1566{ 1681{
1567 wait_barrier(conf, NULL); 1682 wait_all_barriers(conf);
1568 allow_barrier(conf, 0, 0); 1683 allow_all_barriers(conf);
1569 1684
1570 mempool_destroy(conf->r1buf_pool); 1685 mempool_destroy(conf->r1buf_pool);
1571 conf->r1buf_pool = NULL; 1686 conf->r1buf_pool = NULL;
1572
1573 spin_lock_irq(&conf->resync_lock);
1574 conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE;
1575 conf->start_next_window = MaxSector;
1576 conf->current_window_requests +=
1577 conf->next_window_requests;
1578 conf->next_window_requests = 0;
1579 spin_unlock_irq(&conf->resync_lock);
1580} 1687}
1581 1688
1582static int raid1_spare_active(struct mddev *mddev) 1689static int raid1_spare_active(struct mddev *mddev)
@@ -2273,7 +2380,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
2273 2380
2274 wbio->bi_vcnt = vcnt; 2381 wbio->bi_vcnt = vcnt;
2275 } else { 2382 } else {
2276 wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); 2383 wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
2384 mddev->bio_set);
2277 } 2385 }
2278 2386
2279 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); 2387 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2323,8 +2431,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
2323 2431
2324static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) 2432static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
2325{ 2433{
2326 int m; 2434 int m, idx;
2327 bool fail = false; 2435 bool fail = false;
2436
2328 for (m = 0; m < conf->raid_disks * 2 ; m++) 2437 for (m = 0; m < conf->raid_disks * 2 ; m++)
2329 if (r1_bio->bios[m] == IO_MADE_GOOD) { 2438 if (r1_bio->bios[m] == IO_MADE_GOOD) {
2330 struct md_rdev *rdev = conf->mirrors[m].rdev; 2439 struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2350,8 +2459,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
2350 if (fail) { 2459 if (fail) {
2351 spin_lock_irq(&conf->device_lock); 2460 spin_lock_irq(&conf->device_lock);
2352 list_add(&r1_bio->retry_list, &conf->bio_end_io_list); 2461 list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
2353 conf->nr_queued++; 2462 idx = sector_to_idx(r1_bio->sector);
2463 atomic_inc(&conf->nr_queued[idx]);
2354 spin_unlock_irq(&conf->device_lock); 2464 spin_unlock_irq(&conf->device_lock);
2465 /*
2466 * In case freeze_array() is waiting for condition
2467 * get_unqueued_pending() == extra to be true.
2468 */
2469 wake_up(&conf->wait_barrier);
2355 md_wakeup_thread(conf->mddev->thread); 2470 md_wakeup_thread(conf->mddev->thread);
2356 } else { 2471 } else {
2357 if (test_bit(R1BIO_WriteError, &r1_bio->state)) 2472 if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2411,7 +2526,8 @@ read_more:
2411 const unsigned long do_sync 2526 const unsigned long do_sync
2412 = r1_bio->master_bio->bi_opf & REQ_SYNC; 2527 = r1_bio->master_bio->bi_opf & REQ_SYNC;
2413 r1_bio->read_disk = disk; 2528 r1_bio->read_disk = disk;
2414 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); 2529 bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
2530 mddev->bio_set);
2415 bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, 2531 bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
2416 max_sectors); 2532 max_sectors);
2417 r1_bio->bios[r1_bio->read_disk] = bio; 2533 r1_bio->bios[r1_bio->read_disk] = bio;
@@ -2445,15 +2561,8 @@ read_more:
2445 generic_make_request(bio); 2561 generic_make_request(bio);
2446 bio = NULL; 2562 bio = NULL;
2447 2563
2448 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 2564 r1_bio = alloc_r1bio(mddev, mbio, sectors_handled);
2449
2450 r1_bio->master_bio = mbio;
2451 r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
2452 r1_bio->state = 0;
2453 set_bit(R1BIO_ReadError, &r1_bio->state); 2565 set_bit(R1BIO_ReadError, &r1_bio->state);
2454 r1_bio->mddev = mddev;
2455 r1_bio->sector = mbio->bi_iter.bi_sector +
2456 sectors_handled;
2457 2566
2458 goto read_more; 2567 goto read_more;
2459 } else { 2568 } else {
@@ -2472,6 +2581,7 @@ static void raid1d(struct md_thread *thread)
2472 struct r1conf *conf = mddev->private; 2581 struct r1conf *conf = mddev->private;
2473 struct list_head *head = &conf->retry_list; 2582 struct list_head *head = &conf->retry_list;
2474 struct blk_plug plug; 2583 struct blk_plug plug;
2584 int idx;
2475 2585
2476 md_check_recovery(mddev); 2586 md_check_recovery(mddev);
2477 2587
@@ -2479,17 +2589,15 @@ static void raid1d(struct md_thread *thread)
2479 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 2589 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2480 LIST_HEAD(tmp); 2590 LIST_HEAD(tmp);
2481 spin_lock_irqsave(&conf->device_lock, flags); 2591 spin_lock_irqsave(&conf->device_lock, flags);
2482 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 2592 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
2483 while (!list_empty(&conf->bio_end_io_list)) { 2593 list_splice_init(&conf->bio_end_io_list, &tmp);
2484 list_move(conf->bio_end_io_list.prev, &tmp);
2485 conf->nr_queued--;
2486 }
2487 }
2488 spin_unlock_irqrestore(&conf->device_lock, flags); 2594 spin_unlock_irqrestore(&conf->device_lock, flags);
2489 while (!list_empty(&tmp)) { 2595 while (!list_empty(&tmp)) {
2490 r1_bio = list_first_entry(&tmp, struct r1bio, 2596 r1_bio = list_first_entry(&tmp, struct r1bio,
2491 retry_list); 2597 retry_list);
2492 list_del(&r1_bio->retry_list); 2598 list_del(&r1_bio->retry_list);
2599 idx = sector_to_idx(r1_bio->sector);
2600 atomic_dec(&conf->nr_queued[idx]);
2493 if (mddev->degraded) 2601 if (mddev->degraded)
2494 set_bit(R1BIO_Degraded, &r1_bio->state); 2602 set_bit(R1BIO_Degraded, &r1_bio->state);
2495 if (test_bit(R1BIO_WriteError, &r1_bio->state)) 2603 if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2510,7 +2618,8 @@ static void raid1d(struct md_thread *thread)
2510 } 2618 }
2511 r1_bio = list_entry(head->prev, struct r1bio, retry_list); 2619 r1_bio = list_entry(head->prev, struct r1bio, retry_list);
2512 list_del(head->prev); 2620 list_del(head->prev);
2513 conf->nr_queued--; 2621 idx = sector_to_idx(r1_bio->sector);
2622 atomic_dec(&conf->nr_queued[idx]);
2514 spin_unlock_irqrestore(&conf->device_lock, flags); 2623 spin_unlock_irqrestore(&conf->device_lock, flags);
2515 2624
2516 mddev = r1_bio->mddev; 2625 mddev = r1_bio->mddev;
@@ -2549,7 +2658,6 @@ static int init_resync(struct r1conf *conf)
2549 conf->poolinfo); 2658 conf->poolinfo);
2550 if (!conf->r1buf_pool) 2659 if (!conf->r1buf_pool)
2551 return -ENOMEM; 2660 return -ENOMEM;
2552 conf->next_resync = 0;
2553 return 0; 2661 return 0;
2554} 2662}
2555 2663
@@ -2578,6 +2686,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2578 int still_degraded = 0; 2686 int still_degraded = 0;
2579 int good_sectors = RESYNC_SECTORS; 2687 int good_sectors = RESYNC_SECTORS;
2580 int min_bad = 0; /* number of sectors that are bad in all devices */ 2688 int min_bad = 0; /* number of sectors that are bad in all devices */
2689 int idx = sector_to_idx(sector_nr);
2581 2690
2582 if (!conf->r1buf_pool) 2691 if (!conf->r1buf_pool)
2583 if (init_resync(conf)) 2692 if (init_resync(conf))
@@ -2627,7 +2736,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2627 * If there is non-resync activity waiting for a turn, then let it 2736 * If there is non-resync activity waiting for a turn, then let it
2628 * through before starting on this new sync request. 2737 * through before starting on this new sync request.
2629 */ 2738 */
2630 if (conf->nr_waiting) 2739 if (atomic_read(&conf->nr_waiting[idx]))
2631 schedule_timeout_uninterruptible(1); 2740 schedule_timeout_uninterruptible(1);
2632 2741
2633 /* we are incrementing sector_nr below. To be safe, we check against 2742 /* we are incrementing sector_nr below. To be safe, we check against
@@ -2654,6 +2763,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2654 r1_bio->sector = sector_nr; 2763 r1_bio->sector = sector_nr;
2655 r1_bio->state = 0; 2764 r1_bio->state = 0;
2656 set_bit(R1BIO_IsSync, &r1_bio->state); 2765 set_bit(R1BIO_IsSync, &r1_bio->state);
2766 /* make sure good_sectors won't go across barrier unit boundary */
2767 good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
2657 2768
2658 for (i = 0; i < conf->raid_disks * 2; i++) { 2769 for (i = 0; i < conf->raid_disks * 2; i++) {
2659 struct md_rdev *rdev; 2770 struct md_rdev *rdev;
@@ -2884,6 +2995,26 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2884 if (!conf) 2995 if (!conf)
2885 goto abort; 2996 goto abort;
2886 2997
2998 conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
2999 sizeof(atomic_t), GFP_KERNEL);
3000 if (!conf->nr_pending)
3001 goto abort;
3002
3003 conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
3004 sizeof(atomic_t), GFP_KERNEL);
3005 if (!conf->nr_waiting)
3006 goto abort;
3007
3008 conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
3009 sizeof(atomic_t), GFP_KERNEL);
3010 if (!conf->nr_queued)
3011 goto abort;
3012
3013 conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
3014 sizeof(atomic_t), GFP_KERNEL);
3015 if (!conf->barrier)
3016 goto abort;
3017
2887 conf->mirrors = kzalloc(sizeof(struct raid1_info) 3018 conf->mirrors = kzalloc(sizeof(struct raid1_info)
2888 * mddev->raid_disks * 2, 3019 * mddev->raid_disks * 2,
2889 GFP_KERNEL); 3020 GFP_KERNEL);
@@ -2939,9 +3070,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2939 conf->pending_count = 0; 3070 conf->pending_count = 0;
2940 conf->recovery_disabled = mddev->recovery_disabled - 1; 3071 conf->recovery_disabled = mddev->recovery_disabled - 1;
2941 3072
2942 conf->start_next_window = MaxSector;
2943 conf->current_window_requests = conf->next_window_requests = 0;
2944
2945 err = -EIO; 3073 err = -EIO;
2946 for (i = 0; i < conf->raid_disks * 2; i++) { 3074 for (i = 0; i < conf->raid_disks * 2; i++) {
2947 3075
@@ -2984,6 +3112,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2984 kfree(conf->mirrors); 3112 kfree(conf->mirrors);
2985 safe_put_page(conf->tmppage); 3113 safe_put_page(conf->tmppage);
2986 kfree(conf->poolinfo); 3114 kfree(conf->poolinfo);
3115 kfree(conf->nr_pending);
3116 kfree(conf->nr_waiting);
3117 kfree(conf->nr_queued);
3118 kfree(conf->barrier);
2987 kfree(conf); 3119 kfree(conf);
2988 } 3120 }
2989 return ERR_PTR(err); 3121 return ERR_PTR(err);
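
Each of the four kcalloc() calls added in setup_conf() above allocates BARRIER_BUCKETS_NR counters, which by construction of the constants sketched near the top of this page works out to exactly one page per array (assuming 4KB pages and a 4-byte atomic_t):

	/*
	 * BARRIER_BUCKETS_NR_BITS == PAGE_SHIFT - ilog2(sizeof(atomic_t))
	 *                         == 12 - 2 == 10
	 * BARRIER_BUCKETS_NR      == 1 << 10 == 1024
	 * kcalloc(1024, sizeof(atomic_t), GFP_KERNEL) -> 4096 bytes == one page
	 */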
@@ -3085,6 +3217,10 @@ static void raid1_free(struct mddev *mddev, void *priv)
3085 kfree(conf->mirrors); 3217 kfree(conf->mirrors);
3086 safe_put_page(conf->tmppage); 3218 safe_put_page(conf->tmppage);
3087 kfree(conf->poolinfo); 3219 kfree(conf->poolinfo);
3220 kfree(conf->nr_pending);
3221 kfree(conf->nr_waiting);
3222 kfree(conf->nr_queued);
3223 kfree(conf->barrier);
3088 kfree(conf); 3224 kfree(conf);
3089} 3225}
3090 3226