author     Linus Torvalds <torvalds@linux-foundation.org>    2017-02-24 17:42:19 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2017-02-24 17:42:19 -0500
commit     a682e0035494c449e53a57d039f86f75b9e2fe67 (patch)
tree       382d6c2d4729e6ed8f697fd528209a2b4701b618 /drivers/md/raid1.c
parent     1802979ab1ee8ec5a72987ad518f5a91bf41cd89 (diff)
parent     1ec492232ed659acde8cc00b9ecc7529778e03e1 (diff)
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull md updates from Shaohua Li:
"Mainly fixes bugs and improves performance:
- Improve scalability for raid1 from Coly
- Improve raid5-cache read performance, disk efficiency and IO
pattern from Song and me
- Fix a race condition of disk hotplug for linear from Coly
- A few cleanup patches from Ming and Byungchul
- Fix a memory leak from Neil
- Fix WRITE SAME IO failure from me
- Add doc for raid5-cache from me"
* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (23 commits)
md/raid1: fix write behind issues introduced by bio_clone_bioset_partial
md/raid1: handle flush request correctly
md/linear: shutup lockdep warnning
md/raid1: fix a use-after-free bug
RAID1: avoid unnecessary spin locks in I/O barrier code
RAID1: a new I/O barrier implementation to remove resync window
md/raid5: Don't reinvent the wheel but use existing llist API
md: fast clone bio in bio_clone_mddev()
md: remove unnecessary check on mddev
md/raid1: use bio_clone_bioset_partial() in case of write behind
md: fail if mddev->bio_set can't be created
block: introduce bio_clone_bioset_partial()
md: disable WRITE SAME if it fails in underlayer disks
md/raid5-cache: exclude reclaiming stripes in reclaim check
md/raid5-cache: stripe reclaim only counts valid stripes
MD: add doc for raid5-cache
Documentation: move MD related doc into a separate dir
md: ensure md devices are freed before module is unloaded.
md/r5cache: improve journal device efficiency
md/r5cache: enable chunk_aligned_read with write back cache
...
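The raid1 scalability work in this pull replaces the single resync window, which every bio had to negotiate through one shared set of counters, with per-bucket I/O barriers: a bio's sector selects one of BARRIER_BUCKETS_NR buckets, and only that bucket's barrier/nr_pending/nr_waiting/nr_queued counters are touched. The sketch below is a minimal userspace model of the bucket mapping; the 64 MiB barrier unit (1 << 17 sectors) matches the series, but the bucket count and the stand-in hash are assumptions, since the real sector_to_idx() and its hash_long() call live in raid1.h, outside this diff.

#include <stdint.h>
#include <stdio.h>

#define BARRIER_UNIT_SECTOR_BITS 17            /* 1 << 17 sectors = 64 MiB per barrier unit */
#define BARRIER_BUCKETS_NR_BITS  10            /* assumed bucket count, not taken from this diff */
#define BARRIER_BUCKETS_NR       (1 << BARRIER_BUCKETS_NR_BITS)

/* Map a sector to its I/O barrier bucket; a multiplicative hash stands in
 * for the kernel's hash_long(). */
static int sector_to_idx(uint64_t sector)
{
        uint64_t unit = sector >> BARRIER_UNIT_SECTOR_BITS;

        return (int)((unit * 0x9E3779B97F4A7C15ULL) >> (64 - BARRIER_BUCKETS_NR_BITS));
}

int main(void)
{
        uint64_t s[] = { 0, (1ULL << 17) - 1, 1ULL << 17, 1ULL << 30 };

        for (int i = 0; i < 4; i++)
                printf("sector %llu -> bucket %d of %d\n",
                       (unsigned long long)s[i], sector_to_idx(s[i]), BARRIER_BUCKETS_NR);
        return 0;
}

Requests that hash to different buckets never touch the same atomic counters, which is where the scalability win over the old shared resync window comes from.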
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--   drivers/md/raid1.c   596
1 file changed, 366 insertions, 230 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 830ff2b20346..7453d94eeed7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -71,9 +71,8 @@ | |||
71 | */ | 71 | */ |
72 | static int max_queued_requests = 1024; | 72 | static int max_queued_requests = 1024; |
73 | 73 | ||
74 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | 74 | static void allow_barrier(struct r1conf *conf, sector_t sector_nr); |
75 | sector_t bi_sector); | 75 | static void lower_barrier(struct r1conf *conf, sector_t sector_nr); |
76 | static void lower_barrier(struct r1conf *conf); | ||
77 | 76 | ||
78 | #define raid1_log(md, fmt, args...) \ | 77 | #define raid1_log(md, fmt, args...) \ |
79 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) | 78 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) |
@@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) | |||
100 | #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) | 99 | #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) |
101 | #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) | 100 | #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) |
102 | #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) | 101 | #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) |
103 | #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) | ||
104 | 102 | ||
105 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | 103 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) |
106 | { | 104 | { |
@@ -205,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio) | |||
205 | static void put_buf(struct r1bio *r1_bio) | 203 | static void put_buf(struct r1bio *r1_bio) |
206 | { | 204 | { |
207 | struct r1conf *conf = r1_bio->mddev->private; | 205 | struct r1conf *conf = r1_bio->mddev->private; |
206 | sector_t sect = r1_bio->sector; | ||
208 | int i; | 207 | int i; |
209 | 208 | ||
210 | for (i = 0; i < conf->raid_disks * 2; i++) { | 209 | for (i = 0; i < conf->raid_disks * 2; i++) { |
@@ -215,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio) | |||
215 | 214 | ||
216 | mempool_free(r1_bio, conf->r1buf_pool); | 215 | mempool_free(r1_bio, conf->r1buf_pool); |
217 | 216 | ||
218 | lower_barrier(conf); | 217 | lower_barrier(conf, sect); |
219 | } | 218 | } |
220 | 219 | ||
221 | static void reschedule_retry(struct r1bio *r1_bio) | 220 | static void reschedule_retry(struct r1bio *r1_bio) |
@@ -223,10 +222,12 @@ static void reschedule_retry(struct r1bio *r1_bio) | |||
223 | unsigned long flags; | 222 | unsigned long flags; |
224 | struct mddev *mddev = r1_bio->mddev; | 223 | struct mddev *mddev = r1_bio->mddev; |
225 | struct r1conf *conf = mddev->private; | 224 | struct r1conf *conf = mddev->private; |
225 | int idx; | ||
226 | 226 | ||
227 | idx = sector_to_idx(r1_bio->sector); | ||
227 | spin_lock_irqsave(&conf->device_lock, flags); | 228 | spin_lock_irqsave(&conf->device_lock, flags); |
228 | list_add(&r1_bio->retry_list, &conf->retry_list); | 229 | list_add(&r1_bio->retry_list, &conf->retry_list); |
229 | conf->nr_queued ++; | 230 | atomic_inc(&conf->nr_queued[idx]); |
230 | spin_unlock_irqrestore(&conf->device_lock, flags); | 231 | spin_unlock_irqrestore(&conf->device_lock, flags); |
231 | 232 | ||
232 | wake_up(&conf->wait_barrier); | 233 | wake_up(&conf->wait_barrier); |
@@ -243,7 +244,6 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
243 | struct bio *bio = r1_bio->master_bio; | 244 | struct bio *bio = r1_bio->master_bio; |
244 | int done; | 245 | int done; |
245 | struct r1conf *conf = r1_bio->mddev->private; | 246 | struct r1conf *conf = r1_bio->mddev->private; |
246 | sector_t start_next_window = r1_bio->start_next_window; | ||
247 | sector_t bi_sector = bio->bi_iter.bi_sector; | 247 | sector_t bi_sector = bio->bi_iter.bi_sector; |
248 | 248 | ||
249 | if (bio->bi_phys_segments) { | 249 | if (bio->bi_phys_segments) { |
@@ -269,7 +269,7 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
269 | * Wake up any possible resync thread that waits for the device | 269 | * Wake up any possible resync thread that waits for the device |
270 | * to go idle. | 270 | * to go idle. |
271 | */ | 271 | */ |
272 | allow_barrier(conf, start_next_window, bi_sector); | 272 | allow_barrier(conf, bi_sector); |
273 | } | 273 | } |
274 | } | 274 | } |
275 | 275 | ||
@@ -517,6 +517,25 @@ static void raid1_end_write_request(struct bio *bio) | |||
517 | bio_put(to_put); | 517 | bio_put(to_put); |
518 | } | 518 | } |
519 | 519 | ||
520 | static sector_t align_to_barrier_unit_end(sector_t start_sector, | ||
521 | sector_t sectors) | ||
522 | { | ||
523 | sector_t len; | ||
524 | |||
525 | WARN_ON(sectors == 0); | ||
526 | /* | ||
527 | * len is the number of sectors from start_sector to end of the | ||
528 | * barrier unit which start_sector belongs to. | ||
529 | */ | ||
530 | len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - | ||
531 | start_sector; | ||
532 | |||
533 | if (len > sectors) | ||
534 | len = sectors; | ||
535 | |||
536 | return len; | ||
537 | } | ||
538 | |||
520 | /* | 539 | /* |
521 | * This routine returns the disk from which the requested read should | 540 | * This routine returns the disk from which the requested read should |
522 | * be done. There is a per-array 'next expected sequential IO' sector | 541 | * be done. There is a per-array 'next expected sequential IO' sector |
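The new align_to_barrier_unit_end() above clamps a request so it never crosses the end of the barrier unit containing start_sector. Below is a standalone check of the arithmetic, with round_up() open-coded and BARRIER_UNIT_SECTOR_SIZE assumed to be 1 << 17 sectors (64 MiB), a definition that lives outside this diff.

#include <assert.h>
#include <stdint.h>

#define BARRIER_UNIT_SECTOR_SIZE (1ULL << 17)  /* assumed: 64 MiB in 512-byte sectors */

/* Same arithmetic as align_to_barrier_unit_end(): length from start_sector to
 * the end of its barrier unit, capped at the requested sector count. */
static uint64_t align_to_barrier_unit_end(uint64_t start_sector, uint64_t sectors)
{
        uint64_t unit_end = (start_sector / BARRIER_UNIT_SECTOR_SIZE + 1) *
                            BARRIER_UNIT_SECTOR_SIZE;
        uint64_t len = unit_end - start_sector;

        return len > sectors ? sectors : len;
}

int main(void)
{
        /* fits entirely inside one unit: returned unchanged */
        assert(align_to_barrier_unit_end(0, 8) == 8);
        /* starts 4 sectors before a unit boundary: clamped to 4 */
        assert(align_to_barrier_unit_end(BARRIER_UNIT_SECTOR_SIZE - 4, 64) == 4);
        return 0;
}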
@@ -813,168 +832,228 @@ static void flush_pending_writes(struct r1conf *conf) | |||
813 | */ | 832 | */ |
814 | static void raise_barrier(struct r1conf *conf, sector_t sector_nr) | 833 | static void raise_barrier(struct r1conf *conf, sector_t sector_nr) |
815 | { | 834 | { |
835 | int idx = sector_to_idx(sector_nr); | ||
836 | |||
816 | spin_lock_irq(&conf->resync_lock); | 837 | spin_lock_irq(&conf->resync_lock); |
817 | 838 | ||
818 | /* Wait until no block IO is waiting */ | 839 | /* Wait until no block IO is waiting */ |
819 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, | 840 | wait_event_lock_irq(conf->wait_barrier, |
841 | !atomic_read(&conf->nr_waiting[idx]), | ||
820 | conf->resync_lock); | 842 | conf->resync_lock); |
821 | 843 | ||
822 | /* block any new IO from starting */ | 844 | /* block any new IO from starting */ |
823 | conf->barrier++; | 845 | atomic_inc(&conf->barrier[idx]); |
824 | conf->next_resync = sector_nr; | 846 | /* |
847 | * In raise_barrier() we firstly increase conf->barrier[idx] then | ||
848 | * check conf->nr_pending[idx]. In _wait_barrier() we firstly | ||
849 | * increase conf->nr_pending[idx] then check conf->barrier[idx]. | ||
850 | * A memory barrier here to make sure conf->nr_pending[idx] won't | ||
851 | * be fetched before conf->barrier[idx] is increased. Otherwise | ||
852 | * there will be a race between raise_barrier() and _wait_barrier(). | ||
853 | */ | ||
854 | smp_mb__after_atomic(); | ||
825 | 855 | ||
826 | /* For these conditions we must wait: | 856 | /* For these conditions we must wait: |
827 | * A: while the array is in frozen state | 857 | * A: while the array is in frozen state |
828 | * B: while barrier >= RESYNC_DEPTH, meaning resync reach | 858 | * B: while conf->nr_pending[idx] is not 0, meaning regular I/O |
829 | * the max count which allowed. | 859 | * existing in corresponding I/O barrier bucket. |
830 | * C: next_resync + RESYNC_SECTORS > start_next_window, meaning | 860 | * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches |
831 | * next resync will reach to the window which normal bios are | 861 | * max resync count which allowed on current I/O barrier bucket. |
832 | * handling. | ||
833 | * D: while there are any active requests in the current window. | ||
834 | */ | 862 | */ |
835 | wait_event_lock_irq(conf->wait_barrier, | 863 | wait_event_lock_irq(conf->wait_barrier, |
836 | !conf->array_frozen && | 864 | !conf->array_frozen && |
837 | conf->barrier < RESYNC_DEPTH && | 865 | !atomic_read(&conf->nr_pending[idx]) && |
838 | conf->current_window_requests == 0 && | 866 | atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH, |
839 | (conf->start_next_window >= | ||
840 | conf->next_resync + RESYNC_SECTORS), | ||
841 | conf->resync_lock); | 867 | conf->resync_lock); |
842 | 868 | ||
843 | conf->nr_pending++; | 869 | atomic_inc(&conf->nr_pending[idx]); |
844 | spin_unlock_irq(&conf->resync_lock); | 870 | spin_unlock_irq(&conf->resync_lock); |
845 | } | 871 | } |
846 | 872 | ||
847 | static void lower_barrier(struct r1conf *conf) | 873 | static void lower_barrier(struct r1conf *conf, sector_t sector_nr) |
848 | { | 874 | { |
849 | unsigned long flags; | 875 | int idx = sector_to_idx(sector_nr); |
850 | BUG_ON(conf->barrier <= 0); | 876 | |
851 | spin_lock_irqsave(&conf->resync_lock, flags); | 877 | BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); |
852 | conf->barrier--; | 878 | |
853 | conf->nr_pending--; | 879 | atomic_dec(&conf->barrier[idx]); |
854 | spin_unlock_irqrestore(&conf->resync_lock, flags); | 880 | atomic_dec(&conf->nr_pending[idx]); |
855 | wake_up(&conf->wait_barrier); | 881 | wake_up(&conf->wait_barrier); |
856 | } | 882 | } |
857 | 883 | ||
858 | static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) | 884 | static void _wait_barrier(struct r1conf *conf, int idx) |
859 | { | 885 | { |
860 | bool wait = false; | 886 | /* |
887 | * We need to increase conf->nr_pending[idx] very early here, | ||
888 | * then raise_barrier() can be blocked when it waits for | ||
889 | * conf->nr_pending[idx] to be 0. Then we can avoid holding | ||
890 | * conf->resync_lock when there is no barrier raised in same | ||
891 | * barrier unit bucket. Also if the array is frozen, I/O | ||
892 | * should be blocked until array is unfrozen. | ||
893 | */ | ||
894 | atomic_inc(&conf->nr_pending[idx]); | ||
895 | /* | ||
896 | * In _wait_barrier() we firstly increase conf->nr_pending[idx], then | ||
897 | * check conf->barrier[idx]. In raise_barrier() we firstly increase | ||
898 | * conf->barrier[idx], then check conf->nr_pending[idx]. A memory | ||
899 | * barrier is necessary here to make sure conf->barrier[idx] won't be | ||
900 | * fetched before conf->nr_pending[idx] is increased. Otherwise there | ||
901 | * will be a race between _wait_barrier() and raise_barrier(). | ||
902 | */ | ||
903 | smp_mb__after_atomic(); | ||
861 | 904 | ||
862 | if (conf->array_frozen || !bio) | 905 | /* |
863 | wait = true; | 906 | * Don't worry about checking two atomic_t variables at same time |
864 | else if (conf->barrier && bio_data_dir(bio) == WRITE) { | 907 | * here. If during we check conf->barrier[idx], the array is |
865 | if ((conf->mddev->curr_resync_completed | 908 | * frozen (conf->array_frozen is 1), and conf->barrier[idx] is |
866 | >= bio_end_sector(bio)) || | 909 | * 0, it is safe to return and make the I/O continue. Because the |
867 | (conf->start_next_window + NEXT_NORMALIO_DISTANCE | 910 | * array is frozen, all I/O returned here will eventually complete |
868 | <= bio->bi_iter.bi_sector)) | 911 | * or be queued, no race will happen. See code comment in |
869 | wait = false; | 912 | * freeze_array(). |
870 | else | 913 | */ |
871 | wait = true; | 914 | if (!READ_ONCE(conf->array_frozen) && |
872 | } | 915 | !atomic_read(&conf->barrier[idx])) |
916 | return; | ||
873 | 917 | ||
874 | return wait; | 918 | /* |
919 | * After holding conf->resync_lock, conf->nr_pending[idx] | ||
920 | * should be decreased before waiting for barrier to drop. | ||
921 | * Otherwise, we may encounter a race condition because | ||
922 | * raise_barrier() might be waiting for conf->nr_pending[idx] | ||
923 | * to be 0 at same time. | ||
924 | */ | ||
925 | spin_lock_irq(&conf->resync_lock); | ||
926 | atomic_inc(&conf->nr_waiting[idx]); | ||
927 | atomic_dec(&conf->nr_pending[idx]); | ||
928 | /* | ||
929 | * In case freeze_array() is waiting for | ||
930 | * get_unqueued_pending() == extra | ||
931 | */ | ||
932 | wake_up(&conf->wait_barrier); | ||
933 | /* Wait for the barrier in same barrier unit bucket to drop. */ | ||
934 | wait_event_lock_irq(conf->wait_barrier, | ||
935 | !conf->array_frozen && | ||
936 | !atomic_read(&conf->barrier[idx]), | ||
937 | conf->resync_lock); | ||
938 | atomic_inc(&conf->nr_pending[idx]); | ||
939 | atomic_dec(&conf->nr_waiting[idx]); | ||
940 | spin_unlock_irq(&conf->resync_lock); | ||
875 | } | 941 | } |
876 | 942 | ||
877 | static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) | 943 | static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) |
878 | { | 944 | { |
879 | sector_t sector = 0; | 945 | int idx = sector_to_idx(sector_nr); |
880 | 946 | ||
881 | spin_lock_irq(&conf->resync_lock); | 947 | /* |
882 | if (need_to_wait_for_sync(conf, bio)) { | 948 | * Very similar to _wait_barrier(). The difference is, for read |
883 | conf->nr_waiting++; | 949 | * I/O we don't need wait for sync I/O, but if the whole array |
884 | /* Wait for the barrier to drop. | 950 | * is frozen, the read I/O still has to wait until the array is |
885 | * However if there are already pending | 951 | * unfrozen. Since there is no ordering requirement with |
886 | * requests (preventing the barrier from | 952 | * conf->barrier[idx] here, memory barrier is unnecessary as well. |
887 | * rising completely), and the | 953 | */ |
888 | * per-process bio queue isn't empty, | 954 | atomic_inc(&conf->nr_pending[idx]); |
889 | * then don't wait, as we need to empty | ||
890 | * that queue to allow conf->start_next_window | ||
891 | * to increase. | ||
892 | */ | ||
893 | raid1_log(conf->mddev, "wait barrier"); | ||
894 | wait_event_lock_irq(conf->wait_barrier, | ||
895 | !conf->array_frozen && | ||
896 | (!conf->barrier || | ||
897 | ((conf->start_next_window < | ||
898 | conf->next_resync + RESYNC_SECTORS) && | ||
899 | current->bio_list && | ||
900 | !bio_list_empty(current->bio_list))), | ||
901 | conf->resync_lock); | ||
902 | conf->nr_waiting--; | ||
903 | } | ||
904 | |||
905 | if (bio && bio_data_dir(bio) == WRITE) { | ||
906 | if (bio->bi_iter.bi_sector >= conf->next_resync) { | ||
907 | if (conf->start_next_window == MaxSector) | ||
908 | conf->start_next_window = | ||
909 | conf->next_resync + | ||
910 | NEXT_NORMALIO_DISTANCE; | ||
911 | |||
912 | if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) | ||
913 | <= bio->bi_iter.bi_sector) | ||
914 | conf->next_window_requests++; | ||
915 | else | ||
916 | conf->current_window_requests++; | ||
917 | sector = conf->start_next_window; | ||
918 | } | ||
919 | } | ||
920 | 955 | ||
921 | conf->nr_pending++; | 956 | if (!READ_ONCE(conf->array_frozen)) |
957 | return; | ||
958 | |||
959 | spin_lock_irq(&conf->resync_lock); | ||
960 | atomic_inc(&conf->nr_waiting[idx]); | ||
961 | atomic_dec(&conf->nr_pending[idx]); | ||
962 | /* | ||
963 | * In case freeze_array() is waiting for | ||
964 | * get_unqueued_pending() == extra | ||
965 | */ | ||
966 | wake_up(&conf->wait_barrier); | ||
967 | /* Wait for array to be unfrozen */ | ||
968 | wait_event_lock_irq(conf->wait_barrier, | ||
969 | !conf->array_frozen, | ||
970 | conf->resync_lock); | ||
971 | atomic_inc(&conf->nr_pending[idx]); | ||
972 | atomic_dec(&conf->nr_waiting[idx]); | ||
922 | spin_unlock_irq(&conf->resync_lock); | 973 | spin_unlock_irq(&conf->resync_lock); |
923 | return sector; | ||
924 | } | 974 | } |
925 | 975 | ||
926 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | 976 | static void wait_barrier(struct r1conf *conf, sector_t sector_nr) |
927 | sector_t bi_sector) | ||
928 | { | 977 | { |
929 | unsigned long flags; | 978 | int idx = sector_to_idx(sector_nr); |
930 | 979 | ||
931 | spin_lock_irqsave(&conf->resync_lock, flags); | 980 | _wait_barrier(conf, idx); |
932 | conf->nr_pending--; | 981 | } |
933 | if (start_next_window) { | 982 | |
934 | if (start_next_window == conf->start_next_window) { | 983 | static void wait_all_barriers(struct r1conf *conf) |
935 | if (conf->start_next_window + NEXT_NORMALIO_DISTANCE | 984 | { |
936 | <= bi_sector) | 985 | int idx; |
937 | conf->next_window_requests--; | 986 | |
938 | else | 987 | for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) |
939 | conf->current_window_requests--; | 988 | _wait_barrier(conf, idx); |
940 | } else | 989 | } |
941 | conf->current_window_requests--; | 990 | |
942 | 991 | static void _allow_barrier(struct r1conf *conf, int idx) | |
943 | if (!conf->current_window_requests) { | 992 | { |
944 | if (conf->next_window_requests) { | 993 | atomic_dec(&conf->nr_pending[idx]); |
945 | conf->current_window_requests = | ||
946 | conf->next_window_requests; | ||
947 | conf->next_window_requests = 0; | ||
948 | conf->start_next_window += | ||
949 | NEXT_NORMALIO_DISTANCE; | ||
950 | } else | ||
951 | conf->start_next_window = MaxSector; | ||
952 | } | ||
953 | } | ||
954 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
955 | wake_up(&conf->wait_barrier); | 994 | wake_up(&conf->wait_barrier); |
956 | } | 995 | } |
957 | 996 | ||
997 | static void allow_barrier(struct r1conf *conf, sector_t sector_nr) | ||
998 | { | ||
999 | int idx = sector_to_idx(sector_nr); | ||
1000 | |||
1001 | _allow_barrier(conf, idx); | ||
1002 | } | ||
1003 | |||
1004 | static void allow_all_barriers(struct r1conf *conf) | ||
1005 | { | ||
1006 | int idx; | ||
1007 | |||
1008 | for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) | ||
1009 | _allow_barrier(conf, idx); | ||
1010 | } | ||
1011 | |||
1012 | /* conf->resync_lock should be held */ | ||
1013 | static int get_unqueued_pending(struct r1conf *conf) | ||
1014 | { | ||
1015 | int idx, ret; | ||
1016 | |||
1017 | for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) | ||
1018 | ret += atomic_read(&conf->nr_pending[idx]) - | ||
1019 | atomic_read(&conf->nr_queued[idx]); | ||
1020 | |||
1021 | return ret; | ||
1022 | } | ||
1023 | |||
958 | static void freeze_array(struct r1conf *conf, int extra) | 1024 | static void freeze_array(struct r1conf *conf, int extra) |
959 | { | 1025 | { |
960 | /* stop syncio and normal IO and wait for everything to | 1026 | /* Stop sync I/O and normal I/O and wait for everything to |
961 | * go quite. | 1027 | * go quite. |
962 | * We wait until nr_pending match nr_queued+extra | 1028 | * This is called in two situations: |
963 | * This is called in the context of one normal IO request | 1029 | * 1) management command handlers (reshape, remove disk, quiesce). |
964 | * that has failed. Thus any sync request that might be pending | 1030 | * 2) one normal I/O request failed. |
965 | * will be blocked by nr_pending, and we need to wait for | 1031 | |
966 | * pending IO requests to complete or be queued for re-try. | 1032 | * After array_frozen is set to 1, new sync IO will be blocked at |
967 | * Thus the number queued (nr_queued) plus this request (extra) | 1033 | * raise_barrier(), and new normal I/O will blocked at _wait_barrier() |
968 | * must match the number of pending IOs (nr_pending) before | 1034 | * or wait_read_barrier(). The flying I/Os will either complete or be |
969 | * we continue. | 1035 | * queued. When everything goes quite, there are only queued I/Os left. |
1036 | |||
1037 | * Every flying I/O contributes to a conf->nr_pending[idx], idx is the | ||
1038 | * barrier bucket index which this I/O request hits. When all sync and | ||
1039 | * normal I/O are queued, sum of all conf->nr_pending[] will match sum | ||
1040 | * of all conf->nr_queued[]. But normal I/O failure is an exception, | ||
1041 | * in handle_read_error(), we may call freeze_array() before trying to | ||
1042 | * fix the read error. In this case, the error read I/O is not queued, | ||
1043 | * so get_unqueued_pending() == 1. | ||
1044 | * | ||
1045 | * Therefore before this function returns, we need to wait until | ||
1046 | * get_unqueued_pending(conf) gets equal to extra. For | ||
1047 | * normal I/O context, extra is 1, in rested situations extra is 0. | ||
970 | */ | 1048 | */ |
971 | spin_lock_irq(&conf->resync_lock); | 1049 | spin_lock_irq(&conf->resync_lock); |
972 | conf->array_frozen = 1; | 1050 | conf->array_frozen = 1; |
973 | raid1_log(conf->mddev, "wait freeze"); | 1051 | raid1_log(conf->mddev, "wait freeze"); |
974 | wait_event_lock_irq_cmd(conf->wait_barrier, | 1052 | wait_event_lock_irq_cmd( |
975 | conf->nr_pending == conf->nr_queued+extra, | 1053 | conf->wait_barrier, |
976 | conf->resync_lock, | 1054 | get_unqueued_pending(conf) == extra, |
977 | flush_pending_writes(conf)); | 1055 | conf->resync_lock, |
1056 | flush_pending_writes(conf)); | ||
978 | spin_unlock_irq(&conf->resync_lock); | 1057 | spin_unlock_irq(&conf->resync_lock); |
979 | } | 1058 | } |
980 | static void unfreeze_array(struct r1conf *conf) | 1059 | static void unfreeze_array(struct r1conf *conf) |
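The comments added to raise_barrier() and _wait_barrier() above describe a symmetric increment-then-check handshake: each side bumps its own per-bucket counter, issues a full memory barrier, then reads the other side's counter, so at least one of the two is guaranteed to observe the other's increment. Below is a stripped-down model of that handshake in C11 atomics; the names, the busy-wait, and the boolean return are illustrative only, since the kernel code sleeps on conf->wait_barrier and retries under conf->resync_lock on the slow path.

#include <stdatomic.h>
#include <stdbool.h>

/* One bucket's counters, modeled on conf->barrier[idx] and conf->nr_pending[idx]. */
static atomic_int barrier_cnt;
static atomic_int nr_pending;

/* Resync side: block new normal I/O in this bucket, then wait for it to drain. */
static void raise_barrier_model(void)
{
        atomic_fetch_add(&barrier_cnt, 1);
        atomic_thread_fence(memory_order_seq_cst);      /* ~ smp_mb__after_atomic() */
        while (atomic_load(&nr_pending) != 0)
                ;                                       /* kernel sleeps instead of spinning */
}

/* Normal I/O side: announce the request, back off if resync owns the bucket. */
static bool wait_barrier_model(void)
{
        atomic_fetch_add(&nr_pending, 1);
        atomic_thread_fence(memory_order_seq_cst);      /* ~ smp_mb__after_atomic() */
        if (atomic_load(&barrier_cnt) == 0)
                return true;            /* fast path: no resync here, no spinlock taken */
        atomic_fetch_sub(&nr_pending, 1);
        return false;                   /* slow path: retry under conf->resync_lock */
}

int main(void)
{
        (void)wait_barrier_model();     /* fast path succeeds: no barrier raised */
        atomic_fetch_sub(&nr_pending, 1);
        raise_barrier_model();          /* returns once nr_pending has drained to 0 */
        return 0;
}

Whichever side increments second necessarily sees the other's non-zero counter, so the lock-free fast path in _wait_barrier() cannot slip past an in-progress raise_barrier() on the same bucket.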
@@ -982,8 +1061,8 @@ static void unfreeze_array(struct r1conf *conf) | |||
982 | /* reverse the effect of the freeze */ | 1061 | /* reverse the effect of the freeze */ |
983 | spin_lock_irq(&conf->resync_lock); | 1062 | spin_lock_irq(&conf->resync_lock); |
984 | conf->array_frozen = 0; | 1063 | conf->array_frozen = 0; |
985 | wake_up(&conf->wait_barrier); | ||
986 | spin_unlock_irq(&conf->resync_lock); | 1064 | spin_unlock_irq(&conf->resync_lock); |
1065 | wake_up(&conf->wait_barrier); | ||
987 | } | 1066 | } |
988 | 1067 | ||
989 | /* duplicate the data pages for behind I/O | 1068 | /* duplicate the data pages for behind I/O |
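With per-bucket counters, the freeze condition reworked above becomes a sum: freeze_array() waits until nr_pending[idx] - nr_queued[idx], totalled over all buckets, equals extra (1 when called from a failed normal I/O that is itself neither completed nor queued, 0 otherwise). A compact restatement of that invariant, with the bucket count assumed as in the earlier sketch:

#include <stdatomic.h>
#include <stdbool.h>

#define BARRIER_BUCKETS_NR 1024         /* assumed bucket count, see the earlier sketch */

static atomic_int nr_pending[BARRIER_BUCKETS_NR];
static atomic_int nr_queued[BARRIER_BUCKETS_NR];

/* In-flight I/Os that have not been parked on a retry/bio_end_io list. */
static int get_unqueued_pending_model(void)
{
        int ret = 0;

        for (int idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
                ret += atomic_load(&nr_pending[idx]) - atomic_load(&nr_queued[idx]);
        return ret;
}

/* freeze_array() may return once only 'extra' unqueued requests remain. */
static bool array_quiesced(int extra)
{
        return get_unqueued_pending_model() == extra;
}

int main(void)
{
        return array_quiesced(0) ? 0 : 1;       /* trivially true with no I/O in flight */
}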
@@ -1070,11 +1149,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
1070 | kfree(plug); | 1149 | kfree(plug); |
1071 | } | 1150 | } |
1072 | 1151 | ||
1073 | static void raid1_read_request(struct mddev *mddev, struct bio *bio, | 1152 | static inline struct r1bio * |
1074 | struct r1bio *r1_bio) | 1153 | alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) |
1154 | { | ||
1155 | struct r1conf *conf = mddev->private; | ||
1156 | struct r1bio *r1_bio; | ||
1157 | |||
1158 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
1159 | |||
1160 | r1_bio->master_bio = bio; | ||
1161 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
1162 | r1_bio->state = 0; | ||
1163 | r1_bio->mddev = mddev; | ||
1164 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
1165 | |||
1166 | return r1_bio; | ||
1167 | } | ||
1168 | |||
1169 | static void raid1_read_request(struct mddev *mddev, struct bio *bio) | ||
1075 | { | 1170 | { |
1076 | struct r1conf *conf = mddev->private; | 1171 | struct r1conf *conf = mddev->private; |
1077 | struct raid1_info *mirror; | 1172 | struct raid1_info *mirror; |
1173 | struct r1bio *r1_bio; | ||
1078 | struct bio *read_bio; | 1174 | struct bio *read_bio; |
1079 | struct bitmap *bitmap = mddev->bitmap; | 1175 | struct bitmap *bitmap = mddev->bitmap; |
1080 | const int op = bio_op(bio); | 1176 | const int op = bio_op(bio); |
@@ -1083,8 +1179,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, | |||
1083 | int max_sectors; | 1179 | int max_sectors; |
1084 | int rdisk; | 1180 | int rdisk; |
1085 | 1181 | ||
1086 | wait_barrier(conf, bio); | 1182 | /* |
1183 | * Still need barrier for READ in case that whole | ||
1184 | * array is frozen. | ||
1185 | */ | ||
1186 | wait_read_barrier(conf, bio->bi_iter.bi_sector); | ||
1187 | |||
1188 | r1_bio = alloc_r1bio(mddev, bio, 0); | ||
1087 | 1189 | ||
1190 | /* | ||
1191 | * We might need to issue multiple reads to different | ||
1192 | * devices if there are bad blocks around, so we keep | ||
1193 | * track of the number of reads in bio->bi_phys_segments. | ||
1194 | * If this is 0, there is only one r1_bio and no locking | ||
1195 | * will be needed when requests complete. If it is | ||
1196 | * non-zero, then it is the number of not-completed requests. | ||
1197 | */ | ||
1198 | bio->bi_phys_segments = 0; | ||
1199 | bio_clear_flag(bio, BIO_SEG_VALID); | ||
1200 | |||
1201 | /* | ||
1202 | * make_request() can abort the operation when read-ahead is being | ||
1203 | * used and no empty request is available. | ||
1204 | */ | ||
1088 | read_again: | 1205 | read_again: |
1089 | rdisk = read_balance(conf, r1_bio, &max_sectors); | 1206 | rdisk = read_balance(conf, r1_bio, &max_sectors); |
1090 | 1207 | ||
@@ -1106,9 +1223,8 @@ read_again: | |||
1106 | atomic_read(&bitmap->behind_writes) == 0); | 1223 | atomic_read(&bitmap->behind_writes) == 0); |
1107 | } | 1224 | } |
1108 | r1_bio->read_disk = rdisk; | 1225 | r1_bio->read_disk = rdisk; |
1109 | r1_bio->start_next_window = 0; | ||
1110 | 1226 | ||
1111 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1227 | read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); |
1112 | bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, | 1228 | bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, |
1113 | max_sectors); | 1229 | max_sectors); |
1114 | 1230 | ||
@@ -1151,22 +1267,16 @@ read_again: | |||
1151 | */ | 1267 | */ |
1152 | reschedule_retry(r1_bio); | 1268 | reschedule_retry(r1_bio); |
1153 | 1269 | ||
1154 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 1270 | r1_bio = alloc_r1bio(mddev, bio, sectors_handled); |
1155 | |||
1156 | r1_bio->master_bio = bio; | ||
1157 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
1158 | r1_bio->state = 0; | ||
1159 | r1_bio->mddev = mddev; | ||
1160 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
1161 | goto read_again; | 1271 | goto read_again; |
1162 | } else | 1272 | } else |
1163 | generic_make_request(read_bio); | 1273 | generic_make_request(read_bio); |
1164 | } | 1274 | } |
1165 | 1275 | ||
1166 | static void raid1_write_request(struct mddev *mddev, struct bio *bio, | 1276 | static void raid1_write_request(struct mddev *mddev, struct bio *bio) |
1167 | struct r1bio *r1_bio) | ||
1168 | { | 1277 | { |
1169 | struct r1conf *conf = mddev->private; | 1278 | struct r1conf *conf = mddev->private; |
1279 | struct r1bio *r1_bio; | ||
1170 | int i, disks; | 1280 | int i, disks; |
1171 | struct bitmap *bitmap = mddev->bitmap; | 1281 | struct bitmap *bitmap = mddev->bitmap; |
1172 | unsigned long flags; | 1282 | unsigned long flags; |
@@ -1176,7 +1286,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
1176 | int first_clone; | 1286 | int first_clone; |
1177 | int sectors_handled; | 1287 | int sectors_handled; |
1178 | int max_sectors; | 1288 | int max_sectors; |
1179 | sector_t start_next_window; | ||
1180 | 1289 | ||
1181 | /* | 1290 | /* |
1182 | * Register the new request and wait if the reconstruction | 1291 | * Register the new request and wait if the reconstruction |
@@ -1212,7 +1321,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
1212 | } | 1321 | } |
1213 | finish_wait(&conf->wait_barrier, &w); | 1322 | finish_wait(&conf->wait_barrier, &w); |
1214 | } | 1323 | } |
1215 | start_next_window = wait_barrier(conf, bio); | 1324 | wait_barrier(conf, bio->bi_iter.bi_sector); |
1325 | |||
1326 | r1_bio = alloc_r1bio(mddev, bio, 0); | ||
1327 | |||
1328 | /* We might need to issue multiple writes to different | ||
1329 | * devices if there are bad blocks around, so we keep | ||
1330 | * track of the number of writes in bio->bi_phys_segments. | ||
1331 | * If this is 0, there is only one r1_bio and no locking | ||
1332 | * will be needed when requests complete. If it is | ||
1333 | * non-zero, then it is the number of not-completed requests. | ||
1334 | */ | ||
1335 | bio->bi_phys_segments = 0; | ||
1336 | bio_clear_flag(bio, BIO_SEG_VALID); | ||
1216 | 1337 | ||
1217 | if (conf->pending_count >= max_queued_requests) { | 1338 | if (conf->pending_count >= max_queued_requests) { |
1218 | md_wakeup_thread(mddev->thread); | 1339 | md_wakeup_thread(mddev->thread); |
@@ -1233,7 +1354,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
1233 | 1354 | ||
1234 | disks = conf->raid_disks * 2; | 1355 | disks = conf->raid_disks * 2; |
1235 | retry_write: | 1356 | retry_write: |
1236 | r1_bio->start_next_window = start_next_window; | ||
1237 | blocked_rdev = NULL; | 1357 | blocked_rdev = NULL; |
1238 | rcu_read_lock(); | 1358 | rcu_read_lock(); |
1239 | max_sectors = r1_bio->sectors; | 1359 | max_sectors = r1_bio->sectors; |
@@ -1300,25 +1420,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
1300 | if (unlikely(blocked_rdev)) { | 1420 | if (unlikely(blocked_rdev)) { |
1301 | /* Wait for this device to become unblocked */ | 1421 | /* Wait for this device to become unblocked */ |
1302 | int j; | 1422 | int j; |
1303 | sector_t old = start_next_window; | ||
1304 | 1423 | ||
1305 | for (j = 0; j < i; j++) | 1424 | for (j = 0; j < i; j++) |
1306 | if (r1_bio->bios[j]) | 1425 | if (r1_bio->bios[j]) |
1307 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1426 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
1308 | r1_bio->state = 0; | 1427 | r1_bio->state = 0; |
1309 | allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); | 1428 | allow_barrier(conf, bio->bi_iter.bi_sector); |
1310 | raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); | 1429 | raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); |
1311 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1430 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
1312 | start_next_window = wait_barrier(conf, bio); | 1431 | wait_barrier(conf, bio->bi_iter.bi_sector); |
1313 | /* | ||
1314 | * We must make sure the multi r1bios of bio have | ||
1315 | * the same value of bi_phys_segments | ||
1316 | */ | ||
1317 | if (bio->bi_phys_segments && old && | ||
1318 | old != start_next_window) | ||
1319 | /* Wait for the former r1bio(s) to complete */ | ||
1320 | wait_event(conf->wait_barrier, | ||
1321 | bio->bi_phys_segments == 1); | ||
1322 | goto retry_write; | 1432 | goto retry_write; |
1323 | } | 1433 | } |
1324 | 1434 | ||
@@ -1341,13 +1451,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
1341 | 1451 | ||
1342 | first_clone = 1; | 1452 | first_clone = 1; |
1343 | for (i = 0; i < disks; i++) { | 1453 | for (i = 0; i < disks; i++) { |
1344 | struct bio *mbio; | 1454 | struct bio *mbio = NULL; |
1455 | sector_t offset; | ||
1345 | if (!r1_bio->bios[i]) | 1456 | if (!r1_bio->bios[i]) |
1346 | continue; | 1457 | continue; |
1347 | 1458 | ||
1348 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1459 | offset = r1_bio->sector - bio->bi_iter.bi_sector; |
1349 | bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, | ||
1350 | max_sectors); | ||
1351 | 1460 | ||
1352 | if (first_clone) { | 1461 | if (first_clone) { |
1353 | /* do behind I/O ? | 1462 | /* do behind I/O ? |
@@ -1357,8 +1466,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
1357 | if (bitmap && | 1466 | if (bitmap && |
1358 | (atomic_read(&bitmap->behind_writes) | 1467 | (atomic_read(&bitmap->behind_writes) |
1359 | < mddev->bitmap_info.max_write_behind) && | 1468 | < mddev->bitmap_info.max_write_behind) && |
1360 | !waitqueue_active(&bitmap->behind_wait)) | 1469 | !waitqueue_active(&bitmap->behind_wait)) { |
1470 | mbio = bio_clone_bioset_partial(bio, GFP_NOIO, | ||
1471 | mddev->bio_set, | ||
1472 | offset << 9, | ||
1473 | max_sectors << 9); | ||
1361 | alloc_behind_pages(mbio, r1_bio); | 1474 | alloc_behind_pages(mbio, r1_bio); |
1475 | } | ||
1362 | 1476 | ||
1363 | bitmap_startwrite(bitmap, r1_bio->sector, | 1477 | bitmap_startwrite(bitmap, r1_bio->sector, |
1364 | r1_bio->sectors, | 1478 | r1_bio->sectors, |
@@ -1366,6 +1480,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
1366 | &r1_bio->state)); | 1480 | &r1_bio->state)); |
1367 | first_clone = 0; | 1481 | first_clone = 0; |
1368 | } | 1482 | } |
1483 | |||
1484 | if (!mbio) { | ||
1485 | if (r1_bio->behind_bvecs) | ||
1486 | mbio = bio_clone_bioset_partial(bio, GFP_NOIO, | ||
1487 | mddev->bio_set, | ||
1488 | offset << 9, | ||
1489 | max_sectors << 9); | ||
1490 | else { | ||
1491 | mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); | ||
1492 | bio_trim(mbio, offset, max_sectors); | ||
1493 | } | ||
1494 | } | ||
1495 | |||
1369 | if (r1_bio->behind_bvecs) { | 1496 | if (r1_bio->behind_bvecs) { |
1370 | struct bio_vec *bvec; | 1497 | struct bio_vec *bvec; |
1371 | int j; | 1498 | int j; |
@@ -1385,8 +1512,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
1385 | conf->mirrors[i].rdev->data_offset); | 1512 | conf->mirrors[i].rdev->data_offset); |
1386 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1513 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
1387 | mbio->bi_end_io = raid1_end_write_request; | 1514 | mbio->bi_end_io = raid1_end_write_request; |
1388 | mbio->bi_opf = bio_op(bio) | | 1515 | mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); |
1389 | (bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA)); | ||
1390 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && | 1516 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && |
1391 | !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && | 1517 | !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && |
1392 | conf->raid_disks - mddev->degraded > 1) | 1518 | conf->raid_disks - mddev->degraded > 1) |
@@ -1427,12 +1553,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
1427 | /* We need another r1_bio. It has already been counted | 1553 | /* We need another r1_bio. It has already been counted |
1428 | * in bio->bi_phys_segments | 1554 | * in bio->bi_phys_segments |
1429 | */ | 1555 | */ |
1430 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 1556 | r1_bio = alloc_r1bio(mddev, bio, sectors_handled); |
1431 | r1_bio->master_bio = bio; | ||
1432 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
1433 | r1_bio->state = 0; | ||
1434 | r1_bio->mddev = mddev; | ||
1435 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
1436 | goto retry_write; | 1557 | goto retry_write; |
1437 | } | 1558 | } |
1438 | 1559 | ||
@@ -1444,36 +1565,30 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
1444 | 1565 | ||
1445 | static void raid1_make_request(struct mddev *mddev, struct bio *bio) | 1566 | static void raid1_make_request(struct mddev *mddev, struct bio *bio) |
1446 | { | 1567 | { |
1447 | struct r1conf *conf = mddev->private; | 1568 | struct bio *split; |
1448 | struct r1bio *r1_bio; | 1569 | sector_t sectors; |
1449 | 1570 | ||
1450 | /* | 1571 | if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { |
1451 | * make_request() can abort the operation when read-ahead is being | 1572 | md_flush_request(mddev, bio); |
1452 | * used and no empty request is available. | 1573 | return; |
1453 | * | 1574 | } |
1454 | */ | ||
1455 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
1456 | |||
1457 | r1_bio->master_bio = bio; | ||
1458 | r1_bio->sectors = bio_sectors(bio); | ||
1459 | r1_bio->state = 0; | ||
1460 | r1_bio->mddev = mddev; | ||
1461 | r1_bio->sector = bio->bi_iter.bi_sector; | ||
1462 | 1575 | ||
1463 | /* | 1576 | /* if bio exceeds barrier unit boundary, split it */ |
1464 | * We might need to issue multiple reads to different devices if there | 1577 | do { |
1465 | * are bad blocks around, so we keep track of the number of reads in | 1578 | sectors = align_to_barrier_unit_end( |
1466 | * bio->bi_phys_segments. If this is 0, there is only one r1_bio and | 1579 | bio->bi_iter.bi_sector, bio_sectors(bio)); |
1467 | * no locking will be needed when requests complete. If it is | 1580 | if (sectors < bio_sectors(bio)) { |
1468 | * non-zero, then it is the number of not-completed requests. | 1581 | split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); |
1469 | */ | 1582 | bio_chain(split, bio); |
1470 | bio->bi_phys_segments = 0; | 1583 | } else { |
1471 | bio_clear_flag(bio, BIO_SEG_VALID); | 1584 | split = bio; |
1585 | } | ||
1472 | 1586 | ||
1473 | if (bio_data_dir(bio) == READ) | 1587 | if (bio_data_dir(split) == READ) |
1474 | raid1_read_request(mddev, bio, r1_bio); | 1588 | raid1_read_request(mddev, split); |
1475 | else | 1589 | else |
1476 | raid1_write_request(mddev, bio, r1_bio); | 1590 | raid1_write_request(mddev, split); |
1591 | } while (split != bio); | ||
1477 | } | 1592 | } |
1478 | 1593 | ||
1479 | static void raid1_status(struct seq_file *seq, struct mddev *mddev) | 1594 | static void raid1_status(struct seq_file *seq, struct mddev *mddev) |
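raid1_make_request() above now guarantees that nothing handed to the read or write path straddles a barrier-unit boundary: the do/while loop repeatedly carves off the leading piece that still fits in the current unit via bio_split() and chains it to the parent bio. The example below walks through the loop arithmetic only, with no block-layer calls; the 1 << 17 sector unit is the same assumption as in the earlier sketches.

#include <stdint.h>
#include <stdio.h>

#define BARRIER_UNIT_SECTOR_SIZE (1ULL << 17)  /* assumed: 64 MiB in 512-byte sectors */

static uint64_t align_to_barrier_unit_end(uint64_t start, uint64_t sectors)
{
        uint64_t len = (start / BARRIER_UNIT_SECTOR_SIZE + 1) *
                       BARRIER_UNIT_SECTOR_SIZE - start;

        return len > sectors ? sectors : len;
}

int main(void)
{
        /* a 1 MiB (2048-sector) write starting 512 sectors before a unit boundary */
        uint64_t sector = BARRIER_UNIT_SECTOR_SIZE - 512;
        uint64_t left = 2048;

        while (left) {
                uint64_t chunk = align_to_barrier_unit_end(sector, left);

                /* in the kernel: bio_split() + bio_chain() whenever chunk < left */
                printf("submit %llu sectors at %llu\n",
                       (unsigned long long)chunk, (unsigned long long)sector);
                sector += chunk;
                left -= chunk;
        }
        return 0;
}

The 2048-sector write is submitted as a 512-sector piece ending exactly at the unit boundary followed by a 1536-sector piece in the next unit, so each piece maps to exactly one barrier bucket.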
@@ -1564,19 +1679,11 @@ static void print_conf(struct r1conf *conf) | |||
1564 | 1679 | ||
1565 | static void close_sync(struct r1conf *conf) | 1680 | static void close_sync(struct r1conf *conf) |
1566 | { | 1681 | { |
1567 | wait_barrier(conf, NULL); | 1682 | wait_all_barriers(conf); |
1568 | allow_barrier(conf, 0, 0); | 1683 | allow_all_barriers(conf); |
1569 | 1684 | ||
1570 | mempool_destroy(conf->r1buf_pool); | 1685 | mempool_destroy(conf->r1buf_pool); |
1571 | conf->r1buf_pool = NULL; | 1686 | conf->r1buf_pool = NULL; |
1572 | |||
1573 | spin_lock_irq(&conf->resync_lock); | ||
1574 | conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE; | ||
1575 | conf->start_next_window = MaxSector; | ||
1576 | conf->current_window_requests += | ||
1577 | conf->next_window_requests; | ||
1578 | conf->next_window_requests = 0; | ||
1579 | spin_unlock_irq(&conf->resync_lock); | ||
1580 | } | 1687 | } |
1581 | 1688 | ||
1582 | static int raid1_spare_active(struct mddev *mddev) | 1689 | static int raid1_spare_active(struct mddev *mddev) |
@@ -2273,7 +2380,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) | |||
2273 | 2380 | ||
2274 | wbio->bi_vcnt = vcnt; | 2381 | wbio->bi_vcnt = vcnt; |
2275 | } else { | 2382 | } else { |
2276 | wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | 2383 | wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, |
2384 | mddev->bio_set); | ||
2277 | } | 2385 | } |
2278 | 2386 | ||
2279 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); | 2387 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); |
@@ -2323,8 +2431,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio | |||
2323 | 2431 | ||
2324 | static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | 2432 | static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) |
2325 | { | 2433 | { |
2326 | int m; | 2434 | int m, idx; |
2327 | bool fail = false; | 2435 | bool fail = false; |
2436 | |||
2328 | for (m = 0; m < conf->raid_disks * 2 ; m++) | 2437 | for (m = 0; m < conf->raid_disks * 2 ; m++) |
2329 | if (r1_bio->bios[m] == IO_MADE_GOOD) { | 2438 | if (r1_bio->bios[m] == IO_MADE_GOOD) { |
2330 | struct md_rdev *rdev = conf->mirrors[m].rdev; | 2439 | struct md_rdev *rdev = conf->mirrors[m].rdev; |
@@ -2350,8 +2459,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | |||
2350 | if (fail) { | 2459 | if (fail) { |
2351 | spin_lock_irq(&conf->device_lock); | 2460 | spin_lock_irq(&conf->device_lock); |
2352 | list_add(&r1_bio->retry_list, &conf->bio_end_io_list); | 2461 | list_add(&r1_bio->retry_list, &conf->bio_end_io_list); |
2353 | conf->nr_queued++; | 2462 | idx = sector_to_idx(r1_bio->sector); |
2463 | atomic_inc(&conf->nr_queued[idx]); | ||
2354 | spin_unlock_irq(&conf->device_lock); | 2464 | spin_unlock_irq(&conf->device_lock); |
2465 | /* | ||
2466 | * In case freeze_array() is waiting for condition | ||
2467 | * get_unqueued_pending() == extra to be true. | ||
2468 | */ | ||
2469 | wake_up(&conf->wait_barrier); | ||
2355 | md_wakeup_thread(conf->mddev->thread); | 2470 | md_wakeup_thread(conf->mddev->thread); |
2356 | } else { | 2471 | } else { |
2357 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | 2472 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
@@ -2411,7 +2526,8 @@ read_more: | |||
2411 | const unsigned long do_sync | 2526 | const unsigned long do_sync |
2412 | = r1_bio->master_bio->bi_opf & REQ_SYNC; | 2527 | = r1_bio->master_bio->bi_opf & REQ_SYNC; |
2413 | r1_bio->read_disk = disk; | 2528 | r1_bio->read_disk = disk; |
2414 | bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | 2529 | bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, |
2530 | mddev->bio_set); | ||
2415 | bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, | 2531 | bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, |
2416 | max_sectors); | 2532 | max_sectors); |
2417 | r1_bio->bios[r1_bio->read_disk] = bio; | 2533 | r1_bio->bios[r1_bio->read_disk] = bio; |
@@ -2445,15 +2561,8 @@ read_more: | |||
2445 | generic_make_request(bio); | 2561 | generic_make_request(bio); |
2446 | bio = NULL; | 2562 | bio = NULL; |
2447 | 2563 | ||
2448 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 2564 | r1_bio = alloc_r1bio(mddev, mbio, sectors_handled); |
2449 | |||
2450 | r1_bio->master_bio = mbio; | ||
2451 | r1_bio->sectors = bio_sectors(mbio) - sectors_handled; | ||
2452 | r1_bio->state = 0; | ||
2453 | set_bit(R1BIO_ReadError, &r1_bio->state); | 2565 | set_bit(R1BIO_ReadError, &r1_bio->state); |
2454 | r1_bio->mddev = mddev; | ||
2455 | r1_bio->sector = mbio->bi_iter.bi_sector + | ||
2456 | sectors_handled; | ||
2457 | 2566 | ||
2458 | goto read_more; | 2567 | goto read_more; |
2459 | } else { | 2568 | } else { |
@@ -2472,6 +2581,7 @@ static void raid1d(struct md_thread *thread) | |||
2472 | struct r1conf *conf = mddev->private; | 2581 | struct r1conf *conf = mddev->private; |
2473 | struct list_head *head = &conf->retry_list; | 2582 | struct list_head *head = &conf->retry_list; |
2474 | struct blk_plug plug; | 2583 | struct blk_plug plug; |
2584 | int idx; | ||
2475 | 2585 | ||
2476 | md_check_recovery(mddev); | 2586 | md_check_recovery(mddev); |
2477 | 2587 | ||
@@ -2479,17 +2589,15 @@ static void raid1d(struct md_thread *thread) | |||
2479 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { | 2589 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
2480 | LIST_HEAD(tmp); | 2590 | LIST_HEAD(tmp); |
2481 | spin_lock_irqsave(&conf->device_lock, flags); | 2591 | spin_lock_irqsave(&conf->device_lock, flags); |
2482 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { | 2592 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
2483 | while (!list_empty(&conf->bio_end_io_list)) { | 2593 | list_splice_init(&conf->bio_end_io_list, &tmp); |
2484 | list_move(conf->bio_end_io_list.prev, &tmp); | ||
2485 | conf->nr_queued--; | ||
2486 | } | ||
2487 | } | ||
2488 | spin_unlock_irqrestore(&conf->device_lock, flags); | 2594 | spin_unlock_irqrestore(&conf->device_lock, flags); |
2489 | while (!list_empty(&tmp)) { | 2595 | while (!list_empty(&tmp)) { |
2490 | r1_bio = list_first_entry(&tmp, struct r1bio, | 2596 | r1_bio = list_first_entry(&tmp, struct r1bio, |
2491 | retry_list); | 2597 | retry_list); |
2492 | list_del(&r1_bio->retry_list); | 2598 | list_del(&r1_bio->retry_list); |
2599 | idx = sector_to_idx(r1_bio->sector); | ||
2600 | atomic_dec(&conf->nr_queued[idx]); | ||
2493 | if (mddev->degraded) | 2601 | if (mddev->degraded) |
2494 | set_bit(R1BIO_Degraded, &r1_bio->state); | 2602 | set_bit(R1BIO_Degraded, &r1_bio->state); |
2495 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | 2603 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
@@ -2510,7 +2618,8 @@ static void raid1d(struct md_thread *thread) | |||
2510 | } | 2618 | } |
2511 | r1_bio = list_entry(head->prev, struct r1bio, retry_list); | 2619 | r1_bio = list_entry(head->prev, struct r1bio, retry_list); |
2512 | list_del(head->prev); | 2620 | list_del(head->prev); |
2513 | conf->nr_queued--; | 2621 | idx = sector_to_idx(r1_bio->sector); |
2622 | atomic_dec(&conf->nr_queued[idx]); | ||
2514 | spin_unlock_irqrestore(&conf->device_lock, flags); | 2623 | spin_unlock_irqrestore(&conf->device_lock, flags); |
2515 | 2624 | ||
2516 | mddev = r1_bio->mddev; | 2625 | mddev = r1_bio->mddev; |
@@ -2549,7 +2658,6 @@ static int init_resync(struct r1conf *conf) | |||
2549 | conf->poolinfo); | 2658 | conf->poolinfo); |
2550 | if (!conf->r1buf_pool) | 2659 | if (!conf->r1buf_pool) |
2551 | return -ENOMEM; | 2660 | return -ENOMEM; |
2552 | conf->next_resync = 0; | ||
2553 | return 0; | 2661 | return 0; |
2554 | } | 2662 | } |
2555 | 2663 | ||
@@ -2578,6 +2686,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2578 | int still_degraded = 0; | 2686 | int still_degraded = 0; |
2579 | int good_sectors = RESYNC_SECTORS; | 2687 | int good_sectors = RESYNC_SECTORS; |
2580 | int min_bad = 0; /* number of sectors that are bad in all devices */ | 2688 | int min_bad = 0; /* number of sectors that are bad in all devices */ |
2689 | int idx = sector_to_idx(sector_nr); | ||
2581 | 2690 | ||
2582 | if (!conf->r1buf_pool) | 2691 | if (!conf->r1buf_pool) |
2583 | if (init_resync(conf)) | 2692 | if (init_resync(conf)) |
@@ -2627,7 +2736,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2627 | * If there is non-resync activity waiting for a turn, then let it | 2736 | * If there is non-resync activity waiting for a turn, then let it |
2628 | * though before starting on this new sync request. | 2737 | * though before starting on this new sync request. |
2629 | */ | 2738 | */ |
2630 | if (conf->nr_waiting) | 2739 | if (atomic_read(&conf->nr_waiting[idx])) |
2631 | schedule_timeout_uninterruptible(1); | 2740 | schedule_timeout_uninterruptible(1); |
2632 | 2741 | ||
2633 | /* we are incrementing sector_nr below. To be safe, we check against | 2742 | /* we are incrementing sector_nr below. To be safe, we check against |
@@ -2654,6 +2763,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2654 | r1_bio->sector = sector_nr; | 2763 | r1_bio->sector = sector_nr; |
2655 | r1_bio->state = 0; | 2764 | r1_bio->state = 0; |
2656 | set_bit(R1BIO_IsSync, &r1_bio->state); | 2765 | set_bit(R1BIO_IsSync, &r1_bio->state); |
2766 | /* make sure good_sectors won't go across barrier unit boundary */ | ||
2767 | good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors); | ||
2657 | 2768 | ||
2658 | for (i = 0; i < conf->raid_disks * 2; i++) { | 2769 | for (i = 0; i < conf->raid_disks * 2; i++) { |
2659 | struct md_rdev *rdev; | 2770 | struct md_rdev *rdev; |
@@ -2884,6 +2995,26 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2884 | if (!conf) | 2995 | if (!conf) |
2885 | goto abort; | 2996 | goto abort; |
2886 | 2997 | ||
2998 | conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR, | ||
2999 | sizeof(atomic_t), GFP_KERNEL); | ||
3000 | if (!conf->nr_pending) | ||
3001 | goto abort; | ||
3002 | |||
3003 | conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR, | ||
3004 | sizeof(atomic_t), GFP_KERNEL); | ||
3005 | if (!conf->nr_waiting) | ||
3006 | goto abort; | ||
3007 | |||
3008 | conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR, | ||
3009 | sizeof(atomic_t), GFP_KERNEL); | ||
3010 | if (!conf->nr_queued) | ||
3011 | goto abort; | ||
3012 | |||
3013 | conf->barrier = kcalloc(BARRIER_BUCKETS_NR, | ||
3014 | sizeof(atomic_t), GFP_KERNEL); | ||
3015 | if (!conf->barrier) | ||
3016 | goto abort; | ||
3017 | |||
2887 | conf->mirrors = kzalloc(sizeof(struct raid1_info) | 3018 | conf->mirrors = kzalloc(sizeof(struct raid1_info) |
2888 | * mddev->raid_disks * 2, | 3019 | * mddev->raid_disks * 2, |
2889 | GFP_KERNEL); | 3020 | GFP_KERNEL); |
@@ -2939,9 +3070,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2939 | conf->pending_count = 0; | 3070 | conf->pending_count = 0; |
2940 | conf->recovery_disabled = mddev->recovery_disabled - 1; | 3071 | conf->recovery_disabled = mddev->recovery_disabled - 1; |
2941 | 3072 | ||
2942 | conf->start_next_window = MaxSector; | ||
2943 | conf->current_window_requests = conf->next_window_requests = 0; | ||
2944 | |||
2945 | err = -EIO; | 3073 | err = -EIO; |
2946 | for (i = 0; i < conf->raid_disks * 2; i++) { | 3074 | for (i = 0; i < conf->raid_disks * 2; i++) { |
2947 | 3075 | ||
@@ -2984,6 +3112,10 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2984 | kfree(conf->mirrors); | 3112 | kfree(conf->mirrors); |
2985 | safe_put_page(conf->tmppage); | 3113 | safe_put_page(conf->tmppage); |
2986 | kfree(conf->poolinfo); | 3114 | kfree(conf->poolinfo); |
3115 | kfree(conf->nr_pending); | ||
3116 | kfree(conf->nr_waiting); | ||
3117 | kfree(conf->nr_queued); | ||
3118 | kfree(conf->barrier); | ||
2987 | kfree(conf); | 3119 | kfree(conf); |
2988 | } | 3120 | } |
2989 | return ERR_PTR(err); | 3121 | return ERR_PTR(err); |
@@ -3085,6 +3217,10 @@ static void raid1_free(struct mddev *mddev, void *priv) | |||
3085 | kfree(conf->mirrors); | 3217 | kfree(conf->mirrors); |
3086 | safe_put_page(conf->tmppage); | 3218 | safe_put_page(conf->tmppage); |
3087 | kfree(conf->poolinfo); | 3219 | kfree(conf->poolinfo); |
3220 | kfree(conf->nr_pending); | ||
3221 | kfree(conf->nr_waiting); | ||
3222 | kfree(conf->nr_queued); | ||
3223 | kfree(conf->barrier); | ||
3088 | kfree(conf); | 3224 | kfree(conf); |
3089 | } | 3225 | } |
3090 | 3226 | ||