Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--   drivers/md/raid1.c | 596
1 file changed, 366 insertions(+), 230 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 830ff2b20346..7453d94eeed7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
| @@ -71,9 +71,8 @@ | |||
| 71 | */ | 71 | */ |
| 72 | static int max_queued_requests = 1024; | 72 | static int max_queued_requests = 1024; |
| 73 | 73 | ||
| 74 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | 74 | static void allow_barrier(struct r1conf *conf, sector_t sector_nr); |
| 75 | sector_t bi_sector); | 75 | static void lower_barrier(struct r1conf *conf, sector_t sector_nr); |
| 76 | static void lower_barrier(struct r1conf *conf); | ||
| 77 | 76 | ||
| 78 | #define raid1_log(md, fmt, args...) \ | 77 | #define raid1_log(md, fmt, args...) \ |
| 79 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) | 78 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) |
| @@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) | |||
| 100 | #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) | 99 | #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) |
| 101 | #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) | 100 | #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) |
| 102 | #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) | 101 | #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) |
| 103 | #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) | ||
| 104 | 102 | ||
| 105 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | 103 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) |
| 106 | { | 104 | { |
| @@ -205,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio) | |||
| 205 | static void put_buf(struct r1bio *r1_bio) | 203 | static void put_buf(struct r1bio *r1_bio) |
| 206 | { | 204 | { |
| 207 | struct r1conf *conf = r1_bio->mddev->private; | 205 | struct r1conf *conf = r1_bio->mddev->private; |
| 206 | sector_t sect = r1_bio->sector; | ||
| 208 | int i; | 207 | int i; |
| 209 | 208 | ||
| 210 | for (i = 0; i < conf->raid_disks * 2; i++) { | 209 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| @@ -215,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio) | |||
| 215 | 214 | ||
| 216 | mempool_free(r1_bio, conf->r1buf_pool); | 215 | mempool_free(r1_bio, conf->r1buf_pool); |
| 217 | 216 | ||
| 218 | lower_barrier(conf); | 217 | lower_barrier(conf, sect); |
| 219 | } | 218 | } |
| 220 | 219 | ||
| 221 | static void reschedule_retry(struct r1bio *r1_bio) | 220 | static void reschedule_retry(struct r1bio *r1_bio) |
| @@ -223,10 +222,12 @@ static void reschedule_retry(struct r1bio *r1_bio) | |||
| 223 | unsigned long flags; | 222 | unsigned long flags; |
| 224 | struct mddev *mddev = r1_bio->mddev; | 223 | struct mddev *mddev = r1_bio->mddev; |
| 225 | struct r1conf *conf = mddev->private; | 224 | struct r1conf *conf = mddev->private; |
| 225 | int idx; | ||
| 226 | 226 | ||
| 227 | idx = sector_to_idx(r1_bio->sector); | ||
| 227 | spin_lock_irqsave(&conf->device_lock, flags); | 228 | spin_lock_irqsave(&conf->device_lock, flags); |
| 228 | list_add(&r1_bio->retry_list, &conf->retry_list); | 229 | list_add(&r1_bio->retry_list, &conf->retry_list); |
| 229 | conf->nr_queued ++; | 230 | atomic_inc(&conf->nr_queued[idx]); |
| 230 | spin_unlock_irqrestore(&conf->device_lock, flags); | 231 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 231 | 232 | ||
| 232 | wake_up(&conf->wait_barrier); | 233 | wake_up(&conf->wait_barrier); |
| @@ -243,7 +244,6 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
| 243 | struct bio *bio = r1_bio->master_bio; | 244 | struct bio *bio = r1_bio->master_bio; |
| 244 | int done; | 245 | int done; |
| 245 | struct r1conf *conf = r1_bio->mddev->private; | 246 | struct r1conf *conf = r1_bio->mddev->private; |
| 246 | sector_t start_next_window = r1_bio->start_next_window; | ||
| 247 | sector_t bi_sector = bio->bi_iter.bi_sector; | 247 | sector_t bi_sector = bio->bi_iter.bi_sector; |
| 248 | 248 | ||
| 249 | if (bio->bi_phys_segments) { | 249 | if (bio->bi_phys_segments) { |
| @@ -269,7 +269,7 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
| 269 | * Wake up any possible resync thread that waits for the device | 269 | * Wake up any possible resync thread that waits for the device |
| 270 | * to go idle. | 270 | * to go idle. |
| 271 | */ | 271 | */ |
| 272 | allow_barrier(conf, start_next_window, bi_sector); | 272 | allow_barrier(conf, bi_sector); |
| 273 | } | 273 | } |
| 274 | } | 274 | } |
| 275 | 275 | ||
| @@ -517,6 +517,25 @@ static void raid1_end_write_request(struct bio *bio) | |||
| 517 | bio_put(to_put); | 517 | bio_put(to_put); |
| 518 | } | 518 | } |
| 519 | 519 | ||
| 520 | static sector_t align_to_barrier_unit_end(sector_t start_sector, | ||
| 521 | sector_t sectors) | ||
| 522 | { | ||
| 523 | sector_t len; | ||
| 524 | |||
| 525 | WARN_ON(sectors == 0); | ||
| 526 | /* | ||
| 527 | * len is the number of sectors from start_sector to end of the | ||
| 528 | * barrier unit which start_sector belongs to. | ||
| 529 | */ | ||
| 530 | len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - | ||
| 531 | start_sector; | ||
| 532 | |||
| 533 | if (len > sectors) | ||
| 534 | len = sectors; | ||
| 535 | |||
| 536 | return len; | ||
| 537 | } | ||
| 538 | |||
| 520 | /* | 539 | /* |
| 521 | * This routine returns the disk from which the requested read should | 540 | * This routine returns the disk from which the requested read should |
| 522 | * be done. There is a per-array 'next expected sequential IO' sector | 541 | * be done. There is a per-array 'next expected sequential IO' sector |
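The new helpers above lean on BARRIER_UNIT_SECTOR_SIZE, BARRIER_BUCKETS_NR and sector_to_idx(), which come from the matching raid1.h change and are not visible in this file. A minimal userspace sketch of the intended arithmetic, with an assumed 64MB unit, an assumed bucket count, and a plain mask standing in for the kernel's hash:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* assumed values; the real definitions live in drivers/md/raid1.h */
#define BARRIER_UNIT_SECTOR_BITS  17                       /* 64MB in 512-byte sectors */
#define BARRIER_UNIT_SECTOR_SIZE  (1ULL << BARRIER_UNIT_SECTOR_BITS)
#define BARRIER_BUCKETS_NR        1024

/* round_up() as in the kernel, valid for power-of-two alignment */
#define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)

/* bucket index of a sector: the kernel hashes the unit number, a mask is enough here */
static int sector_to_idx(sector_t sector)
{
	return (sector >> BARRIER_UNIT_SECTOR_BITS) & (BARRIER_BUCKETS_NR - 1);
}

/* same logic as align_to_barrier_unit_end() in the hunk above */
static sector_t align_to_barrier_unit_end(sector_t start_sector, sector_t sectors)
{
	sector_t len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - start_sector;

	return len > sectors ? sectors : len;
}

int main(void)
{
	/* a 2048-sector request that starts 8 sectors before a unit boundary */
	sector_t start = BARRIER_UNIT_SECTOR_SIZE - 8;

	printf("bucket %d gets %llu sectors, the rest falls into bucket %d\n",
	       sector_to_idx(start),
	       (unsigned long long)align_to_barrier_unit_end(start, 2048),
	       sector_to_idx(start + 8));
	return 0;
}

Run as-is it reports that the request contributes only 8 sectors to bucket 0 before spilling into the next barrier unit, which is exactly the boundary align_to_barrier_unit_end() enforces for both regular and resync I/O.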
| @@ -813,168 +832,228 @@ static void flush_pending_writes(struct r1conf *conf) | |||
| 813 | */ | 832 | */ |
| 814 | static void raise_barrier(struct r1conf *conf, sector_t sector_nr) | 833 | static void raise_barrier(struct r1conf *conf, sector_t sector_nr) |
| 815 | { | 834 | { |
| 835 | int idx = sector_to_idx(sector_nr); | ||
| 836 | |||
| 816 | spin_lock_irq(&conf->resync_lock); | 837 | spin_lock_irq(&conf->resync_lock); |
| 817 | 838 | ||
| 818 | /* Wait until no block IO is waiting */ | 839 | /* Wait until no block IO is waiting */ |
| 819 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, | 840 | wait_event_lock_irq(conf->wait_barrier, |
| 841 | !atomic_read(&conf->nr_waiting[idx]), | ||
| 820 | conf->resync_lock); | 842 | conf->resync_lock); |
| 821 | 843 | ||
| 822 | /* block any new IO from starting */ | 844 | /* block any new IO from starting */ |
| 823 | conf->barrier++; | 845 | atomic_inc(&conf->barrier[idx]); |
| 824 | conf->next_resync = sector_nr; | 846 | /* |
| 847 | * In raise_barrier() we firstly increase conf->barrier[idx] then | ||
| 848 | * check conf->nr_pending[idx]. In _wait_barrier() we firstly | ||
| 849 | * increase conf->nr_pending[idx] then check conf->barrier[idx]. | ||
| 850 | * A memory barrier is needed here to make sure conf->nr_pending[idx] won't |
| 851 | * be fetched before conf->barrier[idx] is increased. Otherwise | ||
| 852 | * there will be a race between raise_barrier() and _wait_barrier(). | ||
| 853 | */ | ||
| 854 | smp_mb__after_atomic(); | ||
| 825 | 855 | ||
| 826 | /* For these conditions we must wait: | 856 | /* For these conditions we must wait: |
| 827 | * A: while the array is in frozen state | 857 | * A: while the array is in frozen state |
| 828 | * B: while barrier >= RESYNC_DEPTH, meaning resync reach | 858 | * B: while conf->nr_pending[idx] is not 0, meaning regular I/O |
| 829 | * the max count which allowed. | 859 | * existing in corresponding I/O barrier bucket. |
| 830 | * C: next_resync + RESYNC_SECTORS > start_next_window, meaning | 860 | * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches |
| 831 | * next resync will reach to the window which normal bios are | 861 | * max resync count which allowed on current I/O barrier bucket. |
| 832 | * handling. | ||
| 833 | * D: while there are any active requests in the current window. | ||
| 834 | */ | 862 | */ |
| 835 | wait_event_lock_irq(conf->wait_barrier, | 863 | wait_event_lock_irq(conf->wait_barrier, |
| 836 | !conf->array_frozen && | 864 | !conf->array_frozen && |
| 837 | conf->barrier < RESYNC_DEPTH && | 865 | !atomic_read(&conf->nr_pending[idx]) && |
| 838 | conf->current_window_requests == 0 && | 866 | atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH, |
| 839 | (conf->start_next_window >= | ||
| 840 | conf->next_resync + RESYNC_SECTORS), | ||
| 841 | conf->resync_lock); | 867 | conf->resync_lock); |
| 842 | 868 | ||
| 843 | conf->nr_pending++; | 869 | atomic_inc(&conf->nr_pending[idx]); |
| 844 | spin_unlock_irq(&conf->resync_lock); | 870 | spin_unlock_irq(&conf->resync_lock); |
| 845 | } | 871 | } |
| 846 | 872 | ||
| 847 | static void lower_barrier(struct r1conf *conf) | 873 | static void lower_barrier(struct r1conf *conf, sector_t sector_nr) |
| 848 | { | 874 | { |
| 849 | unsigned long flags; | 875 | int idx = sector_to_idx(sector_nr); |
| 850 | BUG_ON(conf->barrier <= 0); | 876 | |
| 851 | spin_lock_irqsave(&conf->resync_lock, flags); | 877 | BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); |
| 852 | conf->barrier--; | 878 | |
| 853 | conf->nr_pending--; | 879 | atomic_dec(&conf->barrier[idx]); |
| 854 | spin_unlock_irqrestore(&conf->resync_lock, flags); | 880 | atomic_dec(&conf->nr_pending[idx]); |
| 855 | wake_up(&conf->wait_barrier); | 881 | wake_up(&conf->wait_barrier); |
| 856 | } | 882 | } |
| 857 | 883 | ||
| 858 | static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) | 884 | static void _wait_barrier(struct r1conf *conf, int idx) |
| 859 | { | 885 | { |
| 860 | bool wait = false; | 886 | /* |
| 887 | * We need to increase conf->nr_pending[idx] very early here, | ||
| 888 | * then raise_barrier() can be blocked when it waits for | ||
| 889 | * conf->nr_pending[idx] to be 0. Then we can avoid holding | ||
| 890 | * conf->resync_lock when there is no barrier raised in same | ||
| 891 | * barrier unit bucket. Also if the array is frozen, I/O | ||
| 892 | * should be blocked until array is unfrozen. | ||
| 893 | */ | ||
| 894 | atomic_inc(&conf->nr_pending[idx]); | ||
| 895 | /* | ||
| 896 | * In _wait_barrier() we firstly increase conf->nr_pending[idx], then | ||
| 897 | * check conf->barrier[idx]. In raise_barrier() we firstly increase | ||
| 898 | * conf->barrier[idx], then check conf->nr_pending[idx]. A memory | ||
| 899 | * barrier is necessary here to make sure conf->barrier[idx] won't be | ||
| 900 | * fetched before conf->nr_pending[idx] is increased. Otherwise there | ||
| 901 | * will be a race between _wait_barrier() and raise_barrier(). | ||
| 902 | */ | ||
| 903 | smp_mb__after_atomic(); | ||
| 861 | 904 | ||
| 862 | if (conf->array_frozen || !bio) | 905 | /* |
| 863 | wait = true; | 906 | * Don't worry about checking two atomic_t variables at same time |
| 864 | else if (conf->barrier && bio_data_dir(bio) == WRITE) { | 907 | * here. If, while we check conf->barrier[idx], the array is |
| 865 | if ((conf->mddev->curr_resync_completed | 908 | * frozen (conf->array_frozen is 1), and conf->barrier[idx] is |
| 866 | >= bio_end_sector(bio)) || | 909 | * 0, it is safe to return and make the I/O continue. Because the |
| 867 | (conf->start_next_window + NEXT_NORMALIO_DISTANCE | 910 | * array is frozen, all I/O returned here will eventually complete |
| 868 | <= bio->bi_iter.bi_sector)) | 911 | * or be queued, no race will happen. See code comment in |
| 869 | wait = false; | 912 | * freeze_array(). |
| 870 | else | 913 | */ |
| 871 | wait = true; | 914 | if (!READ_ONCE(conf->array_frozen) && |
| 872 | } | 915 | !atomic_read(&conf->barrier[idx])) |
| 916 | return; | ||
| 873 | 917 | ||
| 874 | return wait; | 918 | /* |
| 919 | * After holding conf->resync_lock, conf->nr_pending[idx] | ||
| 920 | * should be decreased before waiting for barrier to drop. | ||
| 921 | * Otherwise, we may encounter a race condition because | ||
| 922 | * raise_barrer() might be waiting for conf->nr_pending[idx] | ||
| 923 | * to be 0 at same time. | ||
| 924 | */ | ||
| 925 | spin_lock_irq(&conf->resync_lock); | ||
| 926 | atomic_inc(&conf->nr_waiting[idx]); | ||
| 927 | atomic_dec(&conf->nr_pending[idx]); | ||
| 928 | /* | ||
| 929 | * In case freeze_array() is waiting for | ||
| 930 | * get_unqueued_pending() == extra | ||
| 931 | */ | ||
| 932 | wake_up(&conf->wait_barrier); | ||
| 933 | /* Wait for the barrier in same barrier unit bucket to drop. */ | ||
| 934 | wait_event_lock_irq(conf->wait_barrier, | ||
| 935 | !conf->array_frozen && | ||
| 936 | !atomic_read(&conf->barrier[idx]), | ||
| 937 | conf->resync_lock); | ||
| 938 | atomic_inc(&conf->nr_pending[idx]); | ||
| 939 | atomic_dec(&conf->nr_waiting[idx]); | ||
| 940 | spin_unlock_irq(&conf->resync_lock); | ||
| 875 | } | 941 | } |
| 876 | 942 | ||
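A userspace sketch of the ordering contract spelled out in the raise_barrier() and _wait_barrier() comments above. C11 atomics and an explicit fence stand in for atomic_inc() plus smp_mb__after_atomic(); this only illustrates the store-then-load pairing, it is not the driver code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int barrier_cnt;		/* plays the role of conf->barrier[idx] */
static atomic_int nr_pending_cnt;	/* plays the role of conf->nr_pending[idx] */

/* resync side: publish the barrier first, then look for in-flight normal I/O */
static bool resync_sees_idle_bucket(void)
{
	atomic_fetch_add_explicit(&barrier_cnt, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic() */
	return atomic_load_explicit(&nr_pending_cnt, memory_order_relaxed) == 0;
}

/* normal I/O side: publish the request first, then look for a raised barrier */
static bool io_sees_no_barrier(void)
{
	atomic_fetch_add_explicit(&nr_pending_cnt, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic() */
	return atomic_load_explicit(&barrier_cnt, memory_order_relaxed) == 0;
}

int main(void)
{
	/*
	 * Because both sides increment their own counter before loading the
	 * other's, at least one of them always observes the other's increment,
	 * so a resync and a normal I/O can never both slip past each other.
	 */
	printf("io may proceed: %d\n", io_sees_no_barrier());
	printf("resync may proceed: %d\n", resync_sees_idle_bucket());
	return 0;
}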
| 877 | static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) | 943 | static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) |
| 878 | { | 944 | { |
| 879 | sector_t sector = 0; | 945 | int idx = sector_to_idx(sector_nr); |
| 880 | 946 | ||
| 881 | spin_lock_irq(&conf->resync_lock); | 947 | /* |
| 882 | if (need_to_wait_for_sync(conf, bio)) { | 948 | * Very similar to _wait_barrier(). The difference is, for read |
| 883 | conf->nr_waiting++; | 949 | * I/O we don't need to wait for sync I/O, but if the whole array |
| 884 | /* Wait for the barrier to drop. | 950 | * is frozen, the read I/O still has to wait until the array is |
| 885 | * However if there are already pending | 951 | * unfrozen. Since there is no ordering requirement with |
| 886 | * requests (preventing the barrier from | 952 | * conf->barrier[idx] here, memory barrier is unnecessary as well. |
| 887 | * rising completely), and the | 953 | */ |
| 888 | * per-process bio queue isn't empty, | 954 | atomic_inc(&conf->nr_pending[idx]); |
| 889 | * then don't wait, as we need to empty | ||
| 890 | * that queue to allow conf->start_next_window | ||
| 891 | * to increase. | ||
| 892 | */ | ||
| 893 | raid1_log(conf->mddev, "wait barrier"); | ||
| 894 | wait_event_lock_irq(conf->wait_barrier, | ||
| 895 | !conf->array_frozen && | ||
| 896 | (!conf->barrier || | ||
| 897 | ((conf->start_next_window < | ||
| 898 | conf->next_resync + RESYNC_SECTORS) && | ||
| 899 | current->bio_list && | ||
| 900 | !bio_list_empty(current->bio_list))), | ||
| 901 | conf->resync_lock); | ||
| 902 | conf->nr_waiting--; | ||
| 903 | } | ||
| 904 | |||
| 905 | if (bio && bio_data_dir(bio) == WRITE) { | ||
| 906 | if (bio->bi_iter.bi_sector >= conf->next_resync) { | ||
| 907 | if (conf->start_next_window == MaxSector) | ||
| 908 | conf->start_next_window = | ||
| 909 | conf->next_resync + | ||
| 910 | NEXT_NORMALIO_DISTANCE; | ||
| 911 | |||
| 912 | if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) | ||
| 913 | <= bio->bi_iter.bi_sector) | ||
| 914 | conf->next_window_requests++; | ||
| 915 | else | ||
| 916 | conf->current_window_requests++; | ||
| 917 | sector = conf->start_next_window; | ||
| 918 | } | ||
| 919 | } | ||
| 920 | 955 | ||
| 921 | conf->nr_pending++; | 956 | if (!READ_ONCE(conf->array_frozen)) |
| 957 | return; | ||
| 958 | |||
| 959 | spin_lock_irq(&conf->resync_lock); | ||
| 960 | atomic_inc(&conf->nr_waiting[idx]); | ||
| 961 | atomic_dec(&conf->nr_pending[idx]); | ||
| 962 | /* | ||
| 963 | * In case freeze_array() is waiting for | ||
| 964 | * get_unqueued_pending() == extra | ||
| 965 | */ | ||
| 966 | wake_up(&conf->wait_barrier); | ||
| 967 | /* Wait for array to be unfrozen */ | ||
| 968 | wait_event_lock_irq(conf->wait_barrier, | ||
| 969 | !conf->array_frozen, | ||
| 970 | conf->resync_lock); | ||
| 971 | atomic_inc(&conf->nr_pending[idx]); | ||
| 972 | atomic_dec(&conf->nr_waiting[idx]); | ||
| 922 | spin_unlock_irq(&conf->resync_lock); | 973 | spin_unlock_irq(&conf->resync_lock); |
| 923 | return sector; | ||
| 924 | } | 974 | } |
| 925 | 975 | ||
| 926 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | 976 | static void wait_barrier(struct r1conf *conf, sector_t sector_nr) |
| 927 | sector_t bi_sector) | ||
| 928 | { | 977 | { |
| 929 | unsigned long flags; | 978 | int idx = sector_to_idx(sector_nr); |
| 930 | 979 | ||
| 931 | spin_lock_irqsave(&conf->resync_lock, flags); | 980 | _wait_barrier(conf, idx); |
| 932 | conf->nr_pending--; | 981 | } |
| 933 | if (start_next_window) { | 982 | |
| 934 | if (start_next_window == conf->start_next_window) { | 983 | static void wait_all_barriers(struct r1conf *conf) |
| 935 | if (conf->start_next_window + NEXT_NORMALIO_DISTANCE | 984 | { |
| 936 | <= bi_sector) | 985 | int idx; |
| 937 | conf->next_window_requests--; | 986 | |
| 938 | else | 987 | for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) |
| 939 | conf->current_window_requests--; | 988 | _wait_barrier(conf, idx); |
| 940 | } else | 989 | } |
| 941 | conf->current_window_requests--; | 990 | |
| 942 | 991 | static void _allow_barrier(struct r1conf *conf, int idx) | |
| 943 | if (!conf->current_window_requests) { | 992 | { |
| 944 | if (conf->next_window_requests) { | 993 | atomic_dec(&conf->nr_pending[idx]); |
| 945 | conf->current_window_requests = | ||
| 946 | conf->next_window_requests; | ||
| 947 | conf->next_window_requests = 0; | ||
| 948 | conf->start_next_window += | ||
| 949 | NEXT_NORMALIO_DISTANCE; | ||
| 950 | } else | ||
| 951 | conf->start_next_window = MaxSector; | ||
| 952 | } | ||
| 953 | } | ||
| 954 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
| 955 | wake_up(&conf->wait_barrier); | 994 | wake_up(&conf->wait_barrier); |
| 956 | } | 995 | } |
| 957 | 996 | ||
| 997 | static void allow_barrier(struct r1conf *conf, sector_t sector_nr) | ||
| 998 | { | ||
| 999 | int idx = sector_to_idx(sector_nr); | ||
| 1000 | |||
| 1001 | _allow_barrier(conf, idx); | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static void allow_all_barriers(struct r1conf *conf) | ||
| 1005 | { | ||
| 1006 | int idx; | ||
| 1007 | |||
| 1008 | for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) | ||
| 1009 | _allow_barrier(conf, idx); | ||
| 1010 | } | ||
| 1011 | |||
| 1012 | /* conf->resync_lock should be held */ | ||
| 1013 | static int get_unqueued_pending(struct r1conf *conf) | ||
| 1014 | { | ||
| 1015 | int idx, ret; | ||
| 1016 | |||
| 1017 | for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) | ||
| 1018 | ret += atomic_read(&conf->nr_pending[idx]) - | ||
| 1019 | atomic_read(&conf->nr_queued[idx]); | ||
| 1020 | |||
| 1021 | return ret; | ||
| 1022 | } | ||
| 1023 | |||
| 958 | static void freeze_array(struct r1conf *conf, int extra) | 1024 | static void freeze_array(struct r1conf *conf, int extra) |
| 959 | { | 1025 | { |
| 960 | /* stop syncio and normal IO and wait for everything to | 1026 | /* Stop sync I/O and normal I/O and wait for everything to |
| 961 | * go quiet. | 1027 | * go quiet. |
| 962 | * We wait until nr_pending match nr_queued+extra | 1028 | * This is called in two situations: |
| 963 | * This is called in the context of one normal IO request | 1029 | * 1) management command handlers (reshape, remove disk, quiesce). |
| 964 | * that has failed. Thus any sync request that might be pending | 1030 | * 2) one normal I/O request failed. |
| 965 | * will be blocked by nr_pending, and we need to wait for | 1031 | |
| 966 | * pending IO requests to complete or be queued for re-try. | 1032 | * After array_frozen is set to 1, new sync IO will be blocked at |
| 967 | * Thus the number queued (nr_queued) plus this request (extra) | 1033 | * raise_barrier(), and new normal I/O will be blocked at _wait_barrier() |
| 968 | * must match the number of pending IOs (nr_pending) before | 1034 | * or wait_read_barrier(). The flying I/Os will either complete or be |
| 969 | * we continue. | 1035 | * queued. When everything goes quiet, there are only queued I/Os left. |
| 1036 | |||
| 1037 | * Every flying I/O contributes to a conf->nr_pending[idx], idx is the | ||
| 1038 | * barrier bucket index which this I/O request hits. When all sync and | ||
| 1039 | * normal I/O are queued, sum of all conf->nr_pending[] will match sum | ||
| 1040 | * of all conf->nr_queued[]. But normal I/O failure is an exception, | ||
| 1041 | * in handle_read_error(), we may call freeze_array() before trying to | ||
| 1042 | * fix the read error. In this case, the error read I/O is not queued, | ||
| 1043 | * so get_unqueued_pending() == 1. | ||
| 1044 | * | ||
| 1045 | * Therefore before this function returns, we need to wait until | ||
| 1046 | * get_unqueued_pending(conf) gets equal to extra. For |
| 1047 | * normal I/O context, extra is 1, in other situations extra is 0. |
| 970 | */ | 1048 | */ |
| 971 | spin_lock_irq(&conf->resync_lock); | 1049 | spin_lock_irq(&conf->resync_lock); |
| 972 | conf->array_frozen = 1; | 1050 | conf->array_frozen = 1; |
| 973 | raid1_log(conf->mddev, "wait freeze"); | 1051 | raid1_log(conf->mddev, "wait freeze"); |
| 974 | wait_event_lock_irq_cmd(conf->wait_barrier, | 1052 | wait_event_lock_irq_cmd( |
| 975 | conf->nr_pending == conf->nr_queued+extra, | 1053 | conf->wait_barrier, |
| 976 | conf->resync_lock, | 1054 | get_unqueued_pending(conf) == extra, |
| 977 | flush_pending_writes(conf)); | 1055 | conf->resync_lock, |
| 1056 | flush_pending_writes(conf)); | ||
| 978 | spin_unlock_irq(&conf->resync_lock); | 1057 | spin_unlock_irq(&conf->resync_lock); |
| 979 | } | 1058 | } |
| 980 | static void unfreeze_array(struct r1conf *conf) | 1059 | static void unfreeze_array(struct r1conf *conf) |
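A toy model of the condition freeze_array() now waits on. The bucket count and the single-failed-read scenario are invented purely to show why extra is 1 in the read-error path and 0 everywhere else:

#include <stdio.h>

#define BUCKETS 4	/* tiny bucket count, only for this model */

static int nr_pending[BUCKETS];
static int nr_queued[BUCKETS];

/* same sum that get_unqueued_pending() computes in the hunk above */
static int get_unqueued_pending(void)
{
	int idx, ret = 0;

	for (idx = 0; idx < BUCKETS; idx++)
		ret += nr_pending[idx] - nr_queued[idx];
	return ret;
}

int main(void)
{
	/* a normal read in bucket 2 has failed: still pending, not yet queued */
	nr_pending[2] = 1;
	printf("unqueued pending = %d -> freeze_array(conf, 1) can finish\n",
	       get_unqueued_pending());

	/* once that r1_bio is parked on a retry list, nr_queued catches up */
	nr_queued[2] = 1;
	printf("unqueued pending = %d -> freeze_array(conf, 0) can finish\n",
	       get_unqueued_pending());
	return 0;
}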
| @@ -982,8 +1061,8 @@ static void unfreeze_array(struct r1conf *conf) | |||
| 982 | /* reverse the effect of the freeze */ | 1061 | /* reverse the effect of the freeze */ |
| 983 | spin_lock_irq(&conf->resync_lock); | 1062 | spin_lock_irq(&conf->resync_lock); |
| 984 | conf->array_frozen = 0; | 1063 | conf->array_frozen = 0; |
| 985 | wake_up(&conf->wait_barrier); | ||
| 986 | spin_unlock_irq(&conf->resync_lock); | 1064 | spin_unlock_irq(&conf->resync_lock); |
| 1065 | wake_up(&conf->wait_barrier); | ||
| 987 | } | 1066 | } |
| 988 | 1067 | ||
| 989 | /* duplicate the data pages for behind I/O | 1068 | /* duplicate the data pages for behind I/O |
| @@ -1070,11 +1149,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
| 1070 | kfree(plug); | 1149 | kfree(plug); |
| 1071 | } | 1150 | } |
| 1072 | 1151 | ||
| 1073 | static void raid1_read_request(struct mddev *mddev, struct bio *bio, | 1152 | static inline struct r1bio * |
| 1074 | struct r1bio *r1_bio) | 1153 | alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) |
| 1154 | { | ||
| 1155 | struct r1conf *conf = mddev->private; | ||
| 1156 | struct r1bio *r1_bio; | ||
| 1157 | |||
| 1158 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
| 1159 | |||
| 1160 | r1_bio->master_bio = bio; | ||
| 1161 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
| 1162 | r1_bio->state = 0; | ||
| 1163 | r1_bio->mddev = mddev; | ||
| 1164 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
| 1165 | |||
| 1166 | return r1_bio; | ||
| 1167 | } | ||
| 1168 | |||
| 1169 | static void raid1_read_request(struct mddev *mddev, struct bio *bio) | ||
| 1075 | { | 1170 | { |
| 1076 | struct r1conf *conf = mddev->private; | 1171 | struct r1conf *conf = mddev->private; |
| 1077 | struct raid1_info *mirror; | 1172 | struct raid1_info *mirror; |
| 1173 | struct r1bio *r1_bio; | ||
| 1078 | struct bio *read_bio; | 1174 | struct bio *read_bio; |
| 1079 | struct bitmap *bitmap = mddev->bitmap; | 1175 | struct bitmap *bitmap = mddev->bitmap; |
| 1080 | const int op = bio_op(bio); | 1176 | const int op = bio_op(bio); |
| @@ -1083,8 +1179,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, | |||
| 1083 | int max_sectors; | 1179 | int max_sectors; |
| 1084 | int rdisk; | 1180 | int rdisk; |
| 1085 | 1181 | ||
| 1086 | wait_barrier(conf, bio); | 1182 | /* |
| 1183 | * Still need barrier for READ in case that whole | ||
| 1184 | * array is frozen. | ||
| 1185 | */ | ||
| 1186 | wait_read_barrier(conf, bio->bi_iter.bi_sector); | ||
| 1187 | |||
| 1188 | r1_bio = alloc_r1bio(mddev, bio, 0); | ||
| 1087 | 1189 | ||
| 1190 | /* | ||
| 1191 | * We might need to issue multiple reads to different | ||
| 1192 | * devices if there are bad blocks around, so we keep | ||
| 1193 | * track of the number of reads in bio->bi_phys_segments. | ||
| 1194 | * If this is 0, there is only one r1_bio and no locking | ||
| 1195 | * will be needed when requests complete. If it is | ||
| 1196 | * non-zero, then it is the number of not-completed requests. | ||
| 1197 | */ | ||
| 1198 | bio->bi_phys_segments = 0; | ||
| 1199 | bio_clear_flag(bio, BIO_SEG_VALID); | ||
| 1200 | |||
| 1201 | /* | ||
| 1202 | * make_request() can abort the operation when read-ahead is being | ||
| 1203 | * used and no empty request is available. | ||
| 1204 | */ | ||
| 1088 | read_again: | 1205 | read_again: |
| 1089 | rdisk = read_balance(conf, r1_bio, &max_sectors); | 1206 | rdisk = read_balance(conf, r1_bio, &max_sectors); |
| 1090 | 1207 | ||
| @@ -1106,9 +1223,8 @@ read_again: | |||
| 1106 | atomic_read(&bitmap->behind_writes) == 0); | 1223 | atomic_read(&bitmap->behind_writes) == 0); |
| 1107 | } | 1224 | } |
| 1108 | r1_bio->read_disk = rdisk; | 1225 | r1_bio->read_disk = rdisk; |
| 1109 | r1_bio->start_next_window = 0; | ||
| 1110 | 1226 | ||
| 1111 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1227 | read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); |
| 1112 | bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, | 1228 | bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, |
| 1113 | max_sectors); | 1229 | max_sectors); |
| 1114 | 1230 | ||
| @@ -1151,22 +1267,16 @@ read_again: | |||
| 1151 | */ | 1267 | */ |
| 1152 | reschedule_retry(r1_bio); | 1268 | reschedule_retry(r1_bio); |
| 1153 | 1269 | ||
| 1154 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 1270 | r1_bio = alloc_r1bio(mddev, bio, sectors_handled); |
| 1155 | |||
| 1156 | r1_bio->master_bio = bio; | ||
| 1157 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
| 1158 | r1_bio->state = 0; | ||
| 1159 | r1_bio->mddev = mddev; | ||
| 1160 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
| 1161 | goto read_again; | 1271 | goto read_again; |
| 1162 | } else | 1272 | } else |
| 1163 | generic_make_request(read_bio); | 1273 | generic_make_request(read_bio); |
| 1164 | } | 1274 | } |
| 1165 | 1275 | ||
| 1166 | static void raid1_write_request(struct mddev *mddev, struct bio *bio, | 1276 | static void raid1_write_request(struct mddev *mddev, struct bio *bio) |
| 1167 | struct r1bio *r1_bio) | ||
| 1168 | { | 1277 | { |
| 1169 | struct r1conf *conf = mddev->private; | 1278 | struct r1conf *conf = mddev->private; |
| 1279 | struct r1bio *r1_bio; | ||
| 1170 | int i, disks; | 1280 | int i, disks; |
| 1171 | struct bitmap *bitmap = mddev->bitmap; | 1281 | struct bitmap *bitmap = mddev->bitmap; |
| 1172 | unsigned long flags; | 1282 | unsigned long flags; |
| @@ -1176,7 +1286,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1176 | int first_clone; | 1286 | int first_clone; |
| 1177 | int sectors_handled; | 1287 | int sectors_handled; |
| 1178 | int max_sectors; | 1288 | int max_sectors; |
| 1179 | sector_t start_next_window; | ||
| 1180 | 1289 | ||
| 1181 | /* | 1290 | /* |
| 1182 | * Register the new request and wait if the reconstruction | 1291 | * Register the new request and wait if the reconstruction |
| @@ -1212,7 +1321,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1212 | } | 1321 | } |
| 1213 | finish_wait(&conf->wait_barrier, &w); | 1322 | finish_wait(&conf->wait_barrier, &w); |
| 1214 | } | 1323 | } |
| 1215 | start_next_window = wait_barrier(conf, bio); | 1324 | wait_barrier(conf, bio->bi_iter.bi_sector); |
| 1325 | |||
| 1326 | r1_bio = alloc_r1bio(mddev, bio, 0); | ||
| 1327 | |||
| 1328 | /* We might need to issue multiple writes to different | ||
| 1329 | * devices if there are bad blocks around, so we keep | ||
| 1330 | * track of the number of writes in bio->bi_phys_segments. | ||
| 1331 | * If this is 0, there is only one r1_bio and no locking | ||
| 1332 | * will be needed when requests complete. If it is | ||
| 1333 | * non-zero, then it is the number of not-completed requests. | ||
| 1334 | */ | ||
| 1335 | bio->bi_phys_segments = 0; | ||
| 1336 | bio_clear_flag(bio, BIO_SEG_VALID); | ||
| 1216 | 1337 | ||
| 1217 | if (conf->pending_count >= max_queued_requests) { | 1338 | if (conf->pending_count >= max_queued_requests) { |
| 1218 | md_wakeup_thread(mddev->thread); | 1339 | md_wakeup_thread(mddev->thread); |
| @@ -1233,7 +1354,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1233 | 1354 | ||
| 1234 | disks = conf->raid_disks * 2; | 1355 | disks = conf->raid_disks * 2; |
| 1235 | retry_write: | 1356 | retry_write: |
| 1236 | r1_bio->start_next_window = start_next_window; | ||
| 1237 | blocked_rdev = NULL; | 1357 | blocked_rdev = NULL; |
| 1238 | rcu_read_lock(); | 1358 | rcu_read_lock(); |
| 1239 | max_sectors = r1_bio->sectors; | 1359 | max_sectors = r1_bio->sectors; |
| @@ -1300,25 +1420,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1300 | if (unlikely(blocked_rdev)) { | 1420 | if (unlikely(blocked_rdev)) { |
| 1301 | /* Wait for this device to become unblocked */ | 1421 | /* Wait for this device to become unblocked */ |
| 1302 | int j; | 1422 | int j; |
| 1303 | sector_t old = start_next_window; | ||
| 1304 | 1423 | ||
| 1305 | for (j = 0; j < i; j++) | 1424 | for (j = 0; j < i; j++) |
| 1306 | if (r1_bio->bios[j]) | 1425 | if (r1_bio->bios[j]) |
| 1307 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1426 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
| 1308 | r1_bio->state = 0; | 1427 | r1_bio->state = 0; |
| 1309 | allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); | 1428 | allow_barrier(conf, bio->bi_iter.bi_sector); |
| 1310 | raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); | 1429 | raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); |
| 1311 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1430 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
| 1312 | start_next_window = wait_barrier(conf, bio); | 1431 | wait_barrier(conf, bio->bi_iter.bi_sector); |
| 1313 | /* | ||
| 1314 | * We must make sure the multi r1bios of bio have | ||
| 1315 | * the same value of bi_phys_segments | ||
| 1316 | */ | ||
| 1317 | if (bio->bi_phys_segments && old && | ||
| 1318 | old != start_next_window) | ||
| 1319 | /* Wait for the former r1bio(s) to complete */ | ||
| 1320 | wait_event(conf->wait_barrier, | ||
| 1321 | bio->bi_phys_segments == 1); | ||
| 1322 | goto retry_write; | 1432 | goto retry_write; |
| 1323 | } | 1433 | } |
| 1324 | 1434 | ||
| @@ -1341,13 +1451,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1341 | 1451 | ||
| 1342 | first_clone = 1; | 1452 | first_clone = 1; |
| 1343 | for (i = 0; i < disks; i++) { | 1453 | for (i = 0; i < disks; i++) { |
| 1344 | struct bio *mbio; | 1454 | struct bio *mbio = NULL; |
| 1455 | sector_t offset; | ||
| 1345 | if (!r1_bio->bios[i]) | 1456 | if (!r1_bio->bios[i]) |
| 1346 | continue; | 1457 | continue; |
| 1347 | 1458 | ||
| 1348 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1459 | offset = r1_bio->sector - bio->bi_iter.bi_sector; |
| 1349 | bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, | ||
| 1350 | max_sectors); | ||
| 1351 | 1460 | ||
| 1352 | if (first_clone) { | 1461 | if (first_clone) { |
| 1353 | /* do behind I/O ? | 1462 | /* do behind I/O ? |
| @@ -1357,8 +1466,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1357 | if (bitmap && | 1466 | if (bitmap && |
| 1358 | (atomic_read(&bitmap->behind_writes) | 1467 | (atomic_read(&bitmap->behind_writes) |
| 1359 | < mddev->bitmap_info.max_write_behind) && | 1468 | < mddev->bitmap_info.max_write_behind) && |
| 1360 | !waitqueue_active(&bitmap->behind_wait)) | 1469 | !waitqueue_active(&bitmap->behind_wait)) { |
| 1470 | mbio = bio_clone_bioset_partial(bio, GFP_NOIO, | ||
| 1471 | mddev->bio_set, | ||
| 1472 | offset << 9, | ||
| 1473 | max_sectors << 9); | ||
| 1361 | alloc_behind_pages(mbio, r1_bio); | 1474 | alloc_behind_pages(mbio, r1_bio); |
| 1475 | } | ||
| 1362 | 1476 | ||
| 1363 | bitmap_startwrite(bitmap, r1_bio->sector, | 1477 | bitmap_startwrite(bitmap, r1_bio->sector, |
| 1364 | r1_bio->sectors, | 1478 | r1_bio->sectors, |
| @@ -1366,6 +1480,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1366 | &r1_bio->state)); | 1480 | &r1_bio->state)); |
| 1367 | first_clone = 0; | 1481 | first_clone = 0; |
| 1368 | } | 1482 | } |
| 1483 | |||
| 1484 | if (!mbio) { | ||
| 1485 | if (r1_bio->behind_bvecs) | ||
| 1486 | mbio = bio_clone_bioset_partial(bio, GFP_NOIO, | ||
| 1487 | mddev->bio_set, | ||
| 1488 | offset << 9, | ||
| 1489 | max_sectors << 9); | ||
| 1490 | else { | ||
| 1491 | mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); | ||
| 1492 | bio_trim(mbio, offset, max_sectors); | ||
| 1493 | } | ||
| 1494 | } | ||
| 1495 | |||
| 1369 | if (r1_bio->behind_bvecs) { | 1496 | if (r1_bio->behind_bvecs) { |
| 1370 | struct bio_vec *bvec; | 1497 | struct bio_vec *bvec; |
| 1371 | int j; | 1498 | int j; |
| @@ -1385,8 +1512,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1385 | conf->mirrors[i].rdev->data_offset); | 1512 | conf->mirrors[i].rdev->data_offset); |
| 1386 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1513 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
| 1387 | mbio->bi_end_io = raid1_end_write_request; | 1514 | mbio->bi_end_io = raid1_end_write_request; |
| 1388 | mbio->bi_opf = bio_op(bio) | | 1515 | mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); |
| 1389 | (bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA)); | ||
| 1390 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && | 1516 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && |
| 1391 | !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && | 1517 | !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && |
| 1392 | conf->raid_disks - mddev->degraded > 1) | 1518 | conf->raid_disks - mddev->degraded > 1) |
| @@ -1427,12 +1553,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1427 | /* We need another r1_bio. It has already been counted | 1553 | /* We need another r1_bio. It has already been counted |
| 1428 | * in bio->bi_phys_segments | 1554 | * in bio->bi_phys_segments |
| 1429 | */ | 1555 | */ |
| 1430 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 1556 | r1_bio = alloc_r1bio(mddev, bio, sectors_handled); |
| 1431 | r1_bio->master_bio = bio; | ||
| 1432 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
| 1433 | r1_bio->state = 0; | ||
| 1434 | r1_bio->mddev = mddev; | ||
| 1435 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
| 1436 | goto retry_write; | 1557 | goto retry_write; |
| 1437 | } | 1558 | } |
| 1438 | 1559 | ||
| @@ -1444,36 +1565,30 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1444 | 1565 | ||
| 1445 | static void raid1_make_request(struct mddev *mddev, struct bio *bio) | 1566 | static void raid1_make_request(struct mddev *mddev, struct bio *bio) |
| 1446 | { | 1567 | { |
| 1447 | struct r1conf *conf = mddev->private; | 1568 | struct bio *split; |
| 1448 | struct r1bio *r1_bio; | 1569 | sector_t sectors; |
| 1449 | 1570 | ||
| 1450 | /* | 1571 | if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { |
| 1451 | * make_request() can abort the operation when read-ahead is being | 1572 | md_flush_request(mddev, bio); |
| 1452 | * used and no empty request is available. | 1573 | return; |
| 1453 | * | 1574 | } |
| 1454 | */ | ||
| 1455 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
| 1456 | |||
| 1457 | r1_bio->master_bio = bio; | ||
| 1458 | r1_bio->sectors = bio_sectors(bio); | ||
| 1459 | r1_bio->state = 0; | ||
| 1460 | r1_bio->mddev = mddev; | ||
| 1461 | r1_bio->sector = bio->bi_iter.bi_sector; | ||
| 1462 | 1575 | ||
| 1463 | /* | 1576 | /* if bio exceeds barrier unit boundary, split it */ |
| 1464 | * We might need to issue multiple reads to different devices if there | 1577 | do { |
| 1465 | * are bad blocks around, so we keep track of the number of reads in | 1578 | sectors = align_to_barrier_unit_end( |
| 1466 | * bio->bi_phys_segments. If this is 0, there is only one r1_bio and | 1579 | bio->bi_iter.bi_sector, bio_sectors(bio)); |
| 1467 | * no locking will be needed when requests complete. If it is | 1580 | if (sectors < bio_sectors(bio)) { |
| 1468 | * non-zero, then it is the number of not-completed requests. | 1581 | split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); |
| 1469 | */ | 1582 | bio_chain(split, bio); |
| 1470 | bio->bi_phys_segments = 0; | 1583 | } else { |
| 1471 | bio_clear_flag(bio, BIO_SEG_VALID); | 1584 | split = bio; |
| 1585 | } | ||
| 1472 | 1586 | ||
| 1473 | if (bio_data_dir(bio) == READ) | 1587 | if (bio_data_dir(split) == READ) |
| 1474 | raid1_read_request(mddev, bio, r1_bio); | 1588 | raid1_read_request(mddev, split); |
| 1475 | else | 1589 | else |
| 1476 | raid1_write_request(mddev, bio, r1_bio); | 1590 | raid1_write_request(mddev, split); |
| 1591 | } while (split != bio); | ||
| 1477 | } | 1592 | } |
| 1478 | 1593 | ||
| 1479 | static void raid1_status(struct seq_file *seq, struct mddev *mddev) | 1594 | static void raid1_status(struct seq_file *seq, struct mddev *mddev) |
| @@ -1564,19 +1679,11 @@ static void print_conf(struct r1conf *conf) | |||
| 1564 | 1679 | ||
| 1565 | static void close_sync(struct r1conf *conf) | 1680 | static void close_sync(struct r1conf *conf) |
| 1566 | { | 1681 | { |
| 1567 | wait_barrier(conf, NULL); | 1682 | wait_all_barriers(conf); |
| 1568 | allow_barrier(conf, 0, 0); | 1683 | allow_all_barriers(conf); |
| 1569 | 1684 | ||
| 1570 | mempool_destroy(conf->r1buf_pool); | 1685 | mempool_destroy(conf->r1buf_pool); |
| 1571 | conf->r1buf_pool = NULL; | 1686 | conf->r1buf_pool = NULL; |
| 1572 | |||
| 1573 | spin_lock_irq(&conf->resync_lock); | ||
| 1574 | conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE; | ||
| 1575 | conf->start_next_window = MaxSector; | ||
| 1576 | conf->current_window_requests += | ||
| 1577 | conf->next_window_requests; | ||
| 1578 | conf->next_window_requests = 0; | ||
| 1579 | spin_unlock_irq(&conf->resync_lock); | ||
| 1580 | } | 1687 | } |
| 1581 | 1688 | ||
| 1582 | static int raid1_spare_active(struct mddev *mddev) | 1689 | static int raid1_spare_active(struct mddev *mddev) |
| @@ -2273,7 +2380,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) | |||
| 2273 | 2380 | ||
| 2274 | wbio->bi_vcnt = vcnt; | 2381 | wbio->bi_vcnt = vcnt; |
| 2275 | } else { | 2382 | } else { |
| 2276 | wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | 2383 | wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, |
| 2384 | mddev->bio_set); | ||
| 2277 | } | 2385 | } |
| 2278 | 2386 | ||
| 2279 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); | 2387 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); |
| @@ -2323,8 +2431,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio | |||
| 2323 | 2431 | ||
| 2324 | static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | 2432 | static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) |
| 2325 | { | 2433 | { |
| 2326 | int m; | 2434 | int m, idx; |
| 2327 | bool fail = false; | 2435 | bool fail = false; |
| 2436 | |||
| 2328 | for (m = 0; m < conf->raid_disks * 2 ; m++) | 2437 | for (m = 0; m < conf->raid_disks * 2 ; m++) |
| 2329 | if (r1_bio->bios[m] == IO_MADE_GOOD) { | 2438 | if (r1_bio->bios[m] == IO_MADE_GOOD) { |
| 2330 | struct md_rdev *rdev = conf->mirrors[m].rdev; | 2439 | struct md_rdev *rdev = conf->mirrors[m].rdev; |
| @@ -2350,8 +2459,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | |||
| 2350 | if (fail) { | 2459 | if (fail) { |
| 2351 | spin_lock_irq(&conf->device_lock); | 2460 | spin_lock_irq(&conf->device_lock); |
| 2352 | list_add(&r1_bio->retry_list, &conf->bio_end_io_list); | 2461 | list_add(&r1_bio->retry_list, &conf->bio_end_io_list); |
| 2353 | conf->nr_queued++; | 2462 | idx = sector_to_idx(r1_bio->sector); |
| 2463 | atomic_inc(&conf->nr_queued[idx]); | ||
| 2354 | spin_unlock_irq(&conf->device_lock); | 2464 | spin_unlock_irq(&conf->device_lock); |
| 2465 | /* | ||
| 2466 | * In case freeze_array() is waiting for condition | ||
| 2467 | * get_unqueued_pending() == extra to be true. | ||
| 2468 | */ | ||
| 2469 | wake_up(&conf->wait_barrier); | ||
| 2355 | md_wakeup_thread(conf->mddev->thread); | 2470 | md_wakeup_thread(conf->mddev->thread); |
| 2356 | } else { | 2471 | } else { |
| 2357 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | 2472 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
| @@ -2411,7 +2526,8 @@ read_more: | |||
| 2411 | const unsigned long do_sync | 2526 | const unsigned long do_sync |
| 2412 | = r1_bio->master_bio->bi_opf & REQ_SYNC; | 2527 | = r1_bio->master_bio->bi_opf & REQ_SYNC; |
| 2413 | r1_bio->read_disk = disk; | 2528 | r1_bio->read_disk = disk; |
| 2414 | bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | 2529 | bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, |
| 2530 | mddev->bio_set); | ||
| 2415 | bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, | 2531 | bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, |
| 2416 | max_sectors); | 2532 | max_sectors); |
| 2417 | r1_bio->bios[r1_bio->read_disk] = bio; | 2533 | r1_bio->bios[r1_bio->read_disk] = bio; |
| @@ -2445,15 +2561,8 @@ read_more: | |||
| 2445 | generic_make_request(bio); | 2561 | generic_make_request(bio); |
| 2446 | bio = NULL; | 2562 | bio = NULL; |
| 2447 | 2563 | ||
| 2448 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 2564 | r1_bio = alloc_r1bio(mddev, mbio, sectors_handled); |
| 2449 | |||
| 2450 | r1_bio->master_bio = mbio; | ||
| 2451 | r1_bio->sectors = bio_sectors(mbio) - sectors_handled; | ||
| 2452 | r1_bio->state = 0; | ||
| 2453 | set_bit(R1BIO_ReadError, &r1_bio->state); | 2565 | set_bit(R1BIO_ReadError, &r1_bio->state); |
| 2454 | r1_bio->mddev = mddev; | ||
| 2455 | r1_bio->sector = mbio->bi_iter.bi_sector + | ||
| 2456 | sectors_handled; | ||
| 2457 | 2566 | ||
| 2458 | goto read_more; | 2567 | goto read_more; |
| 2459 | } else { | 2568 | } else { |
| @@ -2472,6 +2581,7 @@ static void raid1d(struct md_thread *thread) | |||
| 2472 | struct r1conf *conf = mddev->private; | 2581 | struct r1conf *conf = mddev->private; |
| 2473 | struct list_head *head = &conf->retry_list; | 2582 | struct list_head *head = &conf->retry_list; |
| 2474 | struct blk_plug plug; | 2583 | struct blk_plug plug; |
| 2584 | int idx; | ||
| 2475 | 2585 | ||
| 2476 | md_check_recovery(mddev); | 2586 | md_check_recovery(mddev); |
| 2477 | 2587 | ||
| @@ -2479,17 +2589,15 @@ static void raid1d(struct md_thread *thread) | |||
| 2479 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { | 2589 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
| 2480 | LIST_HEAD(tmp); | 2590 | LIST_HEAD(tmp); |
| 2481 | spin_lock_irqsave(&conf->device_lock, flags); | 2591 | spin_lock_irqsave(&conf->device_lock, flags); |
| 2482 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { | 2592 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
| 2483 | while (!list_empty(&conf->bio_end_io_list)) { | 2593 | list_splice_init(&conf->bio_end_io_list, &tmp); |
| 2484 | list_move(conf->bio_end_io_list.prev, &tmp); | ||
| 2485 | conf->nr_queued--; | ||
| 2486 | } | ||
| 2487 | } | ||
| 2488 | spin_unlock_irqrestore(&conf->device_lock, flags); | 2594 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 2489 | while (!list_empty(&tmp)) { | 2595 | while (!list_empty(&tmp)) { |
| 2490 | r1_bio = list_first_entry(&tmp, struct r1bio, | 2596 | r1_bio = list_first_entry(&tmp, struct r1bio, |
| 2491 | retry_list); | 2597 | retry_list); |
| 2492 | list_del(&r1_bio->retry_list); | 2598 | list_del(&r1_bio->retry_list); |
| 2599 | idx = sector_to_idx(r1_bio->sector); | ||
| 2600 | atomic_dec(&conf->nr_queued[idx]); | ||
| 2493 | if (mddev->degraded) | 2601 | if (mddev->degraded) |
| 2494 | set_bit(R1BIO_Degraded, &r1_bio->state); | 2602 | set_bit(R1BIO_Degraded, &r1_bio->state); |
| 2495 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | 2603 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
| @@ -2510,7 +2618,8 @@ static void raid1d(struct md_thread *thread) | |||
| 2510 | } | 2618 | } |
| 2511 | r1_bio = list_entry(head->prev, struct r1bio, retry_list); | 2619 | r1_bio = list_entry(head->prev, struct r1bio, retry_list); |
| 2512 | list_del(head->prev); | 2620 | list_del(head->prev); |
| 2513 | conf->nr_queued--; | 2621 | idx = sector_to_idx(r1_bio->sector); |
| 2622 | atomic_dec(&conf->nr_queued[idx]); | ||
| 2514 | spin_unlock_irqrestore(&conf->device_lock, flags); | 2623 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 2515 | 2624 | ||
| 2516 | mddev = r1_bio->mddev; | 2625 | mddev = r1_bio->mddev; |
| @@ -2549,7 +2658,6 @@ static int init_resync(struct r1conf *conf) | |||
| 2549 | conf->poolinfo); | 2658 | conf->poolinfo); |
| 2550 | if (!conf->r1buf_pool) | 2659 | if (!conf->r1buf_pool) |
| 2551 | return -ENOMEM; | 2660 | return -ENOMEM; |
| 2552 | conf->next_resync = 0; | ||
| 2553 | return 0; | 2661 | return 0; |
| 2554 | } | 2662 | } |
| 2555 | 2663 | ||
| @@ -2578,6 +2686,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2578 | int still_degraded = 0; | 2686 | int still_degraded = 0; |
| 2579 | int good_sectors = RESYNC_SECTORS; | 2687 | int good_sectors = RESYNC_SECTORS; |
| 2580 | int min_bad = 0; /* number of sectors that are bad in all devices */ | 2688 | int min_bad = 0; /* number of sectors that are bad in all devices */ |
| 2689 | int idx = sector_to_idx(sector_nr); | ||
| 2581 | 2690 | ||
| 2582 | if (!conf->r1buf_pool) | 2691 | if (!conf->r1buf_pool) |
| 2583 | if (init_resync(conf)) | 2692 | if (init_resync(conf)) |
| @@ -2627,7 +2736,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2627 | * If there is non-resync activity waiting for a turn, then let it | 2736 | * If there is non-resync activity waiting for a turn, then let it |
| 2628 | * though before starting on this new sync request. | 2737 | * though before starting on this new sync request. |
| 2629 | */ | 2738 | */ |
| 2630 | if (conf->nr_waiting) | 2739 | if (atomic_read(&conf->nr_waiting[idx])) |
| 2631 | schedule_timeout_uninterruptible(1); | 2740 | schedule_timeout_uninterruptible(1); |
| 2632 | 2741 | ||
| 2633 | /* we are incrementing sector_nr below. To be safe, we check against | 2742 | /* we are incrementing sector_nr below. To be safe, we check against |
| @@ -2654,6 +2763,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2654 | r1_bio->sector = sector_nr; | 2763 | r1_bio->sector = sector_nr; |
| 2655 | r1_bio->state = 0; | 2764 | r1_bio->state = 0; |
| 2656 | set_bit(R1BIO_IsSync, &r1_bio->state); | 2765 | set_bit(R1BIO_IsSync, &r1_bio->state); |
| 2766 | /* make sure good_sectors won't go across barrier unit boundary */ | ||
| 2767 | good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors); | ||
| 2657 | 2768 | ||
| 2658 | for (i = 0; i < conf->raid_disks * 2; i++) { | 2769 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| 2659 | struct md_rdev *rdev; | 2770 | struct md_rdev *rdev; |
| @@ -2884,6 +2995,26 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2884 | if (!conf) | 2995 | if (!conf) |
| 2885 | goto abort; | 2996 | goto abort; |
| 2886 | 2997 | ||
| 2998 | conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR, | ||
| 2999 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3000 | if (!conf->nr_pending) | ||
| 3001 | goto abort; | ||
| 3002 | |||
| 3003 | conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR, | ||
| 3004 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3005 | if (!conf->nr_waiting) | ||
| 3006 | goto abort; | ||
| 3007 | |||
| 3008 | conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR, | ||
| 3009 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3010 | if (!conf->nr_queued) | ||
| 3011 | goto abort; | ||
| 3012 | |||
| 3013 | conf->barrier = kcalloc(BARRIER_BUCKETS_NR, | ||
| 3014 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3015 | if (!conf->barrier) | ||
| 3016 | goto abort; | ||
| 3017 | |||
| 2887 | conf->mirrors = kzalloc(sizeof(struct raid1_info) | 3018 | conf->mirrors = kzalloc(sizeof(struct raid1_info) |
| 2888 | * mddev->raid_disks * 2, | 3019 | * mddev->raid_disks * 2, |
| 2889 | GFP_KERNEL); | 3020 | GFP_KERNEL); |
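A quick sanity check of what the four kcalloc() calls above cost, assuming (per the matching raid1.h change, which is not part of this diff) that BARRIER_BUCKETS_NR is chosen so each counter array fills exactly one page:

#include <stdio.h>

int main(void)
{
	unsigned int page_size = 4096;	/* assumed PAGE_SIZE */
	unsigned int atomic_size = 4;	/* assumed sizeof(atomic_t) */
	unsigned int buckets = page_size / atomic_size;	/* presumed BARRIER_BUCKETS_NR */

	printf("%u buckets -> %u bytes per counter array, %u bytes for all four\n",
	       buckets, buckets * atomic_size, 4 * buckets * atomic_size);
	return 0;
}

Under those assumptions the per-bucket bookkeeping replaces the old single counters with four page-sized arrays per array, a small price for removing the global resync window.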
| @@ -2939,9 +3070,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2939 | conf->pending_count = 0; | 3070 | conf->pending_count = 0; |
| 2940 | conf->recovery_disabled = mddev->recovery_disabled - 1; | 3071 | conf->recovery_disabled = mddev->recovery_disabled - 1; |
| 2941 | 3072 | ||
| 2942 | conf->start_next_window = MaxSector; | ||
| 2943 | conf->current_window_requests = conf->next_window_requests = 0; | ||
| 2944 | |||
| 2945 | err = -EIO; | 3073 | err = -EIO; |
| 2946 | for (i = 0; i < conf->raid_disks * 2; i++) { | 3074 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| 2947 | 3075 | ||
| @@ -2984,6 +3112,10 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2984 | kfree(conf->mirrors); | 3112 | kfree(conf->mirrors); |
| 2985 | safe_put_page(conf->tmppage); | 3113 | safe_put_page(conf->tmppage); |
| 2986 | kfree(conf->poolinfo); | 3114 | kfree(conf->poolinfo); |
| 3115 | kfree(conf->nr_pending); | ||
| 3116 | kfree(conf->nr_waiting); | ||
| 3117 | kfree(conf->nr_queued); | ||
| 3118 | kfree(conf->barrier); | ||
| 2987 | kfree(conf); | 3119 | kfree(conf); |
| 2988 | } | 3120 | } |
| 2989 | return ERR_PTR(err); | 3121 | return ERR_PTR(err); |
| @@ -3085,6 +3217,10 @@ static void raid1_free(struct mddev *mddev, void *priv) | |||
| 3085 | kfree(conf->mirrors); | 3217 | kfree(conf->mirrors); |
| 3086 | safe_put_page(conf->tmppage); | 3218 | safe_put_page(conf->tmppage); |
| 3087 | kfree(conf->poolinfo); | 3219 | kfree(conf->poolinfo); |
| 3220 | kfree(conf->nr_pending); | ||
| 3221 | kfree(conf->nr_waiting); | ||
| 3222 | kfree(conf->nr_queued); | ||
| 3223 | kfree(conf->barrier); | ||
| 3088 | kfree(conf); | 3224 | kfree(conf); |
| 3089 | } | 3225 | } |
| 3090 | 3226 | ||
