author     Linus Torvalds <torvalds@linux-foundation.org>  2013-11-20 16:05:25 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-11-20 16:05:25 -0500
commit     6d6e352c80f22c446d933ca8103e02bac1f09129
tree       248a6a7ebc5ea95986da5bccdd6d75b255cf28e4  /drivers/md/raid1.c
parent     b4789b8e6be3151a955ade74872822f30e8cd914
parent     60aaf933854511630e16be4efe0f96485e132de4
Merge tag 'md/3.13' of git://neil.brown.name/md
Pull md update from Neil Brown:
"Mostly optimisations and obscure bug fixes.
- raid5 gets less lock contention
- raid1 gets less contention between normal-io and resync-io during
resync"
* tag 'md/3.13' of git://neil.brown.name/md:
md/raid5: Use conf->device_lock protect changing of multi-thread resources.
md/raid5: Before freeing old multi-thread worker, it should flush them.
md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE.
UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
raid1: Rewrite the implementation of iobarrier.
raid1: Add some macros to make code clearly.
raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
raid1: Add a field array_frozen to indicate whether raid in freeze state.
md: Convert use of typedef ctl_table to struct ctl_table
md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
md: fix some places where mddev_lock return value is not checked.
raid5: Retry R5_ReadNoMerge flag when hit a read error.
raid5: relieve lock contention in get_active_stripe()
raid5: relieve lock contention in get_active_stripe()
wait: add wait_event_cmd()
md/raid5.c: add proper locking to error path of raid5_start_reshape.
md: fix calculation of stacking limits on level change.
raid5: Use slow_path to release stripe when mddev->thread is null
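The raid1 entries above all build on one idea: instead of a single global barrier, resync and normal writes now negotiate over a sliding window of sectors, so a write far away from the region currently being resynced no longer has to wait. A rough standalone sketch of that window test, reusing the constants introduced in the patch below (the toy_conf struct, must_wait() and main() are illustrative stand-ins, not kernel code):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long sector_t;

#define RESYNC_BLOCK_SIZE	(64 * 1024)
#define RESYNC_DEPTH		32
#define RESYNC_SECTORS		(RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_WINDOW		(RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS	(RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE	(3 * RESYNC_WINDOW_SECTORS)

struct toy_conf {
	bool	 array_frozen;	/* set while the array is frozen */
	int	 barrier;	/* nonzero while resync is active */
	sector_t next_resync;	/* sector the resync will handle next */
};

/* Simplified version of the "does this write conflict with resync?" test. */
static bool must_wait(const struct toy_conf *conf,
		      sector_t bi_sector, sector_t sectors)
{
	sector_t end = bi_sector + sectors;

	if (conf->array_frozen)
		return true;
	if (!conf->barrier)
		return false;
	/* Write lies entirely behind the resync window: no conflict. */
	if (conf->next_resync >= RESYNC_WINDOW_SECTORS &&
	    conf->next_resync - RESYNC_WINDOW_SECTORS >= end)
		return false;
	/* Write lies far ahead of where resync will go next: no conflict. */
	if (conf->next_resync + NEXT_NORMALIO_DISTANCE <= bi_sector)
		return false;
	return true;
}

int main(void)
{
	struct toy_conf conf = { .barrier = 1, .next_resync = 100000 };

	printf("window = %d sectors, normal-io distance = %d sectors\n",
	       RESYNC_WINDOW_SECTORS, NEXT_NORMALIO_DISTANCE);
	printf("write at sector 8:      %s\n",
	       must_wait(&conf, 8, 8) ? "wait" : "proceed");
	printf("write at sector 100050: %s\n",
	       must_wait(&conf, 100050, 8) ? "wait" : "proceed");
	printf("write at sector 200000: %s\n",
	       must_wait(&conf, 200000, 8) ? "wait" : "proceed");
	return 0;
}

Built with a stock C compiler, this reports that the writes at sectors 8 and 200000 proceed, while the one at 100050 (inside the window around next_resync = 100000) must wait.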
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c | 162
1 file changed, 132 insertions(+), 30 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index af6681b19776..1e5a540995e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector);
 static void lower_barrier(struct r1conf *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 }
 
 #define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
+#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct bio *bio = r1_bio->master_bio;
 	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
+	sector_t start_next_window = r1_bio->start_next_window;
+	sector_t bi_sector = bio->bi_sector;
 
 	if (bio->bi_phys_segments) {
 		unsigned long flags;
@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		bio->bi_phys_segments--;
 		done = (bio->bi_phys_segments == 0);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/*
+		 * make_request() might be waiting for
+		 * bi_phys_segments to decrease
+		 */
+		wake_up(&conf->wait_barrier);
 	} else
 		done = 1;
 
@@ -243,7 +253,7 @@
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
 		 */
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bi_sector);
 	}
 }
 
@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf)
  * there is no normal IO happeing. It must arrange to call
  * lower_barrier when the particular background IO completes.
  */
-#define RESYNC_DEPTH 32
-
 static void raise_barrier(struct r1conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf)
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* Now wait for all pending IO to complete */
+	/* For these conditions we must wait:
+	 * A: while the array is in frozen state
+	 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
+	 *    the max count which allowed.
+	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
+	 *    next resync will reach to the window which normal bios are
+	 *    handling.
+	 */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    !conf->array_frozen &&
+			    conf->barrier < RESYNC_DEPTH &&
+			    (conf->start_next_window >=
+			     conf->next_resync + RESYNC_SECTORS),
 			    conf->resync_lock);
 
 	spin_unlock_irq(&conf->resync_lock);
@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r1conf *conf)
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
 {
+	bool wait = false;
+
+	if (conf->array_frozen || !bio)
+		wait = true;
+	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync < RESYNC_WINDOW_SECTORS)
+			wait = true;
+		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
+				>= bio_end_sector(bio)) ||
+			 (conf->next_resync + NEXT_NORMALIO_DISTANCE
+				<= bio->bi_sector))
+			wait = false;
+		else
+			wait = true;
+	}
+
+	return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+{
+	sector_t sector = 0;
+
 	spin_lock_irq(&conf->resync_lock);
-	if (conf->barrier) {
+	if (need_to_wait_for_sync(conf, bio)) {
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf)
 		 * count down.
 		 */
 		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (conf->nr_pending &&
+				    !conf->array_frozen &&
+				    (!conf->barrier ||
+				    ((conf->start_next_window <
+				      conf->next_resync + RESYNC_SECTORS) &&
 				     current->bio_list &&
-				     !bio_list_empty(current->bio_list)),
+				     !bio_list_empty(current->bio_list))),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 	}
+
+	if (bio && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync + NEXT_NORMALIO_DISTANCE
+		    <= bio->bi_sector) {
+			if (conf->start_next_window == MaxSector)
+				conf->start_next_window =
+					conf->next_resync +
+					NEXT_NORMALIO_DISTANCE;
+
+			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+			    <= bio->bi_sector)
+				conf->next_window_requests++;
+			else
+				conf->current_window_requests++;
+		}
+		if (bio->bi_sector >= conf->start_next_window)
+			sector = conf->start_next_window;
+	}
+
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
+	return sector;
 }
 
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector)
 {
 	unsigned long flags;
+
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->nr_pending--;
+	if (start_next_window) {
+		if (start_next_window == conf->start_next_window) {
+			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+			    <= bi_sector)
+				conf->next_window_requests--;
+			else
+				conf->current_window_requests--;
+		} else
+			conf->current_window_requests--;
+
+		if (!conf->current_window_requests) {
+			if (conf->next_window_requests) {
+				conf->current_window_requests =
+					conf->next_window_requests;
+				conf->next_window_requests = 0;
+				conf->start_next_window +=
+					NEXT_NORMALIO_DISTANCE;
+			} else
+				conf->start_next_window = MaxSector;
+		}
+	}
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quite.
-	 * We increment barrier and nr_waiting, and then
-	 * wait until nr_pending match nr_queued+extra
+	 * We wait until nr_pending match nr_queued+extra
 	 * This is called in the context of one normal IO request
 	 * that has failed. Thus any sync request that might be pending
 	 * will be blocked by nr_pending, and we need to wait for
@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier++;
-	conf->nr_waiting++;
+	conf->array_frozen = 1;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
 				conf->nr_pending == conf->nr_queued+extra,
 				conf->resync_lock,
@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
-	conf->nr_waiting--;
+	conf->array_frozen = 0;
 	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
+	sector_t start_next_window;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 		finish_wait(&conf->wait_barrier, &w);
 	}
 
-	wait_barrier(conf);
+	start_next_window = wait_barrier(conf, bio);
 
 	bitmap = mddev->bitmap;
 
@@ -1163,6 +1247,7 @@ read_again:
 
 	disks = conf->raid_disks * 2;
  retry_write:
+	r1_bio->start_next_window = start_next_window;
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r1_bio->sectors;
@@ -1231,14 +1316,24 @@ read_again:
 	if (unlikely(blocked_rdev)) {
 		/* Wait for this device to become unblocked */
 		int j;
+		sector_t old = start_next_window;
 
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bio->bi_sector);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
+		start_next_window = wait_barrier(conf, bio);
+		/*
+		 * We must make sure the multi r1bios of bio have
+		 * the same value of bi_phys_segments
+		 */
+		if (bio->bi_phys_segments && old &&
+		    old != start_next_window)
+			/* Wait for the former r1bio(s) to complete */
+			wait_event(conf->wait_barrier,
+				   bio->bi_phys_segments == 1);
 		goto retry_write;
 	}
 
@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-	wait_barrier(conf);
-	allow_barrier(conf);
+	wait_barrier(conf, NULL);
+	allow_barrier(conf, 0, 0);
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
+
+	conf->next_resync = 0;
+	conf->start_next_window = MaxSector;
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	conf->start_next_window = MaxSector;
+	conf->current_window_requests = conf->next_window_requests = 0;
+
 	err = -EIO;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev)
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
 
-	raise_barrier(conf);
-	lower_barrier(conf);
+	freeze_array(conf, 0);
+	unfreeze_array(conf);
 
 	md_unregister_thread(&mddev->thread);
 	if (conf->r1bio_pool)
@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
 		wake_up(&conf->wait_barrier);
 		break;
 	case 1:
-		raise_barrier(conf);
+		freeze_array(conf, 0);
 		break;
 	case 0:
-		lower_barrier(conf);
+		unfreeze_array(conf);
 		break;
 	}
 }
@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev)
 		mddev->new_chunk_sectors = 0;
 		conf = setup_conf(mddev);
 		if (!IS_ERR(conf))
-			conf->barrier = 1;
+			/* Array must appear to be quiesced */
+			conf->array_frozen = 1;
 		return conf;
 	}
 	return ERR_PTR(-EINVAL);
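For orientation, a much-simplified standalone model of the per-window request accounting that the reworked wait_barrier()/allow_barrier() pair maintains: every admitted write is counted against either the current or the next window, and once the current window drains its counts are promoted and start_next_window slides forward by NEXT_NORMALIO_DISTANCE. The names mirror the patch, but admit(), complete() and the rest of this snippet are illustrative only; the real code runs under conf->resync_lock, handles the MaxSector "no window yet" state, and is driven by actual bios.

#include <stdio.h>

typedef unsigned long long sector_t;

#define NEXT_NORMALIO_DISTANCE	12288ULL	/* 3 * RESYNC_WINDOW_SECTORS */

static sector_t start_next_window = 12288;	/* window already opened */
static int current_window_requests;
static int next_window_requests;

/* A write at 'sector' is admitted; remember which window counted it. */
static sector_t admit(sector_t sector)
{
	if (start_next_window + NEXT_NORMALIO_DISTANCE <= sector)
		next_window_requests++;
	else
		current_window_requests++;
	return start_next_window;		/* handed back on completion */
}

/* The same write completes; 'window' is the value admit() returned. */
static void complete(sector_t window, sector_t sector)
{
	if (window == start_next_window &&
	    start_next_window + NEXT_NORMALIO_DISTANCE <= sector)
		next_window_requests--;
	else
		current_window_requests--;

	/* Current window drained: promote the next window and slide forward. */
	if (!current_window_requests && next_window_requests) {
		current_window_requests = next_window_requests;
		next_window_requests = 0;
		start_next_window += NEXT_NORMALIO_DISTANCE;
	}
}

int main(void)
{
	sector_t w1 = admit(13000);	/* lands in the current window */
	sector_t w2 = admit(30000);	/* far ahead: counted in the next window */

	complete(w1, 13000);
	printf("start_next_window after current window drained: %llu\n",
	       start_next_window);	/* advanced by NEXT_NORMALIO_DISTANCE */
	complete(w2, 30000);
	return 0;
}

In the kernel, the value returned by wait_barrier() travels with the request (r1_bio->start_next_window) so that call_bio_endio() can hand it back to allow_barrier() when the write finishes, keeping the two counters balanced.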