author:    colyli@suse.de <colyli@suse.de>    2017-02-17 14:05:57 -0500
committer: Shaohua Li <shli@fb.com>           2017-02-20 01:04:25 -0500
commit:    824e47daddbfc6ebe1006b8659f080620472a136
tree:      b4a3076a35b2d13079349d82fe0203c9eb8879a2 /drivers/md/raid1.c
parent:    fd76863e37fef26fe05547fddfa6e3d05e1682e6
RAID1: avoid unnecessary spin locks in I/O barrier code
When I ran a parallel read performance test on an md raid1 device built from
two NVMe SSDs, I was surprised by the very poor throughput: with fio using a
64KB block size, 40 sequential read jobs and an iodepth of 128, the overall
throughput was only 2.7GB/s, around 50% of the ideal performance number.
perf reports that the lock contention happens in the allow_barrier() and
wait_barrier() code:
- 41.41% fio [kernel.kallsyms] [k] _raw_spin_lock_irqsave
- _raw_spin_lock_irqsave
+ 89.92% allow_barrier
+ 9.34% __wake_up
- 37.30% fio [kernel.kallsyms] [k] _raw_spin_lock_irq
- _raw_spin_lock_irq
- 100.00% wait_barrier
The reason is that these I/O barrier related functions,
- raise_barrier()
- lower_barrier()
- wait_barrier()
- allow_barrier()
always hold conf->resync_lock first, even when there is only regular read I/O
and no resync I/O at all. This is a huge performance penalty.
The solution is a lockless-like algorithm in the I/O barrier code, which holds
conf->resync_lock only when it has to.
The original idea is from Hannes Reinecke, and Neil Brown provided comments to
improve it. I continued to work on it and brought the patch into its current
form.
In the new, simpler raid1 I/O barrier implementation there are two wait
barrier functions:
- wait_barrier()
Which calls _wait_barrier() and is used for regular write I/O. If there is
resync I/O happening on the same I/O barrier bucket, or the whole array is
frozen, the task will wait until no barrier is raised on the same barrier
bucket, or the whole array is unfrozen.
- wait_read_barrier()
Since regular read I/O won't interfere with resync I/O (read_balance()
will make sure only up-to-date data is read out), it is unnecessary for
regular read I/O to wait for a barrier; waiting is only necessary when the
whole array is frozen.
The operations on conf->nr_pending[idx], conf->nr_waiting[idx] and
conf->barrier[idx] are very carefully designed in raise_barrier(),
lower_barrier(), _wait_barrier() and wait_read_barrier(), in order to
avoid unnecessary spin locks in these functions. Once conf->nr_pending[idx]
is increased, a resync I/O with the same barrier bucket index has to wait in
raise_barrier(). Then, in _wait_barrier(), if no barrier is raised on the same
barrier bucket index and the array is not frozen, the regular I/O doesn't need
to hold conf->resync_lock; it can just increase conf->nr_pending[idx] and
return to its caller. wait_read_barrier() is very similar to _wait_barrier();
the only difference is that it waits only when the array is frozen. For heavy
parallel read I/O, the lockless I/O barrier code gets rid of almost all of the
spin lock cost.
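To make the ordering rule above concrete, here is a minimal user-space sketch
of the fast paths, with C11 atomics standing in for the kernel's atomic_t and
smp_mb__after_atomic(). The helper names (try_wait_barrier(),
try_wait_read_barrier(), model_raise_barrier()) and the busy-wait are
illustrative assumptions only; the real implementation, including the
conf->resync_lock slow path and the wait queues, is the one in the diff below.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BARRIER_BUCKETS_NR 1024

static atomic_int nr_pending[BARRIER_BUCKETS_NR];
static atomic_int barrier_cnt[BARRIER_BUCKETS_NR];
static atomic_int array_frozen;

/* Write-side fast path: increase nr_pending first, fence, then check. */
static bool try_wait_barrier(int idx)
{
	atomic_fetch_add(&nr_pending[idx], 1);
	/* Pairs with the fence in model_raise_barrier(): the loads below
	 * must not be reordered before the increment above. */
	atomic_thread_fence(memory_order_seq_cst);
	if (!atomic_load(&array_frozen) && !atomic_load(&barrier_cnt[idx]))
		return true;	/* fast path: proceed without any lock */
	atomic_fetch_sub(&nr_pending[idx], 1);
	return false;		/* slow path: the kernel takes resync_lock and sleeps */
}

/* Read-side fast path: only the frozen flag matters, so no fence is needed. */
static bool try_wait_read_barrier(int idx)
{
	atomic_fetch_add(&nr_pending[idx], 1);
	if (!atomic_load(&array_frozen))
		return true;
	atomic_fetch_sub(&nr_pending[idx], 1);
	return false;
}

/* Resync side: raise the barrier first, fence, then wait for pending I/O. */
static void model_raise_barrier(int idx)
{
	atomic_fetch_add(&barrier_cnt[idx], 1);
	atomic_thread_fence(memory_order_seq_cst);
	while (atomic_load(&nr_pending[idx]) > 0)
		;	/* the kernel sleeps on conf->wait_barrier instead of spinning */
}

int main(void)
{
	int idx = 7;

	if (try_wait_read_barrier(idx))
		printf("read I/O admitted on bucket %d without a spin lock\n", idx);
	atomic_fetch_sub(&nr_pending[idx], 1);	/* allow_barrier() equivalent */

	if (try_wait_barrier(idx))
		printf("write I/O admitted on bucket %d without a spin lock\n", idx);
	atomic_fetch_sub(&nr_pending[idx], 1);

	model_raise_barrier(idx);
	printf("barrier raised on bucket %d after pending I/O drained\n", idx);
	return 0;
}

The point of the two fences is the symmetric ordering: regular I/O publishes
nr_pending before reading barrier, while resync publishes barrier before
reading nr_pending, so at least one side is guaranteed to see the other and
take the slow path.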
This patch significantly improves raid1 read performance. In my testing, on a
raid1 device built from two NVMe SSDs, running fio with a 64KB block size,
40 sequential read jobs and an iodepth of 128, the overall throughput
increases from 2.7GB/s to 4.6GB/s (+70%).
Changelog
V4:
- Change conf->nr_queued[] to atomic_t.
- Define BARRIER_BUCKETS_NR_BITS by (PAGE_SHIFT - ilog2(sizeof(atomic_t)));
a small worked example follows this changelog.
V3:
- Add smp_mb__after_atomic() as Shaohua and Neil suggested.
- Change conf->nr_queued[] from atomic_t to int.
- Change conf->array_frozen from atomic_t back to int, and use
READ_ONCE(conf->array_frozen) to check value of conf->array_frozen
in _wait_barrier() and wait_read_barrier().
- In _wait_barrier() and wait_read_barrier(), add a call to
wake_up(&conf->wait_barrier) after atomic_dec(&conf->nr_pending[idx]),
to fix a deadlock between _wait_barrier()/wait_read_barrier() and
freeze_array().
V2:
- Remove a spin_lock/unlock pair in raid1d().
- Add more code comments to explain why there is no race when checking two
atomic_t variables at the same time.
V1:
- Original RFC patch for comments.
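Regarding the BARRIER_BUCKETS_NR_BITS definition noted under V4: the intent is
that each per-bucket counter array occupies exactly one page once the counters
are atomic_t. The worked example below assumes 4 KiB pages and a 4-byte
atomic_t (the real macros live in drivers/md/raid1.h and use the kernel's
ilog2()); it is an illustration, not kernel code.

#include <stdio.h>

int main(void)
{
	const int page_shift = 12;   /* assumption: 4 KiB pages */
	const int atomic_size = 4;   /* assumption: sizeof(atomic_t) == 4 */
	/* __builtin_ctz() of a power of two plays the role of ilog2() here */
	const int bits = page_shift - __builtin_ctz(atomic_size);
	const int buckets = 1 << bits;

	printf("BARRIER_BUCKETS_NR_BITS = %d\n", bits);                  /* 10   */
	printf("BARRIER_BUCKETS_NR      = %d\n", buckets);               /* 1024 */
	printf("bytes per counter array = %d\n", buckets * atomic_size); /* 4096, one page */
	return 0;
}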
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Shaohua Li <shli@fb.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Guoqing Jiang <gqjiang@suse.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Shaohua Li <shli@fb.com>
Diffstat (limited to 'drivers/md/raid1.c')
 -rw-r--r--  drivers/md/raid1.c | 165
 1 file changed, 114 insertions(+), 51 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 40297fd17f7e..fefbbfdb440b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -226,7 +226,7 @@ static void reschedule_retry(struct r1bio *r1_bio)
 	idx = sector_to_idx(r1_bio->sector);
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r1_bio->retry_list, &conf->retry_list);
-	conf->nr_queued[idx]++;
+	atomic_inc(&conf->nr_queued[idx]);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	wake_up(&conf->wait_barrier);
@@ -836,11 +836,21 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 	spin_lock_irq(&conf->resync_lock);
 
 	/* Wait until no block IO is waiting */
-	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting[idx],
+	wait_event_lock_irq(conf->wait_barrier,
+			    !atomic_read(&conf->nr_waiting[idx]),
 			    conf->resync_lock);
 
 	/* block any new IO from starting */
-	conf->barrier[idx]++;
+	atomic_inc(&conf->barrier[idx]);
+	/*
+	 * In raise_barrier() we firstly increase conf->barrier[idx] then
+	 * check conf->nr_pending[idx]. In _wait_barrier() we firstly
+	 * increase conf->nr_pending[idx] then check conf->barrier[idx].
+	 * A memory barrier here to make sure conf->nr_pending[idx] won't
+	 * be fetched before conf->barrier[idx] is increased. Otherwise
+	 * there will be a race between raise_barrier() and _wait_barrier().
+	 */
+	smp_mb__after_atomic();
 
 	/* For these conditions we must wait:
 	 * A: while the array is in frozen state
@@ -851,42 +861,81 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 	 */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->array_frozen &&
-			     !conf->nr_pending[idx] &&
-			     conf->barrier[idx] < RESYNC_DEPTH,
+			     !atomic_read(&conf->nr_pending[idx]) &&
+			     atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
 			    conf->resync_lock);
 
-	conf->nr_pending[idx]++;
+	atomic_inc(&conf->nr_pending[idx]);
 	spin_unlock_irq(&conf->resync_lock);
 }
 
 static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-	unsigned long flags;
 	int idx = sector_to_idx(sector_nr);
 
-	BUG_ON(conf->barrier[idx] <= 0);
+	BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
 
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->barrier[idx]--;
-	conf->nr_pending[idx]--;
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	atomic_dec(&conf->barrier[idx]);
+	atomic_dec(&conf->nr_pending[idx]);
 	wake_up(&conf->wait_barrier);
 }
 
 static void _wait_barrier(struct r1conf *conf, int idx)
 {
-	spin_lock_irq(&conf->resync_lock);
-	if (conf->array_frozen || conf->barrier[idx]) {
-		conf->nr_waiting[idx]++;
-		/* Wait for the barrier to drop. */
-		wait_event_lock_irq(
-			conf->wait_barrier,
-			!conf->array_frozen && !conf->barrier[idx],
-			conf->resync_lock);
-		conf->nr_waiting[idx]--;
-	}
+	/*
+	 * We need to increase conf->nr_pending[idx] very early here,
+	 * then raise_barrier() can be blocked when it waits for
+	 * conf->nr_pending[idx] to be 0. Then we can avoid holding
+	 * conf->resync_lock when there is no barrier raised in same
+	 * barrier unit bucket. Also if the array is frozen, I/O
+	 * should be blocked until array is unfrozen.
+	 */
+	atomic_inc(&conf->nr_pending[idx]);
+	/*
+	 * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
+	 * check conf->barrier[idx]. In raise_barrier() we firstly increase
+	 * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
+	 * barrier is necessary here to make sure conf->barrier[idx] won't be
+	 * fetched before conf->nr_pending[idx] is increased. Otherwise there
+	 * will be a race between _wait_barrier() and raise_barrier().
+	 */
+	smp_mb__after_atomic();
+
+	/*
+	 * Don't worry about checking two atomic_t variables at same time
+	 * here. If during we check conf->barrier[idx], the array is
+	 * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is
+	 * 0, it is safe to return and make the I/O continue. Because the
+	 * array is frozen, all I/O returned here will eventually complete
+	 * or be queued, no race will happen. See code comment in
+	 * frozen_array().
+	 */
+	if (!READ_ONCE(conf->array_frozen) &&
+	    !atomic_read(&conf->barrier[idx]))
+		return;
 
-	conf->nr_pending[idx]++;
+	/*
+	 * After holding conf->resync_lock, conf->nr_pending[idx]
+	 * should be decreased before waiting for barrier to drop.
+	 * Otherwise, we may encounter a race condition because
+	 * raise_barrer() might be waiting for conf->nr_pending[idx]
+	 * to be 0 at same time.
+	 */
+	spin_lock_irq(&conf->resync_lock);
+	atomic_inc(&conf->nr_waiting[idx]);
+	atomic_dec(&conf->nr_pending[idx]);
+	/*
+	 * In case freeze_array() is waiting for
+	 * get_unqueued_pending() == extra
+	 */
+	wake_up(&conf->wait_barrier);
+	/* Wait for the barrier in same barrier unit bucket to drop. */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !conf->array_frozen &&
+			     !atomic_read(&conf->barrier[idx]),
+			    conf->resync_lock);
+	atomic_inc(&conf->nr_pending[idx]);
+	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -894,18 +943,32 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 {
 	int idx = sector_to_idx(sector_nr);
 
-	spin_lock_irq(&conf->resync_lock);
-	if (conf->array_frozen) {
-		conf->nr_waiting[idx]++;
-		/* Wait for array to unfreeze */
-		wait_event_lock_irq(
-			conf->wait_barrier,
-			!conf->array_frozen,
-			conf->resync_lock);
-		conf->nr_waiting[idx]--;
-	}
+	/*
+	 * Very similar to _wait_barrier(). The difference is, for read
+	 * I/O we don't need wait for sync I/O, but if the whole array
+	 * is frozen, the read I/O still has to wait until the array is
+	 * unfrozen. Since there is no ordering requirement with
+	 * conf->barrier[idx] here, memory barrier is unnecessary as well.
+	 */
+	atomic_inc(&conf->nr_pending[idx]);
 
-	conf->nr_pending[idx]++;
+	if (!READ_ONCE(conf->array_frozen))
+		return;
+
+	spin_lock_irq(&conf->resync_lock);
+	atomic_inc(&conf->nr_waiting[idx]);
+	atomic_dec(&conf->nr_pending[idx]);
+	/*
+	 * In case freeze_array() is waiting for
+	 * get_unqueued_pending() == extra
+	 */
+	wake_up(&conf->wait_barrier);
+	/* Wait for array to be unfrozen */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !conf->array_frozen,
+			    conf->resync_lock);
+	atomic_inc(&conf->nr_pending[idx]);
+	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -926,11 +989,7 @@ static void wait_all_barriers(struct r1conf *conf)
 
 static void _allow_barrier(struct r1conf *conf, int idx)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->nr_pending[idx]--;
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	atomic_dec(&conf->nr_pending[idx]);
 	wake_up(&conf->wait_barrier);
 }
 
@@ -955,7 +1014,8 @@ static int get_unqueued_pending(struct r1conf *conf)
 	int idx, ret;
 
 	for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
-		ret += conf->nr_pending[idx] - conf->nr_queued[idx];
+		ret += atomic_read(&conf->nr_pending[idx]) -
+			atomic_read(&conf->nr_queued[idx]);
 
 	return ret;
 }
@@ -1000,8 +1060,8 @@ static void unfreeze_array(struct r1conf *conf)
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
 	conf->array_frozen = 0;
-	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
+	wake_up(&conf->wait_barrier);
 }
 
 /* duplicate the data pages for behind I/O
@@ -2391,8 +2451,13 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 		spin_lock_irq(&conf->device_lock);
 		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
 		idx = sector_to_idx(r1_bio->sector);
-		conf->nr_queued[idx]++;
+		atomic_inc(&conf->nr_queued[idx]);
 		spin_unlock_irq(&conf->device_lock);
+		/*
+		 * In case freeze_array() is waiting for condition
+		 * get_unqueued_pending() == extra to be true.
+		 */
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
 		if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2523,9 +2588,7 @@ static void raid1d(struct md_thread *thread)
 						  retry_list);
 			list_del(&r1_bio->retry_list);
 			idx = sector_to_idx(r1_bio->sector);
-			spin_lock_irqsave(&conf->device_lock, flags);
-			conf->nr_queued[idx]--;
-			spin_unlock_irqrestore(&conf->device_lock, flags);
+			atomic_dec(&conf->nr_queued[idx]);
 			if (mddev->degraded)
 				set_bit(R1BIO_Degraded, &r1_bio->state);
 			if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2547,7 +2610,7 @@ static void raid1d(struct md_thread *thread)
 		r1_bio = list_entry(head->prev, struct r1bio, retry_list);
 		list_del(head->prev);
 		idx = sector_to_idx(r1_bio->sector);
-		conf->nr_queued[idx]--;
+		atomic_dec(&conf->nr_queued[idx]);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r1_bio->mddev;
@@ -2664,7 +2727,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	 * If there is non-resync activity waiting for a turn, then let it
 	 * though before starting on this new sync request.
 	 */
-	if (conf->nr_waiting[idx])
+	if (atomic_read(&conf->nr_waiting[idx]))
 		schedule_timeout_uninterruptible(1);
 
 	/* we are incrementing sector_nr below. To be safe, we check against
@@ -2924,22 +2987,22 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		goto abort;
 
 	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
-				   sizeof(int), GFP_KERNEL);
+				   sizeof(atomic_t), GFP_KERNEL);
 	if (!conf->nr_pending)
 		goto abort;
 
 	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
-				   sizeof(int), GFP_KERNEL);
+				   sizeof(atomic_t), GFP_KERNEL);
 	if (!conf->nr_waiting)
 		goto abort;
 
 	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
-				  sizeof(int), GFP_KERNEL);
+				  sizeof(atomic_t), GFP_KERNEL);
 	if (!conf->nr_queued)
 		goto abort;
 
 	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
-				sizeof(int), GFP_KERNEL);
+				sizeof(atomic_t), GFP_KERNEL);
 	if (!conf->barrier)
 		goto abort;
 