author      colyli@suse.de <colyli@suse.de>          2017-02-17 14:05:57 -0500
committer   Shaohua Li <shli@fb.com>                 2017-02-20 01:04:25 -0500
commit      824e47daddbfc6ebe1006b8659f080620472a136 (patch)
tree        b4a3076a35b2d13079349d82fe0203c9eb8879a2 /drivers/md/raid1.c
parent      fd76863e37fef26fe05547fddfa6e3d05e1682e6 (diff)
RAID1: avoid unnecessary spin locks in I/O barrier code
When I run a parallel reading performance test on a md raid1 device built from
two NVMe SSDs, I observe surprisingly bad throughput: with fio at 64KB block
size, 40 sequential read I/O jobs and 128 iodepth, overall throughput is only
2.7GB/s, around 50% of the ideal performance number. The perf report shows
lock contention in allow_barrier() and wait_barrier():

   - 41.41%  fio  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
      - _raw_spin_lock_irqsave
         + 89.92% allow_barrier
         + 9.34% __wake_up
   - 37.30%  fio  [kernel.kallsyms]  [k] _raw_spin_lock_irq
      - _raw_spin_lock_irq
         - 100.00% wait_barrier

The reason is that the I/O barrier related functions,
 - raise_barrier()
 - lower_barrier()
 - wait_barrier()
 - allow_barrier()
always take conf->resync_lock first, even when there is only regular read I/O
and no resync I/O at all. This is a huge performance penalty.

The solution is a lockless-like algorithm in the I/O barrier code, which takes
conf->resync_lock only when it has to. The original idea is from Hannes
Reinecke, and Neil Brown provided comments to improve it. I continued to work
on it and brought the patch into its current form.

In the new, simpler raid1 I/O barrier implementation, there are two wait
barrier functions,
 - wait_barrier()
   Which calls _wait_barrier(), is used for regular write I/O. If there is
   resync I/O happening on the same I/O barrier bucket, or the whole array is
   frozen, the task will wait until there is no barrier on the same barrier
   bucket, or the whole array is unfrozen.
 - wait_read_barrier()
   Since regular read I/O won't interfere with resync I/O (read_balance()
   makes sure only up-to-date data is read out), it is unnecessary to wait for
   a barrier in regular read I/Os; waiting is only necessary when the whole
   array is frozen.

The operations on conf->nr_pending[idx], conf->nr_waiting[idx] and
conf->barrier[idx] are very carefully designed in raise_barrier(),
lower_barrier(), _wait_barrier() and wait_read_barrier(), in order to avoid
unnecessary spin locks in these functions. Once conf->nr_pending[idx] is
increased, a resync I/O with the same barrier bucket index has to wait in
raise_barrier(). Then in _wait_barrier(), if no barrier is raised in the same
barrier bucket index and the array is not frozen, the regular I/O doesn't need
to hold conf->resync_lock; it can just increase conf->nr_pending[idx] and
return to its caller. wait_read_barrier() is very similar to _wait_barrier();
the only difference is that it waits only when the array is frozen.

For heavy parallel reading I/Os, the lockless I/O barrier code gets rid of
almost all spin lock cost. This patch significantly improves raid1 reading
performance. From my testing, on a raid1 device built from two NVMe SSDs,
running fio with 64KB block size, 40 sequential read I/O jobs and 128 iodepth,
overall throughput increases from 2.7GB/s to 4.6GB/s (+70%).

Changelog
V4:
 - Change conf->nr_queued[] to atomic_t.
 - Define BARRIER_BUCKETS_NR_BITS by (PAGE_SHIFT - ilog2(sizeof(atomic_t))).
V3:
 - Add smp_mb__after_atomic() as Shaohua and Neil suggested.
 - Change conf->nr_queued[] from atomic_t to int.
 - Change conf->array_frozen from atomic_t back to int, and use
   READ_ONCE(conf->array_frozen) to check the value of conf->array_frozen in
   _wait_barrier() and wait_read_barrier().
 - In _wait_barrier() and wait_read_barrier(), add a call to
   wake_up(&conf->wait_barrier) after atomic_dec(&conf->nr_pending[idx]), to
   fix a deadlock between _wait_barrier()/wait_read_barrier() and
   freeze_array().
V2:
 - Remove a spin_lock/unlock pair in raid1d().
 - Add more code comments to explain why there is no race when checking two
   atomic_t variables at the same time.
V1:
 - Original RFC patch for comments.

Signed-off-by: Coly Li <colyli@suse.de>
Cc: Shaohua Li <shli@fb.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Guoqing Jiang <gqjiang@suse.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Shaohua Li <shli@fb.com>
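[Editor's illustration] The fast path above relies on a classic pairing: the regular I/O side increments nr_pending and then reads barrier, while the resync side increments barrier and then reads nr_pending, with a full memory barrier between the increment and the read on each side (smp_mb__after_atomic() in the kernel). The sketch below is a minimal userspace analogy, not the raid1 code itself; the names nr_pending/barrier_cnt and the use of C11 seq_cst atomics in place of the kernel primitives are assumptions made purely for illustration. It demonstrates that the two sides can never both miss each other's increment.

    /*
     * Userspace sketch of the ordering _wait_barrier()/raise_barrier() rely on.
     * Not kernel code: C11 seq_cst atomics stand in for atomic_inc() +
     * smp_mb__after_atomic().  Build with: cc -O2 -pthread sketch.c
     */
    #include <assert.h>
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int nr_pending;     /* incremented by the regular I/O side */
    static atomic_int barrier_cnt;    /* incremented by the resync side */
    static int seen_by_regular;       /* barrier_cnt value observed by regular I/O */
    static int seen_by_resync;        /* nr_pending value observed by resync */
    static pthread_barrier_t start;

    static void *regular_io(void *arg)
    {
            (void)arg;
            pthread_barrier_wait(&start);
            atomic_fetch_add(&nr_pending, 1);            /* like atomic_inc(&conf->nr_pending[idx]) */
            /* a seq_cst RMW already orders the following load, like smp_mb__after_atomic() */
            seen_by_regular = atomic_load(&barrier_cnt); /* like atomic_read(&conf->barrier[idx]) */
            return NULL;
    }

    static void *resync_io(void *arg)
    {
            (void)arg;
            pthread_barrier_wait(&start);
            atomic_fetch_add(&barrier_cnt, 1);           /* like atomic_inc(&conf->barrier[idx]) */
            seen_by_resync = atomic_load(&nr_pending);   /* like atomic_read(&conf->nr_pending[idx]) */
            return NULL;
    }

    int main(void)
    {
            for (int i = 0; i < 100000; i++) {
                    pthread_t a, b;

                    atomic_store(&nr_pending, 0);
                    atomic_store(&barrier_cnt, 0);
                    pthread_barrier_init(&start, NULL, 2);
                    pthread_create(&a, NULL, regular_io, NULL);
                    pthread_create(&b, NULL, resync_io, NULL);
                    pthread_join(a, NULL);
                    pthread_join(b, NULL);
                    pthread_barrier_destroy(&start);

                    /* Both sides reading zero would be the lost-conflict race
                     * that the memory barriers forbid. */
                    assert(seen_by_regular != 0 || seen_by_resync != 0);
            }
            printf("no lost-conflict interleaving observed\n");
            return 0;
    }

If either side could reorder its read before its increment, the assertion would eventually fire; that interleaving is exactly what the paired full barriers rule out, which is why the fast path can skip conf->resync_lock entirely.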
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c | 165
1 file changed, 114 insertions(+), 51 deletions(-)
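[Editor's note] The diff below uses sector_to_idx() and BARRIER_BUCKETS_NR, whose definitions come from the earlier patch in this series and are not shown here. As a rough, self-contained approximation of that bucket mapping: the 64MB barrier unit size, PAGE_SHIFT = 12 and the multiplicative hash are assumptions for illustration only; the changelog above only states that BARRIER_BUCKETS_NR_BITS is (PAGE_SHIFT - ilog2(sizeof(atomic_t))), and the kernel hashes with hash_long().

    /* Userspace approximation of the raid1 barrier bucket mapping (illustrative). */
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT               12                       /* assumed 4KB pages */
    #define ILOG2_SIZEOF_ATOMIC_T    2                        /* sizeof(atomic_t) == 4 */
    #define BARRIER_BUCKETS_NR_BITS  (PAGE_SHIFT - ILOG2_SIZEOF_ATOMIC_T)
    #define BARRIER_BUCKETS_NR       (1 << BARRIER_BUCKETS_NR_BITS)  /* 1024 buckets */
    #define BARRIER_UNIT_SECTOR_BITS 17                       /* assumed 64MB unit, in 512B sectors */

    typedef uint64_t sector_t;

    /* Pick the barrier unit a sector falls in, then hash it into a bucket. */
    static int sector_to_idx(sector_t sector)
    {
            uint64_t unit = sector >> BARRIER_UNIT_SECTOR_BITS;

            /* stand-in for hash_long(unit, BARRIER_BUCKETS_NR_BITS) */
            return (int)((unit * 0x9E3779B97F4A7C15ULL) >>
                         (64 - BARRIER_BUCKETS_NR_BITS));
    }

    int main(void)
    {
            /* Two sectors in the same barrier unit share a bucket ... */
            printf("%d %d\n", sector_to_idx(0), sector_to_idx(100));
            /* ... while a sector in the next unit usually lands elsewhere. */
            printf("%d\n", sector_to_idx((sector_t)1 << BARRIER_UNIT_SECTOR_BITS));
            return 0;
    }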
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 40297fd17f7e..fefbbfdb440b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -226,7 +226,7 @@ static void reschedule_retry(struct r1bio *r1_bio)
         idx = sector_to_idx(r1_bio->sector);
         spin_lock_irqsave(&conf->device_lock, flags);
         list_add(&r1_bio->retry_list, &conf->retry_list);
-        conf->nr_queued[idx]++;
+        atomic_inc(&conf->nr_queued[idx]);
         spin_unlock_irqrestore(&conf->device_lock, flags);
 
         wake_up(&conf->wait_barrier);
@@ -836,11 +836,21 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
         spin_lock_irq(&conf->resync_lock);
 
         /* Wait until no block IO is waiting */
-        wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting[idx],
+        wait_event_lock_irq(conf->wait_barrier,
+                            !atomic_read(&conf->nr_waiting[idx]),
                             conf->resync_lock);
 
         /* block any new IO from starting */
-        conf->barrier[idx]++;
+        atomic_inc(&conf->barrier[idx]);
+        /*
+         * In raise_barrier() we firstly increase conf->barrier[idx], then
+         * check conf->nr_pending[idx]. In _wait_barrier() we firstly
+         * increase conf->nr_pending[idx], then check conf->barrier[idx].
+         * A memory barrier here makes sure conf->nr_pending[idx] won't
+         * be fetched before conf->barrier[idx] is increased. Otherwise
+         * there will be a race between raise_barrier() and _wait_barrier().
+         */
+        smp_mb__after_atomic();
 
         /* For these conditions we must wait:
          * A: while the array is in frozen state
@@ -851,42 +861,81 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
          */
         wait_event_lock_irq(conf->wait_barrier,
                             !conf->array_frozen &&
-                            !conf->nr_pending[idx] &&
-                            conf->barrier[idx] < RESYNC_DEPTH,
+                            !atomic_read(&conf->nr_pending[idx]) &&
+                            atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
                             conf->resync_lock);
 
-        conf->nr_pending[idx]++;
+        atomic_inc(&conf->nr_pending[idx]);
         spin_unlock_irq(&conf->resync_lock);
 }
 
 static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-        unsigned long flags;
         int idx = sector_to_idx(sector_nr);
 
-        BUG_ON(conf->barrier[idx] <= 0);
+        BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
 
-        spin_lock_irqsave(&conf->resync_lock, flags);
-        conf->barrier[idx]--;
-        conf->nr_pending[idx]--;
-        spin_unlock_irqrestore(&conf->resync_lock, flags);
+        atomic_dec(&conf->barrier[idx]);
+        atomic_dec(&conf->nr_pending[idx]);
         wake_up(&conf->wait_barrier);
 }
 
 static void _wait_barrier(struct r1conf *conf, int idx)
 {
-        spin_lock_irq(&conf->resync_lock);
-        if (conf->array_frozen || conf->barrier[idx]) {
-                conf->nr_waiting[idx]++;
-                /* Wait for the barrier to drop. */
-                wait_event_lock_irq(
-                        conf->wait_barrier,
-                        !conf->array_frozen && !conf->barrier[idx],
-                        conf->resync_lock);
-                conf->nr_waiting[idx]--;
-        }
+        /*
+         * We need to increase conf->nr_pending[idx] very early here,
+         * then raise_barrier() can be blocked when it waits for
+         * conf->nr_pending[idx] to be 0. Then we can avoid holding
+         * conf->resync_lock when there is no barrier raised in the same
+         * barrier unit bucket. Also if the array is frozen, I/O
+         * should be blocked until the array is unfrozen.
+         */
+        atomic_inc(&conf->nr_pending[idx]);
+        /*
+         * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
+         * check conf->barrier[idx]. In raise_barrier() we firstly increase
+         * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
+         * barrier is necessary here to make sure conf->barrier[idx] won't be
+         * fetched before conf->nr_pending[idx] is increased. Otherwise there
+         * will be a race between _wait_barrier() and raise_barrier().
+         */
+        smp_mb__after_atomic();
+
+        /*
+         * Don't worry about checking two atomic_t variables at the same
+         * time here. If, while we check conf->barrier[idx], the array is
+         * frozen (conf->array_frozen is 1) and conf->barrier[idx] is
+         * 0, it is safe to return and let the I/O continue. Because the
+         * array is frozen, all I/O returned here will eventually complete
+         * or be queued, no race will happen. See the code comment in
+         * freeze_array().
+         */
+        if (!READ_ONCE(conf->array_frozen) &&
+            !atomic_read(&conf->barrier[idx]))
+                return;
 
-        conf->nr_pending[idx]++;
+        /*
+         * After holding conf->resync_lock, conf->nr_pending[idx]
+         * should be decreased before waiting for the barrier to drop.
+         * Otherwise, we may encounter a race condition because
+         * raise_barrier() might be waiting for conf->nr_pending[idx]
+         * to be 0 at the same time.
+         */
+        spin_lock_irq(&conf->resync_lock);
+        atomic_inc(&conf->nr_waiting[idx]);
+        atomic_dec(&conf->nr_pending[idx]);
+        /*
+         * In case freeze_array() is waiting for
+         * get_unqueued_pending() == extra
+         */
+        wake_up(&conf->wait_barrier);
+        /* Wait for the barrier in the same barrier unit bucket to drop. */
+        wait_event_lock_irq(conf->wait_barrier,
+                            !conf->array_frozen &&
+                            !atomic_read(&conf->barrier[idx]),
+                            conf->resync_lock);
+        atomic_inc(&conf->nr_pending[idx]);
+        atomic_dec(&conf->nr_waiting[idx]);
         spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -894,18 +943,32 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 {
         int idx = sector_to_idx(sector_nr);
 
-        spin_lock_irq(&conf->resync_lock);
-        if (conf->array_frozen) {
-                conf->nr_waiting[idx]++;
-                /* Wait for array to unfreeze */
-                wait_event_lock_irq(
-                        conf->wait_barrier,
-                        !conf->array_frozen,
-                        conf->resync_lock);
-                conf->nr_waiting[idx]--;
-        }
+        /*
+         * Very similar to _wait_barrier(). The difference is, for read
+         * I/O we don't need to wait for sync I/O, but if the whole array
+         * is frozen, the read I/O still has to wait until the array is
+         * unfrozen. Since there is no ordering requirement with
+         * conf->barrier[idx] here, a memory barrier is unnecessary as well.
+         */
+        atomic_inc(&conf->nr_pending[idx]);
 
-        conf->nr_pending[idx]++;
+        if (!READ_ONCE(conf->array_frozen))
+                return;
+
+        spin_lock_irq(&conf->resync_lock);
+        atomic_inc(&conf->nr_waiting[idx]);
+        atomic_dec(&conf->nr_pending[idx]);
+        /*
+         * In case freeze_array() is waiting for
+         * get_unqueued_pending() == extra
+         */
+        wake_up(&conf->wait_barrier);
+        /* Wait for array to be unfrozen */
+        wait_event_lock_irq(conf->wait_barrier,
+                            !conf->array_frozen,
+                            conf->resync_lock);
+        atomic_inc(&conf->nr_pending[idx]);
+        atomic_dec(&conf->nr_waiting[idx]);
         spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -926,11 +989,7 @@ static void wait_all_barriers(struct r1conf *conf)
 
 static void _allow_barrier(struct r1conf *conf, int idx)
 {
-        unsigned long flags;
-
-        spin_lock_irqsave(&conf->resync_lock, flags);
-        conf->nr_pending[idx]--;
-        spin_unlock_irqrestore(&conf->resync_lock, flags);
+        atomic_dec(&conf->nr_pending[idx]);
         wake_up(&conf->wait_barrier);
 }
 
@@ -955,7 +1014,8 @@ static int get_unqueued_pending(struct r1conf *conf)
         int idx, ret;
 
         for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
-                ret += conf->nr_pending[idx] - conf->nr_queued[idx];
+                ret += atomic_read(&conf->nr_pending[idx]) -
+                        atomic_read(&conf->nr_queued[idx]);
 
         return ret;
 }
@@ -1000,8 +1060,8 @@ static void unfreeze_array(struct r1conf *conf)
         /* reverse the effect of the freeze */
         spin_lock_irq(&conf->resync_lock);
         conf->array_frozen = 0;
-        wake_up(&conf->wait_barrier);
         spin_unlock_irq(&conf->resync_lock);
+        wake_up(&conf->wait_barrier);
 }
 
 /* duplicate the data pages for behind I/O
@@ -2391,8 +2451,13 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
                 spin_lock_irq(&conf->device_lock);
                 list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
                 idx = sector_to_idx(r1_bio->sector);
-                conf->nr_queued[idx]++;
+                atomic_inc(&conf->nr_queued[idx]);
                 spin_unlock_irq(&conf->device_lock);
+                /*
+                 * In case freeze_array() is waiting for condition
+                 * get_unqueued_pending() == extra to be true.
+                 */
+                wake_up(&conf->wait_barrier);
                 md_wakeup_thread(conf->mddev->thread);
         } else {
                 if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2523,9 +2588,7 @@ static void raid1d(struct md_thread *thread)
                                           retry_list);
                         list_del(&r1_bio->retry_list);
                         idx = sector_to_idx(r1_bio->sector);
-                        spin_lock_irqsave(&conf->device_lock, flags);
-                        conf->nr_queued[idx]--;
-                        spin_unlock_irqrestore(&conf->device_lock, flags);
+                        atomic_dec(&conf->nr_queued[idx]);
                         if (mddev->degraded)
                                 set_bit(R1BIO_Degraded, &r1_bio->state);
                         if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2547,7 +2610,7 @@ static void raid1d(struct md_thread *thread)
                 r1_bio = list_entry(head->prev, struct r1bio, retry_list);
                 list_del(head->prev);
                 idx = sector_to_idx(r1_bio->sector);
-                conf->nr_queued[idx]--;
+                atomic_dec(&conf->nr_queued[idx]);
                 spin_unlock_irqrestore(&conf->device_lock, flags);
 
                 mddev = r1_bio->mddev;
@@ -2664,7 +2727,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
          * If there is non-resync activity waiting for a turn, then let it
          * though before starting on this new sync request.
          */
-        if (conf->nr_waiting[idx])
+        if (atomic_read(&conf->nr_waiting[idx]))
                 schedule_timeout_uninterruptible(1);
 
         /* we are incrementing sector_nr below. To be safe, we check against
@@ -2924,22 +2987,22 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                 goto abort;
 
         conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
-                                   sizeof(int), GFP_KERNEL);
+                                   sizeof(atomic_t), GFP_KERNEL);
         if (!conf->nr_pending)
                 goto abort;
 
         conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
-                                   sizeof(int), GFP_KERNEL);
+                                   sizeof(atomic_t), GFP_KERNEL);
         if (!conf->nr_waiting)
                 goto abort;
 
         conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
-                                  sizeof(int), GFP_KERNEL);
+                                  sizeof(atomic_t), GFP_KERNEL);
         if (!conf->nr_queued)
                 goto abort;
 
         conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
-                                sizeof(int), GFP_KERNEL);
+                                sizeof(atomic_t), GFP_KERNEL);
         if (!conf->barrier)
                 goto abort;
 