aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYuanhan Liu <yuanhan.liu@linux.intel.com>2015-05-08 04:19:07 -0400
committerNeilBrown <neilb@suse.de>2015-06-16 20:00:27 -0400
commite9e4c377e2f563892c50d1d093dd55c7d518fc3d (patch)
treed870f80f5a8f7569a102ef474d8931126c652540
parentb1b4648648e18775082858eca2517322f63e57a1 (diff)
md/raid5: per hash value and exclusive wait_for_stripe
I noticed heavy spin lock contention in get_active_stripe() with fsmark multi-threaded write workloads. Here is where this contention comes from. We have a limited number of stripes, and it's a multi-threaded write workload. Hence, those stripes are taken quickly, which puts later processes to sleep waiting for free stripes. When enough stripes (>= 1/4 of total stripes) are released, all processes are woken and try to get the lock. But only one of them can get the lock for each hash lock, leaving the other processes spinning to acquire it. Thus, it is ineffective to wake up all processes and let them battle for a lock that permits only one to access at a time. Instead, we could make it an exclusive wake up: wake up one process only. That naturally avoids the heavy spin lock contention. To do the exclusive wake up, we have to split wait_for_stripe into multiple wait queues, making it per hash value, just like the hash lock. Here are some test results I got with this patch applied (all tests run 3 times): `fsmark.files_per_sec' ===================== next-20150317 this patch ------------------------- ------------------------- metric_value ±stddev metric_value ±stddev change testbox/benchmark/testcase-params ------------------------- ------------------------- -------- ------------------------------ 25.600 ±0.0 92.700 ±2.5 262.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose 25.600 ±0.0 77.800 ±0.6 203.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose 32.000 ±0.0 93.800 ±1.7 193.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose 32.000 ±0.0 81.233 ±1.7 153.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose 48.800 ±14.5 99.667 ±2.0 104.2% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose 6.400 ±0.0 12.800 ±0.0 100.0% ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose 63.133 ±8.2 82.800 ±0.7 31.2% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose 245.067 
±0.7 306.567 ±7.9 25.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose 17.533 ±0.3 21.000 ±0.8 19.8% ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose 188.167 ±1.9 215.033 ±3.1 14.3% ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync 254.500 ±1.8 290.733 ±2.4 14.2% ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync `time.system_time' ===================== next-20150317 this patch ------------------------- ------------------------- metric_value ±stddev metric_value ±stddev change testbox/benchmark/testcase-params ------------------------- ------------------------- -------- ------------------------------ 7235.603 ±1.2 185.163 ±1.9 -97.4% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose 7666.883 ±2.9 202.750 ±1.0 -97.4% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose 14567.893 ±0.7 421.230 ±0.4 -97.1% ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose 3697.667 ±14.0 148.190 ±1.7 -96.0% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose 5572.867 ±3.8 310.717 ±1.4 -94.4% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose 5565.050 ±0.5 313.277 ±1.5 -94.4% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose 2420.707 ±17.1 171.043 ±2.7 -92.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose 3743.300 ±4.6 379.827 ±3.5 -89.9% ivb44/fsmark/1x-64t-3HDD-RAID5-ext4-4M-40G-fsyncBeforeClose 3308.687 ±6.3 363.050 ±2.0 -89.0% ivb44/fsmark/1x-64t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose Where: 1x: 'x' means iterations or loops, corresponding to the '-L' option of fsmark; 1t, 64t: 't' means threads; 4M: the single file size, corresponding to the '-s' option of fsmark; 40G, 30G, 120G: the total test size; 4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisks and '12G' means the size of one ramdisk, so it would be 48G in total.
And we made a RAID array on those ramdisks. As you can see, though there is not much performance gain for the hard disk workload, the system time dropped heavily, by up to 97%. And as expected, the performance increased a lot, by up to 260%, for fast devices (ram disk). v2: use bits instead of an array to note down which wait queues need to be woken up. Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com> Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--drivers/md/raid5.c27
-rw-r--r--drivers/md/raid5.h2
2 files changed, 20 insertions, 9 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a9112b39afee..9a3b143b0b68 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
344 int hash) 344 int hash)
345{ 345{
346 int size; 346 int size;
347 bool do_wakeup = false; 347 unsigned long do_wakeup = 0;
348 int i = 0;
348 unsigned long flags; 349 unsigned long flags;
349 350
350 if (hash == NR_STRIPE_HASH_LOCKS) { 351 if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf,
365 !list_empty(list)) 366 !list_empty(list))
366 atomic_dec(&conf->empty_inactive_list_nr); 367 atomic_dec(&conf->empty_inactive_list_nr);
367 list_splice_tail_init(list, conf->inactive_list + hash); 368 list_splice_tail_init(list, conf->inactive_list + hash);
368 do_wakeup = true; 369 do_wakeup |= 1 << hash;
369 spin_unlock_irqrestore(conf->hash_locks + hash, flags); 370 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
370 } 371 }
371 size--; 372 size--;
372 hash--; 373 hash--;
373 } 374 }
374 375
376 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
377 if (do_wakeup & (1 << i))
378 wake_up(&conf->wait_for_stripe[i]);
379 }
380
375 if (do_wakeup) { 381 if (do_wakeup) {
376 wake_up(&conf->wait_for_stripe);
377 if (atomic_read(&conf->active_stripes) == 0) 382 if (atomic_read(&conf->active_stripes) == 0)
378 wake_up(&conf->wait_for_quiescent); 383 wake_up(&conf->wait_for_quiescent);
379 if (conf->retry_read_aligned) 384 if (conf->retry_read_aligned)
@@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
686 if (!sh) { 691 if (!sh) {
687 set_bit(R5_INACTIVE_BLOCKED, 692 set_bit(R5_INACTIVE_BLOCKED,
688 &conf->cache_state); 693 &conf->cache_state);
689 wait_event_lock_irq( 694 wait_event_exclusive_cmd(
690 conf->wait_for_stripe, 695 conf->wait_for_stripe[hash],
691 !list_empty(conf->inactive_list + hash) && 696 !list_empty(conf->inactive_list + hash) &&
692 (atomic_read(&conf->active_stripes) 697 (atomic_read(&conf->active_stripes)
693 < (conf->max_nr_stripes * 3 / 4) 698 < (conf->max_nr_stripes * 3 / 4)
694 || !test_bit(R5_INACTIVE_BLOCKED, 699 || !test_bit(R5_INACTIVE_BLOCKED,
695 &conf->cache_state)), 700 &conf->cache_state)),
696 *(conf->hash_locks + hash)); 701 spin_unlock_irq(conf->hash_locks + hash),
702 spin_lock_irq(conf->hash_locks + hash));
697 clear_bit(R5_INACTIVE_BLOCKED, 703 clear_bit(R5_INACTIVE_BLOCKED,
698 &conf->cache_state); 704 &conf->cache_state);
699 } else { 705 } else {
@@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
718 } 724 }
719 } while (sh == NULL); 725 } while (sh == NULL);
720 726
727 if (!list_empty(conf->inactive_list + hash))
728 wake_up(&conf->wait_for_stripe[hash]);
729
721 spin_unlock_irq(conf->hash_locks + hash); 730 spin_unlock_irq(conf->hash_locks + hash);
722 return sh; 731 return sh;
723} 732}
@@ -2179,7 +2188,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
2179 cnt = 0; 2188 cnt = 0;
2180 list_for_each_entry(nsh, &newstripes, lru) { 2189 list_for_each_entry(nsh, &newstripes, lru) {
2181 lock_device_hash_lock(conf, hash); 2190 lock_device_hash_lock(conf, hash);
2182 wait_event_cmd(conf->wait_for_stripe, 2191 wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
2183 !list_empty(conf->inactive_list + hash), 2192 !list_empty(conf->inactive_list + hash),
2184 unlock_device_hash_lock(conf, hash), 2193 unlock_device_hash_lock(conf, hash),
2185 lock_device_hash_lock(conf, hash)); 2194 lock_device_hash_lock(conf, hash));
@@ -6436,7 +6445,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
6436 spin_lock_init(&conf->device_lock); 6445 spin_lock_init(&conf->device_lock);
6437 seqcount_init(&conf->gen_lock); 6446 seqcount_init(&conf->gen_lock);
6438 init_waitqueue_head(&conf->wait_for_quiescent); 6447 init_waitqueue_head(&conf->wait_for_quiescent);
6439 init_waitqueue_head(&conf->wait_for_stripe); 6448 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
6449 init_waitqueue_head(&conf->wait_for_stripe[i]);
6450 }
6440 init_waitqueue_head(&conf->wait_for_overlap); 6451 init_waitqueue_head(&conf->wait_for_overlap);
6441 INIT_LIST_HEAD(&conf->handle_list); 6452 INIT_LIST_HEAD(&conf->handle_list);
6442 INIT_LIST_HEAD(&conf->hold_list); 6453 INIT_LIST_HEAD(&conf->hold_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9b84b8820fc5..02c3bf8fbfe7 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -512,7 +512,7 @@ struct r5conf {
512 atomic_t empty_inactive_list_nr; 512 atomic_t empty_inactive_list_nr;
513 struct llist_head released_stripes; 513 struct llist_head released_stripes;
514 wait_queue_head_t wait_for_quiescent; 514 wait_queue_head_t wait_for_quiescent;
515 wait_queue_head_t wait_for_stripe; 515 wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS];
516 wait_queue_head_t wait_for_overlap; 516 wait_queue_head_t wait_for_overlap;
517 unsigned long cache_state; 517 unsigned long cache_state;
518#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, 518#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked,