aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2008-04-28 05:15:53 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-04-28 11:58:42 -0400
commit8b3e6cdc53b7f29f7026955d6cb6902a49322a15 (patch)
tree69d9f22a526e687fe2892d327caa3fa6ddd65cf6
parente46b272b6608783ed7aa7b0594871550ce20b849 (diff)
md: introduce get_priority_stripe() to improve raid456 write performance
Improve write performance by preventing the delayed_list from dumping all its stripes onto the handle_list in one shot. Delayed stripes are now further delayed by being held on the 'hold_list'. The 'hold_list' is bypassed when:

 * a STRIPE_IO_STARTED stripe is found at the head of 'handle_list'
 * 'handle_list' is empty and i/o is being done to satisfy full stripe-width write requests
 * 'bypass_count' is less than 'bypass_threshold'. By default the threshold is 1, i.e. every other stripe handled is a preread stripe provided the top two conditions are false.

Benchmark data:
System: 2x Xeon 5150, 4x SATA, mem=1GB
Baseline: 2.6.24-rc7
Configuration: mdadm --create /dev/md0 /dev/sd[b-e] -n 4 -l 5 --assume-clean
Test1: dd if=/dev/zero of=/dev/md0 bs=1024k count=2048
 * patched: +33% (stripe_cache_size = 256), +25% (stripe_cache_size = 512)
Test2: tiobench --size 2048 --numruns 5 --block 4096 --block 131072 (XFS)
 * patched: +13%
 * patched + preread_bypass_threshold = 0: +37%

Changes since v1:
 * reduce bypass_threshold from (chunk_size / sectors_per_chunk) to (1) and make it configurable. This defaults to fairness and modest performance gains out of the box.
Changes since v2:
 * [neilb@suse.de]: kill STRIPE_PRIO_HI and preread_needed as they are not necessary, the important change was clearing STRIPE_DELAYED in add_stripe_bio and this has been moved out to make_request for the hang fix.
 * [neilb@suse.de]: simplify get_priority_stripe
 * [dan.j.williams@intel.com]: reset the bypass_count when ->hold_list is sampled empty (+11%)
 * [dan.j.williams@intel.com]: decrement the bypass_count at the detection of stripes being naturally promoted off of hold_list +2%. Note, resetting bypass_count instead of decrementing on these events yields +4% but that is probably too aggressive.
Changes since v3:
 * cosmetic fixups

Tested-by: James W. Laferriere <babydr@baby-dragons.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/md.txt6
-rw-r--r--drivers/md/raid5.c122
-rw-r--r--include/linux/raid/raid5.h7
3 files changed, 125 insertions, 10 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 396cdd982c26..a8b430627473 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -450,3 +450,9 @@ These currently include
450 there are upper and lower limits (32768, 16). Default is 128. 450 there are upper and lower limits (32768, 16). Default is 128.
451 stripe_cache_active (currently raid5 only) 451 stripe_cache_active (currently raid5 only)
452 number of active entries in the stripe cache 452 number of active entries in the stripe cache
453 preread_bypass_threshold (currently raid5 only)
454 number of times a stripe requiring preread will be bypassed by
455 a stripe that does not require preread. For fairness defaults
456 to 1. Setting this to 0 disables bypass accounting and
457 requires preread stripes to wait until all full-width stripe-
458 writes are complete. Valid values are 0 to stripe_cache_size.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4efec467e2f1..45eead608647 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -63,6 +63,7 @@
63#define STRIPE_SHIFT (PAGE_SHIFT - 9) 63#define STRIPE_SHIFT (PAGE_SHIFT - 9)
64#define STRIPE_SECTORS (STRIPE_SIZE>>9) 64#define STRIPE_SECTORS (STRIPE_SIZE>>9)
65#define IO_THRESHOLD 1 65#define IO_THRESHOLD 1
66#define BYPASS_THRESHOLD 1
66#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 67#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
67#define HASH_MASK (NR_HASH - 1) 68#define HASH_MASK (NR_HASH - 1)
68 69
@@ -398,6 +399,7 @@ static void ops_run_io(struct stripe_head *sh)
398 399
399 might_sleep(); 400 might_sleep();
400 401
402 set_bit(STRIPE_IO_STARTED, &sh->state);
401 for (i = disks; i--; ) { 403 for (i = disks; i--; ) {
402 int rw; 404 int rw;
403 struct bio *bi; 405 struct bio *bi;
@@ -1720,6 +1722,9 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1720 locked++; 1722 locked++;
1721 } 1723 }
1722 } 1724 }
1725 if (locked + 1 == disks)
1726 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1727 atomic_inc(&sh->raid_conf->pending_full_writes);
1723 } else { 1728 } else {
1724 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1729 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1725 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1730 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
@@ -1947,6 +1952,9 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1947 STRIPE_SECTORS, 0, 0); 1952 STRIPE_SECTORS, 0, 0);
1948 } 1953 }
1949 1954
1955 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
1956 if (atomic_dec_and_test(&conf->pending_full_writes))
1957 md_wakeup_thread(conf->mddev->thread);
1950} 1958}
1951 1959
1952/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks 1960/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
@@ -2149,6 +2157,10 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
2149 0); 2157 0);
2150 } 2158 }
2151 } 2159 }
2160
2161 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2162 if (atomic_dec_and_test(&conf->pending_full_writes))
2163 md_wakeup_thread(conf->mddev->thread);
2152} 2164}
2153 2165
2154static void handle_issuing_new_write_requests5(raid5_conf_t *conf, 2166static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
@@ -2333,6 +2345,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2333 s->locked++; 2345 s->locked++;
2334 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2346 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2335 } 2347 }
2348 if (s->locked == disks)
2349 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2350 atomic_inc(&conf->pending_full_writes);
2336 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ 2351 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2337 set_bit(STRIPE_INSYNC, &sh->state); 2352 set_bit(STRIPE_INSYNC, &sh->state);
2338 2353
@@ -3094,6 +3109,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3094 else 3109 else
3095 continue; 3110 continue;
3096 3111
3112 set_bit(STRIPE_IO_STARTED, &sh->state);
3113
3097 bi = &sh->dev[i].req; 3114 bi = &sh->dev[i].req;
3098 3115
3099 bi->bi_rw = rw; 3116 bi->bi_rw = rw;
@@ -3164,7 +3181,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
3164 clear_bit(STRIPE_DELAYED, &sh->state); 3181 clear_bit(STRIPE_DELAYED, &sh->state);
3165 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3182 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3166 atomic_inc(&conf->preread_active_stripes); 3183 atomic_inc(&conf->preread_active_stripes);
3167 list_add_tail(&sh->lru, &conf->handle_list); 3184 list_add_tail(&sh->lru, &conf->hold_list);
3168 } 3185 }
3169 } else 3186 } else
3170 blk_plug_device(conf->mddev->queue); 3187 blk_plug_device(conf->mddev->queue);
@@ -3442,6 +3459,58 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3442 } 3459 }
3443} 3460}
3444 3461
3462/* __get_priority_stripe - get the next stripe to process
3463 *
3464 * Full stripe writes are allowed to pass preread active stripes up until
3465 * the bypass_threshold is exceeded. The head of handle_list is returned when
3466 * that list is non-empty; otherwise the head of hold_list is returned only if
3467 * no full writes are pending or a non-zero bypass_threshold is exceeded by
3468 * bypass_count; otherwise NULL. While serving handle_list, bypass_count is
3469 * zeroed if hold_list is empty; else, unless STRIPE_IO_STARTED is set on the
3470 * chosen stripe, it increments if the hold_list head is unchanged, or drops by bypass_threshold (floored at 0) if the head changed.
3471 */
3472static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
3473{
3474 struct stripe_head *sh;
3475
3476 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
3477 __func__,
3478 list_empty(&conf->handle_list) ? "empty" : "busy",
3479 list_empty(&conf->hold_list) ? "empty" : "busy",
3480 atomic_read(&conf->pending_full_writes), conf->bypass_count);
3481
3482 if (!list_empty(&conf->handle_list)) { /* handle_list always wins when it has entries */
3483 sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
3484
3485 if (list_empty(&conf->hold_list))
3486 conf->bypass_count = 0; /* nothing is being bypassed */
3487 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { /* stripes with i/o already started do not count as a bypass */
3488 if (conf->hold_list.next == conf->last_hold)
3489 conf->bypass_count++; /* same hold_list head as last sampled: it was bypassed again */
3490 else {
3491 conf->last_hold = conf->hold_list.next; /* head changed: remember the new one */
3492 conf->bypass_count -= conf->bypass_threshold;
3493 if (conf->bypass_count < 0)
3494 conf->bypass_count = 0; /* never let the counter go negative */
3495 }
3496 }
3497 } else if (!list_empty(&conf->hold_list) && /* handle_list empty: consider the held preread stripes */
3498 ((conf->bypass_threshold &&
3499 conf->bypass_count > conf->bypass_threshold) ||
3500 atomic_read(&conf->pending_full_writes) == 0)) {
3501 sh = list_entry(conf->hold_list.next,
3502 typeof(*sh), lru);
3503 conf->bypass_count -= conf->bypass_threshold;
3504 if (conf->bypass_count < 0)
3505 conf->bypass_count = 0;
3506 } else
3507 return NULL; /* both lists empty, or held stripes must keep waiting */
3508
3509 list_del_init(&sh->lru); /* detach the chosen stripe from whichever list it was on */
3510 atomic_inc(&sh->count);
3511 BUG_ON(atomic_read(&sh->count) != 1); /* the stripe must not have had any other reference */
3512 return sh;
3513}
3445 3514
3446static int make_request(struct request_queue *q, struct bio * bi) 3515static int make_request(struct request_queue *q, struct bio * bi)
3447{ 3516{
@@ -3914,7 +3983,6 @@ static void raid5d(mddev_t *mddev)
3914 handled = 0; 3983 handled = 0;
3915 spin_lock_irq(&conf->device_lock); 3984 spin_lock_irq(&conf->device_lock);
3916 while (1) { 3985 while (1) {
3917 struct list_head *first;
3918 struct bio *bio; 3986 struct bio *bio;
3919 3987
3920 if (conf->seq_flush != conf->seq_write) { 3988 if (conf->seq_flush != conf->seq_write) {
@@ -3936,17 +4004,12 @@ static void raid5d(mddev_t *mddev)
3936 handled++; 4004 handled++;
3937 } 4005 }
3938 4006
3939 if (list_empty(&conf->handle_list)) { 4007 sh = __get_priority_stripe(conf);
4008
4009 if (!sh) {
3940 async_tx_issue_pending_all(); 4010 async_tx_issue_pending_all();
3941 break; 4011 break;
3942 } 4012 }
3943
3944 first = conf->handle_list.next;
3945 sh = list_entry(first, struct stripe_head, lru);
3946
3947 list_del_init(first);
3948 atomic_inc(&sh->count);
3949 BUG_ON(atomic_read(&sh->count)!= 1);
3950 spin_unlock_irq(&conf->device_lock); 4013 spin_unlock_irq(&conf->device_lock);
3951 4014
3952 handled++; 4015 handled++;
@@ -4011,6 +4074,42 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4011 raid5_store_stripe_cache_size); 4074 raid5_store_stripe_cache_size);
4012 4075
4013static ssize_t 4076static ssize_t
4077raid5_show_preread_threshold(mddev_t *mddev, char *page) /* show handler: prints conf->bypass_threshold into page */
4078{
4079 raid5_conf_t *conf = mddev_to_conf(mddev);
4080 if (conf)
4081 return sprintf(page, "%d\n", conf->bypass_threshold); /* decimal value followed by a newline; returns the byte count */
4082 else
4083 return 0; /* no raid5 conf: emit nothing */
4084}
4085
4086static ssize_t /* store handler: parses a decimal value from page and assigns it to conf->bypass_threshold */
4087raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
4088{
4089 raid5_conf_t *conf = mddev_to_conf(mddev);
4090 char *end;
4091 int new;
4092 if (len >= PAGE_SIZE)
4093 return -EINVAL; /* reject writes of PAGE_SIZE bytes or more */
4094 if (!conf)
4095 return -ENODEV; /* no raid5 conf to update */
4096
4097 new = simple_strtoul(page, &end, 10); /* NOTE(review): simple_strtoul() returns unsigned long but 'new' is int, so a large input is narrowed before the range check below - an unsigned long local would avoid this; confirm */
4098 if (!*page || (*end && *end != '\n'))
4099 return -EINVAL; /* empty input, or trailing characters other than a single newline */
4100 if (new > conf->max_nr_stripes || new < 0)
4101 return -EINVAL; /* accepted range is 0 .. conf->max_nr_stripes */
4102 conf->bypass_threshold = new;
4103 return len; /* report the whole write as consumed */
4104}
4105
4106static struct md_sysfs_entry /* attribute named preread_bypass_threshold, mode S_IRUGO | S_IWUSR */
4107raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4108 S_IRUGO | S_IWUSR,
4109 raid5_show_preread_threshold, /* show handler */
4110 raid5_store_preread_threshold); /* store handler */
4111
4112static ssize_t
4014stripe_cache_active_show(mddev_t *mddev, char *page) 4113stripe_cache_active_show(mddev_t *mddev, char *page)
4015{ 4114{
4016 raid5_conf_t *conf = mddev_to_conf(mddev); 4115 raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4026,6 +4125,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
4026static struct attribute *raid5_attrs[] = { 4125static struct attribute *raid5_attrs[] = {
4027 &raid5_stripecache_size.attr, 4126 &raid5_stripecache_size.attr,
4028 &raid5_stripecache_active.attr, 4127 &raid5_stripecache_active.attr,
4128 &raid5_preread_bypass_threshold.attr,
4029 NULL, 4129 NULL,
4030}; 4130};
4031static struct attribute_group raid5_attrs_group = { 4131static struct attribute_group raid5_attrs_group = {
@@ -4130,12 +4230,14 @@ static int run(mddev_t *mddev)
4130 init_waitqueue_head(&conf->wait_for_stripe); 4230 init_waitqueue_head(&conf->wait_for_stripe);
4131 init_waitqueue_head(&conf->wait_for_overlap); 4231 init_waitqueue_head(&conf->wait_for_overlap);
4132 INIT_LIST_HEAD(&conf->handle_list); 4232 INIT_LIST_HEAD(&conf->handle_list);
4233 INIT_LIST_HEAD(&conf->hold_list);
4133 INIT_LIST_HEAD(&conf->delayed_list); 4234 INIT_LIST_HEAD(&conf->delayed_list);
4134 INIT_LIST_HEAD(&conf->bitmap_list); 4235 INIT_LIST_HEAD(&conf->bitmap_list);
4135 INIT_LIST_HEAD(&conf->inactive_list); 4236 INIT_LIST_HEAD(&conf->inactive_list);
4136 atomic_set(&conf->active_stripes, 0); 4237 atomic_set(&conf->active_stripes, 0);
4137 atomic_set(&conf->preread_active_stripes, 0); 4238 atomic_set(&conf->preread_active_stripes, 0);
4138 atomic_set(&conf->active_aligned_reads, 0); 4239 atomic_set(&conf->active_aligned_reads, 0);
4240 conf->bypass_threshold = BYPASS_THRESHOLD;
4139 4241
4140 pr_debug("raid5: run(%s) called.\n", mdname(mddev)); 4242 pr_debug("raid5: run(%s) called.\n", mdname(mddev));
4141 4243
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 93678f57ccbe..f0827d31ae6f 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -252,6 +252,8 @@ struct r6_state {
252#define STRIPE_EXPANDING 9 252#define STRIPE_EXPANDING 9
253#define STRIPE_EXPAND_SOURCE 10 253#define STRIPE_EXPAND_SOURCE 10
254#define STRIPE_EXPAND_READY 11 254#define STRIPE_EXPAND_READY 11
255#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
256#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
255/* 257/*
256 * Operations flags (in issue order) 258 * Operations flags (in issue order)
257 */ 259 */
@@ -316,12 +318,17 @@ struct raid5_private_data {
316 int previous_raid_disks; 318 int previous_raid_disks;
317 319
318 struct list_head handle_list; /* stripes needing handling */ 320 struct list_head handle_list; /* stripes needing handling */
321 struct list_head hold_list; /* preread ready stripes */
319 struct list_head delayed_list; /* stripes that have plugged requests */ 322 struct list_head delayed_list; /* stripes that have plugged requests */
320 struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ 323 struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
321 struct bio *retry_read_aligned; /* currently retrying aligned bios */ 324 struct bio *retry_read_aligned; /* currently retrying aligned bios */
322 struct bio *retry_read_aligned_list; /* aligned bios retry list */ 325 struct bio *retry_read_aligned_list; /* aligned bios retry list */
323 atomic_t preread_active_stripes; /* stripes with scheduled io */ 326 atomic_t preread_active_stripes; /* stripes with scheduled io */
324 atomic_t active_aligned_reads; 327 atomic_t active_aligned_reads;
328 atomic_t pending_full_writes; /* full write backlog */
329 int bypass_count; /* bypassed prereads */
330 int bypass_threshold; /* preread nice */
331 struct list_head *last_hold; /* detect hold_list promotions */
325 332
326 atomic_t reshape_stripes; /* stripes with pending writes for reshape */ 333 atomic_t reshape_stripes; /* stripes with pending writes for reshape */
327 /* unfortunately we need two cache names as we temporarily have 334 /* unfortunately we need two cache names as we temporarily have