diff options
author | Dan Williams <dan.j.williams@intel.com> | 2008-04-28 05:15:53 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-28 11:58:42 -0400 |
commit | 8b3e6cdc53b7f29f7026955d6cb6902a49322a15 (patch) | |
tree | 69d9f22a526e687fe2892d327caa3fa6ddd65cf6 | |
parent | e46b272b6608783ed7aa7b0594871550ce20b849 (diff) |
md: introduce get_priority_stripe() to improve raid456 write performance
Improve write performance by preventing the delayed_list from dumping all its
stripes onto the handle_list in one shot. Delayed stripes are now further
delayed by being held on the 'hold_list'. The 'hold_list' is bypassed when:
* a STRIPE_IO_STARTED stripe is found at the head of 'handle_list'
* 'handle_list' is empty and i/o is being done to satisfy full stripe-width
  write requests
* 'bypass_count' is less than 'bypass_threshold'. By default the threshold
  is 1, i.e. every other stripe handled is a preread stripe provided the
  top two conditions are false.
Benchmark data:
System: 2x Xeon 5150, 4x SATA, mem=1GB
Baseline: 2.6.24-rc7
Configuration: mdadm --create /dev/md0 /dev/sd[b-e] -n 4 -l 5 --assume-clean
Test1: dd if=/dev/zero of=/dev/md0 bs=1024k count=2048
* patched: +33% (stripe_cache_size = 256), +25% (stripe_cache_size = 512)
Test2: tiobench --size 2048 --numruns 5 --block 4096 --block 131072 (XFS)
* patched: +13%
* patched + preread_bypass_threshold = 0: +37%
Changes since v1:
* reduce bypass_threshold from (chunk_size / sectors_per_chunk) to (1) and
  make it configurable. This defaults to fairness and modest performance
  gains out of the box.
Changes since v2:
* [neilb@suse.de]: kill STRIPE_PRIO_HI and preread_needed as they are not
  necessary, the important change was clearing STRIPE_DELAYED in
  add_stripe_bio and this has been moved out to make_request for the hang
  fix.
* [neilb@suse.de]: simplify get_priority_stripe
* [dan.j.williams@intel.com]: reset the bypass_count when ->hold_list is
  sampled empty (+11%)
* [dan.j.williams@intel.com]: decrement the bypass_count at the detection
  of stripes being naturally promoted off of hold_list (+2%). Note, resetting
  bypass_count instead of decrementing on these events yields +4% but that is
  probably too aggressive.
Changes since v3:
* cosmetic fixups
Tested-by: James W. Laferriere <babydr@baby-dragons.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/md.txt | 6 | ||||
-rw-r--r-- | drivers/md/raid5.c | 122 | ||||
-rw-r--r-- | include/linux/raid/raid5.h | 7 |
3 files changed, 125 insertions, 10 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 396cdd982c26..a8b430627473 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -450,3 +450,9 @@ These currently include | |||
450 | there are upper and lower limits (32768, 16). Default is 128. | 450 | there are upper and lower limits (32768, 16). Default is 128. |
451 | strip_cache_active (currently raid5 only) | 451 | strip_cache_active (currently raid5 only) |
452 | number of active entries in the stripe cache | 452 | number of active entries in the stripe cache |
453 | preread_bypass_threshold (currently raid5 only) | ||
454 | number of times a stripe requiring preread will be bypassed by | ||
455 | a stripe that does not require preread. For fairness defaults | ||
456 | to 1. Setting this to 0 disables bypass accounting and | ||
457 | requires preread stripes to wait until all full-width stripe- | ||
458 | writes are complete. Valid values are 0 to stripe_cache_size. | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4efec467e2f1..45eead608647 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -63,6 +63,7 @@ | |||
63 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) | 63 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) |
64 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) | 64 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) |
65 | #define IO_THRESHOLD 1 | 65 | #define IO_THRESHOLD 1 |
66 | #define BYPASS_THRESHOLD 1 | ||
66 | #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) | 67 | #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) |
67 | #define HASH_MASK (NR_HASH - 1) | 68 | #define HASH_MASK (NR_HASH - 1) |
68 | 69 | ||
@@ -398,6 +399,7 @@ static void ops_run_io(struct stripe_head *sh) | |||
398 | 399 | ||
399 | might_sleep(); | 400 | might_sleep(); |
400 | 401 | ||
402 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
401 | for (i = disks; i--; ) { | 403 | for (i = disks; i--; ) { |
402 | int rw; | 404 | int rw; |
403 | struct bio *bi; | 405 | struct bio *bi; |
@@ -1720,6 +1722,9 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) | |||
1720 | locked++; | 1722 | locked++; |
1721 | } | 1723 | } |
1722 | } | 1724 | } |
1725 | if (locked + 1 == disks) | ||
1726 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
1727 | atomic_inc(&sh->raid_conf->pending_full_writes); | ||
1723 | } else { | 1728 | } else { |
1724 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || | 1729 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || |
1725 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); | 1730 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); |
@@ -1947,6 +1952,9 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, | |||
1947 | STRIPE_SECTORS, 0, 0); | 1952 | STRIPE_SECTORS, 0, 0); |
1948 | } | 1953 | } |
1949 | 1954 | ||
1955 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
1956 | if (atomic_dec_and_test(&conf->pending_full_writes)) | ||
1957 | md_wakeup_thread(conf->mddev->thread); | ||
1950 | } | 1958 | } |
1951 | 1959 | ||
1952 | /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks | 1960 | /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks |
@@ -2149,6 +2157,10 @@ static void handle_completed_write_requests(raid5_conf_t *conf, | |||
2149 | 0); | 2157 | 0); |
2150 | } | 2158 | } |
2151 | } | 2159 | } |
2160 | |||
2161 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
2162 | if (atomic_dec_and_test(&conf->pending_full_writes)) | ||
2163 | md_wakeup_thread(conf->mddev->thread); | ||
2152 | } | 2164 | } |
2153 | 2165 | ||
2154 | static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | 2166 | static void handle_issuing_new_write_requests5(raid5_conf_t *conf, |
@@ -2333,6 +2345,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, | |||
2333 | s->locked++; | 2345 | s->locked++; |
2334 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | 2346 | set_bit(R5_Wantwrite, &sh->dev[i].flags); |
2335 | } | 2347 | } |
2348 | if (s->locked == disks) | ||
2349 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
2350 | atomic_inc(&conf->pending_full_writes); | ||
2336 | /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ | 2351 | /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ |
2337 | set_bit(STRIPE_INSYNC, &sh->state); | 2352 | set_bit(STRIPE_INSYNC, &sh->state); |
2338 | 2353 | ||
@@ -3094,6 +3109,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3094 | else | 3109 | else |
3095 | continue; | 3110 | continue; |
3096 | 3111 | ||
3112 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
3113 | |||
3097 | bi = &sh->dev[i].req; | 3114 | bi = &sh->dev[i].req; |
3098 | 3115 | ||
3099 | bi->bi_rw = rw; | 3116 | bi->bi_rw = rw; |
@@ -3164,7 +3181,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf) | |||
3164 | clear_bit(STRIPE_DELAYED, &sh->state); | 3181 | clear_bit(STRIPE_DELAYED, &sh->state); |
3165 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 3182 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
3166 | atomic_inc(&conf->preread_active_stripes); | 3183 | atomic_inc(&conf->preread_active_stripes); |
3167 | list_add_tail(&sh->lru, &conf->handle_list); | 3184 | list_add_tail(&sh->lru, &conf->hold_list); |
3168 | } | 3185 | } |
3169 | } else | 3186 | } else |
3170 | blk_plug_device(conf->mddev->queue); | 3187 | blk_plug_device(conf->mddev->queue); |
@@ -3442,6 +3459,58 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) | |||
3442 | } | 3459 | } |
3443 | } | 3460 | } |
3444 | 3461 | ||
3462 | /* __get_priority_stripe - get the next stripe to process | ||
3463 | * | ||
3464 | * Full stripe writes are allowed to pass preread active stripes up until | ||
3465 | * the bypass_threshold is exceeded. In general the bypass_count | ||
3466 | * increments when the handle_list is handled before the hold_list; however, it | ||
3467 | * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a | ||
3468 | * stripe with in flight i/o. The bypass_count will be reset when the | ||
3469 | * head of the hold_list has changed, i.e. the head was promoted to the | ||
3470 | * handle_list. | ||
3471 | */ | ||
3472 | static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) | ||
3473 | { | ||
3474 | struct stripe_head *sh; | ||
3475 | |||
3476 | pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", | ||
3477 | __func__, | ||
3478 | list_empty(&conf->handle_list) ? "empty" : "busy", | ||
3479 | list_empty(&conf->hold_list) ? "empty" : "busy", | ||
3480 | atomic_read(&conf->pending_full_writes), conf->bypass_count); | ||
3481 | |||
3482 | if (!list_empty(&conf->handle_list)) { | ||
3483 | sh = list_entry(conf->handle_list.next, typeof(*sh), lru); | ||
3484 | |||
3485 | if (list_empty(&conf->hold_list)) | ||
3486 | conf->bypass_count = 0; | ||
3487 | else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { | ||
3488 | if (conf->hold_list.next == conf->last_hold) | ||
3489 | conf->bypass_count++; | ||
3490 | else { | ||
3491 | conf->last_hold = conf->hold_list.next; | ||
3492 | conf->bypass_count -= conf->bypass_threshold; | ||
3493 | if (conf->bypass_count < 0) | ||
3494 | conf->bypass_count = 0; | ||
3495 | } | ||
3496 | } | ||
3497 | } else if (!list_empty(&conf->hold_list) && | ||
3498 | ((conf->bypass_threshold && | ||
3499 | conf->bypass_count > conf->bypass_threshold) || | ||
3500 | atomic_read(&conf->pending_full_writes) == 0)) { | ||
3501 | sh = list_entry(conf->hold_list.next, | ||
3502 | typeof(*sh), lru); | ||
3503 | conf->bypass_count -= conf->bypass_threshold; | ||
3504 | if (conf->bypass_count < 0) | ||
3505 | conf->bypass_count = 0; | ||
3506 | } else | ||
3507 | return NULL; | ||
3508 | |||
3509 | list_del_init(&sh->lru); | ||
3510 | atomic_inc(&sh->count); | ||
3511 | BUG_ON(atomic_read(&sh->count) != 1); | ||
3512 | return sh; | ||
3513 | } | ||
3445 | 3514 | ||
3446 | static int make_request(struct request_queue *q, struct bio * bi) | 3515 | static int make_request(struct request_queue *q, struct bio * bi) |
3447 | { | 3516 | { |
@@ -3914,7 +3983,6 @@ static void raid5d(mddev_t *mddev) | |||
3914 | handled = 0; | 3983 | handled = 0; |
3915 | spin_lock_irq(&conf->device_lock); | 3984 | spin_lock_irq(&conf->device_lock); |
3916 | while (1) { | 3985 | while (1) { |
3917 | struct list_head *first; | ||
3918 | struct bio *bio; | 3986 | struct bio *bio; |
3919 | 3987 | ||
3920 | if (conf->seq_flush != conf->seq_write) { | 3988 | if (conf->seq_flush != conf->seq_write) { |
@@ -3936,17 +4004,12 @@ static void raid5d(mddev_t *mddev) | |||
3936 | handled++; | 4004 | handled++; |
3937 | } | 4005 | } |
3938 | 4006 | ||
3939 | if (list_empty(&conf->handle_list)) { | 4007 | sh = __get_priority_stripe(conf); |
4008 | |||
4009 | if (!sh) { | ||
3940 | async_tx_issue_pending_all(); | 4010 | async_tx_issue_pending_all(); |
3941 | break; | 4011 | break; |
3942 | } | 4012 | } |
3943 | |||
3944 | first = conf->handle_list.next; | ||
3945 | sh = list_entry(first, struct stripe_head, lru); | ||
3946 | |||
3947 | list_del_init(first); | ||
3948 | atomic_inc(&sh->count); | ||
3949 | BUG_ON(atomic_read(&sh->count)!= 1); | ||
3950 | spin_unlock_irq(&conf->device_lock); | 4013 | spin_unlock_irq(&conf->device_lock); |
3951 | 4014 | ||
3952 | handled++; | 4015 | handled++; |
@@ -4011,6 +4074,42 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, | |||
4011 | raid5_store_stripe_cache_size); | 4074 | raid5_store_stripe_cache_size); |
4012 | 4075 | ||
4013 | static ssize_t | 4076 | static ssize_t |
4077 | raid5_show_preread_threshold(mddev_t *mddev, char *page) | ||
4078 | { | ||
4079 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
4080 | if (conf) | ||
4081 | return sprintf(page, "%d\n", conf->bypass_threshold); | ||
4082 | else | ||
4083 | return 0; | ||
4084 | } | ||
4085 | |||
4086 | static ssize_t | ||
4087 | raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) | ||
4088 | { | ||
4089 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
4090 | char *end; | ||
4091 | int new; | ||
4092 | if (len >= PAGE_SIZE) | ||
4093 | return -EINVAL; | ||
4094 | if (!conf) | ||
4095 | return -ENODEV; | ||
4096 | |||
4097 | new = simple_strtoul(page, &end, 10); | ||
4098 | if (!*page || (*end && *end != '\n')) | ||
4099 | return -EINVAL; | ||
4100 | if (new > conf->max_nr_stripes || new < 0) | ||
4101 | return -EINVAL; | ||
4102 | conf->bypass_threshold = new; | ||
4103 | return len; | ||
4104 | } | ||
4105 | |||
4106 | static struct md_sysfs_entry | ||
4107 | raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, | ||
4108 | S_IRUGO | S_IWUSR, | ||
4109 | raid5_show_preread_threshold, | ||
4110 | raid5_store_preread_threshold); | ||
4111 | |||
4112 | static ssize_t | ||
4014 | stripe_cache_active_show(mddev_t *mddev, char *page) | 4113 | stripe_cache_active_show(mddev_t *mddev, char *page) |
4015 | { | 4114 | { |
4016 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4115 | raid5_conf_t *conf = mddev_to_conf(mddev); |
@@ -4026,6 +4125,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active); | |||
4026 | static struct attribute *raid5_attrs[] = { | 4125 | static struct attribute *raid5_attrs[] = { |
4027 | &raid5_stripecache_size.attr, | 4126 | &raid5_stripecache_size.attr, |
4028 | &raid5_stripecache_active.attr, | 4127 | &raid5_stripecache_active.attr, |
4128 | &raid5_preread_bypass_threshold.attr, | ||
4029 | NULL, | 4129 | NULL, |
4030 | }; | 4130 | }; |
4031 | static struct attribute_group raid5_attrs_group = { | 4131 | static struct attribute_group raid5_attrs_group = { |
@@ -4130,12 +4230,14 @@ static int run(mddev_t *mddev) | |||
4130 | init_waitqueue_head(&conf->wait_for_stripe); | 4230 | init_waitqueue_head(&conf->wait_for_stripe); |
4131 | init_waitqueue_head(&conf->wait_for_overlap); | 4231 | init_waitqueue_head(&conf->wait_for_overlap); |
4132 | INIT_LIST_HEAD(&conf->handle_list); | 4232 | INIT_LIST_HEAD(&conf->handle_list); |
4233 | INIT_LIST_HEAD(&conf->hold_list); | ||
4133 | INIT_LIST_HEAD(&conf->delayed_list); | 4234 | INIT_LIST_HEAD(&conf->delayed_list); |
4134 | INIT_LIST_HEAD(&conf->bitmap_list); | 4235 | INIT_LIST_HEAD(&conf->bitmap_list); |
4135 | INIT_LIST_HEAD(&conf->inactive_list); | 4236 | INIT_LIST_HEAD(&conf->inactive_list); |
4136 | atomic_set(&conf->active_stripes, 0); | 4237 | atomic_set(&conf->active_stripes, 0); |
4137 | atomic_set(&conf->preread_active_stripes, 0); | 4238 | atomic_set(&conf->preread_active_stripes, 0); |
4138 | atomic_set(&conf->active_aligned_reads, 0); | 4239 | atomic_set(&conf->active_aligned_reads, 0); |
4240 | conf->bypass_threshold = BYPASS_THRESHOLD; | ||
4139 | 4241 | ||
4140 | pr_debug("raid5: run(%s) called.\n", mdname(mddev)); | 4242 | pr_debug("raid5: run(%s) called.\n", mdname(mddev)); |
4141 | 4243 | ||
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 93678f57ccbe..f0827d31ae6f 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -252,6 +252,8 @@ struct r6_state { | |||
252 | #define STRIPE_EXPANDING 9 | 252 | #define STRIPE_EXPANDING 9 |
253 | #define STRIPE_EXPAND_SOURCE 10 | 253 | #define STRIPE_EXPAND_SOURCE 10 |
254 | #define STRIPE_EXPAND_READY 11 | 254 | #define STRIPE_EXPAND_READY 11 |
255 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ | ||
256 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ | ||
255 | /* | 257 | /* |
256 | * Operations flags (in issue order) | 258 | * Operations flags (in issue order) |
257 | */ | 259 | */ |
@@ -316,12 +318,17 @@ struct raid5_private_data { | |||
316 | int previous_raid_disks; | 318 | int previous_raid_disks; |
317 | 319 | ||
318 | struct list_head handle_list; /* stripes needing handling */ | 320 | struct list_head handle_list; /* stripes needing handling */ |
321 | struct list_head hold_list; /* preread ready stripes */ | ||
319 | struct list_head delayed_list; /* stripes that have plugged requests */ | 322 | struct list_head delayed_list; /* stripes that have plugged requests */ |
320 | struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ | 323 | struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ |
321 | struct bio *retry_read_aligned; /* currently retrying aligned bios */ | 324 | struct bio *retry_read_aligned; /* currently retrying aligned bios */ |
322 | struct bio *retry_read_aligned_list; /* aligned bios retry list */ | 325 | struct bio *retry_read_aligned_list; /* aligned bios retry list */ |
323 | atomic_t preread_active_stripes; /* stripes with scheduled io */ | 326 | atomic_t preread_active_stripes; /* stripes with scheduled io */ |
324 | atomic_t active_aligned_reads; | 327 | atomic_t active_aligned_reads; |
328 | atomic_t pending_full_writes; /* full write backlog */ | ||
329 | int bypass_count; /* bypassed prereads */ | ||
330 | int bypass_threshold; /* preread nice */ | ||
331 | struct list_head *last_hold; /* detect hold_list promotions */ | ||
325 | 332 | ||
326 | atomic_t reshape_stripes; /* stripes with pending writes for reshape */ | 333 | atomic_t reshape_stripes; /* stripes with pending writes for reshape */ |
327 | /* unfortunately we need two cache names as we temporarily have | 334 | /* unfortunately we need two cache names as we temporarily have |