diff options
author | Chris Mason <chris.mason@oracle.com> | 2008-07-31 15:42:53 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2008-09-25 11:04:05 -0400 |
commit | 61b4944018449003ac5f9757f4d125dce519cf51 (patch) | |
tree | 553855996c641a945344db870b6dfd0d2d02086e | |
parent | 37d1aeee3990385e9bb436c50c2f7e120a668df6 (diff) |
Btrfs: Fix streaming read performance with checksumming on
Large streaming reads make for large bios, which means each entry on the
async work queue lists represents a large amount of data. IO
congestion throttling on the device was kicking in before the async
worker threads decided a single thread was busy and needed some help.
The end result was that a streaming read would result in a single CPU
running at 100% instead of balancing the work off to other CPUs.
This patch also changes the pre-IO checksum lookup done by reads to
work on a per-bio basis instead of a per-page basis. The per-page lookup
resulted in many extra btree lookups on large streaming reads. Doing the checksum lookup
right before bio submit allows us to reuse searches while processing
adjacent offsets.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r-- | fs/btrfs/async-thread.c | 2 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 2 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 15 | ||||
-rw-r--r-- | fs/btrfs/file-item.c | 77 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 57 |
5 files changed, 99 insertions, 54 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5fe6a0d532ed..bc2980c433ef 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c | |||
@@ -160,7 +160,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max) | |||
160 | INIT_LIST_HEAD(&workers->idle_list); | 160 | INIT_LIST_HEAD(&workers->idle_list); |
161 | spin_lock_init(&workers->lock); | 161 | spin_lock_init(&workers->lock); |
162 | workers->max_workers = max; | 162 | workers->max_workers = max; |
163 | workers->idle_thresh = 64; | 163 | workers->idle_thresh = 32; |
164 | } | 164 | } |
165 | 165 | ||
166 | /* | 166 | /* |
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index be16cd49ef69..d788ab0dcd96 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -1613,6 +1613,8 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root | |||
1613 | struct btrfs_key *location, int mod); | 1613 | struct btrfs_key *location, int mod); |
1614 | 1614 | ||
1615 | /* file-item.c */ | 1615 | /* file-item.c */ |
1616 | int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, | ||
1617 | struct bio *bio); | ||
1616 | int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, | 1618 | int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, |
1617 | struct btrfs_root *root, | 1619 | struct btrfs_root *root, |
1618 | u64 objectid, u64 pos, u64 disk_offset, | 1620 | u64 objectid, u64 pos, u64 disk_offset, |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e826730d750f..d2d1cc87e8ad 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -1357,10 +1357,25 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1357 | */ | 1357 | */ |
1358 | btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size); | 1358 | btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size); |
1359 | btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size); | 1359 | btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size); |
1360 | |||
1361 | /* a higher idle thresh on the submit workers makes it much more | ||
1362 | * likely that bios will be sent down in a sane order to the | |
1363 | * devices | ||
1364 | */ | ||
1365 | fs_info->submit_workers.idle_thresh = 64; | ||
1366 | |||
1360 | btrfs_init_workers(&fs_info->fixup_workers, 1); | 1367 | btrfs_init_workers(&fs_info->fixup_workers, 1); |
1361 | btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size); | 1368 | btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size); |
1362 | btrfs_init_workers(&fs_info->endio_write_workers, | 1369 | btrfs_init_workers(&fs_info->endio_write_workers, |
1363 | fs_info->thread_pool_size); | 1370 | fs_info->thread_pool_size); |
1371 | |||
1372 | /* | ||
1373 | * endios are largely parallel and should have a very | ||
1374 | * low idle thresh | ||
1375 | */ | ||
1376 | fs_info->endio_workers.idle_thresh = 4; | ||
1377 | fs_info->endio_write_workers.idle_thresh = 4; | ||
1378 | |||
1364 | btrfs_start_workers(&fs_info->workers, 1); | 1379 | btrfs_start_workers(&fs_info->workers, 1); |
1365 | btrfs_start_workers(&fs_info->submit_workers, 1); | 1380 | btrfs_start_workers(&fs_info->submit_workers, 1); |
1366 | btrfs_start_workers(&fs_info->fixup_workers, 1); | 1381 | btrfs_start_workers(&fs_info->fixup_workers, 1); |
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 2311061f070e..a5ff19b34b21 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c | |||
@@ -134,6 +134,83 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, | |||
134 | return ret; | 134 | return ret; |
135 | } | 135 | } |
136 | 136 | ||
137 | int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, | ||
138 | struct bio *bio) | ||
139 | { | ||
140 | u32 sum; | ||
141 | struct bio_vec *bvec = bio->bi_io_vec; | ||
142 | int bio_index = 0; | ||
143 | u64 offset; | ||
144 | u64 item_start_offset = 0; | ||
145 | u64 item_last_offset = 0; | ||
146 | u32 diff; | ||
147 | int ret; | ||
148 | struct btrfs_path *path; | ||
149 | struct btrfs_csum_item *item = NULL; | ||
150 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
151 | |||
152 | path = btrfs_alloc_path(); | ||
153 | path->reada = 2; | ||
154 | |||
155 | WARN_ON(bio->bi_vcnt <= 0); | ||
156 | |||
157 | while(bio_index < bio->bi_vcnt) { | ||
158 | offset = page_offset(bvec->bv_page) + bvec->bv_offset; | ||
159 | ret = btrfs_find_ordered_sum(inode, offset, &sum); | ||
160 | if (ret == 0) | ||
161 | goto found; | ||
162 | |||
163 | if (!item || offset < item_start_offset || | ||
164 | offset >= item_last_offset) { | ||
165 | struct btrfs_key found_key; | ||
166 | u32 item_size; | ||
167 | |||
168 | if (item) | ||
169 | btrfs_release_path(root, path); | ||
170 | item = btrfs_lookup_csum(NULL, root, path, | ||
171 | inode->i_ino, offset, 0); | ||
172 | if (IS_ERR(item)) { | ||
173 | ret = PTR_ERR(item); | ||
174 | if (ret == -ENOENT || ret == -EFBIG) | ||
175 | ret = 0; | ||
176 | sum = 0; | ||
177 | printk("no csum found for inode %lu start " | ||
178 | "%llu\n", inode->i_ino, | ||
179 | (unsigned long long)offset); | ||
180 | goto found; | ||
181 | } | ||
182 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
183 | path->slots[0]); | ||
184 | |||
185 | item_start_offset = found_key.offset; | ||
186 | item_size = btrfs_item_size_nr(path->nodes[0], | ||
187 | path->slots[0]); | ||
188 | item_last_offset = item_start_offset + | ||
189 | (item_size / BTRFS_CRC32_SIZE) * | ||
190 | root->sectorsize; | ||
191 | item = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
192 | struct btrfs_csum_item); | ||
193 | } | ||
194 | /* | ||
195 | * this byte range must be able to fit inside | ||
196 | * a single leaf so it will also fit inside a u32 | ||
197 | */ | ||
198 | diff = offset - item_start_offset; | ||
199 | diff = diff / root->sectorsize; | ||
200 | diff = diff * BTRFS_CRC32_SIZE; | ||
201 | |||
202 | read_extent_buffer(path->nodes[0], &sum, | ||
203 | (unsigned long)item + diff, | ||
204 | BTRFS_CRC32_SIZE); | ||
205 | found: | ||
206 | set_state_private(io_tree, offset, sum); | ||
207 | bio_index++; | ||
208 | bvec++; | ||
209 | } | ||
210 | btrfs_free_path(path); | ||
211 | return 0; | ||
212 | } | ||
213 | |||
137 | int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, | 214 | int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, |
138 | struct bio *bio) | 215 | struct bio *bio) |
139 | { | 216 | { |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4afa9d78da9..31d52c51acc3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -374,6 +374,10 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, | |||
374 | BUG_ON(ret); | 374 | BUG_ON(ret); |
375 | 375 | ||
376 | if (!(rw & (1 << BIO_RW))) { | 376 | if (!(rw & (1 << BIO_RW))) { |
377 | if (!btrfs_test_opt(root, NODATASUM) && | ||
378 | !btrfs_test_flag(inode, NODATASUM)) { | ||
379 | btrfs_lookup_bio_sums(root, inode, bio); | ||
380 | } | ||
377 | goto mapit; | 381 | goto mapit; |
378 | } | 382 | } |
379 | 383 | ||
@@ -598,58 +602,6 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, | |||
598 | return btrfs_finish_ordered_io(page->mapping->host, start, end); | 602 | return btrfs_finish_ordered_io(page->mapping->host, start, end); |
599 | } | 603 | } |
600 | 604 | ||
601 | int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end) | ||
602 | { | ||
603 | int ret = 0; | ||
604 | struct inode *inode = page->mapping->host; | ||
605 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
606 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
607 | struct btrfs_csum_item *item; | ||
608 | struct btrfs_path *path = NULL; | ||
609 | u32 csum; | ||
610 | |||
611 | if (btrfs_test_opt(root, NODATASUM) || | ||
612 | btrfs_test_flag(inode, NODATASUM)) | ||
613 | return 0; | ||
614 | |||
615 | /* | ||
616 | * It is possible there is an ordered extent that has | ||
617 | * not yet finished for this range in the file. If so, | ||
618 | * that extent will have a csum cached, and it will insert | ||
619 | * the sum after all the blocks in the extent are fully | ||
620 | * on disk. So, look for an ordered extent and use the | ||
621 | * sum if found. We have to do this before looking in the | ||
622 | * btree because csum items are pre-inserted based on | ||
623 | * the file size. btrfs_lookup_csum might find an item | ||
624 | * that still hasn't been fully filled. | ||
625 | */ | ||
626 | ret = btrfs_find_ordered_sum(inode, start, &csum); | ||
627 | if (ret == 0) | ||
628 | goto found; | ||
629 | |||
630 | ret = 0; | ||
631 | path = btrfs_alloc_path(); | ||
632 | item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0); | ||
633 | if (IS_ERR(item)) { | ||
634 | ret = PTR_ERR(item); | ||
635 | /* a csum that isn't present is a preallocated region. */ | ||
636 | if (ret == -ENOENT || ret == -EFBIG) | ||
637 | ret = 0; | ||
638 | csum = 0; | ||
639 | printk("no csum found for inode %lu start %Lu\n", inode->i_ino, | ||
640 | start); | ||
641 | goto out; | ||
642 | } | ||
643 | read_extent_buffer(path->nodes[0], &csum, (unsigned long)item, | ||
644 | BTRFS_CRC32_SIZE); | ||
645 | found: | ||
646 | set_state_private(io_tree, start, csum); | ||
647 | out: | ||
648 | if (path) | ||
649 | btrfs_free_path(path); | ||
650 | return ret; | ||
651 | } | ||
652 | |||
653 | struct io_failure_record { | 605 | struct io_failure_record { |
654 | struct page *page; | 606 | struct page *page; |
655 | u64 start; | 607 | u64 start; |
@@ -3613,7 +3565,6 @@ static struct extent_io_ops btrfs_extent_io_ops = { | |||
3613 | .fill_delalloc = run_delalloc_range, | 3565 | .fill_delalloc = run_delalloc_range, |
3614 | .submit_bio_hook = btrfs_submit_bio_hook, | 3566 | .submit_bio_hook = btrfs_submit_bio_hook, |
3615 | .merge_bio_hook = btrfs_merge_bio_hook, | 3567 | .merge_bio_hook = btrfs_merge_bio_hook, |
3616 | .readpage_io_hook = btrfs_readpage_io_hook, | ||
3617 | .readpage_end_io_hook = btrfs_readpage_end_io_hook, | 3568 | .readpage_end_io_hook = btrfs_readpage_end_io_hook, |
3618 | .writepage_end_io_hook = btrfs_writepage_end_io_hook, | 3569 | .writepage_end_io_hook = btrfs_writepage_end_io_hook, |
3619 | .writepage_start_hook = btrfs_writepage_start_hook, | 3570 | .writepage_start_hook = btrfs_writepage_start_hook, |