aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-07-31 15:42:53 -0400
committerChris Mason <chris.mason@oracle.com>2008-09-25 11:04:05 -0400
commit61b4944018449003ac5f9757f4d125dce519cf51 (patch)
tree553855996c641a945344db870b6dfd0d2d02086e
parent37d1aeee3990385e9bb436c50c2f7e120a668df6 (diff)
Btrfs: Fix streaming read performance with checksumming on
Large streaming reads make for large bios, which means each entry on the async work queue lists represents a large amount of data. IO congestion throttling on the device was kicking in before the async worker threads decided a single thread was busy and needed some help. The end result was that a streaming read would result in a single CPU running at 100% instead of balancing the work off to other CPUs. This patch also changes the pre-IO checksum lookup done by reads to work on a per-bio basis instead of a per-page basis. The per-page lookup resulted in many extra btree lookups on large streaming reads. Doing the checksum lookup right before bio submit allows us to reuse searches while processing adjacent offsets. Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/async-thread.c2
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/disk-io.c15
-rw-r--r--fs/btrfs/file-item.c77
-rw-r--r--fs/btrfs/inode.c57
5 files changed, 99 insertions, 54 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5fe6a0d532ed..bc2980c433ef 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -160,7 +160,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max)
160 INIT_LIST_HEAD(&workers->idle_list); 160 INIT_LIST_HEAD(&workers->idle_list);
161 spin_lock_init(&workers->lock); 161 spin_lock_init(&workers->lock);
162 workers->max_workers = max; 162 workers->max_workers = max;
163 workers->idle_thresh = 64; 163 workers->idle_thresh = 32;
164} 164}
165 165
166/* 166/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index be16cd49ef69..d788ab0dcd96 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1613,6 +1613,8 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1613 struct btrfs_key *location, int mod); 1613 struct btrfs_key *location, int mod);
1614 1614
1615/* file-item.c */ 1615/* file-item.c */
1616int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1617 struct bio *bio);
1616int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 1618int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1617 struct btrfs_root *root, 1619 struct btrfs_root *root,
1618 u64 objectid, u64 pos, u64 disk_offset, 1620 u64 objectid, u64 pos, u64 disk_offset,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e826730d750f..d2d1cc87e8ad 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1357,10 +1357,25 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1357 */ 1357 */
1358 btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size); 1358 btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
1359 btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size); 1359 btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
1360
1361 /* a higher idle thresh on the submit workers makes it much more
1362 * likely that bios will be sent down in a sane order to the
1363 * devices
1364 */
1365 fs_info->submit_workers.idle_thresh = 64;
1366
1360 btrfs_init_workers(&fs_info->fixup_workers, 1); 1367 btrfs_init_workers(&fs_info->fixup_workers, 1);
1361 btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size); 1368 btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1362 btrfs_init_workers(&fs_info->endio_write_workers, 1369 btrfs_init_workers(&fs_info->endio_write_workers,
1363 fs_info->thread_pool_size); 1370 fs_info->thread_pool_size);
1371
1372 /*
1373 * endios are largely parallel and should have a very
1374 * low idle thresh
1375 */
1376 fs_info->endio_workers.idle_thresh = 4;
1377 fs_info->endio_write_workers.idle_thresh = 4;
1378
1364 btrfs_start_workers(&fs_info->workers, 1); 1379 btrfs_start_workers(&fs_info->workers, 1);
1365 btrfs_start_workers(&fs_info->submit_workers, 1); 1380 btrfs_start_workers(&fs_info->submit_workers, 1);
1366 btrfs_start_workers(&fs_info->fixup_workers, 1); 1381 btrfs_start_workers(&fs_info->fixup_workers, 1);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 2311061f070e..a5ff19b34b21 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -134,6 +134,83 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
134 return ret; 134 return ret;
135} 135}
136 136
/*
 * Look up the data checksum for every bio_vec in @bio and record it in the
 * inode's io_tree via set_state_private(), keyed by the file offset of that
 * bio_vec (page_offset(bv_page) + bv_offset).
 *
 * For each offset the ordered-sum lookup (btrfs_find_ordered_sum) is tried
 * first; only if that does not return 0 is the csum item consulted.  The
 * most recently found csum item and its [item_start_offset,
 * item_last_offset) range are kept across iterations, so
 * btrfs_lookup_csum() is called again only when the offset falls outside
 * that range.
 *
 * If btrfs_lookup_csum() fails, a checksum of 0 is stored for that offset
 * and a message is printed.
 *
 * Always returns 0.
 *
 * NOTE(review): the result of btrfs_alloc_path() is dereferenced without a
 * NULL check.
 * NOTE(review): 'ret' from a failed csum lookup is assigned but never
 * returned, so callers cannot observe lookup errors.
 */
137int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
138 struct bio *bio)
139{
140 u32 sum;
141 struct bio_vec *bvec = bio->bi_io_vec;
142 int bio_index = 0;
143 u64 offset;
144 u64 item_start_offset = 0;
145 u64 item_last_offset = 0;
146 u32 diff;
147 int ret;
148 struct btrfs_path *path;
149 struct btrfs_csum_item *item = NULL;
150 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
151
152 path = btrfs_alloc_path();
/* presumably a readahead hint for the path search -- TODO confirm meaning of 2 */
153 path->reada = 2;
154
155 WARN_ON(bio->bi_vcnt <= 0);
156
/* one iteration per bio_vec; bvec and bio_index advance together at 'found' */
157 while(bio_index < bio->bi_vcnt) {
158 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
159 ret = btrfs_find_ordered_sum(inode, offset, &sum);
160 if (ret == 0)
161 goto found;
162
/* search again only when no item is cached or offset is outside its range */
163 if (!item || offset < item_start_offset ||
164 offset >= item_last_offset) {
165 struct btrfs_key found_key;
166 u32 item_size;
167
168 if (item)
169 btrfs_release_path(root, path);
170 item = btrfs_lookup_csum(NULL, root, path,
171 inode->i_ino, offset, 0);
172 if (IS_ERR(item)) {
173 ret = PTR_ERR(item);
174 if (ret == -ENOENT || ret == -EFBIG)
175 ret = 0;
176 sum = 0;
177 printk("no csum found for inode %lu start "
178 "%llu\n", inode->i_ino,
179 (unsigned long long)offset);
/*
 * NOTE(review): 'item' still holds the error pointer here, so it is
 * non-NULL on the next iteration's "!item" / "if (item)" checks.
 */
180 goto found;
181 }
182 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
183 path->slots[0]);
184
/* the item covers one BTRFS_CRC32_SIZE-byte checksum per sectorsize bytes */
185 item_start_offset = found_key.offset;
186 item_size = btrfs_item_size_nr(path->nodes[0],
187 path->slots[0]);
188 item_last_offset = item_start_offset +
189 (item_size / BTRFS_CRC32_SIZE) *
190 root->sectorsize;
191 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
192 struct btrfs_csum_item);
193 }
194 /*
195 * this byte range must be able to fit inside
196 * a single leaf so it will also fit inside a u32
197 */
/* convert the byte distance into a sector index, then into a byte offset within the item */
198 diff = offset - item_start_offset;
199 diff = diff / root->sectorsize;
200 diff = diff * BTRFS_CRC32_SIZE;
201
202 read_extent_buffer(path->nodes[0], &sum,
203 (unsigned long)item + diff,
204 BTRFS_CRC32_SIZE);
205found:
/* record the checksum (or 0 after a failed lookup) for this file offset */
206 set_state_private(io_tree, offset, sum);
207 bio_index++;
208 bvec++;
209 }
210 btrfs_free_path(path);
211 return 0;
212}
213
137int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, 214int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
138 struct bio *bio) 215 struct bio *bio)
139{ 216{
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c4afa9d78da9..31d52c51acc3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -374,6 +374,10 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
374 BUG_ON(ret); 374 BUG_ON(ret);
375 375
376 if (!(rw & (1 << BIO_RW))) { 376 if (!(rw & (1 << BIO_RW))) {
377 if (!btrfs_test_opt(root, NODATASUM) &&
378 !btrfs_test_flag(inode, NODATASUM)) {
379 btrfs_lookup_bio_sums(root, inode, bio);
380 }
377 goto mapit; 381 goto mapit;
378 } 382 }
379 383
@@ -598,58 +602,6 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
598 return btrfs_finish_ordered_io(page->mapping->host, start, end); 602 return btrfs_finish_ordered_io(page->mapping->host, start, end);
599} 603}
600 604
/*
 * Per-page read hook (shown on the old side of this diff only, i.e. removed
 * by this patch): find the checksum for file offset @start and store it in
 * the inode's io_tree with set_state_private().
 *
 * Returns 0 immediately when NODATASUM is set on the root or the inode.
 * Otherwise the ordered-sum lookup is tried first, then the csum item.
 * If btrfs_lookup_csum() fails, nothing is stored; the error is returned,
 * except that -ENOENT and -EFBIG are turned into 0.
 */
601int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
602{
603 int ret = 0;
604 struct inode *inode = page->mapping->host;
605 struct btrfs_root *root = BTRFS_I(inode)->root;
606 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
607 struct btrfs_csum_item *item;
608 struct btrfs_path *path = NULL;
609 u32 csum;
610
611 if (btrfs_test_opt(root, NODATASUM) ||
612 btrfs_test_flag(inode, NODATASUM))
613 return 0;
614
615 /*
616 * It is possible there is an ordered extent that has
617 * not yet finished for this range in the file. If so,
618 * that extent will have a csum cached, and it will insert
619 * the sum after all the blocks in the extent are fully
620 * on disk. So, look for an ordered extent and use the
621 * sum if found. We have to do this before looking in the
622 * btree because csum items are pre-inserted based on
623 * the file size. btrfs_lookup_csum might find an item
624 * that still hasn't been fully filled.
625 */
626 ret = btrfs_find_ordered_sum(inode, start, &csum);
627 if (ret == 0)
628 goto found;
629
630 ret = 0;
/* NOTE(review): the path returned by btrfs_alloc_path() is used without a NULL check */
631 path = btrfs_alloc_path();
632 item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
633 if (IS_ERR(item)) {
634 ret = PTR_ERR(item);
635 /* a csum that isn't present is a preallocated region. */
636 if (ret == -ENOENT || ret == -EFBIG)
637 ret = 0;
638 csum = 0;
639 printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
640 start);
/* jumps past 'found', so no private state is set for this offset */
641 goto out;
642 }
643 read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
644 BTRFS_CRC32_SIZE);
645found:
646 set_state_private(io_tree, start, csum);
647out:
/* path is still NULL when the ordered-sum lookup succeeded */
648 if (path)
649 btrfs_free_path(path);
650 return ret;
651}
652
653struct io_failure_record { 605struct io_failure_record {
654 struct page *page; 606 struct page *page;
655 u64 start; 607 u64 start;
@@ -3613,7 +3565,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
3613 .fill_delalloc = run_delalloc_range, 3565 .fill_delalloc = run_delalloc_range,
3614 .submit_bio_hook = btrfs_submit_bio_hook, 3566 .submit_bio_hook = btrfs_submit_bio_hook,
3615 .merge_bio_hook = btrfs_merge_bio_hook, 3567 .merge_bio_hook = btrfs_merge_bio_hook,
3616 .readpage_io_hook = btrfs_readpage_io_hook,
3617 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 3568 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
3618 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 3569 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
3619 .writepage_start_hook = btrfs_writepage_start_hook, 3570 .writepage_start_hook = btrfs_writepage_start_hook,