author	Chris Mason <chris.mason@oracle.com>	2008-06-11 16:50:36 -0400
committer	Chris Mason <chris.mason@oracle.com>	2008-09-25 11:04:03 -0400
commit	8b7128429235d9bd72cfd5ed20c77c4f3118f744 (patch)
tree	982eda13094af1ccd46e8c3853559a0eb6e298f6 /fs/btrfs/disk-io.c
parent	43e570b08a6c6b1d75f218566a6240542a386fd9 (diff)
Btrfs: Add async worker threads for pre and post IO checksumming
Btrfs has been using workqueues to spread the checksumming load across other CPUs in the system. But, workqueues only schedule work on the same CPU that queued the work, giving them a limited benefit for systems with higher CPU counts.

This code adds a generic facility to schedule work with pools of kthreads, and changes the bio submission code to queue bios up. The queueing is important to make sure large numbers of procs on the system don't turn streaming workloads into random workloads by sending IO down concurrently.

The end result of all of this is much higher performance (and CPU usage) when doing checksumming on large machines. Two worker pools are created, one for writes and one for endio processing. The two could deadlock if we tried to service both from a single pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
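The pattern the patch introduces is small: a caller embeds a struct btrfs_work in its own context structure, points work.func at a handler, and hands the work to a pool with btrfs_queue_worker(); the handler later runs on one of the pool's kthreads and recovers its context with container_of(). The sketch below illustrates that pattern only — the btrfs_work/btrfs_workers names and the queueing call are taken from the async-thread.h facility added in this series, while struct my_work, my_work_fn() and do_csum() are hypothetical stand-ins, not code from the patch.

/*
 * Illustrative sketch of the worker-pool pattern; my_work, my_work_fn()
 * and do_csum() are hypothetical, the btrfs_work/btrfs_workers API comes
 * from async-thread.h in this series.
 */
#include <linux/bio.h>
#include <linux/slab.h>
#include "async-thread.h"

struct my_work {
	struct bio *bio;		/* whatever the handler needs */
	struct btrfs_work work;		/* embedded so container_of() can find us */
};

/* hypothetical stand-in for the real work (e.g. verifying checksums) */
static void do_csum(struct bio *bio)
{
}

/* runs later in one of the pool's kthreads, never at interrupt time */
static void my_work_fn(struct btrfs_work *work)
{
	struct my_work *w = container_of(work, struct my_work, work);

	do_csum(w->bio);
	kfree(w);
}

static int my_queue_one(struct btrfs_workers *pool, struct bio *bio)
{
	struct my_work *w;

	/* allocate in process context; the queueing step below is the part
	 * the patch also performs at bio-completion (interrupt) time */
	w = kmalloc(sizeof(*w), GFP_NOFS);
	if (!w)
		return -ENOMEM;
	w->bio = bio;
	w->work.func = my_work_fn;
	w->work.flags = 0;
	btrfs_queue_worker(pool, &w->work);
	return 0;
}

Pools are sized with btrfs_init_workers(), started with btrfs_start_workers() and torn down with btrfs_stop_workers(), as the open_ctree()/close_ctree() hunks below show; keeping the submission pool (fs_info->workers) separate from the endio pool (fs_info->endio_workers) is what avoids the single-pool deadlock mentioned above.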
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--	fs/btrfs/disk-io.c	200
1 files changed, 82 insertions, 118 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b9a53646ceb2..98ff4fbcb386 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -31,6 +31,7 @@
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "print-tree.h"
+#include "async-thread.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -46,8 +47,7 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 #endif
 
 static struct extent_io_ops btree_extent_io_ops;
-static struct workqueue_struct *end_io_workqueue;
-static struct workqueue_struct *async_submit_workqueue;
+static void end_workqueue_fn(struct btrfs_work *work);
 
 struct end_io_wq {
 	struct bio *bio;
@@ -57,6 +57,7 @@ struct end_io_wq {
 	int error;
 	int metadata;
 	struct list_head list;
+	struct btrfs_work work;
 };
 
 struct async_submit_bio {
@@ -66,6 +67,7 @@ struct async_submit_bio {
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int rw;
 	int mirror_num;
+	struct btrfs_work work;
 };
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
@@ -389,7 +391,6 @@ static int end_workqueue_bio(struct bio *bio,
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
 	struct btrfs_fs_info *fs_info;
-	unsigned long flags;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -397,11 +398,10 @@ static int end_workqueue_bio(struct bio *bio,
 #endif
 
 	fs_info = end_io_wq->info;
-	spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
 	end_io_wq->error = err;
-	list_add_tail(&end_io_wq->list, &fs_info->end_io_work_list);
-	spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
-	queue_work(end_io_workqueue, &fs_info->end_io_work);
+	end_io_wq->work.func = end_workqueue_fn;
+	end_io_wq->work.flags = 0;
+	btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
@@ -428,6 +428,19 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
+static void run_one_async_submit(struct btrfs_work *work)
+{
+	struct btrfs_fs_info *fs_info;
+	struct async_submit_bio *async;
+
+	async = container_of(work, struct async_submit_bio, work);
+	fs_info = BTRFS_I(async->inode)->root->fs_info;
+	atomic_dec(&fs_info->nr_async_submits);
+	async->submit_bio_hook(async->inode, async->rw, async->bio,
+			       async->mirror_num);
+	kfree(async);
+}
+
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			extent_submit_bio_hook_t *submit_bio_hook)
@@ -443,13 +456,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->bio = bio;
 	async->mirror_num = mirror_num;
 	async->submit_bio_hook = submit_bio_hook;
-
-	spin_lock(&fs_info->async_submit_work_lock);
-	list_add_tail(&async->list, &fs_info->async_submit_work_list);
+	async->work.func = run_one_async_submit;
+	async->work.flags = 0;
 	atomic_inc(&fs_info->nr_async_submits);
-	spin_unlock(&fs_info->async_submit_work_lock);
-
-	queue_work(async_submit_workqueue, &fs_info->async_submit_work);
+	btrfs_queue_worker(&fs_info->workers, &async->work);
 	return 0;
 }
 
@@ -462,19 +472,32 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	offset = bio->bi_sector << 9;
 
+	/*
+	 * when we're called for a write, we're already in the async
+	 * submission context. Just jump into btrfs_map_bio
+	 */
 	if (rw & (1 << BIO_RW)) {
-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				     mirror_num, 0);
 	}
 
+	/*
+	 * called for a read, do the setup so that checksum validation
+	 * can happen in the async kernel threads
+	 */
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
 	BUG_ON(ret);
 
-	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
+	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num)
 {
+	/*
+	 * kthread helpers are used to submit writes so that checksumming
+	 * can happen in parallel across all CPUs
+	 */
 	if (!(rw & (1 << BIO_RW))) {
 		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
 	}
@@ -1036,95 +1059,40 @@ static int bio_ready_for_csum(struct bio *bio)
 	return ret;
 }
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-static void btrfs_end_io_csum(void *p)
-#else
-static void btrfs_end_io_csum(struct work_struct *work)
-#endif
+/*
+ * called by the kthread helper functions to finally call the bio end_io
+ * functions. This is where read checksum verification actually happens
+ */
+static void end_workqueue_fn(struct btrfs_work *work)
 {
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct btrfs_fs_info *fs_info = p;
-#else
-	struct btrfs_fs_info *fs_info = container_of(work,
-						     struct btrfs_fs_info,
-						     end_io_work);
-#endif
-	unsigned long flags;
-	struct end_io_wq *end_io_wq;
 	struct bio *bio;
-	struct list_head *next;
+	struct end_io_wq *end_io_wq;
+	struct btrfs_fs_info *fs_info;
 	int error;
-	int was_empty;
 
-	while(1) {
-		spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
-		if (list_empty(&fs_info->end_io_work_list)) {
-			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
-					       flags);
-			return;
-		}
-		next = fs_info->end_io_work_list.next;
-		list_del(next);
-		spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
-
-		end_io_wq = list_entry(next, struct end_io_wq, list);
-
-		bio = end_io_wq->bio;
-		if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
-			spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
-			was_empty = list_empty(&fs_info->end_io_work_list);
-			list_add_tail(&end_io_wq->list,
-				      &fs_info->end_io_work_list);
-			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
-					       flags);
-			if (was_empty)
-				return;
-			continue;
-		}
-		error = end_io_wq->error;
-		bio->bi_private = end_io_wq->private;
-		bio->bi_end_io = end_io_wq->end_io;
-		kfree(end_io_wq);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-		bio_endio(bio, bio->bi_size, error);
-#else
-		bio_endio(bio, error);
-#endif
-	}
-}
+	end_io_wq = container_of(work, struct end_io_wq, work);
+	bio = end_io_wq->bio;
+	fs_info = end_io_wq->info;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-static void btrfs_async_submit_work(void *p)
-#else
-static void btrfs_async_submit_work(struct work_struct *work)
-#endif
-{
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct btrfs_fs_info *fs_info = p;
+	/* metadata bios are special because the whole tree block must
+	 * be checksummed at once. This makes sure the entire block is in
+	 * ram and up to date before trying to verify things. For
+	 * blocksize <= pagesize, it is basically a noop
+	 */
+	if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
+		btrfs_queue_worker(&fs_info->endio_workers,
+				   &end_io_wq->work);
+		return;
+	}
+	error = end_io_wq->error;
+	bio->bi_private = end_io_wq->private;
+	bio->bi_end_io = end_io_wq->end_io;
+	kfree(end_io_wq);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	bio_endio(bio, bio->bi_size, error);
 #else
-	struct btrfs_fs_info *fs_info = container_of(work,
-						     struct btrfs_fs_info,
-						     async_submit_work);
+	bio_endio(bio, error);
 #endif
-	struct async_submit_bio *async;
-	struct list_head *next;
-
-	while(1) {
-		spin_lock(&fs_info->async_submit_work_lock);
-		if (list_empty(&fs_info->async_submit_work_list)) {
-			spin_unlock(&fs_info->async_submit_work_lock);
-			return;
-		}
-		next = fs_info->async_submit_work_list.next;
-		list_del(next);
-		atomic_dec(&fs_info->nr_async_submits);
-		spin_unlock(&fs_info->async_submit_work_lock);
-
-		async = list_entry(next, struct async_submit_bio, list);
-		async->submit_bio_hook(async->inode, async->rw, async->bio,
-				       async->mirror_num);
-		kfree(async);
-	}
 }
 
 struct btrfs_root *open_ctree(struct super_block *sb,
@@ -1155,19 +1123,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -ENOMEM;
 		goto fail;
 	}
-	end_io_workqueue = create_workqueue("btrfs-end-io");
-	BUG_ON(!end_io_workqueue);
-	async_submit_workqueue = create_workqueue("btrfs-async-submit");
-
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
-	INIT_LIST_HEAD(&fs_info->end_io_work_list);
-	INIT_LIST_HEAD(&fs_info->async_submit_work_list);
 	spin_lock_init(&fs_info->hash_lock);
-	spin_lock_init(&fs_info->end_io_work_lock);
-	spin_lock_init(&fs_info->async_submit_work_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 
@@ -1222,13 +1182,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->do_barriers = 1;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum, fs_info);
-	INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work,
-		  fs_info);
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
-	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
-	INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work);
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
 #endif
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
@@ -1240,6 +1195,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 
+	/* we need to start all the end_io workers up front because the
+	 * queue work function gets called at interrupt time. The endio
+	 * workers don't normally start IO, so some number of them <= the
+	 * number of cpus is fine. They handle checksumming after a read.
+	 *
+	 * The other worker threads do start IO, so the max is larger than
+	 * the number of CPUs. FIXME, tune this for huge machines
+	 */
+	btrfs_init_workers(&fs_info->workers, num_online_cpus() * 2);
+	btrfs_init_workers(&fs_info->endio_workers, num_online_cpus());
+	btrfs_start_workers(&fs_info->workers, 1);
+	btrfs_start_workers(&fs_info->endio_workers, num_online_cpus());
+
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
 	if (ret) {
@@ -1375,6 +1343,8 @@ fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 fail_iput:
 	iput(fs_info->btree_inode);
+	btrfs_stop_workers(&fs_info->workers);
+	btrfs_stop_workers(&fs_info->endio_workers);
 fail:
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -1623,16 +1593,10 @@ int close_ctree(struct btrfs_root *root)
 	extent_io_tree_empty_lru(&fs_info->extent_ins);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
-	flush_workqueue(async_submit_workqueue);
-	flush_workqueue(end_io_workqueue);
-
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
-	flush_workqueue(async_submit_workqueue);
-	destroy_workqueue(async_submit_workqueue);
-
-	flush_workqueue(end_io_workqueue);
-	destroy_workqueue(end_io_workqueue);
+	btrfs_stop_workers(&fs_info->workers);
+	btrfs_stop_workers(&fs_info->endio_workers);
 
 	iput(fs_info->btree_inode);
 #if 0
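
For reference, the pool lifecycle used in the open_ctree()/close_ctree() hunks above condenses to the following sketch; example_start_pools() and example_stop_pools() are illustrative wrappers, not functions from the patch, and error handling plus the surrounding fs_info setup are omitted.

/* condensed restatement of the mount/unmount hunks above; the wrapper
 * functions themselves are hypothetical */
static void example_start_pools(struct btrfs_fs_info *fs_info)
{
	/* submission workers do start IO, so allow more than one per CPU */
	btrfs_init_workers(&fs_info->workers, num_online_cpus() * 2);
	/* endio workers only checksum after reads; one per CPU is enough */
	btrfs_init_workers(&fs_info->endio_workers, num_online_cpus());

	btrfs_start_workers(&fs_info->workers, 1);
	btrfs_start_workers(&fs_info->endio_workers, num_online_cpus());
}

static void example_stop_pools(struct btrfs_fs_info *fs_info)
{
	btrfs_stop_workers(&fs_info->workers);
	btrfs_stop_workers(&fs_info->endio_workers);
}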