author	Chris Mason <chris.mason@oracle.com>	2008-06-11 16:50:36 -0400
committer	Chris Mason <chris.mason@oracle.com>	2008-09-25 11:04:03 -0400
commit	8b7128429235d9bd72cfd5ed20c77c4f3118f744 (patch)
tree	982eda13094af1ccd46e8c3853559a0eb6e298f6 /fs/btrfs/disk-io.c
parent	43e570b08a6c6b1d75f218566a6240542a386fd9 (diff)
Btrfs: Add async worker threads for pre and post IO checksumming
Btrfs has been using workqueues to spread the checksumming load across other CPUs in the system. But, workqueues only schedule work on the same CPU that queued the work, giving them a limited benefit for systems with higher CPU counts.

This code adds a generic facility to schedule work with pools of kthreads, and changes the bio submission code to queue bios up. The queueing is important to make sure large numbers of procs on the system don't turn streaming workloads into random workloads by sending IO down concurrently.

The end result of all of this is much higher performance (and CPU usage) when doing checksumming on large machines. Two worker pools are created, one for writes and one for endio processing. The two could deadlock if we tried to service both from a single pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
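The pattern the patch introduces is small: a caller embeds a struct btrfs_work in its own context structure, points work.func at a handler, and hands the work to a pool with btrfs_queue_worker(); the handler later runs on one of the pool's kthreads and recovers its context with container_of(). The sketch below illustrates that pattern only — the btrfs_work/btrfs_workers names and the queueing call are taken from the async-thread.h facility added in this series, while struct my_work, my_work_fn() and do_csum() are hypothetical stand-ins, not code from the patch.

/*
 * Illustrative sketch of the worker-pool pattern; my_work, my_work_fn()
 * and do_csum() are hypothetical, the btrfs_work/btrfs_workers API comes
 * from async-thread.h in this series.
 */
#include <linux/bio.h>
#include <linux/slab.h>
#include "async-thread.h"

struct my_work {
	struct bio *bio;		/* whatever the handler needs */
	struct btrfs_work work;		/* embedded so container_of() can find us */
};

/* hypothetical stand-in for the real work (e.g. verifying checksums) */
static void do_csum(struct bio *bio)
{
}

/* runs later in one of the pool's kthreads, never at interrupt time */
static void my_work_fn(struct btrfs_work *work)
{
	struct my_work *w = container_of(work, struct my_work, work);

	do_csum(w->bio);
	kfree(w);
}

static int my_queue_one(struct btrfs_workers *pool, struct bio *bio)
{
	struct my_work *w;

	/* allocate in process context; the queueing step below is the part
	 * the patch also performs at bio-completion (interrupt) time */
	w = kmalloc(sizeof(*w), GFP_NOFS);
	if (!w)
		return -ENOMEM;
	w->bio = bio;
	w->work.func = my_work_fn;
	w->work.flags = 0;
	btrfs_queue_worker(pool, &w->work);
	return 0;
}

Pools are sized with btrfs_init_workers(), started with btrfs_start_workers() and torn down with btrfs_stop_workers(), as the open_ctree()/close_ctree() hunks below show; keeping the submission pool (fs_info->workers) separate from the endio pool (fs_info->endio_workers) is what avoids the single-pool deadlock mentioned above.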
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--	fs/btrfs/disk-io.c	200
1 files changed, 82 insertions, 118 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b9a53646ceb2..98ff4fbcb386 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -31,6 +31,7 @@
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "print-tree.h"
+#include "async-thread.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -46,8 +47,7 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 #endif
 
 static struct extent_io_ops btree_extent_io_ops;
-static struct workqueue_struct *end_io_workqueue;
-static struct workqueue_struct *async_submit_workqueue;
+static void end_workqueue_fn(struct btrfs_work *work);
 
 struct end_io_wq {
 	struct bio *bio;
@@ -57,6 +57,7 @@ struct end_io_wq {
 	int error;
 	int metadata;
 	struct list_head list;
+	struct btrfs_work work;
 };
 
 struct async_submit_bio {
@@ -66,6 +67,7 @@ struct async_submit_bio {
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int rw;
 	int mirror_num;
+	struct btrfs_work work;
 };
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
@@ -389,7 +391,6 @@ static int end_workqueue_bio(struct bio *bio,
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
 	struct btrfs_fs_info *fs_info;
-	unsigned long flags;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -397,11 +398,10 @@ static int end_workqueue_bio(struct bio *bio,
 #endif
 
 	fs_info = end_io_wq->info;
-	spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
 	end_io_wq->error = err;
-	list_add_tail(&end_io_wq->list, &fs_info->end_io_work_list);
-	spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
-	queue_work(end_io_workqueue, &fs_info->end_io_work);
+	end_io_wq->work.func = end_workqueue_fn;
+	end_io_wq->work.flags = 0;
+	btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
@@ -428,6 +428,19 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
+static void run_one_async_submit(struct btrfs_work *work)
+{
+	struct btrfs_fs_info *fs_info;
+	struct async_submit_bio *async;
+
+	async = container_of(work, struct async_submit_bio, work);
+	fs_info = BTRFS_I(async->inode)->root->fs_info;
+	atomic_dec(&fs_info->nr_async_submits);
+	async->submit_bio_hook(async->inode, async->rw, async->bio,
+			       async->mirror_num);
+	kfree(async);
+}
+
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			extent_submit_bio_hook_t *submit_bio_hook)
@@ -443,13 +456,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->bio = bio;
 	async->mirror_num = mirror_num;
 	async->submit_bio_hook = submit_bio_hook;
-
-	spin_lock(&fs_info->async_submit_work_lock);
-	list_add_tail(&async->list, &fs_info->async_submit_work_list);
+	async->work.func = run_one_async_submit;
+	async->work.flags = 0;
 	atomic_inc(&fs_info->nr_async_submits);
-	spin_unlock(&fs_info->async_submit_work_lock);
-
-	queue_work(async_submit_workqueue, &fs_info->async_submit_work);
+	btrfs_queue_worker(&fs_info->workers, &async->work);
 	return 0;
 }
 
@@ -462,19 +472,32 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	offset = bio->bi_sector << 9;
 
+	/*
+	 * when we're called for a write, we're already in the async
+	 * submission context. Just jump into btrfs_map_bio
+	 */
 	if (rw & (1 << BIO_RW)) {
-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				     mirror_num, 0);
 	}
 
+	/*
+	 * called for a read, do the setup so that checksum validation
+	 * can happen in the async kernel threads
+	 */
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
 	BUG_ON(ret);
 
-	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
+	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num)
 {
+	/*
+	 * kthread helpers are used to submit writes so that checksumming
+	 * can happen in parallel across all CPUs
+	 */
 	if (!(rw & (1 << BIO_RW))) {
 		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
 	}
@@ -1036,95 +1059,40 @@ static int bio_ready_for_csum(struct bio *bio)
 	return ret;
 }
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-static void btrfs_end_io_csum(void *p)
-#else
-static void btrfs_end_io_csum(struct work_struct *work)
-#endif
+/*
+ * called by the kthread helper functions to finally call the bio end_io
+ * functions. This is where read checksum verification actually happens
+ */
+static void end_workqueue_fn(struct btrfs_work *work)
 {
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct btrfs_fs_info *fs_info = p;
-#else
-	struct btrfs_fs_info *fs_info = container_of(work,
-						     struct btrfs_fs_info,
-						     end_io_work);
-#endif
-	unsigned long flags;
-	struct end_io_wq *end_io_wq;
 	struct bio *bio;
-	struct list_head *next;
+	struct end_io_wq *end_io_wq;
+	struct btrfs_fs_info *fs_info;
 	int error;
-	int was_empty;
 
-	while(1) {
-		spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
-		if (list_empty(&fs_info->end_io_work_list)) {
-			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
-					       flags);
-			return;
-		}
-		next = fs_info->end_io_work_list.next;
-		list_del(next);
-		spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
-
-		end_io_wq = list_entry(next, struct end_io_wq, list);
-
-		bio = end_io_wq->bio;
-		if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
-			spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
-			was_empty = list_empty(&fs_info->end_io_work_list);
-			list_add_tail(&end_io_wq->list,
-				      &fs_info->end_io_work_list);
-			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
-					       flags);
-			if (was_empty)
-				return;
-			continue;
-		}
-		error = end_io_wq->error;
-		bio->bi_private = end_io_wq->private;
-		bio->bi_end_io = end_io_wq->end_io;
-		kfree(end_io_wq);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-		bio_endio(bio, bio->bi_size, error);
-#else
-		bio_endio(bio, error);
-#endif
-	}
-}
+	end_io_wq = container_of(work, struct end_io_wq, work);
+	bio = end_io_wq->bio;
+	fs_info = end_io_wq->info;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-static void btrfs_async_submit_work(void *p)
-#else
-static void btrfs_async_submit_work(struct work_struct *work)
-#endif
-{
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct btrfs_fs_info *fs_info = p;
+	/* metadata bios are special because the whole tree block must
+	 * be checksummed at once. This makes sure the entire block is in
+	 * ram and up to date before trying to verify things. For
+	 * blocksize <= pagesize, it is basically a noop
+	 */
+	if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
+		btrfs_queue_worker(&fs_info->endio_workers,
+				   &end_io_wq->work);
+		return;
+	}
+	error = end_io_wq->error;
+	bio->bi_private = end_io_wq->private;
+	bio->bi_end_io = end_io_wq->end_io;
+	kfree(end_io_wq);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	bio_endio(bio, bio->bi_size, error);
 #else
-	struct btrfs_fs_info *fs_info = container_of(work,
-						     struct btrfs_fs_info,
-						     async_submit_work);
+	bio_endio(bio, error);
 #endif
-	struct async_submit_bio *async;
-	struct list_head *next;
-
-	while(1) {
-		spin_lock(&fs_info->async_submit_work_lock);
-		if (list_empty(&fs_info->async_submit_work_list)) {
-			spin_unlock(&fs_info->async_submit_work_lock);
-			return;
-		}
-		next = fs_info->async_submit_work_list.next;
-		list_del(next);
-		atomic_dec(&fs_info->nr_async_submits);
-		spin_unlock(&fs_info->async_submit_work_lock);
-
-		async = list_entry(next, struct async_submit_bio, list);
-		async->submit_bio_hook(async->inode, async->rw, async->bio,
-				       async->mirror_num);
-		kfree(async);
-	}
 }
 
 struct btrfs_root *open_ctree(struct super_block *sb,
@@ -1155,19 +1123,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -ENOMEM;
 		goto fail;
 	}
-	end_io_workqueue = create_workqueue("btrfs-end-io");
-	BUG_ON(!end_io_workqueue);
-	async_submit_workqueue = create_workqueue("btrfs-async-submit");
-
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
-	INIT_LIST_HEAD(&fs_info->end_io_work_list);
-	INIT_LIST_HEAD(&fs_info->async_submit_work_list);
 	spin_lock_init(&fs_info->hash_lock);
-	spin_lock_init(&fs_info->end_io_work_lock);
-	spin_lock_init(&fs_info->async_submit_work_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 
@@ -1222,13 +1182,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->do_barriers = 1;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum, fs_info);
-	INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work,
-		  fs_info);
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
-	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
-	INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work);
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
 #endif
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
@@ -1240,6 +1195,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 
+	/* we need to start all the end_io workers up front because the
+	 * queue work function gets called at interrupt time. The endio
+	 * workers don't normally start IO, so some number of them <= the
+	 * number of cpus is fine. They handle checksumming after a read.
+	 *
+	 * The other worker threads do start IO, so the max is larger than
+	 * the number of CPUs. FIXME, tune this for huge machines
+	 */
+	btrfs_init_workers(&fs_info->workers, num_online_cpus() * 2);
+	btrfs_init_workers(&fs_info->endio_workers, num_online_cpus());
+	btrfs_start_workers(&fs_info->workers, 1);
+	btrfs_start_workers(&fs_info->endio_workers, num_online_cpus());
+
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
 	if (ret) {
@@ -1375,6 +1343,8 @@ fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 fail_iput:
 	iput(fs_info->btree_inode);
+	btrfs_stop_workers(&fs_info->workers);
+	btrfs_stop_workers(&fs_info->endio_workers);
 fail:
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -1623,16 +1593,10 @@ int close_ctree(struct btrfs_root *root)
 	extent_io_tree_empty_lru(&fs_info->extent_ins);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
-	flush_workqueue(async_submit_workqueue);
-	flush_workqueue(end_io_workqueue);
-
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
-	flush_workqueue(async_submit_workqueue);
-	destroy_workqueue(async_submit_workqueue);
-
-	flush_workqueue(end_io_workqueue);
-	destroy_workqueue(end_io_workqueue);
+	btrfs_stop_workers(&fs_info->workers);
+	btrfs_stop_workers(&fs_info->endio_workers);
 
 	iput(fs_info->btree_inode);
 #if 0
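
For reference, the pool lifecycle used in the open_ctree()/close_ctree() hunks above condenses to the following sketch; example_start_pools() and example_stop_pools() are illustrative wrappers, not functions from the patch, and error handling plus the surrounding fs_info setup are omitted.

/* condensed restatement of the mount/unmount hunks above; the wrapper
 * functions themselves are hypothetical */
static void example_start_pools(struct btrfs_fs_info *fs_info)
{
	/* submission workers do start IO, so allow more than one per CPU */
	btrfs_init_workers(&fs_info->workers, num_online_cpus() * 2);
	/* endio workers only checksum after reads; one per CPU is enough */
	btrfs_init_workers(&fs_info->endio_workers, num_online_cpus());

	btrfs_start_workers(&fs_info->workers, 1);
	btrfs_start_workers(&fs_info->endio_workers, num_online_cpus());
}

static void example_stop_pools(struct btrfs_fs_info *fs_info)
{
	btrfs_stop_workers(&fs_info->workers);
	btrfs_stop_workers(&fs_info->endio_workers);
}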