author		Chris Mason <chris.mason@oracle.com>	2008-06-11 16:50:36 -0400
committer	Chris Mason <chris.mason@oracle.com>	2008-09-25 11:04:03 -0400
commit		8b7128429235d9bd72cfd5ed20c77c4f3118f744 (patch)
tree		982eda13094af1ccd46e8c3853559a0eb6e298f6 /fs/btrfs/volumes.c
parent		43e570b08a6c6b1d75f218566a6240542a386fd9 (diff)
Btrfs: Add async worker threads for pre and post IO checksumming
Btrfs has been using workqueues to spread the checksumming load across other CPUs in the system. But, workqueues only schedule work on the same CPU that queued the work, giving them a limited benefit for systems with higher CPU counts.

This code adds a generic facility to schedule work with pools of kthreads, and changes the bio submission code to queue bios up. The queueing is important to make sure large numbers of procs on the system don't turn streaming workloads into random workloads by sending IO down concurrently.

The end result of all of this is much higher performance (and CPU usage) when doing checksumming on large machines. Two worker pools are created, one for writes and one for endio processing. The two could deadlock if we tried to service both from a single pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
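The core pattern here -- producers append to a per-device list under a lock, a single worker detaches the whole list at once and processes it unlocked, and a new worker is woken only when none is already running -- is easy to see outside the kernel. Below is a minimal userspace sketch using pthreads in place of kthreads; every name in it (struct dev_queue, dev_submit, worker_main) is invented for illustration and appears nowhere in the patch.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	struct item *next;
	int id;
};

struct dev_queue {
	pthread_mutex_t lock;		/* stands in for device->io_lock */
	struct item *head;
	struct item *tail;
	int running;			/* mirrors device->running_pending */
};

/* detach the whole list under the lock, then process it unlocked */
static void *worker_main(void *arg)
{
	struct dev_queue *q = arg;
	struct item *pending;

	for (;;) {
		pthread_mutex_lock(&q->lock);
		pending = q->head;
		q->head = NULL;
		q->tail = NULL;
		q->running = pending != NULL;
		pthread_mutex_unlock(&q->lock);

		if (!pending)
			return NULL;	/* queue drained, worker retires */
		while (pending) {
			struct item *cur = pending;

			pending = cur->next;
			printf("worker ran item %d\n", cur->id);
			free(cur);
		}
	}
}

/* producers only spawn a worker when one is not already running */
static void dev_submit(struct dev_queue *q, int id, pthread_t *worker)
{
	struct item *it = malloc(sizeof(*it));
	int should_queue;

	it->id = id;
	it->next = NULL;

	pthread_mutex_lock(&q->lock);
	if (q->tail)
		q->tail->next = it;
	q->tail = it;
	if (!q->head)
		q->head = it;
	should_queue = !q->running;
	q->running = 1;
	pthread_mutex_unlock(&q->lock);

	if (should_queue)
		pthread_create(worker, NULL, worker_main, q);
}

int main(void)
{
	struct dev_queue q = { PTHREAD_MUTEX_INITIALIZER, NULL, NULL, 0 };
	pthread_t worker;
	int i;

	for (i = 0; i < 8; i++)
		dev_submit(&q, i, &worker);
	/* crude for a sketch: only the most recently spawned worker is
	 * joined; any earlier worker has already retired by then
	 */
	pthread_join(worker, NULL);
	return 0;
}

The detach-then-process step keeps lock hold times short, and the running flag keeps a stream of producers from spawning one worker per item.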
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--	fs/btrfs/volumes.c	162
1 file changed, 157 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 722eb4550154..c57458ce6339 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "async-thread.h"
 
 struct map_lookup {
 	u64 type;
@@ -110,6 +111,101 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
 	return NULL;
 }
 
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device.  This greatly
+ * improves the scheduler's ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block.  The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested.  This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+int run_scheduled_bios(struct btrfs_device *device)
+{
+	struct bio *pending;
+	struct backing_dev_info *bdi;
+	struct bio *tail;
+	struct bio *cur;
+	int again = 0;
+	unsigned long num_run = 0;
+
+	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+loop:
+	spin_lock(&device->io_lock);
+
+	/* take all the bios off the list at once and process them
+	 * later on (without the lock held).  But, remember the
+	 * tail and other pointers so the bios can be properly reinserted
+	 * into the list if we hit congestion
+	 */
+	pending = device->pending_bios;
+	tail = device->pending_bio_tail;
+	WARN_ON(pending && !tail);
+	device->pending_bios = NULL;
+	device->pending_bio_tail = NULL;
+
+	/*
+	 * if pending was null this time around, no bios need processing
+	 * at all and we can stop.  Otherwise it'll loop back up again
+	 * and do an additional check so no bios are missed.
+	 *
+	 * device->running_pending is used to synchronize with the
+	 * schedule_bio code.
+	 */
+	if (pending) {
+		again = 1;
+		device->running_pending = 1;
+	} else {
+		again = 0;
+		device->running_pending = 0;
+	}
+	spin_unlock(&device->io_lock);
+
+	while (pending) {
+		cur = pending;
+		pending = pending->bi_next;
+		cur->bi_next = NULL;
+		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+		submit_bio(cur->bi_rw, cur);
+		num_run++;
+
+		/*
+		 * we made progress, there is more work to do and the bdi
+		 * is now congested.  Back off and let other work structs
+		 * run instead
+		 */
+		if (pending && num_run && bdi_write_congested(bdi)) {
+			struct bio *old_head;
+
+			spin_lock(&device->io_lock);
+			old_head = device->pending_bios;
+			device->pending_bios = pending;
+			if (device->pending_bio_tail)
+				tail->bi_next = old_head;
+			else
+				device->pending_bio_tail = tail;
+
+			spin_unlock(&device->io_lock);
+			btrfs_requeue_work(&device->work);
+			goto done;
+		}
+	}
+	if (again)
+		goto loop;
+done:
+	return 0;
+}
+
+void pending_bios_fn(struct btrfs_work *work)
+{
+	struct btrfs_device *device;
+
+	device = container_of(work, struct btrfs_device, work);
+	run_scheduled_bios(device);
+}
+
 static int device_list_add(const char *path,
 			   struct btrfs_super_block *disk_super,
 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
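The requeue path in run_scheduled_bios() above is the subtle part: bios may have been queued by schedule_bio() while the lock was dropped, and the unprocessed remainder must be spliced back in *front* of them so per-device ordering is preserved. Here is a toy standalone version of just that splice, with hypothetical nodes B..E standing in for bios (remainder B->C, newly queued D->E); none of these names come from the patch.

#include <stdio.h>

struct node { struct node *next; char name; };

/* mirrors: old_head = device->pending_bios; tail->bi_next = old_head */
static struct node *splice_front(struct node *remainder,
				 struct node *rem_tail,
				 struct node *newly_queued)
{
	rem_tail->next = newly_queued;
	return remainder;
}

int main(void)
{
	struct node e = { NULL, 'E' }, d = { &e, 'D' };	/* queued meanwhile */
	struct node c = { NULL, 'C' }, b = { &c, 'B' };	/* not yet submitted */
	struct node *head = splice_front(&b, &c, &d);

	for (; head; head = head->next)
		printf("%c ", head->name);	/* prints: B C D E */
	printf("\n");
	return 0;
}

Running it prints B C D E: the remainder keeps its place at the head of the list, matching tail->bi_next = old_head in the patch.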
@@ -141,6 +237,7 @@ static int device_list_add(const char *path,
 		return -ENOMEM;
 	}
 	device->devid = devid;
+	device->work.func = pending_bios_fn;
 	memcpy(device->uuid, disk_super->dev_item.uuid,
 	       BTRFS_UUID_SIZE);
 	device->barriers = 1;
@@ -925,6 +1022,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	device->barriers = 1;
+	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
 	spin_lock_init(&device->io_lock);
 	device->name = kstrdup(device_path, GFP_NOFS);
@@ -1965,8 +2063,61 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 }
 
+struct async_sched {
+	struct bio *bio;
+	int rw;
+	struct btrfs_fs_info *info;
+	struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
+		 int rw, struct bio *bio)
+{
+	int should_queue = 1;
+
+	/* don't bother with additional async steps for reads, right now */
+	if (!(rw & (1 << BIO_RW))) {
+		submit_bio(rw, bio);
+		return 0;
+	}
+
+	/*
+	 * nr_async_submits allows us to reliably return congestion to the
+	 * higher layers.  Otherwise, the async bio makes it appear we have
+	 * made progress against dirty pages when we've really just put it
+	 * on a queue for later
+	 */
+	atomic_inc(&root->fs_info->nr_async_submits);
+	bio->bi_next = NULL;
+	bio->bi_rw |= rw;
+
+	spin_lock(&device->io_lock);
+
+	if (device->pending_bio_tail)
+		device->pending_bio_tail->bi_next = bio;
+
+	device->pending_bio_tail = bio;
+	if (!device->pending_bios)
+		device->pending_bios = bio;
+	if (device->running_pending)
+		should_queue = 0;
+
+	spin_unlock(&device->io_lock);
+
+	if (should_queue)
+		btrfs_queue_worker(&root->fs_info->workers, &device->work);
+	return 0;
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
-		  int mirror_num)
+		  int mirror_num, int async_submit)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
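The atomic_inc of nr_async_submits in schedule_bio() is balanced by the atomic_dec in run_scheduled_bios(), so the counter tracks exactly the bios sitting on per-device pending lists. This hunk shows no reader of the counter in volumes.c; purely to illustrate the comment's point about returning congestion to higher layers, a backing_dev_info congested_fn callback could consult it along these lines. The function name and threshold below are assumptions for the sketch, not code from the patch.

/*
 * Illustrative sketch only -- not from this patch.  A bdi congested_fn
 * (signature: int (*congested_fn)(void *, int)) could report congestion
 * while too many async bios still wait on the per-device lists.
 */
static int example_congested_fn(void *congested_data, int bdi_bits)
{
	struct btrfs_fs_info *info = congested_data;

	/* the threshold of 64 is an arbitrary example value */
	if (atomic_read(&info->nr_async_submits) > 64)
		return 1;
	return 0;
}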
@@ -2012,10 +2163,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		dev = multi->stripes[dev_nr].dev;
 		if (dev && dev->bdev) {
 			bio->bi_bdev = dev->bdev;
-			spin_lock(&dev->io_lock);
-			dev->total_ios++;
-			spin_unlock(&dev->io_lock);
-			submit_bio(rw, bio);
+			if (async_submit)
+				schedule_bio(root, dev, rw, bio);
+			else
+				submit_bio(rw, bio);
 		} else {
 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 			bio->bi_sector = logical >> 9;
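With the signature change above, each caller of btrfs_map_bio() chooses per bio whether to take the async path. A hypothetical caller (submit_one_bio is an invented name, not from this patch) could simply pass 1 for the new flag and rely on schedule_bio()'s own read check as a safety net:

/* hypothetical caller -- not part of this patch */
static int submit_one_bio(struct btrfs_root *root, struct bio *bio,
			  int mirror_num)
{
	/* async_submit=1: writes are queued via schedule_bio(); reads
	 * fall through to a direct submit_bio() inside schedule_bio()
	 */
	return btrfs_map_bio(root, bio->bi_rw, bio, mirror_num, 1);
}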
@@ -2054,6 +2205,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
+	device->work.func = pending_bios_fn;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);