Diffstat (limited to 'fs/btrfs/volumes.c')
 fs/btrfs/volumes.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 157 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 722eb4550154..c57458ce6339 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "async-thread.h"
 
 struct map_lookup {
         u64 type;
@@ -110,6 +111,101 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
         return NULL;
 }
 
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device. This greatly
+ * improves the scheduler's ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block. The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested. This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+int run_scheduled_bios(struct btrfs_device *device)
+{
+        struct bio *pending;
+        struct backing_dev_info *bdi;
+        struct bio *tail;
+        struct bio *cur;
+        int again = 0;
+        unsigned long num_run = 0;
+
+        bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+loop:
+        spin_lock(&device->io_lock);
+
+        /* take all the bios off the list at once and process them
+         * later on (without the lock held). But, remember the
+         * tail and other pointers so the bios can be properly reinserted
+         * into the list if we hit congestion
+         */
+        pending = device->pending_bios;
+        tail = device->pending_bio_tail;
+        WARN_ON(pending && !tail);
+        device->pending_bios = NULL;
+        device->pending_bio_tail = NULL;
+
+        /*
+         * if pending was NULL this time around, no bios need processing
+         * at all and we can stop. Otherwise it'll loop back up again
+         * and do an additional check so no bios are missed.
+         *
+         * device->running_pending is used to synchronize with the
+         * schedule_bio code.
+         */
+        if (pending) {
+                again = 1;
+                device->running_pending = 1;
+        } else {
+                again = 0;
+                device->running_pending = 0;
+        }
+        spin_unlock(&device->io_lock);
+
+        while (pending) {
+                cur = pending;
+                pending = pending->bi_next;
+                cur->bi_next = NULL;
+                atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+                submit_bio(cur->bi_rw, cur);
+                num_run++;
+
+                /*
+                 * we made progress, there is more work to do and the bdi
+                 * is now congested. Back off and let other work structs
+                 * run instead
+                 */
+                if (pending && num_run && bdi_write_congested(bdi)) {
+                        struct bio *old_head;
+
+                        spin_lock(&device->io_lock);
+                        old_head = device->pending_bios;
+                        device->pending_bios = pending;
+                        if (device->pending_bio_tail)
+                                tail->bi_next = old_head;
+                        else
+                                device->pending_bio_tail = tail;
+
+                        spin_unlock(&device->io_lock);
+                        btrfs_requeue_work(&device->work);
+                        goto done;
+                }
+        }
+        if (again)
+                goto loop;
+done:
+        return 0;
+}
+
+void pending_bios_fn(struct btrfs_work *work)
+{
+        struct btrfs_device *device;
+
+        device = container_of(work, struct btrfs_device, work);
+        run_scheduled_bios(device);
+}
+
 static int device_list_add(const char *path,
                            struct btrfs_super_block *disk_super,
                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
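
The requeue trick above is the heart of the patch, so it is worth seeing in isolation. Below is a minimal, compilable sketch of the same drain-and-requeue shape in plain C, with pthreads standing in for the kernel spinlock; work_item, work_queue, process(), is_congested(), requeue_self() and drain() are hypothetical stand-ins for struct bio, the btrfs_device list fields, submit_bio(), bdi_write_congested(), btrfs_requeue_work() and run_scheduled_bios(), not kernel APIs.

    /* Illustrative only, not part of the patch. */
    #include <pthread.h>
    #include <stddef.h>

    struct work_item {
            struct work_item *next;
    };

    struct work_queue {
            pthread_mutex_t lock;
            struct work_item *head;
            struct work_item *tail;
            int running;            /* mirrors device->running_pending */
    };

    extern void process(struct work_item *item);    /* submit_bio()           */
    extern int  is_congested(void);                 /* bdi_write_congested()  */
    extern void requeue_self(struct work_queue *q); /* btrfs_requeue_work()   */

    void drain(struct work_queue *q)
    {
            struct work_item *pending, *tail, *cur;
            int again;

    loop:
            pthread_mutex_lock(&q->lock);
            /* detach the whole list so items are processed unlocked */
            pending = q->head;
            tail = q->tail;
            q->head = q->tail = NULL;
            again = q->running = (pending != NULL);
            pthread_mutex_unlock(&q->lock);

            while (pending) {
                    cur = pending;
                    pending = cur->next;
                    cur->next = NULL;
                    process(cur);

                    if (pending && is_congested()) {
                            /*
                             * splice the unprocessed remainder back at the
                             * head, ahead of anything queued while we ran,
                             * then hand the worker back to other devices
                             */
                            pthread_mutex_lock(&q->lock);
                            tail->next = q->head;   /* may be NULL */
                            if (!q->tail)
                                    q->tail = tail;
                            q->head = pending;
                            pthread_mutex_unlock(&q->lock);
                            requeue_self(q);
                            return;
                    }
            }
            if (again)
                    goto loop;
    }

The key invariant is that the detached chain goes back on the front of the list, so requeued bios keep their original submission order ahead of anything queued while the worker was running.
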
@@ -141,6 +237,7 @@ static int device_list_add(const char *path,
                 return -ENOMEM;
         }
         device->devid = devid;
+        device->work.func = pending_bios_fn;
         memcpy(device->uuid, disk_super->dev_item.uuid,
                BTRFS_UUID_SIZE);
         device->barriers = 1;
@@ -925,6 +1022,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         }
 
         device->barriers = 1;
+        device->work.func = pending_bios_fn;
         generate_random_uuid(device->uuid);
         spin_lock_init(&device->io_lock);
         device->name = kstrdup(device_path, GFP_NOFS);
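
Each device setup path in this patch points device->work.func at pending_bios_fn, which later recovers the btrfs_device from the embedded work struct via container_of(). A self-contained illustration of that recovery pattern, using a simplified version of the kernel macro and hypothetical struct names:

    /* Illustrative only: container_of-style recovery of an enclosing
     * struct from a pointer to one of its embedded members.
     */
    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct work {
            void (*func)(struct work *);
    };

    struct device {
            int id;
            struct work work;       /* embedded, like btrfs_device->work */
    };

    static void pending_fn(struct work *w)
    {
            /* recover the enclosing device from the embedded work */
            struct device *dev = container_of(w, struct device, work);
            printf("running work for device %d\n", dev->id);
    }

    int main(void)
    {
            struct device d = { .id = 42 };
            d.work.func = pending_fn;
            d.work.func(&d.work);   /* a worker thread would do this */
            return 0;
    }
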
@@ -1965,8 +2063,61 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 }
 
+struct async_sched {
+        struct bio *bio;
+        int rw;
+        struct btrfs_fs_info *info;
+        struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
+                 int rw, struct bio *bio)
+{
+        int should_queue = 1;
+
+        /* don't bother with additional async steps for reads, right now */
+        if (!(rw & (1 << BIO_RW))) {
+                submit_bio(rw, bio);
+                return 0;
+        }
+
+        /*
+         * nr_async_submits allows us to reliably return congestion to the
+         * higher layers. Otherwise, the async bio makes it appear we have
+         * made progress against dirty pages when we've really just put it
+         * on a queue for later
+         */
+        atomic_inc(&root->fs_info->nr_async_submits);
+        bio->bi_next = NULL;
+        bio->bi_rw |= rw;
+
+        spin_lock(&device->io_lock);
+
+        if (device->pending_bio_tail)
+                device->pending_bio_tail->bi_next = bio;
+
+        device->pending_bio_tail = bio;
+        if (!device->pending_bios)
+                device->pending_bios = bio;
+        if (device->running_pending)
+                should_queue = 0;
+
+        spin_unlock(&device->io_lock);
+
+        if (should_queue)
+                btrfs_queue_worker(&root->fs_info->workers, &device->work);
+        return 0;
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
-                  int mirror_num)
+                  int mirror_num, int async_submit)
 {
         struct btrfs_mapping_tree *map_tree;
         struct btrfs_device *dev;
@@ -2012,10 +2163,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
         dev = multi->stripes[dev_nr].dev;
         if (dev && dev->bdev) {
                 bio->bi_bdev = dev->bdev;
-                spin_lock(&dev->io_lock);
-                dev->total_ios++;
-                spin_unlock(&dev->io_lock);
-                submit_bio(rw, bio);
+                if (async_submit)
+                        schedule_bio(root, dev, rw, bio);
+                else
+                        submit_bio(rw, bio);
         } else {
                 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
                 bio->bi_sector = logical >> 9;
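
schedule_bio() above is the producer half of the scheme: append under the lock, and only queue the work struct when no drain pass is already running, since a running pass re-checks the list (the "if (again) goto loop" in run_scheduled_bios) before exiting. Continuing the illustrative pthread sketch from the earlier hunk, with enqueue() and wake_worker() as hypothetical stand-ins for schedule_bio() and btrfs_queue_worker():

    /* Illustrative companion to the drain() sketch above. */
    extern void wake_worker(struct work_queue *q);  /* btrfs_queue_worker() */

    void enqueue(struct work_queue *q, struct work_item *item)
    {
            int should_wake = 1;

            item->next = NULL;

            pthread_mutex_lock(&q->lock);
            /* append at the tail of the singly-linked list */
            if (q->tail)
                    q->tail->next = item;
            q->tail = item;
            if (!q->head)
                    q->head = item;
            if (q->running)
                    should_wake = 0; /* an active drain pass will see it */
            pthread_mutex_unlock(&q->lock);

            if (should_wake)
                    wake_worker(q);
    }

The should_wake/should_queue test is what keeps many submitters from flooding the worker pool: at most one work struct per device is ever in flight, and everything else just piles onto that device's list.
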
@@ -2054,6 +2205,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
         device->barriers = 1;
         device->dev_root = root->fs_info->dev_root;
         device->devid = devid;
+        device->work.func = pending_bios_fn;
         fs_devices->num_devices++;
         spin_lock_init(&device->io_lock);
         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);