author		Chris Mason <chris.mason@oracle.com>	2008-06-11 16:50:36 -0400
committer	Chris Mason <chris.mason@oracle.com>	2008-09-25 11:04:03 -0400
commit		8b7128429235d9bd72cfd5ed20c77c4f3118f744 (patch)
tree		982eda13094af1ccd46e8c3853559a0eb6e298f6 /fs/btrfs/volumes.c
parent		43e570b08a6c6b1d75f218566a6240542a386fd9 (diff)
Btrfs: Add async worker threads for pre and post IO checksumming
Btrfs has been using workqueues to spread the checksumming load across other CPUs in the system. But, workqueues only schedule work on the same CPU that queued the work, giving them a limited benefit for systems with higher CPU counts.

This code adds a generic facility to schedule work with pools of kthreads, and changes the bio submission code to queue bios up. The queueing is important to make sure large numbers of procs on the system don't turn streaming workloads into random workloads by sending IO down concurrently.

The end result of all of this is much higher performance (and CPU usage) when doing checksumming on large machines. Two worker pools are created, one for writes and one for endio processing. The two could deadlock if we tried to service both from a single pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
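The core pattern here -- producers append to a per-device list under a lock, a single worker detaches the whole list at once and processes it unlocked, and a new worker is woken only when none is already running -- is easy to see outside the kernel. Below is a minimal userspace sketch using pthreads in place of kthreads; every name in it (struct dev_queue, dev_submit, worker_main) is invented for illustration and appears nowhere in the patch.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	struct item *next;
	int id;
};

struct dev_queue {
	pthread_mutex_t lock;		/* stands in for device->io_lock */
	struct item *head;
	struct item *tail;
	int running;			/* mirrors device->running_pending */
};

/* detach the whole list under the lock, then process it unlocked */
static void *worker_main(void *arg)
{
	struct dev_queue *q = arg;
	struct item *pending;

	for (;;) {
		pthread_mutex_lock(&q->lock);
		pending = q->head;
		q->head = NULL;
		q->tail = NULL;
		q->running = pending != NULL;
		pthread_mutex_unlock(&q->lock);

		if (!pending)
			return NULL;	/* queue drained, worker retires */
		while (pending) {
			struct item *cur = pending;

			pending = cur->next;
			printf("worker ran item %d\n", cur->id);
			free(cur);
		}
	}
}

/* producers only spawn a worker when one is not already running */
static void dev_submit(struct dev_queue *q, int id, pthread_t *worker)
{
	struct item *it = malloc(sizeof(*it));
	int should_queue;

	it->id = id;
	it->next = NULL;

	pthread_mutex_lock(&q->lock);
	if (q->tail)
		q->tail->next = it;
	q->tail = it;
	if (!q->head)
		q->head = it;
	should_queue = !q->running;
	q->running = 1;
	pthread_mutex_unlock(&q->lock);

	if (should_queue)
		pthread_create(worker, NULL, worker_main, q);
}

int main(void)
{
	struct dev_queue q = { PTHREAD_MUTEX_INITIALIZER, NULL, NULL, 0 };
	pthread_t worker;
	int i;

	for (i = 0; i < 8; i++)
		dev_submit(&q, i, &worker);
	/* crude for a sketch: only the most recently spawned worker is
	 * joined; any earlier worker has already retired by then
	 */
	pthread_join(worker, NULL);
	return 0;
}

The detach-then-process step keeps lock hold times short, and the running flag keeps a stream of producers from spawning one worker per item.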
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--	fs/btrfs/volumes.c	162
1 file changed, 157 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 722eb4550154..c57458ce6339 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "async-thread.h"
 
 struct map_lookup {
 	u64 type;
@@ -110,6 +111,101 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
 	return NULL;
 }
 
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device.  This greatly
+ * improves the scheduler's ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block.  The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested.  This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+int run_scheduled_bios(struct btrfs_device *device)
+{
+	struct bio *pending;
+	struct backing_dev_info *bdi;
+	struct bio *tail;
+	struct bio *cur;
+	int again = 0;
+	unsigned long num_run = 0;
+
+	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+loop:
+	spin_lock(&device->io_lock);
+
+	/* take all the bios off the list at once and process them
+	 * later on (without the lock held).  But, remember the
+	 * tail and other pointers so the bios can be properly reinserted
+	 * into the list if we hit congestion
+	 */
+	pending = device->pending_bios;
+	tail = device->pending_bio_tail;
+	WARN_ON(pending && !tail);
+	device->pending_bios = NULL;
+	device->pending_bio_tail = NULL;
+
+	/*
+	 * if pending was null this time around, no bios need processing
+	 * at all and we can stop.  Otherwise it'll loop back up again
+	 * and do an additional check so no bios are missed.
+	 *
+	 * device->running_pending is used to synchronize with the
+	 * schedule_bio code.
+	 */
+	if (pending) {
+		again = 1;
+		device->running_pending = 1;
+	} else {
+		again = 0;
+		device->running_pending = 0;
+	}
+	spin_unlock(&device->io_lock);
+
+	while (pending) {
+		cur = pending;
+		pending = pending->bi_next;
+		cur->bi_next = NULL;
+		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+		submit_bio(cur->bi_rw, cur);
+		num_run++;
+
+		/*
+		 * we made progress, there is more work to do and the bdi
+		 * is now congested.  Back off and let other work structs
+		 * run instead
+		 */
+		if (pending && num_run && bdi_write_congested(bdi)) {
+			struct bio *old_head;
+
+			spin_lock(&device->io_lock);
+			old_head = device->pending_bios;
+			device->pending_bios = pending;
+			if (device->pending_bio_tail)
+				tail->bi_next = old_head;
+			else
+				device->pending_bio_tail = tail;
+
+			spin_unlock(&device->io_lock);
+			btrfs_requeue_work(&device->work);
+			goto done;
+		}
+	}
+	if (again)
+		goto loop;
+done:
+	return 0;
+}
+
+void pending_bios_fn(struct btrfs_work *work)
+{
+	struct btrfs_device *device;
+
+	device = container_of(work, struct btrfs_device, work);
+	run_scheduled_bios(device);
+}
+
 static int device_list_add(const char *path,
 			   struct btrfs_super_block *disk_super,
 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
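The requeue path in run_scheduled_bios() above is the subtle part: bios may have been queued by schedule_bio() while the lock was dropped, and the unprocessed remainder must be spliced back in *front* of them so per-device ordering is preserved. Here is a toy standalone version of just that splice, with hypothetical nodes B..E standing in for bios (remainder B->C, newly queued D->E); none of these names come from the patch.

#include <stdio.h>

struct node { struct node *next; char name; };

/* mirrors: old_head = device->pending_bios; tail->bi_next = old_head */
static struct node *splice_front(struct node *remainder,
				 struct node *rem_tail,
				 struct node *newly_queued)
{
	rem_tail->next = newly_queued;
	return remainder;
}

int main(void)
{
	struct node e = { NULL, 'E' }, d = { &e, 'D' };	/* queued meanwhile */
	struct node c = { NULL, 'C' }, b = { &c, 'B' };	/* not yet submitted */
	struct node *head = splice_front(&b, &c, &d);

	for (; head; head = head->next)
		printf("%c ", head->name);	/* prints: B C D E */
	printf("\n");
	return 0;
}

Running it prints B C D E: the remainder keeps its place at the head of the list, matching tail->bi_next = old_head in the patch.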
@@ -141,6 +237,7 @@ static int device_list_add(const char *path,
 		return -ENOMEM;
 	}
 	device->devid = devid;
+	device->work.func = pending_bios_fn;
 	memcpy(device->uuid, disk_super->dev_item.uuid,
 	       BTRFS_UUID_SIZE);
 	device->barriers = 1;
@@ -925,6 +1022,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	device->barriers = 1;
+	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
 	spin_lock_init(&device->io_lock);
 	device->name = kstrdup(device_path, GFP_NOFS);
@@ -1965,8 +2063,61 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 }
 
+struct async_sched {
+	struct bio *bio;
+	int rw;
+	struct btrfs_fs_info *info;
+	struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
+		 int rw, struct bio *bio)
+{
+	int should_queue = 1;
+
+	/* don't bother with additional async steps for reads, right now */
+	if (!(rw & (1 << BIO_RW))) {
+		submit_bio(rw, bio);
+		return 0;
+	}
+
+	/*
+	 * nr_async_submits allows us to reliably return congestion to the
+	 * higher layers.  Otherwise, the async bio makes it appear we have
+	 * made progress against dirty pages when we've really just put it
+	 * on a queue for later
+	 */
+	atomic_inc(&root->fs_info->nr_async_submits);
+	bio->bi_next = NULL;
+	bio->bi_rw |= rw;
+
+	spin_lock(&device->io_lock);
+
+	if (device->pending_bio_tail)
+		device->pending_bio_tail->bi_next = bio;
+
+	device->pending_bio_tail = bio;
+	if (!device->pending_bios)
+		device->pending_bios = bio;
+	if (device->running_pending)
+		should_queue = 0;
+
+	spin_unlock(&device->io_lock);
+
+	if (should_queue)
+		btrfs_queue_worker(&root->fs_info->workers, &device->work);
+	return 0;
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
-		  int mirror_num)
+		  int mirror_num, int async_submit)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
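The atomic_inc of nr_async_submits in schedule_bio() is balanced by the atomic_dec in run_scheduled_bios(), so the counter tracks exactly the bios sitting on per-device pending lists. This hunk shows no reader of the counter in volumes.c; purely to illustrate the comment's point about returning congestion to higher layers, a backing_dev_info congested_fn callback could consult it along these lines. The function name and threshold below are assumptions for the sketch, not code from the patch.

/*
 * Illustrative sketch only -- not from this patch.  A bdi congested_fn
 * (signature: int (*congested_fn)(void *, int)) could report congestion
 * while too many async bios still wait on the per-device lists.
 */
static int example_congested_fn(void *congested_data, int bdi_bits)
{
	struct btrfs_fs_info *info = congested_data;

	/* the threshold of 64 is an arbitrary example value */
	if (atomic_read(&info->nr_async_submits) > 64)
		return 1;
	return 0;
}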
@@ -2012,10 +2163,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		dev = multi->stripes[dev_nr].dev;
 		if (dev && dev->bdev) {
 			bio->bi_bdev = dev->bdev;
-			spin_lock(&dev->io_lock);
-			dev->total_ios++;
-			spin_unlock(&dev->io_lock);
-			submit_bio(rw, bio);
+			if (async_submit)
+				schedule_bio(root, dev, rw, bio);
+			else
+				submit_bio(rw, bio);
 		} else {
 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 			bio->bi_sector = logical >> 9;
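With the signature change above, each caller of btrfs_map_bio() chooses per bio whether to take the async path. A hypothetical caller (submit_one_bio is an invented name, not from this patch) could simply pass 1 for the new flag and rely on schedule_bio()'s own read check as a safety net:

/* hypothetical caller -- not part of this patch */
static int submit_one_bio(struct btrfs_root *root, struct bio *bio,
			  int mirror_num)
{
	/* async_submit=1: writes are queued via schedule_bio(); reads
	 * fall through to a direct submit_bio() inside schedule_bio()
	 */
	return btrfs_map_bio(root, bio->bi_rw, bio, mirror_num, 1);
}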
@@ -2054,6 +2205,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
+	device->work.func = pending_bios_fn;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);