aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Chris Mason <chris.mason@oracle.com> 2009-04-20 15:50:09 -0400
committer: Chris Mason <chris.mason@oracle.com> 2009-04-20 15:53:08 -0400
commit: ffbd517d5a8c8e93ddd11046434fb029f3df73aa (patch)
tree: 9ec7b7f2efbb8950ca2654235a899398e82a68b5
parent: 0882e8dd3aad33eca41696d463bb896e6c8817eb (diff)
Btrfs: use WRITE_SYNC for synchronous writes

Part of reducing fsync/O_SYNC/O_DIRECT latencies is using WRITE_SYNC for writes we plan on waiting on in the near future. This patch mirrors recent changes in other filesystems and the generic code to use WRITE_SYNC when WB_SYNC_ALL is passed and to use WRITE_SYNC for other latency critical writes.

Btrfs uses async worker threads for checksumming before the write is done, and then again to actually submit the bios. The bio submission code just runs a per-device list of bios that need to be sent down the pipe.

This list is split into low priority and high priority lists so the WRITE_SYNC IO happens first.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

-rw-r--r-- fs/btrfs/disk-io.c | 4
-rw-r--r-- fs/btrfs/extent_io.c | 44
-rw-r--r-- fs/btrfs/ordered-data.c | 2
-rw-r--r-- fs/btrfs/volumes.c | 124
-rw-r--r-- fs/btrfs/volumes.h | 13
5 files changed, 141 insertions, 46 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92caa8035f36..fec18b43c2c3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2095,10 +2095,10 @@ static int write_dev_supers(struct btrfs_device *device,
2095 device->barriers = 0; 2095 device->barriers = 0;
2096 get_bh(bh); 2096 get_bh(bh);
2097 lock_buffer(bh); 2097 lock_buffer(bh);
2098 ret = submit_bh(WRITE, bh); 2098 ret = submit_bh(WRITE_SYNC, bh);
2099 } 2099 }
2100 } else { 2100 } else {
2101 ret = submit_bh(WRITE, bh); 2101 ret = submit_bh(WRITE_SYNC, bh);
2102 } 2102 }
2103 2103
2104 if (!ret && wait) { 2104 if (!ret && wait) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb2bee8b7fbf..483b6727aaaf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -50,7 +50,10 @@ struct extent_page_data {
50 /* tells writepage not to lock the state bits for this range 50 /* tells writepage not to lock the state bits for this range
51 * it still does the unlocking 51 * it still does the unlocking
52 */ 52 */
53 int extent_locked; 53 unsigned int extent_locked:1;
54
55 /* tells the submit_bio code to use a WRITE_SYNC */
56 unsigned int sync_io:1;
54}; 57};
55 58
56int __init extent_io_init(void) 59int __init extent_io_init(void)
@@ -2136,8 +2139,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 u64 delalloc_end; 2139 u64 delalloc_end;
2137 int page_started; 2140 int page_started;
2138 int compressed; 2141 int compressed;
2142 int write_flags;
2139 unsigned long nr_written = 0; 2143 unsigned long nr_written = 0;
2140 2144
2145 if (wbc->sync_mode == WB_SYNC_ALL)
2146 write_flags = WRITE_SYNC_PLUG;
2147 else
2148 write_flags = WRITE;
2149
2141 WARN_ON(!PageLocked(page)); 2150 WARN_ON(!PageLocked(page));
2142 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2151 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2143 if (page->index > end_index || 2152 if (page->index > end_index ||
@@ -2314,9 +2323,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2314 (unsigned long long)end); 2323 (unsigned long long)end);
2315 } 2324 }
2316 2325
2317 ret = submit_extent_page(WRITE, tree, page, sector, 2326 ret = submit_extent_page(write_flags, tree, page,
2318 iosize, pg_offset, bdev, 2327 sector, iosize, pg_offset,
2319 &epd->bio, max_nr, 2328 bdev, &epd->bio, max_nr,
2320 end_bio_extent_writepage, 2329 end_bio_extent_writepage,
2321 0, 0, 0); 2330 0, 0, 0);
2322 if (ret) 2331 if (ret)
@@ -2460,15 +2469,23 @@ retry:
2460 return ret; 2469 return ret;
2461} 2470}
2462 2471
2463static noinline void flush_write_bio(void *data) 2472static void flush_epd_write_bio(struct extent_page_data *epd)
2464{ 2473{
2465 struct extent_page_data *epd = data;
2466 if (epd->bio) { 2474 if (epd->bio) {
2467 submit_one_bio(WRITE, epd->bio, 0, 0); 2475 if (epd->sync_io)
2476 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2477 else
2478 submit_one_bio(WRITE, epd->bio, 0, 0);
2468 epd->bio = NULL; 2479 epd->bio = NULL;
2469 } 2480 }
2470} 2481}
2471 2482
2483static noinline void flush_write_bio(void *data)
2484{
2485 struct extent_page_data *epd = data;
2486 flush_epd_write_bio(epd);
2487}
2488
2472int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2489int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2473 get_extent_t *get_extent, 2490 get_extent_t *get_extent,
2474 struct writeback_control *wbc) 2491 struct writeback_control *wbc)
@@ -2480,6 +2497,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2480 .tree = tree, 2497 .tree = tree,
2481 .get_extent = get_extent, 2498 .get_extent = get_extent,
2482 .extent_locked = 0, 2499 .extent_locked = 0,
2500 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2483 }; 2501 };
2484 struct writeback_control wbc_writepages = { 2502 struct writeback_control wbc_writepages = {
2485 .bdi = wbc->bdi, 2503 .bdi = wbc->bdi,
@@ -2490,13 +2508,11 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2490 .range_end = (loff_t)-1, 2508 .range_end = (loff_t)-1,
2491 }; 2509 };
2492 2510
2493
2494 ret = __extent_writepage(page, wbc, &epd); 2511 ret = __extent_writepage(page, wbc, &epd);
2495 2512
2496 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2513 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2497 __extent_writepage, &epd, flush_write_bio); 2514 __extent_writepage, &epd, flush_write_bio);
2498 if (epd.bio) 2515 flush_epd_write_bio(&epd);
2499 submit_one_bio(WRITE, epd.bio, 0, 0);
2500 return ret; 2516 return ret;
2501} 2517}
2502 2518
@@ -2515,6 +2531,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2515 .tree = tree, 2531 .tree = tree,
2516 .get_extent = get_extent, 2532 .get_extent = get_extent,
2517 .extent_locked = 1, 2533 .extent_locked = 1,
2534 .sync_io = mode == WB_SYNC_ALL,
2518 }; 2535 };
2519 struct writeback_control wbc_writepages = { 2536 struct writeback_control wbc_writepages = {
2520 .bdi = inode->i_mapping->backing_dev_info, 2537 .bdi = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2557,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2540 start += PAGE_CACHE_SIZE; 2557 start += PAGE_CACHE_SIZE;
2541 } 2558 }
2542 2559
2543 if (epd.bio) 2560 flush_epd_write_bio(&epd);
2544 submit_one_bio(WRITE, epd.bio, 0, 0);
2545 return ret; 2561 return ret;
2546} 2562}
2547 2563
@@ -2556,13 +2572,13 @@ int extent_writepages(struct extent_io_tree *tree,
2556 .tree = tree, 2572 .tree = tree,
2557 .get_extent = get_extent, 2573 .get_extent = get_extent,
2558 .extent_locked = 0, 2574 .extent_locked = 0,
2575 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2559 }; 2576 };
2560 2577
2561 ret = extent_write_cache_pages(tree, mapping, wbc, 2578 ret = extent_write_cache_pages(tree, mapping, wbc,
2562 __extent_writepage, &epd, 2579 __extent_writepage, &epd,
2563 flush_write_bio); 2580 flush_write_bio);
2564 if (epd.bio) 2581 flush_epd_write_bio(&epd);
2565 submit_one_bio(WRITE, epd.bio, 0, 0);
2566 return ret; 2582 return ret;
2567} 2583}
2568 2584
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 53c87b197d70..d6f0806c682f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -489,7 +489,7 @@ again:
489 /* start IO across the range first to instantiate any delalloc 489 /* start IO across the range first to instantiate any delalloc
490 * extents 490 * extents
491 */ 491 */
492 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); 492 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
493 493
494 /* The compression code will leave pages locked but return from 494 /* The compression code will leave pages locked but return from
495 * writepage without setting the page writeback. Starting again 495 * writepage without setting the page writeback. Starting again
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0913e469728..e53835b88594 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
125 return NULL; 125 return NULL;
126} 126}
127 127
128static void requeue_list(struct btrfs_pending_bios *pending_bios,
129 struct bio *head, struct bio *tail)
130{
131
132 struct bio *old_head;
133
134 old_head = pending_bios->head;
135 pending_bios->head = head;
136 if (pending_bios->tail)
137 tail->bi_next = old_head;
138 else
139 pending_bios->tail = tail;
140}
141
128/* 142/*
129 * we try to collect pending bios for a device so we don't get a large 143 * we try to collect pending bios for a device so we don't get a large
130 * number of procs sending bios down to the same device. This greatly 144 * number of procs sending bios down to the same device. This greatly
@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
141 struct bio *pending; 155 struct bio *pending;
142 struct backing_dev_info *bdi; 156 struct backing_dev_info *bdi;
143 struct btrfs_fs_info *fs_info; 157 struct btrfs_fs_info *fs_info;
158 struct btrfs_pending_bios *pending_bios;
144 struct bio *tail; 159 struct bio *tail;
145 struct bio *cur; 160 struct bio *cur;
146 int again = 0; 161 int again = 0;
147 unsigned long num_run = 0; 162 unsigned long num_run;
163 unsigned long num_sync_run;
148 unsigned long limit; 164 unsigned long limit;
149 unsigned long last_waited = 0; 165 unsigned long last_waited = 0;
150 166
@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
153 limit = btrfs_async_submit_limit(fs_info); 169 limit = btrfs_async_submit_limit(fs_info);
154 limit = limit * 2 / 3; 170 limit = limit * 2 / 3;
155 171
172 /* we want to make sure that every time we switch from the sync
173 * list to the normal list, we unplug
174 */
175 num_sync_run = 0;
176
156loop: 177loop:
157 spin_lock(&device->io_lock); 178 spin_lock(&device->io_lock);
179 num_run = 0;
158 180
159loop_lock: 181loop_lock:
182
160 /* take all the bios off the list at once and process them 183 /* take all the bios off the list at once and process them
161 * later on (without the lock held). But, remember the 184 * later on (without the lock held). But, remember the
162 * tail and other pointers so the bios can be properly reinserted 185 * tail and other pointers so the bios can be properly reinserted
163 * into the list if we hit congestion 186 * into the list if we hit congestion
164 */ 187 */
165 pending = device->pending_bios; 188 if (device->pending_sync_bios.head)
166 tail = device->pending_bio_tail; 189 pending_bios = &device->pending_sync_bios;
190 else
191 pending_bios = &device->pending_bios;
192
193 pending = pending_bios->head;
194 tail = pending_bios->tail;
167 WARN_ON(pending && !tail); 195 WARN_ON(pending && !tail);
168 device->pending_bios = NULL;
169 device->pending_bio_tail = NULL;
170 196
171 /* 197 /*
172 * if pending was null this time around, no bios need processing 198 * if pending was null this time around, no bios need processing
@@ -176,16 +202,41 @@ loop_lock:
176 * device->running_pending is used to synchronize with the 202 * device->running_pending is used to synchronize with the
177 * schedule_bio code. 203 * schedule_bio code.
178 */ 204 */
179 if (pending) { 205 if (device->pending_sync_bios.head == NULL &&
180 again = 1; 206 device->pending_bios.head == NULL) {
181 device->running_pending = 1;
182 } else {
183 again = 0; 207 again = 0;
184 device->running_pending = 0; 208 device->running_pending = 0;
209 } else {
210 again = 1;
211 device->running_pending = 1;
185 } 212 }
213
214 pending_bios->head = NULL;
215 pending_bios->tail = NULL;
216
186 spin_unlock(&device->io_lock); 217 spin_unlock(&device->io_lock);
187 218
219 /*
220 * if we're doing the regular priority list, make sure we unplug
221 * for any high prio bios we've sent down
222 */
223 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
224 num_sync_run = 0;
225 blk_run_backing_dev(bdi, NULL);
226 }
227
188 while (pending) { 228 while (pending) {
229
230 rmb();
231 if (pending_bios != &device->pending_sync_bios &&
232 device->pending_sync_bios.head &&
233 num_run > 16) {
234 cond_resched();
235 spin_lock(&device->io_lock);
236 requeue_list(pending_bios, pending, tail);
237 goto loop_lock;
238 }
239
189 cur = pending; 240 cur = pending;
190 pending = pending->bi_next; 241 pending = pending->bi_next;
191 cur->bi_next = NULL; 242 cur->bi_next = NULL;
@@ -196,10 +247,18 @@ loop_lock:
196 wake_up(&fs_info->async_submit_wait); 247 wake_up(&fs_info->async_submit_wait);
197 248
198 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 249 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
199 bio_get(cur);
200 submit_bio(cur->bi_rw, cur); 250 submit_bio(cur->bi_rw, cur);
201 bio_put(cur);
202 num_run++; 251 num_run++;
252 if (bio_sync(cur))
253 num_sync_run++;
254
255 if (need_resched()) {
256 if (num_sync_run) {
257 blk_run_backing_dev(bdi, NULL);
258 num_sync_run = 0;
259 }
260 cond_resched();
261 }
203 262
204 /* 263 /*
205 * we made progress, there is more work to do and the bdi 264 * we made progress, there is more work to do and the bdi
@@ -208,7 +267,6 @@ loop_lock:
208 */ 267 */
209 if (pending && bdi_write_congested(bdi) && num_run > 16 && 268 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
210 fs_info->fs_devices->open_devices > 1) { 269 fs_info->fs_devices->open_devices > 1) {
211 struct bio *old_head;
212 struct io_context *ioc; 270 struct io_context *ioc;
213 271
214 ioc = current->io_context; 272 ioc = current->io_context;
@@ -233,17 +291,17 @@ loop_lock:
233 * against it before looping 291 * against it before looping
234 */ 292 */
235 last_waited = ioc->last_waited; 293 last_waited = ioc->last_waited;
294 if (need_resched()) {
295 if (num_sync_run) {
296 blk_run_backing_dev(bdi, NULL);
297 num_sync_run = 0;
298 }
299 cond_resched();
300 }
236 continue; 301 continue;
237 } 302 }
238 spin_lock(&device->io_lock); 303 spin_lock(&device->io_lock);
239 304 requeue_list(pending_bios, pending, tail);
240 old_head = device->pending_bios;
241 device->pending_bios = pending;
242 if (device->pending_bio_tail)
243 tail->bi_next = old_head;
244 else
245 device->pending_bio_tail = tail;
246
247 device->running_pending = 1; 305 device->running_pending = 1;
248 306
249 spin_unlock(&device->io_lock); 307 spin_unlock(&device->io_lock);
@@ -251,11 +309,18 @@ loop_lock:
251 goto done; 309 goto done;
252 } 310 }
253 } 311 }
312
313 if (num_sync_run) {
314 num_sync_run = 0;
315 blk_run_backing_dev(bdi, NULL);
316 }
317
318 cond_resched();
254 if (again) 319 if (again)
255 goto loop; 320 goto loop;
256 321
257 spin_lock(&device->io_lock); 322 spin_lock(&device->io_lock);
258 if (device->pending_bios) 323 if (device->pending_bios.head || device->pending_sync_bios.head)
259 goto loop_lock; 324 goto loop_lock;
260 spin_unlock(&device->io_lock); 325 spin_unlock(&device->io_lock);
261 326
@@ -2497,7 +2562,7 @@ again:
2497 max_errors = 1; 2562 max_errors = 1;
2498 } 2563 }
2499 } 2564 }
2500 if (multi_ret && rw == WRITE && 2565 if (multi_ret && (rw & (1 << BIO_RW)) &&
2501 stripes_allocated < stripes_required) { 2566 stripes_allocated < stripes_required) {
2502 stripes_allocated = map->num_stripes; 2567 stripes_allocated = map->num_stripes;
2503 free_extent_map(em); 2568 free_extent_map(em);
@@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2762 int rw, struct bio *bio) 2827 int rw, struct bio *bio)
2763{ 2828{
2764 int should_queue = 1; 2829 int should_queue = 1;
2830 struct btrfs_pending_bios *pending_bios;
2765 2831
2766 /* don't bother with additional async steps for reads, right now */ 2832 /* don't bother with additional async steps for reads, right now */
2767 if (!(rw & (1 << BIO_RW))) { 2833 if (!(rw & (1 << BIO_RW))) {
@@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
2783 bio->bi_rw |= rw; 2849 bio->bi_rw |= rw;
2784 2850
2785 spin_lock(&device->io_lock); 2851 spin_lock(&device->io_lock);
2852 if (bio_sync(bio))
2853 pending_bios = &device->pending_sync_bios;
2854 else
2855 pending_bios = &device->pending_bios;
2786 2856
2787 if (device->pending_bio_tail) 2857 if (pending_bios->tail)
2788 device->pending_bio_tail->bi_next = bio; 2858 pending_bios->tail->bi_next = bio;
2789 2859
2790 device->pending_bio_tail = bio; 2860 pending_bios->tail = bio;
2791 if (!device->pending_bios) 2861 if (!pending_bios->head)
2792 device->pending_bios = bio; 2862 pending_bios->head = bio;
2793 if (device->running_pending) 2863 if (device->running_pending)
2794 should_queue = 0; 2864 should_queue = 0;
2795 2865
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2185de72ff7d..5836327ba5dd 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
23#include "async-thread.h" 23#include "async-thread.h"
24 24
25struct buffer_head; 25struct buffer_head;
26struct btrfs_pending_bios {
27 struct bio *head;
28 struct bio *tail;
29};
30
26struct btrfs_device { 31struct btrfs_device {
27 struct list_head dev_list; 32 struct list_head dev_list;
28 struct list_head dev_alloc_list; 33 struct list_head dev_alloc_list;
29 struct btrfs_fs_devices *fs_devices; 34 struct btrfs_fs_devices *fs_devices;
30 struct btrfs_root *dev_root; 35 struct btrfs_root *dev_root;
31 struct bio *pending_bios; 36
32 struct bio *pending_bio_tail; 37 /* regular prio bios */
38 struct btrfs_pending_bios pending_bios;
39 /* WRITE_SYNC bios */
40 struct btrfs_pending_bios pending_sync_bios;
41
33 int running_pending; 42 int running_pending;
34 u64 generation; 43 u64 generation;
35 44