author		Linus Torvalds <torvalds@linux-foundation.org>	2009-04-21 17:12:58 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-04-21 17:12:58 -0400
commit		ccc5ff94c66e628d3c501b26ace5d4339667715d (patch)
tree		41ca2f1552864cc86bd5735c1b05d0de2898bb05 /fs
parent		c19c6c32dcccfc89216bd579c0cb12d2dd45098f (diff)
parent		546888da82082555a56528730a83f0afd12f33bf (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
Btrfs: fix btrfs fallocate oops and deadlock
Btrfs: use the right node in reada_for_balance
Btrfs: fix oops on page->mapping->host during writepage
Btrfs: add a priority queue to the async thread helpers
Btrfs: use WRITE_SYNC for synchronous writes
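
The common thread in the async-thread and WRITE_SYNC patches below is that
synchronous IO gets its own high-priority path: each worker keeps two pending
lists and always drains the high-priority one first. A minimal userspace
sketch of that dequeue policy (the types and names here are illustrative
stand-ins, not the kernel API):

    #include <stdio.h>

    /* illustrative work item; the kernel version is struct btrfs_work */
    struct work {
        int id;
        struct work *next;
    };

    struct worker {
        struct work *pending;      /* normal-priority FIFO */
        struct work *prio_pending; /* high-priority FIFO, drained first */
    };

    /* pop policy mirroring worker_loop(): prefer prio_pending over pending */
    static struct work *pop_work(struct worker *w)
    {
        struct work **list;

        if (w->prio_pending)
            list = &w->prio_pending;
        else if (w->pending)
            list = &w->pending;
        else
            return NULL; /* both lists empty: the worker may sleep */

        struct work *cur = *list;
        *list = cur->next;
        return cur;
    }

    int main(void)
    {
        struct work lo = { .id = 2, .next = NULL };
        struct work hi = { .id = 1, .next = NULL };
        struct worker w = { .pending = &lo, .prio_pending = &hi };
        struct work *cur;

        while ((cur = pop_work(&w)))
            printf("running work %d\n", cur->id); /* 1, then 2 */
        return 0;
    }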
Diffstat (limited to 'fs')
 fs/btrfs/async-thread.c |  60
 fs/btrfs/async-thread.h |   2
 fs/btrfs/ctree.c        |  17
 fs/btrfs/disk-io.c      |   9
 fs/btrfs/extent_io.c    |  86
 fs/btrfs/file.c         |   6
 fs/btrfs/inode.c        |  36
 fs/btrfs/ordered-data.c |   2
 fs/btrfs/volumes.c      | 124
 fs/btrfs/volumes.h      |  13
 10 files changed, 272 insertions(+), 83 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 51bfdfc8fcda..502c3d61de62 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -25,6 +25,7 @@
 #define WORK_QUEUED_BIT 0
 #define WORK_DONE_BIT 1
 #define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
 
 /*
  * container for the kthread task pointer and the list of pending work
@@ -36,6 +37,7 @@ struct btrfs_worker_thread {
 
 	/* list of struct btrfs_work that are waiting for service */
 	struct list_head pending;
+	struct list_head prio_pending;
 
 	/* list of worker threads from struct btrfs_workers */
 	struct list_head worker_list;
@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
 	spin_lock_irqsave(&workers->lock, flags);
 
-	while (!list_empty(&workers->order_list)) {
-		work = list_entry(workers->order_list.next,
-				  struct btrfs_work, order_list);
-
+	while (1) {
+		if (!list_empty(&workers->prio_order_list)) {
+			work = list_entry(workers->prio_order_list.next,
+					  struct btrfs_work, order_list);
+		} else if (!list_empty(&workers->order_list)) {
+			work = list_entry(workers->order_list.next,
+					  struct btrfs_work, order_list);
+		} else {
+			break;
+		}
 		if (!test_bit(WORK_DONE_BIT, &work->flags))
 			break;
 
@@ -143,8 +151,14 @@ static int worker_loop(void *arg)
 	do {
 		spin_lock_irq(&worker->lock);
 again_locked:
-		while (!list_empty(&worker->pending)) {
-			cur = worker->pending.next;
+		while (1) {
+			if (!list_empty(&worker->prio_pending))
+				cur = worker->prio_pending.next;
+			else if (!list_empty(&worker->pending))
+				cur = worker->pending.next;
+			else
+				break;
+
 			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
 			clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -163,7 +177,6 @@ again_locked:
 
 			spin_lock_irq(&worker->lock);
 			check_idle_worker(worker);
-
 		}
 		if (freezing(current)) {
 			worker->working = 0;
@@ -178,7 +191,8 @@ again_locked:
 			 * jump_in?
 			 */
 			smp_mb();
-			if (!list_empty(&worker->pending))
+			if (!list_empty(&worker->pending) ||
+			    !list_empty(&worker->prio_pending))
 				continue;
 
 			/*
@@ -191,7 +205,8 @@ again_locked:
 			 */
 			schedule_timeout(1);
 			smp_mb();
-			if (!list_empty(&worker->pending))
+			if (!list_empty(&worker->pending) ||
+			    !list_empty(&worker->prio_pending))
 				continue;
 
 			if (kthread_should_stop())
@@ -200,7 +215,8 @@ again_locked:
 		/* still no more work?, sleep for real */
 		spin_lock_irq(&worker->lock);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (!list_empty(&worker->pending))
+		if (!list_empty(&worker->pending) ||
+		    !list_empty(&worker->prio_pending))
 			goto again_locked;
 
 		/*
@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 	INIT_LIST_HEAD(&workers->worker_list);
 	INIT_LIST_HEAD(&workers->idle_list);
 	INIT_LIST_HEAD(&workers->order_list);
+	INIT_LIST_HEAD(&workers->prio_order_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
 	workers->idle_thresh = 32;
@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 	}
 
 	INIT_LIST_HEAD(&worker->pending);
+	INIT_LIST_HEAD(&worker->prio_pending);
 	INIT_LIST_HEAD(&worker->worker_list);
 	spin_lock_init(&worker->lock);
 	atomic_set(&worker->num_pending, 0);
@@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
 		goto out;
 
 	spin_lock_irqsave(&worker->lock, flags);
-	list_add_tail(&work->list, &worker->pending);
+	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+		list_add_tail(&work->list, &worker->prio_pending);
+	else
+		list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 
 	/* by definition we're busy, take ourselves off the idle
@@ -422,6 +443,11 @@ out:
 	return 0;
 }
 
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+	set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
+
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	worker = find_worker(workers);
 	if (workers->ordered) {
 		spin_lock_irqsave(&workers->lock, flags);
-		list_add_tail(&work->order_list, &workers->order_list);
+		if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+			list_add_tail(&work->order_list,
+				      &workers->prio_order_list);
+		} else {
+			list_add_tail(&work->order_list, &workers->order_list);
+		}
 		spin_unlock_irqrestore(&workers->lock, flags);
 	} else {
 		INIT_LIST_HEAD(&work->order_list);
@@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	spin_lock_irqsave(&worker->lock, flags);
 
-	list_add_tail(&work->list, &worker->pending);
+	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+		list_add_tail(&work->list, &worker->prio_pending);
+	else
+		list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 	check_busy_worker(worker);
 
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 31be4ed8b63e..1b511c109db6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,6 +85,7 @@ struct btrfs_workers {
 	 * of work items waiting for completion
 	 */
	struct list_head order_list;
+	struct list_head prio_order_list;
 
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
 int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
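
The new btrfs_set_work_high_prio() declared above is the whole opt-in
surface: a submitter sets one flag bit before queueing, and
btrfs_queue_worker()/btrfs_requeue_work() route the work item on it (the
disk-io.c hunk below does exactly this for sync bios). A self-contained
sketch of that routing, with stand-in types for the kernel's
set_bit()/test_bit() on work->flags:

    #include <stdio.h>

    #define WORK_HIGH_PRIO_BIT 3   /* same bit number the patch adds */

    struct work_sketch {
        unsigned long flags;
    };

    /* like btrfs_set_work_high_prio(): flag the work before queueing it */
    static void set_work_high_prio(struct work_sketch *w)
    {
        w->flags |= 1UL << WORK_HIGH_PRIO_BIT;
    }

    /* like the test in btrfs_queue_worker(): the bit picks the list */
    static const char *target_list(const struct work_sketch *w)
    {
        return (w->flags & (1UL << WORK_HIGH_PRIO_BIT)) ?
            "prio_pending" : "pending";
    }

    int main(void)
    {
        struct work_sketch w = { 0 };

        printf("%s\n", target_list(&w));  /* pending */
        set_work_high_prio(&w);
        printf("%s\n", target_list(&w));  /* prio_pending */
        return 0;
    }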
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e5b2533b691a..a99f1c2a710d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	int ret = 0;
 	int blocksize;
 
-	parent = path->nodes[level - 1];
+	parent = path->nodes[level + 1];
 	if (!parent)
 		return 0;
 
 	nritems = btrfs_header_nritems(parent);
-	slot = path->slots[level];
+	slot = path->slots[level + 1];
 	blocksize = btrfs_level_size(root, level);
 
 	if (slot > 0) {
@@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 		block1 = 0;
 		free_extent_buffer(eb);
 	}
-	if (slot < nritems) {
+	if (slot + 1 < nritems) {
 		block2 = btrfs_node_blockptr(parent, slot + 1);
 		gen = btrfs_node_ptr_generation(parent, slot + 1);
 		eb = btrfs_find_tree_block(root, block2, blocksize);
@@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	}
 	if (block1 || block2) {
 		ret = -EAGAIN;
+
+		/* release the whole path */
 		btrfs_release_path(root, path);
+
+		/* read the blocks */
 		if (block1)
 			readahead_tree_block(root, block1, blocksize, 0);
 		if (block2)
@@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 			eb = read_tree_block(root, block1, blocksize, 0);
 			free_extent_buffer(eb);
 		}
-		if (block1) {
+		if (block2) {
 			eb = read_tree_block(root, block2, blocksize, 0);
 			free_extent_buffer(eb);
 		}
@@ -1481,12 +1485,15 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 		 * of the btree by dropping locks before
 		 * we read.
 		 */
-		btrfs_release_path(NULL, p);
+		btrfs_unlock_up_safe(p, level + 1);
+		btrfs_set_path_blocking(p);
+
 		if (tmp)
 			free_extent_buffer(tmp);
 		if (p->reada)
 			reada_for_search(root, p, level, slot, key->objectid);
 
+		btrfs_release_path(NULL, p);
 		tmp = read_tree_block(root, blocknr, blocksize, gen);
 		if (tmp)
 			free_extent_buffer(tmp);
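
The reada_for_balance() fixes are all indexing: in a btrfs_path, nodes[0] is
the leaf and higher indexes sit closer to the root, so the parent of the node
being balanced at `level` is nodes[level + 1], and the readahead candidates
are the slots on either side of slots[level + 1]. A sketch of the corrected
sibling selection, using simplified stand-in types rather than the kernel
structs:

    #include <stdio.h>

    /* simplified stand-in for struct btrfs_path */
    struct path_sketch {
        int slots[8];   /* slots[n]: index within nodes[n] */
    };

    /*
     * pick the readahead candidates next to our slot in the PARENT,
     * mirroring the fixed reada_for_balance(): index with level + 1,
     * guard both ends, and gate the right sibling on its own pointer.
     */
    static void pick_siblings(const struct path_sketch *p, int level,
                              int nritems, int *left, int *right)
    {
        int slot = p->slots[level + 1];   /* position in the parent */

        *left  = (slot > 0) ? slot - 1 : -1;           /* block1 */
        *right = (slot + 1 < nritems) ? slot + 1 : -1; /* block2 */
    }

    int main(void)
    {
        struct path_sketch p = { .slots = { 0, 5, 2 } };
        int left, right;

        pick_siblings(&p, 1, 8, &left, &right); /* parent is level 2 */
        printf("left %d right %d\n", left, right); /* left 1 right 3 */
        return 0;
    }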
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92caa8035f36..a6b83744b05d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->bio_flags = bio_flags;
 
 	atomic_inc(&fs_info->nr_async_submits);
+
+	if (rw & (1 << BIO_RW_SYNCIO))
+		btrfs_set_work_high_prio(&async->work);
+
 	btrfs_queue_worker(&fs_info->workers, &async->work);
 #if 0
 	int limit = btrfs_async_submit_limit(fs_info);
@@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 0);
 	}
+
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
@@ -2095,10 +2100,10 @@ static int write_dev_supers(struct btrfs_device *device,
 			device->barriers = 0;
 			get_bh(bh);
 			lock_buffer(bh);
-			ret = submit_bh(WRITE, bh);
+			ret = submit_bh(WRITE_SYNC, bh);
 		}
 	} else {
-		ret = submit_bh(WRITE, bh);
+		ret = submit_bh(WRITE_SYNC, bh);
 	}
 
 	if (!ret && wait) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb2bee8b7fbf..05a1c42e25bf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -50,7 +50,10 @@ struct extent_page_data {
 	/* tells writepage not to lock the state bits for this range
 	 * it still does the unlocking
 	 */
-	int extent_locked;
+	unsigned int extent_locked:1;
+
+	/* tells the submit_bio code to use a WRITE_SYNC */
+	unsigned int sync_io:1;
 };
 
 int __init extent_io_init(void)
@@ -2101,6 +2104,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	return ret;
 }
 
+static noinline void update_nr_written(struct page *page,
+				       struct writeback_control *wbc,
+				       unsigned long nr_written)
+{
+	wbc->nr_to_write -= nr_written;
+	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+		page->mapping->writeback_index = page->index + nr_written;
+}
+
 /*
  * the writepage semantics are similar to regular writepage.  extent
  * records are inserted to lock ranges in the tree, and as dirty areas
@@ -2136,8 +2149,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 delalloc_end;
 	int page_started;
 	int compressed;
+	int write_flags;
 	unsigned long nr_written = 0;
 
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		write_flags = WRITE_SYNC_PLUG;
+	else
+		write_flags = WRITE;
+
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
@@ -2164,6 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_end = 0;
 	page_started = 0;
 	if (!epd->extent_locked) {
+		/*
+		 * make sure the wbc mapping index is at least updated
+		 * to this page.
+		 */
+		update_nr_written(page, wbc, 0);
+
 		while (delalloc_end < page_end) {
 			nr_delalloc = find_lock_delalloc_range(inode, tree,
 							       page,
@@ -2185,7 +2210,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		 */
 		if (page_started) {
 			ret = 0;
-			goto update_nr_written;
+			/*
+			 * we've unlocked the page, so we can't update
+			 * the mapping's writeback index, just update
+			 * nr_to_write.
+			 */
+			wbc->nr_to_write -= nr_written;
+			goto done_unlocked;
 		}
 	}
 	lock_extent(tree, start, page_end, GFP_NOFS);
@@ -2198,13 +2229,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		if (ret == -EAGAIN) {
 			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
+			update_nr_written(page, wbc, nr_written);
 			unlock_page(page);
 			ret = 0;
-			goto update_nr_written;
+			goto done_unlocked;
 		}
 	}
 
-	nr_written++;
+	/*
+	 * we don't want to touch the inode after unlocking the page,
+	 * so we update the mapping writeback index now
+	 */
+	update_nr_written(page, wbc, nr_written + 1);
 
 	end = page_end;
 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
@@ -2314,9 +2350,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 				       (unsigned long long)end);
 		}
 
-		ret = submit_extent_page(WRITE, tree, page, sector,
-					 iosize, pg_offset, bdev,
-					 &epd->bio, max_nr,
+		ret = submit_extent_page(write_flags, tree, page,
+					 sector, iosize, pg_offset,
+					 bdev, &epd->bio, max_nr,
 					 end_bio_extent_writepage,
 					 0, 0, 0);
 		if (ret)
@@ -2336,11 +2372,8 @@ done:
 		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 	unlock_page(page);
 
-update_nr_written:
-	wbc->nr_to_write -= nr_written;
-	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
-	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
-		page->mapping->writeback_index = page->index + nr_written;
+done_unlocked:
+
 	return 0;
 }
 
@@ -2460,15 +2493,23 @@ retry:
 	return ret;
 }
 
-static noinline void flush_write_bio(void *data)
+static void flush_epd_write_bio(struct extent_page_data *epd)
 {
-	struct extent_page_data *epd = data;
 	if (epd->bio) {
-		submit_one_bio(WRITE, epd->bio, 0, 0);
+		if (epd->sync_io)
+			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+		else
+			submit_one_bio(WRITE, epd->bio, 0, 0);
 		epd->bio = NULL;
 	}
 }
 
+static noinline void flush_write_bio(void *data)
+{
+	struct extent_page_data *epd = data;
+	flush_epd_write_bio(epd);
+}
+
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
 			  struct writeback_control *wbc)
@@ -2480,23 +2521,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.tree = tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
 		.bdi = wbc->bdi,
-		.sync_mode = WB_SYNC_NONE,
+		.sync_mode = wbc->sync_mode,
 		.older_than_this = NULL,
 		.nr_to_write = 64,
 		.range_start = page_offset(page) + PAGE_CACHE_SIZE,
 		.range_end = (loff_t)-1,
 	};
 
-
 	ret = __extent_writepage(page, wbc, &epd);
 
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd, flush_write_bio);
-	if (epd.bio)
-		submit_one_bio(WRITE, epd.bio, 0, 0);
+	flush_epd_write_bio(&epd);
 	return ret;
 }
 
@@ -2515,6 +2555,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.tree = tree,
 		.get_extent = get_extent,
 		.extent_locked = 1,
+		.sync_io = mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
 		.bdi = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2581,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		start += PAGE_CACHE_SIZE;
 	}
 
-	if (epd.bio)
-		submit_one_bio(WRITE, epd.bio, 0, 0);
+	flush_epd_write_bio(&epd);
 	return ret;
 }
 
@@ -2556,13 +2596,13 @@ int extent_writepages(struct extent_io_tree *tree,
 		.tree = tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd,
 				       flush_write_bio);
-	if (epd.bio)
-		submit_one_bio(WRITE, epd.bio, 0, 0);
+	flush_epd_write_bio(&epd);
 	return ret;
 }
 
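
The writeback changes above follow one rule: decide synchronicity once, from
wbc->sync_mode, carry it in extent_page_data.sync_io, and pick the request
flag only when the accumulated bio is flushed. A condensed userspace sketch
of that flush-time dispatch (the flag values and printout are stand-ins for
the kernel's WRITE/WRITE_SYNC submission):

    #include <stdbool.h>
    #include <stdio.h>

    /* stand-ins for the kernel's request flags */
    enum wflag { SKETCH_WRITE, SKETCH_WRITE_SYNC };

    struct epd_sketch {
        bool sync_io;   /* set once from wbc->sync_mode == WB_SYNC_ALL */
        int  bio;       /* pretend accumulated bio; 0 means none */
    };

    /* mirrors flush_epd_write_bio(): the flag is chosen at flush time */
    static void flush_sketch(struct epd_sketch *epd)
    {
        if (!epd->bio)
            return;
        enum wflag f = epd->sync_io ? SKETCH_WRITE_SYNC : SKETCH_WRITE;
        printf("submit_one_bio(%s)\n",
               f == SKETCH_WRITE_SYNC ? "WRITE_SYNC" : "WRITE");
        epd->bio = 0;
    }

    int main(void)
    {
        struct epd_sketch a = { .sync_io = false, .bio = 1 };
        struct epd_sketch b = { .sync_io = true,  .bio = 1 };

        flush_sketch(&a);   /* submit_one_bio(WRITE) */
        flush_sketch(&b);   /* submit_one_bio(WRITE_SYNC) */
        return 0;
    }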
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c9fb46ccd08..482f8db2cfd0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -830,7 +830,7 @@ again:
 
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 		BUG_ON(ret);
-		goto done;
+		goto release;
 	} else if (split == start) {
 		if (locked_end < extent_end) {
 			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
@@ -926,6 +926,8 @@ again:
 	}
 done:
 	btrfs_mark_buffer_dirty(leaf);
+
+release:
 	btrfs_release_path(root, path);
 	if (split_end && split == start) {
 		split = end;
@@ -1131,7 +1133,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		if (will_write) {
 			btrfs_fdatawrite_range(inode->i_mapping, pos,
 					       pos + write_bytes - 1,
-					       WB_SYNC_NONE);
+					       WB_SYNC_ALL);
 		} else {
 			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
 							   num_pages);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0d1dd492a58..65219f6a16a1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4970,10 +4970,10 @@ out_fail:
 	return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+static int prealloc_file_range(struct btrfs_trans_handle *trans,
+			       struct inode *inode, u64 start, u64 end,
 			       u64 alloc_hint, int mode)
 {
-	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 alloc_size;
@@ -4981,10 +4981,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 	u64 num_bytes = end - start;
 	int ret = 0;
 
-	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(!trans);
-	btrfs_set_trans_block_group(trans, inode);
-
 	while (num_bytes > 0) {
 		alloc_size = min(num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, alloc_size,
@@ -5015,7 +5011,6 @@ out:
 		BUG_ON(ret);
 	}
 
-	btrfs_end_transaction(trans, root);
 	return ret;
 }
 
@@ -5029,11 +5024,18 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	u64 alloc_hint = 0;
 	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 	struct extent_map *em;
+	struct btrfs_trans_handle *trans;
 	int ret;
 
 	alloc_start = offset & ~mask;
 	alloc_end = (offset + len + mask) & ~mask;
 
+	/*
+	 * wait for ordered IO before we have any locks.  We'll loop again
+	 * below with the locks held.
+	 */
+	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
 	mutex_lock(&inode->i_mutex);
 	if (alloc_start > inode->i_size) {
 		ret = btrfs_cont_expand(inode, alloc_start);
@@ -5043,6 +5045,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
+
+		trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+		if (!trans) {
+			ret = -EIO;
+			goto out;
+		}
+
+		/* the extent lock is ordered inside the running
+		 * transaction
+		 */
 		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
 			    alloc_end - 1, GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
@@ -5053,6 +5065,12 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			btrfs_put_ordered_extent(ordered);
 			unlock_extent(&BTRFS_I(inode)->io_tree,
 				      alloc_start, alloc_end - 1, GFP_NOFS);
+			btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+
+			/*
+			 * we can't wait on the range with the transaction
+			 * running or with the extent lock held
+			 */
 			btrfs_wait_ordered_range(inode, alloc_start,
 						 alloc_end - alloc_start);
 		} else {
@@ -5070,7 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		last_byte = min(extent_map_end(em), alloc_end);
 		last_byte = (last_byte + mask) & ~mask;
 		if (em->block_start == EXTENT_MAP_HOLE) {
-			ret = prealloc_file_range(inode, cur_offset,
+			ret = prealloc_file_range(trans, inode, cur_offset,
 						  last_byte, alloc_hint, mode);
 			if (ret < 0) {
 				free_extent_map(em);
@@ -5089,6 +5107,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	}
 	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
 		      GFP_NOFS);
+
+	btrfs_end_transaction(trans, BTRFS_I(inode)->root);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
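
The fallocate fix above is a lock-ordering change: wait for ordered IO before
taking anything, then always start the transaction before taking the extent
lock, and drop both in reverse order before waiting again. A compilable
sketch of the loop's shape, with stub helpers standing in for the btrfs
calls:

    /* stubs standing in for the btrfs helpers; illustrative only */
    static void wait_ordered_range(void)    { }
    static void start_transaction(void)     { }
    static void end_transaction(void)       { }
    static void lock_extent_range(void)     { }
    static void unlock_extent_range(void)   { }
    static int  range_has_ordered_io(void)  { return 0; }

    /* shape of the fixed btrfs_fallocate() locking */
    static void fallocate_sketch(void)
    {
        wait_ordered_range();         /* no locks held yet */

        for (;;) {
            start_transaction();      /* 1: transaction first */
            lock_extent_range();      /* 2: extent lock nests inside */
            if (!range_has_ordered_io())
                break;                /* safe to preallocate */

            /* drop both, in reverse order, before waiting */
            unlock_extent_range();
            end_transaction();
            wait_ordered_range();
        }
        /* ... prealloc_file_range(trans, ...) runs here ... */
        unlock_extent_range();
        end_transaction();
    }

    int main(void) { fallocate_sketch(); return 0; }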
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 53c87b197d70..d6f0806c682f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -489,7 +489,7 @@ again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
 
 	/* The compression code will leave pages locked but return from
 	 * writepage without setting the page writeback.  Starting again
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0913e469728..e53835b88594 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 	return NULL;
 }
 
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+			 struct bio *head, struct bio *tail)
+{
+
+	struct bio *old_head;
+
+	old_head = pending_bios->head;
+	pending_bios->head = head;
+	if (pending_bios->tail)
+		tail->bi_next = old_head;
+	else
+		pending_bios->tail = tail;
+}
+
 /*
  * we try to collect pending bios for a device so we don't get a large
  * number of procs sending bios down to the same device.  This greatly
@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	struct bio *pending;
 	struct backing_dev_info *bdi;
 	struct btrfs_fs_info *fs_info;
+	struct btrfs_pending_bios *pending_bios;
 	struct bio *tail;
 	struct bio *cur;
 	int again = 0;
-	unsigned long num_run = 0;
+	unsigned long num_run;
+	unsigned long num_sync_run;
 	unsigned long limit;
 	unsigned long last_waited = 0;
 
@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
+	/* we want to make sure that every time we switch from the sync
+	 * list to the normal list, we unplug
+	 */
+	num_sync_run = 0;
+
 loop:
 	spin_lock(&device->io_lock);
+	num_run = 0;
 
 loop_lock:
+
 	/* take all the bios off the list at once and process them
 	 * later on (without the lock held).  But, remember the
 	 * tail and other pointers so the bios can be properly reinserted
 	 * into the list if we hit congestion
 	 */
-	pending = device->pending_bios;
-	tail = device->pending_bio_tail;
+	if (device->pending_sync_bios.head)
+		pending_bios = &device->pending_sync_bios;
+	else
+		pending_bios = &device->pending_bios;
+
+	pending = pending_bios->head;
+	tail = pending_bios->tail;
 	WARN_ON(pending && !tail);
-	device->pending_bios = NULL;
-	device->pending_bio_tail = NULL;
 
 	/*
 	 * if pending was null this time around, no bios need processing
@@ -176,16 +202,41 @@ loop_lock:
 	 * device->running_pending is used to synchronize with the
 	 * schedule_bio code.
 	 */
-	if (pending) {
-		again = 1;
-		device->running_pending = 1;
-	} else {
+	if (device->pending_sync_bios.head == NULL &&
+	    device->pending_bios.head == NULL) {
 		again = 0;
 		device->running_pending = 0;
+	} else {
+		again = 1;
+		device->running_pending = 1;
 	}
+
+	pending_bios->head = NULL;
+	pending_bios->tail = NULL;
+
 	spin_unlock(&device->io_lock);
 
+	/*
+	 * if we're doing the regular priority list, make sure we unplug
+	 * for any high prio bios we've sent down
+	 */
+	if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+		num_sync_run = 0;
+		blk_run_backing_dev(bdi, NULL);
+	}
+
 	while (pending) {
+
+		rmb();
+		if (pending_bios != &device->pending_sync_bios &&
+		    device->pending_sync_bios.head &&
+		    num_run > 16) {
+			cond_resched();
+			spin_lock(&device->io_lock);
+			requeue_list(pending_bios, pending, tail);
+			goto loop_lock;
+		}
+
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
@@ -196,10 +247,18 @@ loop_lock:
 			wake_up(&fs_info->async_submit_wait);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-		bio_get(cur);
 		submit_bio(cur->bi_rw, cur);
-		bio_put(cur);
 		num_run++;
+		if (bio_sync(cur))
+			num_sync_run++;
+
+		if (need_resched()) {
+			if (num_sync_run) {
+				blk_run_backing_dev(bdi, NULL);
+				num_sync_run = 0;
+			}
+			cond_resched();
+		}
 
 		/*
 		 * we made progress, there is more work to do and the bdi
@@ -208,7 +267,6 @@ loop_lock:
 		 */
 		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
 		    fs_info->fs_devices->open_devices > 1) {
-			struct bio *old_head;
 			struct io_context *ioc;
 
 			ioc = current->io_context;
@@ -233,17 +291,17 @@ loop_lock:
 			 * against it before looping
 			 */
 			last_waited = ioc->last_waited;
+			if (need_resched()) {
+				if (num_sync_run) {
+					blk_run_backing_dev(bdi, NULL);
+					num_sync_run = 0;
+				}
+				cond_resched();
+			}
 			continue;
 		}
 		spin_lock(&device->io_lock);
-
-		old_head = device->pending_bios;
-		device->pending_bios = pending;
-		if (device->pending_bio_tail)
-			tail->bi_next = old_head;
-		else
-			device->pending_bio_tail = tail;
-
+		requeue_list(pending_bios, pending, tail);
 		device->running_pending = 1;
 
 		spin_unlock(&device->io_lock);
@@ -251,11 +309,18 @@ loop_lock:
 			goto done;
 		}
 	}
+
+	if (num_sync_run) {
+		num_sync_run = 0;
+		blk_run_backing_dev(bdi, NULL);
+	}
+
+	cond_resched();
 	if (again)
 		goto loop;
 
 	spin_lock(&device->io_lock);
-	if (device->pending_bios)
+	if (device->pending_bios.head || device->pending_sync_bios.head)
 		goto loop_lock;
 	spin_unlock(&device->io_lock);
 
@@ -2497,7 +2562,7 @@ again:
 			max_errors = 1;
 		}
 	}
-	if (multi_ret && rw == WRITE &&
+	if (multi_ret && (rw & (1 << BIO_RW)) &&
 	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
@@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 			      int rw, struct bio *bio)
 {
 	int should_queue = 1;
+	struct btrfs_pending_bios *pending_bios;
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
@@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
+	if (bio_sync(bio))
+		pending_bios = &device->pending_sync_bios;
+	else
+		pending_bios = &device->pending_bios;
 
-	if (device->pending_bio_tail)
-		device->pending_bio_tail->bi_next = bio;
+	if (pending_bios->tail)
+		pending_bios->tail->bi_next = bio;
 
-	device->pending_bio_tail = bio;
-	if (!device->pending_bios)
-		device->pending_bios = bio;
+	pending_bios->tail = bio;
+	if (!pending_bios->head)
+		pending_bios->head = bio;
 	if (device->running_pending)
 		should_queue = 0;
 
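
requeue_list() is what keeps ordering sane when the drain loop bails out: the
un-run remainder is spliced back onto the front of its queue, so bios that
were already dequeued stay ahead of anything schedule_bio() appended in the
meantime. The same head/tail splice on a simplified node type (illustrative,
not the kernel structs):

    #include <assert.h>
    #include <stddef.h>

    struct node { struct node *next; };

    struct pending {            /* like struct btrfs_pending_bios */
        struct node *head;
        struct node *tail;
    };

    /* splice the chain [head..tail] back onto the FRONT of the queue */
    static void requeue(struct pending *q, struct node *head,
                        struct node *tail)
    {
        struct node *old_head = q->head;

        q->head = head;
        if (q->tail)
            tail->next = old_head; /* queue grew while we were running */
        else
            q->tail = tail;        /* queue was empty: we are now all of it */
    }

    int main(void)
    {
        struct node a = { NULL }, b = { NULL }, newer = { NULL };
        struct pending q = { .head = &newer, .tail = &newer };

        a.next = &b;
        requeue(&q, &a, &b);       /* a and b will run before newer */
        assert(q.head == &a && a.next == &b && b.next == &newer);
        return 0;
    }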
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2185de72ff7d..5836327ba5dd 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
 #include "async-thread.h"
 
 struct buffer_head;
+struct btrfs_pending_bios {
+	struct bio *head;
+	struct bio *tail;
+};
+
 struct btrfs_device {
 	struct list_head dev_list;
 	struct list_head dev_alloc_list;
 	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_root *dev_root;
-	struct bio *pending_bios;
-	struct bio *pending_bio_tail;
+
+	/* regular prio bios */
+	struct btrfs_pending_bios pending_bios;
+	/* WRITE_SYNC bios */
+	struct btrfs_pending_bios pending_sync_bios;
+
 	int running_pending;
 	u64 generation;
 