author    Chris Mason <chris.mason@oracle.com>  2008-11-06 22:02:51 -0500
committer Chris Mason <chris.mason@oracle.com>  2008-11-06 22:02:51 -0500
commit    771ed689d2cd53439e28e095bc38fbe40a71429e (patch)
tree      518801f7141928e398d40c2b5955720d4346ce1a /fs
parent    4a69a41009c4ac691f7d9c289f5f37fabeddce46 (diff)
Btrfs: Optimize compressed writeback and reads
When reading compressed extents, try to put pages into the page cache for any
pages covered by the compressed extent that readpages didn't already preload.

Add an async work queue to handle transformations at delayed allocation
processing time.  Right now this is just compression.  The workflow is:

1) Find offsets in the file marked for delayed allocation
2) Lock the pages
3) Lock the state bits
4) Call the async delalloc code

The async delalloc code clears the state lock bits and delalloc bits.  It is
important this happens before the range goes into the work queue because
otherwise it might deadlock with other work queue items that try to lock those
extent bits.

The file pages are compressed, and if the compression doesn't work the pages
are written back directly.

An ordered work queue is used to make sure the inodes are written in the same
order that pdflush or writepages sent them down.

This changes extent_write_cache_pages to let the writepage function update the
wbc nr_written count.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
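[Editor's note] For orientation, the sketch below restates the four-step workflow above in simplified C.  It is not part of the patch, and the helper names used here (find_next_delalloc_range, lock_delalloc_pages, lock_delalloc_state, clear_delalloc_state, queue_ordered_delalloc_work) are hypothetical stand-ins; the real work in this patch is done by cow_file_range_async(), async_cow_start() and async_cow_submit() in fs/btrfs/inode.c further down.

/*
 * Illustrative sketch only, not part of the patch.  All helpers named
 * here are hypothetical stand-ins for the code added in fs/btrfs/inode.c.
 */
static void delalloc_writeback_sketch(struct inode *inode, u64 start, u64 end)
{
	u64 range_start, range_end;

	/* 1) find offsets in the file marked for delayed allocation */
	while (find_next_delalloc_range(inode, start, end,
					&range_start, &range_end)) {
		/* 2) lock the pages and 3) lock the extent state bits */
		lock_delalloc_pages(inode, range_start, range_end);
		lock_delalloc_state(inode, range_start, range_end);

		/*
		 * clear the state lock and delalloc bits before queueing,
		 * so other queued work items can't deadlock trying to
		 * lock the same extent bits
		 */
		clear_delalloc_state(inode, range_start, range_end);

		/*
		 * 4) hand the range to the async delalloc code.  The queue
		 * is ordered, so inodes are submitted in the order pdflush
		 * or writepages sent them down; the worker compresses the
		 * pages, or falls back to plain writeback if compression
		 * doesn't help.
		 */
		queue_ordered_delalloc_work(inode, range_start, range_end);

		start = range_end + 1;
	}
}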
Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/compression.c   | 150
-rw-r--r--  fs/btrfs/ctree.h         |   4
-rw-r--r--  fs/btrfs/disk-io.c       |  27
-rw-r--r--  fs/btrfs/extent-tree.c   |   6
-rw-r--r--  fs/btrfs/extent_io.c     | 140
-rw-r--r--  fs/btrfs/extent_io.h     |  13
-rw-r--r--  fs/btrfs/file.c          |  53
-rw-r--r--  fs/btrfs/inode.c         | 643
-rw-r--r--  fs/btrfs/ordered-data.c  |  13
-rw-r--r--  fs/btrfs/super.c         |   4
-rw-r--r--  fs/btrfs/zlib.c          |   3
11 files changed, 849 insertions(+), 207 deletions(-)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 354913177ba6..284f21025bcc 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -33,6 +33,7 @@
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h> 34#include <linux/bit_spinlock.h>
35#include <linux/version.h> 35#include <linux/version.h>
36#include <linux/pagevec.h>
36#include "ctree.h" 37#include "ctree.h"
37#include "disk-io.h" 38#include "disk-io.h"
38#include "transaction.h" 39#include "transaction.h"
@@ -145,9 +146,9 @@ static void end_compressed_bio_read(struct bio *bio, int err)
145 } 146 }
146 147
147 /* do io completion on the original bio */ 148 /* do io completion on the original bio */
148 if (cb->errors) 149 if (cb->errors) {
149 bio_io_error(cb->orig_bio); 150 bio_io_error(cb->orig_bio);
150 else 151 } else
151 bio_endio(cb->orig_bio, 0); 152 bio_endio(cb->orig_bio, 0);
152 153
153 /* finally free the cb struct */ 154 /* finally free the cb struct */
@@ -333,6 +334,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
333 } 334 }
334 bytes_left -= PAGE_CACHE_SIZE; 335 bytes_left -= PAGE_CACHE_SIZE;
335 first_byte += PAGE_CACHE_SIZE; 336 first_byte += PAGE_CACHE_SIZE;
337 cond_resched();
336 } 338 }
337 bio_get(bio); 339 bio_get(bio);
338 340
@@ -346,6 +348,130 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
346 return 0; 348 return 0;
347} 349}
348 350
351static noinline int add_ra_bio_pages(struct inode *inode,
352 u64 compressed_end,
353 struct compressed_bio *cb)
354{
355 unsigned long end_index;
356 unsigned long page_index;
357 u64 last_offset;
358 u64 isize = i_size_read(inode);
359 int ret;
360 struct page *page;
361 unsigned long nr_pages = 0;
362 struct extent_map *em;
363 struct address_space *mapping = inode->i_mapping;
364 struct pagevec pvec;
365 struct extent_map_tree *em_tree;
366 struct extent_io_tree *tree;
367 u64 end;
368 int misses = 0;
369
370 page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
371 last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
372 em_tree = &BTRFS_I(inode)->extent_tree;
373 tree = &BTRFS_I(inode)->io_tree;
374
375 if (isize == 0)
376 return 0;
377
378 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
379
380 pagevec_init(&pvec, 0);
381 while(last_offset < compressed_end) {
382 page_index = last_offset >> PAGE_CACHE_SHIFT;
383
384 if (page_index > end_index)
385 break;
386
387 rcu_read_lock();
388 page = radix_tree_lookup(&mapping->page_tree, page_index);
389 rcu_read_unlock();
390 if (page) {
391 misses++;
392 if (misses > 4)
393 break;
394 goto next;
395 }
396
397 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
398 if (!page)
399 break;
400
401 page->index = page_index;
402 /*
403 * what we want to do here is call add_to_page_cache_lru,
404 * but that isn't exported, so we reproduce it here
405 */
406 if (add_to_page_cache(page, mapping,
407 page->index, GFP_NOFS)) {
408 page_cache_release(page);
409 goto next;
410 }
411
412 /* open coding of lru_cache_add, also not exported */
413 page_cache_get(page);
414 if (!pagevec_add(&pvec, page))
415 __pagevec_lru_add(&pvec);
416
417 end = last_offset + PAGE_CACHE_SIZE - 1;
418 /*
419 * at this point, we have a locked page in the page cache
420 * for these bytes in the file. But, we have to make
421 * sure they map to this compressed extent on disk.
422 */
423 set_page_extent_mapped(page);
424 lock_extent(tree, last_offset, end, GFP_NOFS);
425 spin_lock(&em_tree->lock);
426 em = lookup_extent_mapping(em_tree, last_offset,
427 PAGE_CACHE_SIZE);
428 spin_unlock(&em_tree->lock);
429
430 if (!em || last_offset < em->start ||
431 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
432 (em->block_start >> 9) != cb->orig_bio->bi_sector) {
433 free_extent_map(em);
434 unlock_extent(tree, last_offset, end, GFP_NOFS);
435 unlock_page(page);
436 page_cache_release(page);
437 break;
438 }
439 free_extent_map(em);
440
441 if (page->index == end_index) {
442 char *userpage;
443 size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
444
445 if (zero_offset) {
446 int zeros;
447 zeros = PAGE_CACHE_SIZE - zero_offset;
448 userpage = kmap_atomic(page, KM_USER0);
449 memset(userpage + zero_offset, 0, zeros);
450 flush_dcache_page(page);
451 kunmap_atomic(userpage, KM_USER0);
452 }
453 }
454
455 ret = bio_add_page(cb->orig_bio, page,
456 PAGE_CACHE_SIZE, 0);
457
458 if (ret == PAGE_CACHE_SIZE) {
459 nr_pages++;
460 page_cache_release(page);
461 } else {
462 unlock_extent(tree, last_offset, end, GFP_NOFS);
463 unlock_page(page);
464 page_cache_release(page);
465 break;
466 }
467next:
468 last_offset += PAGE_CACHE_SIZE;
469 }
470 if (pagevec_count(&pvec))
471 __pagevec_lru_add(&pvec);
472 return 0;
473}
474
349/* 475/*
350 * for a compressed read, the bio we get passed has all the inode pages 476 * for a compressed read, the bio we get passed has all the inode pages
351 * in it. We don't actually do IO on those pages but allocate new ones 477 * in it. We don't actually do IO on those pages but allocate new ones
@@ -373,6 +499,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
373 struct block_device *bdev; 499 struct block_device *bdev;
374 struct bio *comp_bio; 500 struct bio *comp_bio;
375 u64 cur_disk_byte = (u64)bio->bi_sector << 9; 501 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
502 u64 em_len;
376 struct extent_map *em; 503 struct extent_map *em;
377 int ret; 504 int ret;
378 505
@@ -393,6 +520,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
393 520
394 cb->start = em->start; 521 cb->start = em->start;
395 compressed_len = em->block_len; 522 compressed_len = em->block_len;
523 em_len = em->len;
396 free_extent_map(em); 524 free_extent_map(em);
397 525
398 cb->len = uncompressed_len; 526 cb->len = uncompressed_len;
@@ -411,6 +539,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
411 } 539 }
412 cb->nr_pages = nr_pages; 540 cb->nr_pages = nr_pages;
413 541
542 add_ra_bio_pages(inode, cb->start + em_len, cb);
543
544 if (!btrfs_test_opt(root, NODATASUM) &&
545 !btrfs_test_flag(inode, NODATASUM)) {
546 btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
547 }
548
549 /* include any pages we added in add_ra-bio_pages */
550 uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
551 cb->len = uncompressed_len;
552
414 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); 553 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
415 comp_bio->bi_private = cb; 554 comp_bio->bi_private = cb;
416 comp_bio->bi_end_io = end_compressed_bio_read; 555 comp_bio->bi_end_io = end_compressed_bio_read;
@@ -442,9 +581,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
442 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, 581 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
443 GFP_NOFS); 582 GFP_NOFS);
444 atomic_inc(&cb->pending_bios); 583 atomic_inc(&cb->pending_bios);
445 bio->bi_private = cb; 584 comp_bio->bi_private = cb;
446 bio->bi_end_io = end_compressed_bio_write; 585 comp_bio->bi_end_io = end_compressed_bio_read;
447 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 586
587 bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
448 } 588 }
449 cur_disk_byte += PAGE_CACHE_SIZE; 589 cur_disk_byte += PAGE_CACHE_SIZE;
450 } 590 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 689df070c8e9..c83cc5b2ded7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -625,8 +625,8 @@ struct btrfs_fs_info {
625 struct btrfs_transaction *running_transaction; 625 struct btrfs_transaction *running_transaction;
626 wait_queue_head_t transaction_throttle; 626 wait_queue_head_t transaction_throttle;
627 wait_queue_head_t transaction_wait; 627 wait_queue_head_t transaction_wait;
628 wait_queue_head_t async_submit_wait;
629 628
629 wait_queue_head_t async_submit_wait;
630 wait_queue_head_t tree_log_wait; 630 wait_queue_head_t tree_log_wait;
631 631
632 struct btrfs_super_block super_copy; 632 struct btrfs_super_block super_copy;
@@ -653,6 +653,7 @@ struct btrfs_fs_info {
653 atomic_t nr_async_submits; 653 atomic_t nr_async_submits;
654 atomic_t async_submit_draining; 654 atomic_t async_submit_draining;
655 atomic_t nr_async_bios; 655 atomic_t nr_async_bios;
656 atomic_t async_delalloc_pages;
656 atomic_t tree_log_writers; 657 atomic_t tree_log_writers;
657 atomic_t tree_log_commit; 658 atomic_t tree_log_commit;
658 unsigned long tree_log_batch; 659 unsigned long tree_log_batch;
@@ -677,6 +678,7 @@ struct btrfs_fs_info {
677 * two 678 * two
678 */ 679 */
679 struct btrfs_workers workers; 680 struct btrfs_workers workers;
681 struct btrfs_workers delalloc_workers;
680 struct btrfs_workers endio_workers; 682 struct btrfs_workers endio_workers;
681 struct btrfs_workers endio_write_workers; 683 struct btrfs_workers endio_write_workers;
682 struct btrfs_workers submit_workers; 684 struct btrfs_workers submit_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e0a28f705a64..8efc123d222b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -539,6 +539,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
539 (atomic_read(&fs_info->nr_async_bios) < limit), 539 (atomic_read(&fs_info->nr_async_bios) < limit),
540 HZ/10); 540 HZ/10);
541 } 541 }
542
543 while(atomic_read(&fs_info->async_submit_draining) &&
544 atomic_read(&fs_info->nr_async_submits)) {
545 wait_event(fs_info->async_submit_wait,
546 (atomic_read(&fs_info->nr_async_submits) == 0));
547 }
548
542 return 0; 549 return 0;
543} 550}
544 551
@@ -1437,6 +1444,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1437 INIT_LIST_HEAD(&fs_info->space_info); 1444 INIT_LIST_HEAD(&fs_info->space_info);
1438 btrfs_mapping_init(&fs_info->mapping_tree); 1445 btrfs_mapping_init(&fs_info->mapping_tree);
1439 atomic_set(&fs_info->nr_async_submits, 0); 1446 atomic_set(&fs_info->nr_async_submits, 0);
1447 atomic_set(&fs_info->async_delalloc_pages, 0);
1440 atomic_set(&fs_info->async_submit_draining, 0); 1448 atomic_set(&fs_info->async_submit_draining, 0);
1441 atomic_set(&fs_info->nr_async_bios, 0); 1449 atomic_set(&fs_info->nr_async_bios, 0);
1442 atomic_set(&fs_info->throttles, 0); 1450 atomic_set(&fs_info->throttles, 0);
@@ -1550,6 +1558,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1550 btrfs_init_workers(&fs_info->workers, "worker", 1558 btrfs_init_workers(&fs_info->workers, "worker",
1551 fs_info->thread_pool_size); 1559 fs_info->thread_pool_size);
1552 1560
1561 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1562 fs_info->thread_pool_size);
1563
1553 btrfs_init_workers(&fs_info->submit_workers, "submit", 1564 btrfs_init_workers(&fs_info->submit_workers, "submit",
1554 min_t(u64, fs_devices->num_devices, 1565 min_t(u64, fs_devices->num_devices,
1555 fs_info->thread_pool_size)); 1566 fs_info->thread_pool_size));
@@ -1560,15 +1571,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1560 */ 1571 */
1561 fs_info->submit_workers.idle_thresh = 64; 1572 fs_info->submit_workers.idle_thresh = 64;
1562 1573
1563 /* fs_info->workers is responsible for checksumming file data 1574 fs_info->workers.idle_thresh = 16;
1564 * blocks and metadata. Using a larger idle thresh allows each
1565 * worker thread to operate on things in roughly the order they
1566 * were sent by the writeback daemons, improving overall locality
1567 * of the IO going down the pipe.
1568 */
1569 fs_info->workers.idle_thresh = 8;
1570 fs_info->workers.ordered = 1; 1575 fs_info->workers.ordered = 1;
1571 1576
1577 fs_info->delalloc_workers.idle_thresh = 2;
1578 fs_info->delalloc_workers.ordered = 1;
1579
1572 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); 1580 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1573 btrfs_init_workers(&fs_info->endio_workers, "endio", 1581 btrfs_init_workers(&fs_info->endio_workers, "endio",
1574 fs_info->thread_pool_size); 1582 fs_info->thread_pool_size);
@@ -1584,6 +1592,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1584 1592
1585 btrfs_start_workers(&fs_info->workers, 1); 1593 btrfs_start_workers(&fs_info->workers, 1);
1586 btrfs_start_workers(&fs_info->submit_workers, 1); 1594 btrfs_start_workers(&fs_info->submit_workers, 1);
1595 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1587 btrfs_start_workers(&fs_info->fixup_workers, 1); 1596 btrfs_start_workers(&fs_info->fixup_workers, 1);
1588 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); 1597 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1589 btrfs_start_workers(&fs_info->endio_write_workers, 1598 btrfs_start_workers(&fs_info->endio_write_workers,
@@ -1732,6 +1741,7 @@ fail_tree_root:
1732fail_sys_array: 1741fail_sys_array:
1733fail_sb_buffer: 1742fail_sb_buffer:
1734 btrfs_stop_workers(&fs_info->fixup_workers); 1743 btrfs_stop_workers(&fs_info->fixup_workers);
1744 btrfs_stop_workers(&fs_info->delalloc_workers);
1735 btrfs_stop_workers(&fs_info->workers); 1745 btrfs_stop_workers(&fs_info->workers);
1736 btrfs_stop_workers(&fs_info->endio_workers); 1746 btrfs_stop_workers(&fs_info->endio_workers);
1737 btrfs_stop_workers(&fs_info->endio_write_workers); 1747 btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -1988,6 +1998,7 @@ int close_ctree(struct btrfs_root *root)
1988 truncate_inode_pages(fs_info->btree_inode->i_mapping, 0); 1998 truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
1989 1999
1990 btrfs_stop_workers(&fs_info->fixup_workers); 2000 btrfs_stop_workers(&fs_info->fixup_workers);
2001 btrfs_stop_workers(&fs_info->delalloc_workers);
1991 btrfs_stop_workers(&fs_info->workers); 2002 btrfs_stop_workers(&fs_info->workers);
1992 btrfs_stop_workers(&fs_info->endio_workers); 2003 btrfs_stop_workers(&fs_info->endio_workers);
1993 btrfs_stop_workers(&fs_info->endio_write_workers); 2004 btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -2062,7 +2073,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2062 struct extent_io_tree *tree; 2073 struct extent_io_tree *tree;
2063 u64 num_dirty; 2074 u64 num_dirty;
2064 u64 start = 0; 2075 u64 start = 0;
2065 unsigned long thresh = 96 * 1024 * 1024; 2076 unsigned long thresh = 32 * 1024 * 1024;
2066 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 2077 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2067 2078
2068 if (current_is_pdflush() || current->flags & PF_MEMALLOC) 2079 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8af39521eb71..ebd8275a1934 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -768,7 +768,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
768 l = path->nodes[0]; 768 l = path->nodes[0];
769 769
770 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 770 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
771 BUG_ON(key.objectid != bytenr); 771 if (key.objectid != bytenr) {
772 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
773 printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
774 BUG();
775 }
772 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); 776 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
773 777
774 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); 778 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9b37ce6e5168..bbe3bcfcf4ae 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -47,6 +47,11 @@ struct extent_page_data {
47 struct bio *bio; 47 struct bio *bio;
48 struct extent_io_tree *tree; 48 struct extent_io_tree *tree;
49 get_extent_t *get_extent; 49 get_extent_t *get_extent;
50
51 /* tells writepage not to lock the state bits for this range
52 * it still does the unlocking
53 */
54 int extent_locked;
50}; 55};
51 56
52int __init extent_io_init(void) 57int __init extent_io_init(void)
@@ -1198,11 +1203,18 @@ static noinline int lock_delalloc_pages(struct inode *inode,
1198 * the caller is taking responsibility for 1203 * the caller is taking responsibility for
1199 * locked_page 1204 * locked_page
1200 */ 1205 */
1201 if (pages[i] != locked_page) 1206 if (pages[i] != locked_page) {
1202 lock_page(pages[i]); 1207 lock_page(pages[i]);
1208 if (pages[i]->mapping != inode->i_mapping) {
1209 ret = -EAGAIN;
1210 unlock_page(pages[i]);
1211 page_cache_release(pages[i]);
1212 goto done;
1213 }
1214 }
1203 page_cache_release(pages[i]); 1215 page_cache_release(pages[i]);
1216 pages_locked++;
1204 } 1217 }
1205 pages_locked += ret;
1206 nrpages -= ret; 1218 nrpages -= ret;
1207 index += ret; 1219 index += ret;
1208 cond_resched(); 1220 cond_resched();
@@ -1262,8 +1274,7 @@ again:
1262 * if we're looping. 1274 * if we're looping.
1263 */ 1275 */
1264 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) { 1276 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
1265 delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) & 1277 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1266 ~((u64)PAGE_CACHE_SIZE - 1);
1267 } 1278 }
1268 /* step two, lock all the pages after the page that has start */ 1279 /* step two, lock all the pages after the page that has start */
1269 ret = lock_delalloc_pages(inode, locked_page, 1280 ret = lock_delalloc_pages(inode, locked_page,
@@ -1306,7 +1317,10 @@ out_failed:
1306int extent_clear_unlock_delalloc(struct inode *inode, 1317int extent_clear_unlock_delalloc(struct inode *inode,
1307 struct extent_io_tree *tree, 1318 struct extent_io_tree *tree,
1308 u64 start, u64 end, struct page *locked_page, 1319 u64 start, u64 end, struct page *locked_page,
1309 int clear_dirty, int set_writeback, 1320 int unlock_pages,
1321 int clear_unlock,
1322 int clear_delalloc, int clear_dirty,
1323 int set_writeback,
1310 int end_writeback) 1324 int end_writeback)
1311{ 1325{
1312 int ret; 1326 int ret;
@@ -1315,12 +1329,19 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1315 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1329 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1316 unsigned long nr_pages = end_index - index + 1; 1330 unsigned long nr_pages = end_index - index + 1;
1317 int i; 1331 int i;
1318 int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; 1332 int clear_bits = 0;
1319 1333
1334 if (clear_unlock)
1335 clear_bits |= EXTENT_LOCKED;
1320 if (clear_dirty) 1336 if (clear_dirty)
1321 clear_bits |= EXTENT_DIRTY; 1337 clear_bits |= EXTENT_DIRTY;
1322 1338
1339 if (clear_delalloc)
1340 clear_bits |= EXTENT_DELALLOC;
1341
1323 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); 1342 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1343 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1344 return 0;
1324 1345
1325 while(nr_pages > 0) { 1346 while(nr_pages > 0) {
1326 ret = find_get_pages_contig(inode->i_mapping, index, 1347 ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1336,7 +1357,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1336 set_page_writeback(pages[i]); 1357 set_page_writeback(pages[i]);
1337 if (end_writeback) 1358 if (end_writeback)
1338 end_page_writeback(pages[i]); 1359 end_page_writeback(pages[i]);
1339 unlock_page(pages[i]); 1360 if (unlock_pages)
1361 unlock_page(pages[i]);
1340 page_cache_release(pages[i]); 1362 page_cache_release(pages[i]);
1341 } 1363 }
1342 nr_pages -= ret; 1364 nr_pages -= ret;
@@ -1741,9 +1763,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1741 } 1763 }
1742 } 1764 }
1743 1765
1744 if (uptodate) 1766 if (uptodate) {
1745 set_extent_uptodate(tree, start, end, 1767 set_extent_uptodate(tree, start, end,
1746 GFP_ATOMIC); 1768 GFP_ATOMIC);
1769 }
1747 unlock_extent(tree, start, end, GFP_ATOMIC); 1770 unlock_extent(tree, start, end, GFP_ATOMIC);
1748 1771
1749 if (whole_page) { 1772 if (whole_page) {
@@ -1925,6 +1948,7 @@ void set_page_extent_mapped(struct page *page)
1925 set_page_private(page, EXTENT_PAGE_PRIVATE); 1948 set_page_private(page, EXTENT_PAGE_PRIVATE);
1926 } 1949 }
1927} 1950}
1951EXPORT_SYMBOL(set_page_extent_mapped);
1928 1952
1929void set_page_extent_head(struct page *page, unsigned long len) 1953void set_page_extent_head(struct page *page, unsigned long len)
1930{ 1954{
@@ -2143,12 +2167,17 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2143 u64 delalloc_end; 2167 u64 delalloc_end;
2144 int page_started; 2168 int page_started;
2145 int compressed; 2169 int compressed;
2170 unsigned long nr_written = 0;
2146 2171
2147 WARN_ON(!PageLocked(page)); 2172 WARN_ON(!PageLocked(page));
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2173 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2174 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2175 (page->index == end_index && !pg_offset)) {
2151 page->mapping->a_ops->invalidatepage(page, 0); 2176 if (epd->extent_locked) {
2177 if (tree->ops && tree->ops->writepage_end_io_hook)
2178 tree->ops->writepage_end_io_hook(page, start,
2179 page_end, NULL, 1);
2180 }
2152 unlock_page(page); 2181 unlock_page(page);
2153 return 0; 2182 return 0;
2154 } 2183 }
@@ -2169,27 +2198,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2169 delalloc_start = start; 2198 delalloc_start = start;
2170 delalloc_end = 0; 2199 delalloc_end = 0;
2171 page_started = 0; 2200 page_started = 0;
2172 while(delalloc_end < page_end) { 2201 if (!epd->extent_locked) {
2173 nr_delalloc = find_lock_delalloc_range(inode, tree, 2202 while(delalloc_end < page_end) {
2203 nr_delalloc = find_lock_delalloc_range(inode, tree,
2174 page, 2204 page,
2175 &delalloc_start, 2205 &delalloc_start,
2176 &delalloc_end, 2206 &delalloc_end,
2177 128 * 1024 * 1024); 2207 128 * 1024 * 1024);
2178 if (nr_delalloc == 0) { 2208 if (nr_delalloc == 0) {
2209 delalloc_start = delalloc_end + 1;
2210 continue;
2211 }
2212 tree->ops->fill_delalloc(inode, page, delalloc_start,
2213 delalloc_end, &page_started,
2214 &nr_written);
2179 delalloc_start = delalloc_end + 1; 2215 delalloc_start = delalloc_end + 1;
2180 continue;
2181 } 2216 }
2182 tree->ops->fill_delalloc(inode, page, delalloc_start,
2183 delalloc_end, &page_started);
2184 delalloc_start = delalloc_end + 1;
2185 }
2186 2217
2187 /* did the fill delalloc function already unlock and start the IO? */ 2218 /* did the fill delalloc function already unlock and start
2188 if (page_started) { 2219 * the IO?
2189 return 0; 2220 */
2221 if (page_started) {
2222 ret = 0;
2223 goto update_nr_written;
2224 }
2190 } 2225 }
2191
2192 lock_extent(tree, start, page_end, GFP_NOFS); 2226 lock_extent(tree, start, page_end, GFP_NOFS);
2227
2193 unlock_start = start; 2228 unlock_start = start;
2194 2229
2195 if (tree->ops && tree->ops->writepage_start_hook) { 2230 if (tree->ops && tree->ops->writepage_start_hook) {
@@ -2199,10 +2234,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2199 unlock_extent(tree, start, page_end, GFP_NOFS); 2234 unlock_extent(tree, start, page_end, GFP_NOFS);
2200 redirty_page_for_writepage(wbc, page); 2235 redirty_page_for_writepage(wbc, page);
2201 unlock_page(page); 2236 unlock_page(page);
2202 return 0; 2237 ret = 0;
2238 goto update_nr_written;
2203 } 2239 }
2204 } 2240 }
2205 2241
2242 nr_written++;
2243
2206 end = page_end; 2244 end = page_end;
2207 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { 2245 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
2208 printk("found delalloc bits after lock_extent\n"); 2246 printk("found delalloc bits after lock_extent\n");
@@ -2333,6 +2371,12 @@ done:
2333 if (unlock_start <= page_end) 2371 if (unlock_start <= page_end)
2334 unlock_extent(tree, unlock_start, page_end, GFP_NOFS); 2372 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2335 unlock_page(page); 2373 unlock_page(page);
2374
2375update_nr_written:
2376 wbc->nr_to_write -= nr_written;
2377 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2378 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2379 page->mapping->writeback_index = page->index + nr_written;
2336 return 0; 2380 return 0;
2337} 2381}
2338 2382
@@ -2431,7 +2475,7 @@ retry:
2431 unlock_page(page); 2475 unlock_page(page);
2432 ret = 0; 2476 ret = 0;
2433 } 2477 }
2434 if (ret || (--(wbc->nr_to_write) <= 0)) 2478 if (ret || wbc->nr_to_write <= 0)
2435 done = 1; 2479 done = 1;
2436 if (wbc->nonblocking && bdi_write_congested(bdi)) { 2480 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2437 wbc->encountered_congestion = 1; 2481 wbc->encountered_congestion = 1;
@@ -2452,6 +2496,8 @@ retry:
2452 } 2496 }
2453 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2497 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2454 mapping->writeback_index = index; 2498 mapping->writeback_index = index;
2499 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2500 range_whole = 1;
2455 2501
2456 if (wbc->range_cont) 2502 if (wbc->range_cont)
2457 wbc->range_start = index << PAGE_CACHE_SHIFT; 2503 wbc->range_start = index << PAGE_CACHE_SHIFT;
@@ -2469,6 +2515,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2469 .bio = NULL, 2515 .bio = NULL,
2470 .tree = tree, 2516 .tree = tree,
2471 .get_extent = get_extent, 2517 .get_extent = get_extent,
2518 .extent_locked = 0,
2472 }; 2519 };
2473 struct writeback_control wbc_writepages = { 2520 struct writeback_control wbc_writepages = {
2474 .bdi = wbc->bdi, 2521 .bdi = wbc->bdi,
@@ -2491,6 +2538,52 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2491} 2538}
2492EXPORT_SYMBOL(extent_write_full_page); 2539EXPORT_SYMBOL(extent_write_full_page);
2493 2540
2541int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2542 u64 start, u64 end, get_extent_t *get_extent,
2543 int mode)
2544{
2545 int ret = 0;
2546 struct address_space *mapping = inode->i_mapping;
2547 struct page *page;
2548 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2549 PAGE_CACHE_SHIFT;
2550
2551 struct extent_page_data epd = {
2552 .bio = NULL,
2553 .tree = tree,
2554 .get_extent = get_extent,
2555 .extent_locked = 1,
2556 };
2557 struct writeback_control wbc_writepages = {
2558 .bdi = inode->i_mapping->backing_dev_info,
2559 .sync_mode = mode,
2560 .older_than_this = NULL,
2561 .nr_to_write = nr_pages * 2,
2562 .range_start = start,
2563 .range_end = end + 1,
2564 };
2565
2566 while(start <= end) {
2567 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2568 if (clear_page_dirty_for_io(page))
2569 ret = __extent_writepage(page, &wbc_writepages, &epd);
2570 else {
2571 if (tree->ops && tree->ops->writepage_end_io_hook)
2572 tree->ops->writepage_end_io_hook(page, start,
2573 start + PAGE_CACHE_SIZE - 1,
2574 NULL, 1);
2575 unlock_page(page);
2576 }
2577 page_cache_release(page);
2578 start += PAGE_CACHE_SIZE;
2579 }
2580
2581 if (epd.bio)
2582 submit_one_bio(WRITE, epd.bio, 0, 0);
2583 return ret;
2584}
2585EXPORT_SYMBOL(extent_write_locked_range);
2586
2494 2587
2495int extent_writepages(struct extent_io_tree *tree, 2588int extent_writepages(struct extent_io_tree *tree,
2496 struct address_space *mapping, 2589 struct address_space *mapping,
@@ -2502,6 +2595,7 @@ int extent_writepages(struct extent_io_tree *tree,
2502 .bio = NULL, 2595 .bio = NULL,
2503 .tree = tree, 2596 .tree = tree,
2504 .get_extent = get_extent, 2597 .get_extent = get_extent,
2598 .extent_locked = 0,
2505 }; 2599 };
2506 2600
2507 ret = extent_write_cache_pages(tree, mapping, wbc, 2601 ret = extent_write_cache_pages(tree, mapping, wbc,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 283110ec4ee0..2d5f67065b69 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -35,7 +35,8 @@ typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
35 unsigned long bio_flags); 35 unsigned long bio_flags);
36struct extent_io_ops { 36struct extent_io_ops {
37 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 37 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
38 u64 start, u64 end, int *page_started); 38 u64 start, u64 end, int *page_started,
39 unsigned long *nr_written);
39 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 40 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
40 int (*writepage_io_hook)(struct page *page, u64 start, u64 end); 41 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
41 extent_submit_bio_hook_t *submit_bio_hook; 42 extent_submit_bio_hook_t *submit_bio_hook;
@@ -172,6 +173,9 @@ int extent_invalidatepage(struct extent_io_tree *tree,
172int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 173int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
173 get_extent_t *get_extent, 174 get_extent_t *get_extent,
174 struct writeback_control *wbc); 175 struct writeback_control *wbc);
176int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
177 u64 start, u64 end, get_extent_t *get_extent,
178 int mode);
175int extent_writepages(struct extent_io_tree *tree, 179int extent_writepages(struct extent_io_tree *tree,
176 struct address_space *mapping, 180 struct address_space *mapping,
177 get_extent_t *get_extent, 181 get_extent_t *get_extent,
@@ -256,6 +260,9 @@ int extent_range_uptodate(struct extent_io_tree *tree,
256int extent_clear_unlock_delalloc(struct inode *inode, 260int extent_clear_unlock_delalloc(struct inode *inode,
257 struct extent_io_tree *tree, 261 struct extent_io_tree *tree,
258 u64 start, u64 end, struct page *locked_page, 262 u64 start, u64 end, struct page *locked_page,
259 int clear_dirty, int set_writeback, 263 int unlock_page,
260 int clear_writeback); 264 int clear_unlock,
265 int clear_delalloc, int clear_dirty,
266 int set_writeback,
267 int end_writeback);
261#endif 268#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0c8cc35a8b97..337221ecca27 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -368,6 +368,8 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
368 u64 search_start = start; 368 u64 search_start = start;
369 u64 leaf_start; 369 u64 leaf_start;
370 u64 ram_bytes = 0; 370 u64 ram_bytes = 0;
371 u64 orig_parent = 0;
372 u64 disk_bytenr = 0;
371 u8 compression; 373 u8 compression;
372 u8 encryption; 374 u8 encryption;
373 u16 other_encoding = 0; 375 u16 other_encoding = 0;
@@ -500,17 +502,31 @@ next_slot:
500 keep = 1; 502 keep = 1;
501 } 503 }
502 504
503 if (bookend && found_extent && locked_end < extent_end) { 505 if (bookend && found_extent) {
504 ret = try_lock_extent(&BTRFS_I(inode)->io_tree, 506 if (locked_end < extent_end) {
505 locked_end, extent_end - 1, GFP_NOFS); 507 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
506 if (!ret) { 508 locked_end, extent_end - 1,
507 btrfs_release_path(root, path); 509 GFP_NOFS);
508 lock_extent(&BTRFS_I(inode)->io_tree, 510 if (!ret) {
509 locked_end, extent_end - 1, GFP_NOFS); 511 btrfs_release_path(root, path);
512 lock_extent(&BTRFS_I(inode)->io_tree,
513 locked_end, extent_end - 1,
514 GFP_NOFS);
515 locked_end = extent_end;
516 continue;
517 }
510 locked_end = extent_end; 518 locked_end = extent_end;
511 continue;
512 } 519 }
513 locked_end = extent_end; 520 orig_parent = path->nodes[0]->start;
521 disk_bytenr = le64_to_cpu(old.disk_bytenr);
522 if (disk_bytenr != 0) {
523 ret = btrfs_inc_extent_ref(trans, root,
524 disk_bytenr,
525 le64_to_cpu(old.disk_num_bytes),
526 orig_parent, root->root_key.objectid,
527 trans->transid, inode->i_ino);
528 BUG_ON(ret);
529 }
514 } 530 }
515 531
516 if (found_inline) { 532 if (found_inline) {
@@ -537,8 +553,12 @@ next_slot:
537 inode_sub_bytes(inode, old_num - 553 inode_sub_bytes(inode, old_num -
538 new_num); 554 new_num);
539 } 555 }
540 btrfs_set_file_extent_num_bytes(leaf, extent, 556 if (!compression && !encryption) {
541 new_num); 557 btrfs_set_file_extent_ram_bytes(leaf,
558 extent, new_num);
559 }
560 btrfs_set_file_extent_num_bytes(leaf,
561 extent, new_num);
542 btrfs_mark_buffer_dirty(leaf); 562 btrfs_mark_buffer_dirty(leaf);
543 } else if (key.offset < inline_limit && 563 } else if (key.offset < inline_limit &&
544 (end > extent_end) && 564 (end > extent_end) &&
@@ -582,11 +602,11 @@ next_slot:
582 } 602 }
583 /* create bookend, splitting the extent in two */ 603 /* create bookend, splitting the extent in two */
584 if (bookend && found_extent) { 604 if (bookend && found_extent) {
585 u64 disk_bytenr;
586 struct btrfs_key ins; 605 struct btrfs_key ins;
587 ins.objectid = inode->i_ino; 606 ins.objectid = inode->i_ino;
588 ins.offset = end; 607 ins.offset = end;
589 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); 608 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
609
590 btrfs_release_path(root, path); 610 btrfs_release_path(root, path);
591 ret = btrfs_insert_empty_item(trans, root, path, &ins, 611 ret = btrfs_insert_empty_item(trans, root, path, &ins,
592 sizeof(*extent)); 612 sizeof(*extent));
@@ -623,14 +643,13 @@ next_slot:
623 643
624 btrfs_mark_buffer_dirty(path->nodes[0]); 644 btrfs_mark_buffer_dirty(path->nodes[0]);
625 645
626 disk_bytenr = le64_to_cpu(old.disk_bytenr);
627 if (disk_bytenr != 0) { 646 if (disk_bytenr != 0) {
628 ret = btrfs_inc_extent_ref(trans, root, 647 ret = btrfs_update_extent_ref(trans, root,
629 disk_bytenr, 648 disk_bytenr, orig_parent,
630 le64_to_cpu(old.disk_num_bytes), 649 leaf->start,
631 leaf->start,
632 root->root_key.objectid, 650 root->root_key.objectid,
633 trans->transid, ins.objectid); 651 trans->transid, ins.objectid);
652
634 BUG_ON(ret); 653 BUG_ON(ret);
635 } 654 }
636 btrfs_release_path(root, path); 655 btrfs_release_path(root, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3df0ffad976e..e01c0d0310ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -86,6 +86,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
86 86
87static void btrfs_truncate(struct inode *inode); 87static void btrfs_truncate(struct inode *inode);
88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
89static noinline int cow_file_range(struct inode *inode,
90 struct page *locked_page,
91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock);
89 93
90/* 94/*
91 * a very lame attempt at stopping writes when the FS is 85% full. There 95 * a very lame attempt at stopping writes when the FS is 85% full. There
@@ -262,35 +266,72 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
262 return 0; 266 return 0;
263} 267}
264 268
269struct async_extent {
270 u64 start;
271 u64 ram_size;
272 u64 compressed_size;
273 struct page **pages;
274 unsigned long nr_pages;
275 struct list_head list;
276};
277
278struct async_cow {
279 struct inode *inode;
280 struct btrfs_root *root;
281 struct page *locked_page;
282 u64 start;
283 u64 end;
284 struct list_head extents;
285 struct btrfs_work work;
286};
287
288static noinline int add_async_extent(struct async_cow *cow,
289 u64 start, u64 ram_size,
290 u64 compressed_size,
291 struct page **pages,
292 unsigned long nr_pages)
293{
294 struct async_extent *async_extent;
295
296 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
297 async_extent->start = start;
298 async_extent->ram_size = ram_size;
299 async_extent->compressed_size = compressed_size;
300 async_extent->pages = pages;
301 async_extent->nr_pages = nr_pages;
302 list_add_tail(&async_extent->list, &cow->extents);
303 return 0;
304}
305
265/* 306/*
266 * when extent_io.c finds a delayed allocation range in the file, 307 * we create compressed extents in two phases. The first
267 * the call backs end up in this code. The basic idea is to 308 * phase compresses a range of pages that have already been
268 * allocate extents on disk for the range, and create ordered data structs 309 * locked (both pages and state bits are locked).
269 * in ram to track those extents.
270 * 310 *
271 * locked_page is the page that writepage had locked already. We use 311 * This is done inside an ordered work queue, and the compression
272 * it to make sure we don't do extra locks or unlocks. 312 * is spread across many cpus. The actual IO submission is step
313 * two, and the ordered work queue takes care of making sure that
314 * happens in the same order things were put onto the queue by
315 * writepages and friends.
273 * 316 *
274 * *page_started is set to one if we unlock locked_page and do everything 317 * If this code finds it can't get good compression, it puts an
275 * required to start IO on it. It may be clean and already done with 318 * entry onto the work queue to write the uncompressed bytes. This
276 * IO when we return. 319 * makes sure that both compressed inodes and uncompressed inodes
320 * are written in the same order that pdflush sent them down.
277 */ 321 */
278static int cow_file_range(struct inode *inode, struct page *locked_page, 322static noinline int compress_file_range(struct inode *inode,
279 u64 start, u64 end, int *page_started) 323 struct page *locked_page,
324 u64 start, u64 end,
325 struct async_cow *async_cow,
326 int *num_added)
280{ 327{
281 struct btrfs_root *root = BTRFS_I(inode)->root; 328 struct btrfs_root *root = BTRFS_I(inode)->root;
282 struct btrfs_trans_handle *trans; 329 struct btrfs_trans_handle *trans;
283 u64 alloc_hint = 0;
284 u64 num_bytes; 330 u64 num_bytes;
285 unsigned long ram_size;
286 u64 orig_start; 331 u64 orig_start;
287 u64 disk_num_bytes; 332 u64 disk_num_bytes;
288 u64 cur_alloc_size;
289 u64 blocksize = root->sectorsize; 333 u64 blocksize = root->sectorsize;
290 u64 actual_end; 334 u64 actual_end;
291 struct btrfs_key ins;
292 struct extent_map *em;
293 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
294 int ret = 0; 335 int ret = 0;
295 struct page **pages = NULL; 336 struct page **pages = NULL;
296 unsigned long nr_pages; 337 unsigned long nr_pages;
@@ -298,22 +339,12 @@ static int cow_file_range(struct inode *inode, struct page *locked_page,
298 unsigned long total_compressed = 0; 339 unsigned long total_compressed = 0;
299 unsigned long total_in = 0; 340 unsigned long total_in = 0;
300 unsigned long max_compressed = 128 * 1024; 341 unsigned long max_compressed = 128 * 1024;
301 unsigned long max_uncompressed = 256 * 1024; 342 unsigned long max_uncompressed = 128 * 1024;
302 int i; 343 int i;
303 int ordered_type;
304 int will_compress; 344 int will_compress;
305 345
306 trans = btrfs_join_transaction(root, 1);
307 BUG_ON(!trans);
308 btrfs_set_trans_block_group(trans, inode);
309 orig_start = start; 346 orig_start = start;
310 347
311 /*
312 * compression made this loop a bit ugly, but the basic idea is to
313 * compress some pages but keep the total size of the compressed
314 * extent relatively small. If compression is off, this goto target
315 * is never used.
316 */
317again: 348again:
318 will_compress = 0; 349 will_compress = 0;
319 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
@@ -324,7 +355,13 @@ again:
324 355
325 /* we want to make sure that amount of ram required to uncompress 356 /* we want to make sure that amount of ram required to uncompress
326 * an extent is reasonable, so we limit the total size in ram 357 * an extent is reasonable, so we limit the total size in ram
327 * of a compressed extent to 256k 358 * of a compressed extent to 128k. This is a crucial number
359 * because it also controls how easily we can spread reads across
360 * cpus for decompression.
361 *
362 * We also want to make sure the amount of IO required to do
363 * a random read is reasonably small, so we limit the size of
364 * a compressed extent to 128k.
328 */ 365 */
329 total_compressed = min(total_compressed, max_uncompressed); 366 total_compressed = min(total_compressed, max_uncompressed);
330 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 367 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -333,18 +370,16 @@ again:
333 total_in = 0; 370 total_in = 0;
334 ret = 0; 371 ret = 0;
335 372
336 /* we do compression for mount -o compress and when the 373 /*
337 * inode has not been flagged as nocompress 374 * we do compression for mount -o compress and when the
375 * inode has not been flagged as nocompress. This flag can
376 * change at any time if we discover bad compression ratios.
338 */ 377 */
339 if (!btrfs_test_flag(inode, NOCOMPRESS) && 378 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
340 btrfs_test_opt(root, COMPRESS)) { 379 btrfs_test_opt(root, COMPRESS)) {
341 WARN_ON(pages); 380 WARN_ON(pages);
342 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 381 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
343 382
344 /* we want to make sure the amount of IO required to satisfy
345 * a random read is reasonably small, so we limit the size
346 * of a compressed extent to 128k
347 */
348 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 383 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
349 total_compressed, pages, 384 total_compressed, pages,
350 nr_pages, &nr_pages_ret, 385 nr_pages, &nr_pages_ret,
@@ -371,26 +406,34 @@ again:
371 } 406 }
372 } 407 }
373 if (start == 0) { 408 if (start == 0) {
409 trans = btrfs_join_transaction(root, 1);
410 BUG_ON(!trans);
411 btrfs_set_trans_block_group(trans, inode);
412
374 /* lets try to make an inline extent */ 413 /* lets try to make an inline extent */
375 if (ret || total_in < (end - start + 1)) { 414 if (ret || total_in < (actual_end - start)) {
376 /* we didn't compress the entire range, try 415 /* we didn't compress the entire range, try
377 * to make an uncompressed inline extent. This 416 * to make an uncompressed inline extent.
378 * is almost sure to fail, but maybe inline sizes
379 * will get bigger later
380 */ 417 */
381 ret = cow_file_range_inline(trans, root, inode, 418 ret = cow_file_range_inline(trans, root, inode,
382 start, end, 0, NULL); 419 start, end, 0, NULL);
383 } else { 420 } else {
421 /* try making a compressed inline extent */
384 ret = cow_file_range_inline(trans, root, inode, 422 ret = cow_file_range_inline(trans, root, inode,
385 start, end, 423 start, end,
386 total_compressed, pages); 424 total_compressed, pages);
387 } 425 }
426 btrfs_end_transaction(trans, root);
388 if (ret == 0) { 427 if (ret == 0) {
428 /*
429 * inline extent creation worked, we don't need
430 * to create any more async work items. Unlock
431 * and free up our temp pages.
432 */
389 extent_clear_unlock_delalloc(inode, 433 extent_clear_unlock_delalloc(inode,
390 &BTRFS_I(inode)->io_tree, 434 &BTRFS_I(inode)->io_tree,
391 start, end, NULL, 435 start, end, NULL, 1, 0,
392 1, 1, 1); 436 0, 1, 1, 1);
393 *page_started = 1;
394 ret = 0; 437 ret = 0;
395 goto free_pages_out; 438 goto free_pages_out;
396 } 439 }
@@ -435,53 +478,280 @@ again:
435 /* flag the file so we don't compress in the future */ 478 /* flag the file so we don't compress in the future */
436 btrfs_set_flag(inode, NOCOMPRESS); 479 btrfs_set_flag(inode, NOCOMPRESS);
437 } 480 }
481 if (will_compress) {
482 *num_added += 1;
438 483
439 BUG_ON(disk_num_bytes > 484 /* the async work queues will take care of doing actual
440 btrfs_super_total_bytes(&root->fs_info->super_copy)); 485 * allocation on disk for these compressed pages,
486 * and will submit them to the elevator.
487 */
488 add_async_extent(async_cow, start, num_bytes,
489 total_compressed, pages, nr_pages_ret);
441 490
442 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 491 if (start + num_bytes < end) {
492 start += num_bytes;
493 pages = NULL;
494 cond_resched();
495 goto again;
496 }
497 } else {
498 /*
499 * No compression, but we still need to write the pages in
500 * the file we've been given so far. redirty the locked
501 * page if it corresponds to our extent and set things up
502 * for the async work queue to run cow_file_range to do
503 * the normal delalloc dance
504 */
505 if (page_offset(locked_page) >= start &&
506 page_offset(locked_page) <= end) {
507 __set_page_dirty_nobuffers(locked_page);
508 /* unlocked later on in the async handlers */
509 }
510 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
511 *num_added += 1;
512 }
443 513
444 while(disk_num_bytes > 0) { 514out:
445 unsigned long min_bytes; 515 return 0;
516
517free_pages_out:
518 for (i = 0; i < nr_pages_ret; i++) {
519 WARN_ON(pages[i]->mapping);
520 page_cache_release(pages[i]);
521 }
522 if (pages)
523 kfree(pages);
524
525 goto out;
526}
527
528/*
529 * phase two of compressed writeback. This is the ordered portion
530 * of the code, which only gets called in the order the work was
531 * queued. We walk all the async extents created by compress_file_range
532 * and send them down to the disk.
533 */
534static noinline int submit_compressed_extents(struct inode *inode,
535 struct async_cow *async_cow)
536{
537 struct async_extent *async_extent;
538 u64 alloc_hint = 0;
539 struct btrfs_trans_handle *trans;
540 struct btrfs_key ins;
541 struct extent_map *em;
542 struct btrfs_root *root = BTRFS_I(inode)->root;
543 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
544 struct extent_io_tree *io_tree;
545 int ret;
546
547 if (list_empty(&async_cow->extents))
548 return 0;
549
550 trans = btrfs_join_transaction(root, 1);
551
552 while(!list_empty(&async_cow->extents)) {
553 async_extent = list_entry(async_cow->extents.next,
554 struct async_extent, list);
555 list_del(&async_extent->list);
446 556
557 io_tree = &BTRFS_I(inode)->io_tree;
558
559 /* did the compression code fall back to uncompressed IO? */
560 if (!async_extent->pages) {
561 int page_started = 0;
562 unsigned long nr_written = 0;
563
564 lock_extent(io_tree, async_extent->start,
565 async_extent->start + async_extent->ram_size - 1,
566 GFP_NOFS);
567
568 /* allocate blocks */
569 cow_file_range(inode, async_cow->locked_page,
570 async_extent->start,
571 async_extent->start +
572 async_extent->ram_size - 1,
573 &page_started, &nr_written, 0);
574
575 /*
576 * if page_started, cow_file_range inserted an
577 * inline extent and took care of all the unlocking
578 * and IO for us. Otherwise, we need to submit
579 * all those pages down to the drive.
580 */
581 if (!page_started)
582 extent_write_locked_range(io_tree,
583 inode, async_extent->start,
584 async_extent->start +
585 async_extent->ram_size - 1,
586 btrfs_get_extent,
587 WB_SYNC_ALL);
588 kfree(async_extent);
589 cond_resched();
590 continue;
591 }
592
593 lock_extent(io_tree, async_extent->start,
594 async_extent->start + async_extent->ram_size - 1,
595 GFP_NOFS);
447 /* 596 /*
448 * the max size of a compressed extent is pretty small, 597 * here we're doing allocation and writeback of the
449 * make the code a little less complex by forcing 598 * compressed pages
450 * the allocator to find a whole compressed extent at once
451 */ 599 */
452 if (will_compress) 600 btrfs_drop_extent_cache(inode, async_extent->start,
453 min_bytes = disk_num_bytes; 601 async_extent->start +
454 else 602 async_extent->ram_size - 1, 0);
455 min_bytes = root->sectorsize; 603
604 ret = btrfs_reserve_extent(trans, root,
605 async_extent->compressed_size,
606 async_extent->compressed_size,
607 0, alloc_hint,
608 (u64)-1, &ins, 1);
609 BUG_ON(ret);
610 em = alloc_extent_map(GFP_NOFS);
611 em->start = async_extent->start;
612 em->len = async_extent->ram_size;
613
614 em->block_start = ins.objectid;
615 em->block_len = ins.offset;
616 em->bdev = root->fs_info->fs_devices->latest_bdev;
617 set_bit(EXTENT_FLAG_PINNED, &em->flags);
618 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
619
620 while(1) {
621 spin_lock(&em_tree->lock);
622 ret = add_extent_mapping(em_tree, em);
623 spin_unlock(&em_tree->lock);
624 if (ret != -EEXIST) {
625 free_extent_map(em);
626 break;
627 }
628 btrfs_drop_extent_cache(inode, async_extent->start,
629 async_extent->start +
630 async_extent->ram_size - 1, 0);
631 }
632
633 ret = btrfs_add_ordered_extent(inode, async_extent->start,
634 ins.objectid,
635 async_extent->ram_size,
636 ins.offset,
637 BTRFS_ORDERED_COMPRESSED);
638 BUG_ON(ret);
639
640 btrfs_end_transaction(trans, root);
641
642 /*
643 * clear dirty, set writeback and unlock the pages.
644 */
645 extent_clear_unlock_delalloc(inode,
646 &BTRFS_I(inode)->io_tree,
647 async_extent->start,
648 async_extent->start +
649 async_extent->ram_size - 1,
650 NULL, 1, 1, 0, 1, 1, 0);
651
652 ret = btrfs_submit_compressed_write(inode,
653 async_extent->start,
654 async_extent->ram_size,
655 ins.objectid,
656 ins.offset, async_extent->pages,
657 async_extent->nr_pages);
658
659 BUG_ON(ret);
660 trans = btrfs_join_transaction(root, 1);
661 alloc_hint = ins.objectid + ins.offset;
662 kfree(async_extent);
663 cond_resched();
664 }
665
666 btrfs_end_transaction(trans, root);
667 return 0;
668}
669
670/*
671 * when extent_io.c finds a delayed allocation range in the file,
672 * the call backs end up in this code. The basic idea is to
673 * allocate extents on disk for the range, and create ordered data structs
674 * in ram to track those extents.
675 *
676 * locked_page is the page that writepage had locked already. We use
677 * it to make sure we don't do extra locks or unlocks.
678 *
679 * *page_started is set to one if we unlock locked_page and do everything
680 * required to start IO on it. It may be clean and already done with
681 * IO when we return.
682 */
683static noinline int cow_file_range(struct inode *inode,
684 struct page *locked_page,
685 u64 start, u64 end, int *page_started,
686 unsigned long *nr_written,
687 int unlock)
688{
689 struct btrfs_root *root = BTRFS_I(inode)->root;
690 struct btrfs_trans_handle *trans;
691 u64 alloc_hint = 0;
692 u64 num_bytes;
693 unsigned long ram_size;
694 u64 disk_num_bytes;
695 u64 cur_alloc_size;
696 u64 blocksize = root->sectorsize;
697 u64 actual_end;
698 struct btrfs_key ins;
699 struct extent_map *em;
700 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
701 int ret = 0;
702
703 trans = btrfs_join_transaction(root, 1);
704 BUG_ON(!trans);
705 btrfs_set_trans_block_group(trans, inode);
456 706
707 actual_end = min_t(u64, i_size_read(inode), end + 1);
708
709 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
710 num_bytes = max(blocksize, num_bytes);
711 disk_num_bytes = num_bytes;
712 ret = 0;
713
714 if (start == 0) {
715 /* lets try to make an inline extent */
716 ret = cow_file_range_inline(trans, root, inode,
717 start, end, 0, NULL);
718 if (ret == 0) {
719 extent_clear_unlock_delalloc(inode,
720 &BTRFS_I(inode)->io_tree,
721 start, end, NULL, 1, 1,
722 1, 1, 1, 1);
723 *nr_written = *nr_written +
724 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
725 *page_started = 1;
726 ret = 0;
727 goto out;
728 }
729 }
730
731 BUG_ON(disk_num_bytes >
732 btrfs_super_total_bytes(&root->fs_info->super_copy));
733
734 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
735
736 while(disk_num_bytes > 0) {
457 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 737 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
458 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 738 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
459 min_bytes, 0, alloc_hint, 739 root->sectorsize, 0, alloc_hint,
460 (u64)-1, &ins, 1); 740 (u64)-1, &ins, 1);
461 if (ret) { 741 if (ret) {
462 WARN_ON(1); 742 BUG();
463 goto free_pages_out_fail;
464 } 743 }
465 em = alloc_extent_map(GFP_NOFS); 744 em = alloc_extent_map(GFP_NOFS);
466 em->start = start; 745 em->start = start;
467 746
468 if (will_compress) { 747 ram_size = ins.offset;
469 ram_size = num_bytes; 748 em->len = ins.offset;
470 em->len = num_bytes;
471 } else {
472 /* ramsize == disk size */
473 ram_size = ins.offset;
474 em->len = ins.offset;
475 }
476 749
477 em->block_start = ins.objectid; 750 em->block_start = ins.objectid;
478 em->block_len = ins.offset; 751 em->block_len = ins.offset;
479 em->bdev = root->fs_info->fs_devices->latest_bdev; 752 em->bdev = root->fs_info->fs_devices->latest_bdev;
480 set_bit(EXTENT_FLAG_PINNED, &em->flags); 753 set_bit(EXTENT_FLAG_PINNED, &em->flags);
481 754
482 if (will_compress)
483 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
484
485 while(1) { 755 while(1) {
486 spin_lock(&em_tree->lock); 756 spin_lock(&em_tree->lock);
487 ret = add_extent_mapping(em_tree, em); 757 ret = add_extent_mapping(em_tree, em);
@@ -495,10 +765,8 @@ again:
495 } 765 }
496 766
497 cur_alloc_size = ins.offset; 767 cur_alloc_size = ins.offset;
498 ordered_type = will_compress ? BTRFS_ORDERED_COMPRESSED : 0;
499 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 768 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
500 ram_size, cur_alloc_size, 769 ram_size, cur_alloc_size, 0);
501 ordered_type);
502 BUG_ON(ret); 770 BUG_ON(ret);
503 771
504 if (disk_num_bytes < cur_alloc_size) { 772 if (disk_num_bytes < cur_alloc_size) {
@@ -506,82 +774,145 @@ again:
506 cur_alloc_size); 774 cur_alloc_size);
507 break; 775 break;
508 } 776 }
509
510 if (will_compress) {
511 /*
512 * we're doing compression, we and we need to
513 * submit the compressed extents down to the device.
514 *
515 * We lock down all the file pages, clearing their
516 * dirty bits and setting them writeback. Everyone
517 * that wants to modify the page will wait on the
518 * ordered extent above.
519 *
520 * The writeback bits on the file pages are
521 * cleared when the compressed pages are on disk
522 */
523 btrfs_end_transaction(trans, root);
524
525 if (start <= page_offset(locked_page) &&
526 page_offset(locked_page) < start + ram_size) {
527 *page_started = 1;
528 }
529
530 extent_clear_unlock_delalloc(inode,
531 &BTRFS_I(inode)->io_tree,
532 start,
533 start + ram_size - 1,
534 NULL, 1, 1, 0);
535
536 ret = btrfs_submit_compressed_write(inode, start,
537 ram_size, ins.objectid,
538 cur_alloc_size, pages,
539 nr_pages_ret);
540
541 BUG_ON(ret);
542 trans = btrfs_join_transaction(root, 1);
543 if (start + ram_size < end) {
544 start += ram_size;
545 alloc_hint = ins.objectid + ins.offset;
546 /* pages will be freed at end_bio time */
547 pages = NULL;
548 goto again;
549 } else {
550 /* we've written everything, time to go */
551 break;
552 }
553 }
554 /* we're not doing compressed IO, don't unlock the first 777 /* we're not doing compressed IO, don't unlock the first
555 * page (which the caller expects to stay locked), don't 778 * page (which the caller expects to stay locked), don't
556 * clear any dirty bits and don't set any writeback bits 779 * clear any dirty bits and don't set any writeback bits
557 */ 780 */
558 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 781 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
559 start, start + ram_size - 1, 782 start, start + ram_size - 1,
560 locked_page, 0, 0, 0); 783 locked_page, unlock, 1,
784 1, 0, 0, 0);
561 disk_num_bytes -= cur_alloc_size; 785 disk_num_bytes -= cur_alloc_size;
562 num_bytes -= cur_alloc_size; 786 num_bytes -= cur_alloc_size;
563 alloc_hint = ins.objectid + ins.offset; 787 alloc_hint = ins.objectid + ins.offset;
564 start += cur_alloc_size; 788 start += cur_alloc_size;
565 } 789 }
566
567 ret = 0;
568out: 790out:
791 ret = 0;
569 btrfs_end_transaction(trans, root); 792 btrfs_end_transaction(trans, root);
570 793
571 return ret; 794 return ret;
795}
572 796
573free_pages_out_fail: 797/*
574 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 798 * work queue call back to start compression on a file and pages
575 start, end, locked_page, 0, 0, 0); 799 */
576free_pages_out: 800static noinline void async_cow_start(struct btrfs_work *work)
577 for (i = 0; i < nr_pages_ret; i++) { 801{
578 WARN_ON(pages[i]->mapping); 802 struct async_cow *async_cow;
579 page_cache_release(pages[i]); 803 int num_added = 0;
804 async_cow = container_of(work, struct async_cow, work);
805
806 compress_file_range(async_cow->inode, async_cow->locked_page,
807 async_cow->start, async_cow->end, async_cow,
808 &num_added);
809 if (num_added == 0)
810 async_cow->inode = NULL;
811}
812
813/*
814 * work queue call back to submit previously compressed pages
815 */
816static noinline void async_cow_submit(struct btrfs_work *work)
817{
818 struct async_cow *async_cow;
819 struct btrfs_root *root;
820 unsigned long nr_pages;
821
822 async_cow = container_of(work, struct async_cow, work);
823
824 root = async_cow->root;
825 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
826 PAGE_CACHE_SHIFT;
827
828 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
829
830 if (atomic_read(&root->fs_info->async_delalloc_pages) <
831 5 * 1024 * 1024 &&
832 waitqueue_active(&root->fs_info->async_submit_wait))
833 wake_up(&root->fs_info->async_submit_wait);
834
835 if (async_cow->inode) {
836 submit_compressed_extents(async_cow->inode, async_cow);
580 } 837 }
581 if (pages) 838}
582 kfree(pages);
583 839
584 goto out; 840static noinline void async_cow_free(struct btrfs_work *work)
841{
842 struct async_cow *async_cow;
843 async_cow = container_of(work, struct async_cow, work);
844 kfree(async_cow);
845}
846
847static int cow_file_range_async(struct inode *inode, struct page *locked_page,
848 u64 start, u64 end, int *page_started,
849 unsigned long *nr_written)
850{
851 struct async_cow *async_cow;
852 struct btrfs_root *root = BTRFS_I(inode)->root;
853 unsigned long nr_pages;
854 u64 cur_end;
855 int limit = 10 * 1024 * 1024;
856
857 if (!btrfs_test_opt(root, COMPRESS)) {
858 return cow_file_range(inode, locked_page, start, end,
859 page_started, nr_written, 1);
860 }
861
862 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
863 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
864 while(start < end) {
865 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
866 async_cow->inode = inode;
867 async_cow->root = root;
868 async_cow->locked_page = locked_page;
869 async_cow->start = start;
870
871 if (btrfs_test_flag(inode, NOCOMPRESS))
872 cur_end = end;
873 else
874 cur_end = min(end, start + 512 * 1024 - 1);
875
876 async_cow->end = cur_end;
877 INIT_LIST_HEAD(&async_cow->extents);
878
879 async_cow->work.func = async_cow_start;
880 async_cow->work.ordered_func = async_cow_submit;
881 async_cow->work.ordered_free = async_cow_free;
882 async_cow->work.flags = 0;
883
884 while(atomic_read(&root->fs_info->async_submit_draining) &&
885 atomic_read(&root->fs_info->async_delalloc_pages)) {
886 wait_event(root->fs_info->async_submit_wait,
887 (atomic_read(&root->fs_info->async_delalloc_pages)
888 == 0));
889 }
890
891 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
892 PAGE_CACHE_SHIFT;
893 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
894
895 btrfs_queue_worker(&root->fs_info->delalloc_workers,
896 &async_cow->work);
897
898 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
899 wait_event(root->fs_info->async_submit_wait,
900 (atomic_read(&root->fs_info->async_delalloc_pages) <
901 limit));
902 }
903
904 while(atomic_read(&root->fs_info->async_submit_draining) &&
905 atomic_read(&root->fs_info->async_delalloc_pages)) {
906 wait_event(root->fs_info->async_submit_wait,
907 (atomic_read(&root->fs_info->async_delalloc_pages) ==
908 0));
909 }
910
911 *nr_written += nr_pages;
912 start = cur_end + 1;
913 }
914 *page_started = 1;
915 return 0;
585} 916}
586 917
587/* 918/*
@@ -592,7 +923,8 @@ free_pages_out:
592 * blocks on disk 923 * blocks on disk
593 */ 924 */
594static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, 925static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
595 u64 start, u64 end, int *page_started, int force) 926 u64 start, u64 end, int *page_started, int force,
927 unsigned long *nr_written)
596{ 928{
597 struct btrfs_root *root = BTRFS_I(inode)->root; 929 struct btrfs_root *root = BTRFS_I(inode)->root;
598 struct btrfs_trans_handle *trans; 930 struct btrfs_trans_handle *trans;
@@ -711,7 +1043,8 @@ out_check:
711 btrfs_release_path(root, path); 1043 btrfs_release_path(root, path);
712 if (cow_start != (u64)-1) { 1044 if (cow_start != (u64)-1) {
713 ret = cow_file_range(inode, locked_page, cow_start, 1045 ret = cow_file_range(inode, locked_page, cow_start,
714 found_key.offset - 1, page_started); 1046 found_key.offset - 1, page_started,
1047 nr_written, 1);
715 BUG_ON(ret); 1048 BUG_ON(ret);
716 cow_start = (u64)-1; 1049 cow_start = (u64)-1;
717 } 1050 }
@@ -748,9 +1081,10 @@ out_check:
748 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, 1081 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
749 num_bytes, num_bytes, type); 1082 num_bytes, num_bytes, type);
750 BUG_ON(ret); 1083 BUG_ON(ret);
1084
751 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1085 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
752 cur_offset, cur_offset + num_bytes - 1, 1086 cur_offset, cur_offset + num_bytes - 1,
753 locked_page, 0, 0, 0); 1087 locked_page, 1, 1, 1, 0, 0, 0);
754 cur_offset = extent_end; 1088 cur_offset = extent_end;
755 if (cur_offset > end) 1089 if (cur_offset > end)
756 break; 1090 break;
@@ -761,7 +1095,7 @@ out_check:
761 cow_start = cur_offset; 1095 cow_start = cur_offset;
762 if (cow_start != (u64)-1) { 1096 if (cow_start != (u64)-1) {
763 ret = cow_file_range(inode, locked_page, cow_start, end, 1097 ret = cow_file_range(inode, locked_page, cow_start, end,
764 page_started); 1098 page_started, nr_written, 1);
765 BUG_ON(ret); 1099 BUG_ON(ret);
766 } 1100 }
767 1101
@@ -775,7 +1109,8 @@ out_check:
775 * extent_io.c call back to do delayed allocation processing 1109 * extent_io.c call back to do delayed allocation processing
776 */ 1110 */
777static int run_delalloc_range(struct inode *inode, struct page *locked_page, 1111static int run_delalloc_range(struct inode *inode, struct page *locked_page,
778 u64 start, u64 end, int *page_started) 1112 u64 start, u64 end, int *page_started,
1113 unsigned long *nr_written)
779{ 1114{
780 struct btrfs_root *root = BTRFS_I(inode)->root; 1115 struct btrfs_root *root = BTRFS_I(inode)->root;
781 int ret; 1116 int ret;
@@ -783,13 +1118,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
783 if (btrfs_test_opt(root, NODATACOW) || 1118 if (btrfs_test_opt(root, NODATACOW) ||
784 btrfs_test_flag(inode, NODATACOW)) 1119 btrfs_test_flag(inode, NODATACOW))
785 ret = run_delalloc_nocow(inode, locked_page, start, end, 1120 ret = run_delalloc_nocow(inode, locked_page, start, end,
786 page_started, 0); 1121 page_started, 0, nr_written);
787 else if (btrfs_test_flag(inode, PREALLOC)) 1122 else if (btrfs_test_flag(inode, PREALLOC))
788 ret = run_delalloc_nocow(inode, locked_page, start, end, 1123 ret = run_delalloc_nocow(inode, locked_page, start, end,
789 page_started, 1); 1124 page_started, 1, nr_written);
790 else 1125 else
791 ret = cow_file_range(inode, locked_page, start, end, 1126 ret = cow_file_range_async(inode, locked_page, start, end,
792 page_started); 1127 page_started, nr_written);
793 1128
794 return ret; 1129 return ret;
795} 1130}
@@ -861,6 +1196,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
861 u64 map_length; 1196 u64 map_length;
862 int ret; 1197 int ret;
863 1198
1199 if (bio_flags & EXTENT_BIO_COMPRESSED)
1200 return 0;
1201
864 length = bio->bi_size; 1202 length = bio->bi_size;
865 map_tree = &root->fs_info->mapping_tree; 1203 map_tree = &root->fs_info->mapping_tree;
866 map_length = length; 1204 map_length = length;
@@ -925,12 +1263,12 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
925 btrfs_test_flag(inode, NODATASUM); 1263 btrfs_test_flag(inode, NODATASUM);
926 1264
927 if (!(rw & (1 << BIO_RW))) { 1265 if (!(rw & (1 << BIO_RW))) {
928 if (!skip_sum)
929 btrfs_lookup_bio_sums(root, inode, bio);
930 1266
931 if (bio_flags & EXTENT_BIO_COMPRESSED) 1267 if (bio_flags & EXTENT_BIO_COMPRESSED)
932 return btrfs_submit_compressed_read(inode, bio, 1268 return btrfs_submit_compressed_read(inode, bio,
933 mirror_num, bio_flags); 1269 mirror_num, bio_flags);
1270 else if (!skip_sum)
1271 btrfs_lookup_bio_sums(root, inode, bio);
934 goto mapit; 1272 goto mapit;
935 } else if (!skip_sum) { 1273 } else if (!skip_sum) {
936 /* we're doing a write, do the async checksumming */ 1274 /* we're doing a write, do the async checksumming */
@@ -966,6 +1304,9 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
966 1304
967int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) 1305int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
968{ 1306{
1307 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) {
1308 WARN_ON(1);
1309 }
969 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1310 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
970 GFP_NOFS); 1311 GFP_NOFS);
971} 1312}
@@ -2105,6 +2446,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2105 int pending_del_nr = 0; 2446 int pending_del_nr = 0;
2106 int pending_del_slot = 0; 2447 int pending_del_slot = 0;
2107 int extent_type = -1; 2448 int extent_type = -1;
2449 int encoding;
2108 u64 mask = root->sectorsize - 1; 2450 u64 mask = root->sectorsize - 1;
2109 2451
2110 if (root->ref_cows) 2452 if (root->ref_cows)
@@ -2144,6 +2486,7 @@ search_again:
2144 leaf = path->nodes[0]; 2486 leaf = path->nodes[0];
2145 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2487 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2146 found_type = btrfs_key_type(&found_key); 2488 found_type = btrfs_key_type(&found_key);
2489 encoding = 0;
2147 2490
2148 if (found_key.objectid != inode->i_ino) 2491 if (found_key.objectid != inode->i_ino)
2149 break; 2492 break;
@@ -2156,6 +2499,10 @@ search_again:
2156 fi = btrfs_item_ptr(leaf, path->slots[0], 2499 fi = btrfs_item_ptr(leaf, path->slots[0],
2157 struct btrfs_file_extent_item); 2500 struct btrfs_file_extent_item);
2158 extent_type = btrfs_file_extent_type(leaf, fi); 2501 extent_type = btrfs_file_extent_type(leaf, fi);
2502 encoding = btrfs_file_extent_compression(leaf, fi);
2503 encoding |= btrfs_file_extent_encryption(leaf, fi);
2504 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2505
2159 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 2506 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2160 item_end += 2507 item_end +=
2161 btrfs_file_extent_num_bytes(leaf, fi); 2508 btrfs_file_extent_num_bytes(leaf, fi);
@@ -2200,7 +2547,7 @@ search_again:
2200 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 2547 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2201 u64 num_dec; 2548 u64 num_dec;
2202 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 2549 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2203 if (!del_item) { 2550 if (!del_item && !encoding) {
2204 u64 orig_num_bytes = 2551 u64 orig_num_bytes =
2205 btrfs_file_extent_num_bytes(leaf, fi); 2552 btrfs_file_extent_num_bytes(leaf, fi);
2206 extent_num_bytes = new_size - 2553 extent_num_bytes = new_size -
@@ -2436,7 +2783,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2436 last_byte = min(extent_map_end(em), block_end); 2783 last_byte = min(extent_map_end(em), block_end);
2437 last_byte = (last_byte + mask) & ~mask; 2784 last_byte = (last_byte + mask) & ~mask;
2438 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { 2785 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2786 u64 hint_byte = 0;
2439 hole_size = last_byte - cur_offset; 2787 hole_size = last_byte - cur_offset;
2788 err = btrfs_drop_extents(trans, root, inode,
2789 cur_offset,
2790 cur_offset + hole_size,
2791 cur_offset, &hint_byte);
2792 if (err)
2793 break;
2440 err = btrfs_insert_file_extent(trans, root, 2794 err = btrfs_insert_file_extent(trans, root,
2441 inode->i_ino, cur_offset, 0, 2795 inode->i_ino, cur_offset, 0,
2442 0, hole_size, 0, hole_size, 2796 0, hole_size, 0, hole_size,
@@ -3785,6 +4139,7 @@ int btrfs_writepages(struct address_space *mapping,
3785 struct writeback_control *wbc) 4139 struct writeback_control *wbc)
3786{ 4140{
3787 struct extent_io_tree *tree; 4141 struct extent_io_tree *tree;
4142
3788 tree = &BTRFS_I(mapping->host)->io_tree; 4143 tree = &BTRFS_I(mapping->host)->io_tree;
3789 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 4144 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
3790} 4145}
@@ -4285,9 +4640,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4285 * ordered extents get created before we return 4640 * ordered extents get created before we return
4286 */ 4641 */
4287 atomic_inc(&root->fs_info->async_submit_draining); 4642 atomic_inc(&root->fs_info->async_submit_draining);
4288 while(atomic_read(&root->fs_info->nr_async_submits)) { 4643 while(atomic_read(&root->fs_info->nr_async_submits) ||
4644 atomic_read(&root->fs_info->async_delalloc_pages)) {
4289 wait_event(root->fs_info->async_submit_wait, 4645 wait_event(root->fs_info->async_submit_wait,
4290 (atomic_read(&root->fs_info->nr_async_submits) == 0)); 4646 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4647 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4291 } 4648 }
4292 atomic_dec(&root->fs_info->async_submit_draining); 4649 atomic_dec(&root->fs_info->async_submit_draining);
4293 return 0; 4650 return 0;
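
The async path added above is, at its core, a throttled producer/consumer: cow_file_range_async queues a work item per chunk and bumps async_delalloc_pages, blocks on async_submit_wait once the backlog passes a limit, and async_cow_submit drops the count and wakes the waiters as chunks are submitted. Below is a minimal standalone userspace sketch of that throttling shape only (illustrative names, pthreads and C11 atomics standing in for the kernel wait queue; this is not btrfs code).

/* Standalone sketch of the backlog throttling above: producers account
 * for queued pages in an atomic counter and block once it passes a limit,
 * workers drop the count and wake them as work is submitted.  All names
 * here are illustrative; this is not kernel code.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define BACKLOG_LIMIT (10 * 1024 * 1024)  /* cf. the 10MB limit in cow_file_range_async */

static atomic_long inflight_pages;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;

/* producer side: account for nr_pages, then throttle if too much is queued */
static void queue_chunk(long nr_pages)
{
	atomic_fetch_add(&inflight_pages, nr_pages);

	pthread_mutex_lock(&lock);
	while (atomic_load(&inflight_pages) > BACKLOG_LIMIT)
		pthread_cond_wait(&drained, &lock);   /* ~ wait_event(async_submit_wait, ...) */
	pthread_mutex_unlock(&lock);
}

/* worker side: drop the accounting once a chunk has been submitted */
static void chunk_submitted(long nr_pages)
{
	atomic_fetch_sub(&inflight_pages, nr_pages);

	pthread_mutex_lock(&lock);
	pthread_cond_broadcast(&drained);             /* ~ wake_up(&async_submit_wait) */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	queue_chunk(128);
	chunk_submitted(128);
	printf("in flight: %ld\n", (long)atomic_load(&inflight_pages));
	return 0;
}

The real code also throttles on a second condition (the async_submit_draining flag used when sync forces all delalloc out); the sketch keeps only the page-count half.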
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 370bb4285597..027ad6b3839e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -390,7 +390,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
390 * start IO on any dirty ones so the wait doesn't stall waiting 390 * start IO on any dirty ones so the wait doesn't stall waiting
391 * for pdflush to find them 391 * for pdflush to find them
392 */ 392 */
393 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE); 393 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
394 if (wait) { 394 if (wait) {
395 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 395 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
396 &entry->flags)); 396 &entry->flags));
@@ -421,6 +421,12 @@ again:
421 */ 421 */
422 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); 422 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
423 423
424 /* The compression code will leave pages locked but return from
425 * writepage without setting the page writeback. Starting again
426 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
427 */
428 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
429
424 btrfs_wait_on_page_writeback_range(inode->i_mapping, 430 btrfs_wait_on_page_writeback_range(inode->i_mapping,
425 start >> PAGE_CACHE_SHIFT, 431 start >> PAGE_CACHE_SHIFT,
426 orig_end >> PAGE_CACHE_SHIFT); 432 orig_end >> PAGE_CACHE_SHIFT);
@@ -448,10 +454,7 @@ again:
448 } 454 }
449 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, 455 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
450 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { 456 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
451 printk("inode %lu still ordered or delalloc after wait " 457 schedule_timeout(1);
452 "%llu %llu\n", inode->i_ino,
453 (unsigned long long)start,
454 (unsigned long long)orig_end);
455 goto again; 458 goto again;
456 } 459 }
457 return 0; 460 return 0;
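
The ordered-data change above flushes the range twice: one pass that only starts IO, then a WB_SYNC_ALL pass that also waits, because compressed writepage can return with pages still locked and not yet marked writeback. The following is a loose userspace analogy of that kick-then-wait shape using Linux sync_file_range(2); it is illustrative only and does not model the page-lock subtlety that motivates the kernel change.

/* Loose userspace analogy (Linux-specific, illustrative only) of the
 * two-pass flush above: first kick off writeback for the range without
 * waiting, then make a second pass that writes and waits so nothing that
 * was skipped the first time is left behind.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int flush_range(int fd, off_t start, off_t len)
{
	/* pass 1: start IO for the range, do not wait (cf. WB_SYNC_NONE) */
	if (sync_file_range(fd, start, len, SYNC_FILE_RANGE_WRITE))
		return -1;

	/* pass 2: write whatever is still dirty and wait for it (cf. WB_SYNC_ALL) */
	return sync_file_range(fd, start, len,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}

int main(void)
{
	int fd = open("flush-demo.tmp", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "some data\n", 10) != 10)
		perror("write");
	if (flush_range(fd, 0, 0))      /* nbytes == 0 means "through end of file" */
		perror("sync_file_range");
	close(fd);
	return 0;
}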
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 431fdf144b58..ab9d5e89ed13 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -375,6 +375,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
375 filemap_flush(root->fs_info->btree_inode->i_mapping); 375 filemap_flush(root->fs_info->btree_inode->i_mapping);
376 return 0; 376 return 0;
377 } 377 }
378
379 btrfs_start_delalloc_inodes(root);
380 btrfs_wait_ordered_extents(root, 0);
381
378 btrfs_clean_old_snapshots(root); 382 btrfs_clean_old_snapshots(root);
379 trans = btrfs_start_transaction(root, 1); 383 trans = btrfs_start_transaction(root, 1);
380 ret = btrfs_commit_transaction(trans, root); 384 ret = btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index e99309180a11..ba2527d08734 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -423,8 +423,9 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
423 /* we didn't make progress in this inflate 423 /* we didn't make progress in this inflate
424 * call, we're done 424 * call, we're done
425 */ 425 */
426 if (ret != Z_STREAM_END) 426 if (ret != Z_STREAM_END) {
427 ret = -1; 427 ret = -1;
428 }
428 break; 429 break;
429 } 430 }
430 431
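
The zlib hunk above makes a stalled inflate call fatal: if a call makes no progress and the stream has not ended, the decompressor returns -1 instead of looping again. Below is a plain-zlib userspace sketch of the same guard (not the btrfs biovec code; buffer names and the round-trip in main are illustrative only).

/* Illustrative userspace inflate loop showing the guard added above: an
 * inflate() call that moves neither the input nor the output cursor and
 * does not report Z_STREAM_END is treated as corruption, not retried.
 */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

static int decompress_buf(const unsigned char *in, unsigned long in_len,
			  unsigned char *out, unsigned long out_len)
{
	z_stream strm;
	int ret;

	memset(&strm, 0, sizeof(strm));
	if (inflateInit(&strm) != Z_OK)
		return -1;

	strm.next_in = (unsigned char *)in;
	strm.avail_in = (uInt)in_len;
	strm.next_out = out;
	strm.avail_out = (uInt)out_len;

	while (strm.avail_in > 0 && strm.avail_out > 0) {
		unsigned long in_before = strm.total_in;
		unsigned long out_before = strm.total_out;

		ret = inflate(&strm, Z_NO_FLUSH);
		if (ret == Z_STREAM_END)
			break;
		/* no forward progress and no clean end: bail out with an error */
		if (ret != Z_OK ||
		    (strm.total_in == in_before && strm.total_out == out_before)) {
			inflateEnd(&strm);
			return -1;
		}
	}
	inflateEnd(&strm);
	return (int)strm.total_out;
}

int main(void)
{
	const char *msg = "hello hello hello hello";
	unsigned char comp[128], plain[128];
	uLongf clen = sizeof(comp);

	if (compress(comp, &clen, (const unsigned char *)msg, strlen(msg) + 1) != Z_OK)
		return 1;
	if (decompress_buf(comp, clen, plain, sizeof(plain)) < 0)
		return 1;
	printf("round trip: %s\n", plain);
	return 0;
}

The kernel loop detects "no progress" by comparing its page/biovec cursors before and after the call rather than total_in/total_out, but the decision is the same: a stalled stream without Z_STREAM_END is an error.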