-rw-r--r--  fs/btrfs/compression.c    150
-rw-r--r--  fs/btrfs/ctree.h            4
-rw-r--r--  fs/btrfs/disk-io.c         27
-rw-r--r--  fs/btrfs/extent-tree.c      6
-rw-r--r--  fs/btrfs/extent_io.c      140
-rw-r--r--  fs/btrfs/extent_io.h       13
-rw-r--r--  fs/btrfs/file.c            53
-rw-r--r--  fs/btrfs/inode.c          643
-rw-r--r--  fs/btrfs/ordered-data.c    13
-rw-r--r--  fs/btrfs/super.c            4
-rw-r--r--  fs/btrfs/zlib.c             3
11 files changed, 849 insertions, 207 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 354913177ba6..284f21025bcc 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -33,6 +33,7 @@
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h> 34#include <linux/bit_spinlock.h>
35#include <linux/version.h> 35#include <linux/version.h>
36#include <linux/pagevec.h>
36#include "ctree.h" 37#include "ctree.h"
37#include "disk-io.h" 38#include "disk-io.h"
38#include "transaction.h" 39#include "transaction.h"
@@ -145,9 +146,9 @@ static void end_compressed_bio_read(struct bio *bio, int err)
145 } 146 }
146 147
147 /* do io completion on the original bio */ 148 /* do io completion on the original bio */
148 if (cb->errors) 149 if (cb->errors) {
149 bio_io_error(cb->orig_bio); 150 bio_io_error(cb->orig_bio);
150 else 151 } else
151 bio_endio(cb->orig_bio, 0); 152 bio_endio(cb->orig_bio, 0);
152 153
153 /* finally free the cb struct */ 154 /* finally free the cb struct */
@@ -333,6 +334,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
333 } 334 }
334 bytes_left -= PAGE_CACHE_SIZE; 335 bytes_left -= PAGE_CACHE_SIZE;
335 first_byte += PAGE_CACHE_SIZE; 336 first_byte += PAGE_CACHE_SIZE;
337 cond_resched();
336 } 338 }
337 bio_get(bio); 339 bio_get(bio);
338 340
@@ -346,6 +348,130 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
346 return 0; 348 return 0;
347} 349}
348 350
351static noinline int add_ra_bio_pages(struct inode *inode,
352 u64 compressed_end,
353 struct compressed_bio *cb)
354{
355 unsigned long end_index;
356 unsigned long page_index;
357 u64 last_offset;
358 u64 isize = i_size_read(inode);
359 int ret;
360 struct page *page;
361 unsigned long nr_pages = 0;
362 struct extent_map *em;
363 struct address_space *mapping = inode->i_mapping;
364 struct pagevec pvec;
365 struct extent_map_tree *em_tree;
366 struct extent_io_tree *tree;
367 u64 end;
368 int misses = 0;
369
370 page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
371 last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
372 em_tree = &BTRFS_I(inode)->extent_tree;
373 tree = &BTRFS_I(inode)->io_tree;
374
375 if (isize == 0)
376 return 0;
377
378 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
379
380 pagevec_init(&pvec, 0);
381 while(last_offset < compressed_end) {
382 page_index = last_offset >> PAGE_CACHE_SHIFT;
383
384 if (page_index > end_index)
385 break;
386
387 rcu_read_lock();
388 page = radix_tree_lookup(&mapping->page_tree, page_index);
389 rcu_read_unlock();
390 if (page) {
391 misses++;
392 if (misses > 4)
393 break;
394 goto next;
395 }
396
397 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
398 if (!page)
399 break;
400
401 page->index = page_index;
402 /*
403 * what we want to do here is call add_to_page_cache_lru,
404 * but that isn't exported, so we reproduce it here
405 */
406 if (add_to_page_cache(page, mapping,
407 page->index, GFP_NOFS)) {
408 page_cache_release(page);
409 goto next;
410 }
411
412 /* open coding of lru_cache_add, also not exported */
413 page_cache_get(page);
414 if (!pagevec_add(&pvec, page))
415 __pagevec_lru_add(&pvec);
416
417 end = last_offset + PAGE_CACHE_SIZE - 1;
418 /*
419 * at this point, we have a locked page in the page cache
420 * for these bytes in the file. But, we have to make
421 * sure they map to this compressed extent on disk.
422 */
423 set_page_extent_mapped(page);
424 lock_extent(tree, last_offset, end, GFP_NOFS);
425 spin_lock(&em_tree->lock);
426 em = lookup_extent_mapping(em_tree, last_offset,
427 PAGE_CACHE_SIZE);
428 spin_unlock(&em_tree->lock);
429
430 if (!em || last_offset < em->start ||
431 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
432 (em->block_start >> 9) != cb->orig_bio->bi_sector) {
433 free_extent_map(em);
434 unlock_extent(tree, last_offset, end, GFP_NOFS);
435 unlock_page(page);
436 page_cache_release(page);
437 break;
438 }
439 free_extent_map(em);
440
441 if (page->index == end_index) {
442 char *userpage;
443 size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
444
445 if (zero_offset) {
446 int zeros;
447 zeros = PAGE_CACHE_SIZE - zero_offset;
448 userpage = kmap_atomic(page, KM_USER0);
449 memset(userpage + zero_offset, 0, zeros);
450 flush_dcache_page(page);
451 kunmap_atomic(userpage, KM_USER0);
452 }
453 }
454
455 ret = bio_add_page(cb->orig_bio, page,
456 PAGE_CACHE_SIZE, 0);
457
458 if (ret == PAGE_CACHE_SIZE) {
459 nr_pages++;
460 page_cache_release(page);
461 } else {
462 unlock_extent(tree, last_offset, end, GFP_NOFS);
463 unlock_page(page);
464 page_cache_release(page);
465 break;
466 }
467next:
468 last_offset += PAGE_CACHE_SIZE;
469 }
470 if (pagevec_count(&pvec))
471 __pagevec_lru_add(&pvec);
472 return 0;
473}
474
349/* 475/*
350 * for a compressed read, the bio we get passed has all the inode pages 476 * for a compressed read, the bio we get passed has all the inode pages
351 * in it. We don't actually do IO on those pages but allocate new ones 477 * in it. We don't actually do IO on those pages but allocate new ones
@@ -373,6 +499,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
373 struct block_device *bdev; 499 struct block_device *bdev;
374 struct bio *comp_bio; 500 struct bio *comp_bio;
375 u64 cur_disk_byte = (u64)bio->bi_sector << 9; 501 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
502 u64 em_len;
376 struct extent_map *em; 503 struct extent_map *em;
377 int ret; 504 int ret;
378 505
@@ -393,6 +520,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
393 520
394 cb->start = em->start; 521 cb->start = em->start;
395 compressed_len = em->block_len; 522 compressed_len = em->block_len;
523 em_len = em->len;
396 free_extent_map(em); 524 free_extent_map(em);
397 525
398 cb->len = uncompressed_len; 526 cb->len = uncompressed_len;
@@ -411,6 +539,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
411 } 539 }
412 cb->nr_pages = nr_pages; 540 cb->nr_pages = nr_pages;
413 541
542 add_ra_bio_pages(inode, cb->start + em_len, cb);
543
544 if (!btrfs_test_opt(root, NODATASUM) &&
545 !btrfs_test_flag(inode, NODATASUM)) {
546 btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
547 }
548
549 /* include any pages we added in add_ra-bio_pages */
550 uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
551 cb->len = uncompressed_len;
552
414 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); 553 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
415 comp_bio->bi_private = cb; 554 comp_bio->bi_private = cb;
416 comp_bio->bi_end_io = end_compressed_bio_read; 555 comp_bio->bi_end_io = end_compressed_bio_read;
@@ -442,9 +581,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
442 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, 581 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
443 GFP_NOFS); 582 GFP_NOFS);
444 atomic_inc(&cb->pending_bios); 583 atomic_inc(&cb->pending_bios);
445 bio->bi_private = cb; 584 comp_bio->bi_private = cb;
446 bio->bi_end_io = end_compressed_bio_write; 585 comp_bio->bi_end_io = end_compressed_bio_read;
447 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 586
587 bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
448 } 588 }
449 cur_disk_byte += PAGE_CACHE_SIZE; 589 cur_disk_byte += PAGE_CACHE_SIZE;
450 } 590 }
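
The add_ra_bio_pages() helper added above opportunistically populates the page cache ahead of the original read bio, so a single pass can read and decompress the whole compressed extent instead of re-reading it for every page fault. It walks forward one page at a time from the page after the last page already in cb->orig_bio, and stops at the end of the extent, at i_size, after it keeps hitting pages that are already cached (the misses > 4 check), or as soon as a candidate page no longer maps to the same compressed extent on disk. Below is a plain userspace C sketch of just that window arithmetic; the 4 KiB page size and the sample offsets are assumptions made up for the example, and none of this is btrfs code.

/* Illustrative userspace sketch of the readahead window walked by
 * add_ra_bio_pages(): from the page after the last page in the original
 * bio up to the end of the compressed extent, clamped to i_size.
 * PAGE_SHIFT and the sample offsets below are assumptions for the demo.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	uint64_t isize          = 1000 * 1024;  /* file size (example)           */
	uint64_t last_page_off  = 256 * 1024;   /* offset of last page in the bio */
	uint64_t compressed_end = 384 * 1024;   /* cb->start + em->len (example)  */

	uint64_t last_offset = last_page_off + PAGE_SIZE;
	uint64_t end_index   = (isize - 1) >> PAGE_SHIFT;

	while (last_offset < compressed_end) {
		uint64_t page_index = last_offset >> PAGE_SHIFT;

		if (page_index > end_index)
			break;          /* never read ahead past i_size */

		printf("would try to add page index %llu\n",
		       (unsigned long long)page_index);
		last_offset += PAGE_SIZE;
	}
	return 0;
}

In the kernel the loop additionally takes the extent lock and verifies the extent mapping for each candidate page before adding it to the bio, as the hunk above shows.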
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 689df070c8e9..c83cc5b2ded7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -625,8 +625,8 @@ struct btrfs_fs_info {
625 struct btrfs_transaction *running_transaction; 625 struct btrfs_transaction *running_transaction;
626 wait_queue_head_t transaction_throttle; 626 wait_queue_head_t transaction_throttle;
627 wait_queue_head_t transaction_wait; 627 wait_queue_head_t transaction_wait;
628 wait_queue_head_t async_submit_wait;
629 628
629 wait_queue_head_t async_submit_wait;
630 wait_queue_head_t tree_log_wait; 630 wait_queue_head_t tree_log_wait;
631 631
632 struct btrfs_super_block super_copy; 632 struct btrfs_super_block super_copy;
@@ -653,6 +653,7 @@ struct btrfs_fs_info {
653 atomic_t nr_async_submits; 653 atomic_t nr_async_submits;
654 atomic_t async_submit_draining; 654 atomic_t async_submit_draining;
655 atomic_t nr_async_bios; 655 atomic_t nr_async_bios;
656 atomic_t async_delalloc_pages;
656 atomic_t tree_log_writers; 657 atomic_t tree_log_writers;
657 atomic_t tree_log_commit; 658 atomic_t tree_log_commit;
658 unsigned long tree_log_batch; 659 unsigned long tree_log_batch;
@@ -677,6 +678,7 @@ struct btrfs_fs_info {
677 * two 678 * two
678 */ 679 */
679 struct btrfs_workers workers; 680 struct btrfs_workers workers;
681 struct btrfs_workers delalloc_workers;
680 struct btrfs_workers endio_workers; 682 struct btrfs_workers endio_workers;
681 struct btrfs_workers endio_write_workers; 683 struct btrfs_workers endio_write_workers;
682 struct btrfs_workers submit_workers; 684 struct btrfs_workers submit_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e0a28f705a64..8efc123d222b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -539,6 +539,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
539 (atomic_read(&fs_info->nr_async_bios) < limit), 539 (atomic_read(&fs_info->nr_async_bios) < limit),
540 HZ/10); 540 HZ/10);
541 } 541 }
542
543 while(atomic_read(&fs_info->async_submit_draining) &&
544 atomic_read(&fs_info->nr_async_submits)) {
545 wait_event(fs_info->async_submit_wait,
546 (atomic_read(&fs_info->nr_async_submits) == 0));
547 }
548
542 return 0; 549 return 0;
543} 550}
544 551
@@ -1437,6 +1444,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1437 INIT_LIST_HEAD(&fs_info->space_info); 1444 INIT_LIST_HEAD(&fs_info->space_info);
1438 btrfs_mapping_init(&fs_info->mapping_tree); 1445 btrfs_mapping_init(&fs_info->mapping_tree);
1439 atomic_set(&fs_info->nr_async_submits, 0); 1446 atomic_set(&fs_info->nr_async_submits, 0);
1447 atomic_set(&fs_info->async_delalloc_pages, 0);
1440 atomic_set(&fs_info->async_submit_draining, 0); 1448 atomic_set(&fs_info->async_submit_draining, 0);
1441 atomic_set(&fs_info->nr_async_bios, 0); 1449 atomic_set(&fs_info->nr_async_bios, 0);
1442 atomic_set(&fs_info->throttles, 0); 1450 atomic_set(&fs_info->throttles, 0);
@@ -1550,6 +1558,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1550 btrfs_init_workers(&fs_info->workers, "worker", 1558 btrfs_init_workers(&fs_info->workers, "worker",
1551 fs_info->thread_pool_size); 1559 fs_info->thread_pool_size);
1552 1560
1561 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1562 fs_info->thread_pool_size);
1563
1553 btrfs_init_workers(&fs_info->submit_workers, "submit", 1564 btrfs_init_workers(&fs_info->submit_workers, "submit",
1554 min_t(u64, fs_devices->num_devices, 1565 min_t(u64, fs_devices->num_devices,
1555 fs_info->thread_pool_size)); 1566 fs_info->thread_pool_size));
@@ -1560,15 +1571,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1560 */ 1571 */
1561 fs_info->submit_workers.idle_thresh = 64; 1572 fs_info->submit_workers.idle_thresh = 64;
1562 1573
1563 /* fs_info->workers is responsible for checksumming file data 1574 fs_info->workers.idle_thresh = 16;
1564 * blocks and metadata. Using a larger idle thresh allows each
1565 * worker thread to operate on things in roughly the order they
1566 * were sent by the writeback daemons, improving overall locality
1567 * of the IO going down the pipe.
1568 */
1569 fs_info->workers.idle_thresh = 8;
1570 fs_info->workers.ordered = 1; 1575 fs_info->workers.ordered = 1;
1571 1576
1577 fs_info->delalloc_workers.idle_thresh = 2;
1578 fs_info->delalloc_workers.ordered = 1;
1579
1572 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); 1580 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1573 btrfs_init_workers(&fs_info->endio_workers, "endio", 1581 btrfs_init_workers(&fs_info->endio_workers, "endio",
1574 fs_info->thread_pool_size); 1582 fs_info->thread_pool_size);
@@ -1584,6 +1592,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1584 1592
1585 btrfs_start_workers(&fs_info->workers, 1); 1593 btrfs_start_workers(&fs_info->workers, 1);
1586 btrfs_start_workers(&fs_info->submit_workers, 1); 1594 btrfs_start_workers(&fs_info->submit_workers, 1);
1595 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1587 btrfs_start_workers(&fs_info->fixup_workers, 1); 1596 btrfs_start_workers(&fs_info->fixup_workers, 1);
1588 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); 1597 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1589 btrfs_start_workers(&fs_info->endio_write_workers, 1598 btrfs_start_workers(&fs_info->endio_write_workers,
@@ -1732,6 +1741,7 @@ fail_tree_root:
1732fail_sys_array: 1741fail_sys_array:
1733fail_sb_buffer: 1742fail_sb_buffer:
1734 btrfs_stop_workers(&fs_info->fixup_workers); 1743 btrfs_stop_workers(&fs_info->fixup_workers);
1744 btrfs_stop_workers(&fs_info->delalloc_workers);
1735 btrfs_stop_workers(&fs_info->workers); 1745 btrfs_stop_workers(&fs_info->workers);
1736 btrfs_stop_workers(&fs_info->endio_workers); 1746 btrfs_stop_workers(&fs_info->endio_workers);
1737 btrfs_stop_workers(&fs_info->endio_write_workers); 1747 btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -1988,6 +1998,7 @@ int close_ctree(struct btrfs_root *root)
1988 truncate_inode_pages(fs_info->btree_inode->i_mapping, 0); 1998 truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
1989 1999
1990 btrfs_stop_workers(&fs_info->fixup_workers); 2000 btrfs_stop_workers(&fs_info->fixup_workers);
2001 btrfs_stop_workers(&fs_info->delalloc_workers);
1991 btrfs_stop_workers(&fs_info->workers); 2002 btrfs_stop_workers(&fs_info->workers);
1992 btrfs_stop_workers(&fs_info->endio_workers); 2003 btrfs_stop_workers(&fs_info->endio_workers);
1993 btrfs_stop_workers(&fs_info->endio_write_workers); 2004 btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -2062,7 +2073,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2062 struct extent_io_tree *tree; 2073 struct extent_io_tree *tree;
2063 u64 num_dirty; 2074 u64 num_dirty;
2064 u64 start = 0; 2075 u64 start = 0;
2065 unsigned long thresh = 96 * 1024 * 1024; 2076 unsigned long thresh = 32 * 1024 * 1024;
2066 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 2077 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2067 2078
2068 if (current_is_pdflush() || current->flags & PF_MEMALLOC) 2079 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8af39521eb71..ebd8275a1934 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -768,7 +768,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
768 l = path->nodes[0]; 768 l = path->nodes[0];
769 769
770 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 770 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
771 BUG_ON(key.objectid != bytenr); 771 if (key.objectid != bytenr) {
772 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
773 printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
774 BUG();
775 }
772 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); 776 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
773 777
774 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); 778 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9b37ce6e5168..bbe3bcfcf4ae 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -47,6 +47,11 @@ struct extent_page_data {
47 struct bio *bio; 47 struct bio *bio;
48 struct extent_io_tree *tree; 48 struct extent_io_tree *tree;
49 get_extent_t *get_extent; 49 get_extent_t *get_extent;
50
51 /* tells writepage not to lock the state bits for this range
52 * it still does the unlocking
53 */
54 int extent_locked;
50}; 55};
51 56
52int __init extent_io_init(void) 57int __init extent_io_init(void)
@@ -1198,11 +1203,18 @@ static noinline int lock_delalloc_pages(struct inode *inode,
1198 * the caller is taking responsibility for 1203 * the caller is taking responsibility for
1199 * locked_page 1204 * locked_page
1200 */ 1205 */
1201 if (pages[i] != locked_page) 1206 if (pages[i] != locked_page) {
1202 lock_page(pages[i]); 1207 lock_page(pages[i]);
1208 if (pages[i]->mapping != inode->i_mapping) {
1209 ret = -EAGAIN;
1210 unlock_page(pages[i]);
1211 page_cache_release(pages[i]);
1212 goto done;
1213 }
1214 }
1203 page_cache_release(pages[i]); 1215 page_cache_release(pages[i]);
1216 pages_locked++;
1204 } 1217 }
1205 pages_locked += ret;
1206 nrpages -= ret; 1218 nrpages -= ret;
1207 index += ret; 1219 index += ret;
1208 cond_resched(); 1220 cond_resched();
@@ -1262,8 +1274,7 @@ again:
1262 * if we're looping. 1274 * if we're looping.
1263 */ 1275 */
1264 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) { 1276 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
1265 delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) & 1277 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1266 ~((u64)PAGE_CACHE_SIZE - 1);
1267 } 1278 }
1268 /* step two, lock all the pages after the page that has start */ 1279 /* step two, lock all the pages after the page that has start */
1269 ret = lock_delalloc_pages(inode, locked_page, 1280 ret = lock_delalloc_pages(inode, locked_page,
@@ -1306,7 +1317,10 @@ out_failed:
1306int extent_clear_unlock_delalloc(struct inode *inode, 1317int extent_clear_unlock_delalloc(struct inode *inode,
1307 struct extent_io_tree *tree, 1318 struct extent_io_tree *tree,
1308 u64 start, u64 end, struct page *locked_page, 1319 u64 start, u64 end, struct page *locked_page,
1309 int clear_dirty, int set_writeback, 1320 int unlock_pages,
1321 int clear_unlock,
1322 int clear_delalloc, int clear_dirty,
1323 int set_writeback,
1310 int end_writeback) 1324 int end_writeback)
1311{ 1325{
1312 int ret; 1326 int ret;
@@ -1315,12 +1329,19 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1315 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1329 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1316 unsigned long nr_pages = end_index - index + 1; 1330 unsigned long nr_pages = end_index - index + 1;
1317 int i; 1331 int i;
1318 int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; 1332 int clear_bits = 0;
1319 1333
1334 if (clear_unlock)
1335 clear_bits |= EXTENT_LOCKED;
1320 if (clear_dirty) 1336 if (clear_dirty)
1321 clear_bits |= EXTENT_DIRTY; 1337 clear_bits |= EXTENT_DIRTY;
1322 1338
1339 if (clear_delalloc)
1340 clear_bits |= EXTENT_DELALLOC;
1341
1323 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); 1342 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1343 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1344 return 0;
1324 1345
1325 while(nr_pages > 0) { 1346 while(nr_pages > 0) {
1326 ret = find_get_pages_contig(inode->i_mapping, index, 1347 ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1336,7 +1357,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1336 set_page_writeback(pages[i]); 1357 set_page_writeback(pages[i]);
1337 if (end_writeback) 1358 if (end_writeback)
1338 end_page_writeback(pages[i]); 1359 end_page_writeback(pages[i]);
1339 unlock_page(pages[i]); 1360 if (unlock_pages)
1361 unlock_page(pages[i]);
1340 page_cache_release(pages[i]); 1362 page_cache_release(pages[i]);
1341 } 1363 }
1342 nr_pages -= ret; 1364 nr_pages -= ret;
@@ -1741,9 +1763,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1741 } 1763 }
1742 } 1764 }
1743 1765
1744 if (uptodate) 1766 if (uptodate) {
1745 set_extent_uptodate(tree, start, end, 1767 set_extent_uptodate(tree, start, end,
1746 GFP_ATOMIC); 1768 GFP_ATOMIC);
1769 }
1747 unlock_extent(tree, start, end, GFP_ATOMIC); 1770 unlock_extent(tree, start, end, GFP_ATOMIC);
1748 1771
1749 if (whole_page) { 1772 if (whole_page) {
@@ -1925,6 +1948,7 @@ void set_page_extent_mapped(struct page *page)
1925 set_page_private(page, EXTENT_PAGE_PRIVATE); 1948 set_page_private(page, EXTENT_PAGE_PRIVATE);
1926 } 1949 }
1927} 1950}
1951EXPORT_SYMBOL(set_page_extent_mapped);
1928 1952
1929void set_page_extent_head(struct page *page, unsigned long len) 1953void set_page_extent_head(struct page *page, unsigned long len)
1930{ 1954{
@@ -2143,12 +2167,17 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2143 u64 delalloc_end; 2167 u64 delalloc_end;
2144 int page_started; 2168 int page_started;
2145 int compressed; 2169 int compressed;
2170 unsigned long nr_written = 0;
2146 2171
2147 WARN_ON(!PageLocked(page)); 2172 WARN_ON(!PageLocked(page));
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2173 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2174 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2175 (page->index == end_index && !pg_offset)) {
2151 page->mapping->a_ops->invalidatepage(page, 0); 2176 if (epd->extent_locked) {
2177 if (tree->ops && tree->ops->writepage_end_io_hook)
2178 tree->ops->writepage_end_io_hook(page, start,
2179 page_end, NULL, 1);
2180 }
2152 unlock_page(page); 2181 unlock_page(page);
2153 return 0; 2182 return 0;
2154 } 2183 }
@@ -2169,27 +2198,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2169 delalloc_start = start; 2198 delalloc_start = start;
2170 delalloc_end = 0; 2199 delalloc_end = 0;
2171 page_started = 0; 2200 page_started = 0;
2172 while(delalloc_end < page_end) { 2201 if (!epd->extent_locked) {
2173 nr_delalloc = find_lock_delalloc_range(inode, tree, 2202 while(delalloc_end < page_end) {
2203 nr_delalloc = find_lock_delalloc_range(inode, tree,
2174 page, 2204 page,
2175 &delalloc_start, 2205 &delalloc_start,
2176 &delalloc_end, 2206 &delalloc_end,
2177 128 * 1024 * 1024); 2207 128 * 1024 * 1024);
2178 if (nr_delalloc == 0) { 2208 if (nr_delalloc == 0) {
2209 delalloc_start = delalloc_end + 1;
2210 continue;
2211 }
2212 tree->ops->fill_delalloc(inode, page, delalloc_start,
2213 delalloc_end, &page_started,
2214 &nr_written);
2179 delalloc_start = delalloc_end + 1; 2215 delalloc_start = delalloc_end + 1;
2180 continue;
2181 } 2216 }
2182 tree->ops->fill_delalloc(inode, page, delalloc_start,
2183 delalloc_end, &page_started);
2184 delalloc_start = delalloc_end + 1;
2185 }
2186 2217
2187 /* did the fill delalloc function already unlock and start the IO? */ 2218 /* did the fill delalloc function already unlock and start
2188 if (page_started) { 2219 * the IO?
2189 return 0; 2220 */
2221 if (page_started) {
2222 ret = 0;
2223 goto update_nr_written;
2224 }
2190 } 2225 }
2191
2192 lock_extent(tree, start, page_end, GFP_NOFS); 2226 lock_extent(tree, start, page_end, GFP_NOFS);
2227
2193 unlock_start = start; 2228 unlock_start = start;
2194 2229
2195 if (tree->ops && tree->ops->writepage_start_hook) { 2230 if (tree->ops && tree->ops->writepage_start_hook) {
@@ -2199,10 +2234,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2199 unlock_extent(tree, start, page_end, GFP_NOFS); 2234 unlock_extent(tree, start, page_end, GFP_NOFS);
2200 redirty_page_for_writepage(wbc, page); 2235 redirty_page_for_writepage(wbc, page);
2201 unlock_page(page); 2236 unlock_page(page);
2202 return 0; 2237 ret = 0;
2238 goto update_nr_written;
2203 } 2239 }
2204 } 2240 }
2205 2241
2242 nr_written++;
2243
2206 end = page_end; 2244 end = page_end;
2207 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { 2245 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
2208 printk("found delalloc bits after lock_extent\n"); 2246 printk("found delalloc bits after lock_extent\n");
@@ -2333,6 +2371,12 @@ done:
2333 if (unlock_start <= page_end) 2371 if (unlock_start <= page_end)
2334 unlock_extent(tree, unlock_start, page_end, GFP_NOFS); 2372 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2335 unlock_page(page); 2373 unlock_page(page);
2374
2375update_nr_written:
2376 wbc->nr_to_write -= nr_written;
2377 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2378 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2379 page->mapping->writeback_index = page->index + nr_written;
2336 return 0; 2380 return 0;
2337} 2381}
2338 2382
@@ -2431,7 +2475,7 @@ retry:
2431 unlock_page(page); 2475 unlock_page(page);
2432 ret = 0; 2476 ret = 0;
2433 } 2477 }
2434 if (ret || (--(wbc->nr_to_write) <= 0)) 2478 if (ret || wbc->nr_to_write <= 0)
2435 done = 1; 2479 done = 1;
2436 if (wbc->nonblocking && bdi_write_congested(bdi)) { 2480 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2437 wbc->encountered_congestion = 1; 2481 wbc->encountered_congestion = 1;
@@ -2452,6 +2496,8 @@ retry:
2452 } 2496 }
2453 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2497 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2454 mapping->writeback_index = index; 2498 mapping->writeback_index = index;
2499 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2500 range_whole = 1;
2455 2501
2456 if (wbc->range_cont) 2502 if (wbc->range_cont)
2457 wbc->range_start = index << PAGE_CACHE_SHIFT; 2503 wbc->range_start = index << PAGE_CACHE_SHIFT;
@@ -2469,6 +2515,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2469 .bio = NULL, 2515 .bio = NULL,
2470 .tree = tree, 2516 .tree = tree,
2471 .get_extent = get_extent, 2517 .get_extent = get_extent,
2518 .extent_locked = 0,
2472 }; 2519 };
2473 struct writeback_control wbc_writepages = { 2520 struct writeback_control wbc_writepages = {
2474 .bdi = wbc->bdi, 2521 .bdi = wbc->bdi,
@@ -2491,6 +2538,52 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2491} 2538}
2492EXPORT_SYMBOL(extent_write_full_page); 2539EXPORT_SYMBOL(extent_write_full_page);
2493 2540
2541int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2542 u64 start, u64 end, get_extent_t *get_extent,
2543 int mode)
2544{
2545 int ret = 0;
2546 struct address_space *mapping = inode->i_mapping;
2547 struct page *page;
2548 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2549 PAGE_CACHE_SHIFT;
2550
2551 struct extent_page_data epd = {
2552 .bio = NULL,
2553 .tree = tree,
2554 .get_extent = get_extent,
2555 .extent_locked = 1,
2556 };
2557 struct writeback_control wbc_writepages = {
2558 .bdi = inode->i_mapping->backing_dev_info,
2559 .sync_mode = mode,
2560 .older_than_this = NULL,
2561 .nr_to_write = nr_pages * 2,
2562 .range_start = start,
2563 .range_end = end + 1,
2564 };
2565
2566 while(start <= end) {
2567 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2568 if (clear_page_dirty_for_io(page))
2569 ret = __extent_writepage(page, &wbc_writepages, &epd);
2570 else {
2571 if (tree->ops && tree->ops->writepage_end_io_hook)
2572 tree->ops->writepage_end_io_hook(page, start,
2573 start + PAGE_CACHE_SIZE - 1,
2574 NULL, 1);
2575 unlock_page(page);
2576 }
2577 page_cache_release(page);
2578 start += PAGE_CACHE_SIZE;
2579 }
2580
2581 if (epd.bio)
2582 submit_one_bio(WRITE, epd.bio, 0, 0);
2583 return ret;
2584}
2585EXPORT_SYMBOL(extent_write_locked_range);
2586
2494 2587
2495int extent_writepages(struct extent_io_tree *tree, 2588int extent_writepages(struct extent_io_tree *tree,
2496 struct address_space *mapping, 2589 struct address_space *mapping,
@@ -2502,6 +2595,7 @@ int extent_writepages(struct extent_io_tree *tree,
2502 .bio = NULL, 2595 .bio = NULL,
2503 .tree = tree, 2596 .tree = tree,
2504 .get_extent = get_extent, 2597 .get_extent = get_extent,
2598 .extent_locked = 0,
2505 }; 2599 };
2506 2600
2507 ret = extent_write_cache_pages(tree, mapping, wbc, 2601 ret = extent_write_cache_pages(tree, mapping, wbc,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 283110ec4ee0..2d5f67065b69 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -35,7 +35,8 @@ typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
35 unsigned long bio_flags); 35 unsigned long bio_flags);
36struct extent_io_ops { 36struct extent_io_ops {
37 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 37 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
38 u64 start, u64 end, int *page_started); 38 u64 start, u64 end, int *page_started,
39 unsigned long *nr_written);
39 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 40 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
40 int (*writepage_io_hook)(struct page *page, u64 start, u64 end); 41 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
41 extent_submit_bio_hook_t *submit_bio_hook; 42 extent_submit_bio_hook_t *submit_bio_hook;
@@ -172,6 +173,9 @@ int extent_invalidatepage(struct extent_io_tree *tree,
172int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 173int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
173 get_extent_t *get_extent, 174 get_extent_t *get_extent,
174 struct writeback_control *wbc); 175 struct writeback_control *wbc);
176int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
177 u64 start, u64 end, get_extent_t *get_extent,
178 int mode);
175int extent_writepages(struct extent_io_tree *tree, 179int extent_writepages(struct extent_io_tree *tree,
176 struct address_space *mapping, 180 struct address_space *mapping,
177 get_extent_t *get_extent, 181 get_extent_t *get_extent,
@@ -256,6 +260,9 @@ int extent_range_uptodate(struct extent_io_tree *tree,
256int extent_clear_unlock_delalloc(struct inode *inode, 260int extent_clear_unlock_delalloc(struct inode *inode,
257 struct extent_io_tree *tree, 261 struct extent_io_tree *tree,
258 u64 start, u64 end, struct page *locked_page, 262 u64 start, u64 end, struct page *locked_page,
259 int clear_dirty, int set_writeback, 263 int unlock_page,
260 int clear_writeback); 264 int clear_unlock,
265 int clear_delalloc, int clear_dirty,
266 int set_writeback,
267 int end_writeback);
261#endif 268#endif
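
The reworked extent_clear_unlock_delalloc() prototype replaces the old three flags (clear_dirty, set_writeback, end_writeback) with six: clearing EXTENT_LOCKED and EXTENT_DELALLOC, which the old version always did, is now opt-in via clear_unlock and clear_delalloc, and unlock_pages controls whether the pages themselves are unlocked. The fragment below only illustrates how the clear_bits mask is assembled from the new flags; the EXTENT_* values are placeholders for the demo, not the real btrfs definitions.

/* Userspace illustration of the clear_bits computation in
 * extent_clear_unlock_delalloc() after this patch.  The bit values
 * below are placeholders, not the actual btrfs EXTENT_* flags.
 */
#include <stdio.h>

#define EXTENT_LOCKED   (1 << 0)   /* placeholder value */
#define EXTENT_DIRTY    (1 << 1)   /* placeholder value */
#define EXTENT_DELALLOC (1 << 2)   /* placeholder value */

static int build_clear_bits(int clear_unlock, int clear_delalloc,
			    int clear_dirty)
{
	int clear_bits = 0;

	if (clear_unlock)
		clear_bits |= EXTENT_LOCKED;
	if (clear_dirty)
		clear_bits |= EXTENT_DIRTY;
	if (clear_delalloc)
		clear_bits |= EXTENT_DELALLOC;
	return clear_bits;
}

int main(void)
{
	/* equivalent of the old behaviour with clear_dirty set */
	printf("old default -> 0x%x\n", build_clear_bits(1, 1, 1));
	/* compressed writeback path: lock and dirty, but not delalloc */
	printf("compressed  -> 0x%x\n", build_clear_bits(1, 0, 1));
	return 0;
}

In submit_compressed_extents() in inode.c, for example, the call passes 1, 1, 0, 1, 1, 0: unlock the pages, clear the lock and dirty bits, and start writeback, without touching the delalloc bit again.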
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0c8cc35a8b97..337221ecca27 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -368,6 +368,8 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
368 u64 search_start = start; 368 u64 search_start = start;
369 u64 leaf_start; 369 u64 leaf_start;
370 u64 ram_bytes = 0; 370 u64 ram_bytes = 0;
371 u64 orig_parent = 0;
372 u64 disk_bytenr = 0;
371 u8 compression; 373 u8 compression;
372 u8 encryption; 374 u8 encryption;
373 u16 other_encoding = 0; 375 u16 other_encoding = 0;
@@ -500,17 +502,31 @@ next_slot:
500 keep = 1; 502 keep = 1;
501 } 503 }
502 504
503 if (bookend && found_extent && locked_end < extent_end) { 505 if (bookend && found_extent) {
504 ret = try_lock_extent(&BTRFS_I(inode)->io_tree, 506 if (locked_end < extent_end) {
505 locked_end, extent_end - 1, GFP_NOFS); 507 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
506 if (!ret) { 508 locked_end, extent_end - 1,
507 btrfs_release_path(root, path); 509 GFP_NOFS);
508 lock_extent(&BTRFS_I(inode)->io_tree, 510 if (!ret) {
509 locked_end, extent_end - 1, GFP_NOFS); 511 btrfs_release_path(root, path);
512 lock_extent(&BTRFS_I(inode)->io_tree,
513 locked_end, extent_end - 1,
514 GFP_NOFS);
515 locked_end = extent_end;
516 continue;
517 }
510 locked_end = extent_end; 518 locked_end = extent_end;
511 continue;
512 } 519 }
513 locked_end = extent_end; 520 orig_parent = path->nodes[0]->start;
521 disk_bytenr = le64_to_cpu(old.disk_bytenr);
522 if (disk_bytenr != 0) {
523 ret = btrfs_inc_extent_ref(trans, root,
524 disk_bytenr,
525 le64_to_cpu(old.disk_num_bytes),
526 orig_parent, root->root_key.objectid,
527 trans->transid, inode->i_ino);
528 BUG_ON(ret);
529 }
514 } 530 }
515 531
516 if (found_inline) { 532 if (found_inline) {
@@ -537,8 +553,12 @@ next_slot:
537 inode_sub_bytes(inode, old_num - 553 inode_sub_bytes(inode, old_num -
538 new_num); 554 new_num);
539 } 555 }
540 btrfs_set_file_extent_num_bytes(leaf, extent, 556 if (!compression && !encryption) {
541 new_num); 557 btrfs_set_file_extent_ram_bytes(leaf,
558 extent, new_num);
559 }
560 btrfs_set_file_extent_num_bytes(leaf,
561 extent, new_num);
542 btrfs_mark_buffer_dirty(leaf); 562 btrfs_mark_buffer_dirty(leaf);
543 } else if (key.offset < inline_limit && 563 } else if (key.offset < inline_limit &&
544 (end > extent_end) && 564 (end > extent_end) &&
@@ -582,11 +602,11 @@ next_slot:
582 } 602 }
583 /* create bookend, splitting the extent in two */ 603 /* create bookend, splitting the extent in two */
584 if (bookend && found_extent) { 604 if (bookend && found_extent) {
585 u64 disk_bytenr;
586 struct btrfs_key ins; 605 struct btrfs_key ins;
587 ins.objectid = inode->i_ino; 606 ins.objectid = inode->i_ino;
588 ins.offset = end; 607 ins.offset = end;
589 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); 608 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
609
590 btrfs_release_path(root, path); 610 btrfs_release_path(root, path);
591 ret = btrfs_insert_empty_item(trans, root, path, &ins, 611 ret = btrfs_insert_empty_item(trans, root, path, &ins,
592 sizeof(*extent)); 612 sizeof(*extent));
@@ -623,14 +643,13 @@ next_slot:
623 643
624 btrfs_mark_buffer_dirty(path->nodes[0]); 644 btrfs_mark_buffer_dirty(path->nodes[0]);
625 645
626 disk_bytenr = le64_to_cpu(old.disk_bytenr);
627 if (disk_bytenr != 0) { 646 if (disk_bytenr != 0) {
628 ret = btrfs_inc_extent_ref(trans, root, 647 ret = btrfs_update_extent_ref(trans, root,
629 disk_bytenr, 648 disk_bytenr, orig_parent,
630 le64_to_cpu(old.disk_num_bytes), 649 leaf->start,
631 leaf->start,
632 root->root_key.objectid, 650 root->root_key.objectid,
633 trans->transid, ins.objectid); 651 trans->transid, ins.objectid);
652
634 BUG_ON(ret); 653 BUG_ON(ret);
635 } 654 }
636 btrfs_release_path(root, path); 655 btrfs_release_path(root, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3df0ffad976e..e01c0d0310ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -86,6 +86,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
86 86
87static void btrfs_truncate(struct inode *inode); 87static void btrfs_truncate(struct inode *inode);
88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
89static noinline int cow_file_range(struct inode *inode,
90 struct page *locked_page,
91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock);
89 93
90/* 94/*
91 * a very lame attempt at stopping writes when the FS is 85% full. There 95 * a very lame attempt at stopping writes when the FS is 85% full. There
@@ -262,35 +266,72 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
262 return 0; 266 return 0;
263} 267}
264 268
269struct async_extent {
270 u64 start;
271 u64 ram_size;
272 u64 compressed_size;
273 struct page **pages;
274 unsigned long nr_pages;
275 struct list_head list;
276};
277
278struct async_cow {
279 struct inode *inode;
280 struct btrfs_root *root;
281 struct page *locked_page;
282 u64 start;
283 u64 end;
284 struct list_head extents;
285 struct btrfs_work work;
286};
287
288static noinline int add_async_extent(struct async_cow *cow,
289 u64 start, u64 ram_size,
290 u64 compressed_size,
291 struct page **pages,
292 unsigned long nr_pages)
293{
294 struct async_extent *async_extent;
295
296 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
297 async_extent->start = start;
298 async_extent->ram_size = ram_size;
299 async_extent->compressed_size = compressed_size;
300 async_extent->pages = pages;
301 async_extent->nr_pages = nr_pages;
302 list_add_tail(&async_extent->list, &cow->extents);
303 return 0;
304}
305
265/* 306/*
266 * when extent_io.c finds a delayed allocation range in the file, 307 * we create compressed extents in two phases. The first
267 * the call backs end up in this code. The basic idea is to 308 * phase compresses a range of pages that have already been
268 * allocate extents on disk for the range, and create ordered data structs 309 * locked (both pages and state bits are locked).
269 * in ram to track those extents.
270 * 310 *
271 * locked_page is the page that writepage had locked already. We use 311 * This is done inside an ordered work queue, and the compression
272 * it to make sure we don't do extra locks or unlocks. 312 * is spread across many cpus. The actual IO submission is step
313 * two, and the ordered work queue takes care of making sure that
314 * happens in the same order things were put onto the queue by
315 * writepages and friends.
273 * 316 *
274 * *page_started is set to one if we unlock locked_page and do everything 317 * If this code finds it can't get good compression, it puts an
275 * required to start IO on it. It may be clean and already done with 318 * entry onto the work queue to write the uncompressed bytes. This
276 * IO when we return. 319 * makes sure that both compressed inodes and uncompressed inodes
320 * are written in the same order that pdflush sent them down.
277 */ 321 */
278static int cow_file_range(struct inode *inode, struct page *locked_page, 322static noinline int compress_file_range(struct inode *inode,
279 u64 start, u64 end, int *page_started) 323 struct page *locked_page,
324 u64 start, u64 end,
325 struct async_cow *async_cow,
326 int *num_added)
280{ 327{
281 struct btrfs_root *root = BTRFS_I(inode)->root; 328 struct btrfs_root *root = BTRFS_I(inode)->root;
282 struct btrfs_trans_handle *trans; 329 struct btrfs_trans_handle *trans;
283 u64 alloc_hint = 0;
284 u64 num_bytes; 330 u64 num_bytes;
285 unsigned long ram_size;
286 u64 orig_start; 331 u64 orig_start;
287 u64 disk_num_bytes; 332 u64 disk_num_bytes;
288 u64 cur_alloc_size;
289 u64 blocksize = root->sectorsize; 333 u64 blocksize = root->sectorsize;
290 u64 actual_end; 334 u64 actual_end;
291 struct btrfs_key ins;
292 struct extent_map *em;
293 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
294 int ret = 0; 335 int ret = 0;
295 struct page **pages = NULL; 336 struct page **pages = NULL;
296 unsigned long nr_pages; 337 unsigned long nr_pages;
@@ -298,22 +339,12 @@ static int cow_file_range(struct inode *inode, struct page *locked_page,
298 unsigned long total_compressed = 0; 339 unsigned long total_compressed = 0;
299 unsigned long total_in = 0; 340 unsigned long total_in = 0;
300 unsigned long max_compressed = 128 * 1024; 341 unsigned long max_compressed = 128 * 1024;
301 unsigned long max_uncompressed = 256 * 1024; 342 unsigned long max_uncompressed = 128 * 1024;
302 int i; 343 int i;
303 int ordered_type;
304 int will_compress; 344 int will_compress;
305 345
306 trans = btrfs_join_transaction(root, 1);
307 BUG_ON(!trans);
308 btrfs_set_trans_block_group(trans, inode);
309 orig_start = start; 346 orig_start = start;
310 347
311 /*
312 * compression made this loop a bit ugly, but the basic idea is to
313 * compress some pages but keep the total size of the compressed
314 * extent relatively small. If compression is off, this goto target
315 * is never used.
316 */
317again: 348again:
318 will_compress = 0; 349 will_compress = 0;
319 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
@@ -324,7 +355,13 @@ again:
324 355
325 /* we want to make sure that amount of ram required to uncompress 356 /* we want to make sure that amount of ram required to uncompress
326 * an extent is reasonable, so we limit the total size in ram 357 * an extent is reasonable, so we limit the total size in ram
327 * of a compressed extent to 256k 358 * of a compressed extent to 128k. This is a crucial number
359 * because it also controls how easily we can spread reads across
360 * cpus for decompression.
361 *
362 * We also want to make sure the amount of IO required to do
363 * a random read is reasonably small, so we limit the size of
364 * a compressed extent to 128k.
328 */ 365 */
329 total_compressed = min(total_compressed, max_uncompressed); 366 total_compressed = min(total_compressed, max_uncompressed);
330 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 367 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -333,18 +370,16 @@ again:
333 total_in = 0; 370 total_in = 0;
334 ret = 0; 371 ret = 0;
335 372
336 /* we do compression for mount -o compress and when the 373 /*
337 * inode has not been flagged as nocompress 374 * we do compression for mount -o compress and when the
375 * inode has not been flagged as nocompress. This flag can
376 * change at any time if we discover bad compression ratios.
338 */ 377 */
339 if (!btrfs_test_flag(inode, NOCOMPRESS) && 378 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
340 btrfs_test_opt(root, COMPRESS)) { 379 btrfs_test_opt(root, COMPRESS)) {
341 WARN_ON(pages); 380 WARN_ON(pages);
342 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 381 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
343 382
344 /* we want to make sure the amount of IO required to satisfy
345 * a random read is reasonably small, so we limit the size
346 * of a compressed extent to 128k
347 */
348 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 383 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
349 total_compressed, pages, 384 total_compressed, pages,
350 nr_pages, &nr_pages_ret, 385 nr_pages, &nr_pages_ret,
@@ -371,26 +406,34 @@ again:
371 } 406 }
372 } 407 }
373 if (start == 0) { 408 if (start == 0) {
409 trans = btrfs_join_transaction(root, 1);
410 BUG_ON(!trans);
411 btrfs_set_trans_block_group(trans, inode);
412
374 /* lets try to make an inline extent */ 413 /* lets try to make an inline extent */
375 if (ret || total_in < (end - start + 1)) { 414 if (ret || total_in < (actual_end - start)) {
376 /* we didn't compress the entire range, try 415 /* we didn't compress the entire range, try
377 * to make an uncompressed inline extent. This 416 * to make an uncompressed inline extent.
378 * is almost sure to fail, but maybe inline sizes
379 * will get bigger later
380 */ 417 */
381 ret = cow_file_range_inline(trans, root, inode, 418 ret = cow_file_range_inline(trans, root, inode,
382 start, end, 0, NULL); 419 start, end, 0, NULL);
383 } else { 420 } else {
421 /* try making a compressed inline extent */
384 ret = cow_file_range_inline(trans, root, inode, 422 ret = cow_file_range_inline(trans, root, inode,
385 start, end, 423 start, end,
386 total_compressed, pages); 424 total_compressed, pages);
387 } 425 }
426 btrfs_end_transaction(trans, root);
388 if (ret == 0) { 427 if (ret == 0) {
428 /*
429 * inline extent creation worked, we don't need
430 * to create any more async work items. Unlock
431 * and free up our temp pages.
432 */
389 extent_clear_unlock_delalloc(inode, 433 extent_clear_unlock_delalloc(inode,
390 &BTRFS_I(inode)->io_tree, 434 &BTRFS_I(inode)->io_tree,
391 start, end, NULL, 435 start, end, NULL, 1, 0,
392 1, 1, 1); 436 0, 1, 1, 1);
393 *page_started = 1;
394 ret = 0; 437 ret = 0;
395 goto free_pages_out; 438 goto free_pages_out;
396 } 439 }
@@ -435,53 +478,280 @@ again:
435 /* flag the file so we don't compress in the future */ 478 /* flag the file so we don't compress in the future */
436 btrfs_set_flag(inode, NOCOMPRESS); 479 btrfs_set_flag(inode, NOCOMPRESS);
437 } 480 }
481 if (will_compress) {
482 *num_added += 1;
438 483
439 BUG_ON(disk_num_bytes > 484 /* the async work queues will take care of doing actual
440 btrfs_super_total_bytes(&root->fs_info->super_copy)); 485 * allocation on disk for these compressed pages,
486 * and will submit them to the elevator.
487 */
488 add_async_extent(async_cow, start, num_bytes,
489 total_compressed, pages, nr_pages_ret);
441 490
442 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 491 if (start + num_bytes < end) {
492 start += num_bytes;
493 pages = NULL;
494 cond_resched();
495 goto again;
496 }
497 } else {
498 /*
499 * No compression, but we still need to write the pages in
500 * the file we've been given so far. redirty the locked
501 * page if it corresponds to our extent and set things up
502 * for the async work queue to run cow_file_range to do
503 * the normal delalloc dance
504 */
505 if (page_offset(locked_page) >= start &&
506 page_offset(locked_page) <= end) {
507 __set_page_dirty_nobuffers(locked_page);
508 /* unlocked later on in the async handlers */
509 }
510 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
511 *num_added += 1;
512 }
443 513
444 while(disk_num_bytes > 0) { 514out:
445 unsigned long min_bytes; 515 return 0;
516
517free_pages_out:
518 for (i = 0; i < nr_pages_ret; i++) {
519 WARN_ON(pages[i]->mapping);
520 page_cache_release(pages[i]);
521 }
522 if (pages)
523 kfree(pages);
524
525 goto out;
526}
527
528/*
529 * phase two of compressed writeback. This is the ordered portion
530 * of the code, which only gets called in the order the work was
531 * queued. We walk all the async extents created by compress_file_range
532 * and send them down to the disk.
533 */
534static noinline int submit_compressed_extents(struct inode *inode,
535 struct async_cow *async_cow)
536{
537 struct async_extent *async_extent;
538 u64 alloc_hint = 0;
539 struct btrfs_trans_handle *trans;
540 struct btrfs_key ins;
541 struct extent_map *em;
542 struct btrfs_root *root = BTRFS_I(inode)->root;
543 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
544 struct extent_io_tree *io_tree;
545 int ret;
546
547 if (list_empty(&async_cow->extents))
548 return 0;
549
550 trans = btrfs_join_transaction(root, 1);
551
552 while(!list_empty(&async_cow->extents)) {
553 async_extent = list_entry(async_cow->extents.next,
554 struct async_extent, list);
555 list_del(&async_extent->list);
446 556
557 io_tree = &BTRFS_I(inode)->io_tree;
558
559 /* did the compression code fall back to uncompressed IO? */
560 if (!async_extent->pages) {
561 int page_started = 0;
562 unsigned long nr_written = 0;
563
564 lock_extent(io_tree, async_extent->start,
565 async_extent->start + async_extent->ram_size - 1,
566 GFP_NOFS);
567
568 /* allocate blocks */
569 cow_file_range(inode, async_cow->locked_page,
570 async_extent->start,
571 async_extent->start +
572 async_extent->ram_size - 1,
573 &page_started, &nr_written, 0);
574
575 /*
576 * if page_started, cow_file_range inserted an
577 * inline extent and took care of all the unlocking
578 * and IO for us. Otherwise, we need to submit
579 * all those pages down to the drive.
580 */
581 if (!page_started)
582 extent_write_locked_range(io_tree,
583 inode, async_extent->start,
584 async_extent->start +
585 async_extent->ram_size - 1,
586 btrfs_get_extent,
587 WB_SYNC_ALL);
588 kfree(async_extent);
589 cond_resched();
590 continue;
591 }
592
593 lock_extent(io_tree, async_extent->start,
594 async_extent->start + async_extent->ram_size - 1,
595 GFP_NOFS);
447 /* 596 /*
448 * the max size of a compressed extent is pretty small, 597 * here we're doing allocation and writeback of the
449 * make the code a little less complex by forcing 598 * compressed pages
450 * the allocator to find a whole compressed extent at once
451 */ 599 */
452 if (will_compress) 600 btrfs_drop_extent_cache(inode, async_extent->start,
453 min_bytes = disk_num_bytes; 601 async_extent->start +
454 else 602 async_extent->ram_size - 1, 0);
455 min_bytes = root->sectorsize; 603
604 ret = btrfs_reserve_extent(trans, root,
605 async_extent->compressed_size,
606 async_extent->compressed_size,
607 0, alloc_hint,
608 (u64)-1, &ins, 1);
609 BUG_ON(ret);
610 em = alloc_extent_map(GFP_NOFS);
611 em->start = async_extent->start;
612 em->len = async_extent->ram_size;
613
614 em->block_start = ins.objectid;
615 em->block_len = ins.offset;
616 em->bdev = root->fs_info->fs_devices->latest_bdev;
617 set_bit(EXTENT_FLAG_PINNED, &em->flags);
618 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
619
620 while(1) {
621 spin_lock(&em_tree->lock);
622 ret = add_extent_mapping(em_tree, em);
623 spin_unlock(&em_tree->lock);
624 if (ret != -EEXIST) {
625 free_extent_map(em);
626 break;
627 }
628 btrfs_drop_extent_cache(inode, async_extent->start,
629 async_extent->start +
630 async_extent->ram_size - 1, 0);
631 }
632
633 ret = btrfs_add_ordered_extent(inode, async_extent->start,
634 ins.objectid,
635 async_extent->ram_size,
636 ins.offset,
637 BTRFS_ORDERED_COMPRESSED);
638 BUG_ON(ret);
639
640 btrfs_end_transaction(trans, root);
641
642 /*
643 * clear dirty, set writeback and unlock the pages.
644 */
645 extent_clear_unlock_delalloc(inode,
646 &BTRFS_I(inode)->io_tree,
647 async_extent->start,
648 async_extent->start +
649 async_extent->ram_size - 1,
650 NULL, 1, 1, 0, 1, 1, 0);
651
652 ret = btrfs_submit_compressed_write(inode,
653 async_extent->start,
654 async_extent->ram_size,
655 ins.objectid,
656 ins.offset, async_extent->pages,
657 async_extent->nr_pages);
658
659 BUG_ON(ret);
660 trans = btrfs_join_transaction(root, 1);
661 alloc_hint = ins.objectid + ins.offset;
662 kfree(async_extent);
663 cond_resched();
664 }
665
666 btrfs_end_transaction(trans, root);
667 return 0;
668}
669
670/*
671 * when extent_io.c finds a delayed allocation range in the file,
672 * the call backs end up in this code. The basic idea is to
673 * allocate extents on disk for the range, and create ordered data structs
674 * in ram to track those extents.
675 *
676 * locked_page is the page that writepage had locked already. We use
677 * it to make sure we don't do extra locks or unlocks.
678 *
679 * *page_started is set to one if we unlock locked_page and do everything
680 * required to start IO on it. It may be clean and already done with
681 * IO when we return.
682 */
683static noinline int cow_file_range(struct inode *inode,
684 struct page *locked_page,
685 u64 start, u64 end, int *page_started,
686 unsigned long *nr_written,
687 int unlock)
688{
689 struct btrfs_root *root = BTRFS_I(inode)->root;
690 struct btrfs_trans_handle *trans;
691 u64 alloc_hint = 0;
692 u64 num_bytes;
693 unsigned long ram_size;
694 u64 disk_num_bytes;
695 u64 cur_alloc_size;
696 u64 blocksize = root->sectorsize;
697 u64 actual_end;
698 struct btrfs_key ins;
699 struct extent_map *em;
700 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
701 int ret = 0;
702
703 trans = btrfs_join_transaction(root, 1);
704 BUG_ON(!trans);
705 btrfs_set_trans_block_group(trans, inode);
456 706
707 actual_end = min_t(u64, i_size_read(inode), end + 1);
708
709 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
710 num_bytes = max(blocksize, num_bytes);
711 disk_num_bytes = num_bytes;
712 ret = 0;
713
714 if (start == 0) {
715 /* lets try to make an inline extent */
716 ret = cow_file_range_inline(trans, root, inode,
717 start, end, 0, NULL);
718 if (ret == 0) {
719 extent_clear_unlock_delalloc(inode,
720 &BTRFS_I(inode)->io_tree,
721 start, end, NULL, 1, 1,
722 1, 1, 1, 1);
723 *nr_written = *nr_written +
724 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
725 *page_started = 1;
726 ret = 0;
727 goto out;
728 }
729 }
730
731 BUG_ON(disk_num_bytes >
732 btrfs_super_total_bytes(&root->fs_info->super_copy));
733
734 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
735
736 while(disk_num_bytes > 0) {
457 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 737 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
458 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 738 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
459 min_bytes, 0, alloc_hint, 739 root->sectorsize, 0, alloc_hint,
460 (u64)-1, &ins, 1); 740 (u64)-1, &ins, 1);
461 if (ret) { 741 if (ret) {
462 WARN_ON(1); 742 BUG();
463 goto free_pages_out_fail;
464 } 743 }
465 em = alloc_extent_map(GFP_NOFS); 744 em = alloc_extent_map(GFP_NOFS);
466 em->start = start; 745 em->start = start;
467 746
468 if (will_compress) { 747 ram_size = ins.offset;
469 ram_size = num_bytes; 748 em->len = ins.offset;
470 em->len = num_bytes;
471 } else {
472 /* ramsize == disk size */
473 ram_size = ins.offset;
474 em->len = ins.offset;
475 }
476 749
477 em->block_start = ins.objectid; 750 em->block_start = ins.objectid;
478 em->block_len = ins.offset; 751 em->block_len = ins.offset;
479 em->bdev = root->fs_info->fs_devices->latest_bdev; 752 em->bdev = root->fs_info->fs_devices->latest_bdev;
480 set_bit(EXTENT_FLAG_PINNED, &em->flags); 753 set_bit(EXTENT_FLAG_PINNED, &em->flags);
481 754
482 if (will_compress)
483 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
484
485 while(1) { 755 while(1) {
486 spin_lock(&em_tree->lock); 756 spin_lock(&em_tree->lock);
487 ret = add_extent_mapping(em_tree, em); 757 ret = add_extent_mapping(em_tree, em);
@@ -495,10 +765,8 @@ again:
495 } 765 }
496 766
497 cur_alloc_size = ins.offset; 767 cur_alloc_size = ins.offset;
498 ordered_type = will_compress ? BTRFS_ORDERED_COMPRESSED : 0;
499 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 768 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
500 ram_size, cur_alloc_size, 769 ram_size, cur_alloc_size, 0);
501 ordered_type);
502 BUG_ON(ret); 770 BUG_ON(ret);
503 771
504 if (disk_num_bytes < cur_alloc_size) { 772 if (disk_num_bytes < cur_alloc_size) {
@@ -506,82 +774,145 @@ again:
506 cur_alloc_size); 774 cur_alloc_size);
507 break; 775 break;
508 } 776 }
509
510 if (will_compress) {
511 /*
512 * we're doing compression, we and we need to
513 * submit the compressed extents down to the device.
514 *
515 * We lock down all the file pages, clearing their
516 * dirty bits and setting them writeback. Everyone
517 * that wants to modify the page will wait on the
518 * ordered extent above.
519 *
520 * The writeback bits on the file pages are
521 * cleared when the compressed pages are on disk
522 */
523 btrfs_end_transaction(trans, root);
524
525 if (start <= page_offset(locked_page) &&
526 page_offset(locked_page) < start + ram_size) {
527 *page_started = 1;
528 }
529
530 extent_clear_unlock_delalloc(inode,
531 &BTRFS_I(inode)->io_tree,
532 start,
533 start + ram_size - 1,
534 NULL, 1, 1, 0);
535
536 ret = btrfs_submit_compressed_write(inode, start,
537 ram_size, ins.objectid,
538 cur_alloc_size, pages,
539 nr_pages_ret);
540
541 BUG_ON(ret);
542 trans = btrfs_join_transaction(root, 1);
543 if (start + ram_size < end) {
544 start += ram_size;
545 alloc_hint = ins.objectid + ins.offset;
546 /* pages will be freed at end_bio time */
547 pages = NULL;
548 goto again;
549 } else {
550 /* we've written everything, time to go */
551 break;
552 }
553 }
554 /* we're not doing compressed IO, don't unlock the first 777 /* we're not doing compressed IO, don't unlock the first
555 * page (which the caller expects to stay locked), don't 778 * page (which the caller expects to stay locked), don't
556 * clear any dirty bits and don't set any writeback bits 779 * clear any dirty bits and don't set any writeback bits
557 */ 780 */
558 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 781 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
559 start, start + ram_size - 1, 782 start, start + ram_size - 1,
560 locked_page, 0, 0, 0); 783 locked_page, unlock, 1,
784 1, 0, 0, 0);
561 disk_num_bytes -= cur_alloc_size; 785 disk_num_bytes -= cur_alloc_size;
562 num_bytes -= cur_alloc_size; 786 num_bytes -= cur_alloc_size;
563 alloc_hint = ins.objectid + ins.offset; 787 alloc_hint = ins.objectid + ins.offset;
564 start += cur_alloc_size; 788 start += cur_alloc_size;
565 } 789 }
566
567 ret = 0;
568out: 790out:
791 ret = 0;
569 btrfs_end_transaction(trans, root); 792 btrfs_end_transaction(trans, root);
570 793
571 return ret; 794 return ret;
795}
572 796
573free_pages_out_fail: 797/*
574 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 798 * work queue call back to started compression on a file and pages
575 start, end, locked_page, 0, 0, 0); 799 */
576free_pages_out: 800static noinline void async_cow_start(struct btrfs_work *work)
577 for (i = 0; i < nr_pages_ret; i++) { 801{
578 WARN_ON(pages[i]->mapping); 802 struct async_cow *async_cow;
579 page_cache_release(pages[i]); 803 int num_added = 0;
804 async_cow = container_of(work, struct async_cow, work);
805
806 compress_file_range(async_cow->inode, async_cow->locked_page,
807 async_cow->start, async_cow->end, async_cow,
808 &num_added);
809 if (num_added == 0)
810 async_cow->inode = NULL;
811}
812
813/*
814 * work queue call back to submit previously compressed pages
815 */
816static noinline void async_cow_submit(struct btrfs_work *work)
817{
818 struct async_cow *async_cow;
819 struct btrfs_root *root;
820 unsigned long nr_pages;
821
822 async_cow = container_of(work, struct async_cow, work);
823
824 root = async_cow->root;
825 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
826 PAGE_CACHE_SHIFT;
827
828 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
829
830 if (atomic_read(&root->fs_info->async_delalloc_pages) <
831 5 * 1024 * 1024 &&
832 waitqueue_active(&root->fs_info->async_submit_wait))
833 wake_up(&root->fs_info->async_submit_wait);
834
835 if (async_cow->inode) {
836 submit_compressed_extents(async_cow->inode, async_cow);
580 } 837 }
581 if (pages) 838}
582 kfree(pages);
583 839
584 goto out; 840static noinline void async_cow_free(struct btrfs_work *work)
841{
842 struct async_cow *async_cow;
843 async_cow = container_of(work, struct async_cow, work);
844 kfree(async_cow);
845}
846
847static int cow_file_range_async(struct inode *inode, struct page *locked_page,
848 u64 start, u64 end, int *page_started,
849 unsigned long *nr_written)
850{
851 struct async_cow *async_cow;
852 struct btrfs_root *root = BTRFS_I(inode)->root;
853 unsigned long nr_pages;
854 u64 cur_end;
855 int limit = 10 * 1024 * 1024;
856
857 if (!btrfs_test_opt(root, COMPRESS)) {
858 return cow_file_range(inode, locked_page, start, end,
859 page_started, nr_written, 1);
860 }
861
862 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
863 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
864 while(start < end) {
865 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
866 async_cow->inode = inode;
867 async_cow->root = root;
868 async_cow->locked_page = locked_page;
869 async_cow->start = start;
870
871 if (btrfs_test_flag(inode, NOCOMPRESS))
872 cur_end = end;
873 else
874 cur_end = min(end, start + 512 * 1024 - 1);
875
876 async_cow->end = cur_end;
877 INIT_LIST_HEAD(&async_cow->extents);
878
879 async_cow->work.func = async_cow_start;
880 async_cow->work.ordered_func = async_cow_submit;
881 async_cow->work.ordered_free = async_cow_free;
882 async_cow->work.flags = 0;
883
884 while(atomic_read(&root->fs_info->async_submit_draining) &&
885 atomic_read(&root->fs_info->async_delalloc_pages)) {
886 wait_event(root->fs_info->async_submit_wait,
887 (atomic_read(&root->fs_info->async_delalloc_pages)
888 == 0));
889 }
890
891 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
892 PAGE_CACHE_SHIFT;
893 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
894
895 btrfs_queue_worker(&root->fs_info->delalloc_workers,
896 &async_cow->work);
897
898 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
899 wait_event(root->fs_info->async_submit_wait,
900 (atomic_read(&root->fs_info->async_delalloc_pages) <
901 limit));
902 }
903
904 while(atomic_read(&root->fs_info->async_submit_draining) &&
905 atomic_read(&root->fs_info->async_delalloc_pages)) {
906 wait_event(root->fs_info->async_submit_wait,
907 (atomic_read(&root->fs_info->async_delalloc_pages) ==
908 0));
909 }
910
911 *nr_written += nr_pages;
912 start = cur_end + 1;
913 }
914 *page_started = 1;
915 return 0;
585} 916}
586 917
587/* 918/*
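The new cow_file_range_async() above cuts each delalloc range into pieces of at most 512K (or the whole range when the inode is flagged NOCOMPRESS), queues one async_cow work item per piece, and throttles the submitter once async_delalloc_pages climbs past the limit. Below is a minimal userspace sketch of just the chunking arithmetic; the SKETCH_* macros and queue_chunk() are invented for the example and only stand in for btrfs_queue_worker().

/* build: cc chunk_sketch.c */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_PAGE_SHIFT 12
#define SKETCH_PAGE_SIZE  (1UL << SKETCH_PAGE_SHIFT)
#define SKETCH_CHUNK      (512 * 1024UL)

static void queue_chunk(uint64_t start, uint64_t end)
{
	/* stand-in for queueing one async_cow work item */
	printf("chunk [%llu, %llu], %llu pages\n",
	       (unsigned long long)start, (unsigned long long)end,
	       (unsigned long long)((end - start + SKETCH_PAGE_SIZE) >>
				    SKETCH_PAGE_SHIFT));
}

int main(void)
{
	uint64_t start = 0, end = 3 * SKETCH_CHUNK / 2 - 1;	/* 768K range */
	unsigned long nr_written = 0;

	while (start < end) {
		uint64_t cur_end = end;

		/* same bound as the patch: min(end, start + 512K - 1) */
		if (cur_end > start + SKETCH_CHUNK - 1)
			cur_end = start + SKETCH_CHUNK - 1;
		queue_chunk(start, cur_end);
		nr_written += (cur_end - start + SKETCH_PAGE_SIZE) >>
			      SKETCH_PAGE_SHIFT;
		start = cur_end + 1;
	}
	printf("nr_written = %lu pages\n", nr_written);
	return 0;
}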
@@ -592,7 +923,8 @@ free_pages_out:
592 * blocks on disk 923 * blocks on disk
593 */ 924 */
594static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, 925static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
595 u64 start, u64 end, int *page_started, int force) 926 u64 start, u64 end, int *page_started, int force,
927 unsigned long *nr_written)
596{ 928{
597 struct btrfs_root *root = BTRFS_I(inode)->root; 929 struct btrfs_root *root = BTRFS_I(inode)->root;
598 struct btrfs_trans_handle *trans; 930 struct btrfs_trans_handle *trans;
@@ -711,7 +1043,8 @@ out_check:
711 btrfs_release_path(root, path); 1043 btrfs_release_path(root, path);
712 if (cow_start != (u64)-1) { 1044 if (cow_start != (u64)-1) {
713 ret = cow_file_range(inode, locked_page, cow_start, 1045 ret = cow_file_range(inode, locked_page, cow_start,
714 found_key.offset - 1, page_started); 1046 found_key.offset - 1, page_started,
1047 nr_written, 1);
715 BUG_ON(ret); 1048 BUG_ON(ret);
716 cow_start = (u64)-1; 1049 cow_start = (u64)-1;
717 } 1050 }
@@ -748,9 +1081,10 @@ out_check:
748 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, 1081 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
749 num_bytes, num_bytes, type); 1082 num_bytes, num_bytes, type);
750 BUG_ON(ret); 1083 BUG_ON(ret);
1084
751 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1085 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
752 cur_offset, cur_offset + num_bytes - 1, 1086 cur_offset, cur_offset + num_bytes - 1,
753 locked_page, 0, 0, 0); 1087 locked_page, 1, 1, 1, 0, 0, 0);
754 cur_offset = extent_end; 1088 cur_offset = extent_end;
755 if (cur_offset > end) 1089 if (cur_offset > end)
756 break; 1090 break;
@@ -761,7 +1095,7 @@ out_check:
761 cow_start = cur_offset; 1095 cow_start = cur_offset;
762 if (cow_start != (u64)-1) { 1096 if (cow_start != (u64)-1) {
763 ret = cow_file_range(inode, locked_page, cow_start, end, 1097 ret = cow_file_range(inode, locked_page, cow_start, end,
764 page_started); 1098 page_started, nr_written, 1);
765 BUG_ON(ret); 1099 BUG_ON(ret);
766 } 1100 }
767 1101
@@ -775,7 +1109,8 @@ out_check:
775 * extent_io.c call back to do delayed allocation processing 1109 * extent_io.c call back to do delayed allocation processing
776 */ 1110 */
777static int run_delalloc_range(struct inode *inode, struct page *locked_page, 1111static int run_delalloc_range(struct inode *inode, struct page *locked_page,
778 u64 start, u64 end, int *page_started) 1112 u64 start, u64 end, int *page_started,
1113 unsigned long *nr_written)
779{ 1114{
780 struct btrfs_root *root = BTRFS_I(inode)->root; 1115 struct btrfs_root *root = BTRFS_I(inode)->root;
781 int ret; 1116 int ret;
@@ -783,13 +1118,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
783 if (btrfs_test_opt(root, NODATACOW) || 1118 if (btrfs_test_opt(root, NODATACOW) ||
784 btrfs_test_flag(inode, NODATACOW)) 1119 btrfs_test_flag(inode, NODATACOW))
785 ret = run_delalloc_nocow(inode, locked_page, start, end, 1120 ret = run_delalloc_nocow(inode, locked_page, start, end,
786 page_started, 0); 1121 page_started, 0, nr_written);
787 else if (btrfs_test_flag(inode, PREALLOC)) 1122 else if (btrfs_test_flag(inode, PREALLOC))
788 ret = run_delalloc_nocow(inode, locked_page, start, end, 1123 ret = run_delalloc_nocow(inode, locked_page, start, end,
789 page_started, 1); 1124 page_started, 1, nr_written);
790 else 1125 else
791 ret = cow_file_range(inode, locked_page, start, end, 1126 ret = cow_file_range_async(inode, locked_page, start, end,
792 page_started); 1127 page_started, nr_written);
793 1128
794 return ret; 1129 return ret;
795} 1130}
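run_delalloc_range() now threads nr_written back to the writepage caller and, for ordinary inodes, hands the range to the async path instead of calling cow_file_range() directly. The sketch below only mirrors that three-way dispatch; the flag names and helper are made up for illustration.

/* build: cc dispatch_sketch.c */
#include <stdio.h>

enum { INODE_NODATACOW = 1 << 0, INODE_PREALLOC = 1 << 1 };

static const char *pick_delalloc_path(unsigned int flags, int mount_nodatacow)
{
	if (mount_nodatacow || (flags & INODE_NODATACOW))
		return "run_delalloc_nocow(force=0)";
	if (flags & INODE_PREALLOC)
		return "run_delalloc_nocow(force=1)";
	return "cow_file_range_async()";
}

int main(void)
{
	printf("%s\n", pick_delalloc_path(0, 0));
	printf("%s\n", pick_delalloc_path(INODE_PREALLOC, 0));
	printf("%s\n", pick_delalloc_path(INODE_NODATACOW, 0));
	return 0;
}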
@@ -861,6 +1196,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
861 u64 map_length; 1196 u64 map_length;
862 int ret; 1197 int ret;
863 1198
1199 if (bio_flags & EXTENT_BIO_COMPRESSED)
1200 return 0;
1201
864 length = bio->bi_size; 1202 length = bio->bi_size;
865 map_tree = &root->fs_info->mapping_tree; 1203 map_tree = &root->fs_info->mapping_tree;
866 map_length = length; 1204 map_length = length;
@@ -925,12 +1263,12 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
925 btrfs_test_flag(inode, NODATASUM); 1263 btrfs_test_flag(inode, NODATASUM);
926 1264
927 if (!(rw & (1 << BIO_RW))) { 1265 if (!(rw & (1 << BIO_RW))) {
928 if (!skip_sum)
929 btrfs_lookup_bio_sums(root, inode, bio);
930 1266
931 if (bio_flags & EXTENT_BIO_COMPRESSED) 1267 if (bio_flags & EXTENT_BIO_COMPRESSED)
932 return btrfs_submit_compressed_read(inode, bio, 1268 return btrfs_submit_compressed_read(inode, bio,
933 mirror_num, bio_flags); 1269 mirror_num, bio_flags);
1270 else if (!skip_sum)
1271 btrfs_lookup_bio_sums(root, inode, bio);
934 goto mapit; 1272 goto mapit;
935 } else if (!skip_sum) { 1273 } else if (!skip_sum) {
936 /* we're doing a write, do the async checksumming */ 1274 /* we're doing a write, do the async checksumming */
@@ -966,6 +1304,9 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
966 1304
967int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) 1305int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
968{ 1306{
1307 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) {
1308 WARN_ON(1);
1309 }
969 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1310 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
970 GFP_NOFS); 1311 GFP_NOFS);
971} 1312}
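The new WARN_ON in btrfs_set_extent_delalloc() flags a delalloc range whose inclusive end is exactly page aligned, since such a range would stop on the first byte of a page instead of its last byte. A tiny check with the same condition, assuming 4K pages (PAGE_SIZE_SKETCH is a made-up constant):

/* build: cc end_check.c */
#include <assert.h>

#define PAGE_SIZE_SKETCH 4096ULL

/* mirrors the warn condition: end is inclusive, so a well formed range
 * ends on the last byte of a page, never on an exact page boundary */
static int delalloc_end_suspicious(unsigned long long end)
{
	return (end & (PAGE_SIZE_SKETCH - 1)) == 0;
}

int main(void)
{
	assert(!delalloc_end_suspicious(4095));	/* last byte of page 0: fine */
	assert(delalloc_end_suspicious(4096));	/* page aligned end: warns */
	assert(delalloc_end_suspicious(8192));
	return 0;
}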
@@ -2105,6 +2446,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2105 int pending_del_nr = 0; 2446 int pending_del_nr = 0;
2106 int pending_del_slot = 0; 2447 int pending_del_slot = 0;
2107 int extent_type = -1; 2448 int extent_type = -1;
2449 int encoding;
2108 u64 mask = root->sectorsize - 1; 2450 u64 mask = root->sectorsize - 1;
2109 2451
2110 if (root->ref_cows) 2452 if (root->ref_cows)
@@ -2144,6 +2486,7 @@ search_again:
2144 leaf = path->nodes[0]; 2486 leaf = path->nodes[0];
2145 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2487 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2146 found_type = btrfs_key_type(&found_key); 2488 found_type = btrfs_key_type(&found_key);
2489 encoding = 0;
2147 2490
2148 if (found_key.objectid != inode->i_ino) 2491 if (found_key.objectid != inode->i_ino)
2149 break; 2492 break;
@@ -2156,6 +2499,10 @@ search_again:
2156 fi = btrfs_item_ptr(leaf, path->slots[0], 2499 fi = btrfs_item_ptr(leaf, path->slots[0],
2157 struct btrfs_file_extent_item); 2500 struct btrfs_file_extent_item);
2158 extent_type = btrfs_file_extent_type(leaf, fi); 2501 extent_type = btrfs_file_extent_type(leaf, fi);
2502 encoding = btrfs_file_extent_compression(leaf, fi);
2503 encoding |= btrfs_file_extent_encryption(leaf, fi);
2504 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2505
2159 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 2506 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2160 item_end += 2507 item_end +=
2161 btrfs_file_extent_num_bytes(leaf, fi); 2508 btrfs_file_extent_num_bytes(leaf, fi);
@@ -2200,7 +2547,7 @@ search_again:
2200 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 2547 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2201 u64 num_dec; 2548 u64 num_dec;
2202 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 2549 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2203 if (!del_item) { 2550 if (!del_item && !encoding) {
2204 u64 orig_num_bytes = 2551 u64 orig_num_bytes =
2205 btrfs_file_extent_num_bytes(leaf, fi); 2552 btrfs_file_extent_num_bytes(leaf, fi);
2206 extent_num_bytes = new_size - 2553 extent_num_bytes = new_size -
@@ -2436,7 +2783,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2436 last_byte = min(extent_map_end(em), block_end); 2783 last_byte = min(extent_map_end(em), block_end);
2437 last_byte = (last_byte + mask) & ~mask; 2784 last_byte = (last_byte + mask) & ~mask;
2438 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { 2785 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2786 u64 hint_byte = 0;
2439 hole_size = last_byte - cur_offset; 2787 hole_size = last_byte - cur_offset;
2788 err = btrfs_drop_extents(trans, root, inode,
2789 cur_offset,
2790 cur_offset + hole_size,
2791 cur_offset, &hint_byte);
2792 if (err)
2793 break;
2440 err = btrfs_insert_file_extent(trans, root, 2794 err = btrfs_insert_file_extent(trans, root,
2441 inode->i_ino, cur_offset, 0, 2795 inode->i_ino, cur_offset, 0,
2442 0, hole_size, 0, hole_size, 2796 0, hole_size, 0, hole_size,
@@ -3785,6 +4139,7 @@ int btrfs_writepages(struct address_space *mapping,
3785 struct writeback_control *wbc) 4139 struct writeback_control *wbc)
3786{ 4140{
3787 struct extent_io_tree *tree; 4141 struct extent_io_tree *tree;
4142
3788 tree = &BTRFS_I(mapping->host)->io_tree; 4143 tree = &BTRFS_I(mapping->host)->io_tree;
3789 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 4144 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
3790} 4145}
@@ -4285,9 +4640,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4285 * ordered extents get created before we return 4640 * ordered extents get created before we return
4286 */ 4641 */
4287 atomic_inc(&root->fs_info->async_submit_draining); 4642 atomic_inc(&root->fs_info->async_submit_draining);
4288 while(atomic_read(&root->fs_info->nr_async_submits)) { 4643 while(atomic_read(&root->fs_info->nr_async_submits) ||
4644 atomic_read(&root->fs_info->async_delalloc_pages)) {
4289 wait_event(root->fs_info->async_submit_wait, 4645 wait_event(root->fs_info->async_submit_wait,
4290 (atomic_read(&root->fs_info->nr_async_submits) == 0)); 4646 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4647 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4291 } 4648 }
4292 atomic_dec(&root->fs_info->async_submit_draining); 4649 atomic_dec(&root->fs_info->async_submit_draining);
4293 return 0; 4650 return 0;
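btrfs_start_delalloc_inodes() now waits for both nr_async_submits and async_delalloc_pages to reach zero while async_submit_draining stays raised, so queued compression work and the bios it generates are all flushed before the function returns. Below is a userspace sketch of that "wait until two counters drain" pattern, using pthreads in place of the kernel wait queue; every name here is invented for the example.

/* build: cc -pthread drain_sketch.c */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int async_submits = 3;
static int delalloc_pages = 128;

static void *worker(void *arg)
{
	(void)arg;
	while (1) {
		pthread_mutex_lock(&lock);
		if (async_submits == 0 && delalloc_pages == 0) {
			pthread_mutex_unlock(&lock);
			break;
		}
		if (delalloc_pages > 0)
			delalloc_pages -= 32;	/* one chunk written back */
		else
			async_submits--;	/* one pending bio submitted */
		pthread_cond_broadcast(&drained);
		pthread_mutex_unlock(&lock);
		usleep(1000);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	/* the equivalent of wait_event(async_submit_wait, both counters == 0) */
	pthread_mutex_lock(&lock);
	while (async_submits != 0 || delalloc_pages != 0)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	printf("all async submits and delalloc pages drained\n");
	return 0;
}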
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 370bb4285597..027ad6b3839e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -390,7 +390,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
390 * start IO on any dirty ones so the wait doesn't stall waiting 390 * start IO on any dirty ones so the wait doesn't stall waiting
391 * for pdflush to find them 391 * for pdflush to find them
392 */ 392 */
393 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE); 393 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
394 if (wait) { 394 if (wait) {
395 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 395 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
396 &entry->flags)); 396 &entry->flags));
@@ -421,6 +421,12 @@ again:
421 */ 421 */
422 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); 422 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
423 423
424 /* The compression code will leave pages locked but return from
425 * writepage without setting the page writeback. Starting again
426 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
427 */
428 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
429
424 btrfs_wait_on_page_writeback_range(inode->i_mapping, 430 btrfs_wait_on_page_writeback_range(inode->i_mapping,
425 start >> PAGE_CACHE_SHIFT, 431 start >> PAGE_CACHE_SHIFT,
426 orig_end >> PAGE_CACHE_SHIFT); 432 orig_end >> PAGE_CACHE_SHIFT);
@@ -448,10 +454,7 @@ again:
448 } 454 }
449 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, 455 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
450 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { 456 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
451 printk("inode %lu still ordered or delalloc after wait " 457 schedule_timeout(1);
452 "%llu %llu\n", inode->i_ino,
453 (unsigned long long)start,
454 (unsigned long long)orig_end);
455 goto again; 458 goto again;
456 } 459 }
457 return 0; 460 return 0;
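The extra WB_SYNC_ALL pass in btrfs_wait_ordered_range() exists because compressed writepage can return with pages still locked and not yet marked writeback, so only a blocking second flush guarantees the IO has really been started before the wait loop runs. A loose userspace analogue of the same "kick the range, then make a blocking pass over it" idea, using sync_file_range(2) and nothing btrfs specific:

/* build: cc flush_twice.c   (Linux only) */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t start = 0, len = 1 << 20;		/* first 1MB, for the example */
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* non-blocking pass: queue the dirty pages (think WB_SYNC_NONE) */
	if (sync_file_range(fd, start, len, SYNC_FILE_RANGE_WRITE) < 0)
		perror("sync_file_range (async)");

	/* blocking pass: wait for that IO to finish (think WB_SYNC_ALL + wait) */
	if (sync_file_range(fd, start, len,
			    SYNC_FILE_RANGE_WAIT_BEFORE |
			    SYNC_FILE_RANGE_WRITE |
			    SYNC_FILE_RANGE_WAIT_AFTER) < 0)
		perror("sync_file_range (wait)");

	close(fd);
	return 0;
}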
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 431fdf144b58..ab9d5e89ed13 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -375,6 +375,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
375 filemap_flush(root->fs_info->btree_inode->i_mapping); 375 filemap_flush(root->fs_info->btree_inode->i_mapping);
376 return 0; 376 return 0;
377 } 377 }
378
379 btrfs_start_delalloc_inodes(root);
380 btrfs_wait_ordered_extents(root, 0);
381
378 btrfs_clean_old_snapshots(root); 382 btrfs_clean_old_snapshots(root);
379 trans = btrfs_start_transaction(root, 1); 383 trans = btrfs_start_transaction(root, 1);
380 ret = btrfs_commit_transaction(trans, root); 384 ret = btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index e99309180a11..ba2527d08734 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -423,8 +423,9 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
423 /* we didn't make progress in this inflate 423 /* we didn't make progress in this inflate
424 * call, we're done 424 * call, we're done
425 */ 425 */
426 if (ret != Z_STREAM_END) 426 if (ret != Z_STREAM_END) {
427 ret = -1; 427 ret = -1;
428 }
428 break; 429 break;
429 } 430 }
430 431
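For reference, the loop touched here bails out when an inflate call makes no forward progress, unless the stream already hit Z_STREAM_END; the added braces just make that error path explicit. A self-contained userspace version of the same termination rule, using plain zlib rather than the btrfs biovec variant:

/* build: cc inflate_rule.c -lz */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

static int decompress(const unsigned char *in, size_t in_len,
		      unsigned char *out, size_t out_len)
{
	z_stream strm;
	int ret;

	memset(&strm, 0, sizeof(strm));
	if (inflateInit(&strm) != Z_OK)
		return -1;

	strm.next_in = (unsigned char *)in;
	strm.avail_in = in_len;
	strm.next_out = out;
	strm.avail_out = out_len;

	while (1) {
		uInt in_before = strm.avail_in;
		uInt out_before = strm.avail_out;

		ret = inflate(&strm, Z_NO_FLUSH);
		if (ret == Z_STREAM_END)
			break;
		if (ret != Z_OK) {
			ret = -1;
			break;
		}
		/* no progress on this call and no end of stream: give up */
		if (strm.avail_in == in_before &&
		    strm.avail_out == out_before) {
			ret = -1;
			break;
		}
	}
	inflateEnd(&strm);
	return ret == -1 ? -1 : (int)strm.total_out;
}

int main(void)
{
	unsigned char out[64], comp[64];
	uLongf comp_len = sizeof(comp);
	const char *msg = "hello, compressed world";

	if (compress(comp, &comp_len, (const unsigned char *)msg,
		     strlen(msg) + 1) != Z_OK)
		return 1;
	if (decompress(comp, comp_len, out, sizeof(out)) < 0)
		return 1;
	printf("%s\n", out);
	return 0;
}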